{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999398785546805, "eval_steps": 500, "global_step": 8316, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "auxiliary_loss_clip": 0.05193558, "auxiliary_loss_mlp": 0.02258645, "balance_loss_clip": 2.45764518, "balance_loss_mlp": 1.81232548, "epoch": 0.00012024289063909097, "flos": 24932483919360.0, "grad_norm": 39.9092030007325, "language_loss": 2.5809741, "learning_rate": 0.0, "loss": 1.90436125, "num_input_tokens_seen": 20375, "step": 1, "time_per_iteration": 16.93146824836731 }, { "auxiliary_loss_clip": 0.03505141, "auxiliary_loss_mlp": 0.01483073, "balance_loss_clip": 1.64233541, "balance_loss_mlp": 1.17827916, "epoch": 0.00024048578127818193, "flos": 30664624377600.0, "grad_norm": 55.06769726920434, "language_loss": 1.89096117, "learning_rate": 5.021476677069823e-07, "loss": 1.94084334, "num_input_tokens_seen": 39035, "step": 2, "time_per_iteration": 2.6039035320281982 }, { "auxiliary_loss_clip": 0.03523698, "auxiliary_loss_mlp": 0.01528064, "balance_loss_clip": 1.649333, "balance_loss_mlp": 1.22269797, "epoch": 0.0003607286719172729, "flos": 19026227969280.0, "grad_norm": 40.2554894850374, "language_loss": 1.6150949, "learning_rate": 7.958852231401551e-07, "loss": 1.66561258, "num_input_tokens_seen": 57600, "step": 3, "time_per_iteration": 2.544471263885498 }, { "auxiliary_loss_clip": 0.03465806, "auxiliary_loss_mlp": 0.01553392, "balance_loss_clip": 1.63918197, "balance_loss_mlp": 1.25107729, "epoch": 0.00048097156255636386, "flos": 19316314206720.0, "grad_norm": 37.3588227126071, "language_loss": 1.64637542, "learning_rate": 1.0042953354139647e-06, "loss": 1.69656754, "num_input_tokens_seen": 76465, "step": 4, "time_per_iteration": 2.629526376724243 }, { "auxiliary_loss_clip": 0.03477928, "auxiliary_loss_mlp": 0.014873, "balance_loss_clip": 1.64129782, "balance_loss_mlp": 1.1857481, "epoch": 0.0006012144531954548, "flos": 13991264893440.0, "grad_norm": 54.42144504297324, "language_loss": 1.93297172, "learning_rate": 1.1659507774310057e-06, "loss": 1.98262393, "num_input_tokens_seen": 94350, "step": 5, "time_per_iteration": 2.921468496322632 }, { "auxiliary_loss_clip": 0.03471652, "auxiliary_loss_mlp": 0.01527653, "balance_loss_clip": 1.64258587, "balance_loss_mlp": 1.22324038, "epoch": 0.0007214573438345458, "flos": 23148988225920.0, "grad_norm": 44.81670089207379, "language_loss": 1.61469388, "learning_rate": 1.2980328908471373e-06, "loss": 1.66468692, "num_input_tokens_seen": 114595, "step": 6, "time_per_iteration": 2.9095523357391357 }, { "auxiliary_loss_clip": 0.03522986, "auxiliary_loss_mlp": 0.01395251, "balance_loss_clip": 1.75981927, "balance_loss_mlp": 1.17705059, "epoch": 0.0008417002344736367, "flos": 67663246170240.0, "grad_norm": 4.605124612865274, "language_loss": 0.81462961, "learning_rate": 1.4097067265369432e-06, "loss": 0.86381197, "num_input_tokens_seen": 179590, "step": 7, "time_per_iteration": 3.3247365951538086 }, { "auxiliary_loss_clip": 0.03485548, "auxiliary_loss_mlp": 0.01505806, "balance_loss_clip": 1.64518929, "balance_loss_mlp": 1.20520818, "epoch": 0.0009619431251127277, "flos": 21281381504640.0, "grad_norm": 41.31484902795418, "language_loss": 1.58596849, "learning_rate": 1.506443003120947e-06, "loss": 1.6358819, "num_input_tokens_seen": 195090, "step": 8, "time_per_iteration": 2.9696059226989746 }, { "auxiliary_loss_clip": 0.03468373, "auxiliary_loss_mlp": 0.01534008, "balance_loss_clip": 1.63961077, "balance_loss_mlp": 1.21700656, "epoch": 0.0010821860157518186, "flos": 23331342597120.0, "grad_norm": 18.91606353784702, "language_loss": 1.47664285, "learning_rate": 1.5917704462803102e-06, "loss": 1.52666664, "num_input_tokens_seen": 211635, "step": 9, "time_per_iteration": 2.87300443649292 }, { "auxiliary_loss_clip": 0.03449899, "auxiliary_loss_mlp": 0.01537669, "balance_loss_clip": 1.6401993, "balance_loss_mlp": 1.21628094, "epoch": 0.0012024289063909096, "flos": 17010166337280.0, "grad_norm": 13.12300246605859, "language_loss": 1.5308615, "learning_rate": 1.6680984451379884e-06, "loss": 1.58073711, "num_input_tokens_seen": 224705, "step": 10, "time_per_iteration": 2.9772186279296875 }, { "auxiliary_loss_clip": 0.03456519, "auxiliary_loss_mlp": 0.01531424, "balance_loss_clip": 1.63706553, "balance_loss_mlp": 1.20889127, "epoch": 0.0013226717970300007, "flos": 21288133261440.0, "grad_norm": 15.66798966365629, "language_loss": 1.32501817, "learning_rate": 1.7371455188905097e-06, "loss": 1.37489748, "num_input_tokens_seen": 244635, "step": 11, "time_per_iteration": 2.8545846939086914 }, { "auxiliary_loss_clip": 0.03487306, "auxiliary_loss_mlp": 0.01488202, "balance_loss_clip": 1.64411879, "balance_loss_mlp": 1.18302643, "epoch": 0.0014429146876690916, "flos": 27237884935680.0, "grad_norm": 11.350133725425517, "language_loss": 1.25365233, "learning_rate": 1.8001805585541196e-06, "loss": 1.30340743, "num_input_tokens_seen": 265765, "step": 12, "time_per_iteration": 2.932328701019287 }, { "auxiliary_loss_clip": 0.03455878, "auxiliary_loss_mlp": 0.01512805, "balance_loss_clip": 1.64310932, "balance_loss_mlp": 1.20762897, "epoch": 0.0015631575783081825, "flos": 19062174504960.0, "grad_norm": 6.649512621355787, "language_loss": 1.29173923, "learning_rate": 1.8581671739548328e-06, "loss": 1.3414259, "num_input_tokens_seen": 283500, "step": 13, "time_per_iteration": 2.794553756713867 }, { "auxiliary_loss_clip": 0.03448032, "auxiliary_loss_mlp": 0.01496911, "balance_loss_clip": 1.63838029, "balance_loss_mlp": 1.19955528, "epoch": 0.0016834004689472734, "flos": 48139473985920.0, "grad_norm": 6.3797901443545015, "language_loss": 1.13568556, "learning_rate": 1.9118543942439254e-06, "loss": 1.18513501, "num_input_tokens_seen": 305685, "step": 14, "time_per_iteration": 4.938063383102417 }, { "auxiliary_loss_clip": 0.03442607, "auxiliary_loss_mlp": 0.0153484, "balance_loss_clip": 1.63571203, "balance_loss_mlp": 1.22375131, "epoch": 0.0018036433595863645, "flos": 34970026314240.0, "grad_norm": 5.420767013542678, "language_loss": 1.12698269, "learning_rate": 1.961836000571161e-06, "loss": 1.1767571, "num_input_tokens_seen": 327340, "step": 15, "time_per_iteration": 3.891227960586548 }, { "auxiliary_loss_clip": 0.03367675, "auxiliary_loss_mlp": 0.01357016, "balance_loss_clip": 1.72421837, "balance_loss_mlp": 1.14644456, "epoch": 0.0019238862502254555, "flos": 59768284440960.0, "grad_norm": 3.7822258071905757, "language_loss": 0.64647269, "learning_rate": 2.0085906708279293e-06, "loss": 0.69371963, "num_input_tokens_seen": 382710, "step": 16, "time_per_iteration": 3.26826548576355 }, { "auxiliary_loss_clip": 0.03421502, "auxiliary_loss_mlp": 0.01512502, "balance_loss_clip": 1.63148022, "balance_loss_mlp": 1.20847034, "epoch": 0.0020441291408645466, "flos": 20814543417600.0, "grad_norm": 4.434096526787578, "language_loss": 1.16022873, "learning_rate": 2.0525099325728135e-06, "loss": 1.20956874, "num_input_tokens_seen": 400890, "step": 17, "time_per_iteration": 2.8019587993621826 }, { "auxiliary_loss_clip": 0.03312595, "auxiliary_loss_mlp": 0.01344283, "balance_loss_clip": 1.71309245, "balance_loss_mlp": 1.1367631, "epoch": 0.0021643720315036373, "flos": 63857001582720.0, "grad_norm": 3.549743130858788, "language_loss": 0.72151923, "learning_rate": 2.0939181139872922e-06, "loss": 0.76808798, "num_input_tokens_seen": 462605, "step": 18, "time_per_iteration": 3.273317575454712 }, { "auxiliary_loss_clip": 0.03441758, "auxiliary_loss_mlp": 0.01513038, "balance_loss_clip": 1.63500786, "balance_loss_mlp": 1.21491957, "epoch": 0.0022846149221427284, "flos": 31284981192960.0, "grad_norm": 4.878492016490422, "language_loss": 1.01644552, "learning_rate": 2.1330868934640175e-06, "loss": 1.06599355, "num_input_tokens_seen": 483280, "step": 19, "time_per_iteration": 2.96445369720459 }, { "auxiliary_loss_clip": 0.03246474, "auxiliary_loss_mlp": 0.01324667, "balance_loss_clip": 1.6973958, "balance_loss_mlp": 1.12019908, "epoch": 0.002404857812781819, "flos": 51083648161920.0, "grad_norm": 3.561212263217037, "language_loss": 0.76427865, "learning_rate": 2.170246112844971e-06, "loss": 0.80999005, "num_input_tokens_seen": 537620, "step": 20, "time_per_iteration": 3.0685648918151855 }, { "auxiliary_loss_clip": 0.03332759, "auxiliary_loss_mlp": 0.01484672, "balance_loss_clip": 1.61918783, "balance_loss_mlp": 1.18827057, "epoch": 0.0025251007034209102, "flos": 15815347309440.0, "grad_norm": 3.9688629026291973, "language_loss": 1.01540518, "learning_rate": 2.2055919496770983e-06, "loss": 1.06357944, "num_input_tokens_seen": 555760, "step": 21, "time_per_iteration": 2.833526372909546 }, { "auxiliary_loss_clip": 0.03304403, "auxiliary_loss_mlp": 0.01438349, "balance_loss_clip": 1.61586738, "balance_loss_mlp": 1.15682411, "epoch": 0.0026453435940600014, "flos": 37851857458560.0, "grad_norm": 4.083835323219352, "language_loss": 0.8942095, "learning_rate": 2.2392931865974923e-06, "loss": 0.94163704, "num_input_tokens_seen": 578450, "step": 22, "time_per_iteration": 2.996194839477539 }, { "auxiliary_loss_clip": 0.0327631, "auxiliary_loss_mlp": 0.01448073, "balance_loss_clip": 1.6009357, "balance_loss_mlp": 1.16044497, "epoch": 0.002765586484699092, "flos": 21141976821120.0, "grad_norm": 34.847637812604574, "language_loss": 1.01756394, "learning_rate": 2.271496085962064e-06, "loss": 1.06480777, "num_input_tokens_seen": 596145, "step": 23, "time_per_iteration": 2.860384225845337 }, { "auxiliary_loss_clip": 0.03260925, "auxiliary_loss_mlp": 0.01430761, "balance_loss_clip": 1.59979367, "balance_loss_mlp": 1.16068006, "epoch": 0.002885829375338183, "flos": 20667381396480.0, "grad_norm": 3.3857419267848687, "language_loss": 1.02758169, "learning_rate": 2.3023282262611022e-06, "loss": 1.07449853, "num_input_tokens_seen": 614920, "step": 24, "time_per_iteration": 2.8859052658081055 }, { "auxiliary_loss_clip": 0.03217133, "auxiliary_loss_mlp": 0.01405949, "balance_loss_clip": 1.59774673, "balance_loss_mlp": 1.14139998, "epoch": 0.003006072265977274, "flos": 34823869873920.0, "grad_norm": 3.4101497069249427, "language_loss": 0.92544693, "learning_rate": 2.3319015548620114e-06, "loss": 0.97167772, "num_input_tokens_seen": 636060, "step": 25, "time_per_iteration": 2.918639898300171 }, { "auxiliary_loss_clip": 0.03209834, "auxiliary_loss_mlp": 0.01382354, "balance_loss_clip": 1.59205794, "balance_loss_mlp": 1.12524319, "epoch": 0.003126315156616365, "flos": 24422021118720.0, "grad_norm": 2.3702487812962176, "language_loss": 0.93006068, "learning_rate": 2.3603148416618152e-06, "loss": 0.97598255, "num_input_tokens_seen": 655575, "step": 26, "time_per_iteration": 2.894437789916992 }, { "auxiliary_loss_clip": 0.03173484, "auxiliary_loss_mlp": 0.01381043, "balance_loss_clip": 1.5846926, "balance_loss_mlp": 1.13366032, "epoch": 0.003246558047255456, "flos": 23622326674560.0, "grad_norm": 2.7745914468689596, "language_loss": 1.01044846, "learning_rate": 2.3876556694204647e-06, "loss": 1.0559938, "num_input_tokens_seen": 675730, "step": 27, "time_per_iteration": 2.8325555324554443 }, { "auxiliary_loss_clip": 0.03150223, "auxiliary_loss_mlp": 0.0140087, "balance_loss_clip": 1.58338547, "balance_loss_mlp": 1.13250637, "epoch": 0.003366800937894547, "flos": 17820275725440.0, "grad_norm": 2.6461102706897144, "language_loss": 0.90715837, "learning_rate": 2.414002061950908e-06, "loss": 0.95266926, "num_input_tokens_seen": 694605, "step": 28, "time_per_iteration": 2.812558889389038 }, { "auxiliary_loss_clip": 0.03135772, "auxiliary_loss_mlp": 0.01393471, "balance_loss_clip": 1.57824349, "balance_loss_mlp": 1.13540637, "epoch": 0.003487043828533638, "flos": 24426115269120.0, "grad_norm": 2.3162438455491334, "language_loss": 0.99903917, "learning_rate": 2.4394238264681557e-06, "loss": 1.04433155, "num_input_tokens_seen": 714340, "step": 29, "time_per_iteration": 2.9216537475585938 }, { "auxiliary_loss_clip": 0.03141953, "auxiliary_loss_mlp": 0.01363171, "balance_loss_clip": 1.57822204, "balance_loss_mlp": 1.10148263, "epoch": 0.003607286719172729, "flos": 26140311002880.0, "grad_norm": 2.0330939718818404, "language_loss": 0.99713457, "learning_rate": 2.4639836682781433e-06, "loss": 1.04218578, "num_input_tokens_seen": 734470, "step": 30, "time_per_iteration": 2.973302125930786 }, { "auxiliary_loss_clip": 0.03098388, "auxiliary_loss_mlp": 0.01411618, "balance_loss_clip": 1.57263994, "balance_loss_mlp": 1.13905811, "epoch": 0.00372752960981182, "flos": 20593082113920.0, "grad_norm": 2.564195220517676, "language_loss": 1.00361085, "learning_rate": 2.487738122623307e-06, "loss": 1.04871082, "num_input_tokens_seen": 753380, "step": 31, "time_per_iteration": 2.8497416973114014 }, { "auxiliary_loss_clip": 0.03079866, "auxiliary_loss_mlp": 0.0138039, "balance_loss_clip": 1.56638432, "balance_loss_mlp": 1.12633109, "epoch": 0.003847772500450911, "flos": 22674608282880.0, "grad_norm": 2.6692228981972854, "language_loss": 0.99038821, "learning_rate": 2.510738338534912e-06, "loss": 1.03499079, "num_input_tokens_seen": 772105, "step": 32, "time_per_iteration": 2.834355115890503 }, { "auxiliary_loss_clip": 0.02968851, "auxiliary_loss_mlp": 0.01346632, "balance_loss_clip": 1.53999829, "balance_loss_mlp": 1.10687816, "epoch": 0.003968015391090002, "flos": 17967796882560.0, "grad_norm": 2.7819633330194655, "language_loss": 1.02576137, "learning_rate": 2.5330307420306648e-06, "loss": 1.06891632, "num_input_tokens_seen": 788955, "step": 33, "time_per_iteration": 2.863095283508301 }, { "auxiliary_loss_clip": 0.02888835, "auxiliary_loss_mlp": 0.01351996, "balance_loss_clip": 1.52351427, "balance_loss_mlp": 1.10899985, "epoch": 0.004088258281729093, "flos": 27304103658240.0, "grad_norm": 8.572716293855695, "language_loss": 0.88375056, "learning_rate": 2.554657600279796e-06, "loss": 0.92615879, "num_input_tokens_seen": 810230, "step": 34, "time_per_iteration": 2.8568320274353027 }, { "auxiliary_loss_clip": 0.02854125, "auxiliary_loss_mlp": 0.01349264, "balance_loss_clip": 1.5113802, "balance_loss_mlp": 1.10512388, "epoch": 0.004208501172368184, "flos": 23258587599360.0, "grad_norm": 3.7400032869079025, "language_loss": 1.0344125, "learning_rate": 2.5756575039679493e-06, "loss": 1.07644653, "num_input_tokens_seen": 829780, "step": 35, "time_per_iteration": 2.8420159816741943 }, { "auxiliary_loss_clip": 0.02828838, "auxiliary_loss_mlp": 0.0131067, "balance_loss_clip": 1.50485766, "balance_loss_mlp": 1.08464897, "epoch": 0.0043287440630072746, "flos": 17312104062720.0, "grad_norm": 3.488941888719664, "language_loss": 0.95270777, "learning_rate": 2.5960657816942747e-06, "loss": 0.99410284, "num_input_tokens_seen": 848695, "step": 36, "time_per_iteration": 2.8730809688568115 }, { "auxiliary_loss_clip": 0.02406716, "auxiliary_loss_mlp": 0.0126923, "balance_loss_clip": 1.46862507, "balance_loss_mlp": 1.1082499, "epoch": 0.004448986953646365, "flos": 53092491160320.0, "grad_norm": 1.3813536658734016, "language_loss": 0.60949177, "learning_rate": 2.6159148575788668e-06, "loss": 0.6462512, "num_input_tokens_seen": 906730, "step": 37, "time_per_iteration": 3.2495594024658203 }, { "auxiliary_loss_clip": 0.02758766, "auxiliary_loss_mlp": 0.01339965, "balance_loss_clip": 1.49091446, "balance_loss_mlp": 1.1011647, "epoch": 0.004569229844285457, "flos": 13444165866240.0, "grad_norm": 2.5128372248170416, "language_loss": 0.98894882, "learning_rate": 2.635234561171e-06, "loss": 1.02993608, "num_input_tokens_seen": 925125, "step": 38, "time_per_iteration": 2.8711650371551514 }, { "auxiliary_loss_clip": 0.02725461, "auxiliary_loss_mlp": 0.01291295, "balance_loss_clip": 1.48637366, "balance_loss_mlp": 1.07786274, "epoch": 0.0046894727349245475, "flos": 16209609966720.0, "grad_norm": 2.3642156249914885, "language_loss": 0.94398344, "learning_rate": 2.6540523970949877e-06, "loss": 0.98415101, "num_input_tokens_seen": 939970, "step": 39, "time_per_iteration": 2.734158754348755 }, { "auxiliary_loss_clip": 0.02724081, "auxiliary_loss_mlp": 0.01306803, "balance_loss_clip": 1.48933625, "balance_loss_mlp": 1.07887483, "epoch": 0.004809715625563638, "flos": 23914244505600.0, "grad_norm": 2.8780268756799714, "language_loss": 0.92616236, "learning_rate": 2.6723937805519533e-06, "loss": 0.9664712, "num_input_tokens_seen": 957470, "step": 40, "time_per_iteration": 3.7720131874084473 }, { "auxiliary_loss_clip": 0.02692955, "auxiliary_loss_mlp": 0.01293327, "balance_loss_clip": 1.47693062, "balance_loss_mlp": 1.08285058, "epoch": 0.00492995851620273, "flos": 20773030273920.0, "grad_norm": 2.2187087183703524, "language_loss": 0.93298197, "learning_rate": 2.690282243737839e-06, "loss": 0.97284478, "num_input_tokens_seen": 976405, "step": 41, "time_per_iteration": 3.978334665298462 }, { "auxiliary_loss_clip": 0.02670709, "auxiliary_loss_mlp": 0.01299446, "balance_loss_clip": 1.46818006, "balance_loss_mlp": 1.08601332, "epoch": 0.0050502014068418205, "flos": 20338655103360.0, "grad_norm": 2.8631201590317863, "language_loss": 0.9963544, "learning_rate": 2.7077396173840807e-06, "loss": 1.03605604, "num_input_tokens_seen": 994690, "step": 42, "time_per_iteration": 2.7881276607513428 }, { "auxiliary_loss_clip": 0.02641417, "auxiliary_loss_mlp": 0.01298487, "balance_loss_clip": 1.46462142, "balance_loss_mlp": 1.07723451, "epoch": 0.005170444297480911, "flos": 25994872834560.0, "grad_norm": 2.1725079673897905, "language_loss": 0.92908984, "learning_rate": 2.7247861909342594e-06, "loss": 0.96848887, "num_input_tokens_seen": 1015615, "step": 43, "time_per_iteration": 2.880636215209961 }, { "auxiliary_loss_clip": 0.02621125, "auxiliary_loss_mlp": 0.01316529, "balance_loss_clip": 1.45914221, "balance_loss_mlp": 1.09851885, "epoch": 0.005290687188120003, "flos": 20954055841920.0, "grad_norm": 2.3450884458445347, "language_loss": 0.83091569, "learning_rate": 2.7414408543044743e-06, "loss": 0.87029219, "num_input_tokens_seen": 1031255, "step": 44, "time_per_iteration": 2.856205701828003 }, { "auxiliary_loss_clip": 0.0258179, "auxiliary_loss_mlp": 0.01265591, "balance_loss_clip": 1.44435716, "balance_loss_mlp": 1.06694031, "epoch": 0.005410930078759093, "flos": 15851401585920.0, "grad_norm": 3.2538208395866293, "language_loss": 0.79450488, "learning_rate": 2.7577212237113157e-06, "loss": 0.83297867, "num_input_tokens_seen": 1048295, "step": 45, "time_per_iteration": 2.750166416168213 }, { "auxiliary_loss_clip": 0.02536431, "auxiliary_loss_mlp": 0.01272853, "balance_loss_clip": 1.44301558, "balance_loss_mlp": 1.07601428, "epoch": 0.005531172969398184, "flos": 21104988791040.0, "grad_norm": 2.0193076783603465, "language_loss": 1.04423904, "learning_rate": 2.7736437536690466e-06, "loss": 1.0823319, "num_input_tokens_seen": 1067925, "step": 46, "time_per_iteration": 2.8479154109954834 }, { "auxiliary_loss_clip": 0.02529191, "auxiliary_loss_mlp": 0.01253486, "balance_loss_clip": 1.44099021, "balance_loss_mlp": 1.06008089, "epoch": 0.005651415860037276, "flos": 20844887431680.0, "grad_norm": 2.1980359338317306, "language_loss": 1.08085704, "learning_rate": 2.789223836941131e-06, "loss": 1.11868382, "num_input_tokens_seen": 1088060, "step": 47, "time_per_iteration": 2.8182520866394043 }, { "auxiliary_loss_clip": 0.02484026, "auxiliary_loss_mlp": 0.01237392, "balance_loss_clip": 1.42620373, "balance_loss_mlp": 1.05619371, "epoch": 0.005771658750676366, "flos": 13260195383040.0, "grad_norm": 2.3112295075809484, "language_loss": 1.08649659, "learning_rate": 2.8044758939680847e-06, "loss": 1.12371087, "num_input_tokens_seen": 1104130, "step": 48, "time_per_iteration": 2.768869400024414 }, { "auxiliary_loss_clip": 0.02446764, "auxiliary_loss_mlp": 0.01246472, "balance_loss_clip": 1.41519725, "balance_loss_mlp": 1.06203103, "epoch": 0.005891901641315457, "flos": 24425396997120.0, "grad_norm": 2.46389125668671, "language_loss": 1.02210903, "learning_rate": 2.8194134530738863e-06, "loss": 1.05904138, "num_input_tokens_seen": 1122900, "step": 49, "time_per_iteration": 2.8086564540863037 }, { "auxiliary_loss_clip": 0.02425921, "auxiliary_loss_mlp": 0.01226736, "balance_loss_clip": 1.41447139, "balance_loss_mlp": 1.05240417, "epoch": 0.006012144531954548, "flos": 23076197314560.0, "grad_norm": 2.9770319933323526, "language_loss": 0.90179539, "learning_rate": 2.834049222568994e-06, "loss": 0.93832201, "num_input_tokens_seen": 1140250, "step": 50, "time_per_iteration": 2.9698691368103027 }, { "auxiliary_loss_clip": 0.02449688, "auxiliary_loss_mlp": 0.01234987, "balance_loss_clip": 1.41862488, "balance_loss_mlp": 1.05350304, "epoch": 0.006132387422593639, "flos": 22528775064960.0, "grad_norm": 2.1064351793208216, "language_loss": 0.92640108, "learning_rate": 2.848395155712969e-06, "loss": 0.9632479, "num_input_tokens_seen": 1160470, "step": 51, "time_per_iteration": 2.917454957962036 }, { "auxiliary_loss_clip": 0.02389157, "auxiliary_loss_mlp": 0.01227121, "balance_loss_clip": 1.40304196, "balance_loss_mlp": 1.05698574, "epoch": 0.00625263031323273, "flos": 27628340751360.0, "grad_norm": 2.1660587833425713, "language_loss": 0.9772203, "learning_rate": 2.8624625093687977e-06, "loss": 1.01338315, "num_input_tokens_seen": 1177605, "step": 52, "time_per_iteration": 2.873462438583374 }, { "auxiliary_loss_clip": 0.02384833, "auxiliary_loss_mlp": 0.01228564, "balance_loss_clip": 1.40669513, "balance_loss_mlp": 1.05690312, "epoch": 0.006372873203871821, "flos": 23110671392640.0, "grad_norm": 2.2087786803110965, "language_loss": 0.88972574, "learning_rate": 2.876261897070029e-06, "loss": 0.92585969, "num_input_tokens_seen": 1197735, "step": 53, "time_per_iteration": 2.8417415618896484 }, { "auxiliary_loss_clip": 0.0237556, "auxiliary_loss_mlp": 0.01236063, "balance_loss_clip": 1.40374446, "balance_loss_mlp": 1.06859815, "epoch": 0.006493116094510912, "flos": 22856028900480.0, "grad_norm": 2.803404416658769, "language_loss": 0.92620933, "learning_rate": 2.889803337127447e-06, "loss": 0.96232557, "num_input_tokens_seen": 1216335, "step": 54, "time_per_iteration": 2.920837640762329 }, { "auxiliary_loss_clip": 0.02322462, "auxiliary_loss_mlp": 0.01218144, "balance_loss_clip": 1.38629115, "balance_loss_mlp": 1.05783129, "epoch": 0.006613358985150003, "flos": 23071708114560.0, "grad_norm": 3.3748234442824683, "language_loss": 0.84637725, "learning_rate": 2.903096296321516e-06, "loss": 0.88178325, "num_input_tokens_seen": 1234480, "step": 55, "time_per_iteration": 2.792464256286621 }, { "auxiliary_loss_clip": 0.0231942, "auxiliary_loss_mlp": 0.01227128, "balance_loss_clip": 1.38671744, "balance_loss_mlp": 1.0643357, "epoch": 0.006733601875789094, "flos": 26537662229760.0, "grad_norm": 2.0952059509343712, "language_loss": 0.91533434, "learning_rate": 2.9161497296578907e-06, "loss": 0.95079982, "num_input_tokens_seen": 1253870, "step": 56, "time_per_iteration": 2.7706298828125 }, { "auxiliary_loss_clip": 0.02320809, "auxiliary_loss_mlp": 0.01229527, "balance_loss_clip": 1.38792491, "balance_loss_mlp": 1.06110811, "epoch": 0.006853844766428185, "flos": 15523178083200.0, "grad_norm": 2.4620710016239373, "language_loss": 0.86037171, "learning_rate": 2.928972116604173e-06, "loss": 0.8958751, "num_input_tokens_seen": 1270145, "step": 57, "time_per_iteration": 2.7857491970062256 }, { "auxiliary_loss_clip": 0.02268831, "auxiliary_loss_mlp": 0.01193206, "balance_loss_clip": 1.37879324, "balance_loss_mlp": 1.04004586, "epoch": 0.006974087657067276, "flos": 24243760897920.0, "grad_norm": 5.559813675663293, "language_loss": 1.02087164, "learning_rate": 2.9415714941751377e-06, "loss": 1.05549192, "num_input_tokens_seen": 1291365, "step": 58, "time_per_iteration": 2.859163761138916 }, { "auxiliary_loss_clip": 0.02291654, "auxiliary_loss_mlp": 0.01205938, "balance_loss_clip": 1.37745643, "balance_loss_mlp": 1.05411339, "epoch": 0.007094330547706367, "flos": 25772513690880.0, "grad_norm": 2.139716563432531, "language_loss": 0.93577367, "learning_rate": 2.9539554871897396e-06, "loss": 0.97074962, "num_input_tokens_seen": 1311535, "step": 59, "time_per_iteration": 2.83764386177063 }, { "auxiliary_loss_clip": 0.02245985, "auxiliary_loss_mlp": 0.01220772, "balance_loss_clip": 1.36705387, "balance_loss_mlp": 1.06513286, "epoch": 0.007214573438345458, "flos": 21319015979520.0, "grad_norm": 2.075280005340262, "language_loss": 0.97441161, "learning_rate": 2.9661313359851253e-06, "loss": 1.00907922, "num_input_tokens_seen": 1329420, "step": 60, "time_per_iteration": 2.8543570041656494 }, { "auxiliary_loss_clip": 0.02223388, "auxiliary_loss_mlp": 0.01204947, "balance_loss_clip": 1.36439979, "balance_loss_mlp": 1.05712795, "epoch": 0.007334816328984549, "flos": 24937088192640.0, "grad_norm": 2.1215832459039285, "language_loss": 0.94013405, "learning_rate": 2.978105921839922e-06, "loss": 0.97441745, "num_input_tokens_seen": 1349965, "step": 61, "time_per_iteration": 2.8203046321868896 }, { "auxiliary_loss_clip": 0.0220463, "auxiliary_loss_mlp": 0.01209723, "balance_loss_clip": 1.35731721, "balance_loss_mlp": 1.06304765, "epoch": 0.00745505921962364, "flos": 18510586277760.0, "grad_norm": 4.204521923503308, "language_loss": 0.72338015, "learning_rate": 2.9898857903302893e-06, "loss": 0.75752366, "num_input_tokens_seen": 1368915, "step": 62, "time_per_iteration": 2.8457043170928955 }, { "auxiliary_loss_clip": 0.02201079, "auxiliary_loss_mlp": 0.01195569, "balance_loss_clip": 1.35779762, "balance_loss_mlp": 1.05375719, "epoch": 0.007575302110262731, "flos": 18477656484480.0, "grad_norm": 2.9908749617796726, "language_loss": 0.88055605, "learning_rate": 3.001477172817253e-06, "loss": 0.91452253, "num_input_tokens_seen": 1386805, "step": 63, "time_per_iteration": 2.8341126441955566 }, { "auxiliary_loss_clip": 0.02177915, "auxiliary_loss_mlp": 0.01204084, "balance_loss_clip": 1.3504405, "balance_loss_mlp": 1.0650388, "epoch": 0.007695545000901822, "flos": 24973178382720.0, "grad_norm": 2.5793619695235153, "language_loss": 0.96204597, "learning_rate": 3.012886006241894e-06, "loss": 0.995866, "num_input_tokens_seen": 1406190, "step": 64, "time_per_iteration": 2.9686625003814697 }, { "auxiliary_loss_clip": 0.02158083, "auxiliary_loss_mlp": 0.0120375, "balance_loss_clip": 1.348629, "balance_loss_mlp": 1.06022263, "epoch": 0.007815787891540913, "flos": 21324223451520.0, "grad_norm": 3.160873106419765, "language_loss": 0.88222778, "learning_rate": 3.0241179513858383e-06, "loss": 0.91584617, "num_input_tokens_seen": 1425500, "step": 65, "time_per_iteration": 3.0059993267059326 }, { "auxiliary_loss_clip": 0.02135299, "auxiliary_loss_mlp": 0.01198929, "balance_loss_clip": 1.33741951, "balance_loss_mlp": 1.05759442, "epoch": 0.007936030782180003, "flos": 21575777374080.0, "grad_norm": 2.2122224474209573, "language_loss": 0.87891328, "learning_rate": 3.035178409737647e-06, "loss": 0.91225564, "num_input_tokens_seen": 1442950, "step": 66, "time_per_iteration": 3.870715379714966 }, { "auxiliary_loss_clip": 0.02130466, "auxiliary_loss_mlp": 0.0119049, "balance_loss_clip": 1.34025276, "balance_loss_mlp": 1.05773902, "epoch": 0.008056273672819095, "flos": 20120785159680.0, "grad_norm": 2.112708091940364, "language_loss": 0.88701493, "learning_rate": 3.046072539090907e-06, "loss": 0.92022455, "num_input_tokens_seen": 1460915, "step": 67, "time_per_iteration": 4.9296875 }, { "auxiliary_loss_clip": 0.02101606, "auxiliary_loss_mlp": 0.01183105, "balance_loss_clip": 1.32455397, "balance_loss_mlp": 1.0500679, "epoch": 0.008176516563458186, "flos": 18333116156160.0, "grad_norm": 2.336853365238484, "language_loss": 1.04630542, "learning_rate": 3.056805267986779e-06, "loss": 1.07915246, "num_input_tokens_seen": 1478385, "step": 68, "time_per_iteration": 3.746598958969116 }, { "auxiliary_loss_clip": 0.02073116, "auxiliary_loss_mlp": 0.01176772, "balance_loss_clip": 1.31887233, "balance_loss_mlp": 1.05021942, "epoch": 0.008296759454097276, "flos": 21872076664320.0, "grad_norm": 2.857652937150936, "language_loss": 0.9527036, "learning_rate": 3.0673813091022194e-06, "loss": 0.98520243, "num_input_tokens_seen": 1497605, "step": 69, "time_per_iteration": 2.855372667312622 }, { "auxiliary_loss_clip": 0.01803256, "auxiliary_loss_mlp": 0.01278075, "balance_loss_clip": 1.30750811, "balance_loss_mlp": 1.20025504, "epoch": 0.008417002344736368, "flos": 63408228036480.0, "grad_norm": 1.3882312671694803, "language_loss": 0.62096173, "learning_rate": 3.0778051716749317e-06, "loss": 0.65177506, "num_input_tokens_seen": 1561150, "step": 70, "time_per_iteration": 3.3941500186920166 }, { "auxiliary_loss_clip": 0.02029494, "auxiliary_loss_mlp": 0.01172261, "balance_loss_clip": 1.29396427, "balance_loss_mlp": 1.04551804, "epoch": 0.008537245235375458, "flos": 22966454286720.0, "grad_norm": 2.7007744980121284, "language_loss": 0.90323335, "learning_rate": 3.0880811730470094e-06, "loss": 0.93525094, "num_input_tokens_seen": 1580605, "step": 71, "time_per_iteration": 2.8262276649475098 }, { "auxiliary_loss_clip": 0.01765477, "auxiliary_loss_mlp": 0.01185147, "balance_loss_clip": 1.29280806, "balance_loss_mlp": 1.11419356, "epoch": 0.008657488126014549, "flos": 61984046712960.0, "grad_norm": 1.1705979013628816, "language_loss": 0.58661723, "learning_rate": 3.098213449401257e-06, "loss": 0.61612344, "num_input_tokens_seen": 1647535, "step": 72, "time_per_iteration": 3.344963550567627 }, { "auxiliary_loss_clip": 0.02010648, "auxiliary_loss_mlp": 0.01174135, "balance_loss_clip": 1.2959826, "balance_loss_mlp": 1.05840659, "epoch": 0.00877773101665364, "flos": 30296791152000.0, "grad_norm": 2.515112911292111, "language_loss": 0.98941648, "learning_rate": 3.1082059657570015e-06, "loss": 1.02126431, "num_input_tokens_seen": 1666770, "step": 73, "time_per_iteration": 2.8477511405944824 }, { "auxiliary_loss_clip": 0.01993104, "auxiliary_loss_mlp": 0.01179728, "balance_loss_clip": 1.29328334, "balance_loss_mlp": 1.06395173, "epoch": 0.00889797390729273, "flos": 23514056104320.0, "grad_norm": 2.154713707914552, "language_loss": 0.96566051, "learning_rate": 3.1180625252858496e-06, "loss": 0.99738884, "num_input_tokens_seen": 1685200, "step": 74, "time_per_iteration": 2.8118324279785156 }, { "auxiliary_loss_clip": 0.01963055, "auxiliary_loss_mlp": 0.01159285, "balance_loss_clip": 1.28295636, "balance_loss_mlp": 1.05414248, "epoch": 0.009018216797931822, "flos": 23075838178560.0, "grad_norm": 2.706565793418982, "language_loss": 0.80068547, "learning_rate": 3.1277867780021663e-06, "loss": 0.83190882, "num_input_tokens_seen": 1701835, "step": 75, "time_per_iteration": 2.828486442565918 }, { "auxiliary_loss_clip": 0.01936062, "auxiliary_loss_mlp": 0.01161477, "balance_loss_clip": 1.28189147, "balance_loss_mlp": 1.05113649, "epoch": 0.009138459688570914, "flos": 15918877284480.0, "grad_norm": 2.157224736345352, "language_loss": 0.9567256, "learning_rate": 3.1373822288779824e-06, "loss": 0.98770094, "num_input_tokens_seen": 1718415, "step": 76, "time_per_iteration": 2.8161323070526123 }, { "auxiliary_loss_clip": 0.0194339, "auxiliary_loss_mlp": 0.0117716, "balance_loss_clip": 1.27806771, "balance_loss_mlp": 1.07082486, "epoch": 0.009258702579210003, "flos": 27016531372800.0, "grad_norm": 1.9845137485050308, "language_loss": 0.7960391, "learning_rate": 3.1468522454274533e-06, "loss": 0.82724464, "num_input_tokens_seen": 1738770, "step": 77, "time_per_iteration": 2.8483705520629883 }, { "auxiliary_loss_clip": 0.01930704, "auxiliary_loss_mlp": 0.0116722, "balance_loss_clip": 1.27360451, "balance_loss_mlp": 1.06450915, "epoch": 0.009378945469849095, "flos": 26903196984960.0, "grad_norm": 1.9634680245022171, "language_loss": 0.9199301, "learning_rate": 3.15620006480197e-06, "loss": 0.95090938, "num_input_tokens_seen": 1758040, "step": 78, "time_per_iteration": 2.8253872394561768 }, { "auxiliary_loss_clip": 0.0191971, "auxiliary_loss_mlp": 0.0115899, "balance_loss_clip": 1.27313268, "balance_loss_mlp": 1.05694711, "epoch": 0.009499188360488187, "flos": 35694236327040.0, "grad_norm": 301.51724779460676, "language_loss": 0.74930453, "learning_rate": 3.1654288004333087e-06, "loss": 0.78009152, "num_input_tokens_seen": 1776705, "step": 79, "time_per_iteration": 2.94392466545105 }, { "auxiliary_loss_clip": 0.01886342, "auxiliary_loss_mlp": 0.01163249, "balance_loss_clip": 1.26303053, "balance_loss_mlp": 1.06587863, "epoch": 0.009619431251127276, "flos": 21503201944320.0, "grad_norm": 2.7328277438174173, "language_loss": 0.76015806, "learning_rate": 3.1745414482589353e-06, "loss": 0.790654, "num_input_tokens_seen": 1795915, "step": 80, "time_per_iteration": 2.8701491355895996 }, { "auxiliary_loss_clip": 0.01872633, "auxiliary_loss_mlp": 0.01164988, "balance_loss_clip": 1.25850523, "balance_loss_mlp": 1.06242025, "epoch": 0.009739674141766368, "flos": 17421056991360.0, "grad_norm": 6.771235900731743, "language_loss": 0.86937845, "learning_rate": 3.1835408925606204e-06, "loss": 0.89975464, "num_input_tokens_seen": 1814055, "step": 81, "time_per_iteration": 2.8529906272888184 }, { "auxiliary_loss_clip": 0.01865719, "auxiliary_loss_mlp": 0.01144388, "balance_loss_clip": 1.26054907, "balance_loss_mlp": 1.0578897, "epoch": 0.00985991703240546, "flos": 27527109246720.0, "grad_norm": 3.084224974109649, "language_loss": 0.89320409, "learning_rate": 3.1924299114448214e-06, "loss": 0.92330515, "num_input_tokens_seen": 1834535, "step": 82, "time_per_iteration": 2.955183267593384 }, { "auxiliary_loss_clip": 0.01861449, "auxiliary_loss_mlp": 0.01166054, "balance_loss_clip": 1.25544691, "balance_loss_mlp": 1.06553626, "epoch": 0.00998015992304455, "flos": 13808084509440.0, "grad_norm": 2.820272347211432, "language_loss": 0.83458406, "learning_rate": 3.2012111819909055e-06, "loss": 0.86485898, "num_input_tokens_seen": 1851865, "step": 83, "time_per_iteration": 2.9132113456726074 }, { "auxiliary_loss_clip": 0.01838296, "auxiliary_loss_mlp": 0.01155929, "balance_loss_clip": 1.25353873, "balance_loss_mlp": 1.06323171, "epoch": 0.010100402813683641, "flos": 20191385341440.0, "grad_norm": 2.915392889565791, "language_loss": 0.95026112, "learning_rate": 3.2098872850910627e-06, "loss": 0.98020327, "num_input_tokens_seen": 1868540, "step": 84, "time_per_iteration": 2.882713794708252 }, { "auxiliary_loss_clip": 0.01834499, "auxiliary_loss_mlp": 0.01147615, "balance_loss_clip": 1.24585938, "balance_loss_mlp": 1.05768383, "epoch": 0.010220645704322733, "flos": 17201642762880.0, "grad_norm": 2.3678635112502007, "language_loss": 0.89182413, "learning_rate": 3.2184607100038194e-06, "loss": 0.92164528, "num_input_tokens_seen": 1887180, "step": 85, "time_per_iteration": 2.8029696941375732 }, { "auxiliary_loss_clip": 0.01819243, "auxiliary_loss_mlp": 0.01153699, "balance_loss_clip": 1.24782491, "balance_loss_mlp": 1.06395817, "epoch": 0.010340888594961822, "flos": 21470415805440.0, "grad_norm": 2.333472163259405, "language_loss": 0.93061095, "learning_rate": 3.2269338586412414e-06, "loss": 0.96034032, "num_input_tokens_seen": 1904765, "step": 86, "time_per_iteration": 2.7794930934906006 }, { "auxiliary_loss_clip": 0.01801845, "auxiliary_loss_mlp": 0.01125439, "balance_loss_clip": 1.2411356, "balance_loss_mlp": 1.04504371, "epoch": 0.010461131485600914, "flos": 23002831785600.0, "grad_norm": 2.355425410025708, "language_loss": 0.96659118, "learning_rate": 3.2353090496083106e-06, "loss": 0.99586403, "num_input_tokens_seen": 1922600, "step": 87, "time_per_iteration": 2.748521089553833 }, { "auxiliary_loss_clip": 0.01790914, "auxiliary_loss_mlp": 0.01139323, "balance_loss_clip": 1.23779225, "balance_loss_mlp": 1.05678225, "epoch": 0.010581374376240005, "flos": 33546850571520.0, "grad_norm": 2.374535177793643, "language_loss": 0.8131671, "learning_rate": 3.2435885220114572e-06, "loss": 0.84246945, "num_input_tokens_seen": 1943950, "step": 88, "time_per_iteration": 2.9460341930389404 }, { "auxiliary_loss_clip": 0.01789628, "auxiliary_loss_mlp": 0.01132892, "balance_loss_clip": 1.2391299, "balance_loss_mlp": 1.05168676, "epoch": 0.010701617266879095, "flos": 21763087822080.0, "grad_norm": 2.2577183477323812, "language_loss": 0.93958461, "learning_rate": 3.2517744390519113e-06, "loss": 0.96880978, "num_input_tokens_seen": 1962815, "step": 89, "time_per_iteration": 2.8981640338897705 }, { "auxiliary_loss_clip": 0.01775071, "auxiliary_loss_mlp": 0.01132854, "balance_loss_clip": 1.23053217, "balance_loss_mlp": 1.05579758, "epoch": 0.010821860157518187, "flos": 19060199256960.0, "grad_norm": 2.0569696637809725, "language_loss": 0.75224411, "learning_rate": 3.259868891418298e-06, "loss": 0.78132343, "num_input_tokens_seen": 1980580, "step": 90, "time_per_iteration": 2.9575023651123047 }, { "auxiliary_loss_clip": 0.01781479, "auxiliary_loss_mlp": 0.01135841, "balance_loss_clip": 1.23589838, "balance_loss_mlp": 1.05802107, "epoch": 0.010942103048157278, "flos": 25447378757760.0, "grad_norm": 3.239539756926512, "language_loss": 0.85161495, "learning_rate": 3.2678739004917757e-06, "loss": 0.88078815, "num_input_tokens_seen": 2000315, "step": 91, "time_per_iteration": 2.831373453140259 }, { "auxiliary_loss_clip": 0.01768195, "auxiliary_loss_mlp": 0.01136416, "balance_loss_clip": 1.23409796, "balance_loss_mlp": 1.06078911, "epoch": 0.011062345938796368, "flos": 27493928058240.0, "grad_norm": 1.7166965690716605, "language_loss": 0.92319292, "learning_rate": 3.275791421376029e-06, "loss": 0.95223904, "num_input_tokens_seen": 2023760, "step": 92, "time_per_iteration": 3.875816583633423 }, { "auxiliary_loss_clip": 0.01753544, "auxiliary_loss_mlp": 0.01131215, "balance_loss_clip": 1.22645545, "balance_loss_mlp": 1.06030917, "epoch": 0.01118258882943546, "flos": 16071210864000.0, "grad_norm": 2.300885675939803, "language_loss": 0.96126735, "learning_rate": 3.2836233457634622e-06, "loss": 0.99011493, "num_input_tokens_seen": 2041895, "step": 93, "time_per_iteration": 3.8518943786621094 }, { "auxiliary_loss_clip": 0.01757787, "auxiliary_loss_mlp": 0.01133057, "balance_loss_clip": 1.22947836, "balance_loss_mlp": 1.06081665, "epoch": 0.011302831720074551, "flos": 20668602458880.0, "grad_norm": 1.943513881420043, "language_loss": 0.85403442, "learning_rate": 3.2913715046481135e-06, "loss": 0.88294291, "num_input_tokens_seen": 2061640, "step": 94, "time_per_iteration": 4.597077369689941 }, { "auxiliary_loss_clip": 0.01742733, "auxiliary_loss_mlp": 0.01115959, "balance_loss_clip": 1.22673178, "balance_loss_mlp": 1.04367018, "epoch": 0.011423074610713641, "flos": 13072238490240.0, "grad_norm": 3.470419477654237, "language_loss": 0.88937926, "learning_rate": 3.299037670895023e-06, "loss": 0.91796619, "num_input_tokens_seen": 2078255, "step": 95, "time_per_iteration": 2.7618167400360107 }, { "auxiliary_loss_clip": 0.01739185, "auxiliary_loss_mlp": 0.01123342, "balance_loss_clip": 1.22658312, "balance_loss_mlp": 1.05343735, "epoch": 0.011543317501352733, "flos": 30335646689280.0, "grad_norm": 2.126142261304448, "language_loss": 0.80446529, "learning_rate": 3.3066235616750667e-06, "loss": 0.8330906, "num_input_tokens_seen": 2099490, "step": 96, "time_per_iteration": 2.831902503967285 }, { "auxiliary_loss_clip": 0.01706022, "auxiliary_loss_mlp": 0.01116166, "balance_loss_clip": 1.21377552, "balance_loss_mlp": 1.05002868, "epoch": 0.011663560391991824, "flos": 15522962601600.0, "grad_norm": 2.2270031183927914, "language_loss": 0.92568636, "learning_rate": 3.3141308407736276e-06, "loss": 0.95390821, "num_input_tokens_seen": 2116125, "step": 97, "time_per_iteration": 2.9180245399475098 }, { "auxiliary_loss_clip": 0.01715195, "auxiliary_loss_mlp": 0.01114008, "balance_loss_clip": 1.21356583, "balance_loss_mlp": 1.0493964, "epoch": 0.011783803282630914, "flos": 19902125116800.0, "grad_norm": 1.9938127759119637, "language_loss": 0.86855137, "learning_rate": 3.321561120780869e-06, "loss": 0.89684343, "num_input_tokens_seen": 2134835, "step": 98, "time_per_iteration": 2.812070608139038 }, { "auxiliary_loss_clip": 0.01707853, "auxiliary_loss_mlp": 0.01122499, "balance_loss_clip": 1.22023249, "balance_loss_mlp": 1.05459762, "epoch": 0.011904046173270006, "flos": 22340674517760.0, "grad_norm": 2.1933710309457912, "language_loss": 1.01428938, "learning_rate": 3.3289159651708192e-06, "loss": 1.04259288, "num_input_tokens_seen": 2152410, "step": 99, "time_per_iteration": 2.833592176437378 }, { "auxiliary_loss_clip": 0.01707477, "auxiliary_loss_mlp": 0.0112121, "balance_loss_clip": 1.21437311, "balance_loss_mlp": 1.05864859, "epoch": 0.012024289063909096, "flos": 19100060375040.0, "grad_norm": 1.8673763298384833, "language_loss": 0.97708881, "learning_rate": 3.3361968902759768e-06, "loss": 1.00537562, "num_input_tokens_seen": 2172090, "step": 100, "time_per_iteration": 2.776411771774292 }, { "auxiliary_loss_clip": 0.0166831, "auxiliary_loss_mlp": 0.01106319, "balance_loss_clip": 1.20020413, "balance_loss_mlp": 1.04585588, "epoch": 0.012144531954548187, "flos": 15012205159680.0, "grad_norm": 2.3312810918012383, "language_loss": 0.93992198, "learning_rate": 3.343405367163663e-06, "loss": 0.96766818, "num_input_tokens_seen": 2189020, "step": 101, "time_per_iteration": 2.846339702606201 }, { "auxiliary_loss_clip": 0.01680842, "auxiliary_loss_mlp": 0.01118463, "balance_loss_clip": 1.20852757, "balance_loss_mlp": 1.05642653, "epoch": 0.012264774845187279, "flos": 15122020014720.0, "grad_norm": 2.3417657415902373, "language_loss": 0.81337643, "learning_rate": 3.350542823419951e-06, "loss": 0.84136945, "num_input_tokens_seen": 2205620, "step": 102, "time_per_iteration": 2.754270553588867 }, { "auxiliary_loss_clip": 0.01684209, "auxiliary_loss_mlp": 0.01109719, "balance_loss_clip": 1.20652628, "balance_loss_mlp": 1.05524051, "epoch": 0.012385017735826368, "flos": 13949248959360.0, "grad_norm": 5.82401887105771, "language_loss": 0.87563419, "learning_rate": 3.3576106448465615e-06, "loss": 0.90357357, "num_input_tokens_seen": 2219000, "step": 103, "time_per_iteration": 2.807176351547241 }, { "auxiliary_loss_clip": 0.01660399, "auxiliary_loss_mlp": 0.01108504, "balance_loss_clip": 1.19859433, "balance_loss_mlp": 1.0475167, "epoch": 0.01250526062646546, "flos": 23623260428160.0, "grad_norm": 5.943129485444414, "language_loss": 0.88263607, "learning_rate": 3.3646101770757797e-06, "loss": 0.91032505, "num_input_tokens_seen": 2237790, "step": 104, "time_per_iteration": 2.8185107707977295 }, { "auxiliary_loss_clip": 0.016561, "auxiliary_loss_mlp": 0.01108653, "balance_loss_clip": 1.20328283, "balance_loss_mlp": 1.05529523, "epoch": 0.012625503517104552, "flos": 34640078958720.0, "grad_norm": 1.7608581703747466, "language_loss": 0.85541642, "learning_rate": 3.371542727108104e-06, "loss": 0.88306397, "num_input_tokens_seen": 2259965, "step": 105, "time_per_iteration": 2.8997950553894043 }, { "auxiliary_loss_clip": 0.01666822, "auxiliary_loss_mlp": 0.01122603, "balance_loss_clip": 1.20014322, "balance_loss_mlp": 1.06624079, "epoch": 0.012745746407743641, "flos": 17821891837440.0, "grad_norm": 4.340764282798978, "language_loss": 0.8995086, "learning_rate": 3.3784095647770114e-06, "loss": 0.92740285, "num_input_tokens_seen": 2278610, "step": 106, "time_per_iteration": 2.756409168243408 }, { "auxiliary_loss_clip": 0.01660563, "auxiliary_loss_mlp": 0.0110542, "balance_loss_clip": 1.19923329, "balance_loss_mlp": 1.05077422, "epoch": 0.012865989298382733, "flos": 20595057361920.0, "grad_norm": 2.083072226303842, "language_loss": 0.88735211, "learning_rate": 3.3852119241449547e-06, "loss": 0.915012, "num_input_tokens_seen": 2297730, "step": 107, "time_per_iteration": 2.7366011142730713 }, { "auxiliary_loss_clip": 0.01647621, "auxiliary_loss_mlp": 0.01104527, "balance_loss_clip": 1.19369483, "balance_loss_mlp": 1.05033469, "epoch": 0.012986232189021825, "flos": 23948969978880.0, "grad_norm": 3.2601733042440233, "language_loss": 0.96312928, "learning_rate": 3.3919510048344295e-06, "loss": 0.99065071, "num_input_tokens_seen": 2315740, "step": 108, "time_per_iteration": 2.8434176445007324 }, { "auxiliary_loss_clip": 0.01626901, "auxiliary_loss_mlp": 0.01097917, "balance_loss_clip": 1.18842435, "balance_loss_mlp": 1.05068684, "epoch": 0.013106475079660914, "flos": 23725425686400.0, "grad_norm": 2.3291878472548424, "language_loss": 0.86606324, "learning_rate": 3.3986279732976907e-06, "loss": 0.89331138, "num_input_tokens_seen": 2334215, "step": 109, "time_per_iteration": 2.774916410446167 }, { "auxiliary_loss_clip": 0.01624357, "auxiliary_loss_mlp": 0.01096089, "balance_loss_clip": 1.19078255, "balance_loss_mlp": 1.04280281, "epoch": 0.013226717970300006, "flos": 21102438925440.0, "grad_norm": 1.9805692710256764, "language_loss": 0.95560634, "learning_rate": 3.4052439640284983e-06, "loss": 0.98281074, "num_input_tokens_seen": 2353130, "step": 110, "time_per_iteration": 2.8224289417266846 }, { "auxiliary_loss_clip": 0.01628133, "auxiliary_loss_mlp": 0.01120827, "balance_loss_clip": 1.19014084, "balance_loss_mlp": 1.06775498, "epoch": 0.013346960860939098, "flos": 24863902231680.0, "grad_norm": 1.8930539014400443, "language_loss": 0.81251204, "learning_rate": 3.4118000807190217e-06, "loss": 0.84000164, "num_input_tokens_seen": 2374010, "step": 111, "time_per_iteration": 2.7814128398895264 }, { "auxiliary_loss_clip": 0.01623901, "auxiliary_loss_mlp": 0.01122893, "balance_loss_clip": 1.18561673, "balance_loss_mlp": 1.07511449, "epoch": 0.013467203751578187, "flos": 28181940140160.0, "grad_norm": 1.8401926437026568, "language_loss": 0.75924194, "learning_rate": 3.4182973973648723e-06, "loss": 0.7867099, "num_input_tokens_seen": 2395220, "step": 112, "time_per_iteration": 2.815429449081421 }, { "auxiliary_loss_clip": 0.01620832, "auxiliary_loss_mlp": 0.01107001, "balance_loss_clip": 1.18908656, "balance_loss_mlp": 1.05841112, "epoch": 0.013587446642217279, "flos": 18916233546240.0, "grad_norm": 2.7975025587261912, "language_loss": 0.94909108, "learning_rate": 3.424736959321014e-06, "loss": 0.97636938, "num_input_tokens_seen": 2413025, "step": 113, "time_per_iteration": 2.8028059005737305 }, { "auxiliary_loss_clip": 0.0161495, "auxiliary_loss_mlp": 0.011005, "balance_loss_clip": 1.18380809, "balance_loss_mlp": 1.04785728, "epoch": 0.01370768953285637, "flos": 23988615615360.0, "grad_norm": 2.319303888972359, "language_loss": 0.88632673, "learning_rate": 3.431119784311155e-06, "loss": 0.9134813, "num_input_tokens_seen": 2432700, "step": 114, "time_per_iteration": 2.7867040634155273 }, { "auxiliary_loss_clip": 0.01605982, "auxiliary_loss_mlp": 0.01104479, "balance_loss_clip": 1.18399036, "balance_loss_mlp": 1.05414867, "epoch": 0.01382793242349546, "flos": 39202565512320.0, "grad_norm": 1.8781404078646662, "language_loss": 0.7770766, "learning_rate": 3.43744686339307e-06, "loss": 0.80418122, "num_input_tokens_seen": 2455020, "step": 115, "time_per_iteration": 2.95804500579834 }, { "auxiliary_loss_clip": 0.01589208, "auxiliary_loss_mlp": 0.01082202, "balance_loss_clip": 1.17315388, "balance_loss_mlp": 1.04038334, "epoch": 0.013948175314134552, "flos": 41353506714240.0, "grad_norm": 2.1967230438652026, "language_loss": 0.9094125, "learning_rate": 3.44371916188212e-06, "loss": 0.93612659, "num_input_tokens_seen": 2475775, "step": 116, "time_per_iteration": 2.918935537338257 }, { "auxiliary_loss_clip": 0.01587665, "auxiliary_loss_mlp": 0.0108938, "balance_loss_clip": 1.1716727, "balance_loss_mlp": 1.04958749, "epoch": 0.014068418204773643, "flos": 22453542028800.0, "grad_norm": 1.925590891466393, "language_loss": 0.85930252, "learning_rate": 3.449937620235143e-06, "loss": 0.88607299, "num_input_tokens_seen": 2496370, "step": 117, "time_per_iteration": 2.840698003768921 }, { "auxiliary_loss_clip": 0.01575804, "auxiliary_loss_mlp": 0.0108867, "balance_loss_clip": 1.17121637, "balance_loss_mlp": 1.04613566, "epoch": 0.014188661095412733, "flos": 23805147922560.0, "grad_norm": 1.6817446586472073, "language_loss": 0.89248705, "learning_rate": 3.456103154896722e-06, "loss": 0.91913176, "num_input_tokens_seen": 2517645, "step": 118, "time_per_iteration": 3.770326614379883 }, { "auxiliary_loss_clip": 0.0157077, "auxiliary_loss_mlp": 0.01090904, "balance_loss_clip": 1.16740608, "balance_loss_mlp": 1.04746413, "epoch": 0.014308903986051825, "flos": 23660248458240.0, "grad_norm": 1.906981293454238, "language_loss": 0.92221957, "learning_rate": 3.462216659109757e-06, "loss": 0.94883627, "num_input_tokens_seen": 2537825, "step": 119, "time_per_iteration": 3.6665713787078857 }, { "auxiliary_loss_clip": 0.01573906, "auxiliary_loss_mlp": 0.01093468, "balance_loss_clip": 1.16873968, "balance_loss_mlp": 1.05245972, "epoch": 0.014429146876690916, "flos": 20667991927680.0, "grad_norm": 3.6362147486086895, "language_loss": 0.8542521, "learning_rate": 3.4682790036921077e-06, "loss": 0.88092577, "num_input_tokens_seen": 2556485, "step": 120, "time_per_iteration": 3.668272018432617 }, { "auxiliary_loss_clip": 0.01551242, "auxiliary_loss_mlp": 0.01094067, "balance_loss_clip": 1.16201544, "balance_loss_mlp": 1.05687332, "epoch": 0.014549389767330006, "flos": 20229199384320.0, "grad_norm": 2.157556871413569, "language_loss": 0.83149475, "learning_rate": 3.4742910377810193e-06, "loss": 0.85794783, "num_input_tokens_seen": 2573945, "step": 121, "time_per_iteration": 2.722315549850464 }, { "auxiliary_loss_clip": 0.01556162, "auxiliary_loss_mlp": 0.010905, "balance_loss_clip": 1.16094136, "balance_loss_mlp": 1.0589931, "epoch": 0.014669632657969098, "flos": 18004174381440.0, "grad_norm": 2.0462045849774135, "language_loss": 0.88718939, "learning_rate": 3.4802535895469042e-06, "loss": 0.913656, "num_input_tokens_seen": 2592695, "step": 122, "time_per_iteration": 2.7231016159057617 }, { "auxiliary_loss_clip": 0.01546073, "auxiliary_loss_mlp": 0.01088378, "balance_loss_clip": 1.15642071, "balance_loss_mlp": 1.05397379, "epoch": 0.01478987554860819, "flos": 22741796672640.0, "grad_norm": 1.9352649669993673, "language_loss": 0.89613032, "learning_rate": 3.4861674668779934e-06, "loss": 0.9224748, "num_input_tokens_seen": 2610925, "step": 123, "time_per_iteration": 2.8793256282806396 }, { "auxiliary_loss_clip": 0.01544374, "auxiliary_loss_mlp": 0.01096678, "balance_loss_clip": 1.15946794, "balance_loss_mlp": 1.06115377, "epoch": 0.01491011843924728, "flos": 17198590106880.0, "grad_norm": 7.261317733517108, "language_loss": 0.842448, "learning_rate": 3.492033458037272e-06, "loss": 0.86885858, "num_input_tokens_seen": 2629495, "step": 124, "time_per_iteration": 2.7452244758605957 }, { "auxiliary_loss_clip": 0.01538929, "auxiliary_loss_mlp": 0.01083063, "balance_loss_clip": 1.1528852, "balance_loss_mlp": 1.04803932, "epoch": 0.01503036132988637, "flos": 17673867889920.0, "grad_norm": 2.314743309775876, "language_loss": 0.87087321, "learning_rate": 3.497852332293018e-06, "loss": 0.89709312, "num_input_tokens_seen": 2645070, "step": 125, "time_per_iteration": 2.803575038909912 }, { "auxiliary_loss_clip": 0.01544861, "auxiliary_loss_mlp": 0.01081872, "balance_loss_clip": 1.15903211, "balance_loss_mlp": 1.04989982, "epoch": 0.015150604220525462, "flos": 18878239935360.0, "grad_norm": 2.191550090483368, "language_loss": 0.9653185, "learning_rate": 3.5036248405242356e-06, "loss": 0.99158585, "num_input_tokens_seen": 2663825, "step": 126, "time_per_iteration": 2.801375150680542 }, { "auxiliary_loss_clip": 0.01534694, "auxiliary_loss_mlp": 0.01075439, "balance_loss_clip": 1.15346074, "balance_loss_mlp": 1.04277563, "epoch": 0.015270847111164552, "flos": 39420184060800.0, "grad_norm": 2.512329547628919, "language_loss": 0.82722437, "learning_rate": 3.509351715802146e-06, "loss": 0.85332572, "num_input_tokens_seen": 2684710, "step": 127, "time_per_iteration": 2.9612722396850586 }, { "auxiliary_loss_clip": 0.01536194, "auxiliary_loss_mlp": 0.01066496, "balance_loss_clip": 1.15500295, "balance_loss_mlp": 1.03347516, "epoch": 0.015391090001803644, "flos": 43762466286720.0, "grad_norm": 2.1161479046794214, "language_loss": 0.78413993, "learning_rate": 3.5150336739488763e-06, "loss": 0.81016678, "num_input_tokens_seen": 2706995, "step": 128, "time_per_iteration": 3.0303494930267334 }, { "auxiliary_loss_clip": 0.01518399, "auxiliary_loss_mlp": 0.0108415, "balance_loss_clip": 1.15139461, "balance_loss_mlp": 1.05279803, "epoch": 0.015511332892442733, "flos": 18916341287040.0, "grad_norm": 1.7588093714239625, "language_loss": 0.83949232, "learning_rate": 3.5206714140744143e-06, "loss": 0.86551774, "num_input_tokens_seen": 2727050, "step": 129, "time_per_iteration": 2.733250141143799 }, { "auxiliary_loss_clip": 0.01531244, "auxiliary_loss_mlp": 0.01080692, "balance_loss_clip": 1.15614152, "balance_loss_mlp": 1.04988813, "epoch": 0.015631575783081827, "flos": 24535283679360.0, "grad_norm": 5.39085427985959, "language_loss": 0.87959713, "learning_rate": 3.5262656190928208e-06, "loss": 0.90571654, "num_input_tokens_seen": 2745350, "step": 130, "time_per_iteration": 2.847851276397705 }, { "auxiliary_loss_clip": 0.01489378, "auxiliary_loss_mlp": 0.01122354, "balance_loss_clip": 1.20969009, "balance_loss_mlp": 1.10928869, "epoch": 0.015751818673720917, "flos": 62328536098560.0, "grad_norm": 1.0618178918088155, "language_loss": 0.71579093, "learning_rate": 3.5318169562186737e-06, "loss": 0.74190825, "num_input_tokens_seen": 2814195, "step": 131, "time_per_iteration": 3.2981908321380615 }, { "auxiliary_loss_clip": 0.0151314, "auxiliary_loss_mlp": 0.01093529, "balance_loss_clip": 1.14969826, "balance_loss_mlp": 1.06146169, "epoch": 0.015872061564360006, "flos": 23878549365120.0, "grad_norm": 2.015770904062002, "language_loss": 0.82017833, "learning_rate": 3.5373260774446292e-06, "loss": 0.84624505, "num_input_tokens_seen": 2834645, "step": 132, "time_per_iteration": 2.8385653495788574 }, { "auxiliary_loss_clip": 0.01508492, "auxiliary_loss_mlp": 0.0108493, "balance_loss_clip": 1.14303863, "balance_loss_mlp": 1.05422175, "epoch": 0.0159923044549991, "flos": 23367899664000.0, "grad_norm": 2.0105876321948974, "language_loss": 0.90227884, "learning_rate": 3.542793620000961e-06, "loss": 0.92821312, "num_input_tokens_seen": 2854120, "step": 133, "time_per_iteration": 2.783970594406128 }, { "auxiliary_loss_clip": 0.01500983, "auxiliary_loss_mlp": 0.01081917, "balance_loss_clip": 1.14379644, "balance_loss_mlp": 1.0495398, "epoch": 0.01611254734563819, "flos": 17858305249920.0, "grad_norm": 2.2864047048013783, "language_loss": 0.86791444, "learning_rate": 3.5482202067978894e-06, "loss": 0.8937434, "num_input_tokens_seen": 2871330, "step": 134, "time_per_iteration": 2.879163980484009 }, { "auxiliary_loss_clip": 0.01509313, "auxiliary_loss_mlp": 0.01070119, "balance_loss_clip": 1.14719629, "balance_loss_mlp": 1.03879106, "epoch": 0.01623279023627728, "flos": 20954774113920.0, "grad_norm": 3.3546537171823587, "language_loss": 0.76318514, "learning_rate": 3.553606446851471e-06, "loss": 0.78897941, "num_input_tokens_seen": 2888070, "step": 135, "time_per_iteration": 2.7965753078460693 }, { "auxiliary_loss_clip": 0.01495727, "auxiliary_loss_mlp": 0.01071947, "balance_loss_clip": 1.14303327, "balance_loss_mlp": 1.04191828, "epoch": 0.016353033126916373, "flos": 15742412743680.0, "grad_norm": 1.826323802988155, "language_loss": 0.83349037, "learning_rate": 3.5589529356937613e-06, "loss": 0.8591671, "num_input_tokens_seen": 2906465, "step": 136, "time_per_iteration": 2.7232277393341064 }, { "auxiliary_loss_clip": 0.01497661, "auxiliary_loss_mlp": 0.01064781, "balance_loss_clip": 1.14415514, "balance_loss_mlp": 1.03729129, "epoch": 0.016473276017555463, "flos": 18807280617600.0, "grad_norm": 1.7717107250541237, "language_loss": 0.7699219, "learning_rate": 3.5642602557679627e-06, "loss": 0.79554629, "num_input_tokens_seen": 2924915, "step": 137, "time_per_iteration": 2.7518906593322754 }, { "auxiliary_loss_clip": 0.01497824, "auxiliary_loss_mlp": 0.01086892, "balance_loss_clip": 1.14766812, "balance_loss_mlp": 1.05836535, "epoch": 0.016593518908194552, "flos": 24352641999360.0, "grad_norm": 3.9288123953432748, "language_loss": 0.84201348, "learning_rate": 3.569528976809202e-06, "loss": 0.86786062, "num_input_tokens_seen": 2942130, "step": 138, "time_per_iteration": 2.75209903717041 }, { "auxiliary_loss_clip": 0.01488303, "auxiliary_loss_mlp": 0.01072239, "balance_loss_clip": 1.13988817, "balance_loss_mlp": 1.04269862, "epoch": 0.016713761798833646, "flos": 22346133384960.0, "grad_norm": 1.7496705604914897, "language_loss": 0.89873123, "learning_rate": 3.5747596562115522e-06, "loss": 0.92433667, "num_input_tokens_seen": 2962745, "step": 139, "time_per_iteration": 2.821406126022339 }, { "auxiliary_loss_clip": 0.0149455, "auxiliary_loss_mlp": 0.01071879, "balance_loss_clip": 1.14374828, "balance_loss_mlp": 1.04421031, "epoch": 0.016834004689472735, "flos": 17821820010240.0, "grad_norm": 3.325422290350783, "language_loss": 0.91135687, "learning_rate": 3.5799528393819138e-06, "loss": 0.93702114, "num_input_tokens_seen": 2981825, "step": 140, "time_per_iteration": 2.7482259273529053 }, { "auxiliary_loss_clip": 0.01477492, "auxiliary_loss_mlp": 0.01075178, "balance_loss_clip": 1.13642311, "balance_loss_mlp": 1.04907155, "epoch": 0.016954247580111825, "flos": 20519501103360.0, "grad_norm": 2.667511668127444, "language_loss": 0.88013244, "learning_rate": 3.585109060081286e-06, "loss": 0.90565908, "num_input_tokens_seen": 3001625, "step": 141, "time_per_iteration": 2.7828361988067627 }, { "auxiliary_loss_clip": 0.01482129, "auxiliary_loss_mlp": 0.0106384, "balance_loss_clip": 1.13785815, "balance_loss_mlp": 1.03956938, "epoch": 0.017074490470750915, "flos": 22088869200000.0, "grad_norm": 1.8975787568347704, "language_loss": 0.78479123, "learning_rate": 3.590228840753992e-06, "loss": 0.810251, "num_input_tokens_seen": 3022055, "step": 142, "time_per_iteration": 3.0053141117095947 }, { "auxiliary_loss_clip": 0.01475871, "auxiliary_loss_mlp": 0.01067983, "balance_loss_clip": 1.13650632, "balance_loss_mlp": 1.0405525, "epoch": 0.01719473336139001, "flos": 15997270717440.0, "grad_norm": 3.965304335669434, "language_loss": 0.87404269, "learning_rate": 3.5953126928453423e-06, "loss": 0.89948124, "num_input_tokens_seen": 3039605, "step": 143, "time_per_iteration": 2.7325820922851562 }, { "auxiliary_loss_clip": 0.01469906, "auxiliary_loss_mlp": 0.01071633, "balance_loss_clip": 1.13682842, "balance_loss_mlp": 1.04462016, "epoch": 0.017314976252029098, "flos": 22492038430080.0, "grad_norm": 2.108917571855877, "language_loss": 0.80516148, "learning_rate": 3.600361117108239e-06, "loss": 0.83057684, "num_input_tokens_seen": 3059405, "step": 144, "time_per_iteration": 2.823667526245117 }, { "auxiliary_loss_clip": 0.0147108, "auxiliary_loss_mlp": 0.01054433, "balance_loss_clip": 1.1365416, "balance_loss_mlp": 1.02964973, "epoch": 0.017435219142668188, "flos": 22018053536640.0, "grad_norm": 1.8481420239443216, "language_loss": 0.97110063, "learning_rate": 3.6053746038991616e-06, "loss": 0.99635577, "num_input_tokens_seen": 3078490, "step": 145, "time_per_iteration": 3.7023744583129883 }, { "auxiliary_loss_clip": 0.01427659, "auxiliary_loss_mlp": 0.0105978, "balance_loss_clip": 1.17067599, "balance_loss_mlp": 1.05100632, "epoch": 0.01755546203330728, "flos": 72240526149120.0, "grad_norm": 1.0663277604125132, "language_loss": 0.58527219, "learning_rate": 3.6103536334639843e-06, "loss": 0.61014664, "num_input_tokens_seen": 3131755, "step": 146, "time_per_iteration": 5.162090301513672 }, { "auxiliary_loss_clip": 0.01455639, "auxiliary_loss_mlp": 0.01066167, "balance_loss_clip": 1.12897897, "balance_loss_mlp": 1.04234934, "epoch": 0.01767570492394637, "flos": 25337061112320.0, "grad_norm": 1.9790540925266615, "language_loss": 0.85418373, "learning_rate": 3.615298676214041e-06, "loss": 0.8794018, "num_input_tokens_seen": 3152035, "step": 147, "time_per_iteration": 3.7477235794067383 }, { "auxiliary_loss_clip": 0.01461689, "auxiliary_loss_mlp": 0.01063071, "balance_loss_clip": 1.13180065, "balance_loss_mlp": 1.03957534, "epoch": 0.01779594781458546, "flos": 20449188230400.0, "grad_norm": 2.256235109162072, "language_loss": 0.89056456, "learning_rate": 3.6202101929928317e-06, "loss": 0.91581213, "num_input_tokens_seen": 3170625, "step": 148, "time_per_iteration": 2.7935397624969482 }, { "auxiliary_loss_clip": 0.01452222, "auxiliary_loss_mlp": 0.01083033, "balance_loss_clip": 1.12925434, "balance_loss_mlp": 1.05872679, "epoch": 0.017916190705224554, "flos": 16253601148800.0, "grad_norm": 1.8873662640801434, "language_loss": 0.8843621, "learning_rate": 3.6250886353337413e-06, "loss": 0.9097147, "num_input_tokens_seen": 3188155, "step": 149, "time_per_iteration": 2.776301622390747 }, { "auxiliary_loss_clip": 0.01453469, "auxiliary_loss_mlp": 0.01076682, "balance_loss_clip": 1.12669659, "balance_loss_mlp": 1.05338871, "epoch": 0.018036433595863644, "flos": 23330588411520.0, "grad_norm": 1.9068660312678443, "language_loss": 0.86418921, "learning_rate": 3.6299344457091488e-06, "loss": 0.88949072, "num_input_tokens_seen": 3209015, "step": 150, "time_per_iteration": 2.9013586044311523 }, { "auxiliary_loss_clip": 0.01452603, "auxiliary_loss_mlp": 0.01056302, "balance_loss_clip": 1.12729383, "balance_loss_mlp": 1.03432012, "epoch": 0.018156676486502734, "flos": 18588010043520.0, "grad_norm": 2.7142489790643425, "language_loss": 0.93932533, "learning_rate": 3.634748057771256e-06, "loss": 0.96441436, "num_input_tokens_seen": 3224955, "step": 151, "time_per_iteration": 2.8608245849609375 }, { "auxiliary_loss_clip": 0.01441356, "auxiliary_loss_mlp": 0.01065498, "balance_loss_clip": 1.12395728, "balance_loss_mlp": 1.04300284, "epoch": 0.018276919377141827, "flos": 25448707560960.0, "grad_norm": 1.7520614381110933, "language_loss": 0.85843146, "learning_rate": 3.639529896584965e-06, "loss": 0.88349998, "num_input_tokens_seen": 3246330, "step": 152, "time_per_iteration": 2.9517054557800293 }, { "auxiliary_loss_clip": 0.01452159, "auxiliary_loss_mlp": 0.01069125, "balance_loss_clip": 1.1275754, "balance_loss_mlp": 1.04686821, "epoch": 0.018397162267780917, "flos": 20047311889920.0, "grad_norm": 2.7620214992192986, "language_loss": 0.89052153, "learning_rate": 3.6442803788531233e-06, "loss": 0.91573435, "num_input_tokens_seen": 3264290, "step": 153, "time_per_iteration": 2.9430079460144043 }, { "auxiliary_loss_clip": 0.01454678, "auxiliary_loss_mlp": 0.01082058, "balance_loss_clip": 1.12816453, "balance_loss_mlp": 1.05900264, "epoch": 0.018517405158420007, "flos": 27565282425600.0, "grad_norm": 3.8101731171629516, "language_loss": 0.95912588, "learning_rate": 3.6489999131344357e-06, "loss": 0.98449314, "num_input_tokens_seen": 3287065, "step": 154, "time_per_iteration": 2.859893798828125 }, { "auxiliary_loss_clip": 0.01437865, "auxiliary_loss_mlp": 0.01066095, "balance_loss_clip": 1.12605584, "balance_loss_mlp": 1.04475701, "epoch": 0.0186376480490591, "flos": 19354056422400.0, "grad_norm": 2.099246339287386, "language_loss": 0.9066745, "learning_rate": 3.653688900054313e-06, "loss": 0.93171406, "num_input_tokens_seen": 3305595, "step": 155, "time_per_iteration": 2.8999202251434326 }, { "auxiliary_loss_clip": 0.01443615, "auxiliary_loss_mlp": 0.01059834, "balance_loss_clip": 1.12343311, "balance_loss_mlp": 1.03708935, "epoch": 0.01875789093969819, "flos": 26687840993280.0, "grad_norm": 1.9395159291819726, "language_loss": 0.76151037, "learning_rate": 3.6583477325089526e-06, "loss": 0.78654492, "num_input_tokens_seen": 3326135, "step": 156, "time_per_iteration": 2.8902225494384766 }, { "auxiliary_loss_clip": 0.01438561, "auxiliary_loss_mlp": 0.01066686, "balance_loss_clip": 1.12534213, "balance_loss_mlp": 1.04460835, "epoch": 0.01887813383033728, "flos": 24353001135360.0, "grad_norm": 2.4392812679877953, "language_loss": 1.04014206, "learning_rate": 3.6629767958628916e-06, "loss": 1.06519461, "num_input_tokens_seen": 3343510, "step": 157, "time_per_iteration": 2.7752747535705566 }, { "auxiliary_loss_clip": 0.01443233, "auxiliary_loss_mlp": 0.01069782, "balance_loss_clip": 1.12450647, "balance_loss_mlp": 1.04856324, "epoch": 0.018998376720976373, "flos": 14647532330880.0, "grad_norm": 2.0837961520300916, "language_loss": 0.85604715, "learning_rate": 3.667576468140291e-06, "loss": 0.88117731, "num_input_tokens_seen": 3361325, "step": 158, "time_per_iteration": 2.7817928791046143 }, { "auxiliary_loss_clip": 0.01434222, "auxiliary_loss_mlp": 0.01058272, "balance_loss_clip": 1.12250996, "balance_loss_mlp": 1.03763735, "epoch": 0.019118619611615463, "flos": 29305261146240.0, "grad_norm": 2.4475144681527965, "language_loss": 0.88940084, "learning_rate": 3.672147120210184e-06, "loss": 0.91432571, "num_input_tokens_seen": 3377925, "step": 159, "time_per_iteration": 2.94972825050354 }, { "auxiliary_loss_clip": 0.01437843, "auxiliary_loss_mlp": 0.01064038, "balance_loss_clip": 1.12586784, "balance_loss_mlp": 1.04176974, "epoch": 0.019238862502254553, "flos": 20886723797760.0, "grad_norm": 2.1060408400438786, "language_loss": 0.86102581, "learning_rate": 3.6766891159659177e-06, "loss": 0.88604462, "num_input_tokens_seen": 3396335, "step": 160, "time_per_iteration": 2.861445426940918 }, { "auxiliary_loss_clip": 0.01426619, "auxiliary_loss_mlp": 0.01070959, "balance_loss_clip": 1.12065387, "balance_loss_mlp": 1.04808283, "epoch": 0.019359105392893646, "flos": 21360672777600.0, "grad_norm": 2.878680084264272, "language_loss": 0.87978154, "learning_rate": 3.6812028124990075e-06, "loss": 0.90475732, "num_input_tokens_seen": 3413605, "step": 161, "time_per_iteration": 2.7992868423461914 }, { "auxiliary_loss_clip": 0.01421058, "auxiliary_loss_mlp": 0.01055264, "balance_loss_clip": 1.11692226, "balance_loss_mlp": 1.03510547, "epoch": 0.019479348283532736, "flos": 16283729681280.0, "grad_norm": 5.121601731141664, "language_loss": 0.81450897, "learning_rate": 3.6856885602676016e-06, "loss": 0.83927214, "num_input_tokens_seen": 3429640, "step": 162, "time_per_iteration": 2.8092498779296875 }, { "auxiliary_loss_clip": 0.01426916, "auxiliary_loss_mlp": 0.01066086, "balance_loss_clip": 1.12022662, "balance_loss_mlp": 1.04666686, "epoch": 0.019599591174171826, "flos": 22091239497600.0, "grad_norm": 2.1532510477321534, "language_loss": 0.94107336, "learning_rate": 3.6901467032597733e-06, "loss": 0.96600336, "num_input_tokens_seen": 3448125, "step": 163, "time_per_iteration": 2.8487915992736816 }, { "auxiliary_loss_clip": 0.01427101, "auxiliary_loss_mlp": 0.01061774, "balance_loss_clip": 1.11874282, "balance_loss_mlp": 1.04143667, "epoch": 0.01971983406481092, "flos": 19609668581760.0, "grad_norm": 2.206185974703032, "language_loss": 0.87379444, "learning_rate": 3.694577579151804e-06, "loss": 0.89868307, "num_input_tokens_seen": 3466535, "step": 164, "time_per_iteration": 2.887148141860962 }, { "auxiliary_loss_clip": 0.01421665, "auxiliary_loss_mlp": 0.01066907, "balance_loss_clip": 1.12039554, "balance_loss_mlp": 1.04691601, "epoch": 0.01984007695545001, "flos": 19099342103040.0, "grad_norm": 2.276538317418956, "language_loss": 0.73576796, "learning_rate": 3.6989815194616703e-06, "loss": 0.76065373, "num_input_tokens_seen": 3483730, "step": 165, "time_per_iteration": 2.8569586277008057 }, { "auxiliary_loss_clip": 0.01429303, "auxiliary_loss_mlp": 0.01070127, "balance_loss_clip": 1.11823416, "balance_loss_mlp": 1.04823995, "epoch": 0.0199603198460891, "flos": 20848406964480.0, "grad_norm": 4.08622385482129, "language_loss": 0.79881382, "learning_rate": 3.703358849697888e-06, "loss": 0.82380813, "num_input_tokens_seen": 3503640, "step": 166, "time_per_iteration": 2.847348690032959 }, { "auxiliary_loss_clip": 0.0141941, "auxiliary_loss_mlp": 0.01063396, "balance_loss_clip": 1.11780179, "balance_loss_mlp": 1.04309464, "epoch": 0.020080562736728192, "flos": 21870747861120.0, "grad_norm": 1.952512468450802, "language_loss": 0.82658011, "learning_rate": 3.7077098895038803e-06, "loss": 0.85140818, "num_input_tokens_seen": 3523010, "step": 167, "time_per_iteration": 2.895167827606201 }, { "auxiliary_loss_clip": 0.01426635, "auxiliary_loss_mlp": 0.01053949, "balance_loss_clip": 1.122401, "balance_loss_mlp": 1.0326817, "epoch": 0.020200805627367282, "flos": 21688788539520.0, "grad_norm": 2.7303998099510762, "language_loss": 0.96938533, "learning_rate": 3.712034952798045e-06, "loss": 0.99419123, "num_input_tokens_seen": 3541125, "step": 168, "time_per_iteration": 2.960090398788452 }, { "auxiliary_loss_clip": 0.01429478, "auxiliary_loss_mlp": 0.01060669, "balance_loss_clip": 1.11801815, "balance_loss_mlp": 1.03891301, "epoch": 0.02032104851800637, "flos": 33543043729920.0, "grad_norm": 2.616898472657432, "language_loss": 0.8488369, "learning_rate": 3.7163343479096656e-06, "loss": 0.87373835, "num_input_tokens_seen": 3562700, "step": 169, "time_per_iteration": 3.042736768722534 }, { "auxiliary_loss_clip": 0.01416908, "auxiliary_loss_mlp": 0.01066104, "balance_loss_clip": 1.11606753, "balance_loss_mlp": 1.04608905, "epoch": 0.020441291408645465, "flos": 31686965274240.0, "grad_norm": 4.381136376339378, "language_loss": 0.82700789, "learning_rate": 3.720608377710802e-06, "loss": 0.85183805, "num_input_tokens_seen": 3582790, "step": 170, "time_per_iteration": 2.8863537311553955 }, { "auxiliary_loss_clip": 0.01413617, "auxiliary_loss_mlp": 0.01057147, "balance_loss_clip": 1.11486959, "balance_loss_mlp": 1.03529584, "epoch": 0.020561534299284555, "flos": 20886687884160.0, "grad_norm": 2.400236343040946, "language_loss": 0.8663826, "learning_rate": 3.7248573397443277e-06, "loss": 0.89109021, "num_input_tokens_seen": 3601715, "step": 171, "time_per_iteration": 3.772618532180786 }, { "auxiliary_loss_clip": 0.01413857, "auxiliary_loss_mlp": 0.01053935, "balance_loss_clip": 1.11581278, "balance_loss_mlp": 1.03437245, "epoch": 0.020681777189923645, "flos": 20996610480000.0, "grad_norm": 2.7898531352510982, "language_loss": 0.97356367, "learning_rate": 3.729081526348224e-06, "loss": 0.9982416, "num_input_tokens_seen": 3620245, "step": 172, "time_per_iteration": 4.725063323974609 }, { "auxiliary_loss_clip": 0.01409262, "auxiliary_loss_mlp": 0.01058648, "balance_loss_clip": 1.11371493, "balance_loss_mlp": 1.03960991, "epoch": 0.020802020080562738, "flos": 28257532312320.0, "grad_norm": 2.342300000273299, "language_loss": 0.84906852, "learning_rate": 3.7332812247762777e-06, "loss": 0.87374759, "num_input_tokens_seen": 3641545, "step": 173, "time_per_iteration": 4.509430408477783 }, { "auxiliary_loss_clip": 0.01413568, "auxiliary_loss_mlp": 0.0106375, "balance_loss_clip": 1.11582756, "balance_loss_mlp": 1.04510617, "epoch": 0.020922262971201828, "flos": 19681274344320.0, "grad_norm": 2.336676967339027, "language_loss": 0.95439529, "learning_rate": 3.737456717315293e-06, "loss": 0.97916842, "num_input_tokens_seen": 3660510, "step": 174, "time_per_iteration": 2.8510043621063232 }, { "auxiliary_loss_clip": 0.01408619, "auxiliary_loss_mlp": 0.01058613, "balance_loss_clip": 1.11615229, "balance_loss_mlp": 1.04039764, "epoch": 0.021042505861840918, "flos": 15666353694720.0, "grad_norm": 2.03788154736055, "language_loss": 0.90792227, "learning_rate": 3.7416082813989552e-06, "loss": 0.9325946, "num_input_tokens_seen": 3677505, "step": 175, "time_per_iteration": 2.8411688804626465 }, { "auxiliary_loss_clip": 0.01413191, "auxiliary_loss_mlp": 0.01057988, "balance_loss_clip": 1.11426222, "balance_loss_mlp": 1.038831, "epoch": 0.02116274875248001, "flos": 21142012734720.0, "grad_norm": 2.3410994578035687, "language_loss": 0.89677417, "learning_rate": 3.745736189718439e-06, "loss": 0.9214859, "num_input_tokens_seen": 3696760, "step": 176, "time_per_iteration": 2.9625511169433594 }, { "auxiliary_loss_clip": 0.01403533, "auxiliary_loss_mlp": 0.0106132, "balance_loss_clip": 1.112198, "balance_loss_mlp": 1.04417777, "epoch": 0.0212829916431191, "flos": 24715770543360.0, "grad_norm": 3.2869308137075985, "language_loss": 0.7276783, "learning_rate": 3.749840710329894e-06, "loss": 0.75232685, "num_input_tokens_seen": 3717465, "step": 177, "time_per_iteration": 2.8339407444000244 }, { "auxiliary_loss_clip": 0.01418722, "auxiliary_loss_mlp": 0.0105815, "balance_loss_clip": 1.11889255, "balance_loss_mlp": 1.03889775, "epoch": 0.02140323453375819, "flos": 16645493508480.0, "grad_norm": 2.8680833140168467, "language_loss": 0.98235011, "learning_rate": 3.7539221067588938e-06, "loss": 1.00711882, "num_input_tokens_seen": 3731440, "step": 178, "time_per_iteration": 2.788883924484253 }, { "auxiliary_loss_clip": 0.01413388, "auxiliary_loss_mlp": 0.01066921, "balance_loss_clip": 1.11543417, "balance_loss_mlp": 1.04669166, "epoch": 0.021523477424397284, "flos": 20299332689280.0, "grad_norm": 3.92826722627951, "language_loss": 0.93772984, "learning_rate": 3.757980638101964e-06, "loss": 0.962533, "num_input_tokens_seen": 3744935, "step": 179, "time_per_iteration": 2.7027599811553955 }, { "auxiliary_loss_clip": 0.01412451, "auxiliary_loss_mlp": 0.01053735, "balance_loss_clip": 1.11511683, "balance_loss_mlp": 1.03393471, "epoch": 0.021643720315036374, "flos": 26104005331200.0, "grad_norm": 5.13390803109414, "language_loss": 0.89641804, "learning_rate": 3.7620165591252806e-06, "loss": 0.92107987, "num_input_tokens_seen": 3763035, "step": 180, "time_per_iteration": 3.0522994995117188 }, { "auxiliary_loss_clip": 0.01400262, "auxiliary_loss_mlp": 0.01057177, "balance_loss_clip": 1.11049175, "balance_loss_mlp": 1.03882492, "epoch": 0.021763963205675464, "flos": 24787663614720.0, "grad_norm": 1.957978582786506, "language_loss": 0.94240814, "learning_rate": 3.766030120360636e-06, "loss": 0.96698248, "num_input_tokens_seen": 3782665, "step": 181, "time_per_iteration": 2.9523510932922363 }, { "auxiliary_loss_clip": 0.01397571, "auxiliary_loss_mlp": 0.01055716, "balance_loss_clip": 1.10744953, "balance_loss_mlp": 1.03832364, "epoch": 0.021884206096314557, "flos": 25813559957760.0, "grad_norm": 2.3935198662757777, "language_loss": 0.90360129, "learning_rate": 3.7700215681987578e-06, "loss": 0.92813414, "num_input_tokens_seen": 3802435, "step": 182, "time_per_iteration": 2.8956785202026367 }, { "auxiliary_loss_clip": 0.01404168, "auxiliary_loss_mlp": 0.01060731, "balance_loss_clip": 1.11018026, "balance_loss_mlp": 1.04070342, "epoch": 0.022004448986953647, "flos": 20082719721600.0, "grad_norm": 1.7125837382237554, "language_loss": 0.82105464, "learning_rate": 3.7739911449800767e-06, "loss": 0.84570366, "num_input_tokens_seen": 3822490, "step": 183, "time_per_iteration": 2.8408918380737305 }, { "auxiliary_loss_clip": 0.01400797, "auxiliary_loss_mlp": 0.0106522, "balance_loss_clip": 1.10934591, "balance_loss_mlp": 1.04582477, "epoch": 0.022124691877592736, "flos": 20480609652480.0, "grad_norm": 2.5063380905704835, "language_loss": 0.80638361, "learning_rate": 3.7779390890830114e-06, "loss": 0.83104372, "num_input_tokens_seen": 3841140, "step": 184, "time_per_iteration": 3.0006637573242188 }, { "auxiliary_loss_clip": 0.01399336, "auxiliary_loss_mlp": 0.01053862, "balance_loss_clip": 1.10539579, "balance_loss_mlp": 1.03586102, "epoch": 0.02224493476823183, "flos": 23586847015680.0, "grad_norm": 1.8701675691113178, "language_loss": 0.86091632, "learning_rate": 3.7818656350098723e-06, "loss": 0.88544828, "num_input_tokens_seen": 3862090, "step": 185, "time_per_iteration": 2.9074525833129883 }, { "auxiliary_loss_clip": 0.01400879, "auxiliary_loss_mlp": 0.01056308, "balance_loss_clip": 1.11024189, "balance_loss_mlp": 1.03772306, "epoch": 0.02236517765887092, "flos": 16909940413440.0, "grad_norm": 2.8935338518973515, "language_loss": 0.77159846, "learning_rate": 3.7857710134704447e-06, "loss": 0.79617029, "num_input_tokens_seen": 3881025, "step": 186, "time_per_iteration": 2.787224769592285 }, { "auxiliary_loss_clip": 0.01395475, "auxiliary_loss_mlp": 0.0104979, "balance_loss_clip": 1.11084688, "balance_loss_mlp": 1.03193259, "epoch": 0.02248542054951001, "flos": 43508182930560.0, "grad_norm": 2.7835211111969693, "language_loss": 0.79261839, "learning_rate": 3.7896554514633234e-06, "loss": 0.81707096, "num_input_tokens_seen": 3905310, "step": 187, "time_per_iteration": 3.0838499069213867 }, { "auxiliary_loss_clip": 0.01395948, "auxiliary_loss_mlp": 0.01061845, "balance_loss_clip": 1.10805297, "balance_loss_mlp": 1.04265237, "epoch": 0.022605663440149103, "flos": 23367648268800.0, "grad_norm": 1.9558132512893731, "language_loss": 0.84049499, "learning_rate": 3.7935191723550955e-06, "loss": 0.86507291, "num_input_tokens_seen": 3924265, "step": 188, "time_per_iteration": 2.7603394985198975 }, { "auxiliary_loss_clip": 0.01388439, "auxiliary_loss_mlp": 0.01061808, "balance_loss_clip": 1.10672164, "balance_loss_mlp": 1.0431875, "epoch": 0.022725906330788193, "flos": 29019915504000.0, "grad_norm": 2.091130946882619, "language_loss": 0.88578439, "learning_rate": 3.797362395957408e-06, "loss": 0.9102869, "num_input_tokens_seen": 3944830, "step": 189, "time_per_iteration": 2.8311073780059814 }, { "auxiliary_loss_clip": 0.01400722, "auxiliary_loss_mlp": 0.01052336, "balance_loss_clip": 1.11303163, "balance_loss_mlp": 1.03395438, "epoch": 0.022846149221427282, "flos": 24496176746880.0, "grad_norm": 3.6644763053099942, "language_loss": 0.78317755, "learning_rate": 3.8011853386020055e-06, "loss": 0.80770814, "num_input_tokens_seen": 3965735, "step": 190, "time_per_iteration": 2.74949312210083 }, { "auxiliary_loss_clip": 0.01394473, "auxiliary_loss_mlp": 0.01066882, "balance_loss_clip": 1.10827041, "balance_loss_mlp": 1.04681873, "epoch": 0.022966392112066376, "flos": 15523537219200.0, "grad_norm": 18.190000642405728, "language_loss": 0.89665169, "learning_rate": 3.804988213213804e-06, "loss": 0.92126524, "num_input_tokens_seen": 3983975, "step": 191, "time_per_iteration": 2.643934965133667 }, { "auxiliary_loss_clip": 0.01382352, "auxiliary_loss_mlp": 0.01028123, "balance_loss_clip": 1.15664887, "balance_loss_mlp": 1.0204463, "epoch": 0.023086635002705466, "flos": 55650408433920.0, "grad_norm": 1.0198405636131567, "language_loss": 0.63119149, "learning_rate": 3.808771229382049e-06, "loss": 0.65529621, "num_input_tokens_seen": 4043440, "step": 192, "time_per_iteration": 3.1428020000457764 }, { "auxiliary_loss_clip": 0.01387545, "auxiliary_loss_mlp": 0.01055287, "balance_loss_clip": 1.10394669, "balance_loss_mlp": 1.03764451, "epoch": 0.023206877893344555, "flos": 19313441118720.0, "grad_norm": 2.0386159939108266, "language_loss": 0.84386778, "learning_rate": 3.8125345934296324e-06, "loss": 0.86829615, "num_input_tokens_seen": 4061750, "step": 193, "time_per_iteration": 2.700408935546875 }, { "auxiliary_loss_clip": 0.01395999, "auxiliary_loss_mlp": 0.01047126, "balance_loss_clip": 1.10739911, "balance_loss_mlp": 1.0297339, "epoch": 0.02332712078398365, "flos": 23072965090560.0, "grad_norm": 2.1261933172657663, "language_loss": 0.87797987, "learning_rate": 3.81627850848061e-06, "loss": 0.9024111, "num_input_tokens_seen": 4082345, "step": 194, "time_per_iteration": 2.723050355911255 }, { "auxiliary_loss_clip": 0.01382668, "auxiliary_loss_mlp": 0.01059555, "balance_loss_clip": 1.10182691, "balance_loss_mlp": 1.04237676, "epoch": 0.02344736367462274, "flos": 24425971614720.0, "grad_norm": 4.926104658184472, "language_loss": 0.85863626, "learning_rate": 3.820003174525994e-06, "loss": 0.88305843, "num_input_tokens_seen": 4101770, "step": 195, "time_per_iteration": 2.6876473426818848 }, { "auxiliary_loss_clip": 0.01388321, "auxiliary_loss_mlp": 0.01063812, "balance_loss_clip": 1.10611331, "balance_loss_mlp": 1.04584706, "epoch": 0.02356760656526183, "flos": 21579799697280.0, "grad_norm": 2.319081129398241, "language_loss": 0.8321718, "learning_rate": 3.823708788487851e-06, "loss": 0.85669315, "num_input_tokens_seen": 4118770, "step": 196, "time_per_iteration": 2.898146390914917 }, { "auxiliary_loss_clip": 0.01385547, "auxiliary_loss_mlp": 0.01065099, "balance_loss_clip": 1.10821486, "balance_loss_mlp": 1.047611, "epoch": 0.02368784945590092, "flos": 25193598192000.0, "grad_norm": 1.8217452491005444, "language_loss": 0.84488428, "learning_rate": 3.827395544281781e-06, "loss": 0.86939073, "num_input_tokens_seen": 4141110, "step": 197, "time_per_iteration": 3.7884578704833984 }, { "auxiliary_loss_clip": 0.01383269, "auxiliary_loss_mlp": 0.01057552, "balance_loss_clip": 1.1036694, "balance_loss_mlp": 1.03977764, "epoch": 0.02380809234654001, "flos": 27562481164800.0, "grad_norm": 2.5061029128595886, "language_loss": 0.79253548, "learning_rate": 3.831063632877802e-06, "loss": 0.81694376, "num_input_tokens_seen": 4161430, "step": 198, "time_per_iteration": 3.822237730026245 }, { "auxiliary_loss_clip": 0.01388741, "auxiliary_loss_mlp": 0.01064444, "balance_loss_clip": 1.1099875, "balance_loss_mlp": 1.04706359, "epoch": 0.0239283352371791, "flos": 18259786540800.0, "grad_norm": 2.5854835755926406, "language_loss": 0.75891387, "learning_rate": 3.834713242359712e-06, "loss": 0.78344572, "num_input_tokens_seen": 4179260, "step": 199, "time_per_iteration": 3.992347240447998 }, { "auxiliary_loss_clip": 0.0138463, "auxiliary_loss_mlp": 0.0105978, "balance_loss_clip": 1.10382211, "balance_loss_mlp": 1.04179168, "epoch": 0.02404857812781819, "flos": 21395110942080.0, "grad_norm": 1.808582828290575, "language_loss": 0.87028229, "learning_rate": 3.838344557982959e-06, "loss": 0.8947264, "num_input_tokens_seen": 4200640, "step": 200, "time_per_iteration": 4.35762357711792 }, { "auxiliary_loss_clip": 0.01377269, "auxiliary_loss_mlp": 0.01059963, "balance_loss_clip": 1.10019851, "balance_loss_mlp": 1.04230785, "epoch": 0.024168821018457284, "flos": 16654256426880.0, "grad_norm": 3.33442721989088, "language_loss": 0.84975541, "learning_rate": 3.841957762231063e-06, "loss": 0.87412769, "num_input_tokens_seen": 4218170, "step": 201, "time_per_iteration": 2.811209201812744 }, { "auxiliary_loss_clip": 0.01376805, "auxiliary_loss_mlp": 0.01058342, "balance_loss_clip": 1.10008597, "balance_loss_mlp": 1.04108012, "epoch": 0.024289063909096374, "flos": 22820872464000.0, "grad_norm": 2.1314749189311955, "language_loss": 0.87830037, "learning_rate": 3.8455530348706454e-06, "loss": 0.90265191, "num_input_tokens_seen": 4237770, "step": 202, "time_per_iteration": 2.8144078254699707 }, { "auxiliary_loss_clip": 0.01380381, "auxiliary_loss_mlp": 0.01068892, "balance_loss_clip": 1.10520148, "balance_loss_mlp": 1.05089188, "epoch": 0.024409306799735464, "flos": 17748598135680.0, "grad_norm": 2.0199057387890758, "language_loss": 0.77470893, "learning_rate": 3.849130553005099e-06, "loss": 0.79920167, "num_input_tokens_seen": 4255985, "step": 203, "time_per_iteration": 2.713247537612915 }, { "auxiliary_loss_clip": 0.01379844, "auxiliary_loss_mlp": 0.01053162, "balance_loss_clip": 1.10251141, "balance_loss_mlp": 1.03635371, "epoch": 0.024529549690374557, "flos": 21616213109760.0, "grad_norm": 3.324196977077497, "language_loss": 0.83609068, "learning_rate": 3.852690491126933e-06, "loss": 0.8604207, "num_input_tokens_seen": 4276035, "step": 204, "time_per_iteration": 2.872939348220825 }, { "auxiliary_loss_clip": 0.01374827, "auxiliary_loss_mlp": 0.01062696, "balance_loss_clip": 1.09742403, "balance_loss_mlp": 1.04442143, "epoch": 0.024649792581013647, "flos": 25551662918400.0, "grad_norm": 2.4731100254754086, "language_loss": 0.91180462, "learning_rate": 3.856233021168845e-06, "loss": 0.93617982, "num_input_tokens_seen": 4295730, "step": 205, "time_per_iteration": 2.912294864654541 }, { "auxiliary_loss_clip": 0.01365764, "auxiliary_loss_mlp": 0.01059547, "balance_loss_clip": 1.09741199, "balance_loss_mlp": 1.04383528, "epoch": 0.024770035471652737, "flos": 34495574544000.0, "grad_norm": 2.335356856652316, "language_loss": 0.91265738, "learning_rate": 3.859758312553544e-06, "loss": 0.93691051, "num_input_tokens_seen": 4317950, "step": 206, "time_per_iteration": 3.0504953861236572 }, { "auxiliary_loss_clip": 0.01378355, "auxiliary_loss_mlp": 0.01055588, "balance_loss_clip": 1.10356236, "balance_loss_mlp": 1.03970885, "epoch": 0.02489027836229183, "flos": 21505428587520.0, "grad_norm": 1.8916638960907033, "language_loss": 0.91866714, "learning_rate": 3.8632665322423735e-06, "loss": 0.94300658, "num_input_tokens_seen": 4337605, "step": 207, "time_per_iteration": 2.764852285385132 }, { "auxiliary_loss_clip": 0.01369059, "auxiliary_loss_mlp": 0.01058639, "balance_loss_clip": 1.09862745, "balance_loss_mlp": 1.04286742, "epoch": 0.02501052125293092, "flos": 23219013790080.0, "grad_norm": 1.767724440057138, "language_loss": 0.86057639, "learning_rate": 3.866757844782762e-06, "loss": 0.88485336, "num_input_tokens_seen": 4358110, "step": 208, "time_per_iteration": 2.978095769882202 }, { "auxiliary_loss_clip": 0.01376909, "auxiliary_loss_mlp": 0.01065027, "balance_loss_clip": 1.10240662, "balance_loss_mlp": 1.04826641, "epoch": 0.02513076414357001, "flos": 26388920010240.0, "grad_norm": 3.087107560186556, "language_loss": 0.91740888, "learning_rate": 3.870232412354527e-06, "loss": 0.94182825, "num_input_tokens_seen": 4374955, "step": 209, "time_per_iteration": 2.859707832336426 }, { "auxiliary_loss_clip": 0.01368467, "auxiliary_loss_mlp": 0.01058434, "balance_loss_clip": 1.09654808, "balance_loss_mlp": 1.04145837, "epoch": 0.025251007034209103, "flos": 13590430047360.0, "grad_norm": 2.245914970413465, "language_loss": 0.9252156, "learning_rate": 3.873690394815086e-06, "loss": 0.94948459, "num_input_tokens_seen": 4391535, "step": 210, "time_per_iteration": 2.862405300140381 }, { "auxiliary_loss_clip": 0.01372176, "auxiliary_loss_mlp": 0.01055406, "balance_loss_clip": 1.09759021, "balance_loss_mlp": 1.03853798, "epoch": 0.025371249924848193, "flos": 15049229103360.0, "grad_norm": 2.9436874439018132, "language_loss": 0.91022885, "learning_rate": 3.877131949743587e-06, "loss": 0.93450475, "num_input_tokens_seen": 4408400, "step": 211, "time_per_iteration": 2.8382811546325684 }, { "auxiliary_loss_clip": 0.01370519, "auxiliary_loss_mlp": 0.01053349, "balance_loss_clip": 1.09993005, "balance_loss_mlp": 1.03793526, "epoch": 0.025491492815487283, "flos": 25553853648000.0, "grad_norm": 2.2151466007252054, "language_loss": 0.7780987, "learning_rate": 3.880557232483993e-06, "loss": 0.80233735, "num_input_tokens_seen": 4427840, "step": 212, "time_per_iteration": 2.756894826889038 }, { "auxiliary_loss_clip": 0.01369944, "auxiliary_loss_mlp": 0.01052778, "balance_loss_clip": 1.09703386, "balance_loss_mlp": 1.037269, "epoch": 0.025611735706126376, "flos": 20630752502400.0, "grad_norm": 2.125725981538077, "language_loss": 0.86656141, "learning_rate": 3.883966396187164e-06, "loss": 0.89078867, "num_input_tokens_seen": 4447110, "step": 213, "time_per_iteration": 2.902561902999878 }, { "auxiliary_loss_clip": 0.01369733, "auxiliary_loss_mlp": 0.01057033, "balance_loss_clip": 1.09935975, "balance_loss_mlp": 1.04189336, "epoch": 0.025731978596765466, "flos": 19062282245760.0, "grad_norm": 2.039591104360469, "language_loss": 0.8988986, "learning_rate": 3.887359591851937e-06, "loss": 0.92316628, "num_input_tokens_seen": 4464715, "step": 214, "time_per_iteration": 2.792357921600342 }, { "auxiliary_loss_clip": 0.01364396, "auxiliary_loss_mlp": 0.01055154, "balance_loss_clip": 1.09523392, "balance_loss_mlp": 1.03751087, "epoch": 0.025852221487404556, "flos": 22163814927360.0, "grad_norm": 1.9759827978338111, "language_loss": 0.92281759, "learning_rate": 3.890736968365265e-06, "loss": 0.94701314, "num_input_tokens_seen": 4485030, "step": 215, "time_per_iteration": 2.8147928714752197 }, { "auxiliary_loss_clip": 0.01361677, "auxiliary_loss_mlp": 0.01053194, "balance_loss_clip": 1.09330797, "balance_loss_mlp": 1.03617048, "epoch": 0.02597246437804365, "flos": 26541971861760.0, "grad_norm": 3.2669666007763576, "language_loss": 0.85050762, "learning_rate": 3.894098672541412e-06, "loss": 0.87465632, "num_input_tokens_seen": 4505935, "step": 216, "time_per_iteration": 2.831508159637451 }, { "auxiliary_loss_clip": 0.0136606, "auxiliary_loss_mlp": 0.01049374, "balance_loss_clip": 1.09674108, "balance_loss_mlp": 1.03307772, "epoch": 0.02609270726868274, "flos": 32671671696000.0, "grad_norm": 1.7895670377643496, "language_loss": 0.75436872, "learning_rate": 3.89744484916025e-06, "loss": 0.77852303, "num_input_tokens_seen": 4527045, "step": 217, "time_per_iteration": 2.952389717102051 }, { "auxiliary_loss_clip": 0.01375125, "auxiliary_loss_mlp": 0.01061347, "balance_loss_clip": 1.09967208, "balance_loss_mlp": 1.04359674, "epoch": 0.02621295015932183, "flos": 26243553669120.0, "grad_norm": 1.8633421645783101, "language_loss": 0.8734507, "learning_rate": 3.900775641004673e-06, "loss": 0.89781541, "num_input_tokens_seen": 4546360, "step": 218, "time_per_iteration": 2.8581180572509766 }, { "auxiliary_loss_clip": 0.01378067, "auxiliary_loss_mlp": 0.01055304, "balance_loss_clip": 1.10093427, "balance_loss_mlp": 1.03748202, "epoch": 0.026333193049960922, "flos": 42921402353280.0, "grad_norm": 2.907989564455654, "language_loss": 0.7380808, "learning_rate": 3.904091188897156e-06, "loss": 0.76241446, "num_input_tokens_seen": 4565495, "step": 219, "time_per_iteration": 2.9329168796539307 }, { "auxiliary_loss_clip": 0.01366949, "auxiliary_loss_mlp": 0.01065141, "balance_loss_clip": 1.09899962, "balance_loss_mlp": 1.04776037, "epoch": 0.026453435940600012, "flos": 17963846386560.0, "grad_norm": 2.3618749881219734, "language_loss": 0.81953013, "learning_rate": 3.90739163173548e-06, "loss": 0.84385103, "num_input_tokens_seen": 4583330, "step": 220, "time_per_iteration": 2.780705451965332 }, { "auxiliary_loss_clip": 0.01365232, "auxiliary_loss_mlp": 0.01050859, "balance_loss_clip": 1.09492016, "balance_loss_mlp": 1.03394353, "epoch": 0.026573678831239102, "flos": 18984319776000.0, "grad_norm": 3.8605708051834537, "language_loss": 0.88440263, "learning_rate": 3.910677106527646e-06, "loss": 0.90856355, "num_input_tokens_seen": 4600520, "step": 221, "time_per_iteration": 2.783273696899414 }, { "auxiliary_loss_clip": 0.01358499, "auxiliary_loss_mlp": 0.01067352, "balance_loss_clip": 1.0927099, "balance_loss_mlp": 1.04958987, "epoch": 0.026693921721878195, "flos": 29241448634880.0, "grad_norm": 2.5181676443083716, "language_loss": 0.84181613, "learning_rate": 3.913947748426004e-06, "loss": 0.86607462, "num_input_tokens_seen": 4617340, "step": 222, "time_per_iteration": 2.8548529148101807 }, { "auxiliary_loss_clip": 0.01364975, "auxiliary_loss_mlp": 0.01065055, "balance_loss_clip": 1.09751916, "balance_loss_mlp": 1.0474, "epoch": 0.026814164612517285, "flos": 14128083797760.0, "grad_norm": 2.742483597409194, "language_loss": 0.76227331, "learning_rate": 3.9172036907606136e-06, "loss": 0.78657365, "num_input_tokens_seen": 4630820, "step": 223, "time_per_iteration": 2.7587270736694336 }, { "auxiliary_loss_clip": 0.01365337, "auxiliary_loss_mlp": 0.01060837, "balance_loss_clip": 1.09553933, "balance_loss_mlp": 1.04413617, "epoch": 0.026934407503156375, "flos": 23511973115520.0, "grad_norm": 1.7382987997224117, "language_loss": 0.95016575, "learning_rate": 3.920445065071855e-06, "loss": 0.97442752, "num_input_tokens_seen": 4651985, "step": 224, "time_per_iteration": 3.754178524017334 }, { "auxiliary_loss_clip": 0.01363541, "auxiliary_loss_mlp": 0.01057198, "balance_loss_clip": 1.09656656, "balance_loss_mlp": 1.04056787, "epoch": 0.027054650393795468, "flos": 28950356816640.0, "grad_norm": 3.342288576760265, "language_loss": 0.79916584, "learning_rate": 3.923672001142322e-06, "loss": 0.82337326, "num_input_tokens_seen": 4672295, "step": 225, "time_per_iteration": 4.711054563522339 }, { "auxiliary_loss_clip": 0.01358603, "auxiliary_loss_mlp": 0.01055885, "balance_loss_clip": 1.09248781, "balance_loss_mlp": 1.03920734, "epoch": 0.027174893284434558, "flos": 31431568596480.0, "grad_norm": 2.3046032559784786, "language_loss": 0.84515381, "learning_rate": 3.926884627027996e-06, "loss": 0.8692987, "num_input_tokens_seen": 4696065, "step": 226, "time_per_iteration": 3.960643768310547 }, { "auxiliary_loss_clip": 0.013535, "auxiliary_loss_mlp": 0.01052103, "balance_loss_clip": 1.09115863, "balance_loss_mlp": 1.03637981, "epoch": 0.027295136175073648, "flos": 22054466949120.0, "grad_norm": 1.825593981920795, "language_loss": 0.77659029, "learning_rate": 3.930083069088744e-06, "loss": 0.80064631, "num_input_tokens_seen": 4716065, "step": 227, "time_per_iteration": 2.8244121074676514 }, { "auxiliary_loss_clip": 0.01351728, "auxiliary_loss_mlp": 0.01008936, "balance_loss_clip": 1.14566338, "balance_loss_mlp": 1.00144947, "epoch": 0.02741537906571274, "flos": 60800752972800.0, "grad_norm": 0.9829223914073513, "language_loss": 0.59261966, "learning_rate": 3.933267452018137e-06, "loss": 0.61622626, "num_input_tokens_seen": 4775860, "step": 228, "time_per_iteration": 3.30627703666687 }, { "auxiliary_loss_clip": 0.01356256, "auxiliary_loss_mlp": 0.01060587, "balance_loss_clip": 1.0954591, "balance_loss_mlp": 1.0447861, "epoch": 0.02753562195635183, "flos": 24606278910720.0, "grad_norm": 2.3876961923293996, "language_loss": 0.84483469, "learning_rate": 3.936437898872622e-06, "loss": 0.86900312, "num_input_tokens_seen": 4795835, "step": 229, "time_per_iteration": 2.7739250659942627 }, { "auxiliary_loss_clip": 0.01355956, "auxiliary_loss_mlp": 0.01054932, "balance_loss_clip": 1.09010506, "balance_loss_mlp": 1.03882658, "epoch": 0.02765586484699092, "flos": 34094236907520.0, "grad_norm": 3.338688588474045, "language_loss": 0.79843658, "learning_rate": 3.9395945311000525e-06, "loss": 0.82254553, "num_input_tokens_seen": 4817460, "step": 230, "time_per_iteration": 2.7805514335632324 }, { "auxiliary_loss_clip": 0.01359238, "auxiliary_loss_mlp": 0.01059772, "balance_loss_clip": 1.09289837, "balance_loss_mlp": 1.04231954, "epoch": 0.027776107737630014, "flos": 14829922615680.0, "grad_norm": 2.7968501468645646, "language_loss": 0.90959692, "learning_rate": 3.942737468567608e-06, "loss": 0.93378699, "num_input_tokens_seen": 4835475, "step": 231, "time_per_iteration": 2.833622694015503 }, { "auxiliary_loss_clip": 0.01356135, "auxiliary_loss_mlp": 0.01058254, "balance_loss_clip": 1.09250712, "balance_loss_mlp": 1.04233968, "epoch": 0.027896350628269104, "flos": 47920347066240.0, "grad_norm": 1.9650220291167344, "language_loss": 0.85874051, "learning_rate": 3.9458668295891026e-06, "loss": 0.88288438, "num_input_tokens_seen": 4857760, "step": 232, "time_per_iteration": 2.9319586753845215 }, { "auxiliary_loss_clip": 0.01355945, "auxiliary_loss_mlp": 0.01048267, "balance_loss_clip": 1.09329104, "balance_loss_mlp": 1.03182757, "epoch": 0.028016593518908194, "flos": 21684550734720.0, "grad_norm": 2.3687696972257326, "language_loss": 0.86969769, "learning_rate": 3.948982730951712e-06, "loss": 0.8937397, "num_input_tokens_seen": 4875855, "step": 233, "time_per_iteration": 2.7377872467041016 }, { "auxiliary_loss_clip": 0.01361839, "auxiliary_loss_mlp": 0.0105058, "balance_loss_clip": 1.09510922, "balance_loss_mlp": 1.03164947, "epoch": 0.028136836409547287, "flos": 18439483305600.0, "grad_norm": 2.5282878395511195, "language_loss": 0.82011247, "learning_rate": 3.9520852879421254e-06, "loss": 0.84423667, "num_input_tokens_seen": 4893200, "step": 234, "time_per_iteration": 2.6918909549713135 }, { "auxiliary_loss_clip": 0.01352183, "auxiliary_loss_mlp": 0.01065687, "balance_loss_clip": 1.09239531, "balance_loss_mlp": 1.04864001, "epoch": 0.028257079300186377, "flos": 31576934937600.0, "grad_norm": 1.957984124056265, "language_loss": 0.81262422, "learning_rate": 3.955174614372137e-06, "loss": 0.83680296, "num_input_tokens_seen": 4912965, "step": 235, "time_per_iteration": 2.9748342037200928 }, { "auxiliary_loss_clip": 0.01355812, "auxiliary_loss_mlp": 0.01058631, "balance_loss_clip": 1.09160399, "balance_loss_mlp": 1.04283607, "epoch": 0.028377322190825467, "flos": 23513337832320.0, "grad_norm": 3.335303691336275, "language_loss": 0.84425843, "learning_rate": 3.9582508226037045e-06, "loss": 0.86840296, "num_input_tokens_seen": 4933105, "step": 236, "time_per_iteration": 2.839087724685669 }, { "auxiliary_loss_clip": 0.01360722, "auxiliary_loss_mlp": 0.01059377, "balance_loss_clip": 1.09278226, "balance_loss_mlp": 1.04268789, "epoch": 0.02849756508146456, "flos": 20479604071680.0, "grad_norm": 2.52823867418392, "language_loss": 0.94219446, "learning_rate": 3.9613140235734636e-06, "loss": 0.9663955, "num_input_tokens_seen": 4950085, "step": 237, "time_per_iteration": 2.7339391708374023 }, { "auxiliary_loss_clip": 0.01351446, "auxiliary_loss_mlp": 0.01057854, "balance_loss_clip": 1.09007931, "balance_loss_mlp": 1.04065251, "epoch": 0.02861780797210365, "flos": 14283362292480.0, "grad_norm": 2.0437999870095176, "language_loss": 0.8143425, "learning_rate": 3.96436432681674e-06, "loss": 0.83843553, "num_input_tokens_seen": 4968075, "step": 238, "time_per_iteration": 2.804539918899536 }, { "auxiliary_loss_clip": 0.01350492, "auxiliary_loss_mlp": 0.01053394, "balance_loss_clip": 1.09071827, "balance_loss_mlp": 1.03708577, "epoch": 0.02873805086274274, "flos": 25808532053760.0, "grad_norm": 2.4232854688586642, "language_loss": 0.8911292, "learning_rate": 3.967401840491044e-06, "loss": 0.91516805, "num_input_tokens_seen": 4987355, "step": 239, "time_per_iteration": 2.761491537094116 }, { "auxiliary_loss_clip": 0.01344518, "auxiliary_loss_mlp": 0.01050057, "balance_loss_clip": 1.08924556, "balance_loss_mlp": 1.03472638, "epoch": 0.028858293753381833, "flos": 17304238984320.0, "grad_norm": 2.2056444458295106, "language_loss": 0.8789646, "learning_rate": 3.97042667139909e-06, "loss": 0.90291035, "num_input_tokens_seen": 5004680, "step": 240, "time_per_iteration": 2.741469144821167 }, { "auxiliary_loss_clip": 0.01345401, "auxiliary_loss_mlp": 0.01051131, "balance_loss_clip": 1.08770394, "balance_loss_mlp": 1.03358293, "epoch": 0.028978536644020923, "flos": 23038347358080.0, "grad_norm": 1.9929343412609741, "language_loss": 0.87692463, "learning_rate": 3.973438925011327e-06, "loss": 0.90088999, "num_input_tokens_seen": 5022965, "step": 241, "time_per_iteration": 2.8094449043273926 }, { "auxiliary_loss_clip": 0.01352256, "auxiliary_loss_mlp": 0.01058744, "balance_loss_clip": 1.0885793, "balance_loss_mlp": 1.04219735, "epoch": 0.029098779534660012, "flos": 28329712692480.0, "grad_norm": 2.524534450575997, "language_loss": 0.91162586, "learning_rate": 3.976438705488002e-06, "loss": 0.93573582, "num_input_tokens_seen": 5042625, "step": 242, "time_per_iteration": 2.8605704307556152 }, { "auxiliary_loss_clip": 0.01349569, "auxiliary_loss_mlp": 0.01051723, "balance_loss_clip": 1.09175897, "balance_loss_mlp": 1.03435445, "epoch": 0.029219022425299106, "flos": 13881665520000.0, "grad_norm": 2.475436967166845, "language_loss": 0.92801178, "learning_rate": 3.9794261157007744e-06, "loss": 0.9520247, "num_input_tokens_seen": 5060380, "step": 243, "time_per_iteration": 2.826836347579956 }, { "auxiliary_loss_clip": 0.01352717, "auxiliary_loss_mlp": 0.01055407, "balance_loss_clip": 1.09154761, "balance_loss_mlp": 1.03876543, "epoch": 0.029339265315938196, "flos": 19422501788160.0, "grad_norm": 2.7942228008909655, "language_loss": 0.85037196, "learning_rate": 3.982401257253887e-06, "loss": 0.87445325, "num_input_tokens_seen": 5078720, "step": 244, "time_per_iteration": 2.7898318767547607 }, { "auxiliary_loss_clip": 0.01343495, "auxiliary_loss_mlp": 0.01055148, "balance_loss_clip": 1.08615863, "balance_loss_mlp": 1.04009187, "epoch": 0.029459508206577285, "flos": 15669550005120.0, "grad_norm": 2.6876937479538245, "language_loss": 0.89673173, "learning_rate": 3.985364230504893e-06, "loss": 0.92071819, "num_input_tokens_seen": 5096605, "step": 245, "time_per_iteration": 2.7678215503692627 }, { "auxiliary_loss_clip": 0.0134974, "auxiliary_loss_mlp": 0.01049858, "balance_loss_clip": 1.09137106, "balance_loss_mlp": 1.03428912, "epoch": 0.02957975109721638, "flos": 28220975245440.0, "grad_norm": 2.0802531039598815, "language_loss": 0.84282118, "learning_rate": 3.988315134584976e-06, "loss": 0.86681712, "num_input_tokens_seen": 5116285, "step": 246, "time_per_iteration": 2.7121732234954834 }, { "auxiliary_loss_clip": 0.01350946, "auxiliary_loss_mlp": 0.01058133, "balance_loss_clip": 1.08865762, "balance_loss_mlp": 1.04217112, "epoch": 0.02969999398785547, "flos": 24315869450880.0, "grad_norm": 1.995951889746286, "language_loss": 0.80279595, "learning_rate": 3.991254067418851e-06, "loss": 0.82688677, "num_input_tokens_seen": 5136825, "step": 247, "time_per_iteration": 2.8841910362243652 }, { "auxiliary_loss_clip": 0.01343713, "auxiliary_loss_mlp": 0.01050978, "balance_loss_clip": 1.08861578, "balance_loss_mlp": 1.03583848, "epoch": 0.02982023687849456, "flos": 35078584193280.0, "grad_norm": 1.891869971766899, "language_loss": 0.83039021, "learning_rate": 3.994181125744254e-06, "loss": 0.8543371, "num_input_tokens_seen": 5158630, "step": 248, "time_per_iteration": 2.812683343887329 }, { "auxiliary_loss_clip": 0.01338455, "auxiliary_loss_mlp": 0.01050356, "balance_loss_clip": 1.08539104, "balance_loss_mlp": 1.0348947, "epoch": 0.02994047976913365, "flos": 26177155378560.0, "grad_norm": 4.430149305503461, "language_loss": 0.74203026, "learning_rate": 3.99709640513106e-06, "loss": 0.76591837, "num_input_tokens_seen": 5179510, "step": 249, "time_per_iteration": 2.7500202655792236 }, { "auxiliary_loss_clip": 0.01353834, "auxiliary_loss_mlp": 0.01052368, "balance_loss_clip": 1.09119201, "balance_loss_mlp": 1.03459418, "epoch": 0.03006072265977274, "flos": 25625028447360.0, "grad_norm": 2.906261030439666, "language_loss": 0.85582519, "learning_rate": 4e-06, "loss": 0.87988722, "num_input_tokens_seen": 5199345, "step": 250, "time_per_iteration": 3.760000228881836 }, { "auxiliary_loss_clip": 0.01346482, "auxiliary_loss_mlp": 0.01057366, "balance_loss_clip": 1.0907805, "balance_loss_mlp": 1.04068899, "epoch": 0.03018096555041183, "flos": 22127078292480.0, "grad_norm": 2.614007030013376, "language_loss": 0.88796735, "learning_rate": 3.999999848300794e-06, "loss": 0.9120059, "num_input_tokens_seen": 5218330, "step": 251, "time_per_iteration": 3.74057674407959 }, { "auxiliary_loss_clip": 0.01333906, "auxiliary_loss_mlp": 0.01059567, "balance_loss_clip": 1.08117855, "balance_loss_mlp": 1.04422474, "epoch": 0.030301208441050925, "flos": 30188197359360.0, "grad_norm": 1.6945724212522841, "language_loss": 0.89264178, "learning_rate": 3.999999393203203e-06, "loss": 0.9165765, "num_input_tokens_seen": 5240740, "step": 252, "time_per_iteration": 3.7808425426483154 }, { "auxiliary_loss_clip": 0.01341264, "auxiliary_loss_mlp": 0.01050621, "balance_loss_clip": 1.08747911, "balance_loss_mlp": 1.03520679, "epoch": 0.030421451331690014, "flos": 23621392920960.0, "grad_norm": 2.0108427878170896, "language_loss": 0.85207289, "learning_rate": 3.999998634707293e-06, "loss": 0.87599176, "num_input_tokens_seen": 5260290, "step": 253, "time_per_iteration": 3.7796642780303955 }, { "auxiliary_loss_clip": 0.01345429, "auxiliary_loss_mlp": 0.01069334, "balance_loss_clip": 1.09061027, "balance_loss_mlp": 1.05343139, "epoch": 0.030541694222329104, "flos": 27928446883200.0, "grad_norm": 2.143539705153979, "language_loss": 0.96284056, "learning_rate": 3.999997572813182e-06, "loss": 0.98698825, "num_input_tokens_seen": 5278100, "step": 254, "time_per_iteration": 2.7946088314056396 }, { "auxiliary_loss_clip": 0.01341339, "auxiliary_loss_mlp": 0.01066179, "balance_loss_clip": 1.08503747, "balance_loss_mlp": 1.05082524, "epoch": 0.030661937112968194, "flos": 18588441006720.0, "grad_norm": 1.8918031101978667, "language_loss": 0.87841243, "learning_rate": 3.999996207521028e-06, "loss": 0.90248764, "num_input_tokens_seen": 5296810, "step": 255, "time_per_iteration": 2.7751781940460205 }, { "auxiliary_loss_clip": 0.01349219, "auxiliary_loss_mlp": 0.0105989, "balance_loss_clip": 1.08890891, "balance_loss_mlp": 1.04354072, "epoch": 0.030782180003607287, "flos": 12969139478400.0, "grad_norm": 2.562615041558185, "language_loss": 0.82257867, "learning_rate": 3.999994538831039e-06, "loss": 0.84666973, "num_input_tokens_seen": 5313395, "step": 256, "time_per_iteration": 2.7830471992492676 }, { "auxiliary_loss_clip": 0.01338111, "auxiliary_loss_mlp": 0.01065823, "balance_loss_clip": 1.08464313, "balance_loss_mlp": 1.04988456, "epoch": 0.030902422894246377, "flos": 23335364920320.0, "grad_norm": 2.1762189614585363, "language_loss": 0.85888195, "learning_rate": 3.99999256674347e-06, "loss": 0.88292128, "num_input_tokens_seen": 5333545, "step": 257, "time_per_iteration": 2.8147852420806885 }, { "auxiliary_loss_clip": 0.0131472, "auxiliary_loss_mlp": 0.01010844, "balance_loss_clip": 1.1194005, "balance_loss_mlp": 1.00435948, "epoch": 0.031022665784885467, "flos": 55094151438720.0, "grad_norm": 1.0154440289597992, "language_loss": 0.53459579, "learning_rate": 3.999990291258618e-06, "loss": 0.55785143, "num_input_tokens_seen": 5392235, "step": 258, "time_per_iteration": 3.3248157501220703 }, { "auxiliary_loss_clip": 0.01341069, "auxiliary_loss_mlp": 0.01062295, "balance_loss_clip": 1.08878613, "balance_loss_mlp": 1.04654729, "epoch": 0.03114290867552456, "flos": 19317786664320.0, "grad_norm": 2.434094248297329, "language_loss": 0.86691082, "learning_rate": 3.999987712376829e-06, "loss": 0.89094442, "num_input_tokens_seen": 5410555, "step": 259, "time_per_iteration": 2.6898930072784424 }, { "auxiliary_loss_clip": 0.01338493, "auxiliary_loss_mlp": 0.01050859, "balance_loss_clip": 1.08846045, "balance_loss_mlp": 1.03638744, "epoch": 0.031263151566163654, "flos": 20959442881920.0, "grad_norm": 2.120325255473747, "language_loss": 0.82192755, "learning_rate": 3.999984830098494e-06, "loss": 0.84582102, "num_input_tokens_seen": 5430135, "step": 260, "time_per_iteration": 2.7578890323638916 }, { "auxiliary_loss_clip": 0.01341414, "auxiliary_loss_mlp": 0.01053148, "balance_loss_clip": 1.08864427, "balance_loss_mlp": 1.03675699, "epoch": 0.03138339445680274, "flos": 14793006412800.0, "grad_norm": 3.154294658856665, "language_loss": 0.98045939, "learning_rate": 3.999981644424051e-06, "loss": 1.00440502, "num_input_tokens_seen": 5444935, "step": 261, "time_per_iteration": 2.7330894470214844 }, { "auxiliary_loss_clip": 0.01338761, "auxiliary_loss_mlp": 0.01046685, "balance_loss_clip": 1.08690691, "balance_loss_mlp": 1.03085363, "epoch": 0.03150363734744183, "flos": 11655599022720.0, "grad_norm": 2.294573954453593, "language_loss": 0.86166799, "learning_rate": 3.999978155353982e-06, "loss": 0.88552237, "num_input_tokens_seen": 5462080, "step": 262, "time_per_iteration": 2.814061403274536 }, { "auxiliary_loss_clip": 0.01337114, "auxiliary_loss_mlp": 0.01057008, "balance_loss_clip": 1.08533859, "balance_loss_mlp": 1.04135573, "epoch": 0.03162388023808092, "flos": 33727732485120.0, "grad_norm": 2.35659898189183, "language_loss": 0.8027966, "learning_rate": 3.9999743628888186e-06, "loss": 0.82673776, "num_input_tokens_seen": 5483870, "step": 263, "time_per_iteration": 2.865885019302368 }, { "auxiliary_loss_clip": 0.01335019, "auxiliary_loss_mlp": 0.01049679, "balance_loss_clip": 1.08393061, "balance_loss_mlp": 1.03375244, "epoch": 0.03174412312872001, "flos": 20810952057600.0, "grad_norm": 10.13617508480722, "language_loss": 0.89747643, "learning_rate": 3.999970267029133e-06, "loss": 0.92132342, "num_input_tokens_seen": 5502830, "step": 264, "time_per_iteration": 2.682522773742676 }, { "auxiliary_loss_clip": 0.01333119, "auxiliary_loss_mlp": 0.01046874, "balance_loss_clip": 1.08525133, "balance_loss_mlp": 1.0305897, "epoch": 0.0318643660193591, "flos": 23727939638400.0, "grad_norm": 2.6490153885746976, "language_loss": 0.80137634, "learning_rate": 3.999965867775548e-06, "loss": 0.82517624, "num_input_tokens_seen": 5523225, "step": 265, "time_per_iteration": 2.7803115844726562 }, { "auxiliary_loss_clip": 0.01334896, "auxiliary_loss_mlp": 0.01050706, "balance_loss_clip": 1.08416414, "balance_loss_mlp": 1.03662133, "epoch": 0.0319846089099982, "flos": 13917863450880.0, "grad_norm": 2.930041308305657, "language_loss": 0.86793482, "learning_rate": 3.9999611651287315e-06, "loss": 0.89179087, "num_input_tokens_seen": 5541380, "step": 266, "time_per_iteration": 2.670983076095581 }, { "auxiliary_loss_clip": 0.01335829, "auxiliary_loss_mlp": 0.01051631, "balance_loss_clip": 1.08499122, "balance_loss_mlp": 1.0363605, "epoch": 0.03210485180063729, "flos": 14753253035520.0, "grad_norm": 4.402189857787081, "language_loss": 0.78667253, "learning_rate": 3.999956159089396e-06, "loss": 0.81054711, "num_input_tokens_seen": 5558830, "step": 267, "time_per_iteration": 2.750776767730713 }, { "auxiliary_loss_clip": 0.01340149, "auxiliary_loss_mlp": 0.01050059, "balance_loss_clip": 1.08784854, "balance_loss_mlp": 1.03478849, "epoch": 0.03222509469127638, "flos": 28913153304960.0, "grad_norm": 2.1270388007513326, "language_loss": 0.79765791, "learning_rate": 3.999950849658302e-06, "loss": 0.82156003, "num_input_tokens_seen": 5577750, "step": 268, "time_per_iteration": 2.8195526599884033 }, { "auxiliary_loss_clip": 0.01335383, "auxiliary_loss_mlp": 0.01054652, "balance_loss_clip": 1.08577859, "balance_loss_mlp": 1.04073977, "epoch": 0.03234533758191547, "flos": 16946389739520.0, "grad_norm": 2.3631095268204385, "language_loss": 0.84717572, "learning_rate": 3.999945236836254e-06, "loss": 0.87107611, "num_input_tokens_seen": 5596715, "step": 269, "time_per_iteration": 2.7771990299224854 }, { "auxiliary_loss_clip": 0.01343569, "auxiliary_loss_mlp": 0.01049996, "balance_loss_clip": 1.08970451, "balance_loss_mlp": 1.03348541, "epoch": 0.03246558047255456, "flos": 18989096284800.0, "grad_norm": 2.7826732624690225, "language_loss": 0.94655418, "learning_rate": 3.999939320624103e-06, "loss": 0.97048986, "num_input_tokens_seen": 5611865, "step": 270, "time_per_iteration": 2.712599277496338 }, { "auxiliary_loss_clip": 0.01335325, "auxiliary_loss_mlp": 0.01063901, "balance_loss_clip": 1.08437634, "balance_loss_mlp": 1.04827321, "epoch": 0.03258582336319365, "flos": 23728334688000.0, "grad_norm": 1.8694266051910164, "language_loss": 0.90182853, "learning_rate": 3.999933101022749e-06, "loss": 0.92582083, "num_input_tokens_seen": 5632270, "step": 271, "time_per_iteration": 2.7494728565216064 }, { "auxiliary_loss_clip": 0.01335672, "auxiliary_loss_mlp": 0.01053647, "balance_loss_clip": 1.0874449, "balance_loss_mlp": 1.03908002, "epoch": 0.032706066253832745, "flos": 27670823562240.0, "grad_norm": 1.8785610861320698, "language_loss": 0.86680198, "learning_rate": 3.999926578033132e-06, "loss": 0.89069515, "num_input_tokens_seen": 5652085, "step": 272, "time_per_iteration": 2.8322463035583496 }, { "auxiliary_loss_clip": 0.01340419, "auxiliary_loss_mlp": 0.01051383, "balance_loss_clip": 1.08744144, "balance_loss_mlp": 1.03630257, "epoch": 0.032826309144471835, "flos": 45624685968000.0, "grad_norm": 2.3885248198282167, "language_loss": 0.63436443, "learning_rate": 3.999919751656244e-06, "loss": 0.65828246, "num_input_tokens_seen": 5678985, "step": 273, "time_per_iteration": 3.1950628757476807 }, { "auxiliary_loss_clip": 0.01333864, "auxiliary_loss_mlp": 0.01057033, "balance_loss_clip": 1.08325434, "balance_loss_mlp": 1.04150033, "epoch": 0.032946552035110925, "flos": 25812374808960.0, "grad_norm": 2.411603503630089, "language_loss": 0.75582439, "learning_rate": 3.9999126218931195e-06, "loss": 0.77973342, "num_input_tokens_seen": 5697020, "step": 274, "time_per_iteration": 2.875514507293701 }, { "auxiliary_loss_clip": 0.01338811, "auxiliary_loss_mlp": 0.01056945, "balance_loss_clip": 1.08901715, "balance_loss_mlp": 1.0412097, "epoch": 0.033066794925750015, "flos": 15121984101120.0, "grad_norm": 2.1073870248007065, "language_loss": 0.89776468, "learning_rate": 3.99990518874484e-06, "loss": 0.92172223, "num_input_tokens_seen": 5713460, "step": 275, "time_per_iteration": 3.08976149559021 }, { "auxiliary_loss_clip": 0.01329352, "auxiliary_loss_mlp": 0.01042646, "balance_loss_clip": 1.08598566, "balance_loss_mlp": 1.02772701, "epoch": 0.033187037816389105, "flos": 22776593973120.0, "grad_norm": 2.471721137064735, "language_loss": 0.93099582, "learning_rate": 3.999897452212534e-06, "loss": 0.95471585, "num_input_tokens_seen": 5730790, "step": 276, "time_per_iteration": 2.996884346008301 }, { "auxiliary_loss_clip": 0.01333054, "auxiliary_loss_mlp": 0.0105767, "balance_loss_clip": 1.08272767, "balance_loss_mlp": 1.04217267, "epoch": 0.033307280707028195, "flos": 23331414424320.0, "grad_norm": 2.4431540404729994, "language_loss": 1.00313139, "learning_rate": 3.999889412297374e-06, "loss": 1.02703857, "num_input_tokens_seen": 5750215, "step": 277, "time_per_iteration": 4.797788619995117 }, { "auxiliary_loss_clip": 0.01334221, "auxiliary_loss_mlp": 0.01052133, "balance_loss_clip": 1.08362806, "balance_loss_mlp": 1.0371362, "epoch": 0.03342752359766729, "flos": 28840290566400.0, "grad_norm": 2.268673381822304, "language_loss": 0.79084277, "learning_rate": 3.999881069000581e-06, "loss": 0.81470627, "num_input_tokens_seen": 5769945, "step": 278, "time_per_iteration": 4.816899538040161 }, { "auxiliary_loss_clip": 0.01332677, "auxiliary_loss_mlp": 0.01049694, "balance_loss_clip": 1.08520579, "balance_loss_mlp": 1.03408909, "epoch": 0.03354776648830638, "flos": 19384544090880.0, "grad_norm": 2.638383407728002, "language_loss": 0.87031281, "learning_rate": 3.99987242232342e-06, "loss": 0.89413655, "num_input_tokens_seen": 5784950, "step": 279, "time_per_iteration": 3.902925729751587 }, { "auxiliary_loss_clip": 0.01335851, "auxiliary_loss_mlp": 0.01055032, "balance_loss_clip": 1.08831894, "balance_loss_mlp": 1.0404532, "epoch": 0.03366800937894547, "flos": 17858628472320.0, "grad_norm": 1.919184804840094, "language_loss": 0.80051982, "learning_rate": 3.9998634722672026e-06, "loss": 0.82442868, "num_input_tokens_seen": 5805005, "step": 280, "time_per_iteration": 2.874401569366455 }, { "auxiliary_loss_clip": 0.01334646, "auxiliary_loss_mlp": 0.01049199, "balance_loss_clip": 1.08674943, "balance_loss_mlp": 1.03435755, "epoch": 0.03378825226958456, "flos": 35951033635200.0, "grad_norm": 2.3192546642351166, "language_loss": 0.78251791, "learning_rate": 3.999854218833286e-06, "loss": 0.80635637, "num_input_tokens_seen": 5825825, "step": 281, "time_per_iteration": 3.0265681743621826 }, { "auxiliary_loss_clip": 0.01332161, "auxiliary_loss_mlp": 0.0105613, "balance_loss_clip": 1.0857079, "balance_loss_mlp": 1.04067481, "epoch": 0.03390849516022365, "flos": 25702488126720.0, "grad_norm": 1.8732207222454849, "language_loss": 0.82027668, "learning_rate": 3.999844662023075e-06, "loss": 0.8441596, "num_input_tokens_seen": 5845700, "step": 282, "time_per_iteration": 2.9238085746765137 }, { "auxiliary_loss_clip": 0.01326631, "auxiliary_loss_mlp": 0.01054599, "balance_loss_clip": 1.08266163, "balance_loss_mlp": 1.03914905, "epoch": 0.03402873805086274, "flos": 21284505987840.0, "grad_norm": 1.905076230276993, "language_loss": 0.91959739, "learning_rate": 3.999834801838018e-06, "loss": 0.94340968, "num_input_tokens_seen": 5864680, "step": 283, "time_per_iteration": 2.992034673690796 }, { "auxiliary_loss_clip": 0.0132734, "auxiliary_loss_mlp": 0.01052178, "balance_loss_clip": 1.08590043, "balance_loss_mlp": 1.03724086, "epoch": 0.03414898094150183, "flos": 22710913954560.0, "grad_norm": 2.1515170731138418, "language_loss": 0.74014837, "learning_rate": 3.9998246382796115e-06, "loss": 0.76394355, "num_input_tokens_seen": 5884260, "step": 284, "time_per_iteration": 3.007387638092041 }, { "auxiliary_loss_clip": 0.0133372, "auxiliary_loss_mlp": 0.01048198, "balance_loss_clip": 1.08513808, "balance_loss_mlp": 1.0334872, "epoch": 0.03426922383214093, "flos": 18879927874560.0, "grad_norm": 2.5457733154589754, "language_loss": 0.90788841, "learning_rate": 3.999814171349399e-06, "loss": 0.93170756, "num_input_tokens_seen": 5902120, "step": 285, "time_per_iteration": 3.014099359512329 }, { "auxiliary_loss_clip": 0.01324616, "auxiliary_loss_mlp": 0.01056538, "balance_loss_clip": 1.0808742, "balance_loss_mlp": 1.04117203, "epoch": 0.03438946672278002, "flos": 34752012716160.0, "grad_norm": 1.6477751495960673, "language_loss": 0.73696887, "learning_rate": 3.9998034010489655e-06, "loss": 0.76078039, "num_input_tokens_seen": 5925810, "step": 286, "time_per_iteration": 2.9971230030059814 }, { "auxiliary_loss_clip": 0.01329317, "auxiliary_loss_mlp": 0.01057402, "balance_loss_clip": 1.08567965, "balance_loss_mlp": 1.04265535, "epoch": 0.03450970961341911, "flos": 22164102236160.0, "grad_norm": 2.4511204484528313, "language_loss": 0.75955689, "learning_rate": 3.999792327379946e-06, "loss": 0.78342408, "num_input_tokens_seen": 5945185, "step": 287, "time_per_iteration": 3.0328640937805176 }, { "auxiliary_loss_clip": 0.0133218, "auxiliary_loss_mlp": 0.01053437, "balance_loss_clip": 1.08898067, "balance_loss_mlp": 1.0380466, "epoch": 0.034629952504058197, "flos": 21725740656000.0, "grad_norm": 2.6329887218208143, "language_loss": 0.96411794, "learning_rate": 3.999780950344021e-06, "loss": 0.98797411, "num_input_tokens_seen": 5963375, "step": 288, "time_per_iteration": 2.8782455921173096 }, { "auxiliary_loss_clip": 0.01332046, "auxiliary_loss_mlp": 0.01049087, "balance_loss_clip": 1.08504343, "balance_loss_mlp": 1.0335058, "epoch": 0.034750195394697286, "flos": 20047994248320.0, "grad_norm": 2.307860927626218, "language_loss": 0.82494497, "learning_rate": 3.999769269942916e-06, "loss": 0.84875625, "num_input_tokens_seen": 5983415, "step": 289, "time_per_iteration": 2.9414219856262207 }, { "auxiliary_loss_clip": 0.01325531, "auxiliary_loss_mlp": 0.0104743, "balance_loss_clip": 1.08327079, "balance_loss_mlp": 1.03374457, "epoch": 0.034870438285336376, "flos": 27965865876480.0, "grad_norm": 1.9694836348025704, "language_loss": 0.81247228, "learning_rate": 3.999757286178402e-06, "loss": 0.83620191, "num_input_tokens_seen": 6005850, "step": 290, "time_per_iteration": 2.9119231700897217 }, { "auxiliary_loss_clip": 0.01331674, "auxiliary_loss_mlp": 0.01049706, "balance_loss_clip": 1.08558512, "balance_loss_mlp": 1.03519821, "epoch": 0.03499068117597547, "flos": 22017514832640.0, "grad_norm": 1.905862730589274, "language_loss": 0.90586925, "learning_rate": 3.999744999052299e-06, "loss": 0.92968309, "num_input_tokens_seen": 6027240, "step": 291, "time_per_iteration": 2.8453922271728516 }, { "auxiliary_loss_clip": 0.01288001, "auxiliary_loss_mlp": 0.01011335, "balance_loss_clip": 1.10210133, "balance_loss_mlp": 1.00525558, "epoch": 0.03511092406661456, "flos": 57242147725440.0, "grad_norm": 0.9595006736483973, "language_loss": 0.61195719, "learning_rate": 3.9997324085664675e-06, "loss": 0.63495064, "num_input_tokens_seen": 6087470, "step": 292, "time_per_iteration": 3.2357025146484375 }, { "auxiliary_loss_clip": 0.01319766, "auxiliary_loss_mlp": 0.01048374, "balance_loss_clip": 1.07728648, "balance_loss_mlp": 1.03400898, "epoch": 0.03523116695725365, "flos": 22928065626240.0, "grad_norm": 2.869619989552672, "language_loss": 0.91821748, "learning_rate": 3.999719514722821e-06, "loss": 0.94189882, "num_input_tokens_seen": 6107600, "step": 293, "time_per_iteration": 2.7365808486938477 }, { "auxiliary_loss_clip": 0.01323971, "auxiliary_loss_mlp": 0.01049809, "balance_loss_clip": 1.08306551, "balance_loss_mlp": 1.03522992, "epoch": 0.03535140984789274, "flos": 36903241226880.0, "grad_norm": 6.09119243874939, "language_loss": 0.74759972, "learning_rate": 3.999706317523314e-06, "loss": 0.77133751, "num_input_tokens_seen": 6126160, "step": 294, "time_per_iteration": 2.877718687057495 }, { "auxiliary_loss_clip": 0.01320646, "auxiliary_loss_mlp": 0.01046563, "balance_loss_clip": 1.07987952, "balance_loss_mlp": 1.03093445, "epoch": 0.03547165273853183, "flos": 20449152316800.0, "grad_norm": 2.624246936025058, "language_loss": 0.86087579, "learning_rate": 3.999692816969948e-06, "loss": 0.88454783, "num_input_tokens_seen": 6145695, "step": 295, "time_per_iteration": 2.6943910121917725 }, { "auxiliary_loss_clip": 0.01280639, "auxiliary_loss_mlp": 0.01006181, "balance_loss_clip": 1.09662211, "balance_loss_mlp": 1.00036359, "epoch": 0.03559189562917092, "flos": 69850564871040.0, "grad_norm": 1.197052067114665, "language_loss": 0.69463921, "learning_rate": 3.999679013064772e-06, "loss": 0.71750748, "num_input_tokens_seen": 6212440, "step": 296, "time_per_iteration": 3.324094295501709 }, { "auxiliary_loss_clip": 0.01319411, "auxiliary_loss_mlp": 0.01052807, "balance_loss_clip": 1.08070254, "balance_loss_mlp": 1.03676116, "epoch": 0.03571213851981002, "flos": 21651944163840.0, "grad_norm": 2.7142877571275994, "language_loss": 0.85939074, "learning_rate": 3.99966490580988e-06, "loss": 0.88311291, "num_input_tokens_seen": 6229800, "step": 297, "time_per_iteration": 2.738821029663086 }, { "auxiliary_loss_clip": 0.01326735, "auxiliary_loss_mlp": 0.01062846, "balance_loss_clip": 1.08369386, "balance_loss_mlp": 1.04714584, "epoch": 0.03583238141044911, "flos": 43945610757120.0, "grad_norm": 4.861398621535113, "language_loss": 0.65815103, "learning_rate": 3.999650495207411e-06, "loss": 0.68204677, "num_input_tokens_seen": 6255825, "step": 298, "time_per_iteration": 2.8925209045410156 }, { "auxiliary_loss_clip": 0.01325613, "auxiliary_loss_mlp": 0.01054217, "balance_loss_clip": 1.08506501, "balance_loss_mlp": 1.04001927, "epoch": 0.0359526243010882, "flos": 18910810592640.0, "grad_norm": 3.1487424909900747, "language_loss": 0.90568697, "learning_rate": 3.999635781259553e-06, "loss": 0.92948526, "num_input_tokens_seen": 6271090, "step": 299, "time_per_iteration": 2.762096643447876 }, { "auxiliary_loss_clip": 0.01271218, "auxiliary_loss_mlp": 0.01007493, "balance_loss_clip": 1.08970952, "balance_loss_mlp": 1.00196218, "epoch": 0.03607286719172729, "flos": 61668892782720.0, "grad_norm": 0.911635046966202, "language_loss": 0.52280062, "learning_rate": 3.999620763968535e-06, "loss": 0.54558772, "num_input_tokens_seen": 6329965, "step": 300, "time_per_iteration": 3.2504632472991943 }, { "auxiliary_loss_clip": 0.01320997, "auxiliary_loss_mlp": 0.01054889, "balance_loss_clip": 1.08066034, "balance_loss_mlp": 1.0396179, "epoch": 0.03619311008236638, "flos": 27819062991360.0, "grad_norm": 1.9062759514035112, "language_loss": 0.86556613, "learning_rate": 3.999605443336638e-06, "loss": 0.88932502, "num_input_tokens_seen": 6352095, "step": 301, "time_per_iteration": 2.921802520751953 }, { "auxiliary_loss_clip": 0.01322069, "auxiliary_loss_mlp": 0.0106026, "balance_loss_clip": 1.0812676, "balance_loss_mlp": 1.04435706, "epoch": 0.03631335297300547, "flos": 13621133197440.0, "grad_norm": 2.698809729491605, "language_loss": 0.89149332, "learning_rate": 3.999589819366185e-06, "loss": 0.91531664, "num_input_tokens_seen": 6365885, "step": 302, "time_per_iteration": 2.8086605072021484 }, { "auxiliary_loss_clip": 0.01325089, "auxiliary_loss_mlp": 0.01057097, "balance_loss_clip": 1.08344913, "balance_loss_mlp": 1.04258883, "epoch": 0.036433595863644565, "flos": 27631788456960.0, "grad_norm": 2.131166430993745, "language_loss": 0.8475284, "learning_rate": 3.999573892059547e-06, "loss": 0.87135029, "num_input_tokens_seen": 6385015, "step": 303, "time_per_iteration": 4.154699087142944 }, { "auxiliary_loss_clip": 0.01329823, "auxiliary_loss_mlp": 0.01058214, "balance_loss_clip": 1.08446765, "balance_loss_mlp": 1.04234743, "epoch": 0.036553838754283655, "flos": 24572020314240.0, "grad_norm": 2.3031250671151944, "language_loss": 0.81124938, "learning_rate": 3.999557661419138e-06, "loss": 0.83512974, "num_input_tokens_seen": 6405165, "step": 304, "time_per_iteration": 3.7228312492370605 }, { "auxiliary_loss_clip": 0.01323353, "auxiliary_loss_mlp": 0.01056332, "balance_loss_clip": 1.08427703, "balance_loss_mlp": 1.04127574, "epoch": 0.036674081644922744, "flos": 23404313076480.0, "grad_norm": 2.261789808653853, "language_loss": 0.81653714, "learning_rate": 3.9995411274474225e-06, "loss": 0.84033394, "num_input_tokens_seen": 6424445, "step": 305, "time_per_iteration": 3.7561237812042236 }, { "auxiliary_loss_clip": 0.01326624, "auxiliary_loss_mlp": 0.0104709, "balance_loss_clip": 1.08569002, "balance_loss_mlp": 1.03249276, "epoch": 0.036794324535561834, "flos": 27489690253440.0, "grad_norm": 2.4210992833610283, "language_loss": 0.81626189, "learning_rate": 3.999524290146908e-06, "loss": 0.83999902, "num_input_tokens_seen": 6444650, "step": 306, "time_per_iteration": 3.8149116039276123 }, { "auxiliary_loss_clip": 0.01327253, "auxiliary_loss_mlp": 0.01046803, "balance_loss_clip": 1.08602071, "balance_loss_mlp": 1.03253937, "epoch": 0.036914567426200924, "flos": 19463476227840.0, "grad_norm": 3.0452421336442512, "language_loss": 0.92544329, "learning_rate": 3.9995071495201485e-06, "loss": 0.94918382, "num_input_tokens_seen": 6461755, "step": 307, "time_per_iteration": 2.834885358810425 }, { "auxiliary_loss_clip": 0.01321567, "auxiliary_loss_mlp": 0.01051525, "balance_loss_clip": 1.08373666, "balance_loss_mlp": 1.03732157, "epoch": 0.037034810316840014, "flos": 22309324922880.0, "grad_norm": 2.4527673285571696, "language_loss": 0.98022008, "learning_rate": 3.999489705569744e-06, "loss": 1.00395095, "num_input_tokens_seen": 6479455, "step": 308, "time_per_iteration": 2.8708949089050293 }, { "auxiliary_loss_clip": 0.01320413, "auxiliary_loss_mlp": 0.01047604, "balance_loss_clip": 1.08209157, "balance_loss_mlp": 1.03259563, "epoch": 0.03715505320747911, "flos": 18588333265920.0, "grad_norm": 2.3714448534966603, "language_loss": 0.86282825, "learning_rate": 3.999471958298341e-06, "loss": 0.88650841, "num_input_tokens_seen": 6498365, "step": 309, "time_per_iteration": 2.858302593231201 }, { "auxiliary_loss_clip": 0.01326679, "auxiliary_loss_mlp": 0.01059119, "balance_loss_clip": 1.08622813, "balance_loss_mlp": 1.04288316, "epoch": 0.0372752960981182, "flos": 35955343267200.0, "grad_norm": 1.905195815116642, "language_loss": 0.76348108, "learning_rate": 3.999453907708631e-06, "loss": 0.78733909, "num_input_tokens_seen": 6520770, "step": 310, "time_per_iteration": 3.103311061859131 }, { "auxiliary_loss_clip": 0.01314555, "auxiliary_loss_mlp": 0.0105425, "balance_loss_clip": 1.07953393, "balance_loss_mlp": 1.04012346, "epoch": 0.03739553898875729, "flos": 20814040627200.0, "grad_norm": 1.8248802136251068, "language_loss": 0.81159097, "learning_rate": 3.999435553803353e-06, "loss": 0.83527899, "num_input_tokens_seen": 6540170, "step": 311, "time_per_iteration": 2.951188087463379 }, { "auxiliary_loss_clip": 0.0131954, "auxiliary_loss_mlp": 0.01050545, "balance_loss_clip": 1.08018661, "balance_loss_mlp": 1.03544712, "epoch": 0.03751578187939638, "flos": 20264140339200.0, "grad_norm": 2.698171524703693, "language_loss": 0.83492982, "learning_rate": 3.999416896585292e-06, "loss": 0.85863066, "num_input_tokens_seen": 6557200, "step": 312, "time_per_iteration": 2.856870651245117 }, { "auxiliary_loss_clip": 0.01315384, "auxiliary_loss_mlp": 0.01051762, "balance_loss_clip": 1.0789789, "balance_loss_mlp": 1.03785038, "epoch": 0.03763602477003547, "flos": 20668063754880.0, "grad_norm": 2.642520938277664, "language_loss": 0.85167658, "learning_rate": 3.9993979360572775e-06, "loss": 0.87534809, "num_input_tokens_seen": 6577340, "step": 313, "time_per_iteration": 2.7421157360076904 }, { "auxiliary_loss_clip": 0.01329527, "auxiliary_loss_mlp": 0.01054983, "balance_loss_clip": 1.08576345, "balance_loss_mlp": 1.03964055, "epoch": 0.03775626766067456, "flos": 16691352197760.0, "grad_norm": 2.242316852725957, "language_loss": 0.82805365, "learning_rate": 3.999378672222185e-06, "loss": 0.85189879, "num_input_tokens_seen": 6595125, "step": 314, "time_per_iteration": 2.8486857414245605 }, { "auxiliary_loss_clip": 0.0131898, "auxiliary_loss_mlp": 0.01055032, "balance_loss_clip": 1.08244181, "balance_loss_mlp": 1.04119182, "epoch": 0.03787651055131366, "flos": 21141797253120.0, "grad_norm": 2.199632311734852, "language_loss": 0.82976884, "learning_rate": 3.9993591050829385e-06, "loss": 0.85350895, "num_input_tokens_seen": 6612990, "step": 315, "time_per_iteration": 2.87632155418396 }, { "auxiliary_loss_clip": 0.01319669, "auxiliary_loss_mlp": 0.01050852, "balance_loss_clip": 1.08172226, "balance_loss_mlp": 1.03711879, "epoch": 0.037996753441952746, "flos": 22018089450240.0, "grad_norm": 2.0417883400613315, "language_loss": 0.79275954, "learning_rate": 3.999339234642506e-06, "loss": 0.81646478, "num_input_tokens_seen": 6632740, "step": 316, "time_per_iteration": 2.8845036029815674 }, { "auxiliary_loss_clip": 0.01319716, "auxiliary_loss_mlp": 0.01044867, "balance_loss_clip": 1.08102012, "balance_loss_mlp": 1.03008556, "epoch": 0.038116996332591836, "flos": 27709391790720.0, "grad_norm": 2.9165728229836025, "language_loss": 0.83816576, "learning_rate": 3.9993190609038994e-06, "loss": 0.86181164, "num_input_tokens_seen": 6651505, "step": 317, "time_per_iteration": 2.9241151809692383 }, { "auxiliary_loss_clip": 0.01319386, "auxiliary_loss_mlp": 0.01059667, "balance_loss_clip": 1.08257389, "balance_loss_mlp": 1.04498076, "epoch": 0.038237239223230926, "flos": 21178067011200.0, "grad_norm": 4.613904903911313, "language_loss": 0.83106774, "learning_rate": 3.999298583870182e-06, "loss": 0.85485828, "num_input_tokens_seen": 6671090, "step": 318, "time_per_iteration": 2.771679401397705 }, { "auxiliary_loss_clip": 0.0132054, "auxiliary_loss_mlp": 0.01052397, "balance_loss_clip": 1.08289397, "balance_loss_mlp": 1.03772235, "epoch": 0.038357482113870016, "flos": 25556618995200.0, "grad_norm": 2.014814023531459, "language_loss": 0.77408767, "learning_rate": 3.999277803544458e-06, "loss": 0.79781699, "num_input_tokens_seen": 6691245, "step": 319, "time_per_iteration": 2.8759708404541016 }, { "auxiliary_loss_clip": 0.01244852, "auxiliary_loss_mlp": 0.01009791, "balance_loss_clip": 1.07486522, "balance_loss_mlp": 1.00371158, "epoch": 0.038477725004509106, "flos": 59227578034560.0, "grad_norm": 0.9632921703533713, "language_loss": 0.62357849, "learning_rate": 3.999256719929882e-06, "loss": 0.64612496, "num_input_tokens_seen": 6752520, "step": 320, "time_per_iteration": 3.2682385444641113 }, { "auxiliary_loss_clip": 0.01242001, "auxiliary_loss_mlp": 0.01009092, "balance_loss_clip": 1.07292545, "balance_loss_mlp": 1.00313175, "epoch": 0.0385979678951482, "flos": 67317676398720.0, "grad_norm": 1.2319905280377472, "language_loss": 0.67135763, "learning_rate": 3.999235333029651e-06, "loss": 0.69386858, "num_input_tokens_seen": 6806460, "step": 321, "time_per_iteration": 3.2760655879974365 }, { "auxiliary_loss_clip": 0.01320501, "auxiliary_loss_mlp": 0.01051155, "balance_loss_clip": 1.0844872, "balance_loss_mlp": 1.03699279, "epoch": 0.03871821078578729, "flos": 22746752749440.0, "grad_norm": 1.9490761170730044, "language_loss": 0.81585377, "learning_rate": 3.999213642847009e-06, "loss": 0.83957034, "num_input_tokens_seen": 6827045, "step": 322, "time_per_iteration": 2.8751957416534424 }, { "auxiliary_loss_clip": 0.013189, "auxiliary_loss_mlp": 0.01055727, "balance_loss_clip": 1.08158362, "balance_loss_mlp": 1.0394367, "epoch": 0.03883845367642638, "flos": 26280613526400.0, "grad_norm": 1.811260617654674, "language_loss": 0.91293871, "learning_rate": 3.999191649385247e-06, "loss": 0.93668503, "num_input_tokens_seen": 6848220, "step": 323, "time_per_iteration": 2.794689893722534 }, { "auxiliary_loss_clip": 0.01233988, "auxiliary_loss_mlp": 0.01007291, "balance_loss_clip": 1.06738114, "balance_loss_mlp": 1.00125885, "epoch": 0.03895869656706547, "flos": 56962835568000.0, "grad_norm": 0.9119540271433784, "language_loss": 0.59783268, "learning_rate": 3.999169352647702e-06, "loss": 0.62024546, "num_input_tokens_seen": 6909400, "step": 324, "time_per_iteration": 3.2495334148406982 }, { "auxiliary_loss_clip": 0.01326216, "auxiliary_loss_mlp": 0.01052804, "balance_loss_clip": 1.08384681, "balance_loss_mlp": 1.03637743, "epoch": 0.03907893945770456, "flos": 24863363527680.0, "grad_norm": 6.4473083332769505, "language_loss": 0.83027285, "learning_rate": 3.999146752637755e-06, "loss": 0.85406303, "num_input_tokens_seen": 6930445, "step": 325, "time_per_iteration": 2.7671587467193604 }, { "auxiliary_loss_clip": 0.01320831, "auxiliary_loss_mlp": 0.01054937, "balance_loss_clip": 1.0823679, "balance_loss_mlp": 1.03995204, "epoch": 0.03919918234834365, "flos": 18368595815040.0, "grad_norm": 2.221807347690745, "language_loss": 0.89848423, "learning_rate": 3.999123849358836e-06, "loss": 0.92224193, "num_input_tokens_seen": 6948110, "step": 326, "time_per_iteration": 2.8494577407836914 }, { "auxiliary_loss_clip": 0.01316215, "auxiliary_loss_mlp": 0.01056683, "balance_loss_clip": 1.08054447, "balance_loss_mlp": 1.04132271, "epoch": 0.03931942523898275, "flos": 25225414663680.0, "grad_norm": 2.3792374065622877, "language_loss": 0.74734658, "learning_rate": 3.999100642814418e-06, "loss": 0.77107555, "num_input_tokens_seen": 6968550, "step": 327, "time_per_iteration": 2.902641773223877 }, { "auxiliary_loss_clip": 0.0131781, "auxiliary_loss_mlp": 0.01047202, "balance_loss_clip": 1.08346963, "balance_loss_mlp": 1.03259301, "epoch": 0.03943966812962184, "flos": 23257905240960.0, "grad_norm": 3.2049258859618086, "language_loss": 0.88403565, "learning_rate": 3.999077133008022e-06, "loss": 0.90768576, "num_input_tokens_seen": 6987135, "step": 328, "time_per_iteration": 2.7407875061035156 }, { "auxiliary_loss_clip": 0.01322855, "auxiliary_loss_mlp": 0.01057115, "balance_loss_clip": 1.08533061, "balance_loss_mlp": 1.04235709, "epoch": 0.03955991102026093, "flos": 29168837291520.0, "grad_norm": 1.9324334263933953, "language_loss": 0.90669274, "learning_rate": 3.9990533199432145e-06, "loss": 0.9304924, "num_input_tokens_seen": 7008630, "step": 329, "time_per_iteration": 2.8620643615722656 }, { "auxiliary_loss_clip": 0.01323108, "auxiliary_loss_mlp": 0.01050438, "balance_loss_clip": 1.08476293, "balance_loss_mlp": 1.03615069, "epoch": 0.03968015391090002, "flos": 17602441695360.0, "grad_norm": 2.6880827708180797, "language_loss": 0.75492859, "learning_rate": 3.999029203623608e-06, "loss": 0.77866399, "num_input_tokens_seen": 7026350, "step": 330, "time_per_iteration": 3.8683688640594482 }, { "auxiliary_loss_clip": 0.013158, "auxiliary_loss_mlp": 0.01053797, "balance_loss_clip": 1.08127153, "balance_loss_mlp": 1.03703582, "epoch": 0.03980039680153911, "flos": 21799285752960.0, "grad_norm": 2.00844354779415, "language_loss": 0.86717737, "learning_rate": 3.99900478405286e-06, "loss": 0.89087331, "num_input_tokens_seen": 7045660, "step": 331, "time_per_iteration": 4.632981538772583 }, { "auxiliary_loss_clip": 0.01316029, "auxiliary_loss_mlp": 0.01054014, "balance_loss_clip": 1.08300781, "balance_loss_mlp": 1.03872561, "epoch": 0.0399206396921782, "flos": 15195134148480.0, "grad_norm": 3.0988097216206376, "language_loss": 0.8273617, "learning_rate": 3.998980061234676e-06, "loss": 0.85106224, "num_input_tokens_seen": 7063575, "step": 332, "time_per_iteration": 3.882838010787964 }, { "auxiliary_loss_clip": 0.01317283, "auxiliary_loss_mlp": 0.01052665, "balance_loss_clip": 1.08234, "balance_loss_mlp": 1.03756118, "epoch": 0.040040882582817294, "flos": 14422910630400.0, "grad_norm": 2.548862003460459, "language_loss": 0.75827432, "learning_rate": 3.9989550351728055e-06, "loss": 0.78197378, "num_input_tokens_seen": 7080505, "step": 333, "time_per_iteration": 2.7380740642547607 }, { "auxiliary_loss_clip": 0.01313509, "auxiliary_loss_mlp": 0.01055305, "balance_loss_clip": 1.08216906, "balance_loss_mlp": 1.04053485, "epoch": 0.040161125473456384, "flos": 19280906375040.0, "grad_norm": 2.637939703896791, "language_loss": 0.84581316, "learning_rate": 3.998929705871046e-06, "loss": 0.86950129, "num_input_tokens_seen": 7097860, "step": 334, "time_per_iteration": 2.7103281021118164 }, { "auxiliary_loss_clip": 0.01313175, "auxiliary_loss_mlp": 0.01053588, "balance_loss_clip": 1.08238459, "balance_loss_mlp": 1.03912783, "epoch": 0.040281368364095474, "flos": 17821101738240.0, "grad_norm": 2.516336588705291, "language_loss": 0.88734138, "learning_rate": 3.99890407333324e-06, "loss": 0.91100895, "num_input_tokens_seen": 7116390, "step": 335, "time_per_iteration": 2.765214681625366 }, { "auxiliary_loss_clip": 0.0130954, "auxiliary_loss_mlp": 0.01049048, "balance_loss_clip": 1.07711852, "balance_loss_mlp": 1.03445721, "epoch": 0.040401611254734564, "flos": 19573757959680.0, "grad_norm": 1.7933976494645534, "language_loss": 0.86907059, "learning_rate": 3.998878137563275e-06, "loss": 0.89265651, "num_input_tokens_seen": 7135940, "step": 336, "time_per_iteration": 2.714104175567627 }, { "auxiliary_loss_clip": 0.01316871, "auxiliary_loss_mlp": 0.01044834, "balance_loss_clip": 1.08380342, "balance_loss_mlp": 1.03052306, "epoch": 0.040521854145373654, "flos": 22054466949120.0, "grad_norm": 1.90611798061865, "language_loss": 0.8562119, "learning_rate": 3.998851898565085e-06, "loss": 0.87982905, "num_input_tokens_seen": 7155745, "step": 337, "time_per_iteration": 2.7955691814422607 }, { "auxiliary_loss_clip": 0.01309754, "auxiliary_loss_mlp": 0.01049742, "balance_loss_clip": 1.07680273, "balance_loss_mlp": 1.03571057, "epoch": 0.04064209703601274, "flos": 22674644196480.0, "grad_norm": 2.2198804153280376, "language_loss": 0.82968748, "learning_rate": 3.998825356342653e-06, "loss": 0.85328245, "num_input_tokens_seen": 7175920, "step": 338, "time_per_iteration": 2.7004458904266357 }, { "auxiliary_loss_clip": 0.01314716, "auxiliary_loss_mlp": 0.01056698, "balance_loss_clip": 1.08073735, "balance_loss_mlp": 1.04135537, "epoch": 0.04076233992665183, "flos": 38582172783360.0, "grad_norm": 3.765694877542609, "language_loss": 0.72840965, "learning_rate": 3.998798510900003e-06, "loss": 0.75212383, "num_input_tokens_seen": 7198720, "step": 339, "time_per_iteration": 2.9000508785247803 }, { "auxiliary_loss_clip": 0.01316727, "auxiliary_loss_mlp": 0.010445, "balance_loss_clip": 1.08119261, "balance_loss_mlp": 1.0306654, "epoch": 0.04088258281729093, "flos": 25885309374720.0, "grad_norm": 2.6225422248489116, "language_loss": 0.83925784, "learning_rate": 3.998771362241207e-06, "loss": 0.8628701, "num_input_tokens_seen": 7219125, "step": 340, "time_per_iteration": 2.7893195152282715 }, { "auxiliary_loss_clip": 0.01309112, "auxiliary_loss_mlp": 0.01046898, "balance_loss_clip": 1.07872987, "balance_loss_mlp": 1.03283751, "epoch": 0.04100282570793002, "flos": 19789832223360.0, "grad_norm": 2.663769385957724, "language_loss": 0.88056576, "learning_rate": 3.998743910370385e-06, "loss": 0.90412587, "num_input_tokens_seen": 7237985, "step": 341, "time_per_iteration": 2.761690616607666 }, { "auxiliary_loss_clip": 0.01315972, "auxiliary_loss_mlp": 0.01060466, "balance_loss_clip": 1.08575475, "balance_loss_mlp": 1.04618442, "epoch": 0.04112306859856911, "flos": 22565152563840.0, "grad_norm": 2.2883476013529487, "language_loss": 0.73520863, "learning_rate": 3.998716155291702e-06, "loss": 0.758973, "num_input_tokens_seen": 7255825, "step": 342, "time_per_iteration": 2.824364423751831 }, { "auxiliary_loss_clip": 0.01316543, "auxiliary_loss_mlp": 0.01053018, "balance_loss_clip": 1.08386493, "balance_loss_mlp": 1.03746164, "epoch": 0.0412433114892082, "flos": 25040654081280.0, "grad_norm": 2.2826920627467544, "language_loss": 0.90553474, "learning_rate": 3.998688097009366e-06, "loss": 0.92923039, "num_input_tokens_seen": 7276590, "step": 343, "time_per_iteration": 2.756237268447876 }, { "auxiliary_loss_clip": 0.01313428, "auxiliary_loss_mlp": 0.01060434, "balance_loss_clip": 1.08419061, "balance_loss_mlp": 1.04516339, "epoch": 0.04136355437984729, "flos": 25191371548800.0, "grad_norm": 2.136372045784205, "language_loss": 0.79853332, "learning_rate": 3.998659735527636e-06, "loss": 0.82227194, "num_input_tokens_seen": 7295680, "step": 344, "time_per_iteration": 2.8397915363311768 }, { "auxiliary_loss_clip": 0.01313267, "auxiliary_loss_mlp": 0.0104795, "balance_loss_clip": 1.08095562, "balance_loss_mlp": 1.03432417, "epoch": 0.04148379727048638, "flos": 22966777509120.0, "grad_norm": 2.0951017675600436, "language_loss": 0.77735525, "learning_rate": 3.998631070850813e-06, "loss": 0.8009674, "num_input_tokens_seen": 7316300, "step": 345, "time_per_iteration": 2.817065954208374 }, { "auxiliary_loss_clip": 0.01310561, "auxiliary_loss_mlp": 0.01054527, "balance_loss_clip": 1.08035767, "balance_loss_mlp": 1.04009116, "epoch": 0.041604040161125476, "flos": 14063481187200.0, "grad_norm": 2.47871274647553, "language_loss": 0.83278584, "learning_rate": 3.9986021029832455e-06, "loss": 0.85643673, "num_input_tokens_seen": 7333615, "step": 346, "time_per_iteration": 2.7748830318450928 }, { "auxiliary_loss_clip": 0.01312455, "auxiliary_loss_mlp": 0.01047333, "balance_loss_clip": 1.07941318, "balance_loss_mlp": 1.03189576, "epoch": 0.041724283051764566, "flos": 12091877614080.0, "grad_norm": 4.789166854032187, "language_loss": 0.91402829, "learning_rate": 3.9985728319293285e-06, "loss": 0.93762624, "num_input_tokens_seen": 7347590, "step": 347, "time_per_iteration": 2.795156240463257 }, { "auxiliary_loss_clip": 0.01319434, "auxiliary_loss_mlp": 0.01053199, "balance_loss_clip": 1.08140278, "balance_loss_mlp": 1.03732061, "epoch": 0.041844525942403656, "flos": 12385303816320.0, "grad_norm": 2.215107671928857, "language_loss": 0.85048759, "learning_rate": 3.998543257693501e-06, "loss": 0.87421393, "num_input_tokens_seen": 7364345, "step": 348, "time_per_iteration": 2.7432749271392822 }, { "auxiliary_loss_clip": 0.01310733, "auxiliary_loss_mlp": 0.01057826, "balance_loss_clip": 1.08095694, "balance_loss_mlp": 1.04286504, "epoch": 0.041964768833042745, "flos": 23769345041280.0, "grad_norm": 2.328517451247474, "language_loss": 0.87828237, "learning_rate": 3.998513380280251e-06, "loss": 0.90196794, "num_input_tokens_seen": 7384625, "step": 349, "time_per_iteration": 2.8013176918029785 }, { "auxiliary_loss_clip": 0.01319262, "auxiliary_loss_mlp": 0.01051371, "balance_loss_clip": 1.08251429, "balance_loss_mlp": 1.03739953, "epoch": 0.042085011723681835, "flos": 11875336473600.0, "grad_norm": 3.695900934230636, "language_loss": 0.94798589, "learning_rate": 3.99848319969411e-06, "loss": 0.9716922, "num_input_tokens_seen": 7402225, "step": 350, "time_per_iteration": 2.6803433895111084 }, { "auxiliary_loss_clip": 0.01320512, "auxiliary_loss_mlp": 0.0106891, "balance_loss_clip": 1.08640742, "balance_loss_mlp": 1.05325747, "epoch": 0.042205254614320925, "flos": 16873957964160.0, "grad_norm": 2.360719645122846, "language_loss": 0.79276305, "learning_rate": 3.9984527159396564e-06, "loss": 0.8166573, "num_input_tokens_seen": 7420865, "step": 351, "time_per_iteration": 2.749013900756836 }, { "auxiliary_loss_clip": 0.01311427, "auxiliary_loss_mlp": 0.01050128, "balance_loss_clip": 1.07899165, "balance_loss_mlp": 1.03574586, "epoch": 0.04232549750496002, "flos": 25118508810240.0, "grad_norm": 3.642133934889627, "language_loss": 0.84257221, "learning_rate": 3.9984219290215154e-06, "loss": 0.86618781, "num_input_tokens_seen": 7441040, "step": 352, "time_per_iteration": 2.754053831100464 }, { "auxiliary_loss_clip": 0.01311972, "auxiliary_loss_mlp": 0.01055437, "balance_loss_clip": 1.08316958, "balance_loss_mlp": 1.04093564, "epoch": 0.04244574039559911, "flos": 26724541714560.0, "grad_norm": 1.7135410212828404, "language_loss": 0.89209425, "learning_rate": 3.998390838944356e-06, "loss": 0.91576833, "num_input_tokens_seen": 7462545, "step": 353, "time_per_iteration": 2.936676263809204 }, { "auxiliary_loss_clip": 0.01311354, "auxiliary_loss_mlp": 0.01045954, "balance_loss_clip": 1.08047271, "balance_loss_mlp": 1.03063548, "epoch": 0.0425659832862382, "flos": 20923244951040.0, "grad_norm": 2.225475664262407, "language_loss": 0.90558171, "learning_rate": 3.998359445712895e-06, "loss": 0.92915487, "num_input_tokens_seen": 7481650, "step": 354, "time_per_iteration": 2.7927215099334717 }, { "auxiliary_loss_clip": 0.01314666, "auxiliary_loss_mlp": 0.01045379, "balance_loss_clip": 1.08026826, "balance_loss_mlp": 1.03027499, "epoch": 0.04268622617687729, "flos": 23331127115520.0, "grad_norm": 43.59835216455777, "language_loss": 0.81301075, "learning_rate": 3.9983277493318955e-06, "loss": 0.83661115, "num_input_tokens_seen": 7500945, "step": 355, "time_per_iteration": 2.7937681674957275 }, { "auxiliary_loss_clip": 0.0131354, "auxiliary_loss_mlp": 0.01057456, "balance_loss_clip": 1.08108521, "balance_loss_mlp": 1.04185152, "epoch": 0.04280646906751638, "flos": 25994010908160.0, "grad_norm": 3.476603120096311, "language_loss": 0.81261718, "learning_rate": 3.998295749806165e-06, "loss": 0.83632714, "num_input_tokens_seen": 7522170, "step": 356, "time_per_iteration": 3.759117841720581 }, { "auxiliary_loss_clip": 0.01313723, "auxiliary_loss_mlp": 0.01056867, "balance_loss_clip": 1.08263218, "balance_loss_mlp": 1.04284823, "epoch": 0.04292671195815547, "flos": 26906824258560.0, "grad_norm": 2.386742317675454, "language_loss": 0.83394146, "learning_rate": 3.998263447140558e-06, "loss": 0.85764742, "num_input_tokens_seen": 7542370, "step": 357, "time_per_iteration": 2.736600160598755 }, { "auxiliary_loss_clip": 0.01304317, "auxiliary_loss_mlp": 0.01048709, "balance_loss_clip": 1.07463646, "balance_loss_mlp": 1.03410602, "epoch": 0.04304695484879457, "flos": 39457315745280.0, "grad_norm": 3.070586403914426, "language_loss": 0.81755793, "learning_rate": 3.998230841339976e-06, "loss": 0.84108824, "num_input_tokens_seen": 7564380, "step": 358, "time_per_iteration": 3.832752227783203 }, { "auxiliary_loss_clip": 0.01309974, "auxiliary_loss_mlp": 0.01045408, "balance_loss_clip": 1.07956445, "balance_loss_mlp": 1.03088832, "epoch": 0.04316719773943366, "flos": 19646297475840.0, "grad_norm": 2.123033186832179, "language_loss": 0.8504743, "learning_rate": 3.998197932409363e-06, "loss": 0.87402809, "num_input_tokens_seen": 7582390, "step": 359, "time_per_iteration": 3.9092698097229004 }, { "auxiliary_loss_clip": 0.0130335, "auxiliary_loss_mlp": 0.01046046, "balance_loss_clip": 1.0762049, "balance_loss_mlp": 1.03128767, "epoch": 0.04328744063007275, "flos": 22452320966400.0, "grad_norm": 2.4504945572706647, "language_loss": 0.86193073, "learning_rate": 3.9981647203537125e-06, "loss": 0.88542473, "num_input_tokens_seen": 7599890, "step": 360, "time_per_iteration": 2.724128007888794 }, { "auxiliary_loss_clip": 0.0131106, "auxiliary_loss_mlp": 0.01058025, "balance_loss_clip": 1.07923436, "balance_loss_mlp": 1.04186654, "epoch": 0.04340768352071184, "flos": 21283033530240.0, "grad_norm": 2.3386533817198587, "language_loss": 0.96085727, "learning_rate": 3.998131205178063e-06, "loss": 0.98454809, "num_input_tokens_seen": 7618360, "step": 361, "time_per_iteration": 2.800992965698242 }, { "auxiliary_loss_clip": 0.01312787, "auxiliary_loss_mlp": 0.01050746, "balance_loss_clip": 1.07969713, "balance_loss_mlp": 1.03459334, "epoch": 0.04352792641135093, "flos": 11583705951360.0, "grad_norm": 3.388259968577769, "language_loss": 0.76336157, "learning_rate": 3.998097386887498e-06, "loss": 0.7869969, "num_input_tokens_seen": 7635435, "step": 362, "time_per_iteration": 2.779536247253418 }, { "auxiliary_loss_clip": 0.01303543, "auxiliary_loss_mlp": 0.01059238, "balance_loss_clip": 1.07634187, "balance_loss_mlp": 1.04461145, "epoch": 0.04364816930199002, "flos": 23623547736960.0, "grad_norm": 1.7629160097241476, "language_loss": 0.84834599, "learning_rate": 3.998063265487148e-06, "loss": 0.87197375, "num_input_tokens_seen": 7656485, "step": 363, "time_per_iteration": 2.835533380508423 }, { "auxiliary_loss_clip": 0.01310417, "auxiliary_loss_mlp": 0.01053068, "balance_loss_clip": 1.08101773, "balance_loss_mlp": 1.03717732, "epoch": 0.043768412192629114, "flos": 14429734214400.0, "grad_norm": 2.053587091907351, "language_loss": 0.80956304, "learning_rate": 3.99802884098219e-06, "loss": 0.83319795, "num_input_tokens_seen": 7674595, "step": 364, "time_per_iteration": 2.880204439163208 }, { "auxiliary_loss_clip": 0.01310059, "auxiliary_loss_mlp": 0.01055121, "balance_loss_clip": 1.07814956, "balance_loss_mlp": 1.04058969, "epoch": 0.043888655083268203, "flos": 26468893641600.0, "grad_norm": 2.6373466496598725, "language_loss": 0.8264221, "learning_rate": 3.997994113377845e-06, "loss": 0.85007381, "num_input_tokens_seen": 7693495, "step": 365, "time_per_iteration": 2.769638776779175 }, { "auxiliary_loss_clip": 0.01306658, "auxiliary_loss_mlp": 0.01060777, "balance_loss_clip": 1.07694268, "balance_loss_mlp": 1.04516053, "epoch": 0.04400889797390729, "flos": 27235263242880.0, "grad_norm": 2.0412094548943083, "language_loss": 0.83621633, "learning_rate": 3.9979590826793815e-06, "loss": 0.8598907, "num_input_tokens_seen": 7714685, "step": 366, "time_per_iteration": 2.7885355949401855 }, { "auxiliary_loss_clip": 0.01315221, "auxiliary_loss_mlp": 0.01050785, "balance_loss_clip": 1.08196449, "balance_loss_mlp": 1.03341675, "epoch": 0.04412914086454638, "flos": 20119528183680.0, "grad_norm": 7.537453611574771, "language_loss": 0.81035703, "learning_rate": 3.997923748892113e-06, "loss": 0.83401704, "num_input_tokens_seen": 7734005, "step": 367, "time_per_iteration": 2.689116954803467 }, { "auxiliary_loss_clip": 0.01306758, "auxiliary_loss_mlp": 0.01055538, "balance_loss_clip": 1.07893193, "balance_loss_mlp": 1.04105401, "epoch": 0.04424938375518547, "flos": 22604618632320.0, "grad_norm": 1.8213920446825285, "language_loss": 0.8862623, "learning_rate": 3.9978881120214015e-06, "loss": 0.90988529, "num_input_tokens_seen": 7755525, "step": 368, "time_per_iteration": 2.7897732257843018 }, { "auxiliary_loss_clip": 0.01311162, "auxiliary_loss_mlp": 0.01050279, "balance_loss_clip": 1.07911611, "balance_loss_mlp": 1.03401875, "epoch": 0.04436962664582456, "flos": 24132365844480.0, "grad_norm": 1.895578955330728, "language_loss": 0.79244542, "learning_rate": 3.997852172072652e-06, "loss": 0.81605983, "num_input_tokens_seen": 7776740, "step": 369, "time_per_iteration": 2.7263665199279785 }, { "auxiliary_loss_clip": 0.01314723, "auxiliary_loss_mlp": 0.0105441, "balance_loss_clip": 1.08196783, "balance_loss_mlp": 1.03848326, "epoch": 0.04448986953646366, "flos": 18222906251520.0, "grad_norm": 2.659081448354084, "language_loss": 0.8964479, "learning_rate": 3.9978159290513155e-06, "loss": 0.92013919, "num_input_tokens_seen": 7794820, "step": 370, "time_per_iteration": 2.7867047786712646 }, { "auxiliary_loss_clip": 0.01316671, "auxiliary_loss_mlp": 0.0105424, "balance_loss_clip": 1.08236599, "balance_loss_mlp": 1.03809881, "epoch": 0.04461011242710275, "flos": 30117920400000.0, "grad_norm": 1.782290108576816, "language_loss": 0.80159664, "learning_rate": 3.997779382962892e-06, "loss": 0.8253057, "num_input_tokens_seen": 7817705, "step": 371, "time_per_iteration": 2.785094976425171 }, { "auxiliary_loss_clip": 0.01305194, "auxiliary_loss_mlp": 0.01045817, "balance_loss_clip": 1.07727957, "balance_loss_mlp": 1.03027201, "epoch": 0.04473035531774184, "flos": 29752529299200.0, "grad_norm": 2.8184705387512223, "language_loss": 0.737813, "learning_rate": 3.997742533812924e-06, "loss": 0.76132315, "num_input_tokens_seen": 7840970, "step": 372, "time_per_iteration": 2.7436835765838623 }, { "auxiliary_loss_clip": 0.01312565, "auxiliary_loss_mlp": 0.01047417, "balance_loss_clip": 1.08133316, "balance_loss_mlp": 1.03283715, "epoch": 0.04485059820838093, "flos": 13151565676800.0, "grad_norm": 2.4771440070823405, "language_loss": 0.92589653, "learning_rate": 3.997705381607001e-06, "loss": 0.94949627, "num_input_tokens_seen": 7857785, "step": 373, "time_per_iteration": 2.7933452129364014 }, { "auxiliary_loss_clip": 0.01221121, "auxiliary_loss_mlp": 0.01014514, "balance_loss_clip": 1.06260645, "balance_loss_mlp": 1.00893521, "epoch": 0.04497084109902002, "flos": 68094209548800.0, "grad_norm": 0.9778772275310355, "language_loss": 0.60309386, "learning_rate": 3.997667926350761e-06, "loss": 0.62545019, "num_input_tokens_seen": 7916115, "step": 374, "time_per_iteration": 3.2055187225341797 }, { "auxiliary_loss_clip": 0.01220094, "auxiliary_loss_mlp": 0.01012574, "balance_loss_clip": 1.06164312, "balance_loss_mlp": 1.00701928, "epoch": 0.04509108398965911, "flos": 64342263346560.0, "grad_norm": 0.902547418172891, "language_loss": 0.57797843, "learning_rate": 3.997630168049886e-06, "loss": 0.60030514, "num_input_tokens_seen": 7974480, "step": 375, "time_per_iteration": 3.4122965335845947 }, { "auxiliary_loss_clip": 0.01315334, "auxiliary_loss_mlp": 0.01051498, "balance_loss_clip": 1.0818013, "balance_loss_mlp": 1.03536904, "epoch": 0.045211326880298205, "flos": 22271115830400.0, "grad_norm": 2.241840362917402, "language_loss": 0.77451432, "learning_rate": 3.997592106710101e-06, "loss": 0.79818261, "num_input_tokens_seen": 7993940, "step": 376, "time_per_iteration": 2.84309458732605 }, { "auxiliary_loss_clip": 0.01306784, "auxiliary_loss_mlp": 0.01046848, "balance_loss_clip": 1.07796884, "balance_loss_mlp": 1.03229904, "epoch": 0.045331569770937295, "flos": 32159441796480.0, "grad_norm": 2.9477322716652514, "language_loss": 0.65890765, "learning_rate": 3.997553742337182e-06, "loss": 0.68244398, "num_input_tokens_seen": 8013365, "step": 377, "time_per_iteration": 2.8749940395355225 }, { "auxiliary_loss_clip": 0.0130995, "auxiliary_loss_mlp": 0.01052048, "balance_loss_clip": 1.08152187, "balance_loss_mlp": 1.03779042, "epoch": 0.045451812661576385, "flos": 22163455791360.0, "grad_norm": 1.872512254523775, "language_loss": 0.91333961, "learning_rate": 3.997515074936949e-06, "loss": 0.93695962, "num_input_tokens_seen": 8034240, "step": 378, "time_per_iteration": 2.8834428787231445 }, { "auxiliary_loss_clip": 0.01307062, "auxiliary_loss_mlp": 0.01047152, "balance_loss_clip": 1.08090675, "balance_loss_mlp": 1.0319643, "epoch": 0.045572055552215475, "flos": 16581968305920.0, "grad_norm": 3.9032667373435985, "language_loss": 0.86754322, "learning_rate": 3.997476104515268e-06, "loss": 0.89108533, "num_input_tokens_seen": 8052430, "step": 379, "time_per_iteration": 2.7815003395080566 }, { "auxiliary_loss_clip": 0.01302889, "auxiliary_loss_mlp": 0.01053317, "balance_loss_clip": 1.07869601, "balance_loss_mlp": 1.03741467, "epoch": 0.045692298442854565, "flos": 17603375448960.0, "grad_norm": 1.9063089782145275, "language_loss": 0.7755304, "learning_rate": 3.9974368310780485e-06, "loss": 0.79909241, "num_input_tokens_seen": 8069605, "step": 380, "time_per_iteration": 2.828080892562866 }, { "auxiliary_loss_clip": 0.01317116, "auxiliary_loss_mlp": 0.01056186, "balance_loss_clip": 1.08253288, "balance_loss_mlp": 1.03893614, "epoch": 0.045812541333493655, "flos": 26761098781440.0, "grad_norm": 5.810882235748213, "language_loss": 0.74407727, "learning_rate": 3.997397254631251e-06, "loss": 0.76781029, "num_input_tokens_seen": 8090225, "step": 381, "time_per_iteration": 2.8501110076904297 }, { "auxiliary_loss_clip": 0.01210444, "auxiliary_loss_mlp": 0.01011837, "balance_loss_clip": 1.05517554, "balance_loss_mlp": 1.00668681, "epoch": 0.04593278422413275, "flos": 60250349894400.0, "grad_norm": 0.8305384380078487, "language_loss": 0.60073268, "learning_rate": 3.997357375180878e-06, "loss": 0.6229555, "num_input_tokens_seen": 8154505, "step": 382, "time_per_iteration": 4.500580549240112 }, { "auxiliary_loss_clip": 0.01307356, "auxiliary_loss_mlp": 0.01056247, "balance_loss_clip": 1.08012509, "balance_loss_mlp": 1.04166138, "epoch": 0.04605302711477184, "flos": 21799249839360.0, "grad_norm": 2.3142964991195196, "language_loss": 0.75408721, "learning_rate": 3.997317192732979e-06, "loss": 0.77772331, "num_input_tokens_seen": 8173285, "step": 383, "time_per_iteration": 2.959583282470703 }, { "auxiliary_loss_clip": 0.01308618, "auxiliary_loss_mlp": 0.01065997, "balance_loss_clip": 1.07984984, "balance_loss_mlp": 1.05017781, "epoch": 0.04617327000541093, "flos": 19459705299840.0, "grad_norm": 2.014288349286507, "language_loss": 0.82559621, "learning_rate": 3.99727670729365e-06, "loss": 0.84934241, "num_input_tokens_seen": 8191845, "step": 384, "time_per_iteration": 2.7242226600646973 }, { "auxiliary_loss_clip": 0.01307234, "auxiliary_loss_mlp": 0.01055052, "balance_loss_clip": 1.08123231, "balance_loss_mlp": 1.03922105, "epoch": 0.04629351289605002, "flos": 25411468135680.0, "grad_norm": 1.8317714153645084, "language_loss": 0.78075671, "learning_rate": 3.997235918869033e-06, "loss": 0.80437958, "num_input_tokens_seen": 8212880, "step": 385, "time_per_iteration": 5.008902072906494 }, { "auxiliary_loss_clip": 0.0130597, "auxiliary_loss_mlp": 0.01043451, "balance_loss_clip": 1.0819242, "balance_loss_mlp": 1.0293963, "epoch": 0.04641375578668911, "flos": 20558284813440.0, "grad_norm": 2.5083688259788524, "language_loss": 0.82331729, "learning_rate": 3.997194827465315e-06, "loss": 0.84681153, "num_input_tokens_seen": 8231475, "step": 386, "time_per_iteration": 2.9355087280273438 }, { "auxiliary_loss_clip": 0.0130448, "auxiliary_loss_mlp": 0.01046823, "balance_loss_clip": 1.07854962, "balance_loss_mlp": 1.03128994, "epoch": 0.0465339986773282, "flos": 13188661447680.0, "grad_norm": 3.5503282146597943, "language_loss": 0.91470516, "learning_rate": 3.997153433088728e-06, "loss": 0.93821824, "num_input_tokens_seen": 8248600, "step": 387, "time_per_iteration": 2.8121206760406494 }, { "auxiliary_loss_clip": 0.01310875, "auxiliary_loss_mlp": 0.01049989, "balance_loss_clip": 1.08233356, "balance_loss_mlp": 1.03550482, "epoch": 0.0466542415679673, "flos": 25556547168000.0, "grad_norm": 1.9370053125954112, "language_loss": 0.80983108, "learning_rate": 3.997111735745554e-06, "loss": 0.83343971, "num_input_tokens_seen": 8271570, "step": 388, "time_per_iteration": 2.7902567386627197 }, { "auxiliary_loss_clip": 0.01307708, "auxiliary_loss_mlp": 0.01049209, "balance_loss_clip": 1.08184075, "balance_loss_mlp": 1.0345943, "epoch": 0.04677448445860639, "flos": 22236749493120.0, "grad_norm": 2.7190577462768943, "language_loss": 0.82601404, "learning_rate": 3.997069735442118e-06, "loss": 0.84958321, "num_input_tokens_seen": 8291265, "step": 389, "time_per_iteration": 2.924561023712158 }, { "auxiliary_loss_clip": 0.01299192, "auxiliary_loss_mlp": 0.01059842, "balance_loss_clip": 1.0756644, "balance_loss_mlp": 1.045614, "epoch": 0.04689472734924548, "flos": 28147825198080.0, "grad_norm": 1.432437553398722, "language_loss": 0.8034265, "learning_rate": 3.997027432184792e-06, "loss": 0.82701683, "num_input_tokens_seen": 8315925, "step": 390, "time_per_iteration": 2.886894464492798 }, { "auxiliary_loss_clip": 0.01304138, "auxiliary_loss_mlp": 0.01052953, "balance_loss_clip": 1.07912672, "balance_loss_mlp": 1.03872502, "epoch": 0.04701497023988457, "flos": 23148952312320.0, "grad_norm": 2.9003955157261765, "language_loss": 0.89283967, "learning_rate": 3.99698482597999e-06, "loss": 0.91641057, "num_input_tokens_seen": 8333605, "step": 391, "time_per_iteration": 2.9169225692749023 }, { "auxiliary_loss_clip": 0.01201137, "auxiliary_loss_mlp": 0.01010957, "balance_loss_clip": 1.04735184, "balance_loss_mlp": 1.00566375, "epoch": 0.04713521313052366, "flos": 64827668764800.0, "grad_norm": 0.8696193704754265, "language_loss": 0.63910252, "learning_rate": 3.99694191683418e-06, "loss": 0.66122341, "num_input_tokens_seen": 8394405, "step": 392, "time_per_iteration": 3.4143972396850586 }, { "auxiliary_loss_clip": 0.01311523, "auxiliary_loss_mlp": 0.01057431, "balance_loss_clip": 1.08333921, "balance_loss_mlp": 1.04198742, "epoch": 0.047255456021162746, "flos": 18771585477120.0, "grad_norm": 1.859862726107589, "language_loss": 0.81508207, "learning_rate": 3.996898704753867e-06, "loss": 0.83877158, "num_input_tokens_seen": 8412355, "step": 393, "time_per_iteration": 2.8709309101104736 }, { "auxiliary_loss_clip": 0.01306568, "auxiliary_loss_mlp": 0.01046676, "balance_loss_clip": 1.07994676, "balance_loss_mlp": 1.0327698, "epoch": 0.04737569891180184, "flos": 22053820504320.0, "grad_norm": 2.423641670456587, "language_loss": 0.87841177, "learning_rate": 3.996855189745609e-06, "loss": 0.90194428, "num_input_tokens_seen": 8431620, "step": 394, "time_per_iteration": 2.883173704147339 }, { "auxiliary_loss_clip": 0.01302269, "auxiliary_loss_mlp": 0.01048132, "balance_loss_clip": 1.07774425, "balance_loss_mlp": 1.03434539, "epoch": 0.04749594180244093, "flos": 29057370410880.0, "grad_norm": 15.524411930082127, "language_loss": 0.92649788, "learning_rate": 3.996811371816007e-06, "loss": 0.9500019, "num_input_tokens_seen": 8454045, "step": 395, "time_per_iteration": 3.0118556022644043 }, { "auxiliary_loss_clip": 0.01304828, "auxiliary_loss_mlp": 0.01056182, "balance_loss_clip": 1.08071375, "balance_loss_mlp": 1.04093504, "epoch": 0.04761618469308002, "flos": 35112268172160.0, "grad_norm": 1.913689171892591, "language_loss": 0.78205383, "learning_rate": 3.996767250971707e-06, "loss": 0.80566394, "num_input_tokens_seen": 8476785, "step": 396, "time_per_iteration": 2.8988349437713623 }, { "auxiliary_loss_clip": 0.01305607, "auxiliary_loss_mlp": 0.01044677, "balance_loss_clip": 1.0825088, "balance_loss_mlp": 1.03058648, "epoch": 0.04773642758371911, "flos": 25630702796160.0, "grad_norm": 3.226064462080889, "language_loss": 0.86643636, "learning_rate": 3.996722827219403e-06, "loss": 0.88993919, "num_input_tokens_seen": 8498400, "step": 397, "time_per_iteration": 2.849125862121582 }, { "auxiliary_loss_clip": 0.01309177, "auxiliary_loss_mlp": 0.01051209, "balance_loss_clip": 1.08293295, "balance_loss_mlp": 1.03643894, "epoch": 0.0478566704743582, "flos": 20631506688000.0, "grad_norm": 3.5780813759327743, "language_loss": 0.82724369, "learning_rate": 3.996678100565833e-06, "loss": 0.85084754, "num_input_tokens_seen": 8517455, "step": 398, "time_per_iteration": 2.8905787467956543 }, { "auxiliary_loss_clip": 0.01303965, "auxiliary_loss_mlp": 0.01046397, "balance_loss_clip": 1.07876253, "balance_loss_mlp": 1.03302121, "epoch": 0.04797691336499729, "flos": 18835721210880.0, "grad_norm": 2.9504322616974012, "language_loss": 0.88603401, "learning_rate": 3.996633071017783e-06, "loss": 0.90953755, "num_input_tokens_seen": 8534085, "step": 399, "time_per_iteration": 2.808953046798706 }, { "auxiliary_loss_clip": 0.01303586, "auxiliary_loss_mlp": 0.01049598, "balance_loss_clip": 1.08166766, "balance_loss_mlp": 1.03584111, "epoch": 0.04809715625563638, "flos": 21099673578240.0, "grad_norm": 2.144290163886173, "language_loss": 0.81269991, "learning_rate": 3.996587738582084e-06, "loss": 0.83623177, "num_input_tokens_seen": 8550885, "step": 400, "time_per_iteration": 2.686483383178711 }, { "auxiliary_loss_clip": 0.01299535, "auxiliary_loss_mlp": 0.01038237, "balance_loss_clip": 1.07765186, "balance_loss_mlp": 1.0248673, "epoch": 0.04821739914627548, "flos": 23805650712960.0, "grad_norm": 5.1329786428410324, "language_loss": 0.86427182, "learning_rate": 3.9965421032656115e-06, "loss": 0.8876496, "num_input_tokens_seen": 8570815, "step": 401, "time_per_iteration": 2.8014490604400635 }, { "auxiliary_loss_clip": 0.01304145, "auxiliary_loss_mlp": 0.01046742, "balance_loss_clip": 1.07964396, "balance_loss_mlp": 1.03261542, "epoch": 0.04833764203691457, "flos": 22200587475840.0, "grad_norm": 2.5410994778406444, "language_loss": 0.94569767, "learning_rate": 3.99649616507529e-06, "loss": 0.96920657, "num_input_tokens_seen": 8589910, "step": 402, "time_per_iteration": 2.8306703567504883 }, { "auxiliary_loss_clip": 0.01198711, "auxiliary_loss_mlp": 0.01011659, "balance_loss_clip": 1.04272187, "balance_loss_mlp": 1.00660408, "epoch": 0.04845788492755366, "flos": 65904376896000.0, "grad_norm": 0.8896308718735041, "language_loss": 0.63157308, "learning_rate": 3.996449924018088e-06, "loss": 0.65367675, "num_input_tokens_seen": 8650370, "step": 403, "time_per_iteration": 3.3656868934631348 }, { "auxiliary_loss_clip": 0.01298803, "auxiliary_loss_mlp": 0.01054061, "balance_loss_clip": 1.07758093, "balance_loss_mlp": 1.04161537, "epoch": 0.04857812781819275, "flos": 19281301424640.0, "grad_norm": 2.091073042369383, "language_loss": 0.79458237, "learning_rate": 3.99640338010102e-06, "loss": 0.81811106, "num_input_tokens_seen": 8669475, "step": 404, "time_per_iteration": 2.7586417198181152 }, { "auxiliary_loss_clip": 0.01296484, "auxiliary_loss_mlp": 0.01044874, "balance_loss_clip": 1.07529342, "balance_loss_mlp": 1.03094435, "epoch": 0.04869837070883184, "flos": 24062376193920.0, "grad_norm": 2.1690825572083257, "language_loss": 0.7892946, "learning_rate": 3.996356533331146e-06, "loss": 0.81270814, "num_input_tokens_seen": 8691345, "step": 405, "time_per_iteration": 2.7836930751800537 }, { "auxiliary_loss_clip": 0.01307688, "auxiliary_loss_mlp": 0.01044572, "balance_loss_clip": 1.08125842, "balance_loss_mlp": 1.03027892, "epoch": 0.04881861359947093, "flos": 25187169657600.0, "grad_norm": 4.007751302486021, "language_loss": 0.61828035, "learning_rate": 3.996309383715573e-06, "loss": 0.64180291, "num_input_tokens_seen": 8710125, "step": 406, "time_per_iteration": 2.847522258758545 }, { "auxiliary_loss_clip": 0.01306523, "auxiliary_loss_mlp": 0.01050036, "balance_loss_clip": 1.08001781, "balance_loss_mlp": 1.03575444, "epoch": 0.048938856490110025, "flos": 16362913213440.0, "grad_norm": 5.122319578914125, "language_loss": 0.73656595, "learning_rate": 3.996261931261454e-06, "loss": 0.76013154, "num_input_tokens_seen": 8728705, "step": 407, "time_per_iteration": 2.8939361572265625 }, { "auxiliary_loss_clip": 0.01302059, "auxiliary_loss_mlp": 0.01052444, "balance_loss_clip": 1.08068156, "balance_loss_mlp": 1.0382638, "epoch": 0.049059099380749115, "flos": 29895094379520.0, "grad_norm": 1.586259814957921, "language_loss": 0.86621428, "learning_rate": 3.996214175975987e-06, "loss": 0.8897593, "num_input_tokens_seen": 8749225, "step": 408, "time_per_iteration": 5.161776781082153 }, { "auxiliary_loss_clip": 0.01303533, "auxiliary_loss_mlp": 0.01056124, "balance_loss_clip": 1.08057737, "balance_loss_mlp": 1.04228401, "epoch": 0.049179342271388204, "flos": 35918858027520.0, "grad_norm": 3.6306888550725915, "language_loss": 0.79011929, "learning_rate": 3.996166117866417e-06, "loss": 0.81371588, "num_input_tokens_seen": 8771160, "step": 409, "time_per_iteration": 3.0325193405151367 }, { "auxiliary_loss_clip": 0.01288731, "auxiliary_loss_mlp": 0.0104603, "balance_loss_clip": 1.07148719, "balance_loss_mlp": 1.0325352, "epoch": 0.049299585162027294, "flos": 14611226659200.0, "grad_norm": 2.0496950111446086, "language_loss": 0.8697263, "learning_rate": 3.996117756940035e-06, "loss": 0.89307392, "num_input_tokens_seen": 8787845, "step": 410, "time_per_iteration": 2.7327380180358887 }, { "auxiliary_loss_clip": 0.01296973, "auxiliary_loss_mlp": 0.01053443, "balance_loss_clip": 1.07714891, "balance_loss_mlp": 1.03935218, "epoch": 0.049419828052666384, "flos": 19567939956480.0, "grad_norm": 2.1351797291073855, "language_loss": 0.97818112, "learning_rate": 3.996069093204175e-06, "loss": 1.00168526, "num_input_tokens_seen": 8803805, "step": 411, "time_per_iteration": 3.7758936882019043 }, { "auxiliary_loss_clip": 0.01308558, "auxiliary_loss_mlp": 0.0105332, "balance_loss_clip": 1.08481658, "balance_loss_mlp": 1.03947401, "epoch": 0.049540070943305474, "flos": 13659916907520.0, "grad_norm": 2.512332682149048, "language_loss": 0.87908065, "learning_rate": 3.996020126666221e-06, "loss": 0.90269947, "num_input_tokens_seen": 8820785, "step": 412, "time_per_iteration": 3.873279094696045 }, { "auxiliary_loss_clip": 0.01298896, "auxiliary_loss_mlp": 0.01043744, "balance_loss_clip": 1.07816243, "balance_loss_mlp": 1.03068507, "epoch": 0.04966031383394457, "flos": 21832035978240.0, "grad_norm": 2.091687826871724, "language_loss": 0.82158518, "learning_rate": 3.995970857333601e-06, "loss": 0.84501147, "num_input_tokens_seen": 8841195, "step": 413, "time_per_iteration": 2.7991294860839844 }, { "auxiliary_loss_clip": 0.01302259, "auxiliary_loss_mlp": 0.01051723, "balance_loss_clip": 1.07785738, "balance_loss_mlp": 1.03742957, "epoch": 0.04978055672458366, "flos": 28618793349120.0, "grad_norm": 1.8194217335854346, "language_loss": 0.79612589, "learning_rate": 3.995921285213789e-06, "loss": 0.81966567, "num_input_tokens_seen": 8861455, "step": 414, "time_per_iteration": 2.7611734867095947 }, { "auxiliary_loss_clip": 0.01296355, "auxiliary_loss_mlp": 0.01051025, "balance_loss_clip": 1.07649112, "balance_loss_mlp": 1.03767991, "epoch": 0.04990079961522275, "flos": 19828220883840.0, "grad_norm": 2.243621354964767, "language_loss": 0.80792272, "learning_rate": 3.995871410314305e-06, "loss": 0.83139646, "num_input_tokens_seen": 8880015, "step": 415, "time_per_iteration": 2.7792725563049316 }, { "auxiliary_loss_clip": 0.01184731, "auxiliary_loss_mlp": 0.01017295, "balance_loss_clip": 1.03869915, "balance_loss_mlp": 1.0128603, "epoch": 0.05002104250586184, "flos": 62735045293440.0, "grad_norm": 0.9115366448167603, "language_loss": 0.59614044, "learning_rate": 3.995821232642714e-06, "loss": 0.61816072, "num_input_tokens_seen": 8938420, "step": 416, "time_per_iteration": 3.4901039600372314 }, { "auxiliary_loss_clip": 0.0129072, "auxiliary_loss_mlp": 0.01056363, "balance_loss_clip": 1.07723343, "balance_loss_mlp": 1.04280877, "epoch": 0.05014128539650093, "flos": 27928518710400.0, "grad_norm": 4.802328717632212, "language_loss": 0.82445657, "learning_rate": 3.995770752206629e-06, "loss": 0.84792733, "num_input_tokens_seen": 8959495, "step": 417, "time_per_iteration": 2.9634697437286377 }, { "auxiliary_loss_clip": 0.01294991, "auxiliary_loss_mlp": 0.01049819, "balance_loss_clip": 1.07551813, "balance_loss_mlp": 1.03714705, "epoch": 0.05026152828714002, "flos": 17705576620800.0, "grad_norm": 2.2727190144510656, "language_loss": 0.97253728, "learning_rate": 3.995719969013709e-06, "loss": 0.99598545, "num_input_tokens_seen": 8976675, "step": 418, "time_per_iteration": 2.761638879776001 }, { "auxiliary_loss_clip": 0.01282509, "auxiliary_loss_mlp": 0.01041828, "balance_loss_clip": 1.07641768, "balance_loss_mlp": 1.02863717, "epoch": 0.05038177117777912, "flos": 19133277477120.0, "grad_norm": 3.8329038996266274, "language_loss": 0.85827351, "learning_rate": 3.995668883071655e-06, "loss": 0.88151681, "num_input_tokens_seen": 8992900, "step": 419, "time_per_iteration": 2.8916730880737305 }, { "auxiliary_loss_clip": 0.01297556, "auxiliary_loss_mlp": 0.01056998, "balance_loss_clip": 1.07784414, "balance_loss_mlp": 1.04289603, "epoch": 0.050502014068418206, "flos": 20667704618880.0, "grad_norm": 2.847522831118899, "language_loss": 0.91156572, "learning_rate": 3.995617494388219e-06, "loss": 0.93511128, "num_input_tokens_seen": 9011020, "step": 420, "time_per_iteration": 2.88614821434021 }, { "auxiliary_loss_clip": 0.01287008, "auxiliary_loss_mlp": 0.01047661, "balance_loss_clip": 1.07728767, "balance_loss_mlp": 1.03424954, "epoch": 0.050622256959057296, "flos": 21361103740800.0, "grad_norm": 2.5816821517707718, "language_loss": 0.804618, "learning_rate": 3.995565802971196e-06, "loss": 0.82796466, "num_input_tokens_seen": 9030995, "step": 421, "time_per_iteration": 2.8529446125030518 }, { "auxiliary_loss_clip": 0.0128183, "auxiliary_loss_mlp": 0.0105132, "balance_loss_clip": 1.07484365, "balance_loss_mlp": 1.03860044, "epoch": 0.050742499849696386, "flos": 27673588909440.0, "grad_norm": 2.3883139408672927, "language_loss": 0.67380327, "learning_rate": 3.995513808828427e-06, "loss": 0.69713473, "num_input_tokens_seen": 9053790, "step": 422, "time_per_iteration": 2.977619171142578 }, { "auxiliary_loss_clip": 0.01282747, "auxiliary_loss_mlp": 0.0104686, "balance_loss_clip": 1.07526684, "balance_loss_mlp": 1.03425932, "epoch": 0.050862742740335476, "flos": 19865999013120.0, "grad_norm": 2.045798984549256, "language_loss": 0.7666173, "learning_rate": 3.9954615119678e-06, "loss": 0.78991336, "num_input_tokens_seen": 9072345, "step": 423, "time_per_iteration": 2.8006865978240967 }, { "auxiliary_loss_clip": 0.01277, "auxiliary_loss_mlp": 0.01049006, "balance_loss_clip": 1.07639992, "balance_loss_mlp": 1.03629255, "epoch": 0.050982985630974566, "flos": 22085098272000.0, "grad_norm": 2.023273216404122, "language_loss": 0.80226803, "learning_rate": 3.995408912397248e-06, "loss": 0.82552809, "num_input_tokens_seen": 9090240, "step": 424, "time_per_iteration": 2.9227652549743652 }, { "auxiliary_loss_clip": 0.01282361, "auxiliary_loss_mlp": 0.01045562, "balance_loss_clip": 1.07575786, "balance_loss_mlp": 1.03218114, "epoch": 0.05110322852161366, "flos": 20740962407040.0, "grad_norm": 2.854585166892726, "language_loss": 0.93036836, "learning_rate": 3.99535601012475e-06, "loss": 0.95364767, "num_input_tokens_seen": 9105570, "step": 425, "time_per_iteration": 2.8514578342437744 }, { "auxiliary_loss_clip": 0.0127984, "auxiliary_loss_mlp": 0.01061482, "balance_loss_clip": 1.07946694, "balance_loss_mlp": 1.03896093, "epoch": 0.05122347141225275, "flos": 28547295327360.0, "grad_norm": 1.570904589534468, "language_loss": 0.75400615, "learning_rate": 3.995302805158333e-06, "loss": 0.77741939, "num_input_tokens_seen": 9128225, "step": 426, "time_per_iteration": 2.971506357192993 }, { "auxiliary_loss_clip": 0.01273389, "auxiliary_loss_mlp": 0.01048185, "balance_loss_clip": 1.07691014, "balance_loss_mlp": 1.03427875, "epoch": 0.05134371430289184, "flos": 19722679747200.0, "grad_norm": 2.371363917684427, "language_loss": 0.83830631, "learning_rate": 3.9952492975060665e-06, "loss": 0.86152208, "num_input_tokens_seen": 9148295, "step": 427, "time_per_iteration": 2.9415500164031982 }, { "auxiliary_loss_clip": 0.01285427, "auxiliary_loss_mlp": 0.01040775, "balance_loss_clip": 1.07313931, "balance_loss_mlp": 1.02667212, "epoch": 0.05146395719353093, "flos": 34458945649920.0, "grad_norm": 3.4156674427122136, "language_loss": 0.8507818, "learning_rate": 3.995195487176067e-06, "loss": 0.87404382, "num_input_tokens_seen": 9168525, "step": 428, "time_per_iteration": 2.953847885131836 }, { "auxiliary_loss_clip": 0.01293725, "auxiliary_loss_mlp": 0.01045405, "balance_loss_clip": 1.07624412, "balance_loss_mlp": 1.03188109, "epoch": 0.05158420008417002, "flos": 21760286561280.0, "grad_norm": 2.4079326625961697, "language_loss": 0.85720056, "learning_rate": 3.995141374176499e-06, "loss": 0.88059187, "num_input_tokens_seen": 9186920, "step": 429, "time_per_iteration": 2.968993902206421 }, { "auxiliary_loss_clip": 0.01180701, "auxiliary_loss_mlp": 0.01015291, "balance_loss_clip": 1.04124033, "balance_loss_mlp": 1.00256872, "epoch": 0.05170444297480911, "flos": 72553956226560.0, "grad_norm": 0.8729126046696631, "language_loss": 0.63106477, "learning_rate": 3.995086958515572e-06, "loss": 0.65302479, "num_input_tokens_seen": 9244940, "step": 430, "time_per_iteration": 3.3905580043792725 }, { "auxiliary_loss_clip": 0.01186246, "auxiliary_loss_mlp": 0.01015103, "balance_loss_clip": 1.04030573, "balance_loss_mlp": 1.00246036, "epoch": 0.05182468586544821, "flos": 62416159326720.0, "grad_norm": 0.85447791799146, "language_loss": 0.59933722, "learning_rate": 3.995032240201538e-06, "loss": 0.62135077, "num_input_tokens_seen": 9307335, "step": 431, "time_per_iteration": 3.313758611679077 }, { "auxiliary_loss_clip": 0.01181883, "auxiliary_loss_mlp": 0.01005071, "balance_loss_clip": 1.03862643, "balance_loss_mlp": 1.00025523, "epoch": 0.0519449287560873, "flos": 41225989432320.0, "grad_norm": 0.9433833055772698, "language_loss": 0.63086528, "learning_rate": 3.9949772192427e-06, "loss": 0.65273482, "num_input_tokens_seen": 9353960, "step": 432, "time_per_iteration": 3.072704315185547 }, { "auxiliary_loss_clip": 0.01284308, "auxiliary_loss_mlp": 0.01045167, "balance_loss_clip": 1.07598972, "balance_loss_mlp": 1.03166664, "epoch": 0.05206517164672639, "flos": 17494530261120.0, "grad_norm": 2.0337958563337892, "language_loss": 0.79346621, "learning_rate": 3.994921895647405e-06, "loss": 0.81676096, "num_input_tokens_seen": 9372130, "step": 433, "time_per_iteration": 2.9463021755218506 }, { "auxiliary_loss_clip": 0.01179077, "auxiliary_loss_mlp": 0.01004946, "balance_loss_clip": 1.03513575, "balance_loss_mlp": 1.00017726, "epoch": 0.05218541453736548, "flos": 64002762973440.0, "grad_norm": 0.842178315987628, "language_loss": 0.55348969, "learning_rate": 3.994866269424043e-06, "loss": 0.57533002, "num_input_tokens_seen": 9428500, "step": 434, "time_per_iteration": 5.542250871658325 }, { "auxiliary_loss_clip": 0.01269121, "auxiliary_loss_mlp": 0.01045337, "balance_loss_clip": 1.072788, "balance_loss_mlp": 1.03282619, "epoch": 0.05230565742800457, "flos": 19317319787520.0, "grad_norm": 3.667061578547234, "language_loss": 0.78519797, "learning_rate": 3.9948103405810545e-06, "loss": 0.80834258, "num_input_tokens_seen": 9447450, "step": 435, "time_per_iteration": 3.0180938243865967 }, { "auxiliary_loss_clip": 0.0126197, "auxiliary_loss_mlp": 0.01044698, "balance_loss_clip": 1.07630062, "balance_loss_mlp": 1.03259242, "epoch": 0.05242590031864366, "flos": 25298636538240.0, "grad_norm": 2.025656729287552, "language_loss": 0.86029404, "learning_rate": 3.994754109126923e-06, "loss": 0.88336074, "num_input_tokens_seen": 9468945, "step": 436, "time_per_iteration": 2.9943599700927734 }, { "auxiliary_loss_clip": 0.01254053, "auxiliary_loss_mlp": 0.01036244, "balance_loss_clip": 1.07309246, "balance_loss_mlp": 1.02345848, "epoch": 0.052546143209282754, "flos": 26211629456640.0, "grad_norm": 1.7309721001485203, "language_loss": 0.93638539, "learning_rate": 3.994697575070181e-06, "loss": 0.95928836, "num_input_tokens_seen": 9488405, "step": 437, "time_per_iteration": 3.8847267627716064 }, { "auxiliary_loss_clip": 0.0128389, "auxiliary_loss_mlp": 0.01056987, "balance_loss_clip": 1.07898688, "balance_loss_mlp": 1.04289031, "epoch": 0.052666386099921844, "flos": 22158140578560.0, "grad_norm": 2.332084684994859, "language_loss": 0.91468692, "learning_rate": 3.994640738419402e-06, "loss": 0.93809569, "num_input_tokens_seen": 9507780, "step": 438, "time_per_iteration": 3.845479965209961 }, { "auxiliary_loss_clip": 0.01285579, "auxiliary_loss_mlp": 0.010441, "balance_loss_clip": 1.07374382, "balance_loss_mlp": 1.0308975, "epoch": 0.052786628990560934, "flos": 23881817502720.0, "grad_norm": 3.6835796678058608, "language_loss": 0.80943674, "learning_rate": 3.9945835991832075e-06, "loss": 0.83273351, "num_input_tokens_seen": 9529665, "step": 439, "time_per_iteration": 2.7989253997802734 }, { "auxiliary_loss_clip": 0.01295041, "auxiliary_loss_mlp": 0.01052089, "balance_loss_clip": 1.07768428, "balance_loss_mlp": 1.03896976, "epoch": 0.052906871881200024, "flos": 24605021934720.0, "grad_norm": 2.1807440156864892, "language_loss": 0.92973799, "learning_rate": 3.994526157370268e-06, "loss": 0.95320934, "num_input_tokens_seen": 9548280, "step": 440, "time_per_iteration": 2.7923929691314697 }, { "auxiliary_loss_clip": 0.01169594, "auxiliary_loss_mlp": 0.01017146, "balance_loss_clip": 1.02766848, "balance_loss_mlp": 1.01242566, "epoch": 0.053027114771839114, "flos": 56461631143680.0, "grad_norm": 0.903587752900887, "language_loss": 0.59223306, "learning_rate": 3.994468412989296e-06, "loss": 0.61410046, "num_input_tokens_seen": 9609690, "step": 441, "time_per_iteration": 3.4616425037384033 }, { "auxiliary_loss_clip": 0.01261166, "auxiliary_loss_mlp": 0.01050212, "balance_loss_clip": 1.0764873, "balance_loss_mlp": 1.03703904, "epoch": 0.053147357662478203, "flos": 17311098481920.0, "grad_norm": 2.337105568860892, "language_loss": 0.92836022, "learning_rate": 3.994410366049052e-06, "loss": 0.95147395, "num_input_tokens_seen": 9627550, "step": 442, "time_per_iteration": 2.8363471031188965 }, { "auxiliary_loss_clip": 0.01283571, "auxiliary_loss_mlp": 0.01042954, "balance_loss_clip": 1.07374048, "balance_loss_mlp": 1.02961993, "epoch": 0.0532676005531173, "flos": 17164977955200.0, "grad_norm": 2.483161503212314, "language_loss": 0.83006644, "learning_rate": 3.994352016558341e-06, "loss": 0.85333174, "num_input_tokens_seen": 9644855, "step": 443, "time_per_iteration": 2.7534854412078857 }, { "auxiliary_loss_clip": 0.0128325, "auxiliary_loss_mlp": 0.01045641, "balance_loss_clip": 1.07719016, "balance_loss_mlp": 1.03193843, "epoch": 0.05338784344375639, "flos": 27819960831360.0, "grad_norm": 2.1432096075907787, "language_loss": 0.74184847, "learning_rate": 3.994293364526014e-06, "loss": 0.76513731, "num_input_tokens_seen": 9665740, "step": 444, "time_per_iteration": 2.8304717540740967 }, { "auxiliary_loss_clip": 0.01273031, "auxiliary_loss_mlp": 0.01045136, "balance_loss_clip": 1.07936335, "balance_loss_mlp": 1.03139079, "epoch": 0.05350808633439548, "flos": 21507691144320.0, "grad_norm": 1.9806353700948571, "language_loss": 0.8508206, "learning_rate": 3.99423440996097e-06, "loss": 0.87400222, "num_input_tokens_seen": 9685280, "step": 445, "time_per_iteration": 2.8157567977905273 }, { "auxiliary_loss_clip": 0.01284015, "auxiliary_loss_mlp": 0.01044649, "balance_loss_clip": 1.08024013, "balance_loss_mlp": 1.03139853, "epoch": 0.05362832922503457, "flos": 20084299920000.0, "grad_norm": 2.4179906725573277, "language_loss": 0.81244254, "learning_rate": 3.994175152872152e-06, "loss": 0.83572918, "num_input_tokens_seen": 9704365, "step": 446, "time_per_iteration": 2.836310386657715 }, { "auxiliary_loss_clip": 0.01289135, "auxiliary_loss_mlp": 0.01047866, "balance_loss_clip": 1.07670403, "balance_loss_mlp": 1.03424668, "epoch": 0.05374857211567366, "flos": 26137222433280.0, "grad_norm": 2.78364435907109, "language_loss": 0.78677058, "learning_rate": 3.994115593268548e-06, "loss": 0.81014055, "num_input_tokens_seen": 9724145, "step": 447, "time_per_iteration": 2.9522228240966797 }, { "auxiliary_loss_clip": 0.01291576, "auxiliary_loss_mlp": 0.01047228, "balance_loss_clip": 1.07665491, "balance_loss_mlp": 1.03381681, "epoch": 0.05386881500631275, "flos": 27486817165440.0, "grad_norm": 9.654792852768352, "language_loss": 0.82280076, "learning_rate": 3.994055731159195e-06, "loss": 0.84618878, "num_input_tokens_seen": 9741615, "step": 448, "time_per_iteration": 2.844768524169922 }, { "auxiliary_loss_clip": 0.01289164, "auxiliary_loss_mlp": 0.01049568, "balance_loss_clip": 1.07854044, "balance_loss_mlp": 1.03610361, "epoch": 0.053989057896951846, "flos": 23585087249280.0, "grad_norm": 1.8323773688737173, "language_loss": 0.86986673, "learning_rate": 3.993995566553172e-06, "loss": 0.89325404, "num_input_tokens_seen": 9760580, "step": 449, "time_per_iteration": 2.781611442565918 }, { "auxiliary_loss_clip": 0.01269565, "auxiliary_loss_mlp": 0.01048285, "balance_loss_clip": 1.07308722, "balance_loss_mlp": 1.03429008, "epoch": 0.054109300787590936, "flos": 25228862369280.0, "grad_norm": 1.561264804094607, "language_loss": 0.77172923, "learning_rate": 3.993935099459607e-06, "loss": 0.79490775, "num_input_tokens_seen": 9782195, "step": 450, "time_per_iteration": 2.803083658218384 }, { "auxiliary_loss_clip": 0.01281771, "auxiliary_loss_mlp": 0.01048474, "balance_loss_clip": 1.07091355, "balance_loss_mlp": 1.0350455, "epoch": 0.054229543678230026, "flos": 23841525421440.0, "grad_norm": 2.0067991763970925, "language_loss": 0.74280643, "learning_rate": 3.993874329887673e-06, "loss": 0.76610887, "num_input_tokens_seen": 9800850, "step": 451, "time_per_iteration": 2.845245599746704 }, { "auxiliary_loss_clip": 0.01282467, "auxiliary_loss_mlp": 0.01045998, "balance_loss_clip": 1.07621503, "balance_loss_mlp": 1.03316498, "epoch": 0.054349786568869116, "flos": 16320933192960.0, "grad_norm": 2.39943004428792, "language_loss": 0.86286151, "learning_rate": 3.993813257846589e-06, "loss": 0.88614619, "num_input_tokens_seen": 9817605, "step": 452, "time_per_iteration": 2.7455508708953857 }, { "auxiliary_loss_clip": 0.01294503, "auxiliary_loss_mlp": 0.01040679, "balance_loss_clip": 1.08103633, "balance_loss_mlp": 1.02721477, "epoch": 0.054470029459508205, "flos": 18660729127680.0, "grad_norm": 2.4172206755635144, "language_loss": 0.93047959, "learning_rate": 3.993751883345619e-06, "loss": 0.95383143, "num_input_tokens_seen": 9835965, "step": 453, "time_per_iteration": 2.7433619499206543 }, { "auxiliary_loss_clip": 0.0128023, "auxiliary_loss_mlp": 0.01049159, "balance_loss_clip": 1.07950258, "balance_loss_mlp": 1.03558135, "epoch": 0.054590272350147295, "flos": 17785298856960.0, "grad_norm": 3.1707066849093235, "language_loss": 0.87625813, "learning_rate": 3.993690206394073e-06, "loss": 0.89955199, "num_input_tokens_seen": 9852265, "step": 454, "time_per_iteration": 2.7892346382141113 }, { "auxiliary_loss_clip": 0.01293064, "auxiliary_loss_mlp": 0.01057174, "balance_loss_clip": 1.08110666, "balance_loss_mlp": 1.04366755, "epoch": 0.054710515240786385, "flos": 17785945301760.0, "grad_norm": 2.563834071383153, "language_loss": 0.87911665, "learning_rate": 3.993628227001307e-06, "loss": 0.902619, "num_input_tokens_seen": 9870465, "step": 455, "time_per_iteration": 2.8813493251800537 }, { "auxiliary_loss_clip": 0.01281786, "auxiliary_loss_mlp": 0.01048871, "balance_loss_clip": 1.07593155, "balance_loss_mlp": 1.03514433, "epoch": 0.05483075813142548, "flos": 48210900180480.0, "grad_norm": 2.417912268100088, "language_loss": 0.7121706, "learning_rate": 3.993565945176726e-06, "loss": 0.73547721, "num_input_tokens_seen": 9891490, "step": 456, "time_per_iteration": 3.0635104179382324 }, { "auxiliary_loss_clip": 0.0127902, "auxiliary_loss_mlp": 0.01052576, "balance_loss_clip": 1.07636213, "balance_loss_mlp": 1.03945732, "epoch": 0.05495100102206457, "flos": 19682244011520.0, "grad_norm": 3.2271326177276496, "language_loss": 0.83991742, "learning_rate": 3.993503360929776e-06, "loss": 0.86323333, "num_input_tokens_seen": 9910375, "step": 457, "time_per_iteration": 2.8587257862091064 }, { "auxiliary_loss_clip": 0.01270416, "auxiliary_loss_mlp": 0.01052655, "balance_loss_clip": 1.08045435, "balance_loss_mlp": 1.03920805, "epoch": 0.05507124391270366, "flos": 26360048453760.0, "grad_norm": 1.7701676503306152, "language_loss": 0.81192446, "learning_rate": 3.99344047426995e-06, "loss": 0.83515525, "num_input_tokens_seen": 9931635, "step": 458, "time_per_iteration": 2.9706811904907227 }, { "auxiliary_loss_clip": 0.01278491, "auxiliary_loss_mlp": 0.01050402, "balance_loss_clip": 1.0793246, "balance_loss_mlp": 1.03632343, "epoch": 0.05519148680334275, "flos": 22601314581120.0, "grad_norm": 2.114773629220582, "language_loss": 0.93777287, "learning_rate": 3.993377285206789e-06, "loss": 0.96106184, "num_input_tokens_seen": 9951420, "step": 459, "time_per_iteration": 2.8975417613983154 }, { "auxiliary_loss_clip": 0.01259724, "auxiliary_loss_mlp": 0.01049274, "balance_loss_clip": 1.07939756, "balance_loss_mlp": 1.03575563, "epoch": 0.05531172969398184, "flos": 40552519380480.0, "grad_norm": 1.8220020599072873, "language_loss": 0.86557794, "learning_rate": 3.99331379374988e-06, "loss": 0.88866788, "num_input_tokens_seen": 9975025, "step": 460, "time_per_iteration": 4.096879720687866 }, { "auxiliary_loss_clip": 0.01291065, "auxiliary_loss_mlp": 0.01047091, "balance_loss_clip": 1.07656312, "balance_loss_mlp": 1.03338778, "epoch": 0.05543197258462093, "flos": 23477894087040.0, "grad_norm": 2.013653386135781, "language_loss": 0.8003301, "learning_rate": 3.993249999908852e-06, "loss": 0.82371163, "num_input_tokens_seen": 9995175, "step": 461, "time_per_iteration": 3.7998440265655518 }, { "auxiliary_loss_clip": 0.01292077, "auxiliary_loss_mlp": 0.01055818, "balance_loss_clip": 1.07481289, "balance_loss_mlp": 1.04213846, "epoch": 0.05555221547526003, "flos": 18624603024000.0, "grad_norm": 2.502608028015049, "language_loss": 0.87372392, "learning_rate": 3.993185903693384e-06, "loss": 0.89720285, "num_input_tokens_seen": 10011975, "step": 462, "time_per_iteration": 2.7262966632843018 }, { "auxiliary_loss_clip": 0.01282566, "auxiliary_loss_mlp": 0.01046466, "balance_loss_clip": 1.07491016, "balance_loss_mlp": 1.03353775, "epoch": 0.05567245836589912, "flos": 23587098410880.0, "grad_norm": 1.942860491954527, "language_loss": 0.82435971, "learning_rate": 3.9931215051131995e-06, "loss": 0.84765005, "num_input_tokens_seen": 10032620, "step": 463, "time_per_iteration": 2.8049662113189697 }, { "auxiliary_loss_clip": 0.01291565, "auxiliary_loss_mlp": 0.01040751, "balance_loss_clip": 1.0777626, "balance_loss_mlp": 1.02795935, "epoch": 0.05579270125653821, "flos": 27746667129600.0, "grad_norm": 1.766640943759285, "language_loss": 0.80112183, "learning_rate": 3.993056804178068e-06, "loss": 0.82444507, "num_input_tokens_seen": 10054165, "step": 464, "time_per_iteration": 4.652029752731323 }, { "auxiliary_loss_clip": 0.01268578, "auxiliary_loss_mlp": 0.01044384, "balance_loss_clip": 1.0758009, "balance_loss_mlp": 1.0307641, "epoch": 0.0559129441471773, "flos": 27014161075200.0, "grad_norm": 2.418055459874594, "language_loss": 0.84377366, "learning_rate": 3.992991800897803e-06, "loss": 0.86690331, "num_input_tokens_seen": 10073970, "step": 465, "time_per_iteration": 2.8146896362304688 }, { "auxiliary_loss_clip": 0.01290292, "auxiliary_loss_mlp": 0.01048426, "balance_loss_clip": 1.07512116, "balance_loss_mlp": 1.03535461, "epoch": 0.05603318703781639, "flos": 15229787794560.0, "grad_norm": 2.6987114870362667, "language_loss": 0.89566338, "learning_rate": 3.9929264952822665e-06, "loss": 0.91905046, "num_input_tokens_seen": 10091505, "step": 466, "time_per_iteration": 2.7310237884521484 }, { "auxiliary_loss_clip": 0.01293654, "auxiliary_loss_mlp": 0.01035677, "balance_loss_clip": 1.07738733, "balance_loss_mlp": 1.02178884, "epoch": 0.05615342992845548, "flos": 22266482976000.0, "grad_norm": 2.3169762550390645, "language_loss": 0.88299274, "learning_rate": 3.992860887341366e-06, "loss": 0.90628606, "num_input_tokens_seen": 10109675, "step": 467, "time_per_iteration": 2.7390365600585938 }, { "auxiliary_loss_clip": 0.01270895, "auxiliary_loss_mlp": 0.01043102, "balance_loss_clip": 1.07872748, "balance_loss_mlp": 1.02914262, "epoch": 0.056273672819094574, "flos": 23584979508480.0, "grad_norm": 2.1534177940718364, "language_loss": 0.81444144, "learning_rate": 3.992794977085052e-06, "loss": 0.8375814, "num_input_tokens_seen": 10127675, "step": 468, "time_per_iteration": 2.850968599319458 }, { "auxiliary_loss_clip": 0.01278699, "auxiliary_loss_mlp": 0.01049569, "balance_loss_clip": 1.07827377, "balance_loss_mlp": 1.03625298, "epoch": 0.056393915709733664, "flos": 19858708552320.0, "grad_norm": 2.551444766310024, "language_loss": 0.84759319, "learning_rate": 3.992728764523326e-06, "loss": 0.8708759, "num_input_tokens_seen": 10146620, "step": 469, "time_per_iteration": 2.8515307903289795 }, { "auxiliary_loss_clip": 0.01273862, "auxiliary_loss_mlp": 0.01041181, "balance_loss_clip": 1.07338119, "balance_loss_mlp": 1.02891457, "epoch": 0.05651415860037275, "flos": 22163779013760.0, "grad_norm": 2.5108009927667494, "language_loss": 0.80899668, "learning_rate": 3.99266224966623e-06, "loss": 0.83214712, "num_input_tokens_seen": 10167535, "step": 470, "time_per_iteration": 2.7353873252868652 }, { "auxiliary_loss_clip": 0.01269117, "auxiliary_loss_mlp": 0.01047207, "balance_loss_clip": 1.0764401, "balance_loss_mlp": 1.03381944, "epoch": 0.05663440149101184, "flos": 19463548055040.0, "grad_norm": 2.3085184322487646, "language_loss": 0.879632, "learning_rate": 3.992595432523855e-06, "loss": 0.9027952, "num_input_tokens_seen": 10184825, "step": 471, "time_per_iteration": 2.6990489959716797 }, { "auxiliary_loss_clip": 0.01267072, "auxiliary_loss_mlp": 0.01048596, "balance_loss_clip": 1.07498145, "balance_loss_mlp": 1.03529859, "epoch": 0.05675464438165093, "flos": 22670226823680.0, "grad_norm": 5.1396281623332705, "language_loss": 0.85993522, "learning_rate": 3.992528313106338e-06, "loss": 0.88309193, "num_input_tokens_seen": 10203025, "step": 472, "time_per_iteration": 2.767595052719116 }, { "auxiliary_loss_clip": 0.01292574, "auxiliary_loss_mlp": 0.01057707, "balance_loss_clip": 1.07828331, "balance_loss_mlp": 1.03476477, "epoch": 0.05687488727229002, "flos": 16901177495040.0, "grad_norm": 3.7897232162364523, "language_loss": 0.82566273, "learning_rate": 3.9924608914238595e-06, "loss": 0.84916556, "num_input_tokens_seen": 10218020, "step": 473, "time_per_iteration": 2.911524772644043 }, { "auxiliary_loss_clip": 0.01289904, "auxiliary_loss_mlp": 0.01046153, "balance_loss_clip": 1.0770179, "balance_loss_mlp": 1.03271794, "epoch": 0.05699513016292912, "flos": 29168980945920.0, "grad_norm": 2.747455547397018, "language_loss": 0.84159631, "learning_rate": 3.992393167486648e-06, "loss": 0.86495686, "num_input_tokens_seen": 10237170, "step": 474, "time_per_iteration": 2.8158504962921143 }, { "auxiliary_loss_clip": 0.01291538, "auxiliary_loss_mlp": 0.01051281, "balance_loss_clip": 1.07699728, "balance_loss_mlp": 1.03800094, "epoch": 0.05711537305356821, "flos": 18916197632640.0, "grad_norm": 2.5564350720468956, "language_loss": 0.80813682, "learning_rate": 3.992325141304977e-06, "loss": 0.83156508, "num_input_tokens_seen": 10255125, "step": 475, "time_per_iteration": 2.7392184734344482 }, { "auxiliary_loss_clip": 0.01259575, "auxiliary_loss_mlp": 0.01049524, "balance_loss_clip": 1.07414007, "balance_loss_mlp": 1.03577876, "epoch": 0.0572356159442073, "flos": 26758979879040.0, "grad_norm": 2.132023645106107, "language_loss": 0.86852658, "learning_rate": 3.992256812889166e-06, "loss": 0.89161754, "num_input_tokens_seen": 10271230, "step": 476, "time_per_iteration": 2.8316292762756348 }, { "auxiliary_loss_clip": 0.01297917, "auxiliary_loss_mlp": 0.01037636, "balance_loss_clip": 1.08129478, "balance_loss_mlp": 1.02489281, "epoch": 0.05735585883484639, "flos": 35116146840960.0, "grad_norm": 2.794491603620876, "language_loss": 0.7674644, "learning_rate": 3.992188182249582e-06, "loss": 0.79081994, "num_input_tokens_seen": 10293125, "step": 477, "time_per_iteration": 2.803776502609253 }, { "auxiliary_loss_clip": 0.01280413, "auxiliary_loss_mlp": 0.01047331, "balance_loss_clip": 1.07744789, "balance_loss_mlp": 1.03323436, "epoch": 0.05747610172548548, "flos": 18734381965440.0, "grad_norm": 2.582721777192643, "language_loss": 0.90837395, "learning_rate": 3.992119249396633e-06, "loss": 0.93165147, "num_input_tokens_seen": 10311810, "step": 478, "time_per_iteration": 2.823617935180664 }, { "auxiliary_loss_clip": 0.01271725, "auxiliary_loss_mlp": 0.01062368, "balance_loss_clip": 1.07479262, "balance_loss_mlp": 1.03991032, "epoch": 0.05759634461612457, "flos": 27964752554880.0, "grad_norm": 2.085398191105161, "language_loss": 0.82238293, "learning_rate": 3.992050014340778e-06, "loss": 0.84572387, "num_input_tokens_seen": 10332165, "step": 479, "time_per_iteration": 2.8133389949798584 }, { "auxiliary_loss_clip": 0.01164719, "auxiliary_loss_mlp": 0.01014072, "balance_loss_clip": 1.03273964, "balance_loss_mlp": 1.00954175, "epoch": 0.057716587506763666, "flos": 69292009405440.0, "grad_norm": 0.8589191516968532, "language_loss": 0.54998714, "learning_rate": 3.99198047709252e-06, "loss": 0.57177502, "num_input_tokens_seen": 10393685, "step": 480, "time_per_iteration": 3.3588619232177734 }, { "auxiliary_loss_clip": 0.01276159, "auxiliary_loss_mlp": 0.01054557, "balance_loss_clip": 1.07568407, "balance_loss_mlp": 1.04105079, "epoch": 0.057836830397402755, "flos": 25009196745600.0, "grad_norm": 2.8083231430739612, "language_loss": 0.78685468, "learning_rate": 3.991910637662408e-06, "loss": 0.81016183, "num_input_tokens_seen": 10413975, "step": 481, "time_per_iteration": 2.8151957988739014 }, { "auxiliary_loss_clip": 0.01286103, "auxiliary_loss_mlp": 0.01053047, "balance_loss_clip": 1.07479119, "balance_loss_mlp": 1.04021454, "epoch": 0.057957073288041845, "flos": 25593894334080.0, "grad_norm": 1.9634035210462197, "language_loss": 0.81097412, "learning_rate": 3.9918404960610355e-06, "loss": 0.83436561, "num_input_tokens_seen": 10433005, "step": 482, "time_per_iteration": 2.80846905708313 }, { "auxiliary_loss_clip": 0.01289542, "auxiliary_loss_mlp": 0.01047864, "balance_loss_clip": 1.0760448, "balance_loss_mlp": 1.03422618, "epoch": 0.058077316178680935, "flos": 20777411733120.0, "grad_norm": 2.4691162044090467, "language_loss": 0.77841365, "learning_rate": 3.991770052299043e-06, "loss": 0.80178773, "num_input_tokens_seen": 10451235, "step": 483, "time_per_iteration": 2.7877249717712402 }, { "auxiliary_loss_clip": 0.01278008, "auxiliary_loss_mlp": 0.01044109, "balance_loss_clip": 1.07271528, "balance_loss_mlp": 1.03068626, "epoch": 0.058197559069320025, "flos": 18916484941440.0, "grad_norm": 2.394428604895375, "language_loss": 0.87357432, "learning_rate": 3.991699306387118e-06, "loss": 0.89679551, "num_input_tokens_seen": 10469705, "step": 484, "time_per_iteration": 2.7989490032196045 }, { "auxiliary_loss_clip": 0.01288275, "auxiliary_loss_mlp": 0.0104278, "balance_loss_clip": 1.07676053, "balance_loss_mlp": 1.02893364, "epoch": 0.058317801959959115, "flos": 24863327614080.0, "grad_norm": 1.878419299475565, "language_loss": 0.78235173, "learning_rate": 3.991628258335991e-06, "loss": 0.80566227, "num_input_tokens_seen": 10491910, "step": 485, "time_per_iteration": 2.800325632095337 }, { "auxiliary_loss_clip": 0.01267109, "auxiliary_loss_mlp": 0.0104649, "balance_loss_clip": 1.07331622, "balance_loss_mlp": 1.03310299, "epoch": 0.05843804485059821, "flos": 23257977068160.0, "grad_norm": 3.1438162843776114, "language_loss": 0.87596887, "learning_rate": 3.991556908156442e-06, "loss": 0.89910483, "num_input_tokens_seen": 10508435, "step": 486, "time_per_iteration": 5.064784288406372 }, { "auxiliary_loss_clip": 0.01279883, "auxiliary_loss_mlp": 0.01041693, "balance_loss_clip": 1.07292426, "balance_loss_mlp": 1.02770996, "epoch": 0.0585582877412373, "flos": 23150532510720.0, "grad_norm": 3.0446143568018966, "language_loss": 0.88017356, "learning_rate": 3.9914852558592914e-06, "loss": 0.90338933, "num_input_tokens_seen": 10529485, "step": 487, "time_per_iteration": 2.8409171104431152 }, { "auxiliary_loss_clip": 0.01282959, "auxiliary_loss_mlp": 0.01048093, "balance_loss_clip": 1.07622838, "balance_loss_mlp": 1.03542089, "epoch": 0.05867853063187639, "flos": 23506406507520.0, "grad_norm": 3.7256891258085134, "language_loss": 0.80576217, "learning_rate": 3.991413301455413e-06, "loss": 0.82907271, "num_input_tokens_seen": 10545935, "step": 488, "time_per_iteration": 2.9195823669433594 }, { "auxiliary_loss_clip": 0.01261418, "auxiliary_loss_mlp": 0.01050626, "balance_loss_clip": 1.07097793, "balance_loss_mlp": 1.03734636, "epoch": 0.05879877352251548, "flos": 29495803818240.0, "grad_norm": 2.913278581639044, "language_loss": 0.78378868, "learning_rate": 3.991341044955719e-06, "loss": 0.80690908, "num_input_tokens_seen": 10565690, "step": 489, "time_per_iteration": 3.8371357917785645 }, { "auxiliary_loss_clip": 0.01282114, "auxiliary_loss_mlp": 0.01056325, "balance_loss_clip": 1.07276654, "balance_loss_mlp": 1.03198004, "epoch": 0.05891901641315457, "flos": 20157485880960.0, "grad_norm": 2.475007682441161, "language_loss": 0.81572533, "learning_rate": 3.991268486371172e-06, "loss": 0.83910966, "num_input_tokens_seen": 10584245, "step": 490, "time_per_iteration": 3.806298017501831 }, { "auxiliary_loss_clip": 0.01284695, "auxiliary_loss_mlp": 0.01045682, "balance_loss_clip": 1.07605314, "balance_loss_mlp": 1.03184736, "epoch": 0.05903925930379366, "flos": 24644200694400.0, "grad_norm": 9.377971979774832, "language_loss": 0.87679964, "learning_rate": 3.991195625712779e-06, "loss": 0.90010339, "num_input_tokens_seen": 10601210, "step": 491, "time_per_iteration": 2.87563157081604 }, { "auxiliary_loss_clip": 0.01288764, "auxiliary_loss_mlp": 0.0104762, "balance_loss_clip": 1.07574904, "balance_loss_mlp": 1.03482878, "epoch": 0.05915950219443276, "flos": 21250391045760.0, "grad_norm": 1.9876897942013674, "language_loss": 0.81437528, "learning_rate": 3.991122462991592e-06, "loss": 0.83773911, "num_input_tokens_seen": 10620730, "step": 492, "time_per_iteration": 2.922931671142578 }, { "auxiliary_loss_clip": 0.01295967, "auxiliary_loss_mlp": 0.01045486, "balance_loss_clip": 1.07811296, "balance_loss_mlp": 1.0314672, "epoch": 0.05927974508507185, "flos": 9902727319680.0, "grad_norm": 3.7955962728610784, "language_loss": 0.80850244, "learning_rate": 3.991048998218712e-06, "loss": 0.83191693, "num_input_tokens_seen": 10634035, "step": 493, "time_per_iteration": 2.775310516357422 }, { "auxiliary_loss_clip": 0.01287696, "auxiliary_loss_mlp": 0.01044395, "balance_loss_clip": 1.07698548, "balance_loss_mlp": 1.03072739, "epoch": 0.05939998797571094, "flos": 18259499232000.0, "grad_norm": 3.559043032173019, "language_loss": 0.76273644, "learning_rate": 3.990975231405281e-06, "loss": 0.78605735, "num_input_tokens_seen": 10652485, "step": 494, "time_per_iteration": 2.8344459533691406 }, { "auxiliary_loss_clip": 0.01277658, "auxiliary_loss_mlp": 0.01057364, "balance_loss_clip": 1.07472014, "balance_loss_mlp": 1.04402494, "epoch": 0.05952023086635003, "flos": 28256598558720.0, "grad_norm": 1.8530972993440211, "language_loss": 0.79016936, "learning_rate": 3.990901162562491e-06, "loss": 0.81351954, "num_input_tokens_seen": 10673175, "step": 495, "time_per_iteration": 2.8343605995178223 }, { "auxiliary_loss_clip": 0.01266142, "auxiliary_loss_mlp": 0.01065018, "balance_loss_clip": 1.07066298, "balance_loss_mlp": 1.04235601, "epoch": 0.05964047375698912, "flos": 14902498045440.0, "grad_norm": 2.011680911586998, "language_loss": 0.90563387, "learning_rate": 3.9908267917015765e-06, "loss": 0.92894554, "num_input_tokens_seen": 10691235, "step": 496, "time_per_iteration": 2.8833651542663574 }, { "auxiliary_loss_clip": 0.01270716, "auxiliary_loss_mlp": 0.01045346, "balance_loss_clip": 1.07238197, "balance_loss_mlp": 1.0320785, "epoch": 0.059760716647628206, "flos": 23185581206400.0, "grad_norm": 2.149929961437535, "language_loss": 0.92867017, "learning_rate": 3.990752118833821e-06, "loss": 0.95183074, "num_input_tokens_seen": 10708675, "step": 497, "time_per_iteration": 2.813584566116333 }, { "auxiliary_loss_clip": 0.01288505, "auxiliary_loss_mlp": 0.0104571, "balance_loss_clip": 1.07689667, "balance_loss_mlp": 1.03265619, "epoch": 0.0598809595382673, "flos": 22746968231040.0, "grad_norm": 2.641231321202283, "language_loss": 0.7786603, "learning_rate": 3.990677143970553e-06, "loss": 0.80200237, "num_input_tokens_seen": 10729485, "step": 498, "time_per_iteration": 2.845771074295044 }, { "auxiliary_loss_clip": 0.01268948, "auxiliary_loss_mlp": 0.01041627, "balance_loss_clip": 1.07601476, "balance_loss_mlp": 1.02836549, "epoch": 0.06000120242890639, "flos": 22127221946880.0, "grad_norm": 5.839572360709669, "language_loss": 0.81364918, "learning_rate": 3.990601867123144e-06, "loss": 0.83675498, "num_input_tokens_seen": 10749210, "step": 499, "time_per_iteration": 2.8201346397399902 }, { "auxiliary_loss_clip": 0.01264452, "auxiliary_loss_mlp": 0.0104622, "balance_loss_clip": 1.07477999, "balance_loss_mlp": 1.03252268, "epoch": 0.06012144531954548, "flos": 19171773878400.0, "grad_norm": 2.5130352403631444, "language_loss": 0.85207492, "learning_rate": 3.990526288303014e-06, "loss": 0.87518167, "num_input_tokens_seen": 10768000, "step": 500, "time_per_iteration": 2.908595085144043 }, { "auxiliary_loss_clip": 0.01270584, "auxiliary_loss_mlp": 0.01059834, "balance_loss_clip": 1.07213974, "balance_loss_mlp": 1.03680396, "epoch": 0.06024168821018457, "flos": 22783345729920.0, "grad_norm": 2.8462348822967276, "language_loss": 0.90761054, "learning_rate": 3.9904504075216295e-06, "loss": 0.93091464, "num_input_tokens_seen": 10788760, "step": 501, "time_per_iteration": 2.8967933654785156 }, { "auxiliary_loss_clip": 0.01273492, "auxiliary_loss_mlp": 0.01045578, "balance_loss_clip": 1.07325375, "balance_loss_mlp": 1.03182685, "epoch": 0.06036193110082366, "flos": 18770687637120.0, "grad_norm": 3.5309074895250676, "language_loss": 0.9401691, "learning_rate": 3.990374224790501e-06, "loss": 0.96335977, "num_input_tokens_seen": 10806965, "step": 502, "time_per_iteration": 2.8185999393463135 }, { "auxiliary_loss_clip": 0.01275085, "auxiliary_loss_mlp": 0.01060401, "balance_loss_clip": 1.0740639, "balance_loss_mlp": 1.0462923, "epoch": 0.06048217399146275, "flos": 17201570935680.0, "grad_norm": 2.0810247957065426, "language_loss": 0.70946568, "learning_rate": 3.990297740121185e-06, "loss": 0.73282051, "num_input_tokens_seen": 10824900, "step": 503, "time_per_iteration": 2.8041961193084717 }, { "auxiliary_loss_clip": 0.01283494, "auxiliary_loss_mlp": 0.01067059, "balance_loss_clip": 1.0742892, "balance_loss_mlp": 1.04299605, "epoch": 0.06060241688210185, "flos": 24024131187840.0, "grad_norm": 2.8739960818136945, "language_loss": 0.78128266, "learning_rate": 3.990220953525284e-06, "loss": 0.80478823, "num_input_tokens_seen": 10842010, "step": 504, "time_per_iteration": 2.7247040271759033 }, { "auxiliary_loss_clip": 0.01266549, "auxiliary_loss_mlp": 0.01042955, "balance_loss_clip": 1.07467794, "balance_loss_mlp": 1.02906692, "epoch": 0.06072265977274094, "flos": 14611190745600.0, "grad_norm": 2.6506530977396245, "language_loss": 0.74246174, "learning_rate": 3.9901438650144465e-06, "loss": 0.76555675, "num_input_tokens_seen": 10858260, "step": 505, "time_per_iteration": 2.754124402999878 }, { "auxiliary_loss_clip": 0.01273188, "auxiliary_loss_mlp": 0.0104764, "balance_loss_clip": 1.0749855, "balance_loss_mlp": 1.03465211, "epoch": 0.06084290266338003, "flos": 20558284813440.0, "grad_norm": 4.718614710154794, "language_loss": 0.9190439, "learning_rate": 3.990066474600367e-06, "loss": 0.9422521, "num_input_tokens_seen": 10876230, "step": 506, "time_per_iteration": 2.765456438064575 }, { "auxiliary_loss_clip": 0.01262779, "auxiliary_loss_mlp": 0.01052133, "balance_loss_clip": 1.06877553, "balance_loss_mlp": 1.03767323, "epoch": 0.06096314555401912, "flos": 22309217182080.0, "grad_norm": 1.9950037374169263, "language_loss": 0.68002921, "learning_rate": 3.989988782294786e-06, "loss": 0.70317835, "num_input_tokens_seen": 10896320, "step": 507, "time_per_iteration": 2.791929006576538 }, { "auxiliary_loss_clip": 0.01253624, "auxiliary_loss_mlp": 0.01054922, "balance_loss_clip": 1.07389712, "balance_loss_mlp": 1.04213643, "epoch": 0.06108338844465821, "flos": 19131374056320.0, "grad_norm": 1.7896407094300149, "language_loss": 0.95155984, "learning_rate": 3.989910788109489e-06, "loss": 0.97464532, "num_input_tokens_seen": 10912970, "step": 508, "time_per_iteration": 2.870318651199341 }, { "auxiliary_loss_clip": 0.0127209, "auxiliary_loss_mlp": 0.01045501, "balance_loss_clip": 1.07420254, "balance_loss_mlp": 1.03250718, "epoch": 0.0612036313352973, "flos": 33584018169600.0, "grad_norm": 2.4352298821927962, "language_loss": 0.74767423, "learning_rate": 3.989832492056307e-06, "loss": 0.77085012, "num_input_tokens_seen": 10933995, "step": 509, "time_per_iteration": 2.8689661026000977 }, { "auxiliary_loss_clip": 0.0128211, "auxiliary_loss_mlp": 0.01037496, "balance_loss_clip": 1.07651353, "balance_loss_mlp": 1.02429962, "epoch": 0.06132387422593639, "flos": 27490552179840.0, "grad_norm": 4.188547809958518, "language_loss": 0.8089124, "learning_rate": 3.989753894147119e-06, "loss": 0.83210838, "num_input_tokens_seen": 10954120, "step": 510, "time_per_iteration": 2.786726951599121 }, { "auxiliary_loss_clip": 0.01275474, "auxiliary_loss_mlp": 0.0104406, "balance_loss_clip": 1.07544172, "balance_loss_mlp": 1.03123283, "epoch": 0.061444117116575485, "flos": 25885057979520.0, "grad_norm": 1.849880162721458, "language_loss": 0.79794168, "learning_rate": 3.989674994393846e-06, "loss": 0.82113707, "num_input_tokens_seen": 10973595, "step": 511, "time_per_iteration": 2.796072483062744 }, { "auxiliary_loss_clip": 0.01274954, "auxiliary_loss_mlp": 0.01054911, "balance_loss_clip": 1.07562089, "balance_loss_mlp": 1.04221535, "epoch": 0.061564360007214575, "flos": 28512031150080.0, "grad_norm": 2.542322423906823, "language_loss": 0.94007069, "learning_rate": 3.98959579280846e-06, "loss": 0.96336925, "num_input_tokens_seen": 10991995, "step": 512, "time_per_iteration": 3.750535011291504 }, { "auxiliary_loss_clip": 0.01256135, "auxiliary_loss_mlp": 0.01058145, "balance_loss_clip": 1.0806576, "balance_loss_mlp": 1.04563355, "epoch": 0.061684602897853665, "flos": 12094355652480.0, "grad_norm": 2.085485025609844, "language_loss": 0.82799375, "learning_rate": 3.989516289402973e-06, "loss": 0.85113657, "num_input_tokens_seen": 11007625, "step": 513, "time_per_iteration": 3.882488250732422 }, { "auxiliary_loss_clip": 0.01245496, "auxiliary_loss_mlp": 0.01051827, "balance_loss_clip": 1.07600152, "balance_loss_mlp": 1.03913128, "epoch": 0.061804845788492754, "flos": 19532639865600.0, "grad_norm": 2.9529461892688276, "language_loss": 0.80153048, "learning_rate": 3.989436484189447e-06, "loss": 0.82450378, "num_input_tokens_seen": 11025570, "step": 514, "time_per_iteration": 2.762591600418091 }, { "auxiliary_loss_clip": 0.01283022, "auxiliary_loss_mlp": 0.01049247, "balance_loss_clip": 1.07281256, "balance_loss_mlp": 1.03583574, "epoch": 0.061925088679131844, "flos": 15341111020800.0, "grad_norm": 2.7365452705945903, "language_loss": 0.80548435, "learning_rate": 3.9893563771799885e-06, "loss": 0.828807, "num_input_tokens_seen": 11042045, "step": 515, "time_per_iteration": 2.78471302986145 }, { "auxiliary_loss_clip": 0.01287772, "auxiliary_loss_mlp": 0.01059994, "balance_loss_clip": 1.07496452, "balance_loss_mlp": 1.0467205, "epoch": 0.062045331569770934, "flos": 25919927107200.0, "grad_norm": 2.6127890772678324, "language_loss": 0.86107802, "learning_rate": 3.989275968386749e-06, "loss": 0.88455564, "num_input_tokens_seen": 11059955, "step": 516, "time_per_iteration": 4.796777248382568 }, { "auxiliary_loss_clip": 0.01266393, "auxiliary_loss_mlp": 0.01053641, "balance_loss_clip": 1.07458568, "balance_loss_mlp": 1.04006946, "epoch": 0.06216557446041003, "flos": 28110621686400.0, "grad_norm": 3.1230281402898985, "language_loss": 0.76731563, "learning_rate": 3.989195257821926e-06, "loss": 0.7905159, "num_input_tokens_seen": 11078440, "step": 517, "time_per_iteration": 2.861618757247925 }, { "auxiliary_loss_clip": 0.01273392, "auxiliary_loss_mlp": 0.01045416, "balance_loss_clip": 1.0753119, "balance_loss_mlp": 1.03166485, "epoch": 0.06228581735104912, "flos": 23478181395840.0, "grad_norm": 2.2949211241872516, "language_loss": 0.84581435, "learning_rate": 3.989114245497765e-06, "loss": 0.8690024, "num_input_tokens_seen": 11098240, "step": 518, "time_per_iteration": 2.8100550174713135 }, { "auxiliary_loss_clip": 0.0128147, "auxiliary_loss_mlp": 0.01045393, "balance_loss_clip": 1.07050538, "balance_loss_mlp": 1.03310847, "epoch": 0.06240606024168821, "flos": 15195205975680.0, "grad_norm": 2.8720013122991435, "language_loss": 0.95366335, "learning_rate": 3.989032931426554e-06, "loss": 0.97693205, "num_input_tokens_seen": 11115395, "step": 519, "time_per_iteration": 2.7676336765289307 }, { "auxiliary_loss_clip": 0.01267756, "auxiliary_loss_mlp": 0.01043235, "balance_loss_clip": 1.07175446, "balance_loss_mlp": 1.03029513, "epoch": 0.06252630313232731, "flos": 20631829910400.0, "grad_norm": 1.998019884216617, "language_loss": 0.86508596, "learning_rate": 3.9889513156206295e-06, "loss": 0.88819593, "num_input_tokens_seen": 11134835, "step": 520, "time_per_iteration": 2.886974334716797 }, { "auxiliary_loss_clip": 0.01271919, "auxiliary_loss_mlp": 0.01050056, "balance_loss_clip": 1.07550764, "balance_loss_mlp": 1.03636515, "epoch": 0.06264654602296639, "flos": 20778058177920.0, "grad_norm": 2.9759094559717094, "language_loss": 0.73666441, "learning_rate": 3.988869398092371e-06, "loss": 0.75988412, "num_input_tokens_seen": 11154745, "step": 521, "time_per_iteration": 2.904930353164673 }, { "auxiliary_loss_clip": 0.01271619, "auxiliary_loss_mlp": 0.01054529, "balance_loss_clip": 1.07192588, "balance_loss_mlp": 1.0407722, "epoch": 0.06276678891360549, "flos": 29605798241280.0, "grad_norm": 2.4149613010535322, "language_loss": 0.78984165, "learning_rate": 3.988787178854206e-06, "loss": 0.81310308, "num_input_tokens_seen": 11174280, "step": 522, "time_per_iteration": 2.8666164875030518 }, { "auxiliary_loss_clip": 0.01284842, "auxiliary_loss_mlp": 0.01053887, "balance_loss_clip": 1.0758729, "balance_loss_mlp": 1.04129267, "epoch": 0.06288703180424457, "flos": 22126288193280.0, "grad_norm": 6.820498742153648, "language_loss": 0.87312943, "learning_rate": 3.988704657918608e-06, "loss": 0.89651668, "num_input_tokens_seen": 11193340, "step": 523, "time_per_iteration": 2.8360984325408936 }, { "auxiliary_loss_clip": 0.01278453, "auxiliary_loss_mlp": 0.01049992, "balance_loss_clip": 1.07580781, "balance_loss_mlp": 1.03773141, "epoch": 0.06300727469488367, "flos": 14976689587200.0, "grad_norm": 3.1086761392135673, "language_loss": 0.79725754, "learning_rate": 3.988621835298094e-06, "loss": 0.82054198, "num_input_tokens_seen": 11210555, "step": 524, "time_per_iteration": 2.892456293106079 }, { "auxiliary_loss_clip": 0.01278423, "auxiliary_loss_mlp": 0.01050485, "balance_loss_clip": 1.07345676, "balance_loss_mlp": 1.03891623, "epoch": 0.06312751758552275, "flos": 24535391420160.0, "grad_norm": 2.0899252657780902, "language_loss": 0.91959214, "learning_rate": 3.988538711005229e-06, "loss": 0.94288123, "num_input_tokens_seen": 11230010, "step": 525, "time_per_iteration": 2.820000171661377 }, { "auxiliary_loss_clip": 0.01269683, "auxiliary_loss_mlp": 0.01049017, "balance_loss_clip": 1.07228208, "balance_loss_mlp": 1.03682184, "epoch": 0.06324776047616185, "flos": 21507008785920.0, "grad_norm": 2.3688845327652572, "language_loss": 0.88302249, "learning_rate": 3.988455285052622e-06, "loss": 0.90620947, "num_input_tokens_seen": 11246190, "step": 526, "time_per_iteration": 2.826097011566162 }, { "auxiliary_loss_clip": 0.01282268, "auxiliary_loss_mlp": 0.01040859, "balance_loss_clip": 1.08276248, "balance_loss_mlp": 1.02827096, "epoch": 0.06336800336680094, "flos": 21688034353920.0, "grad_norm": 2.4384878335038587, "language_loss": 0.83761847, "learning_rate": 3.98837155745293e-06, "loss": 0.86084974, "num_input_tokens_seen": 11264230, "step": 527, "time_per_iteration": 2.7736318111419678 }, { "auxiliary_loss_clip": 0.01276146, "auxiliary_loss_mlp": 0.0104437, "balance_loss_clip": 1.07490432, "balance_loss_mlp": 1.03119779, "epoch": 0.06348824625744003, "flos": 19500895221120.0, "grad_norm": 2.6580536262073213, "language_loss": 0.76375461, "learning_rate": 3.988287528218854e-06, "loss": 0.78695977, "num_input_tokens_seen": 11283015, "step": 528, "time_per_iteration": 2.790905475616455 }, { "auxiliary_loss_clip": 0.01272328, "auxiliary_loss_mlp": 0.01047028, "balance_loss_clip": 1.07486045, "balance_loss_mlp": 1.03420663, "epoch": 0.06360848914807912, "flos": 15481233976320.0, "grad_norm": 2.0958122725830264, "language_loss": 0.90331924, "learning_rate": 3.98820319736314e-06, "loss": 0.92651284, "num_input_tokens_seen": 11299630, "step": 529, "time_per_iteration": 2.754676103591919 }, { "auxiliary_loss_clip": 0.01271497, "auxiliary_loss_mlp": 0.01045707, "balance_loss_clip": 1.07434011, "balance_loss_mlp": 1.03339303, "epoch": 0.0637287320387182, "flos": 20593369422720.0, "grad_norm": 1.871134802659894, "language_loss": 0.85436684, "learning_rate": 3.988118564898582e-06, "loss": 0.87753886, "num_input_tokens_seen": 11319170, "step": 530, "time_per_iteration": 2.8746886253356934 }, { "auxiliary_loss_clip": 0.0125423, "auxiliary_loss_mlp": 0.0105508, "balance_loss_clip": 1.07294989, "balance_loss_mlp": 1.03254604, "epoch": 0.0638489749293573, "flos": 17412222245760.0, "grad_norm": 2.5470315952262816, "language_loss": 0.88888514, "learning_rate": 3.988033630838019e-06, "loss": 0.91197824, "num_input_tokens_seen": 11333210, "step": 531, "time_per_iteration": 2.8260536193847656 }, { "auxiliary_loss_clip": 0.01280214, "auxiliary_loss_mlp": 0.01045328, "balance_loss_clip": 1.07769299, "balance_loss_mlp": 1.03291869, "epoch": 0.0639692178199964, "flos": 23807661874560.0, "grad_norm": 2.3351590759910534, "language_loss": 0.8832624, "learning_rate": 3.987948395194334e-06, "loss": 0.90651786, "num_input_tokens_seen": 11355590, "step": 532, "time_per_iteration": 2.882761001586914 }, { "auxiliary_loss_clip": 0.01270158, "auxiliary_loss_mlp": 0.01047603, "balance_loss_clip": 1.07148981, "balance_loss_mlp": 1.03535414, "epoch": 0.06408946071063548, "flos": 18477225521280.0, "grad_norm": 4.040757865363299, "language_loss": 0.76948869, "learning_rate": 3.987862857980458e-06, "loss": 0.79266632, "num_input_tokens_seen": 11371535, "step": 533, "time_per_iteration": 2.9016103744506836 }, { "auxiliary_loss_clip": 0.01270896, "auxiliary_loss_mlp": 0.01041053, "balance_loss_clip": 1.07355952, "balance_loss_mlp": 1.02785063, "epoch": 0.06420970360127458, "flos": 27162220936320.0, "grad_norm": 2.664760777562339, "language_loss": 0.76700443, "learning_rate": 3.987777019209368e-06, "loss": 0.79012394, "num_input_tokens_seen": 11392050, "step": 534, "time_per_iteration": 2.8557095527648926 }, { "auxiliary_loss_clip": 0.01276894, "auxiliary_loss_mlp": 0.01046023, "balance_loss_clip": 1.07160759, "balance_loss_mlp": 1.03357196, "epoch": 0.06432994649191366, "flos": 23659673840640.0, "grad_norm": 1.899438890237412, "language_loss": 0.81266606, "learning_rate": 3.987690878894084e-06, "loss": 0.83589524, "num_input_tokens_seen": 11411765, "step": 535, "time_per_iteration": 2.8560004234313965 }, { "auxiliary_loss_clip": 0.01275923, "auxiliary_loss_mlp": 0.01042308, "balance_loss_clip": 1.07510281, "balance_loss_mlp": 1.02932644, "epoch": 0.06445018938255276, "flos": 23403953940480.0, "grad_norm": 2.326171238186557, "language_loss": 0.84586203, "learning_rate": 3.987604437047673e-06, "loss": 0.8690443, "num_input_tokens_seen": 11431565, "step": 536, "time_per_iteration": 2.900845766067505 }, { "auxiliary_loss_clip": 0.0126797, "auxiliary_loss_mlp": 0.01046682, "balance_loss_clip": 1.07039642, "balance_loss_mlp": 1.03406918, "epoch": 0.06457043227319184, "flos": 19646692525440.0, "grad_norm": 2.182552388176667, "language_loss": 0.77677691, "learning_rate": 3.987517693683251e-06, "loss": 0.79992342, "num_input_tokens_seen": 11450140, "step": 537, "time_per_iteration": 2.889055013656616 }, { "auxiliary_loss_clip": 0.01266003, "auxiliary_loss_mlp": 0.01045341, "balance_loss_clip": 1.07264948, "balance_loss_mlp": 1.03229403, "epoch": 0.06469067516383094, "flos": 16978744915200.0, "grad_norm": 2.5265359602747437, "language_loss": 0.95732141, "learning_rate": 3.9874306488139745e-06, "loss": 0.98043483, "num_input_tokens_seen": 11465400, "step": 538, "time_per_iteration": 3.6608922481536865 }, { "auxiliary_loss_clip": 0.01257196, "auxiliary_loss_mlp": 0.01050812, "balance_loss_clip": 1.07177758, "balance_loss_mlp": 1.03790188, "epoch": 0.06481091805447003, "flos": 23296401642240.0, "grad_norm": 2.0380794196298724, "language_loss": 0.88057119, "learning_rate": 3.987343302453049e-06, "loss": 0.90365124, "num_input_tokens_seen": 11486675, "step": 539, "time_per_iteration": 3.8800950050354004 }, { "auxiliary_loss_clip": 0.01266407, "auxiliary_loss_mlp": 0.01049726, "balance_loss_clip": 1.07323575, "balance_loss_mlp": 1.03618956, "epoch": 0.06493116094510912, "flos": 29172356824320.0, "grad_norm": 1.7065998043222612, "language_loss": 0.82548606, "learning_rate": 3.987255654613724e-06, "loss": 0.84864736, "num_input_tokens_seen": 11510440, "step": 540, "time_per_iteration": 3.009535074234009 }, { "auxiliary_loss_clip": 0.01265571, "auxiliary_loss_mlp": 0.01039267, "balance_loss_clip": 1.0732373, "balance_loss_mlp": 1.02629113, "epoch": 0.06505140383574821, "flos": 19865065259520.0, "grad_norm": 4.480508058902246, "language_loss": 0.70937824, "learning_rate": 3.987167705309296e-06, "loss": 0.73242652, "num_input_tokens_seen": 11529715, "step": 541, "time_per_iteration": 2.9098918437957764 }, { "auxiliary_loss_clip": 0.0127793, "auxiliary_loss_mlp": 0.01063002, "balance_loss_clip": 1.07484543, "balance_loss_mlp": 1.04185963, "epoch": 0.0651716467263873, "flos": 17924703540480.0, "grad_norm": 3.309382632944437, "language_loss": 0.95182914, "learning_rate": 3.987079454553108e-06, "loss": 0.97523844, "num_input_tokens_seen": 11547665, "step": 542, "time_per_iteration": 3.6973001956939697 }, { "auxiliary_loss_clip": 0.01258816, "auxiliary_loss_mlp": 0.01043848, "balance_loss_clip": 1.07329631, "balance_loss_mlp": 1.03076458, "epoch": 0.0652918896170264, "flos": 20842840356480.0, "grad_norm": 1.963836131005322, "language_loss": 0.91056895, "learning_rate": 3.986990902358546e-06, "loss": 0.93359554, "num_input_tokens_seen": 11564605, "step": 543, "time_per_iteration": 3.8291568756103516 }, { "auxiliary_loss_clip": 0.01273966, "auxiliary_loss_mlp": 0.01041731, "balance_loss_clip": 1.07211113, "balance_loss_mlp": 1.02961361, "epoch": 0.06541213250766549, "flos": 21872507627520.0, "grad_norm": 2.2347104060428595, "language_loss": 0.93372655, "learning_rate": 3.986902048739045e-06, "loss": 0.95688343, "num_input_tokens_seen": 11584550, "step": 544, "time_per_iteration": 2.934039831161499 }, { "auxiliary_loss_clip": 0.01270784, "auxiliary_loss_mlp": 0.01048869, "balance_loss_clip": 1.0729444, "balance_loss_mlp": 1.03517151, "epoch": 0.06553237539830457, "flos": 23110743219840.0, "grad_norm": 4.726639611250166, "language_loss": 0.80363023, "learning_rate": 3.986812893708082e-06, "loss": 0.82682675, "num_input_tokens_seen": 11600740, "step": 545, "time_per_iteration": 2.830976963043213 }, { "auxiliary_loss_clip": 0.01277092, "auxiliary_loss_mlp": 0.01042207, "balance_loss_clip": 1.07813025, "balance_loss_mlp": 1.02921343, "epoch": 0.06565261828894367, "flos": 17923769786880.0, "grad_norm": 2.277161154343155, "language_loss": 0.81244624, "learning_rate": 3.9867234372791826e-06, "loss": 0.8356393, "num_input_tokens_seen": 11618695, "step": 546, "time_per_iteration": 2.852114677429199 }, { "auxiliary_loss_clip": 0.01272095, "auxiliary_loss_mlp": 0.01045211, "balance_loss_clip": 1.07158244, "balance_loss_mlp": 1.03253293, "epoch": 0.06577286117958275, "flos": 22783058421120.0, "grad_norm": 1.948185897798666, "language_loss": 0.87195379, "learning_rate": 3.986633679465918e-06, "loss": 0.89512688, "num_input_tokens_seen": 11638850, "step": 547, "time_per_iteration": 2.7867681980133057 }, { "auxiliary_loss_clip": 0.01264357, "auxiliary_loss_mlp": 0.01056718, "balance_loss_clip": 1.07432675, "balance_loss_mlp": 1.04412317, "epoch": 0.06589310407022185, "flos": 23696194993920.0, "grad_norm": 2.2732690188593687, "language_loss": 0.80487871, "learning_rate": 3.986543620281904e-06, "loss": 0.82808948, "num_input_tokens_seen": 11658500, "step": 548, "time_per_iteration": 2.880086660385132 }, { "auxiliary_loss_clip": 0.01254581, "auxiliary_loss_mlp": 0.01054549, "balance_loss_clip": 1.07417011, "balance_loss_mlp": 1.04243743, "epoch": 0.06601334696086093, "flos": 26864772410880.0, "grad_norm": 1.6824992740284912, "language_loss": 0.9116171, "learning_rate": 3.986453259740802e-06, "loss": 0.93470836, "num_input_tokens_seen": 11676670, "step": 549, "time_per_iteration": 2.8806371688842773 }, { "auxiliary_loss_clip": 0.01270491, "auxiliary_loss_mlp": 0.01045751, "balance_loss_clip": 1.07389522, "balance_loss_mlp": 1.03266144, "epoch": 0.06613358985150003, "flos": 12567694101120.0, "grad_norm": 3.0734907912301206, "language_loss": 0.79176551, "learning_rate": 3.986362597856319e-06, "loss": 0.81492794, "num_input_tokens_seen": 11693170, "step": 550, "time_per_iteration": 2.8545315265655518 }, { "auxiliary_loss_clip": 0.01268386, "auxiliary_loss_mlp": 0.0105425, "balance_loss_clip": 1.07437515, "balance_loss_mlp": 1.03273559, "epoch": 0.06625383274213913, "flos": 18332505624960.0, "grad_norm": 3.8137669937456766, "language_loss": 0.81569576, "learning_rate": 3.986271634642211e-06, "loss": 0.83892202, "num_input_tokens_seen": 11710150, "step": 551, "time_per_iteration": 2.871793746948242 }, { "auxiliary_loss_clip": 0.01278326, "auxiliary_loss_mlp": 0.01045527, "balance_loss_clip": 1.07434845, "balance_loss_mlp": 1.03292108, "epoch": 0.06637407563277821, "flos": 15375585098880.0, "grad_norm": 2.439496803042045, "language_loss": 0.81834543, "learning_rate": 3.986180370112274e-06, "loss": 0.84158397, "num_input_tokens_seen": 11726670, "step": 552, "time_per_iteration": 2.7724862098693848 }, { "auxiliary_loss_clip": 0.0127466, "auxiliary_loss_mlp": 0.01061315, "balance_loss_clip": 1.07201135, "balance_loss_mlp": 1.03951359, "epoch": 0.0664943185234173, "flos": 24025244509440.0, "grad_norm": 1.9024055653398142, "language_loss": 0.74175274, "learning_rate": 3.986088804280354e-06, "loss": 0.76511252, "num_input_tokens_seen": 11746400, "step": 553, "time_per_iteration": 2.8313143253326416 }, { "auxiliary_loss_clip": 0.01269691, "auxiliary_loss_mlp": 0.0104871, "balance_loss_clip": 1.07312167, "balance_loss_mlp": 1.03601408, "epoch": 0.06661456141405639, "flos": 20957503547520.0, "grad_norm": 2.5891392248762917, "language_loss": 0.94107848, "learning_rate": 3.985996937160342e-06, "loss": 0.96426249, "num_input_tokens_seen": 11765590, "step": 554, "time_per_iteration": 3.0576939582824707 }, { "auxiliary_loss_clip": 0.01270683, "auxiliary_loss_mlp": 0.01048455, "balance_loss_clip": 1.07356989, "balance_loss_mlp": 1.03482938, "epoch": 0.06673480430469549, "flos": 52223953322880.0, "grad_norm": 2.0372885060556203, "language_loss": 0.68835545, "learning_rate": 3.985904768766173e-06, "loss": 0.71154678, "num_input_tokens_seen": 11788365, "step": 555, "time_per_iteration": 3.100843906402588 }, { "auxiliary_loss_clip": 0.01269672, "auxiliary_loss_mlp": 0.01047008, "balance_loss_clip": 1.07503629, "balance_loss_mlp": 1.03351939, "epoch": 0.06685504719533458, "flos": 16217079995520.0, "grad_norm": 2.615457136797036, "language_loss": 0.75949162, "learning_rate": 3.98581229911183e-06, "loss": 0.7826584, "num_input_tokens_seen": 11807285, "step": 556, "time_per_iteration": 2.793134927749634 }, { "auxiliary_loss_clip": 0.01276252, "auxiliary_loss_mlp": 0.01044233, "balance_loss_clip": 1.07210708, "balance_loss_mlp": 1.03052974, "epoch": 0.06697529008597367, "flos": 22491535639680.0, "grad_norm": 1.992854757777519, "language_loss": 0.92080021, "learning_rate": 3.985719528211341e-06, "loss": 0.94400501, "num_input_tokens_seen": 11826655, "step": 557, "time_per_iteration": 2.7880334854125977 }, { "auxiliary_loss_clip": 0.01165356, "auxiliary_loss_mlp": 0.01019842, "balance_loss_clip": 1.03373337, "balance_loss_mlp": 1.0154078, "epoch": 0.06709553297661276, "flos": 62688216936960.0, "grad_norm": 0.8515530786096589, "language_loss": 0.63036025, "learning_rate": 3.985626456078777e-06, "loss": 0.65221214, "num_input_tokens_seen": 11891310, "step": 558, "time_per_iteration": 3.4647905826568604 }, { "auxiliary_loss_clip": 0.01270872, "auxiliary_loss_mlp": 0.01044356, "balance_loss_clip": 1.07654142, "balance_loss_mlp": 1.03147531, "epoch": 0.06721577586725185, "flos": 11216590997760.0, "grad_norm": 2.076229116439894, "language_loss": 0.86557949, "learning_rate": 3.985533082728259e-06, "loss": 0.88873178, "num_input_tokens_seen": 11906965, "step": 559, "time_per_iteration": 2.9440109729766846 }, { "auxiliary_loss_clip": 0.01281028, "auxiliary_loss_mlp": 0.01042607, "balance_loss_clip": 1.07342935, "balance_loss_mlp": 1.03012609, "epoch": 0.06733601875789094, "flos": 25922189664000.0, "grad_norm": 2.5914291274449504, "language_loss": 0.74911463, "learning_rate": 3.985439408173951e-06, "loss": 0.77235103, "num_input_tokens_seen": 11927190, "step": 560, "time_per_iteration": 2.904503107070923 }, { "auxiliary_loss_clip": 0.01280637, "auxiliary_loss_mlp": 0.01060973, "balance_loss_clip": 1.07430124, "balance_loss_mlp": 1.04700184, "epoch": 0.06745626164853002, "flos": 20813645577600.0, "grad_norm": 2.451234839776651, "language_loss": 0.70748031, "learning_rate": 3.9853454324300634e-06, "loss": 0.73089641, "num_input_tokens_seen": 11946400, "step": 561, "time_per_iteration": 2.770808219909668 }, { "auxiliary_loss_clip": 0.0126503, "auxiliary_loss_mlp": 0.01046417, "balance_loss_clip": 1.0757792, "balance_loss_mlp": 1.03287458, "epoch": 0.06757650453916912, "flos": 19829262378240.0, "grad_norm": 2.1814024943965538, "language_loss": 0.78138107, "learning_rate": 3.985251155510852e-06, "loss": 0.80449551, "num_input_tokens_seen": 11965430, "step": 562, "time_per_iteration": 2.9399473667144775 }, { "auxiliary_loss_clip": 0.01259079, "auxiliary_loss_mlp": 0.01053402, "balance_loss_clip": 1.07328594, "balance_loss_mlp": 1.04040253, "epoch": 0.06769674742980822, "flos": 25739224761600.0, "grad_norm": 1.8383846618193604, "language_loss": 0.80352646, "learning_rate": 3.98515657743062e-06, "loss": 0.82665128, "num_input_tokens_seen": 11984895, "step": 563, "time_per_iteration": 2.9791414737701416 }, { "auxiliary_loss_clip": 0.01270501, "auxiliary_loss_mlp": 0.01049456, "balance_loss_clip": 1.07210183, "balance_loss_mlp": 1.03739762, "epoch": 0.0678169903204473, "flos": 13074788355840.0, "grad_norm": 1.9735416202580496, "language_loss": 0.77819532, "learning_rate": 3.985061698203711e-06, "loss": 0.80139488, "num_input_tokens_seen": 12002010, "step": 564, "time_per_iteration": 3.9009385108947754 }, { "auxiliary_loss_clip": 0.0116317, "auxiliary_loss_mlp": 0.01006567, "balance_loss_clip": 1.0292902, "balance_loss_mlp": 1.00217998, "epoch": 0.0679372332110864, "flos": 70865830788480.0, "grad_norm": 0.8831257307775144, "language_loss": 0.63754904, "learning_rate": 3.984966517844523e-06, "loss": 0.65924644, "num_input_tokens_seen": 12057255, "step": 565, "time_per_iteration": 4.282278060913086 }, { "auxiliary_loss_clip": 0.01281293, "auxiliary_loss_mlp": 0.01048386, "balance_loss_clip": 1.07435358, "balance_loss_mlp": 1.03541028, "epoch": 0.06805747610172548, "flos": 28256418990720.0, "grad_norm": 3.995685440611116, "language_loss": 0.8016153, "learning_rate": 3.984871036367492e-06, "loss": 0.82491207, "num_input_tokens_seen": 12077280, "step": 566, "time_per_iteration": 2.815277576446533 }, { "auxiliary_loss_clip": 0.0126898, "auxiliary_loss_mlp": 0.01054234, "balance_loss_clip": 1.07335305, "balance_loss_mlp": 1.03326035, "epoch": 0.06817771899236458, "flos": 20120533764480.0, "grad_norm": 1.8337440641637075, "language_loss": 0.83264476, "learning_rate": 3.984775253787102e-06, "loss": 0.85587692, "num_input_tokens_seen": 12095570, "step": 567, "time_per_iteration": 2.8257763385772705 }, { "auxiliary_loss_clip": 0.01277523, "auxiliary_loss_mlp": 0.01052138, "balance_loss_clip": 1.07247865, "balance_loss_mlp": 1.03805983, "epoch": 0.06829796188300366, "flos": 17930629284480.0, "grad_norm": 4.198992071119496, "language_loss": 0.88290405, "learning_rate": 3.984679170117885e-06, "loss": 0.90620065, "num_input_tokens_seen": 12111775, "step": 568, "time_per_iteration": 4.66243839263916 }, { "auxiliary_loss_clip": 0.01268478, "auxiliary_loss_mlp": 0.01040349, "balance_loss_clip": 1.06917894, "balance_loss_mlp": 1.02864277, "epoch": 0.06841820477364276, "flos": 14501627285760.0, "grad_norm": 2.558471341510299, "language_loss": 0.78539348, "learning_rate": 3.984582785374415e-06, "loss": 0.80848175, "num_input_tokens_seen": 12129215, "step": 569, "time_per_iteration": 2.850443124771118 }, { "auxiliary_loss_clip": 0.01268582, "auxiliary_loss_mlp": 0.01050733, "balance_loss_clip": 1.07341051, "balance_loss_mlp": 1.03021288, "epoch": 0.06853844766428185, "flos": 21938474954880.0, "grad_norm": 2.537334067232046, "language_loss": 0.80746126, "learning_rate": 3.9844860995713155e-06, "loss": 0.83065444, "num_input_tokens_seen": 12148755, "step": 570, "time_per_iteration": 2.9138073921203613 }, { "auxiliary_loss_clip": 0.01275855, "auxiliary_loss_mlp": 0.01049446, "balance_loss_clip": 1.07824326, "balance_loss_mlp": 1.03661346, "epoch": 0.06865869055492094, "flos": 16800628348800.0, "grad_norm": 6.617401161749388, "language_loss": 0.8285501, "learning_rate": 3.9843891127232524e-06, "loss": 0.85180306, "num_input_tokens_seen": 12166290, "step": 571, "time_per_iteration": 2.9351093769073486 }, { "auxiliary_loss_clip": 0.01249706, "auxiliary_loss_mlp": 0.01041554, "balance_loss_clip": 1.07320738, "balance_loss_mlp": 1.03002059, "epoch": 0.06877893344556003, "flos": 19937281553280.0, "grad_norm": 2.5669877048128207, "language_loss": 0.66684425, "learning_rate": 3.984291824844938e-06, "loss": 0.68975687, "num_input_tokens_seen": 12181385, "step": 572, "time_per_iteration": 3.0180647373199463 }, { "auxiliary_loss_clip": 0.01280128, "auxiliary_loss_mlp": 0.01037045, "balance_loss_clip": 1.07250261, "balance_loss_mlp": 1.02461767, "epoch": 0.06889917633619912, "flos": 23039388852480.0, "grad_norm": 3.961083006856355, "language_loss": 0.84872931, "learning_rate": 3.984194235951132e-06, "loss": 0.87190104, "num_input_tokens_seen": 12197530, "step": 573, "time_per_iteration": 2.8952906131744385 }, { "auxiliary_loss_clip": 0.01279784, "auxiliary_loss_mlp": 0.01042216, "balance_loss_clip": 1.07499611, "balance_loss_mlp": 1.02876949, "epoch": 0.06901941922683821, "flos": 20960556203520.0, "grad_norm": 3.8048902027442963, "language_loss": 0.85115516, "learning_rate": 3.9840963460566375e-06, "loss": 0.87437522, "num_input_tokens_seen": 12216310, "step": 574, "time_per_iteration": 2.965738534927368 }, { "auxiliary_loss_clip": 0.01234325, "auxiliary_loss_mlp": 0.01044725, "balance_loss_clip": 1.07086134, "balance_loss_mlp": 1.03203511, "epoch": 0.06913966211747731, "flos": 24821850384000.0, "grad_norm": 1.6248852409696606, "language_loss": 0.89420009, "learning_rate": 3.983998155176305e-06, "loss": 0.91699064, "num_input_tokens_seen": 12236670, "step": 575, "time_per_iteration": 2.936257839202881 }, { "auxiliary_loss_clip": 0.01159295, "auxiliary_loss_mlp": 0.01016863, "balance_loss_clip": 1.02611125, "balance_loss_mlp": 1.01240432, "epoch": 0.06925990500811639, "flos": 58367446957440.0, "grad_norm": 0.8213460839124219, "language_loss": 0.57052898, "learning_rate": 3.9838996633250305e-06, "loss": 0.59229052, "num_input_tokens_seen": 12297185, "step": 576, "time_per_iteration": 3.3742127418518066 }, { "auxiliary_loss_clip": 0.01276456, "auxiliary_loss_mlp": 0.01044064, "balance_loss_clip": 1.07321811, "balance_loss_mlp": 1.0316304, "epoch": 0.06938014789875549, "flos": 12749940731520.0, "grad_norm": 2.1584405828232853, "language_loss": 0.88145947, "learning_rate": 3.983800870517753e-06, "loss": 0.90466464, "num_input_tokens_seen": 12313975, "step": 577, "time_per_iteration": 3.0546553134918213 }, { "auxiliary_loss_clip": 0.01268402, "auxiliary_loss_mlp": 0.01046538, "balance_loss_clip": 1.07351649, "balance_loss_mlp": 1.03458095, "epoch": 0.06950039078939457, "flos": 22820226019200.0, "grad_norm": 3.861558108487169, "language_loss": 0.78242421, "learning_rate": 3.983701776769463e-06, "loss": 0.80557358, "num_input_tokens_seen": 12331385, "step": 578, "time_per_iteration": 3.033407688140869 }, { "auxiliary_loss_clip": 0.01262914, "auxiliary_loss_mlp": 0.01046452, "balance_loss_clip": 1.07595599, "balance_loss_mlp": 1.03506196, "epoch": 0.06962063368003367, "flos": 21941348042880.0, "grad_norm": 1.969559303328759, "language_loss": 0.85661769, "learning_rate": 3.9836023820951885e-06, "loss": 0.87971139, "num_input_tokens_seen": 12350600, "step": 579, "time_per_iteration": 2.956401824951172 }, { "auxiliary_loss_clip": 0.01258835, "auxiliary_loss_mlp": 0.01040817, "balance_loss_clip": 1.07122684, "balance_loss_mlp": 1.02977228, "epoch": 0.06974087657067275, "flos": 20706021452160.0, "grad_norm": 12.103288543728482, "language_loss": 0.68651879, "learning_rate": 3.983502686510011e-06, "loss": 0.70951533, "num_input_tokens_seen": 12371430, "step": 580, "time_per_iteration": 2.997620105743408 }, { "auxiliary_loss_clip": 0.0127224, "auxiliary_loss_mlp": 0.01048948, "balance_loss_clip": 1.07211316, "balance_loss_mlp": 1.02847469, "epoch": 0.06986111946131185, "flos": 22638230784000.0, "grad_norm": 1.927628452196339, "language_loss": 0.73725998, "learning_rate": 3.9834026900290525e-06, "loss": 0.76047188, "num_input_tokens_seen": 12390825, "step": 581, "time_per_iteration": 3.1054999828338623 }, { "auxiliary_loss_clip": 0.01275106, "auxiliary_loss_mlp": 0.0105041, "balance_loss_clip": 1.07320774, "balance_loss_mlp": 1.03857279, "epoch": 0.06998136235195095, "flos": 26943453152640.0, "grad_norm": 2.045523554682664, "language_loss": 1.00095868, "learning_rate": 3.983302392667482e-06, "loss": 1.02421379, "num_input_tokens_seen": 12411670, "step": 582, "time_per_iteration": 2.9712324142456055 }, { "auxiliary_loss_clip": 0.01269187, "auxiliary_loss_mlp": 0.01041652, "balance_loss_clip": 1.07359111, "balance_loss_mlp": 1.02991021, "epoch": 0.07010160524259003, "flos": 22492505306880.0, "grad_norm": 1.6944090488259966, "language_loss": 0.93665385, "learning_rate": 3.983201794440517e-06, "loss": 0.95976222, "num_input_tokens_seen": 12431245, "step": 583, "time_per_iteration": 2.8104865550994873 }, { "auxiliary_loss_clip": 0.01253375, "auxiliary_loss_mlp": 0.01046733, "balance_loss_clip": 1.07113051, "balance_loss_mlp": 1.03528261, "epoch": 0.07022184813322913, "flos": 18332541538560.0, "grad_norm": 1.6730980627602174, "language_loss": 0.6760152, "learning_rate": 3.9831008953634165e-06, "loss": 0.69901633, "num_input_tokens_seen": 12450535, "step": 584, "time_per_iteration": 2.754673957824707 }, { "auxiliary_loss_clip": 0.01239059, "auxiliary_loss_mlp": 0.01047411, "balance_loss_clip": 1.07266736, "balance_loss_mlp": 1.03486454, "epoch": 0.07034209102386821, "flos": 24675550289280.0, "grad_norm": 2.3578523263147786, "language_loss": 0.81287396, "learning_rate": 3.9829996954514864e-06, "loss": 0.83573872, "num_input_tokens_seen": 12469675, "step": 585, "time_per_iteration": 2.8885936737060547 }, { "auxiliary_loss_clip": 0.01258346, "auxiliary_loss_mlp": 0.01049703, "balance_loss_clip": 1.07228529, "balance_loss_mlp": 1.03800845, "epoch": 0.0704623339145073, "flos": 25995878415360.0, "grad_norm": 2.163549145493679, "language_loss": 0.84176886, "learning_rate": 3.982898194720079e-06, "loss": 0.86484939, "num_input_tokens_seen": 12490405, "step": 586, "time_per_iteration": 2.885371208190918 }, { "auxiliary_loss_clip": 0.01259946, "auxiliary_loss_mlp": 0.01053099, "balance_loss_clip": 1.07608485, "balance_loss_mlp": 1.03189242, "epoch": 0.0705825768051464, "flos": 25338318088320.0, "grad_norm": 1.943087016330623, "language_loss": 0.82361639, "learning_rate": 3.982796393184592e-06, "loss": 0.8467468, "num_input_tokens_seen": 12509485, "step": 587, "time_per_iteration": 2.861356496810913 }, { "auxiliary_loss_clip": 0.01155198, "auxiliary_loss_mlp": 0.01007862, "balance_loss_clip": 1.02640796, "balance_loss_mlp": 1.00335586, "epoch": 0.07070281969578548, "flos": 66047552507520.0, "grad_norm": 0.7990400042262248, "language_loss": 0.62681973, "learning_rate": 3.98269429086047e-06, "loss": 0.64845037, "num_input_tokens_seen": 12567325, "step": 588, "time_per_iteration": 3.2165849208831787 }, { "auxiliary_loss_clip": 0.01259104, "auxiliary_loss_mlp": 0.01046331, "balance_loss_clip": 1.07465327, "balance_loss_mlp": 1.0331049, "epoch": 0.07082306258642458, "flos": 23653568528640.0, "grad_norm": 4.96266524837487, "language_loss": 0.8600682, "learning_rate": 3.982591887763199e-06, "loss": 0.88312256, "num_input_tokens_seen": 12584785, "step": 589, "time_per_iteration": 2.853973627090454 }, { "auxiliary_loss_clip": 0.01238589, "auxiliary_loss_mlp": 0.01041938, "balance_loss_clip": 1.07169557, "balance_loss_mlp": 1.02965915, "epoch": 0.07094330547706366, "flos": 13880049408000.0, "grad_norm": 2.35489285475884, "language_loss": 0.81695908, "learning_rate": 3.982489183908316e-06, "loss": 0.83976436, "num_input_tokens_seen": 12601205, "step": 590, "time_per_iteration": 2.7365081310272217 }, { "auxiliary_loss_clip": 0.0122842, "auxiliary_loss_mlp": 0.01039527, "balance_loss_clip": 1.07223582, "balance_loss_mlp": 1.02802932, "epoch": 0.07106354836770276, "flos": 24645098534400.0, "grad_norm": 1.7674667788220961, "language_loss": 0.84574389, "learning_rate": 3.982386179311399e-06, "loss": 0.86842334, "num_input_tokens_seen": 12621725, "step": 591, "time_per_iteration": 5.2394020557403564 }, { "auxiliary_loss_clip": 0.01275483, "auxiliary_loss_mlp": 0.01053352, "balance_loss_clip": 1.07519555, "balance_loss_mlp": 1.04079938, "epoch": 0.07118379125834184, "flos": 16217223649920.0, "grad_norm": 2.473309929066003, "language_loss": 0.87702119, "learning_rate": 3.982282873988075e-06, "loss": 0.90030956, "num_input_tokens_seen": 12639600, "step": 592, "time_per_iteration": 2.8514647483825684 }, { "auxiliary_loss_clip": 0.01258899, "auxiliary_loss_mlp": 0.01040334, "balance_loss_clip": 1.07183516, "balance_loss_mlp": 1.02871656, "epoch": 0.07130403414898094, "flos": 19719986227200.0, "grad_norm": 1.707413131191471, "language_loss": 0.87066442, "learning_rate": 3.982179267954016e-06, "loss": 0.89365673, "num_input_tokens_seen": 12660030, "step": 593, "time_per_iteration": 2.877372980117798 }, { "auxiliary_loss_clip": 0.01272103, "auxiliary_loss_mlp": 0.01045946, "balance_loss_clip": 1.07112122, "balance_loss_mlp": 1.0338881, "epoch": 0.07142427703962004, "flos": 21871933009920.0, "grad_norm": 2.069665150151414, "language_loss": 0.95994812, "learning_rate": 3.982075361224937e-06, "loss": 0.98312867, "num_input_tokens_seen": 12678395, "step": 594, "time_per_iteration": 4.898026704788208 }, { "auxiliary_loss_clip": 0.01266289, "auxiliary_loss_mlp": 0.01060458, "balance_loss_clip": 1.0733794, "balance_loss_mlp": 1.03902042, "epoch": 0.07154451993025912, "flos": 18296595002880.0, "grad_norm": 2.374510067652038, "language_loss": 0.88084102, "learning_rate": 3.981971153816602e-06, "loss": 0.90410852, "num_input_tokens_seen": 12696000, "step": 595, "time_per_iteration": 2.838866949081421 }, { "auxiliary_loss_clip": 0.0127139, "auxiliary_loss_mlp": 0.01054442, "balance_loss_clip": 1.07430398, "balance_loss_mlp": 1.04270577, "epoch": 0.07166476282089822, "flos": 22160690444160.0, "grad_norm": 2.464434996535162, "language_loss": 0.96099633, "learning_rate": 3.981866645744819e-06, "loss": 0.98425466, "num_input_tokens_seen": 12716715, "step": 596, "time_per_iteration": 2.8659448623657227 }, { "auxiliary_loss_clip": 0.0127708, "auxiliary_loss_mlp": 0.01057611, "balance_loss_clip": 1.07399869, "balance_loss_mlp": 1.03578281, "epoch": 0.0717850057115373, "flos": 14136343925760.0, "grad_norm": 2.26030729977526, "language_loss": 0.81226766, "learning_rate": 3.9817618370254416e-06, "loss": 0.83561456, "num_input_tokens_seen": 12733370, "step": 597, "time_per_iteration": 2.6574652194976807 }, { "auxiliary_loss_clip": 0.01277424, "auxiliary_loss_mlp": 0.0103591, "balance_loss_clip": 1.07553589, "balance_loss_mlp": 1.02382779, "epoch": 0.0719052486021764, "flos": 30917794412160.0, "grad_norm": 2.125596338619234, "language_loss": 0.8759259, "learning_rate": 3.9816567276743684e-06, "loss": 0.89905918, "num_input_tokens_seen": 12753235, "step": 598, "time_per_iteration": 2.772387742996216 }, { "auxiliary_loss_clip": 0.01258059, "auxiliary_loss_mlp": 0.0103518, "balance_loss_clip": 1.07085085, "balance_loss_mlp": 1.02318192, "epoch": 0.0720254914928155, "flos": 21287019939840.0, "grad_norm": 2.3687008674515027, "language_loss": 0.77475977, "learning_rate": 3.9815513177075466e-06, "loss": 0.79769218, "num_input_tokens_seen": 12772020, "step": 599, "time_per_iteration": 2.688485860824585 }, { "auxiliary_loss_clip": 0.01266966, "auxiliary_loss_mlp": 0.01047149, "balance_loss_clip": 1.07632756, "balance_loss_mlp": 1.03596723, "epoch": 0.07214573438345458, "flos": 27819170732160.0, "grad_norm": 1.935335623210175, "language_loss": 0.70351279, "learning_rate": 3.9814456071409646e-06, "loss": 0.72665393, "num_input_tokens_seen": 12792555, "step": 600, "time_per_iteration": 2.8097498416900635 }, { "auxiliary_loss_clip": 0.01269357, "auxiliary_loss_mlp": 0.01050929, "balance_loss_clip": 1.07529652, "balance_loss_mlp": 1.038293, "epoch": 0.07226597727409367, "flos": 25483576688640.0, "grad_norm": 2.4204178270772028, "language_loss": 0.85598141, "learning_rate": 3.981339595990659e-06, "loss": 0.87918425, "num_input_tokens_seen": 12811085, "step": 601, "time_per_iteration": 2.8215465545654297 }, { "auxiliary_loss_clip": 0.01269721, "auxiliary_loss_mlp": 0.01045454, "balance_loss_clip": 1.07348704, "balance_loss_mlp": 1.03304434, "epoch": 0.07238622016473276, "flos": 23513840622720.0, "grad_norm": 2.851235370028904, "language_loss": 0.809681, "learning_rate": 3.981233284272713e-06, "loss": 0.83283269, "num_input_tokens_seen": 12830830, "step": 602, "time_per_iteration": 2.784794807434082 }, { "auxiliary_loss_clip": 0.01262278, "auxiliary_loss_mlp": 0.01042371, "balance_loss_clip": 1.07304168, "balance_loss_mlp": 1.03060555, "epoch": 0.07250646305537185, "flos": 25453519983360.0, "grad_norm": 1.5983149094349172, "language_loss": 0.90153009, "learning_rate": 3.981126672003253e-06, "loss": 0.92457658, "num_input_tokens_seen": 12853505, "step": 603, "time_per_iteration": 2.850980043411255 }, { "auxiliary_loss_clip": 0.01266476, "auxiliary_loss_mlp": 0.01038581, "balance_loss_clip": 1.06852829, "balance_loss_mlp": 1.02654088, "epoch": 0.07262670594601094, "flos": 27155038216320.0, "grad_norm": 2.1689807997081694, "language_loss": 0.77849668, "learning_rate": 3.981019759198451e-06, "loss": 0.80154729, "num_input_tokens_seen": 12872455, "step": 604, "time_per_iteration": 2.812654733657837 }, { "auxiliary_loss_clip": 0.01262843, "auxiliary_loss_mlp": 0.01047997, "balance_loss_clip": 1.0692656, "balance_loss_mlp": 1.03706551, "epoch": 0.07274694883665003, "flos": 26651607148800.0, "grad_norm": 2.361843544471588, "language_loss": 0.84265333, "learning_rate": 3.980912545874528e-06, "loss": 0.86576176, "num_input_tokens_seen": 12892620, "step": 605, "time_per_iteration": 2.850767135620117 }, { "auxiliary_loss_clip": 0.01258385, "auxiliary_loss_mlp": 0.01054237, "balance_loss_clip": 1.07201898, "balance_loss_mlp": 1.03288651, "epoch": 0.07286719172728913, "flos": 29862344154240.0, "grad_norm": 2.2163745012523477, "language_loss": 0.85606593, "learning_rate": 3.980805032047746e-06, "loss": 0.87919211, "num_input_tokens_seen": 12914090, "step": 606, "time_per_iteration": 2.7155404090881348 }, { "auxiliary_loss_clip": 0.01264353, "auxiliary_loss_mlp": 0.01049352, "balance_loss_clip": 1.07239044, "balance_loss_mlp": 1.03646588, "epoch": 0.07298743461792821, "flos": 17382057799680.0, "grad_norm": 1.8426753914817937, "language_loss": 0.81216717, "learning_rate": 3.980697217734415e-06, "loss": 0.8353042, "num_input_tokens_seen": 12931830, "step": 607, "time_per_iteration": 2.76446533203125 }, { "auxiliary_loss_clip": 0.01257166, "auxiliary_loss_mlp": 0.01058881, "balance_loss_clip": 1.07228041, "balance_loss_mlp": 1.03692472, "epoch": 0.07310767750856731, "flos": 19498201701120.0, "grad_norm": 1.9781676084889737, "language_loss": 0.91500509, "learning_rate": 3.980589102950891e-06, "loss": 0.93816561, "num_input_tokens_seen": 12949995, "step": 608, "time_per_iteration": 2.842074394226074 }, { "auxiliary_loss_clip": 0.01263676, "auxiliary_loss_mlp": 0.0104302, "balance_loss_clip": 1.076635, "balance_loss_mlp": 1.03016305, "epoch": 0.07322792039920639, "flos": 29168693637120.0, "grad_norm": 2.5042727705224, "language_loss": 0.76515388, "learning_rate": 3.9804806877135755e-06, "loss": 0.78822082, "num_input_tokens_seen": 12968040, "step": 609, "time_per_iteration": 2.853900671005249 }, { "auxiliary_loss_clip": 0.01273908, "auxiliary_loss_mlp": 0.01055381, "balance_loss_clip": 1.07124424, "balance_loss_mlp": 1.03411257, "epoch": 0.07334816328984549, "flos": 23477822259840.0, "grad_norm": 2.4932118762917446, "language_loss": 0.86429632, "learning_rate": 3.980371972038915e-06, "loss": 0.88758922, "num_input_tokens_seen": 12988530, "step": 610, "time_per_iteration": 2.7619411945343018 }, { "auxiliary_loss_clip": 0.01277989, "auxiliary_loss_mlp": 0.01041723, "balance_loss_clip": 1.07713854, "balance_loss_mlp": 1.02911663, "epoch": 0.07346840618048459, "flos": 22962467877120.0, "grad_norm": 2.1707225009407787, "language_loss": 0.84519935, "learning_rate": 3.980262955943399e-06, "loss": 0.86839652, "num_input_tokens_seen": 13008195, "step": 611, "time_per_iteration": 2.8006627559661865 }, { "auxiliary_loss_clip": 0.01260443, "auxiliary_loss_mlp": 0.01038379, "balance_loss_clip": 1.07474518, "balance_loss_mlp": 1.02642262, "epoch": 0.07358864907112367, "flos": 17673903803520.0, "grad_norm": 2.3998848223230493, "language_loss": 0.87225157, "learning_rate": 3.980153639443569e-06, "loss": 0.89523983, "num_input_tokens_seen": 13024180, "step": 612, "time_per_iteration": 2.78914475440979 }, { "auxiliary_loss_clip": 0.01275312, "auxiliary_loss_mlp": 0.01043671, "balance_loss_clip": 1.07637274, "balance_loss_mlp": 1.03123164, "epoch": 0.07370889196176277, "flos": 24097029840000.0, "grad_norm": 4.596938389957519, "language_loss": 0.80187213, "learning_rate": 3.980044022556005e-06, "loss": 0.82506198, "num_input_tokens_seen": 13043865, "step": 613, "time_per_iteration": 2.9118103981018066 }, { "auxiliary_loss_clip": 0.01265742, "auxiliary_loss_mlp": 0.01044608, "balance_loss_clip": 1.07158279, "balance_loss_mlp": 1.03170991, "epoch": 0.07382913485240185, "flos": 25885919905920.0, "grad_norm": 2.3058316978193774, "language_loss": 0.73031944, "learning_rate": 3.9799341052973375e-06, "loss": 0.75342292, "num_input_tokens_seen": 13063700, "step": 614, "time_per_iteration": 2.911677360534668 }, { "auxiliary_loss_clip": 0.01265578, "auxiliary_loss_mlp": 0.01049302, "balance_loss_clip": 1.07661748, "balance_loss_mlp": 1.03564048, "epoch": 0.07394937774304094, "flos": 16873850223360.0, "grad_norm": 2.4896594097542804, "language_loss": 0.74891639, "learning_rate": 3.979823887684241e-06, "loss": 0.77206516, "num_input_tokens_seen": 13082640, "step": 615, "time_per_iteration": 2.807159662246704 }, { "auxiliary_loss_clip": 0.01276853, "auxiliary_loss_mlp": 0.01053376, "balance_loss_clip": 1.07545424, "balance_loss_mlp": 1.04134202, "epoch": 0.07406962063368003, "flos": 20703471586560.0, "grad_norm": 2.1648639157357423, "language_loss": 0.84703064, "learning_rate": 3.979713369733434e-06, "loss": 0.8703329, "num_input_tokens_seen": 13100505, "step": 616, "time_per_iteration": 3.6920642852783203 }, { "auxiliary_loss_clip": 0.01263342, "auxiliary_loss_mlp": 0.01051183, "balance_loss_clip": 1.07673681, "balance_loss_mlp": 1.03802216, "epoch": 0.07418986352431912, "flos": 21430985650560.0, "grad_norm": 2.0564526899098032, "language_loss": 0.85133874, "learning_rate": 3.979602551461683e-06, "loss": 0.874484, "num_input_tokens_seen": 13121285, "step": 617, "time_per_iteration": 3.8939061164855957 }, { "auxiliary_loss_clip": 0.01260663, "auxiliary_loss_mlp": 0.01038455, "balance_loss_clip": 1.07354128, "balance_loss_mlp": 1.02710593, "epoch": 0.07431010641495822, "flos": 12021133777920.0, "grad_norm": 2.1494438531276105, "language_loss": 0.91651261, "learning_rate": 3.979491432885799e-06, "loss": 0.93950379, "num_input_tokens_seen": 13137550, "step": 618, "time_per_iteration": 2.7159059047698975 }, { "auxiliary_loss_clip": 0.01241607, "auxiliary_loss_mlp": 0.01053209, "balance_loss_clip": 1.07150698, "balance_loss_mlp": 1.03176653, "epoch": 0.0744303493055973, "flos": 20957575374720.0, "grad_norm": 3.5091260891744174, "language_loss": 0.82893115, "learning_rate": 3.97938001402264e-06, "loss": 0.85187924, "num_input_tokens_seen": 13156675, "step": 619, "time_per_iteration": 2.8414578437805176 }, { "auxiliary_loss_clip": 0.01254624, "auxiliary_loss_mlp": 0.01052851, "balance_loss_clip": 1.0764823, "balance_loss_mlp": 1.04113269, "epoch": 0.0745505921962364, "flos": 16253134272000.0, "grad_norm": 3.219967427909245, "language_loss": 0.79538393, "learning_rate": 3.979268294889105e-06, "loss": 0.81845874, "num_input_tokens_seen": 13172225, "step": 620, "time_per_iteration": 3.8092048168182373 }, { "auxiliary_loss_clip": 0.01273546, "auxiliary_loss_mlp": 0.0104739, "balance_loss_clip": 1.07447386, "balance_loss_mlp": 1.0345397, "epoch": 0.07467083508687548, "flos": 50944635550080.0, "grad_norm": 1.9337327318858708, "language_loss": 0.73850632, "learning_rate": 3.979156275502143e-06, "loss": 0.76171565, "num_input_tokens_seen": 13195885, "step": 621, "time_per_iteration": 3.006582498550415 }, { "auxiliary_loss_clip": 0.01260217, "auxiliary_loss_mlp": 0.01043085, "balance_loss_clip": 1.07472014, "balance_loss_mlp": 1.03038311, "epoch": 0.07479107797751458, "flos": 17529686697600.0, "grad_norm": 2.119051949208142, "language_loss": 0.91793931, "learning_rate": 3.979043955878749e-06, "loss": 0.94097233, "num_input_tokens_seen": 13213730, "step": 622, "time_per_iteration": 2.8909223079681396 }, { "auxiliary_loss_clip": 0.01259956, "auxiliary_loss_mlp": 0.01040371, "balance_loss_clip": 1.07429099, "balance_loss_mlp": 1.02850938, "epoch": 0.07491132086815366, "flos": 23473943591040.0, "grad_norm": 2.2264969753618247, "language_loss": 0.83137602, "learning_rate": 3.978931336035959e-06, "loss": 0.8543793, "num_input_tokens_seen": 13232540, "step": 623, "time_per_iteration": 2.7771620750427246 }, { "auxiliary_loss_clip": 0.01269487, "auxiliary_loss_mlp": 0.01038807, "balance_loss_clip": 1.07599688, "balance_loss_mlp": 1.02627826, "epoch": 0.07503156375879276, "flos": 20157557708160.0, "grad_norm": 2.122631434521804, "language_loss": 0.82134938, "learning_rate": 3.9788184159908595e-06, "loss": 0.84443229, "num_input_tokens_seen": 13249670, "step": 624, "time_per_iteration": 2.81837797164917 }, { "auxiliary_loss_clip": 0.01253684, "auxiliary_loss_mlp": 0.01046555, "balance_loss_clip": 1.0723455, "balance_loss_mlp": 1.03525448, "epoch": 0.07515180664943186, "flos": 15115519653120.0, "grad_norm": 4.472797749411679, "language_loss": 0.82323045, "learning_rate": 3.97870519576058e-06, "loss": 0.84623289, "num_input_tokens_seen": 13266095, "step": 625, "time_per_iteration": 2.8049449920654297 }, { "auxiliary_loss_clip": 0.01257253, "auxiliary_loss_mlp": 0.01054304, "balance_loss_clip": 1.07460105, "balance_loss_mlp": 1.03302813, "epoch": 0.07527204954007094, "flos": 21287702298240.0, "grad_norm": 2.312489892875157, "language_loss": 0.81244552, "learning_rate": 3.978591675362295e-06, "loss": 0.8355611, "num_input_tokens_seen": 13284810, "step": 626, "time_per_iteration": 3.025897264480591 }, { "auxiliary_loss_clip": 0.01250006, "auxiliary_loss_mlp": 0.01045465, "balance_loss_clip": 1.07553911, "balance_loss_mlp": 1.03330612, "epoch": 0.07539229243071004, "flos": 21324187537920.0, "grad_norm": 1.792871800559864, "language_loss": 0.87859941, "learning_rate": 3.978477854813226e-06, "loss": 0.90155411, "num_input_tokens_seen": 13304150, "step": 627, "time_per_iteration": 2.9261186122894287 }, { "auxiliary_loss_clip": 0.01271416, "auxiliary_loss_mlp": 0.01038851, "balance_loss_clip": 1.07595778, "balance_loss_mlp": 1.02722812, "epoch": 0.07551253532134912, "flos": 13042540920960.0, "grad_norm": 2.9719928909773996, "language_loss": 0.82062453, "learning_rate": 3.97836373413064e-06, "loss": 0.84372723, "num_input_tokens_seen": 13322205, "step": 628, "time_per_iteration": 2.7890589237213135 }, { "auxiliary_loss_clip": 0.01269109, "auxiliary_loss_mlp": 0.01038218, "balance_loss_clip": 1.0722611, "balance_loss_mlp": 1.02658916, "epoch": 0.07563277821198822, "flos": 19208761908480.0, "grad_norm": 2.116151301278675, "language_loss": 0.74634337, "learning_rate": 3.978249313331848e-06, "loss": 0.76941669, "num_input_tokens_seen": 13340435, "step": 629, "time_per_iteration": 2.7663495540618896 }, { "auxiliary_loss_clip": 0.01275427, "auxiliary_loss_mlp": 0.01053672, "balance_loss_clip": 1.07488132, "balance_loss_mlp": 1.0326004, "epoch": 0.07575302110262731, "flos": 19537200892800.0, "grad_norm": 9.302053941668593, "language_loss": 0.61838609, "learning_rate": 3.978134592434208e-06, "loss": 0.64167708, "num_input_tokens_seen": 13358185, "step": 630, "time_per_iteration": 2.7395989894866943 }, { "auxiliary_loss_clip": 0.0113905, "auxiliary_loss_mlp": 0.01003689, "balance_loss_clip": 1.03109765, "balance_loss_mlp": 1.00004148, "epoch": 0.0758732639932664, "flos": 67961808017280.0, "grad_norm": 1.016397417093734, "language_loss": 0.5942601, "learning_rate": 3.978019571455123e-06, "loss": 0.61568749, "num_input_tokens_seen": 13410130, "step": 631, "time_per_iteration": 3.3230319023132324 }, { "auxiliary_loss_clip": 0.01268844, "auxiliary_loss_mlp": 0.01040301, "balance_loss_clip": 1.07498097, "balance_loss_mlp": 1.02837467, "epoch": 0.07599350688390549, "flos": 18989204025600.0, "grad_norm": 2.3285161122613576, "language_loss": 0.84139907, "learning_rate": 3.977904250412042e-06, "loss": 0.86449051, "num_input_tokens_seen": 13429085, "step": 632, "time_per_iteration": 2.7239983081817627 }, { "auxiliary_loss_clip": 0.01269595, "auxiliary_loss_mlp": 0.01043106, "balance_loss_clip": 1.0770731, "balance_loss_mlp": 1.03156066, "epoch": 0.07611374977454458, "flos": 21069006341760.0, "grad_norm": 2.3076943783741237, "language_loss": 0.86010563, "learning_rate": 3.97778862932246e-06, "loss": 0.88323265, "num_input_tokens_seen": 13446250, "step": 633, "time_per_iteration": 2.704141139984131 }, { "auxiliary_loss_clip": 0.01225058, "auxiliary_loss_mlp": 0.01055656, "balance_loss_clip": 1.07167757, "balance_loss_mlp": 1.04365146, "epoch": 0.07623399266518367, "flos": 18514536773760.0, "grad_norm": 2.2863514962675686, "language_loss": 0.94265991, "learning_rate": 3.9776727082039144e-06, "loss": 0.9654671, "num_input_tokens_seen": 13463220, "step": 634, "time_per_iteration": 2.8308985233306885 }, { "auxiliary_loss_clip": 0.01157397, "auxiliary_loss_mlp": 0.01004678, "balance_loss_clip": 1.03020692, "balance_loss_mlp": 1.000458, "epoch": 0.07635423555582276, "flos": 44663036077440.0, "grad_norm": 0.808523194941654, "language_loss": 0.55467772, "learning_rate": 3.977556487073991e-06, "loss": 0.57629848, "num_input_tokens_seen": 13517775, "step": 635, "time_per_iteration": 3.1603293418884277 }, { "auxiliary_loss_clip": 0.01251942, "auxiliary_loss_mlp": 0.01046256, "balance_loss_clip": 1.07015896, "balance_loss_mlp": 1.03479981, "epoch": 0.07647447844646185, "flos": 21761148487680.0, "grad_norm": 1.795572155832439, "language_loss": 0.81551147, "learning_rate": 3.97743996595032e-06, "loss": 0.83849347, "num_input_tokens_seen": 13537815, "step": 636, "time_per_iteration": 2.812304735183716 }, { "auxiliary_loss_clip": 0.01275683, "auxiliary_loss_mlp": 0.01040747, "balance_loss_clip": 1.07641339, "balance_loss_mlp": 1.02894533, "epoch": 0.07659472133710095, "flos": 23806799948160.0, "grad_norm": 1.6061975819063086, "language_loss": 0.81796396, "learning_rate": 3.9773231448505804e-06, "loss": 0.84112823, "num_input_tokens_seen": 13559605, "step": 637, "time_per_iteration": 2.8902666568756104 }, { "auxiliary_loss_clip": 0.01260558, "auxiliary_loss_mlp": 0.0104782, "balance_loss_clip": 1.07394838, "balance_loss_mlp": 1.02626383, "epoch": 0.07671496422774003, "flos": 21469984842240.0, "grad_norm": 2.1062417077777376, "language_loss": 0.78383225, "learning_rate": 3.977206023792491e-06, "loss": 0.806916, "num_input_tokens_seen": 13579495, "step": 638, "time_per_iteration": 2.9271740913391113 }, { "auxiliary_loss_clip": 0.01264625, "auxiliary_loss_mlp": 0.01044054, "balance_loss_clip": 1.07369494, "balance_loss_mlp": 1.03240168, "epoch": 0.07683520711837913, "flos": 16980971558400.0, "grad_norm": 2.3060622105391677, "language_loss": 0.81251931, "learning_rate": 3.97708860279382e-06, "loss": 0.83560616, "num_input_tokens_seen": 13597605, "step": 639, "time_per_iteration": 2.958268642425537 }, { "auxiliary_loss_clip": 0.0125066, "auxiliary_loss_mlp": 0.01051633, "balance_loss_clip": 1.07467055, "balance_loss_mlp": 1.039819, "epoch": 0.07695545000901821, "flos": 23476744851840.0, "grad_norm": 1.7517180382476403, "language_loss": 0.78111291, "learning_rate": 3.97697088187238e-06, "loss": 0.8041358, "num_input_tokens_seen": 13618120, "step": 640, "time_per_iteration": 2.731362819671631 }, { "auxiliary_loss_clip": 0.01258557, "auxiliary_loss_mlp": 0.0103914, "balance_loss_clip": 1.07466388, "balance_loss_mlp": 1.02706361, "epoch": 0.07707569289965731, "flos": 17634258167040.0, "grad_norm": 2.4568868426019015, "language_loss": 0.92093205, "learning_rate": 3.976852861046029e-06, "loss": 0.94390899, "num_input_tokens_seen": 13634735, "step": 641, "time_per_iteration": 2.8646700382232666 }, { "auxiliary_loss_clip": 0.01247296, "auxiliary_loss_mlp": 0.01043033, "balance_loss_clip": 1.07614124, "balance_loss_mlp": 1.03169036, "epoch": 0.0771959357902964, "flos": 25775674087680.0, "grad_norm": 1.7180231002820483, "language_loss": 0.79900497, "learning_rate": 3.97673454033267e-06, "loss": 0.82190824, "num_input_tokens_seen": 13656835, "step": 642, "time_per_iteration": 3.85421085357666 }, { "auxiliary_loss_clip": 0.01259643, "auxiliary_loss_mlp": 0.01047903, "balance_loss_clip": 1.07045543, "balance_loss_mlp": 1.03573203, "epoch": 0.07731617868093549, "flos": 19828651847040.0, "grad_norm": 1.9819454957955527, "language_loss": 0.82544786, "learning_rate": 3.976615919750254e-06, "loss": 0.84852326, "num_input_tokens_seen": 13674535, "step": 643, "time_per_iteration": 2.80596661567688 }, { "auxiliary_loss_clip": 0.01262334, "auxiliary_loss_mlp": 0.01048606, "balance_loss_clip": 1.07198966, "balance_loss_mlp": 1.03615487, "epoch": 0.07743642157157458, "flos": 21324654414720.0, "grad_norm": 1.8328941041060212, "language_loss": 0.86862588, "learning_rate": 3.976496999316775e-06, "loss": 0.89173532, "num_input_tokens_seen": 13693290, "step": 644, "time_per_iteration": 3.898660182952881 }, { "auxiliary_loss_clip": 0.0126339, "auxiliary_loss_mlp": 0.01045489, "balance_loss_clip": 1.07779813, "balance_loss_mlp": 1.03304327, "epoch": 0.07755666446221367, "flos": 19969133938560.0, "grad_norm": 2.0024086190320634, "language_loss": 0.8457191, "learning_rate": 3.976377779050271e-06, "loss": 0.86880791, "num_input_tokens_seen": 13711420, "step": 645, "time_per_iteration": 2.803046464920044 }, { "auxiliary_loss_clip": 0.0125801, "auxiliary_loss_mlp": 0.01045274, "balance_loss_clip": 1.07156289, "balance_loss_mlp": 1.03269172, "epoch": 0.07767690735285276, "flos": 23623224514560.0, "grad_norm": 2.1370005607571834, "language_loss": 0.84496188, "learning_rate": 3.976258258968831e-06, "loss": 0.86799473, "num_input_tokens_seen": 13729965, "step": 646, "time_per_iteration": 4.728540897369385 }, { "auxiliary_loss_clip": 0.01252273, "auxiliary_loss_mlp": 0.01052746, "balance_loss_clip": 1.07159829, "balance_loss_mlp": 1.04084313, "epoch": 0.07779715024349185, "flos": 22236246702720.0, "grad_norm": 2.917952394035274, "language_loss": 0.74539912, "learning_rate": 3.976138439090583e-06, "loss": 0.76844931, "num_input_tokens_seen": 13748045, "step": 647, "time_per_iteration": 2.8280773162841797 }, { "auxiliary_loss_clip": 0.01255559, "auxiliary_loss_mlp": 0.01049062, "balance_loss_clip": 1.07445025, "balance_loss_mlp": 1.03728378, "epoch": 0.07791739313413094, "flos": 20955097336320.0, "grad_norm": 2.061664316823676, "language_loss": 0.85048783, "learning_rate": 3.976018319433706e-06, "loss": 0.87353408, "num_input_tokens_seen": 13765590, "step": 648, "time_per_iteration": 2.8513662815093994 }, { "auxiliary_loss_clip": 0.01264782, "auxiliary_loss_mlp": 0.01049178, "balance_loss_clip": 1.0727905, "balance_loss_mlp": 1.03631592, "epoch": 0.07803763602477004, "flos": 19312327797120.0, "grad_norm": 3.0518734236318426, "language_loss": 0.91456413, "learning_rate": 3.9758979000164205e-06, "loss": 0.93770379, "num_input_tokens_seen": 13782410, "step": 649, "time_per_iteration": 2.8141674995422363 }, { "auxiliary_loss_clip": 0.01264264, "auxiliary_loss_mlp": 0.01045463, "balance_loss_clip": 1.07673979, "balance_loss_mlp": 1.03347063, "epoch": 0.07815787891540912, "flos": 22710806213760.0, "grad_norm": 1.6954189203974415, "language_loss": 0.72259098, "learning_rate": 3.975777180856995e-06, "loss": 0.74568826, "num_input_tokens_seen": 13801530, "step": 650, "time_per_iteration": 2.883298397064209 }, { "auxiliary_loss_clip": 0.01272782, "auxiliary_loss_mlp": 0.01039206, "balance_loss_clip": 1.07190347, "balance_loss_mlp": 1.02705312, "epoch": 0.07827812180604822, "flos": 22711129436160.0, "grad_norm": 2.800459443118613, "language_loss": 0.86369812, "learning_rate": 3.975656161973742e-06, "loss": 0.88681805, "num_input_tokens_seen": 13820615, "step": 651, "time_per_iteration": 2.73734450340271 }, { "auxiliary_loss_clip": 0.01269309, "auxiliary_loss_mlp": 0.01044791, "balance_loss_clip": 1.06929469, "balance_loss_mlp": 1.0316596, "epoch": 0.0783983646966873, "flos": 21725597001600.0, "grad_norm": 2.403820342132189, "language_loss": 0.89059824, "learning_rate": 3.9755348433850194e-06, "loss": 0.9137392, "num_input_tokens_seen": 13835955, "step": 652, "time_per_iteration": 2.782066822052002 }, { "auxiliary_loss_clip": 0.01145496, "auxiliary_loss_mlp": 0.01004309, "balance_loss_clip": 1.02788568, "balance_loss_mlp": 1.00058985, "epoch": 0.0785186075873264, "flos": 60640877537280.0, "grad_norm": 0.9584167957638554, "language_loss": 0.63627386, "learning_rate": 3.975413225109232e-06, "loss": 0.65777183, "num_input_tokens_seen": 13896505, "step": 653, "time_per_iteration": 3.3538401126861572 }, { "auxiliary_loss_clip": 0.01264948, "auxiliary_loss_mlp": 0.01046965, "balance_loss_clip": 1.07165694, "balance_loss_mlp": 1.03406107, "epoch": 0.0786388504779655, "flos": 23877902920320.0, "grad_norm": 3.02728010713785, "language_loss": 0.93992937, "learning_rate": 3.975291307164829e-06, "loss": 0.96304846, "num_input_tokens_seen": 13915150, "step": 654, "time_per_iteration": 2.8497090339660645 }, { "auxiliary_loss_clip": 0.01244963, "auxiliary_loss_mlp": 0.01034112, "balance_loss_clip": 1.07086015, "balance_loss_mlp": 1.02233446, "epoch": 0.07875909336860458, "flos": 15158684822400.0, "grad_norm": 2.023558004514912, "language_loss": 0.85384524, "learning_rate": 3.975169089570306e-06, "loss": 0.87663603, "num_input_tokens_seen": 13933525, "step": 655, "time_per_iteration": 2.7856600284576416 }, { "auxiliary_loss_clip": 0.01255997, "auxiliary_loss_mlp": 0.01046514, "balance_loss_clip": 1.07584739, "balance_loss_mlp": 1.03362715, "epoch": 0.07887933625924368, "flos": 22236857233920.0, "grad_norm": 1.9806373331501836, "language_loss": 0.91365069, "learning_rate": 3.975046572344202e-06, "loss": 0.93667579, "num_input_tokens_seen": 13949985, "step": 656, "time_per_iteration": 2.679096221923828 }, { "auxiliary_loss_clip": 0.01251294, "auxiliary_loss_mlp": 0.01041279, "balance_loss_clip": 1.07031298, "balance_loss_mlp": 1.02951944, "epoch": 0.07899957914988276, "flos": 20777734955520.0, "grad_norm": 1.855389110237717, "language_loss": 0.71237481, "learning_rate": 3.974923755505103e-06, "loss": 0.7353006, "num_input_tokens_seen": 13969215, "step": 657, "time_per_iteration": 2.7681450843811035 }, { "auxiliary_loss_clip": 0.01242625, "auxiliary_loss_mlp": 0.01047228, "balance_loss_clip": 1.07140541, "balance_loss_mlp": 1.03531337, "epoch": 0.07911982204052186, "flos": 23003047267200.0, "grad_norm": 1.724556623699832, "language_loss": 0.9111259, "learning_rate": 3.974800639071641e-06, "loss": 0.93402445, "num_input_tokens_seen": 13989935, "step": 658, "time_per_iteration": 2.8090462684631348 }, { "auxiliary_loss_clip": 0.01231954, "auxiliary_loss_mlp": 0.0105248, "balance_loss_clip": 1.06994057, "balance_loss_mlp": 1.02937007, "epoch": 0.07924006493116094, "flos": 23111389664640.0, "grad_norm": 8.37580146303507, "language_loss": 1.0072788, "learning_rate": 3.974677223062492e-06, "loss": 1.03012311, "num_input_tokens_seen": 14007150, "step": 659, "time_per_iteration": 2.8720667362213135 }, { "auxiliary_loss_clip": 0.01259292, "auxiliary_loss_mlp": 0.01048315, "balance_loss_clip": 1.07352543, "balance_loss_mlp": 1.03644133, "epoch": 0.07936030782180004, "flos": 16472153450880.0, "grad_norm": 2.2853862915041607, "language_loss": 0.74398959, "learning_rate": 3.974553507496378e-06, "loss": 0.76706564, "num_input_tokens_seen": 14025725, "step": 660, "time_per_iteration": 2.7730178833007812 }, { "auxiliary_loss_clip": 0.012533, "auxiliary_loss_mlp": 0.01043792, "balance_loss_clip": 1.07358027, "balance_loss_mlp": 1.03090537, "epoch": 0.07948055071243913, "flos": 23733290764800.0, "grad_norm": 2.495716372846961, "language_loss": 0.8908633, "learning_rate": 3.974429492392068e-06, "loss": 0.91383421, "num_input_tokens_seen": 14045750, "step": 661, "time_per_iteration": 2.7671213150024414 }, { "auxiliary_loss_clip": 0.01265244, "auxiliary_loss_mlp": 0.01051108, "balance_loss_clip": 1.0697211, "balance_loss_mlp": 1.02889609, "epoch": 0.07960079360307822, "flos": 19573326996480.0, "grad_norm": 2.170675428543855, "language_loss": 0.91325063, "learning_rate": 3.974305177768373e-06, "loss": 0.93641412, "num_input_tokens_seen": 14063960, "step": 662, "time_per_iteration": 2.7227225303649902 }, { "auxiliary_loss_clip": 0.01245513, "auxiliary_loss_mlp": 0.01047538, "balance_loss_clip": 1.07447696, "balance_loss_mlp": 1.03565323, "epoch": 0.07972103649371731, "flos": 23513409659520.0, "grad_norm": 2.279469886508457, "language_loss": 0.86710602, "learning_rate": 3.974180563644152e-06, "loss": 0.89003658, "num_input_tokens_seen": 14082525, "step": 663, "time_per_iteration": 2.8027524948120117 }, { "auxiliary_loss_clip": 0.01267186, "auxiliary_loss_mlp": 0.01049369, "balance_loss_clip": 1.07555723, "balance_loss_mlp": 1.03588629, "epoch": 0.0798412793843564, "flos": 16726867770240.0, "grad_norm": 2.160494481959668, "language_loss": 0.89531338, "learning_rate": 3.97405565003831e-06, "loss": 0.91847897, "num_input_tokens_seen": 14098610, "step": 664, "time_per_iteration": 2.7676334381103516 }, { "auxiliary_loss_clip": 0.012605, "auxiliary_loss_mlp": 0.0105397, "balance_loss_clip": 1.0752182, "balance_loss_mlp": 1.04123223, "epoch": 0.07996152227499549, "flos": 18223337214720.0, "grad_norm": 2.7559594919952235, "language_loss": 0.78454721, "learning_rate": 3.973930436969794e-06, "loss": 0.80769193, "num_input_tokens_seen": 14117065, "step": 665, "time_per_iteration": 2.7635202407836914 }, { "auxiliary_loss_clip": 0.01255954, "auxiliary_loss_mlp": 0.01056146, "balance_loss_clip": 1.07457948, "balance_loss_mlp": 1.04400992, "epoch": 0.08008176516563459, "flos": 20594877793920.0, "grad_norm": 1.8539684264664158, "language_loss": 0.85679078, "learning_rate": 3.973804924457602e-06, "loss": 0.87991178, "num_input_tokens_seen": 14135145, "step": 666, "time_per_iteration": 2.7381629943847656 }, { "auxiliary_loss_clip": 0.01256284, "auxiliary_loss_mlp": 0.01041321, "balance_loss_clip": 1.07396173, "balance_loss_mlp": 1.02924514, "epoch": 0.08020200805627367, "flos": 31834306863360.0, "grad_norm": 1.7915281181125777, "language_loss": 0.85768801, "learning_rate": 3.973679112520771e-06, "loss": 0.88066399, "num_input_tokens_seen": 14156860, "step": 667, "time_per_iteration": 2.816403388977051 }, { "auxiliary_loss_clip": 0.01246407, "auxiliary_loss_mlp": 0.01046511, "balance_loss_clip": 1.0708952, "balance_loss_mlp": 1.03478718, "epoch": 0.08032225094691277, "flos": 17783503176960.0, "grad_norm": 1.8677883969713447, "language_loss": 0.99062788, "learning_rate": 3.973553001178389e-06, "loss": 1.01355696, "num_input_tokens_seen": 14174365, "step": 668, "time_per_iteration": 3.643951892852783 }, { "auxiliary_loss_clip": 0.01250528, "auxiliary_loss_mlp": 0.01044289, "balance_loss_clip": 1.07290709, "balance_loss_mlp": 1.03190327, "epoch": 0.08044249383755185, "flos": 24061693835520.0, "grad_norm": 1.994578821370718, "language_loss": 0.75506371, "learning_rate": 3.973426590449585e-06, "loss": 0.77801192, "num_input_tokens_seen": 14192320, "step": 669, "time_per_iteration": 2.900851011276245 }, { "auxiliary_loss_clip": 0.01251026, "auxiliary_loss_mlp": 0.01056906, "balance_loss_clip": 1.07425642, "balance_loss_mlp": 1.04422855, "epoch": 0.08056273672819095, "flos": 18223624523520.0, "grad_norm": 2.0166377533478594, "language_loss": 0.75569797, "learning_rate": 3.9732998803535364e-06, "loss": 0.77877724, "num_input_tokens_seen": 14210380, "step": 670, "time_per_iteration": 3.877272129058838 }, { "auxiliary_loss_clip": 0.01268775, "auxiliary_loss_mlp": 0.01048995, "balance_loss_clip": 1.07199645, "balance_loss_mlp": 1.03618574, "epoch": 0.08068297961883003, "flos": 19676856971520.0, "grad_norm": 2.2523430997909886, "language_loss": 0.85597217, "learning_rate": 3.973172870909465e-06, "loss": 0.87914991, "num_input_tokens_seen": 14225145, "step": 671, "time_per_iteration": 2.6967031955718994 }, { "auxiliary_loss_clip": 0.01266462, "auxiliary_loss_mlp": 0.01036906, "balance_loss_clip": 1.07075846, "balance_loss_mlp": 1.02429962, "epoch": 0.08080322250946913, "flos": 23148736830720.0, "grad_norm": 2.7542832202279213, "language_loss": 0.80813664, "learning_rate": 3.973045562136638e-06, "loss": 0.83117032, "num_input_tokens_seen": 14241960, "step": 672, "time_per_iteration": 4.712906122207642 }, { "auxiliary_loss_clip": 0.01267384, "auxiliary_loss_mlp": 0.01051818, "balance_loss_clip": 1.0731802, "balance_loss_mlp": 1.04049933, "epoch": 0.08092346540010822, "flos": 21763626526080.0, "grad_norm": 3.1513534935845966, "language_loss": 0.9148789, "learning_rate": 3.972917954054368e-06, "loss": 0.93807095, "num_input_tokens_seen": 14260515, "step": 673, "time_per_iteration": 2.6988935470581055 }, { "auxiliary_loss_clip": 0.01258291, "auxiliary_loss_mlp": 0.01058664, "balance_loss_clip": 1.07890773, "balance_loss_mlp": 1.04473424, "epoch": 0.08104370829074731, "flos": 21032485188480.0, "grad_norm": 2.4175906906299085, "language_loss": 0.81885642, "learning_rate": 3.972790046682013e-06, "loss": 0.842026, "num_input_tokens_seen": 14279190, "step": 674, "time_per_iteration": 2.7559170722961426 }, { "auxiliary_loss_clip": 0.01260507, "auxiliary_loss_mlp": 0.01053577, "balance_loss_clip": 1.07141995, "balance_loss_mlp": 1.04079163, "epoch": 0.0811639511813864, "flos": 20083186598400.0, "grad_norm": 1.826381819935272, "language_loss": 0.79244673, "learning_rate": 3.972661840038977e-06, "loss": 0.81558758, "num_input_tokens_seen": 14299480, "step": 675, "time_per_iteration": 2.8357951641082764 }, { "auxiliary_loss_clip": 0.01266343, "auxiliary_loss_mlp": 0.01042661, "balance_loss_clip": 1.0733285, "balance_loss_mlp": 1.03048408, "epoch": 0.08128419407202549, "flos": 16836718538880.0, "grad_norm": 2.156569203792795, "language_loss": 0.83251238, "learning_rate": 3.972533334144707e-06, "loss": 0.85560244, "num_input_tokens_seen": 14316405, "step": 676, "time_per_iteration": 2.708883285522461 }, { "auxiliary_loss_clip": 0.01271413, "auxiliary_loss_mlp": 0.01049066, "balance_loss_clip": 1.07424116, "balance_loss_mlp": 1.03616166, "epoch": 0.08140443696266458, "flos": 23769273214080.0, "grad_norm": 10.646382598126538, "language_loss": 0.78765678, "learning_rate": 3.972404529018699e-06, "loss": 0.81086159, "num_input_tokens_seen": 14336265, "step": 677, "time_per_iteration": 2.9029345512390137 }, { "auxiliary_loss_clip": 0.01255474, "auxiliary_loss_mlp": 0.01049896, "balance_loss_clip": 1.0695467, "balance_loss_mlp": 1.03770673, "epoch": 0.08152467985330367, "flos": 24390132819840.0, "grad_norm": 1.9812939971563008, "language_loss": 0.85373455, "learning_rate": 3.972275424680493e-06, "loss": 0.87678826, "num_input_tokens_seen": 14356375, "step": 678, "time_per_iteration": 2.788938283920288 }, { "auxiliary_loss_clip": 0.01266224, "auxiliary_loss_mlp": 0.01034965, "balance_loss_clip": 1.07164574, "balance_loss_mlp": 1.02290726, "epoch": 0.08164492274394276, "flos": 19317750750720.0, "grad_norm": 2.0607054012352433, "language_loss": 0.91917992, "learning_rate": 3.972146021149673e-06, "loss": 0.94219184, "num_input_tokens_seen": 14374650, "step": 679, "time_per_iteration": 2.7009997367858887 }, { "auxiliary_loss_clip": 0.01254683, "auxiliary_loss_mlp": 0.01048841, "balance_loss_clip": 1.07195246, "balance_loss_mlp": 1.03631258, "epoch": 0.08176516563458186, "flos": 14830461319680.0, "grad_norm": 2.3807236020080547, "language_loss": 0.78656822, "learning_rate": 3.972016318445868e-06, "loss": 0.80960345, "num_input_tokens_seen": 14392650, "step": 680, "time_per_iteration": 2.8522770404815674 }, { "auxiliary_loss_clip": 0.01263421, "auxiliary_loss_mlp": 0.01047131, "balance_loss_clip": 1.07087815, "balance_loss_mlp": 1.0347209, "epoch": 0.08188540852522094, "flos": 22602320161920.0, "grad_norm": 7.964211839714009, "language_loss": 0.92443514, "learning_rate": 3.971886316588757e-06, "loss": 0.94754064, "num_input_tokens_seen": 14413155, "step": 681, "time_per_iteration": 2.856001853942871 }, { "auxiliary_loss_clip": 0.01247751, "auxiliary_loss_mlp": 0.01042715, "balance_loss_clip": 1.0733273, "balance_loss_mlp": 1.03008485, "epoch": 0.08200565141586004, "flos": 19463727623040.0, "grad_norm": 2.3203095655114927, "language_loss": 0.7305845, "learning_rate": 3.9717560155980595e-06, "loss": 0.75348914, "num_input_tokens_seen": 14428805, "step": 682, "time_per_iteration": 2.8266475200653076 }, { "auxiliary_loss_clip": 0.01260576, "auxiliary_loss_mlp": 0.01046097, "balance_loss_clip": 1.07056522, "balance_loss_mlp": 1.03248918, "epoch": 0.08212589430649912, "flos": 20594662312320.0, "grad_norm": 3.027001724967576, "language_loss": 0.92367214, "learning_rate": 3.971625415493542e-06, "loss": 0.94673884, "num_input_tokens_seen": 14447125, "step": 683, "time_per_iteration": 2.8625898361206055 }, { "auxiliary_loss_clip": 0.01253754, "auxiliary_loss_mlp": 0.01047595, "balance_loss_clip": 1.07205307, "balance_loss_mlp": 1.0339216, "epoch": 0.08224613719713822, "flos": 25953611086080.0, "grad_norm": 1.9830342426273833, "language_loss": 0.87707198, "learning_rate": 3.971494516295017e-06, "loss": 0.90008557, "num_input_tokens_seen": 14466575, "step": 684, "time_per_iteration": 2.9030861854553223 }, { "auxiliary_loss_clip": 0.01256621, "auxiliary_loss_mlp": 0.0104214, "balance_loss_clip": 1.07017565, "balance_loss_mlp": 1.03061843, "epoch": 0.08236638008777732, "flos": 23768734510080.0, "grad_norm": 2.1397818204412298, "language_loss": 0.8545869, "learning_rate": 3.971363318022341e-06, "loss": 0.87757444, "num_input_tokens_seen": 14487915, "step": 685, "time_per_iteration": 2.801025629043579 }, { "auxiliary_loss_clip": 0.01258715, "auxiliary_loss_mlp": 0.01050713, "balance_loss_clip": 1.06916225, "balance_loss_mlp": 1.03752208, "epoch": 0.0824866229784164, "flos": 38799144887040.0, "grad_norm": 4.0627554594135935, "language_loss": 0.68603075, "learning_rate": 3.971231820695417e-06, "loss": 0.70912504, "num_input_tokens_seen": 14511530, "step": 686, "time_per_iteration": 2.939852714538574 }, { "auxiliary_loss_clip": 0.01262733, "auxiliary_loss_mlp": 0.01038597, "balance_loss_clip": 1.07149935, "balance_loss_mlp": 1.02588892, "epoch": 0.0826068658690555, "flos": 23107762391040.0, "grad_norm": 2.1316527971770722, "language_loss": 0.81270754, "learning_rate": 3.971100024334193e-06, "loss": 0.83572078, "num_input_tokens_seen": 14529050, "step": 687, "time_per_iteration": 2.806622266769409 }, { "auxiliary_loss_clip": 0.01236578, "auxiliary_loss_mlp": 0.01038028, "balance_loss_clip": 1.0676496, "balance_loss_mlp": 1.02591014, "epoch": 0.08272710875969458, "flos": 21136374299520.0, "grad_norm": 2.121863627690248, "language_loss": 0.86358899, "learning_rate": 3.970967928958663e-06, "loss": 0.88633502, "num_input_tokens_seen": 14546165, "step": 688, "time_per_iteration": 2.815053701400757 }, { "auxiliary_loss_clip": 0.01253067, "auxiliary_loss_mlp": 0.01044335, "balance_loss_clip": 1.06993961, "balance_loss_mlp": 1.03213382, "epoch": 0.08284735165033368, "flos": 19063000517760.0, "grad_norm": 2.909966676972128, "language_loss": 0.8374846, "learning_rate": 3.970835534588865e-06, "loss": 0.86045861, "num_input_tokens_seen": 14563660, "step": 689, "time_per_iteration": 2.7996768951416016 }, { "auxiliary_loss_clip": 0.01267436, "auxiliary_loss_mlp": 0.01053985, "balance_loss_clip": 1.07877207, "balance_loss_mlp": 1.04239213, "epoch": 0.08296759454097276, "flos": 16727442387840.0, "grad_norm": 1.8433138191979817, "language_loss": 0.85872209, "learning_rate": 3.970702841244883e-06, "loss": 0.88193631, "num_input_tokens_seen": 14581980, "step": 690, "time_per_iteration": 2.8571386337280273 }, { "auxiliary_loss_clip": 0.01266066, "auxiliary_loss_mlp": 0.01046833, "balance_loss_clip": 1.07267523, "balance_loss_mlp": 1.03468001, "epoch": 0.08308783743161186, "flos": 18004928567040.0, "grad_norm": 1.8120515976787968, "language_loss": 0.82601076, "learning_rate": 3.970569848946847e-06, "loss": 0.84913981, "num_input_tokens_seen": 14601795, "step": 691, "time_per_iteration": 2.849839687347412 }, { "auxiliary_loss_clip": 0.01252955, "auxiliary_loss_mlp": 0.01044498, "balance_loss_clip": 1.07231784, "balance_loss_mlp": 1.03221965, "epoch": 0.08320808032225095, "flos": 15079788599040.0, "grad_norm": 2.527628811342929, "language_loss": 0.83510447, "learning_rate": 3.970436557714932e-06, "loss": 0.85807902, "num_input_tokens_seen": 14618315, "step": 692, "time_per_iteration": 2.731677293777466 }, { "auxiliary_loss_clip": 0.0125404, "auxiliary_loss_mlp": 0.01041463, "balance_loss_clip": 1.07015133, "balance_loss_mlp": 1.02840996, "epoch": 0.08332832321289003, "flos": 22383085501440.0, "grad_norm": 2.425202686144988, "language_loss": 0.86386687, "learning_rate": 3.970302967569358e-06, "loss": 0.88682187, "num_input_tokens_seen": 14636905, "step": 693, "time_per_iteration": 2.772919178009033 }, { "auxiliary_loss_clip": 0.01263701, "auxiliary_loss_mlp": 0.01044232, "balance_loss_clip": 1.07375741, "balance_loss_mlp": 1.03186393, "epoch": 0.08344856610352913, "flos": 24717386655360.0, "grad_norm": 1.9718575711070498, "language_loss": 0.68476009, "learning_rate": 3.9701690785303896e-06, "loss": 0.70783937, "num_input_tokens_seen": 14656100, "step": 694, "time_per_iteration": 3.672494649887085 }, { "auxiliary_loss_clip": 0.01269108, "auxiliary_loss_mlp": 0.01039079, "balance_loss_clip": 1.07252097, "balance_loss_mlp": 1.02677619, "epoch": 0.08356880899416821, "flos": 25370206387200.0, "grad_norm": 2.0395987452437017, "language_loss": 0.88169956, "learning_rate": 3.970034890618339e-06, "loss": 0.90478146, "num_input_tokens_seen": 14675790, "step": 695, "time_per_iteration": 3.7702252864837646 }, { "auxiliary_loss_clip": 0.01249308, "auxiliary_loss_mlp": 0.0105025, "balance_loss_clip": 1.06955719, "balance_loss_mlp": 1.03698826, "epoch": 0.08368905188480731, "flos": 24353072962560.0, "grad_norm": 2.0105757070130617, "language_loss": 0.8797825, "learning_rate": 3.969900403853562e-06, "loss": 0.90277809, "num_input_tokens_seen": 14694830, "step": 696, "time_per_iteration": 2.8297667503356934 }, { "auxiliary_loss_clip": 0.01268166, "auxiliary_loss_mlp": 0.01049982, "balance_loss_clip": 1.07269573, "balance_loss_mlp": 1.03766775, "epoch": 0.08380929477544641, "flos": 18037319656320.0, "grad_norm": 1.652025289695458, "language_loss": 0.7812916, "learning_rate": 3.96976561825646e-06, "loss": 0.8044731, "num_input_tokens_seen": 14711920, "step": 697, "time_per_iteration": 3.7602627277374268 }, { "auxiliary_loss_clip": 0.01255037, "auxiliary_loss_mlp": 0.01039392, "balance_loss_clip": 1.07376814, "balance_loss_mlp": 1.02664304, "epoch": 0.08392953766608549, "flos": 26286287875200.0, "grad_norm": 2.048420898419629, "language_loss": 0.8698467, "learning_rate": 3.969630533847479e-06, "loss": 0.89279103, "num_input_tokens_seen": 14730880, "step": 698, "time_per_iteration": 3.8874754905700684 }, { "auxiliary_loss_clip": 0.01260135, "auxiliary_loss_mlp": 0.0103596, "balance_loss_clip": 1.06870246, "balance_loss_mlp": 1.0247308, "epoch": 0.08404978055672459, "flos": 22492146170880.0, "grad_norm": 1.9384568856446955, "language_loss": 0.84174931, "learning_rate": 3.969495150647113e-06, "loss": 0.86471027, "num_input_tokens_seen": 14749050, "step": 699, "time_per_iteration": 2.8298144340515137 }, { "auxiliary_loss_clip": 0.01253662, "auxiliary_loss_mlp": 0.01038082, "balance_loss_clip": 1.0753634, "balance_loss_mlp": 1.02639961, "epoch": 0.08417002344736367, "flos": 24826878288000.0, "grad_norm": 2.237206463931409, "language_loss": 0.7660557, "learning_rate": 3.969359468675899e-06, "loss": 0.78897309, "num_input_tokens_seen": 14769180, "step": 700, "time_per_iteration": 2.86014461517334 }, { "auxiliary_loss_clip": 0.0125692, "auxiliary_loss_mlp": 0.01042507, "balance_loss_clip": 1.07132602, "balance_loss_mlp": 1.03019238, "epoch": 0.08429026633800277, "flos": 16945922862720.0, "grad_norm": 2.0777131682035788, "language_loss": 0.89686537, "learning_rate": 3.969223487954418e-06, "loss": 0.91985965, "num_input_tokens_seen": 14786640, "step": 701, "time_per_iteration": 2.799211263656616 }, { "auxiliary_loss_clip": 0.01248012, "auxiliary_loss_mlp": 0.01045104, "balance_loss_clip": 1.07419455, "balance_loss_mlp": 1.03327894, "epoch": 0.08441050922864185, "flos": 23841920471040.0, "grad_norm": 2.1421565591296456, "language_loss": 0.8267715, "learning_rate": 3.969087208503301e-06, "loss": 0.84970272, "num_input_tokens_seen": 14806720, "step": 702, "time_per_iteration": 2.85073184967041 }, { "auxiliary_loss_clip": 0.01240719, "auxiliary_loss_mlp": 0.01043435, "balance_loss_clip": 1.07437658, "balance_loss_mlp": 1.03072131, "epoch": 0.08453075211928095, "flos": 25520205582720.0, "grad_norm": 2.4761204679491517, "language_loss": 0.8479659, "learning_rate": 3.968950630343219e-06, "loss": 0.87080741, "num_input_tokens_seen": 14823705, "step": 703, "time_per_iteration": 2.7968196868896484 }, { "auxiliary_loss_clip": 0.01255761, "auxiliary_loss_mlp": 0.01048847, "balance_loss_clip": 1.07078481, "balance_loss_mlp": 1.03565025, "epoch": 0.08465099500992004, "flos": 19532496211200.0, "grad_norm": 2.2734134221459246, "language_loss": 0.93679869, "learning_rate": 3.968813753494892e-06, "loss": 0.95984477, "num_input_tokens_seen": 14841865, "step": 704, "time_per_iteration": 2.7775068283081055 }, { "auxiliary_loss_clip": 0.01235403, "auxiliary_loss_mlp": 0.0105938, "balance_loss_clip": 1.06670141, "balance_loss_mlp": 1.03678036, "epoch": 0.08477123790055913, "flos": 29351299403520.0, "grad_norm": 2.4856503993313908, "language_loss": 0.75815904, "learning_rate": 3.968676577979084e-06, "loss": 0.78110695, "num_input_tokens_seen": 14861415, "step": 705, "time_per_iteration": 2.792025089263916 }, { "auxiliary_loss_clip": 0.01246078, "auxiliary_loss_mlp": 0.01047384, "balance_loss_clip": 1.0726546, "balance_loss_mlp": 1.0358212, "epoch": 0.08489148079119822, "flos": 18624495283200.0, "grad_norm": 4.12364195592962, "language_loss": 0.78542846, "learning_rate": 3.968539103816605e-06, "loss": 0.80836314, "num_input_tokens_seen": 14879215, "step": 706, "time_per_iteration": 2.809025764465332 }, { "auxiliary_loss_clip": 0.01253647, "auxiliary_loss_mlp": 0.01056492, "balance_loss_clip": 1.07367373, "balance_loss_mlp": 1.03467321, "epoch": 0.0850117236818373, "flos": 23471393725440.0, "grad_norm": 1.834437627537439, "language_loss": 0.89528155, "learning_rate": 3.9684013310283085e-06, "loss": 0.91838288, "num_input_tokens_seen": 14897900, "step": 707, "time_per_iteration": 2.7920925617218018 }, { "auxiliary_loss_clip": 0.0125589, "auxiliary_loss_mlp": 0.01047811, "balance_loss_clip": 1.07406998, "balance_loss_mlp": 1.03639042, "epoch": 0.0851319665724764, "flos": 40625058896640.0, "grad_norm": 2.0742089036343554, "language_loss": 0.64197183, "learning_rate": 3.9682632596350956e-06, "loss": 0.66500884, "num_input_tokens_seen": 14919065, "step": 708, "time_per_iteration": 2.986259937286377 }, { "auxiliary_loss_clip": 0.01257106, "auxiliary_loss_mlp": 0.01031004, "balance_loss_clip": 1.07161582, "balance_loss_mlp": 1.0191133, "epoch": 0.0852522094631155, "flos": 15879554870400.0, "grad_norm": 1.8661836084688277, "language_loss": 0.78368169, "learning_rate": 3.968124889657911e-06, "loss": 0.80656278, "num_input_tokens_seen": 14934165, "step": 709, "time_per_iteration": 2.75335431098938 }, { "auxiliary_loss_clip": 0.01244986, "auxiliary_loss_mlp": 0.01041762, "balance_loss_clip": 1.0704, "balance_loss_mlp": 1.02912593, "epoch": 0.08537245235375458, "flos": 14567091822720.0, "grad_norm": 2.263011686023868, "language_loss": 0.90974867, "learning_rate": 3.967986221117746e-06, "loss": 0.93261611, "num_input_tokens_seen": 14950105, "step": 710, "time_per_iteration": 2.853106737136841 }, { "auxiliary_loss_clip": 0.01247014, "auxiliary_loss_mlp": 0.01040987, "balance_loss_clip": 1.07389116, "balance_loss_mlp": 1.02873278, "epoch": 0.08549269524439368, "flos": 26468929555200.0, "grad_norm": 2.3129825518832345, "language_loss": 0.86638927, "learning_rate": 3.967847254035635e-06, "loss": 0.88926929, "num_input_tokens_seen": 14969490, "step": 711, "time_per_iteration": 2.9830408096313477 }, { "auxiliary_loss_clip": 0.01256635, "auxiliary_loss_mlp": 0.01039754, "balance_loss_clip": 1.07141268, "balance_loss_mlp": 1.02709961, "epoch": 0.08561293813503276, "flos": 13590214565760.0, "grad_norm": 3.0590296483570394, "language_loss": 0.86705613, "learning_rate": 3.967707988432661e-06, "loss": 0.89002007, "num_input_tokens_seen": 14987195, "step": 712, "time_per_iteration": 2.824031114578247 }, { "auxiliary_loss_clip": 0.01264678, "auxiliary_loss_mlp": 0.01047219, "balance_loss_clip": 1.07032204, "balance_loss_mlp": 1.0354414, "epoch": 0.08573318102567186, "flos": 26943524979840.0, "grad_norm": 2.3361519852017274, "language_loss": 0.87910759, "learning_rate": 3.967568424329949e-06, "loss": 0.90222657, "num_input_tokens_seen": 15007620, "step": 713, "time_per_iteration": 2.8616185188293457 }, { "auxiliary_loss_clip": 0.0117529, "auxiliary_loss_mlp": 0.01024211, "balance_loss_clip": 1.05060983, "balance_loss_mlp": 1.02068222, "epoch": 0.08585342391631094, "flos": 67302739319040.0, "grad_norm": 0.8314667605980162, "language_loss": 0.55529362, "learning_rate": 3.967428561748671e-06, "loss": 0.57728863, "num_input_tokens_seen": 15075590, "step": 714, "time_per_iteration": 3.4515540599823 }, { "auxiliary_loss_clip": 0.01242538, "auxiliary_loss_mlp": 0.01043607, "balance_loss_clip": 1.07218027, "balance_loss_mlp": 1.03107786, "epoch": 0.08597366680695004, "flos": 22456594684800.0, "grad_norm": 2.3796174154813317, "language_loss": 0.87325066, "learning_rate": 3.967288400710045e-06, "loss": 0.89611208, "num_input_tokens_seen": 15095055, "step": 715, "time_per_iteration": 2.927586793899536 }, { "auxiliary_loss_clip": 0.01249254, "auxiliary_loss_mlp": 0.0104629, "balance_loss_clip": 1.0754112, "balance_loss_mlp": 1.03379107, "epoch": 0.08609390969758914, "flos": 23550505430400.0, "grad_norm": 2.025436805947306, "language_loss": 0.88686448, "learning_rate": 3.9671479412353335e-06, "loss": 0.9098199, "num_input_tokens_seen": 15113520, "step": 716, "time_per_iteration": 2.8762407302856445 }, { "auxiliary_loss_clip": 0.01261715, "auxiliary_loss_mlp": 0.01045371, "balance_loss_clip": 1.07322443, "balance_loss_mlp": 1.03401077, "epoch": 0.08621415258822822, "flos": 25885848078720.0, "grad_norm": 2.4631901571116006, "language_loss": 0.74341363, "learning_rate": 3.967007183345843e-06, "loss": 0.7664845, "num_input_tokens_seen": 15133375, "step": 717, "time_per_iteration": 2.832123279571533 }, { "auxiliary_loss_clip": 0.01258, "auxiliary_loss_mlp": 0.01046434, "balance_loss_clip": 1.07220101, "balance_loss_mlp": 1.03481698, "epoch": 0.08633439547886732, "flos": 13589568120960.0, "grad_norm": 2.896615482116121, "language_loss": 0.89535844, "learning_rate": 3.966866127062927e-06, "loss": 0.91840279, "num_input_tokens_seen": 15150500, "step": 718, "time_per_iteration": 2.877380132675171 }, { "auxiliary_loss_clip": 0.01168904, "auxiliary_loss_mlp": 0.01004197, "balance_loss_clip": 1.04707611, "balance_loss_mlp": 1.00090659, "epoch": 0.0864546383695064, "flos": 57767342434560.0, "grad_norm": 0.8809518004906117, "language_loss": 0.62665749, "learning_rate": 3.966724772407982e-06, "loss": 0.6483885, "num_input_tokens_seen": 15208015, "step": 719, "time_per_iteration": 3.1189699172973633 }, { "auxiliary_loss_clip": 0.01240575, "auxiliary_loss_mlp": 0.01057176, "balance_loss_clip": 1.06965733, "balance_loss_mlp": 1.04502881, "epoch": 0.0865748812601455, "flos": 20046952753920.0, "grad_norm": 1.9678450915525165, "language_loss": 0.88878608, "learning_rate": 3.966583119402454e-06, "loss": 0.91176355, "num_input_tokens_seen": 15224780, "step": 720, "time_per_iteration": 3.6711862087249756 }, { "auxiliary_loss_clip": 0.01254894, "auxiliary_loss_mlp": 0.01055609, "balance_loss_clip": 1.06894863, "balance_loss_mlp": 1.03330898, "epoch": 0.08669512415078459, "flos": 35262446935680.0, "grad_norm": 1.5892927468455573, "language_loss": 0.82262355, "learning_rate": 3.9664411680678305e-06, "loss": 0.84572858, "num_input_tokens_seen": 15246535, "step": 721, "time_per_iteration": 3.8554646968841553 }, { "auxiliary_loss_clip": 0.01156892, "auxiliary_loss_mlp": 0.01007358, "balance_loss_clip": 1.04317033, "balance_loss_mlp": 1.00423479, "epoch": 0.08681536704142367, "flos": 65654870048640.0, "grad_norm": 0.8458577845260227, "language_loss": 0.61481386, "learning_rate": 3.966298918425644e-06, "loss": 0.63645631, "num_input_tokens_seen": 15304025, "step": 722, "time_per_iteration": 3.21661639213562 }, { "auxiliary_loss_clip": 0.01263168, "auxiliary_loss_mlp": 0.01056344, "balance_loss_clip": 1.07155085, "balance_loss_mlp": 1.04383254, "epoch": 0.08693560993206277, "flos": 34529940881280.0, "grad_norm": 2.0591986266837154, "language_loss": 0.82940453, "learning_rate": 3.966156370497476e-06, "loss": 0.85259962, "num_input_tokens_seen": 15327635, "step": 723, "time_per_iteration": 4.034039497375488 }, { "auxiliary_loss_clip": 0.01263199, "auxiliary_loss_mlp": 0.01052027, "balance_loss_clip": 1.07002449, "balance_loss_mlp": 1.03939104, "epoch": 0.08705585282270185, "flos": 23149419189120.0, "grad_norm": 1.7814229165902988, "language_loss": 0.89053893, "learning_rate": 3.96601352430495e-06, "loss": 0.91369128, "num_input_tokens_seen": 15347405, "step": 724, "time_per_iteration": 3.706817388534546 }, { "auxiliary_loss_clip": 0.01252016, "auxiliary_loss_mlp": 0.01053463, "balance_loss_clip": 1.07004952, "balance_loss_mlp": 1.04077911, "epoch": 0.08717609571334095, "flos": 29497599498240.0, "grad_norm": 1.4784617941718365, "language_loss": 0.83293545, "learning_rate": 3.965870379869735e-06, "loss": 0.85599029, "num_input_tokens_seen": 15369450, "step": 725, "time_per_iteration": 2.9023818969726562 }, { "auxiliary_loss_clip": 0.01260753, "auxiliary_loss_mlp": 0.0104465, "balance_loss_clip": 1.06939995, "balance_loss_mlp": 1.03266335, "epoch": 0.08729633860398003, "flos": 20667489137280.0, "grad_norm": 2.1177323902909175, "language_loss": 0.87066317, "learning_rate": 3.965726937213547e-06, "loss": 0.89371717, "num_input_tokens_seen": 15388085, "step": 726, "time_per_iteration": 2.780424118041992 }, { "auxiliary_loss_clip": 0.01256718, "auxiliary_loss_mlp": 0.01054996, "balance_loss_clip": 1.06880403, "balance_loss_mlp": 1.04227662, "epoch": 0.08741658149461913, "flos": 18369493655040.0, "grad_norm": 3.2884398968894333, "language_loss": 0.81226522, "learning_rate": 3.965583196358144e-06, "loss": 0.83538234, "num_input_tokens_seen": 15407120, "step": 727, "time_per_iteration": 2.8160171508789062 }, { "auxiliary_loss_clip": 0.01263541, "auxiliary_loss_mlp": 0.01037966, "balance_loss_clip": 1.06923723, "balance_loss_mlp": 1.02531195, "epoch": 0.08753682438525823, "flos": 18729677283840.0, "grad_norm": 2.3643612660598623, "language_loss": 0.74475181, "learning_rate": 3.965439157325335e-06, "loss": 0.76776683, "num_input_tokens_seen": 15424485, "step": 728, "time_per_iteration": 2.7066738605499268 }, { "auxiliary_loss_clip": 0.01248449, "auxiliary_loss_mlp": 0.01048054, "balance_loss_clip": 1.06812763, "balance_loss_mlp": 1.03531671, "epoch": 0.08765706727589731, "flos": 27776113303680.0, "grad_norm": 2.0536725081496323, "language_loss": 0.75776833, "learning_rate": 3.965294820136968e-06, "loss": 0.78073335, "num_input_tokens_seen": 15446285, "step": 729, "time_per_iteration": 3.0122106075286865 }, { "auxiliary_loss_clip": 0.01251034, "auxiliary_loss_mlp": 0.01049164, "balance_loss_clip": 1.07164311, "balance_loss_mlp": 1.03658164, "epoch": 0.08777731016653641, "flos": 24389127239040.0, "grad_norm": 2.8252004124307293, "language_loss": 0.87296194, "learning_rate": 3.965150184814938e-06, "loss": 0.89596391, "num_input_tokens_seen": 15465770, "step": 730, "time_per_iteration": 2.7880399227142334 }, { "auxiliary_loss_clip": 0.01246008, "auxiliary_loss_mlp": 0.01050359, "balance_loss_clip": 1.0729636, "balance_loss_mlp": 1.03812766, "epoch": 0.08789755305717549, "flos": 21981855605760.0, "grad_norm": 2.9141637419192072, "language_loss": 0.76465398, "learning_rate": 3.965005251381189e-06, "loss": 0.78761768, "num_input_tokens_seen": 15483705, "step": 731, "time_per_iteration": 2.8289425373077393 }, { "auxiliary_loss_clip": 0.01154693, "auxiliary_loss_mlp": 0.01014025, "balance_loss_clip": 1.03308594, "balance_loss_mlp": 1.01066327, "epoch": 0.08801779594781459, "flos": 58360120583040.0, "grad_norm": 0.8994523254179527, "language_loss": 0.64568973, "learning_rate": 3.964860019857705e-06, "loss": 0.66737688, "num_input_tokens_seen": 15548620, "step": 732, "time_per_iteration": 3.2922937870025635 }, { "auxiliary_loss_clip": 0.01261694, "auxiliary_loss_mlp": 0.01050853, "balance_loss_clip": 1.07324243, "balance_loss_mlp": 1.03861594, "epoch": 0.08813803883845367, "flos": 23294785530240.0, "grad_norm": 1.9916432676862623, "language_loss": 0.84551036, "learning_rate": 3.964714490266518e-06, "loss": 0.86863589, "num_input_tokens_seen": 15569265, "step": 733, "time_per_iteration": 2.765261173248291 }, { "auxiliary_loss_clip": 0.01146756, "auxiliary_loss_mlp": 0.01006859, "balance_loss_clip": 1.03190899, "balance_loss_mlp": 1.00347328, "epoch": 0.08825828172909277, "flos": 63424924882560.0, "grad_norm": 0.8905374982077153, "language_loss": 0.64556164, "learning_rate": 3.964568662629706e-06, "loss": 0.66709781, "num_input_tokens_seen": 15630570, "step": 734, "time_per_iteration": 3.2190985679626465 }, { "auxiliary_loss_clip": 0.01253499, "auxiliary_loss_mlp": 0.01045046, "balance_loss_clip": 1.06860316, "balance_loss_mlp": 1.03291619, "epoch": 0.08837852461973186, "flos": 26720986268160.0, "grad_norm": 2.7259420543710275, "language_loss": 0.84582472, "learning_rate": 3.9644225369693895e-06, "loss": 0.86881018, "num_input_tokens_seen": 15650870, "step": 735, "time_per_iteration": 2.766090154647827 }, { "auxiliary_loss_clip": 0.01262209, "auxiliary_loss_mlp": 0.01042285, "balance_loss_clip": 1.07072854, "balance_loss_mlp": 1.03057218, "epoch": 0.08849876751037095, "flos": 27265427688960.0, "grad_norm": 2.99890531328398, "language_loss": 0.87318677, "learning_rate": 3.964276113307735e-06, "loss": 0.89623165, "num_input_tokens_seen": 15670835, "step": 736, "time_per_iteration": 2.767143726348877 }, { "auxiliary_loss_clip": 0.01241916, "auxiliary_loss_mlp": 0.01042958, "balance_loss_clip": 1.06912231, "balance_loss_mlp": 1.02985096, "epoch": 0.08861901040101004, "flos": 19828759587840.0, "grad_norm": 1.8306430408060457, "language_loss": 0.80786079, "learning_rate": 3.9641293916669574e-06, "loss": 0.83070958, "num_input_tokens_seen": 15689795, "step": 737, "time_per_iteration": 2.772869110107422 }, { "auxiliary_loss_clip": 0.01249117, "auxiliary_loss_mlp": 0.01047563, "balance_loss_clip": 1.07187366, "balance_loss_mlp": 1.0343194, "epoch": 0.08873925329164913, "flos": 23658704173440.0, "grad_norm": 2.0954788976371104, "language_loss": 0.82473314, "learning_rate": 3.9639823720693115e-06, "loss": 0.84769994, "num_input_tokens_seen": 15711650, "step": 738, "time_per_iteration": 2.795116424560547 }, { "auxiliary_loss_clip": 0.01139628, "auxiliary_loss_mlp": 0.01009759, "balance_loss_clip": 1.0261097, "balance_loss_mlp": 1.00596833, "epoch": 0.08885949618228822, "flos": 71831541893760.0, "grad_norm": 0.8483055074338696, "language_loss": 0.60052115, "learning_rate": 3.963835054537102e-06, "loss": 0.622015, "num_input_tokens_seen": 15780615, "step": 739, "time_per_iteration": 3.3590877056121826 }, { "auxiliary_loss_clip": 0.01253051, "auxiliary_loss_mlp": 0.01045917, "balance_loss_clip": 1.0691334, "balance_loss_mlp": 1.03359056, "epoch": 0.08897973907292732, "flos": 22346169298560.0, "grad_norm": 2.685220730575366, "language_loss": 0.60983843, "learning_rate": 3.963687439092676e-06, "loss": 0.63282812, "num_input_tokens_seen": 15801300, "step": 740, "time_per_iteration": 2.763028621673584 }, { "auxiliary_loss_clip": 0.01252631, "auxiliary_loss_mlp": 0.01061591, "balance_loss_clip": 1.06771469, "balance_loss_mlp": 1.04923475, "epoch": 0.0890999819635664, "flos": 21251827589760.0, "grad_norm": 2.2652662395529424, "language_loss": 0.80597973, "learning_rate": 3.963539525758427e-06, "loss": 0.82912195, "num_input_tokens_seen": 15820860, "step": 741, "time_per_iteration": 2.723115921020508 }, { "auxiliary_loss_clip": 0.01253873, "auxiliary_loss_mlp": 0.01045999, "balance_loss_clip": 1.06814814, "balance_loss_mlp": 1.03342807, "epoch": 0.0892202248542055, "flos": 25370888745600.0, "grad_norm": 2.41717235791629, "language_loss": 0.68017972, "learning_rate": 3.9633913145567925e-06, "loss": 0.70317841, "num_input_tokens_seen": 15841350, "step": 742, "time_per_iteration": 2.7855775356292725 }, { "auxiliary_loss_clip": 0.01254088, "auxiliary_loss_mlp": 0.01046985, "balance_loss_clip": 1.07088268, "balance_loss_mlp": 1.03506958, "epoch": 0.08934046774484458, "flos": 24457895827200.0, "grad_norm": 1.8192423636261057, "language_loss": 0.81595659, "learning_rate": 3.9632428055102575e-06, "loss": 0.83896732, "num_input_tokens_seen": 15861360, "step": 743, "time_per_iteration": 2.815627336502075 }, { "auxiliary_loss_clip": 0.01260156, "auxiliary_loss_mlp": 0.0104623, "balance_loss_clip": 1.0723989, "balance_loss_mlp": 1.03452373, "epoch": 0.08946071063548368, "flos": 35772773414400.0, "grad_norm": 2.3370635519578125, "language_loss": 0.6673907, "learning_rate": 3.9630939986413495e-06, "loss": 0.6904546, "num_input_tokens_seen": 15883160, "step": 744, "time_per_iteration": 3.060509443283081 }, { "auxiliary_loss_clip": 0.01236923, "auxiliary_loss_mlp": 0.01057755, "balance_loss_clip": 1.07225585, "balance_loss_mlp": 1.04548883, "epoch": 0.08958095352612276, "flos": 14356584167040.0, "grad_norm": 1.7526456056190334, "language_loss": 0.77926904, "learning_rate": 3.962944893972643e-06, "loss": 0.80221581, "num_input_tokens_seen": 15901610, "step": 745, "time_per_iteration": 2.761317729949951 }, { "auxiliary_loss_clip": 0.01249862, "auxiliary_loss_mlp": 0.01046757, "balance_loss_clip": 1.06976652, "balance_loss_mlp": 1.03475285, "epoch": 0.08970119641676186, "flos": 17853277345920.0, "grad_norm": 2.9505465173119427, "language_loss": 0.90307343, "learning_rate": 3.962795491526756e-06, "loss": 0.9260397, "num_input_tokens_seen": 15918770, "step": 746, "time_per_iteration": 2.726489782333374 }, { "auxiliary_loss_clip": 0.01266904, "auxiliary_loss_mlp": 0.01051251, "balance_loss_clip": 1.07177293, "balance_loss_mlp": 1.03837633, "epoch": 0.08982143930740095, "flos": 20811670329600.0, "grad_norm": 2.765496082597112, "language_loss": 0.89217377, "learning_rate": 3.962645791326354e-06, "loss": 0.91535532, "num_input_tokens_seen": 15938025, "step": 747, "time_per_iteration": 3.537186622619629 }, { "auxiliary_loss_clip": 0.01250632, "auxiliary_loss_mlp": 0.01043152, "balance_loss_clip": 1.06711864, "balance_loss_mlp": 1.03186345, "epoch": 0.08994168219804004, "flos": 24097712198400.0, "grad_norm": 3.0082869587462864, "language_loss": 0.83181477, "learning_rate": 3.962495793394146e-06, "loss": 0.85475266, "num_input_tokens_seen": 15957215, "step": 748, "time_per_iteration": 3.7500200271606445 }, { "auxiliary_loss_clip": 0.01147554, "auxiliary_loss_mlp": 0.01038366, "balance_loss_clip": 1.02704883, "balance_loss_mlp": 1.03493249, "epoch": 0.09006192508867913, "flos": 57188893812480.0, "grad_norm": 0.75069881443369, "language_loss": 0.6119352, "learning_rate": 3.9623454977528864e-06, "loss": 0.63379443, "num_input_tokens_seen": 16015870, "step": 749, "time_per_iteration": 3.107102394104004 }, { "auxiliary_loss_clip": 0.01251944, "auxiliary_loss_mlp": 0.01045357, "balance_loss_clip": 1.07210159, "balance_loss_mlp": 1.03294766, "epoch": 0.09018216797931822, "flos": 20487505063680.0, "grad_norm": 3.8836947380902345, "language_loss": 0.84836459, "learning_rate": 3.962194904425375e-06, "loss": 0.87133753, "num_input_tokens_seen": 16036500, "step": 750, "time_per_iteration": 4.656393766403198 }, { "auxiliary_loss_clip": 0.01253065, "auxiliary_loss_mlp": 0.0103906, "balance_loss_clip": 1.06837475, "balance_loss_mlp": 1.02640557, "epoch": 0.09030241086995731, "flos": 22638123043200.0, "grad_norm": 2.3819867465333555, "language_loss": 0.67899251, "learning_rate": 3.9620440134344566e-06, "loss": 0.70191371, "num_input_tokens_seen": 16054655, "step": 751, "time_per_iteration": 2.6250205039978027 }, { "auxiliary_loss_clip": 0.01248165, "auxiliary_loss_mlp": 0.01049112, "balance_loss_clip": 1.06939363, "balance_loss_mlp": 1.03641582, "epoch": 0.09042265376059641, "flos": 21871502046720.0, "grad_norm": 2.3202278251736197, "language_loss": 0.82494736, "learning_rate": 3.9618928248030215e-06, "loss": 0.84792018, "num_input_tokens_seen": 16074165, "step": 752, "time_per_iteration": 2.7468137741088867 }, { "auxiliary_loss_clip": 0.01253709, "auxiliary_loss_mlp": 0.01046654, "balance_loss_clip": 1.0701586, "balance_loss_mlp": 1.03471541, "epoch": 0.0905428966512355, "flos": 24316192673280.0, "grad_norm": 2.214519958397697, "language_loss": 0.82999301, "learning_rate": 3.961741338554005e-06, "loss": 0.85299665, "num_input_tokens_seen": 16092505, "step": 753, "time_per_iteration": 2.836132526397705 }, { "auxiliary_loss_clip": 0.0125357, "auxiliary_loss_mlp": 0.01048167, "balance_loss_clip": 1.0674572, "balance_loss_mlp": 1.03476238, "epoch": 0.09066313954187459, "flos": 35845061535360.0, "grad_norm": 2.3973857967201018, "language_loss": 0.75805444, "learning_rate": 3.9615895547103865e-06, "loss": 0.78107178, "num_input_tokens_seen": 16116150, "step": 754, "time_per_iteration": 2.880683660507202 }, { "auxiliary_loss_clip": 0.01249333, "auxiliary_loss_mlp": 0.01047746, "balance_loss_clip": 1.06706691, "balance_loss_mlp": 1.03491879, "epoch": 0.09078338243251367, "flos": 29168729550720.0, "grad_norm": 3.375780640590632, "language_loss": 0.77551967, "learning_rate": 3.961437473295193e-06, "loss": 0.79849041, "num_input_tokens_seen": 16136295, "step": 755, "time_per_iteration": 2.797001361846924 }, { "auxiliary_loss_clip": 0.012295, "auxiliary_loss_mlp": 0.01049295, "balance_loss_clip": 1.06722033, "balance_loss_mlp": 1.0372963, "epoch": 0.09090362532315277, "flos": 21907699977600.0, "grad_norm": 2.18541168117316, "language_loss": 0.72370452, "learning_rate": 3.961285094331495e-06, "loss": 0.74649239, "num_input_tokens_seen": 16154210, "step": 756, "time_per_iteration": 2.8183975219726562 }, { "auxiliary_loss_clip": 0.01254176, "auxiliary_loss_mlp": 0.01046786, "balance_loss_clip": 1.06410837, "balance_loss_mlp": 1.03407812, "epoch": 0.09102386821379185, "flos": 27344503480320.0, "grad_norm": 1.8955692804477449, "language_loss": 0.86048138, "learning_rate": 3.961132417842406e-06, "loss": 0.88349104, "num_input_tokens_seen": 16173995, "step": 757, "time_per_iteration": 2.864346504211426 }, { "auxiliary_loss_clip": 0.01244066, "auxiliary_loss_mlp": 0.01046254, "balance_loss_clip": 1.06784022, "balance_loss_mlp": 1.03446984, "epoch": 0.09114411110443095, "flos": 20813501923200.0, "grad_norm": 3.543824464837932, "language_loss": 0.75168025, "learning_rate": 3.960979443851089e-06, "loss": 0.7745834, "num_input_tokens_seen": 16191020, "step": 758, "time_per_iteration": 2.695565938949585 }, { "auxiliary_loss_clip": 0.01251338, "auxiliary_loss_mlp": 0.01044818, "balance_loss_clip": 1.07214904, "balance_loss_mlp": 1.03228271, "epoch": 0.09126435399507005, "flos": 26145949438080.0, "grad_norm": 1.8596671578776636, "language_loss": 0.78749096, "learning_rate": 3.96082617238075e-06, "loss": 0.81045246, "num_input_tokens_seen": 16213645, "step": 759, "time_per_iteration": 2.848029851913452 }, { "auxiliary_loss_clip": 0.01251438, "auxiliary_loss_mlp": 0.01040622, "balance_loss_clip": 1.06824148, "balance_loss_mlp": 1.02915418, "epoch": 0.09138459688570913, "flos": 24388911757440.0, "grad_norm": 2.3597001278767396, "language_loss": 0.79478842, "learning_rate": 3.960672603454639e-06, "loss": 0.81770903, "num_input_tokens_seen": 16233625, "step": 760, "time_per_iteration": 2.766967296600342 }, { "auxiliary_loss_clip": 0.01251531, "auxiliary_loss_mlp": 0.01048032, "balance_loss_clip": 1.07091153, "balance_loss_mlp": 1.0359025, "epoch": 0.09150483977634823, "flos": 21032664756480.0, "grad_norm": 3.4819480314388462, "language_loss": 0.77205944, "learning_rate": 3.960518737096054e-06, "loss": 0.79505509, "num_input_tokens_seen": 16253255, "step": 761, "time_per_iteration": 2.8032002449035645 }, { "auxiliary_loss_clip": 0.01256699, "auxiliary_loss_mlp": 0.01048964, "balance_loss_clip": 1.06910062, "balance_loss_mlp": 1.03757918, "epoch": 0.09162508266698731, "flos": 22856998567680.0, "grad_norm": 2.339103764156495, "language_loss": 0.73281151, "learning_rate": 3.960364573328334e-06, "loss": 0.75586808, "num_input_tokens_seen": 16272580, "step": 762, "time_per_iteration": 2.7341620922088623 }, { "auxiliary_loss_clip": 0.01246183, "auxiliary_loss_mlp": 0.01043066, "balance_loss_clip": 1.06834662, "balance_loss_mlp": 1.03085327, "epoch": 0.0917453255576264, "flos": 21724411852800.0, "grad_norm": 2.568058103986779, "language_loss": 0.88457769, "learning_rate": 3.9602101121748675e-06, "loss": 0.90747011, "num_input_tokens_seen": 16293075, "step": 763, "time_per_iteration": 2.815026044845581 }, { "auxiliary_loss_clip": 0.01246764, "auxiliary_loss_mlp": 0.01037827, "balance_loss_clip": 1.06685185, "balance_loss_mlp": 1.026425, "epoch": 0.0918655684482655, "flos": 14609215497600.0, "grad_norm": 2.413486591893491, "language_loss": 0.7271595, "learning_rate": 3.960055353659085e-06, "loss": 0.75000542, "num_input_tokens_seen": 16310185, "step": 764, "time_per_iteration": 2.879667043685913 }, { "auxiliary_loss_clip": 0.01248009, "auxiliary_loss_mlp": 0.01046868, "balance_loss_clip": 1.06725407, "balance_loss_mlp": 1.03443468, "epoch": 0.09198581133890459, "flos": 23435016226560.0, "grad_norm": 2.412009475400055, "language_loss": 0.83470762, "learning_rate": 3.959900297804465e-06, "loss": 0.85765636, "num_input_tokens_seen": 16330355, "step": 765, "time_per_iteration": 2.8121654987335205 }, { "auxiliary_loss_clip": 0.01235624, "auxiliary_loss_mlp": 0.01039037, "balance_loss_clip": 1.06725347, "balance_loss_mlp": 1.02765298, "epoch": 0.09210605422954368, "flos": 16795887753600.0, "grad_norm": 2.215956987935204, "language_loss": 0.7744478, "learning_rate": 3.9597449446345276e-06, "loss": 0.79719436, "num_input_tokens_seen": 16347600, "step": 766, "time_per_iteration": 2.8519484996795654 }, { "auxiliary_loss_clip": 0.01240076, "auxiliary_loss_mlp": 0.01045301, "balance_loss_clip": 1.07062137, "balance_loss_mlp": 1.03379726, "epoch": 0.09222629712018277, "flos": 22674249146880.0, "grad_norm": 2.3119060554259123, "language_loss": 0.8365348, "learning_rate": 3.95958929417284e-06, "loss": 0.85938859, "num_input_tokens_seen": 16365755, "step": 767, "time_per_iteration": 2.7640535831451416 }, { "auxiliary_loss_clip": 0.0114636, "auxiliary_loss_mlp": 0.01012548, "balance_loss_clip": 1.02543545, "balance_loss_mlp": 1.00940132, "epoch": 0.09234654001082186, "flos": 69976756327680.0, "grad_norm": 0.7310570400615695, "language_loss": 0.58735764, "learning_rate": 3.9594333464430145e-06, "loss": 0.60894668, "num_input_tokens_seen": 16435245, "step": 768, "time_per_iteration": 3.434337854385376 }, { "auxiliary_loss_clip": 0.01221155, "auxiliary_loss_mlp": 0.0104478, "balance_loss_clip": 1.06714785, "balance_loss_mlp": 1.03327644, "epoch": 0.09246678290146094, "flos": 20011437181440.0, "grad_norm": 2.203412492486691, "language_loss": 0.88055611, "learning_rate": 3.959277101468709e-06, "loss": 0.90321547, "num_input_tokens_seen": 16454795, "step": 769, "time_per_iteration": 2.8477656841278076 }, { "auxiliary_loss_clip": 0.01236711, "auxiliary_loss_mlp": 0.01034907, "balance_loss_clip": 1.06919944, "balance_loss_mlp": 1.02361774, "epoch": 0.09258702579210004, "flos": 17747448900480.0, "grad_norm": 2.788486918189553, "language_loss": 0.78851056, "learning_rate": 3.959120559273624e-06, "loss": 0.81122673, "num_input_tokens_seen": 16472580, "step": 770, "time_per_iteration": 2.770678758621216 }, { "auxiliary_loss_clip": 0.0123641, "auxiliary_loss_mlp": 0.01040118, "balance_loss_clip": 1.06871903, "balance_loss_mlp": 1.02816153, "epoch": 0.09270726868273914, "flos": 20886544229760.0, "grad_norm": 2.099084660061466, "language_loss": 0.83553267, "learning_rate": 3.958963719881509e-06, "loss": 0.85829794, "num_input_tokens_seen": 16490670, "step": 771, "time_per_iteration": 2.888719081878662 }, { "auxiliary_loss_clip": 0.01252336, "auxiliary_loss_mlp": 0.01046869, "balance_loss_clip": 1.07148266, "balance_loss_mlp": 1.03454852, "epoch": 0.09282751157337822, "flos": 17015697031680.0, "grad_norm": 2.9623431255462354, "language_loss": 0.94001698, "learning_rate": 3.958806583316154e-06, "loss": 0.963009, "num_input_tokens_seen": 16508640, "step": 772, "time_per_iteration": 2.762852668762207 }, { "auxiliary_loss_clip": 0.01256719, "auxiliary_loss_mlp": 0.01042558, "balance_loss_clip": 1.06914401, "balance_loss_mlp": 1.03105986, "epoch": 0.09294775446401732, "flos": 32523647748480.0, "grad_norm": 2.048245966527559, "language_loss": 0.79098755, "learning_rate": 3.9586491496013985e-06, "loss": 0.81398028, "num_input_tokens_seen": 16531035, "step": 773, "time_per_iteration": 3.941164016723633 }, { "auxiliary_loss_clip": 0.01260026, "auxiliary_loss_mlp": 0.01053872, "balance_loss_clip": 1.07139707, "balance_loss_mlp": 1.04249358, "epoch": 0.0930679973546564, "flos": 18259750627200.0, "grad_norm": 2.547366895097859, "language_loss": 0.82944787, "learning_rate": 3.958491418761124e-06, "loss": 0.85258687, "num_input_tokens_seen": 16548605, "step": 774, "time_per_iteration": 4.024899482727051 }, { "auxiliary_loss_clip": 0.0124796, "auxiliary_loss_mlp": 0.01042145, "balance_loss_clip": 1.06584835, "balance_loss_mlp": 1.03025401, "epoch": 0.0931882402452955, "flos": 21099745405440.0, "grad_norm": 3.3093354517390474, "language_loss": 0.7265166, "learning_rate": 3.958333390819258e-06, "loss": 0.74941766, "num_input_tokens_seen": 16565535, "step": 775, "time_per_iteration": 2.76822829246521 }, { "auxiliary_loss_clip": 0.01253498, "auxiliary_loss_mlp": 0.01047895, "balance_loss_clip": 1.06630111, "balance_loss_mlp": 1.03677249, "epoch": 0.0933084831359346, "flos": 24207275658240.0, "grad_norm": 2.1323109494763433, "language_loss": 0.80304712, "learning_rate": 3.9581750657997754e-06, "loss": 0.82606107, "num_input_tokens_seen": 16584900, "step": 776, "time_per_iteration": 4.717688322067261 }, { "auxiliary_loss_clip": 0.01244169, "auxiliary_loss_mlp": 0.0104187, "balance_loss_clip": 1.06573606, "balance_loss_mlp": 1.03048599, "epoch": 0.09342872602657368, "flos": 25480272637440.0, "grad_norm": 2.619866636629306, "language_loss": 0.89596546, "learning_rate": 3.95801644372669e-06, "loss": 0.91882586, "num_input_tokens_seen": 16604805, "step": 777, "time_per_iteration": 2.815662145614624 }, { "auxiliary_loss_clip": 0.01257783, "auxiliary_loss_mlp": 0.01032825, "balance_loss_clip": 1.06938267, "balance_loss_mlp": 1.02126145, "epoch": 0.09354896891721277, "flos": 23149060053120.0, "grad_norm": 2.218798840235867, "language_loss": 0.84781098, "learning_rate": 3.957857524624068e-06, "loss": 0.87071699, "num_input_tokens_seen": 16623685, "step": 778, "time_per_iteration": 2.836052417755127 }, { "auxiliary_loss_clip": 0.0124986, "auxiliary_loss_mlp": 0.01045876, "balance_loss_clip": 1.06797993, "balance_loss_mlp": 1.03414559, "epoch": 0.09366921180785186, "flos": 24279563779200.0, "grad_norm": 1.722809770467278, "language_loss": 0.8990109, "learning_rate": 3.957698308516016e-06, "loss": 0.92196822, "num_input_tokens_seen": 16644985, "step": 779, "time_per_iteration": 2.810272216796875 }, { "auxiliary_loss_clip": 0.01248762, "auxiliary_loss_mlp": 0.01060492, "balance_loss_clip": 1.06998634, "balance_loss_mlp": 1.03729463, "epoch": 0.09378945469849095, "flos": 18730036419840.0, "grad_norm": 2.868414118228639, "language_loss": 0.82296395, "learning_rate": 3.957538795426688e-06, "loss": 0.84605646, "num_input_tokens_seen": 16662410, "step": 780, "time_per_iteration": 2.7582929134368896 }, { "auxiliary_loss_clip": 0.01249922, "auxiliary_loss_mlp": 0.01044158, "balance_loss_clip": 1.06970561, "balance_loss_mlp": 1.03261256, "epoch": 0.09390969758913004, "flos": 23218834222080.0, "grad_norm": 2.686050922468883, "language_loss": 0.77481663, "learning_rate": 3.9573789853802804e-06, "loss": 0.79775739, "num_input_tokens_seen": 16680885, "step": 781, "time_per_iteration": 2.815770387649536 }, { "auxiliary_loss_clip": 0.01246535, "auxiliary_loss_mlp": 0.01052346, "balance_loss_clip": 1.06980658, "balance_loss_mlp": 1.02908242, "epoch": 0.09402994047976913, "flos": 19646728439040.0, "grad_norm": 2.4860669747690856, "language_loss": 0.74884689, "learning_rate": 3.957218878401037e-06, "loss": 0.77183574, "num_input_tokens_seen": 16699375, "step": 782, "time_per_iteration": 2.8903584480285645 }, { "auxiliary_loss_clip": 0.01259505, "auxiliary_loss_mlp": 0.01045362, "balance_loss_clip": 1.07024789, "balance_loss_mlp": 1.03256464, "epoch": 0.09415018337040823, "flos": 29420463041280.0, "grad_norm": 1.7954606286645984, "language_loss": 0.88982898, "learning_rate": 3.957058474513246e-06, "loss": 0.91287756, "num_input_tokens_seen": 16719230, "step": 783, "time_per_iteration": 2.794841766357422 }, { "auxiliary_loss_clip": 0.01249423, "auxiliary_loss_mlp": 0.01048884, "balance_loss_clip": 1.06859374, "balance_loss_mlp": 1.03792894, "epoch": 0.09427042626104731, "flos": 24572092141440.0, "grad_norm": 1.7946821699789681, "language_loss": 0.79004401, "learning_rate": 3.956897773741241e-06, "loss": 0.81302714, "num_input_tokens_seen": 16738220, "step": 784, "time_per_iteration": 2.752150774002075 }, { "auxiliary_loss_clip": 0.01234673, "auxiliary_loss_mlp": 0.0103954, "balance_loss_clip": 1.06687844, "balance_loss_mlp": 1.02839375, "epoch": 0.09439066915168641, "flos": 26359581576960.0, "grad_norm": 2.0457467783892724, "language_loss": 0.7197696, "learning_rate": 3.956736776109398e-06, "loss": 0.74251175, "num_input_tokens_seen": 16759395, "step": 785, "time_per_iteration": 2.8121466636657715 }, { "auxiliary_loss_clip": 0.01243705, "auxiliary_loss_mlp": 0.0105299, "balance_loss_clip": 1.06768215, "balance_loss_mlp": 1.02971351, "epoch": 0.09451091204232549, "flos": 19427278296960.0, "grad_norm": 2.563298372622372, "language_loss": 0.83443075, "learning_rate": 3.956575481642143e-06, "loss": 0.85739774, "num_input_tokens_seen": 16778285, "step": 786, "time_per_iteration": 2.7952072620391846 }, { "auxiliary_loss_clip": 0.01237215, "auxiliary_loss_mlp": 0.01039723, "balance_loss_clip": 1.06771278, "balance_loss_mlp": 1.02902448, "epoch": 0.09463115493296459, "flos": 25368051571200.0, "grad_norm": 2.3679429657739384, "language_loss": 0.74676621, "learning_rate": 3.956413890363943e-06, "loss": 0.76953566, "num_input_tokens_seen": 16795265, "step": 787, "time_per_iteration": 2.9039320945739746 }, { "auxiliary_loss_clip": 0.01250505, "auxiliary_loss_mlp": 0.010524, "balance_loss_clip": 1.06845021, "balance_loss_mlp": 1.0404017, "epoch": 0.09475139782360369, "flos": 10123254869760.0, "grad_norm": 2.4480491886321767, "language_loss": 0.8178432, "learning_rate": 3.956252002299312e-06, "loss": 0.84087229, "num_input_tokens_seen": 16811165, "step": 788, "time_per_iteration": 2.742640733718872 }, { "auxiliary_loss_clip": 0.01252245, "auxiliary_loss_mlp": 0.0104298, "balance_loss_clip": 1.06657457, "balance_loss_mlp": 1.03127325, "epoch": 0.09487164071424277, "flos": 17231088936960.0, "grad_norm": 2.6630913323707324, "language_loss": 0.90543598, "learning_rate": 3.956089817472807e-06, "loss": 0.92838824, "num_input_tokens_seen": 16828470, "step": 789, "time_per_iteration": 2.825951337814331 }, { "auxiliary_loss_clip": 0.01242885, "auxiliary_loss_mlp": 0.01043712, "balance_loss_clip": 1.06957519, "balance_loss_mlp": 1.0324285, "epoch": 0.09499188360488187, "flos": 30849564528000.0, "grad_norm": 14.761633602578602, "language_loss": 0.85781932, "learning_rate": 3.955927335909032e-06, "loss": 0.88068527, "num_input_tokens_seen": 16851680, "step": 790, "time_per_iteration": 2.82415509223938 }, { "auxiliary_loss_clip": 0.01228078, "auxiliary_loss_mlp": 0.01038654, "balance_loss_clip": 1.06688106, "balance_loss_mlp": 1.028193, "epoch": 0.09511212649552095, "flos": 29351694453120.0, "grad_norm": 2.475564584189349, "language_loss": 0.75828469, "learning_rate": 3.955764557632634e-06, "loss": 0.78095198, "num_input_tokens_seen": 16871490, "step": 791, "time_per_iteration": 2.85154128074646 }, { "auxiliary_loss_clip": 0.01237923, "auxiliary_loss_mlp": 0.01046866, "balance_loss_clip": 1.06659448, "balance_loss_mlp": 1.0358094, "epoch": 0.09523236938616005, "flos": 10378687461120.0, "grad_norm": 2.4466876130010493, "language_loss": 0.94440711, "learning_rate": 3.955601482668309e-06, "loss": 0.967255, "num_input_tokens_seen": 16889350, "step": 792, "time_per_iteration": 2.8488636016845703 }, { "auxiliary_loss_clip": 0.01236405, "auxiliary_loss_mlp": 0.01042333, "balance_loss_clip": 1.06849432, "balance_loss_mlp": 1.03171122, "epoch": 0.09535261227679913, "flos": 19061815368960.0, "grad_norm": 2.104552056561372, "language_loss": 0.88566744, "learning_rate": 3.955438111040794e-06, "loss": 0.90845484, "num_input_tokens_seen": 16907625, "step": 793, "time_per_iteration": 2.7383787631988525 }, { "auxiliary_loss_clip": 0.01232018, "auxiliary_loss_mlp": 0.01040155, "balance_loss_clip": 1.06647098, "balance_loss_mlp": 1.02860403, "epoch": 0.09547285516743823, "flos": 20922993555840.0, "grad_norm": 1.9199075964488457, "language_loss": 0.80349714, "learning_rate": 3.955274442774873e-06, "loss": 0.8262189, "num_input_tokens_seen": 16926205, "step": 794, "time_per_iteration": 2.7453484535217285 }, { "auxiliary_loss_clip": 0.01253846, "auxiliary_loss_mlp": 0.01039755, "balance_loss_clip": 1.07128954, "balance_loss_mlp": 1.02831709, "epoch": 0.09559309805807732, "flos": 30154405639680.0, "grad_norm": 3.0745156586665487, "language_loss": 0.70824254, "learning_rate": 3.9551104778953725e-06, "loss": 0.73117852, "num_input_tokens_seen": 16946500, "step": 795, "time_per_iteration": 2.780731678009033 }, { "auxiliary_loss_clip": 0.01243682, "auxiliary_loss_mlp": 0.01034923, "balance_loss_clip": 1.06686735, "balance_loss_mlp": 1.02363408, "epoch": 0.0957133409487164, "flos": 21066743784960.0, "grad_norm": 2.305892249306919, "language_loss": 0.85402298, "learning_rate": 3.954946216427167e-06, "loss": 0.87680906, "num_input_tokens_seen": 16966960, "step": 796, "time_per_iteration": 2.7828900814056396 }, { "auxiliary_loss_clip": 0.01134077, "auxiliary_loss_mlp": 0.01008224, "balance_loss_clip": 1.02518821, "balance_loss_mlp": 1.00488651, "epoch": 0.0958335838393555, "flos": 71297979315840.0, "grad_norm": 0.885655235329743, "language_loss": 0.61548352, "learning_rate": 3.954781658395176e-06, "loss": 0.63690662, "num_input_tokens_seen": 17023215, "step": 797, "time_per_iteration": 3.3122541904449463 }, { "auxiliary_loss_clip": 0.01253677, "auxiliary_loss_mlp": 0.01044521, "balance_loss_clip": 1.06863523, "balance_loss_mlp": 1.03082347, "epoch": 0.09595382672999458, "flos": 21872974504320.0, "grad_norm": 1.8639296402891774, "language_loss": 0.92004877, "learning_rate": 3.95461680382436e-06, "loss": 0.94303071, "num_input_tokens_seen": 17042140, "step": 798, "time_per_iteration": 2.8693299293518066 }, { "auxiliary_loss_clip": 0.01252779, "auxiliary_loss_mlp": 0.01050611, "balance_loss_clip": 1.07016253, "balance_loss_mlp": 1.03900599, "epoch": 0.09607406962063368, "flos": 18695562341760.0, "grad_norm": 4.472627534954327, "language_loss": 0.86457539, "learning_rate": 3.9544516527397295e-06, "loss": 0.88760924, "num_input_tokens_seen": 17058490, "step": 799, "time_per_iteration": 3.7402000427246094 }, { "auxiliary_loss_clip": 0.01236074, "auxiliary_loss_mlp": 0.01041496, "balance_loss_clip": 1.06758046, "balance_loss_mlp": 1.02955103, "epoch": 0.09619431251127276, "flos": 22568456615040.0, "grad_norm": 3.190305863603266, "language_loss": 0.80830634, "learning_rate": 3.954286205166338e-06, "loss": 0.83108199, "num_input_tokens_seen": 17079655, "step": 800, "time_per_iteration": 2.7713770866394043 }, { "auxiliary_loss_clip": 0.01257319, "auxiliary_loss_mlp": 0.01040601, "balance_loss_clip": 1.0723685, "balance_loss_mlp": 1.02778029, "epoch": 0.09631455540191186, "flos": 14246230608000.0, "grad_norm": 2.988833314171334, "language_loss": 0.84212309, "learning_rate": 3.954120461129282e-06, "loss": 0.86510229, "num_input_tokens_seen": 17097065, "step": 801, "time_per_iteration": 3.671769142150879 }, { "auxiliary_loss_clip": 0.01256519, "auxiliary_loss_mlp": 0.01039793, "balance_loss_clip": 1.06956911, "balance_loss_mlp": 1.02846766, "epoch": 0.09643479829255096, "flos": 20740387789440.0, "grad_norm": 1.9870880100680508, "language_loss": 0.83408868, "learning_rate": 3.953954420653706e-06, "loss": 0.85705185, "num_input_tokens_seen": 17114090, "step": 802, "time_per_iteration": 3.579859972000122 }, { "auxiliary_loss_clip": 0.0125244, "auxiliary_loss_mlp": 0.01043913, "balance_loss_clip": 1.06952214, "balance_loss_mlp": 1.03140235, "epoch": 0.09655504118319004, "flos": 24420476833920.0, "grad_norm": 2.243031433519575, "language_loss": 0.88198316, "learning_rate": 3.953788083764798e-06, "loss": 0.90494674, "num_input_tokens_seen": 17133325, "step": 803, "time_per_iteration": 2.804014205932617 }, { "auxiliary_loss_clip": 0.01233118, "auxiliary_loss_mlp": 0.01049253, "balance_loss_clip": 1.06798685, "balance_loss_mlp": 1.037624, "epoch": 0.09667528407382914, "flos": 18441961344000.0, "grad_norm": 2.362184047057097, "language_loss": 0.92395335, "learning_rate": 3.953621450487792e-06, "loss": 0.94677711, "num_input_tokens_seen": 17151945, "step": 804, "time_per_iteration": 2.788952112197876 }, { "auxiliary_loss_clip": 0.01143453, "auxiliary_loss_mlp": 0.01003095, "balance_loss_clip": 1.0244137, "balance_loss_mlp": 1.00007939, "epoch": 0.09679552696446822, "flos": 70816455544320.0, "grad_norm": 0.8401465587028932, "language_loss": 0.6116817, "learning_rate": 3.953454520847964e-06, "loss": 0.63314724, "num_input_tokens_seen": 17216790, "step": 805, "time_per_iteration": 3.464216709136963 }, { "auxiliary_loss_clip": 0.01244269, "auxiliary_loss_mlp": 0.01047867, "balance_loss_clip": 1.07144022, "balance_loss_mlp": 1.03454518, "epoch": 0.09691576985510732, "flos": 21945514020480.0, "grad_norm": 2.060168502532585, "language_loss": 0.73559785, "learning_rate": 3.9532872948706395e-06, "loss": 0.75851923, "num_input_tokens_seen": 17236285, "step": 806, "time_per_iteration": 2.7540342807769775 }, { "auxiliary_loss_clip": 0.01253733, "auxiliary_loss_mlp": 0.0104363, "balance_loss_clip": 1.07075, "balance_loss_mlp": 1.03152466, "epoch": 0.09703601274574641, "flos": 17965211103360.0, "grad_norm": 2.8986741330217085, "language_loss": 0.82580525, "learning_rate": 3.9531197725811845e-06, "loss": 0.84877884, "num_input_tokens_seen": 17251670, "step": 807, "time_per_iteration": 2.678237199783325 }, { "auxiliary_loss_clip": 0.01255219, "auxiliary_loss_mlp": 0.01042992, "balance_loss_clip": 1.07238138, "balance_loss_mlp": 1.03134537, "epoch": 0.0971562556363855, "flos": 22162162901760.0, "grad_norm": 1.8332679175541549, "language_loss": 0.87912536, "learning_rate": 3.952951954005013e-06, "loss": 0.90210754, "num_input_tokens_seen": 17271355, "step": 808, "time_per_iteration": 2.80698561668396 }, { "auxiliary_loss_clip": 0.01241386, "auxiliary_loss_mlp": 0.01040079, "balance_loss_clip": 1.06507432, "balance_loss_mlp": 1.02822959, "epoch": 0.0972764985270246, "flos": 25848716394240.0, "grad_norm": 1.7941006546923617, "language_loss": 0.84681439, "learning_rate": 3.952783839167584e-06, "loss": 0.86962903, "num_input_tokens_seen": 17291400, "step": 809, "time_per_iteration": 2.671708583831787 }, { "auxiliary_loss_clip": 0.01250874, "auxiliary_loss_mlp": 0.01036536, "balance_loss_clip": 1.06636596, "balance_loss_mlp": 1.02504456, "epoch": 0.09739674141766368, "flos": 20339373375360.0, "grad_norm": 3.57139695104535, "language_loss": 0.74231416, "learning_rate": 3.952615428094398e-06, "loss": 0.76518822, "num_input_tokens_seen": 17310920, "step": 810, "time_per_iteration": 2.6113440990448 }, { "auxiliary_loss_clip": 0.0123165, "auxiliary_loss_mlp": 0.01048217, "balance_loss_clip": 1.0688324, "balance_loss_mlp": 1.03646863, "epoch": 0.09751698430830277, "flos": 15743059188480.0, "grad_norm": 1.7604818414375838, "language_loss": 0.73246706, "learning_rate": 3.952446720811004e-06, "loss": 0.75526571, "num_input_tokens_seen": 17329245, "step": 811, "time_per_iteration": 2.6127538681030273 }, { "auxiliary_loss_clip": 0.01129785, "auxiliary_loss_mlp": 0.01010384, "balance_loss_clip": 1.02162027, "balance_loss_mlp": 1.00753498, "epoch": 0.09763722719894186, "flos": 63716806800000.0, "grad_norm": 0.8411980845448138, "language_loss": 0.63601589, "learning_rate": 3.952277717342995e-06, "loss": 0.65741765, "num_input_tokens_seen": 17395680, "step": 812, "time_per_iteration": 3.338615894317627 }, { "auxiliary_loss_clip": 0.01255091, "auxiliary_loss_mlp": 0.01044381, "balance_loss_clip": 1.07162821, "balance_loss_mlp": 1.03310394, "epoch": 0.09775747008958095, "flos": 22090916275200.0, "grad_norm": 1.8933118920245713, "language_loss": 0.85350931, "learning_rate": 3.952108417716009e-06, "loss": 0.87650406, "num_input_tokens_seen": 17415135, "step": 813, "time_per_iteration": 2.609428644180298 }, { "auxiliary_loss_clip": 0.0126152, "auxiliary_loss_mlp": 0.01047554, "balance_loss_clip": 1.07910311, "balance_loss_mlp": 1.03461421, "epoch": 0.09787771298022005, "flos": 21286050272640.0, "grad_norm": 1.8062918002781645, "language_loss": 0.84830803, "learning_rate": 3.951938821955727e-06, "loss": 0.87139875, "num_input_tokens_seen": 17434535, "step": 814, "time_per_iteration": 2.542775869369507 }, { "auxiliary_loss_clip": 0.0124685, "auxiliary_loss_mlp": 0.01043351, "balance_loss_clip": 1.07137918, "balance_loss_mlp": 1.03128695, "epoch": 0.09799795587085913, "flos": 22054574689920.0, "grad_norm": 4.334545539100061, "language_loss": 0.76576173, "learning_rate": 3.9517689300878786e-06, "loss": 0.78866369, "num_input_tokens_seen": 17454270, "step": 815, "time_per_iteration": 2.61431884765625 }, { "auxiliary_loss_clip": 0.01252414, "auxiliary_loss_mlp": 0.01036583, "balance_loss_clip": 1.06675529, "balance_loss_mlp": 1.02538311, "epoch": 0.09811819876149823, "flos": 22163743100160.0, "grad_norm": 1.90468251348983, "language_loss": 0.78775072, "learning_rate": 3.951598742138236e-06, "loss": 0.81064063, "num_input_tokens_seen": 17472995, "step": 816, "time_per_iteration": 2.587822437286377 }, { "auxiliary_loss_clip": 0.01252481, "auxiliary_loss_mlp": 0.01036086, "balance_loss_clip": 1.0677439, "balance_loss_mlp": 1.02417099, "epoch": 0.09823844165213731, "flos": 22231111057920.0, "grad_norm": 2.0725379701844666, "language_loss": 0.79620522, "learning_rate": 3.951428258132615e-06, "loss": 0.8190909, "num_input_tokens_seen": 17491115, "step": 817, "time_per_iteration": 2.7709081172943115 }, { "auxiliary_loss_clip": 0.01249985, "auxiliary_loss_mlp": 0.0105109, "balance_loss_clip": 1.06973171, "balance_loss_mlp": 1.03828108, "epoch": 0.09835868454277641, "flos": 22487728798080.0, "grad_norm": 2.9191858168128273, "language_loss": 0.84369767, "learning_rate": 3.951257478096879e-06, "loss": 0.8667084, "num_input_tokens_seen": 17509480, "step": 818, "time_per_iteration": 2.888441324234009 }, { "auxiliary_loss_clip": 0.01250026, "auxiliary_loss_mlp": 0.01051079, "balance_loss_clip": 1.07144284, "balance_loss_mlp": 1.0268414, "epoch": 0.0984789274334155, "flos": 16362554077440.0, "grad_norm": 3.48114319291654, "language_loss": 0.68081218, "learning_rate": 3.951086402056936e-06, "loss": 0.70382321, "num_input_tokens_seen": 17524080, "step": 819, "time_per_iteration": 2.94992733001709 }, { "auxiliary_loss_clip": 0.01226763, "auxiliary_loss_mlp": 0.01058337, "balance_loss_clip": 1.07219267, "balance_loss_mlp": 1.034338, "epoch": 0.09859917032405459, "flos": 24243545416320.0, "grad_norm": 1.9137675635649434, "language_loss": 0.83916789, "learning_rate": 3.950915030038735e-06, "loss": 0.86201888, "num_input_tokens_seen": 17543875, "step": 820, "time_per_iteration": 3.1665616035461426 }, { "auxiliary_loss_clip": 0.01245589, "auxiliary_loss_mlp": 0.01043907, "balance_loss_clip": 1.06932056, "balance_loss_mlp": 1.03102648, "epoch": 0.09871941321469369, "flos": 17420195064960.0, "grad_norm": 2.6972427905262726, "language_loss": 0.83752728, "learning_rate": 3.9507433620682765e-06, "loss": 0.86042225, "num_input_tokens_seen": 17560810, "step": 821, "time_per_iteration": 2.9808785915374756 }, { "auxiliary_loss_clip": 0.01235151, "auxiliary_loss_mlp": 0.0104525, "balance_loss_clip": 1.06429124, "balance_loss_mlp": 1.03298891, "epoch": 0.09883965610533277, "flos": 28477341590400.0, "grad_norm": 1.966718630074403, "language_loss": 0.87997109, "learning_rate": 3.9505713981716e-06, "loss": 0.90277511, "num_input_tokens_seen": 17583640, "step": 822, "time_per_iteration": 2.8511390686035156 }, { "auxiliary_loss_clip": 0.01242008, "auxiliary_loss_mlp": 0.01038152, "balance_loss_clip": 1.06878304, "balance_loss_mlp": 1.02603436, "epoch": 0.09895989899597187, "flos": 23693932437120.0, "grad_norm": 2.076603669817647, "language_loss": 0.80980027, "learning_rate": 3.950399138374795e-06, "loss": 0.83260185, "num_input_tokens_seen": 17602720, "step": 823, "time_per_iteration": 2.7034223079681396 }, { "auxiliary_loss_clip": 0.01247702, "auxiliary_loss_mlp": 0.01036304, "balance_loss_clip": 1.06771755, "balance_loss_mlp": 1.02420402, "epoch": 0.09908014188661095, "flos": 24679608526080.0, "grad_norm": 7.722723281815372, "language_loss": 0.74283546, "learning_rate": 3.95022658270399e-06, "loss": 0.76567554, "num_input_tokens_seen": 17623085, "step": 824, "time_per_iteration": 2.7165281772613525 }, { "auxiliary_loss_clip": 0.01241901, "auxiliary_loss_mlp": 0.01041123, "balance_loss_clip": 1.0697757, "balance_loss_mlp": 1.02933919, "epoch": 0.09920038477725004, "flos": 14064307200000.0, "grad_norm": 2.3025740260884513, "language_loss": 0.78356451, "learning_rate": 3.9500537311853635e-06, "loss": 0.80639482, "num_input_tokens_seen": 17641040, "step": 825, "time_per_iteration": 3.649359941482544 }, { "auxiliary_loss_clip": 0.01246318, "auxiliary_loss_mlp": 0.01045937, "balance_loss_clip": 1.06695485, "balance_loss_mlp": 1.03384876, "epoch": 0.09932062766788914, "flos": 13407070095360.0, "grad_norm": 2.4220587390164514, "language_loss": 0.83494258, "learning_rate": 3.949880583845136e-06, "loss": 0.8578651, "num_input_tokens_seen": 17659115, "step": 826, "time_per_iteration": 2.772705316543579 }, { "auxiliary_loss_clip": 0.01247731, "auxiliary_loss_mlp": 0.01046988, "balance_loss_clip": 1.07121551, "balance_loss_mlp": 1.03567505, "epoch": 0.09944087055852822, "flos": 19500751566720.0, "grad_norm": 1.9324801746441989, "language_loss": 0.81126869, "learning_rate": 3.949707140709575e-06, "loss": 0.83421588, "num_input_tokens_seen": 17678845, "step": 827, "time_per_iteration": 3.9289801120758057 }, { "auxiliary_loss_clip": 0.01252515, "auxiliary_loss_mlp": 0.01036164, "balance_loss_clip": 1.06711304, "balance_loss_mlp": 1.02387345, "epoch": 0.09956111344916732, "flos": 17749100926080.0, "grad_norm": 2.3811911950023097, "language_loss": 0.8332752, "learning_rate": 3.949533401804991e-06, "loss": 0.85616195, "num_input_tokens_seen": 17695750, "step": 828, "time_per_iteration": 2.720952033996582 }, { "auxiliary_loss_clip": 0.01247615, "auxiliary_loss_mlp": 0.01064455, "balance_loss_clip": 1.06757379, "balance_loss_mlp": 1.03869057, "epoch": 0.0996813563398064, "flos": 17967581400960.0, "grad_norm": 2.0858157084406495, "language_loss": 0.90564299, "learning_rate": 3.949359367157739e-06, "loss": 0.92876375, "num_input_tokens_seen": 17714445, "step": 829, "time_per_iteration": 3.7392303943634033 }, { "auxiliary_loss_clip": 0.01249852, "auxiliary_loss_mlp": 0.01046873, "balance_loss_clip": 1.06754577, "balance_loss_mlp": 1.03464186, "epoch": 0.0998015992304455, "flos": 17457039440640.0, "grad_norm": 2.0851234456534162, "language_loss": 0.75354308, "learning_rate": 3.949185036794222e-06, "loss": 0.77651036, "num_input_tokens_seen": 17732455, "step": 830, "time_per_iteration": 2.751166582107544 }, { "auxiliary_loss_clip": 0.01250601, "auxiliary_loss_mlp": 0.01046492, "balance_loss_clip": 1.06792891, "balance_loss_mlp": 1.03467298, "epoch": 0.0999218421210846, "flos": 25888757080320.0, "grad_norm": 1.7481540519302345, "language_loss": 0.78677458, "learning_rate": 3.949010410740884e-06, "loss": 0.80974555, "num_input_tokens_seen": 17755280, "step": 831, "time_per_iteration": 2.792191743850708 }, { "auxiliary_loss_clip": 0.01237628, "auxiliary_loss_mlp": 0.01051036, "balance_loss_clip": 1.0694598, "balance_loss_mlp": 1.02645636, "epoch": 0.10004208501172368, "flos": 21215916967680.0, "grad_norm": 1.6859505134349737, "language_loss": 0.86584783, "learning_rate": 3.948835489024216e-06, "loss": 0.8887344, "num_input_tokens_seen": 17775015, "step": 832, "time_per_iteration": 2.807192087173462 }, { "auxiliary_loss_clip": 0.01249063, "auxiliary_loss_mlp": 0.01043055, "balance_loss_clip": 1.06707478, "balance_loss_mlp": 1.03063941, "epoch": 0.10016232790236278, "flos": 17348409734400.0, "grad_norm": 2.0625348002486477, "language_loss": 0.90366232, "learning_rate": 3.948660271670755e-06, "loss": 0.92658347, "num_input_tokens_seen": 17792165, "step": 833, "time_per_iteration": 2.7188658714294434 }, { "auxiliary_loss_clip": 0.01237049, "auxiliary_loss_mlp": 0.01038126, "balance_loss_clip": 1.0642966, "balance_loss_mlp": 1.02677107, "epoch": 0.10028257079300186, "flos": 25666541591040.0, "grad_norm": 5.419820719166812, "language_loss": 0.83978373, "learning_rate": 3.948484758707079e-06, "loss": 0.86253542, "num_input_tokens_seen": 17811765, "step": 834, "time_per_iteration": 2.8059849739074707 }, { "auxiliary_loss_clip": 0.01234237, "auxiliary_loss_mlp": 0.01033712, "balance_loss_clip": 1.06875086, "balance_loss_mlp": 1.02141511, "epoch": 0.10040281368364096, "flos": 25156035544320.0, "grad_norm": 2.0245716559042113, "language_loss": 0.83657956, "learning_rate": 3.948308950159815e-06, "loss": 0.85925901, "num_input_tokens_seen": 17830445, "step": 835, "time_per_iteration": 2.813962459564209 }, { "auxiliary_loss_clip": 0.01235769, "auxiliary_loss_mlp": 0.0104179, "balance_loss_clip": 1.06896639, "balance_loss_mlp": 1.02839053, "epoch": 0.10052305657428004, "flos": 17603303621760.0, "grad_norm": 3.789702241875587, "language_loss": 0.75886613, "learning_rate": 3.9481328460556326e-06, "loss": 0.78164172, "num_input_tokens_seen": 17847665, "step": 836, "time_per_iteration": 2.8133883476257324 }, { "auxiliary_loss_clip": 0.01233573, "auxiliary_loss_mlp": 0.01041882, "balance_loss_clip": 1.06618953, "balance_loss_mlp": 1.02925801, "epoch": 0.10064329946491914, "flos": 18660154510080.0, "grad_norm": 2.756580316428835, "language_loss": 0.89711106, "learning_rate": 3.9479564464212455e-06, "loss": 0.91986561, "num_input_tokens_seen": 17866825, "step": 837, "time_per_iteration": 2.790709972381592 }, { "auxiliary_loss_clip": 0.01255891, "auxiliary_loss_mlp": 0.01039287, "balance_loss_clip": 1.06675196, "balance_loss_mlp": 1.02683508, "epoch": 0.10076354235555823, "flos": 17199056983680.0, "grad_norm": 2.4242778707152244, "language_loss": 0.76567125, "learning_rate": 3.947779751283414e-06, "loss": 0.78862309, "num_input_tokens_seen": 17883995, "step": 838, "time_per_iteration": 2.69792103767395 }, { "auxiliary_loss_clip": 0.01248377, "auxiliary_loss_mlp": 0.01054945, "balance_loss_clip": 1.07011747, "balance_loss_mlp": 1.03020406, "epoch": 0.10088378524619732, "flos": 22962252395520.0, "grad_norm": 1.943613901878396, "language_loss": 0.76128137, "learning_rate": 3.947602760668944e-06, "loss": 0.78431457, "num_input_tokens_seen": 17903785, "step": 839, "time_per_iteration": 2.727902889251709 }, { "auxiliary_loss_clip": 0.0124827, "auxiliary_loss_mlp": 0.01046468, "balance_loss_clip": 1.06889629, "balance_loss_mlp": 1.03486335, "epoch": 0.10100402813683641, "flos": 37885828746240.0, "grad_norm": 2.0709302865112513, "language_loss": 0.7158674, "learning_rate": 3.947425474604684e-06, "loss": 0.73881483, "num_input_tokens_seen": 17927720, "step": 840, "time_per_iteration": 2.9103474617004395 }, { "auxiliary_loss_clip": 0.01241371, "auxiliary_loss_mlp": 0.01046807, "balance_loss_clip": 1.06643355, "balance_loss_mlp": 1.03505325, "epoch": 0.1011242710274755, "flos": 21543458112000.0, "grad_norm": 2.3974476400608355, "language_loss": 0.92422748, "learning_rate": 3.947247893117528e-06, "loss": 0.94710922, "num_input_tokens_seen": 17946225, "step": 841, "time_per_iteration": 2.7867681980133057 }, { "auxiliary_loss_clip": 0.01242288, "auxiliary_loss_mlp": 0.01039611, "balance_loss_clip": 1.06765485, "balance_loss_mlp": 1.0280534, "epoch": 0.10124451391811459, "flos": 13621456419840.0, "grad_norm": 3.030719696948576, "language_loss": 0.70004213, "learning_rate": 3.947070016234413e-06, "loss": 0.72286111, "num_input_tokens_seen": 17962015, "step": 842, "time_per_iteration": 2.6667325496673584 }, { "auxiliary_loss_clip": 0.01252837, "auxiliary_loss_mlp": 0.01041082, "balance_loss_clip": 1.06885517, "balance_loss_mlp": 1.02841008, "epoch": 0.10136475680875369, "flos": 16649228522880.0, "grad_norm": 2.467230963375393, "language_loss": 0.74894834, "learning_rate": 3.946891843982326e-06, "loss": 0.77188754, "num_input_tokens_seen": 17979680, "step": 843, "time_per_iteration": 2.775205612182617 }, { "auxiliary_loss_clip": 0.01247053, "auxiliary_loss_mlp": 0.01039946, "balance_loss_clip": 1.06730831, "balance_loss_mlp": 1.02763772, "epoch": 0.10148499969939277, "flos": 19461034103040.0, "grad_norm": 2.3565520211309474, "language_loss": 0.74472153, "learning_rate": 3.9467133763882935e-06, "loss": 0.7675916, "num_input_tokens_seen": 17998145, "step": 844, "time_per_iteration": 2.7071855068206787 }, { "auxiliary_loss_clip": 0.01238887, "auxiliary_loss_mlp": 0.01048894, "balance_loss_clip": 1.0669806, "balance_loss_mlp": 1.03691959, "epoch": 0.10160524259003187, "flos": 21104988791040.0, "grad_norm": 2.1259754338555252, "language_loss": 0.86351597, "learning_rate": 3.9465346134793905e-06, "loss": 0.88639379, "num_input_tokens_seen": 18017955, "step": 845, "time_per_iteration": 2.730617046356201 }, { "auxiliary_loss_clip": 0.01234075, "auxiliary_loss_mlp": 0.01046149, "balance_loss_clip": 1.06686091, "balance_loss_mlp": 1.03401399, "epoch": 0.10172548548067095, "flos": 17712687513600.0, "grad_norm": 2.28148459623744, "language_loss": 0.79900163, "learning_rate": 3.9463555552827335e-06, "loss": 0.82180393, "num_input_tokens_seen": 18035125, "step": 846, "time_per_iteration": 2.81539249420166 }, { "auxiliary_loss_clip": 0.01238433, "auxiliary_loss_mlp": 0.01045962, "balance_loss_clip": 1.06828403, "balance_loss_mlp": 1.03461337, "epoch": 0.10184572837131005, "flos": 21104845136640.0, "grad_norm": 2.459827513389045, "language_loss": 0.86304039, "learning_rate": 3.946176201825487e-06, "loss": 0.88588434, "num_input_tokens_seen": 18053160, "step": 847, "time_per_iteration": 2.739212989807129 }, { "auxiliary_loss_clip": 0.01244061, "auxiliary_loss_mlp": 0.01040081, "balance_loss_clip": 1.06901765, "balance_loss_mlp": 1.02735496, "epoch": 0.10196597126194913, "flos": 26067591918720.0, "grad_norm": 1.9786171025554258, "language_loss": 0.83447087, "learning_rate": 3.9459965531348575e-06, "loss": 0.85731232, "num_input_tokens_seen": 18072815, "step": 848, "time_per_iteration": 2.8938515186309814 }, { "auxiliary_loss_clip": 0.01241407, "auxiliary_loss_mlp": 0.01061126, "balance_loss_clip": 1.06593859, "balance_loss_mlp": 1.03649914, "epoch": 0.10208621415258823, "flos": 29314634595840.0, "grad_norm": 2.3311986071444375, "language_loss": 0.85705662, "learning_rate": 3.945816609238098e-06, "loss": 0.88008195, "num_input_tokens_seen": 18092225, "step": 849, "time_per_iteration": 2.90035080909729 }, { "auxiliary_loss_clip": 0.01228979, "auxiliary_loss_mlp": 0.01042085, "balance_loss_clip": 1.0710237, "balance_loss_mlp": 1.03005075, "epoch": 0.10220645704322733, "flos": 23805794367360.0, "grad_norm": 1.9097428104186707, "language_loss": 0.8511681, "learning_rate": 3.945636370162507e-06, "loss": 0.87387872, "num_input_tokens_seen": 18112335, "step": 850, "time_per_iteration": 2.891876459121704 }, { "auxiliary_loss_clip": 0.01242264, "auxiliary_loss_mlp": 0.01043076, "balance_loss_clip": 1.06549239, "balance_loss_mlp": 1.03205466, "epoch": 0.10232669993386641, "flos": 23218546913280.0, "grad_norm": 1.8695629518844519, "language_loss": 0.79298592, "learning_rate": 3.945455835935425e-06, "loss": 0.81583929, "num_input_tokens_seen": 18131520, "step": 851, "time_per_iteration": 3.8366963863372803 }, { "auxiliary_loss_clip": 0.01245446, "auxiliary_loss_mlp": 0.01036466, "balance_loss_clip": 1.07019353, "balance_loss_mlp": 1.02493262, "epoch": 0.1024469428245055, "flos": 22922929981440.0, "grad_norm": 3.335893204513816, "language_loss": 0.75264418, "learning_rate": 3.94527500658424e-06, "loss": 0.77546328, "num_input_tokens_seen": 18149185, "step": 852, "time_per_iteration": 4.695363521575928 }, { "auxiliary_loss_clip": 0.01233409, "auxiliary_loss_mlp": 0.0104047, "balance_loss_clip": 1.0688858, "balance_loss_mlp": 1.02887726, "epoch": 0.10256718571514459, "flos": 31359495957120.0, "grad_norm": 2.612659776823827, "language_loss": 0.81229055, "learning_rate": 3.945093882136382e-06, "loss": 0.83502942, "num_input_tokens_seen": 18172960, "step": 853, "time_per_iteration": 2.88545298576355 }, { "auxiliary_loss_clip": 0.01246408, "auxiliary_loss_mlp": 0.01051855, "balance_loss_clip": 1.0701766, "balance_loss_mlp": 1.02829754, "epoch": 0.10268742860578368, "flos": 23474877344640.0, "grad_norm": 1.9714610155836931, "language_loss": 0.84410405, "learning_rate": 3.944912462619329e-06, "loss": 0.86708665, "num_input_tokens_seen": 18191925, "step": 854, "time_per_iteration": 3.764402151107788 }, { "auxiliary_loss_clip": 0.01250034, "auxiliary_loss_mlp": 0.01051203, "balance_loss_clip": 1.07092106, "balance_loss_mlp": 1.03913903, "epoch": 0.10280767149642277, "flos": 25520313323520.0, "grad_norm": 2.4388229598641327, "language_loss": 0.80965948, "learning_rate": 3.9447307480606025e-06, "loss": 0.83267188, "num_input_tokens_seen": 18212010, "step": 855, "time_per_iteration": 2.8496358394622803 }, { "auxiliary_loss_clip": 0.01236909, "auxiliary_loss_mlp": 0.0104355, "balance_loss_clip": 1.06821883, "balance_loss_mlp": 1.03098512, "epoch": 0.10292791438706186, "flos": 17347691462400.0, "grad_norm": 2.2403792240227975, "language_loss": 0.89941585, "learning_rate": 3.944548738487767e-06, "loss": 0.92222047, "num_input_tokens_seen": 18229525, "step": 856, "time_per_iteration": 2.7451353073120117 }, { "auxiliary_loss_clip": 0.01256298, "auxiliary_loss_mlp": 0.01047805, "balance_loss_clip": 1.06896138, "balance_loss_mlp": 1.03524065, "epoch": 0.10304815727770096, "flos": 27052693390080.0, "grad_norm": 2.093534751268398, "language_loss": 0.90503407, "learning_rate": 3.944366433928434e-06, "loss": 0.92807519, "num_input_tokens_seen": 18249505, "step": 857, "time_per_iteration": 2.752077579498291 }, { "auxiliary_loss_clip": 0.01236633, "auxiliary_loss_mlp": 0.01038586, "balance_loss_clip": 1.06720424, "balance_loss_mlp": 1.02656972, "epoch": 0.10316840016834004, "flos": 22782591544320.0, "grad_norm": 4.38803268125093, "language_loss": 0.83577853, "learning_rate": 3.9441838344102594e-06, "loss": 0.85853076, "num_input_tokens_seen": 18269230, "step": 858, "time_per_iteration": 2.7536935806274414 }, { "auxiliary_loss_clip": 0.01248618, "auxiliary_loss_mlp": 0.01042148, "balance_loss_clip": 1.06922853, "balance_loss_mlp": 1.029917, "epoch": 0.10328864305897914, "flos": 20704584908160.0, "grad_norm": 3.225542262215515, "language_loss": 0.67157185, "learning_rate": 3.944000939960943e-06, "loss": 0.69447947, "num_input_tokens_seen": 18287955, "step": 859, "time_per_iteration": 2.792832851409912 }, { "auxiliary_loss_clip": 0.01251302, "auxiliary_loss_mlp": 0.01045446, "balance_loss_clip": 1.06759584, "balance_loss_mlp": 1.0341568, "epoch": 0.10340888594961822, "flos": 28478814048000.0, "grad_norm": 1.655953620111518, "language_loss": 0.79878604, "learning_rate": 3.943817750608229e-06, "loss": 0.8217535, "num_input_tokens_seen": 18310505, "step": 860, "time_per_iteration": 2.875016212463379 }, { "auxiliary_loss_clip": 0.01252718, "auxiliary_loss_mlp": 0.01046753, "balance_loss_clip": 1.07073498, "balance_loss_mlp": 1.03571391, "epoch": 0.10352912884025732, "flos": 13370333460480.0, "grad_norm": 2.392774712016236, "language_loss": 0.82649732, "learning_rate": 3.943634266379908e-06, "loss": 0.84949201, "num_input_tokens_seen": 18327400, "step": 861, "time_per_iteration": 2.817561626434326 }, { "auxiliary_loss_clip": 0.01252924, "auxiliary_loss_mlp": 0.01035749, "balance_loss_clip": 1.0701201, "balance_loss_mlp": 1.02389383, "epoch": 0.10364937173089642, "flos": 25558558329600.0, "grad_norm": 1.6579168757987237, "language_loss": 0.84772974, "learning_rate": 3.943450487303815e-06, "loss": 0.87061644, "num_input_tokens_seen": 18347895, "step": 862, "time_per_iteration": 2.703089714050293 }, { "auxiliary_loss_clip": 0.01247614, "auxiliary_loss_mlp": 0.01049517, "balance_loss_clip": 1.07102311, "balance_loss_mlp": 1.03711307, "epoch": 0.1037696146215355, "flos": 21215486004480.0, "grad_norm": 2.851865805074018, "language_loss": 0.85379612, "learning_rate": 3.943266413407827e-06, "loss": 0.8767674, "num_input_tokens_seen": 18367170, "step": 863, "time_per_iteration": 2.7073943614959717 }, { "auxiliary_loss_clip": 0.01250175, "auxiliary_loss_mlp": 0.01042843, "balance_loss_clip": 1.06876564, "balance_loss_mlp": 1.03091002, "epoch": 0.1038898575121746, "flos": 25807382818560.0, "grad_norm": 2.1822153104397555, "language_loss": 0.85394251, "learning_rate": 3.94308204471987e-06, "loss": 0.87687272, "num_input_tokens_seen": 18386185, "step": 864, "time_per_iteration": 2.8160269260406494 }, { "auxiliary_loss_clip": 0.01241498, "auxiliary_loss_mlp": 0.0104379, "balance_loss_clip": 1.06872725, "balance_loss_mlp": 1.03217924, "epoch": 0.10401010040281368, "flos": 19062425900160.0, "grad_norm": 3.0248195337441635, "language_loss": 0.74552232, "learning_rate": 3.942897381267912e-06, "loss": 0.76837516, "num_input_tokens_seen": 18402550, "step": 865, "time_per_iteration": 2.84598445892334 }, { "auxiliary_loss_clip": 0.0124947, "auxiliary_loss_mlp": 0.01043263, "balance_loss_clip": 1.06942081, "balance_loss_mlp": 1.03097868, "epoch": 0.10413034329345278, "flos": 16355119962240.0, "grad_norm": 3.017245722205518, "language_loss": 0.65918237, "learning_rate": 3.942712423079965e-06, "loss": 0.68210971, "num_input_tokens_seen": 18418940, "step": 866, "time_per_iteration": 2.9108242988586426 }, { "auxiliary_loss_clip": 0.01227543, "auxiliary_loss_mlp": 0.01036551, "balance_loss_clip": 1.06685328, "balance_loss_mlp": 1.02535129, "epoch": 0.10425058618409186, "flos": 17236511890560.0, "grad_norm": 2.589674443315421, "language_loss": 0.89737576, "learning_rate": 3.942527170184088e-06, "loss": 0.92001677, "num_input_tokens_seen": 18435560, "step": 867, "time_per_iteration": 2.7852203845977783 }, { "auxiliary_loss_clip": 0.01254072, "auxiliary_loss_mlp": 0.01047334, "balance_loss_clip": 1.06874239, "balance_loss_mlp": 1.0350256, "epoch": 0.10437082907473096, "flos": 17967365919360.0, "grad_norm": 2.2469364198074113, "language_loss": 0.77460843, "learning_rate": 3.942341622608385e-06, "loss": 0.7976225, "num_input_tokens_seen": 18452590, "step": 868, "time_per_iteration": 2.7520227432250977 }, { "auxiliary_loss_clip": 0.01248705, "auxiliary_loss_mlp": 0.0104018, "balance_loss_clip": 1.07291842, "balance_loss_mlp": 1.02890277, "epoch": 0.10449107196537005, "flos": 36283315374720.0, "grad_norm": 3.2717076028275125, "language_loss": 0.77644205, "learning_rate": 3.942155780381001e-06, "loss": 0.79933083, "num_input_tokens_seen": 18476325, "step": 869, "time_per_iteration": 2.9024930000305176 }, { "auxiliary_loss_clip": 0.01249482, "auxiliary_loss_mlp": 0.01042742, "balance_loss_clip": 1.0696857, "balance_loss_mlp": 1.03092265, "epoch": 0.10461131485600914, "flos": 23802095266560.0, "grad_norm": 2.4107502834328405, "language_loss": 0.75930965, "learning_rate": 3.94196964353013e-06, "loss": 0.78223193, "num_input_tokens_seen": 18495775, "step": 870, "time_per_iteration": 2.856259822845459 }, { "auxiliary_loss_clip": 0.01242809, "auxiliary_loss_mlp": 0.0104892, "balance_loss_clip": 1.06878734, "balance_loss_mlp": 1.02505088, "epoch": 0.10473155774664823, "flos": 18405476104320.0, "grad_norm": 2.0482276468813025, "language_loss": 0.80624264, "learning_rate": 3.941783212084008e-06, "loss": 0.82915986, "num_input_tokens_seen": 18513530, "step": 871, "time_per_iteration": 2.8009486198425293 }, { "auxiliary_loss_clip": 0.01231001, "auxiliary_loss_mlp": 0.01042893, "balance_loss_clip": 1.06931949, "balance_loss_mlp": 1.03150272, "epoch": 0.10485180063728732, "flos": 25592637358080.0, "grad_norm": 2.616039742888629, "language_loss": 0.79362321, "learning_rate": 3.941596486070916e-06, "loss": 0.8163622, "num_input_tokens_seen": 18531575, "step": 872, "time_per_iteration": 2.746345043182373 }, { "auxiliary_loss_clip": 0.01226654, "auxiliary_loss_mlp": 0.01044912, "balance_loss_clip": 1.06905532, "balance_loss_mlp": 1.03217483, "epoch": 0.10497204352792641, "flos": 27088747666560.0, "grad_norm": 3.0379413140710647, "language_loss": 0.58218598, "learning_rate": 3.941409465519182e-06, "loss": 0.60490167, "num_input_tokens_seen": 18552100, "step": 873, "time_per_iteration": 2.9402718544006348 }, { "auxiliary_loss_clip": 0.01239019, "auxiliary_loss_mlp": 0.01040792, "balance_loss_clip": 1.06690681, "balance_loss_mlp": 1.02922249, "epoch": 0.10509228641856551, "flos": 32858479353600.0, "grad_norm": 1.941465896945166, "language_loss": 0.85523355, "learning_rate": 3.941222150457176e-06, "loss": 0.87803161, "num_input_tokens_seen": 18575355, "step": 874, "time_per_iteration": 2.8769404888153076 }, { "auxiliary_loss_clip": 0.01251054, "auxiliary_loss_mlp": 0.01041225, "balance_loss_clip": 1.06655121, "balance_loss_mlp": 1.03040111, "epoch": 0.10521252930920459, "flos": 14319165173760.0, "grad_norm": 3.189793348271846, "language_loss": 0.71276367, "learning_rate": 3.941034540913311e-06, "loss": 0.73568642, "num_input_tokens_seen": 18592885, "step": 875, "time_per_iteration": 2.757279634475708 }, { "auxiliary_loss_clip": 0.01249361, "auxiliary_loss_mlp": 0.01058262, "balance_loss_clip": 1.06966615, "balance_loss_mlp": 1.03286123, "epoch": 0.10533277219984369, "flos": 21687028773120.0, "grad_norm": 1.642727139522823, "language_loss": 0.82547164, "learning_rate": 3.940846636916051e-06, "loss": 0.84854788, "num_input_tokens_seen": 18612920, "step": 876, "time_per_iteration": 2.803462028503418 }, { "auxiliary_loss_clip": 0.0124035, "auxiliary_loss_mlp": 0.01038283, "balance_loss_clip": 1.07080054, "balance_loss_mlp": 1.02645707, "epoch": 0.10545301509048277, "flos": 22269787027200.0, "grad_norm": 2.09472342185113, "language_loss": 0.86414444, "learning_rate": 3.940658438493899e-06, "loss": 0.88693082, "num_input_tokens_seen": 18630765, "step": 877, "time_per_iteration": 3.8218274116516113 }, { "auxiliary_loss_clip": 0.0125504, "auxiliary_loss_mlp": 0.01035914, "balance_loss_clip": 1.06756425, "balance_loss_mlp": 1.02368271, "epoch": 0.10557325798112187, "flos": 22199725549440.0, "grad_norm": 2.879703119857944, "language_loss": 0.76350802, "learning_rate": 3.940469945675405e-06, "loss": 0.78641754, "num_input_tokens_seen": 18649150, "step": 878, "time_per_iteration": 3.7670934200286865 }, { "auxiliary_loss_clip": 0.01215241, "auxiliary_loss_mlp": 0.0103898, "balance_loss_clip": 1.06818843, "balance_loss_mlp": 1.02836394, "epoch": 0.10569350087176095, "flos": 25775889569280.0, "grad_norm": 1.9301066201558843, "language_loss": 0.91299111, "learning_rate": 3.940281158489163e-06, "loss": 0.93553334, "num_input_tokens_seen": 18668380, "step": 879, "time_per_iteration": 3.886686086654663 }, { "auxiliary_loss_clip": 0.01231734, "auxiliary_loss_mlp": 0.01040454, "balance_loss_clip": 1.07014239, "balance_loss_mlp": 1.02900386, "epoch": 0.10581374376240005, "flos": 17311385790720.0, "grad_norm": 2.4167583493466136, "language_loss": 0.82878339, "learning_rate": 3.940092076963812e-06, "loss": 0.85150528, "num_input_tokens_seen": 18685875, "step": 880, "time_per_iteration": 2.8120815753936768 }, { "auxiliary_loss_clip": 0.01250857, "auxiliary_loss_mlp": 0.01043967, "balance_loss_clip": 1.07347131, "balance_loss_mlp": 1.03209388, "epoch": 0.10593398665303914, "flos": 34349454017280.0, "grad_norm": 2.432881584740356, "language_loss": 0.7874366, "learning_rate": 3.9399027011280355e-06, "loss": 0.81038493, "num_input_tokens_seen": 18707970, "step": 881, "time_per_iteration": 3.792334794998169 }, { "auxiliary_loss_clip": 0.01245001, "auxiliary_loss_mlp": 0.01041284, "balance_loss_clip": 1.07213974, "balance_loss_mlp": 1.0294112, "epoch": 0.10605422954367823, "flos": 23257977068160.0, "grad_norm": 2.353524464244968, "language_loss": 0.77310312, "learning_rate": 3.939713031010561e-06, "loss": 0.79596591, "num_input_tokens_seen": 18726335, "step": 882, "time_per_iteration": 2.898613452911377 }, { "auxiliary_loss_clip": 0.01238513, "auxiliary_loss_mlp": 0.01040501, "balance_loss_clip": 1.06860054, "balance_loss_mlp": 1.02904522, "epoch": 0.10617447243431732, "flos": 22820118278400.0, "grad_norm": 2.1306371873015415, "language_loss": 0.7783004, "learning_rate": 3.939523066640163e-06, "loss": 0.8010906, "num_input_tokens_seen": 18745230, "step": 883, "time_per_iteration": 2.8556156158447266 }, { "auxiliary_loss_clip": 0.01247341, "auxiliary_loss_mlp": 0.01039041, "balance_loss_clip": 1.06786108, "balance_loss_mlp": 1.02757955, "epoch": 0.10629471532495641, "flos": 24386577373440.0, "grad_norm": 2.3675030370694885, "language_loss": 0.81371266, "learning_rate": 3.939332808045657e-06, "loss": 0.83657646, "num_input_tokens_seen": 18764880, "step": 884, "time_per_iteration": 2.8031904697418213 }, { "auxiliary_loss_clip": 0.01236985, "auxiliary_loss_mlp": 0.0104049, "balance_loss_clip": 1.06972897, "balance_loss_mlp": 1.02887321, "epoch": 0.1064149582155955, "flos": 21105491581440.0, "grad_norm": 1.8525688092182069, "language_loss": 0.84383756, "learning_rate": 3.939142255255906e-06, "loss": 0.86661232, "num_input_tokens_seen": 18785765, "step": 885, "time_per_iteration": 2.8282570838928223 }, { "auxiliary_loss_clip": 0.01244928, "auxiliary_loss_mlp": 0.01045261, "balance_loss_clip": 1.06906986, "balance_loss_mlp": 1.03333402, "epoch": 0.1065352011062346, "flos": 20702035042560.0, "grad_norm": 2.023270994144896, "language_loss": 0.86681056, "learning_rate": 3.938951408299817e-06, "loss": 0.88971251, "num_input_tokens_seen": 18804605, "step": 886, "time_per_iteration": 2.7579290866851807 }, { "auxiliary_loss_clip": 0.01141824, "auxiliary_loss_mlp": 0.01015637, "balance_loss_clip": 1.03958607, "balance_loss_mlp": 1.01239479, "epoch": 0.10665544399687368, "flos": 62659632689280.0, "grad_norm": 0.8027992557964336, "language_loss": 0.54339242, "learning_rate": 3.938760267206342e-06, "loss": 0.56496704, "num_input_tokens_seen": 18866425, "step": 887, "time_per_iteration": 3.257416248321533 }, { "auxiliary_loss_clip": 0.01250862, "auxiliary_loss_mlp": 0.01037744, "balance_loss_clip": 1.06960249, "balance_loss_mlp": 1.02591836, "epoch": 0.10677568688751278, "flos": 26140382830080.0, "grad_norm": 2.14454432306022, "language_loss": 0.78621042, "learning_rate": 3.938568832004475e-06, "loss": 0.80909646, "num_input_tokens_seen": 18885130, "step": 888, "time_per_iteration": 2.756239414215088 }, { "auxiliary_loss_clip": 0.01231687, "auxiliary_loss_mlp": 0.01048419, "balance_loss_clip": 1.06832933, "balance_loss_mlp": 1.03725529, "epoch": 0.10689592977815186, "flos": 12786533712000.0, "grad_norm": 2.264783721247642, "language_loss": 0.75501287, "learning_rate": 3.938377102723257e-06, "loss": 0.77781391, "num_input_tokens_seen": 18902265, "step": 889, "time_per_iteration": 2.7613821029663086 }, { "auxiliary_loss_clip": 0.01226863, "auxiliary_loss_mlp": 0.01049041, "balance_loss_clip": 1.07420945, "balance_loss_mlp": 1.03668499, "epoch": 0.10701617266879096, "flos": 22126683242880.0, "grad_norm": 2.25846482064921, "language_loss": 0.83309484, "learning_rate": 3.938185079391774e-06, "loss": 0.85585392, "num_input_tokens_seen": 18919310, "step": 890, "time_per_iteration": 2.8766226768493652 }, { "auxiliary_loss_clip": 0.0125288, "auxiliary_loss_mlp": 0.0104487, "balance_loss_clip": 1.06868601, "balance_loss_mlp": 1.03305638, "epoch": 0.10713641555943004, "flos": 19745625559680.0, "grad_norm": 2.5787827288500194, "language_loss": 1.05779052, "learning_rate": 3.937992762039157e-06, "loss": 1.08076811, "num_input_tokens_seen": 18932635, "step": 891, "time_per_iteration": 2.8548648357391357 }, { "auxiliary_loss_clip": 0.0124374, "auxiliary_loss_mlp": 0.0104974, "balance_loss_clip": 1.0693866, "balance_loss_mlp": 1.03855848, "epoch": 0.10725665845006914, "flos": 23952992302080.0, "grad_norm": 2.183523706665765, "language_loss": 0.80458963, "learning_rate": 3.937800150694577e-06, "loss": 0.82752436, "num_input_tokens_seen": 18953810, "step": 892, "time_per_iteration": 2.8853511810302734 }, { "auxiliary_loss_clip": 0.0123368, "auxiliary_loss_mlp": 0.01042856, "balance_loss_clip": 1.06863952, "balance_loss_mlp": 1.03109622, "epoch": 0.10737690134070824, "flos": 18551704371840.0, "grad_norm": 2.17396115555653, "language_loss": 0.76426721, "learning_rate": 3.937607245387255e-06, "loss": 0.7870326, "num_input_tokens_seen": 18973175, "step": 893, "time_per_iteration": 2.9322643280029297 }, { "auxiliary_loss_clip": 0.01251888, "auxiliary_loss_mlp": 0.01036354, "balance_loss_clip": 1.07144976, "balance_loss_mlp": 1.02597117, "epoch": 0.10749714423134732, "flos": 22707609903360.0, "grad_norm": 3.121399573413518, "language_loss": 0.72342205, "learning_rate": 3.937414046146455e-06, "loss": 0.74630451, "num_input_tokens_seen": 18991130, "step": 894, "time_per_iteration": 2.836453914642334 }, { "auxiliary_loss_clip": 0.01256007, "auxiliary_loss_mlp": 0.0104124, "balance_loss_clip": 1.07349515, "balance_loss_mlp": 1.03005254, "epoch": 0.10761738712198642, "flos": 21106066199040.0, "grad_norm": 2.434220471202079, "language_loss": 0.75261736, "learning_rate": 3.9372205530014845e-06, "loss": 0.7755897, "num_input_tokens_seen": 19009610, "step": 895, "time_per_iteration": 2.8820345401763916 }, { "auxiliary_loss_clip": 0.01252427, "auxiliary_loss_mlp": 0.01049853, "balance_loss_clip": 1.06774426, "balance_loss_mlp": 1.03727615, "epoch": 0.1077376300126255, "flos": 23766723348480.0, "grad_norm": 2.2573404678809834, "language_loss": 0.71259511, "learning_rate": 3.937026765981696e-06, "loss": 0.73561782, "num_input_tokens_seen": 19029680, "step": 896, "time_per_iteration": 2.7680675983428955 }, { "auxiliary_loss_clip": 0.01244967, "auxiliary_loss_mlp": 0.01039007, "balance_loss_clip": 1.07425487, "balance_loss_mlp": 1.02792621, "epoch": 0.1078578729032646, "flos": 20919581763840.0, "grad_norm": 2.0959272028296314, "language_loss": 0.79724121, "learning_rate": 3.936832685116488e-06, "loss": 0.820081, "num_input_tokens_seen": 19047775, "step": 897, "time_per_iteration": 2.8523614406585693 }, { "auxiliary_loss_clip": 0.01252353, "auxiliary_loss_mlp": 0.01049575, "balance_loss_clip": 1.06957901, "balance_loss_mlp": 1.0378325, "epoch": 0.10797811579390369, "flos": 14829886702080.0, "grad_norm": 3.3778362333228653, "language_loss": 0.89904898, "learning_rate": 3.936638310435301e-06, "loss": 0.92206824, "num_input_tokens_seen": 19065640, "step": 898, "time_per_iteration": 2.8318896293640137 }, { "auxiliary_loss_clip": 0.01252783, "auxiliary_loss_mlp": 0.01044588, "balance_loss_clip": 1.07090902, "balance_loss_mlp": 1.03318596, "epoch": 0.10809835868454278, "flos": 19536985411200.0, "grad_norm": 2.2780080549515636, "language_loss": 0.81397539, "learning_rate": 3.936443641967623e-06, "loss": 0.83694911, "num_input_tokens_seen": 19084470, "step": 899, "time_per_iteration": 2.711665153503418 }, { "auxiliary_loss_clip": 0.01246049, "auxiliary_loss_mlp": 0.01034042, "balance_loss_clip": 1.07088399, "balance_loss_mlp": 1.02265787, "epoch": 0.10821860157518187, "flos": 18442320480000.0, "grad_norm": 3.0058050791018687, "language_loss": 0.83351529, "learning_rate": 3.936248679742983e-06, "loss": 0.85631621, "num_input_tokens_seen": 19102965, "step": 900, "time_per_iteration": 2.7346572875976562 }, { "auxiliary_loss_clip": 0.01134823, "auxiliary_loss_mlp": 0.01004484, "balance_loss_clip": 1.03182006, "balance_loss_mlp": 1.00169492, "epoch": 0.10833884446582095, "flos": 49359468447360.0, "grad_norm": 1.059400274527463, "language_loss": 0.70118082, "learning_rate": 3.936053423790959e-06, "loss": 0.72257394, "num_input_tokens_seen": 19151285, "step": 901, "time_per_iteration": 3.086912155151367 }, { "auxiliary_loss_clip": 0.01250997, "auxiliary_loss_mlp": 0.01046443, "balance_loss_clip": 1.06862164, "balance_loss_mlp": 1.03459358, "epoch": 0.10845908735646005, "flos": 20411912891520.0, "grad_norm": 1.9833171695973006, "language_loss": 0.77389145, "learning_rate": 3.935857874141168e-06, "loss": 0.79686582, "num_input_tokens_seen": 19170120, "step": 902, "time_per_iteration": 2.7621817588806152 }, { "auxiliary_loss_clip": 0.01238162, "auxiliary_loss_mlp": 0.01052065, "balance_loss_clip": 1.07051444, "balance_loss_mlp": 1.04072237, "epoch": 0.10857933024709913, "flos": 14027750133120.0, "grad_norm": 2.3464573343182327, "language_loss": 0.83471411, "learning_rate": 3.935662030823279e-06, "loss": 0.85761636, "num_input_tokens_seen": 19186305, "step": 903, "time_per_iteration": 4.788660764694214 }, { "auxiliary_loss_clip": 0.01249225, "auxiliary_loss_mlp": 0.01053466, "balance_loss_clip": 1.06806684, "balance_loss_mlp": 1.04130673, "epoch": 0.10869957313773823, "flos": 13369004657280.0, "grad_norm": 2.5853186607895045, "language_loss": 0.7229706, "learning_rate": 3.935465893866998e-06, "loss": 0.74599755, "num_input_tokens_seen": 19204530, "step": 904, "time_per_iteration": 2.8148622512817383 }, { "auxiliary_loss_clip": 0.0124126, "auxiliary_loss_mlp": 0.01041631, "balance_loss_clip": 1.070768, "balance_loss_mlp": 1.03002024, "epoch": 0.10881981602837733, "flos": 25807095509760.0, "grad_norm": 2.9546657266962795, "language_loss": 0.80141962, "learning_rate": 3.935269463302079e-06, "loss": 0.82424855, "num_input_tokens_seen": 19222735, "step": 905, "time_per_iteration": 3.974177837371826 }, { "auxiliary_loss_clip": 0.01255155, "auxiliary_loss_mlp": 0.01042962, "balance_loss_clip": 1.07243824, "balance_loss_mlp": 1.03109443, "epoch": 0.10894005891901641, "flos": 20777555387520.0, "grad_norm": 1.806614628394978, "language_loss": 0.76735568, "learning_rate": 3.935072739158322e-06, "loss": 0.79033685, "num_input_tokens_seen": 19242445, "step": 906, "time_per_iteration": 3.9312164783477783 }, { "auxiliary_loss_clip": 0.01243306, "auxiliary_loss_mlp": 0.01046846, "balance_loss_clip": 1.06905866, "balance_loss_mlp": 1.036111, "epoch": 0.10906030180965551, "flos": 26649883296000.0, "grad_norm": 1.5356611005744973, "language_loss": 0.79763758, "learning_rate": 3.934875721465569e-06, "loss": 0.82053912, "num_input_tokens_seen": 19262865, "step": 907, "time_per_iteration": 2.9026734828948975 }, { "auxiliary_loss_clip": 0.01243444, "auxiliary_loss_mlp": 0.01041135, "balance_loss_clip": 1.07084835, "balance_loss_mlp": 1.02926207, "epoch": 0.10918054470029459, "flos": 36534402420480.0, "grad_norm": 2.6976831544129287, "language_loss": 0.71593285, "learning_rate": 3.9346784102537076e-06, "loss": 0.73877859, "num_input_tokens_seen": 19285000, "step": 908, "time_per_iteration": 2.9102673530578613 }, { "auxiliary_loss_clip": 0.01253088, "auxiliary_loss_mlp": 0.01044208, "balance_loss_clip": 1.07014596, "balance_loss_mlp": 1.03309202, "epoch": 0.10930078759093369, "flos": 21762549118080.0, "grad_norm": 2.081906779556614, "language_loss": 0.78465307, "learning_rate": 3.934480805552669e-06, "loss": 0.80762601, "num_input_tokens_seen": 19306010, "step": 909, "time_per_iteration": 2.847489356994629 }, { "auxiliary_loss_clip": 0.01249254, "auxiliary_loss_mlp": 0.01059049, "balance_loss_clip": 1.06759942, "balance_loss_mlp": 1.03403282, "epoch": 0.10942103048157277, "flos": 22601781457920.0, "grad_norm": 1.9046067078491178, "language_loss": 0.88079494, "learning_rate": 3.93428290739243e-06, "loss": 0.90387797, "num_input_tokens_seen": 19325380, "step": 910, "time_per_iteration": 2.751296281814575 }, { "auxiliary_loss_clip": 0.01247701, "auxiliary_loss_mlp": 0.01044047, "balance_loss_clip": 1.07188475, "balance_loss_mlp": 1.03270435, "epoch": 0.10954127337221187, "flos": 15045781397760.0, "grad_norm": 2.8089341041444116, "language_loss": 0.79923761, "learning_rate": 3.9340847158030125e-06, "loss": 0.822155, "num_input_tokens_seen": 19338960, "step": 911, "time_per_iteration": 2.7907192707061768 }, { "auxiliary_loss_clip": 0.01248401, "auxiliary_loss_mlp": 0.01048059, "balance_loss_clip": 1.06807017, "balance_loss_mlp": 1.03536892, "epoch": 0.10966151626285096, "flos": 21650974496640.0, "grad_norm": 1.8598266612337018, "language_loss": 0.75301456, "learning_rate": 3.9338862308144814e-06, "loss": 0.77597916, "num_input_tokens_seen": 19357780, "step": 912, "time_per_iteration": 2.762669563293457 }, { "auxiliary_loss_clip": 0.01251805, "auxiliary_loss_mlp": 0.01039834, "balance_loss_clip": 1.07081234, "balance_loss_mlp": 1.02881289, "epoch": 0.10978175915349005, "flos": 20121359777280.0, "grad_norm": 1.6826209328036834, "language_loss": 0.84747589, "learning_rate": 3.933687452456946e-06, "loss": 0.87039232, "num_input_tokens_seen": 19377680, "step": 913, "time_per_iteration": 2.8181090354919434 }, { "auxiliary_loss_clip": 0.01242696, "auxiliary_loss_mlp": 0.01037805, "balance_loss_clip": 1.07057512, "balance_loss_mlp": 1.0252763, "epoch": 0.10990200204412914, "flos": 20412667077120.0, "grad_norm": 2.8040035718186407, "language_loss": 0.86306632, "learning_rate": 3.933488380760562e-06, "loss": 0.88587129, "num_input_tokens_seen": 19397040, "step": 914, "time_per_iteration": 2.844904661178589 }, { "auxiliary_loss_clip": 0.01252393, "auxiliary_loss_mlp": 0.01050972, "balance_loss_clip": 1.06989515, "balance_loss_mlp": 1.02532125, "epoch": 0.11002224493476823, "flos": 17530117660800.0, "grad_norm": 2.698145087040996, "language_loss": 0.87113321, "learning_rate": 3.9332890157555286e-06, "loss": 0.89416695, "num_input_tokens_seen": 19413975, "step": 915, "time_per_iteration": 2.898489475250244 }, { "auxiliary_loss_clip": 0.01249218, "auxiliary_loss_mlp": 0.01046533, "balance_loss_clip": 1.07132471, "balance_loss_mlp": 1.03474951, "epoch": 0.11014248782540732, "flos": 12203093099520.0, "grad_norm": 2.2062040196197934, "language_loss": 0.76085734, "learning_rate": 3.933089357472088e-06, "loss": 0.78381491, "num_input_tokens_seen": 19432005, "step": 916, "time_per_iteration": 2.867419481277466 }, { "auxiliary_loss_clip": 0.0125201, "auxiliary_loss_mlp": 0.01042189, "balance_loss_clip": 1.07144547, "balance_loss_mlp": 1.03048217, "epoch": 0.11026273071604642, "flos": 22382977760640.0, "grad_norm": 3.1994260348465366, "language_loss": 0.85675609, "learning_rate": 3.932889405940529e-06, "loss": 0.87969816, "num_input_tokens_seen": 19450100, "step": 917, "time_per_iteration": 2.7224156856536865 }, { "auxiliary_loss_clip": 0.01248624, "auxiliary_loss_mlp": 0.01045439, "balance_loss_clip": 1.07499743, "balance_loss_mlp": 1.03372681, "epoch": 0.1103829736066855, "flos": 19829046896640.0, "grad_norm": 2.446961832408989, "language_loss": 0.79996276, "learning_rate": 3.932689161191184e-06, "loss": 0.82290328, "num_input_tokens_seen": 19467805, "step": 918, "time_per_iteration": 2.7773001194000244 }, { "auxiliary_loss_clip": 0.01248631, "auxiliary_loss_mlp": 0.01034921, "balance_loss_clip": 1.06927347, "balance_loss_mlp": 1.02378666, "epoch": 0.1105032164973246, "flos": 22669616292480.0, "grad_norm": 2.0895885480511245, "language_loss": 0.88228893, "learning_rate": 3.93248862325443e-06, "loss": 0.90512443, "num_input_tokens_seen": 19486710, "step": 919, "time_per_iteration": 2.856862783432007 }, { "auxiliary_loss_clip": 0.01136367, "auxiliary_loss_mlp": 0.01014917, "balance_loss_clip": 1.02561593, "balance_loss_mlp": 1.01206779, "epoch": 0.11062345938796368, "flos": 66483507876480.0, "grad_norm": 0.9322762301310786, "language_loss": 0.64421469, "learning_rate": 3.932287792160688e-06, "loss": 0.66572756, "num_input_tokens_seen": 19545170, "step": 920, "time_per_iteration": 3.2364633083343506 }, { "auxiliary_loss_clip": 0.01254669, "auxiliary_loss_mlp": 0.01036471, "balance_loss_clip": 1.07215261, "balance_loss_mlp": 1.02484798, "epoch": 0.11074370227860278, "flos": 21907771804800.0, "grad_norm": 2.494520788747652, "language_loss": 0.80712891, "learning_rate": 3.932086667940424e-06, "loss": 0.83004028, "num_input_tokens_seen": 19561875, "step": 921, "time_per_iteration": 2.793415069580078 }, { "auxiliary_loss_clip": 0.01243174, "auxiliary_loss_mlp": 0.01058717, "balance_loss_clip": 1.06891716, "balance_loss_mlp": 1.03272069, "epoch": 0.11086394516924186, "flos": 28658115763200.0, "grad_norm": 1.9637054536345058, "language_loss": 0.81878734, "learning_rate": 3.93188525062415e-06, "loss": 0.84180629, "num_input_tokens_seen": 19582340, "step": 922, "time_per_iteration": 2.8975439071655273 }, { "auxiliary_loss_clip": 0.01252159, "auxiliary_loss_mlp": 0.01047844, "balance_loss_clip": 1.07202172, "balance_loss_mlp": 1.03578579, "epoch": 0.11098418805988096, "flos": 24535247765760.0, "grad_norm": 1.9298647840474945, "language_loss": 0.86125439, "learning_rate": 3.931683540242418e-06, "loss": 0.88425446, "num_input_tokens_seen": 19603405, "step": 923, "time_per_iteration": 2.8340024948120117 }, { "auxiliary_loss_clip": 0.01242989, "auxiliary_loss_mlp": 0.01039404, "balance_loss_clip": 1.06992126, "balance_loss_mlp": 1.02734625, "epoch": 0.11110443095052006, "flos": 22960384888320.0, "grad_norm": 2.602440352268072, "language_loss": 0.91060603, "learning_rate": 3.9314815368258295e-06, "loss": 0.93342996, "num_input_tokens_seen": 19619885, "step": 924, "time_per_iteration": 2.7708823680877686 }, { "auxiliary_loss_clip": 0.01249881, "auxiliary_loss_mlp": 0.01045076, "balance_loss_clip": 1.0713973, "balance_loss_mlp": 1.03406668, "epoch": 0.11122467384115914, "flos": 18950025265920.0, "grad_norm": 1.699706112605806, "language_loss": 0.78907746, "learning_rate": 3.9312792404050275e-06, "loss": 0.81202704, "num_input_tokens_seen": 19637940, "step": 925, "time_per_iteration": 2.8436927795410156 }, { "auxiliary_loss_clip": 0.01249983, "auxiliary_loss_mlp": 0.01042873, "balance_loss_clip": 1.07011306, "balance_loss_mlp": 1.03188157, "epoch": 0.11134491673179824, "flos": 25082957324160.0, "grad_norm": 3.3154576313472974, "language_loss": 0.77121699, "learning_rate": 3.9310766510107e-06, "loss": 0.79414546, "num_input_tokens_seen": 19657115, "step": 926, "time_per_iteration": 2.8522913455963135 }, { "auxiliary_loss_clip": 0.0124575, "auxiliary_loss_mlp": 0.01042713, "balance_loss_clip": 1.07034063, "balance_loss_mlp": 1.03022039, "epoch": 0.11146515962243732, "flos": 24499121662080.0, "grad_norm": 2.1383467625228767, "language_loss": 0.92135018, "learning_rate": 3.9308737686735806e-06, "loss": 0.94423485, "num_input_tokens_seen": 19677075, "step": 927, "time_per_iteration": 2.8536665439605713 }, { "auxiliary_loss_clip": 0.01254153, "auxiliary_loss_mlp": 0.01039953, "balance_loss_clip": 1.07151103, "balance_loss_mlp": 1.02812171, "epoch": 0.11158540251307641, "flos": 22343763087360.0, "grad_norm": 2.065196245582182, "language_loss": 0.83007324, "learning_rate": 3.9306705934244455e-06, "loss": 0.85301423, "num_input_tokens_seen": 19697155, "step": 928, "time_per_iteration": 2.7228927612304688 }, { "auxiliary_loss_clip": 0.01231977, "auxiliary_loss_mlp": 0.01041113, "balance_loss_clip": 1.06904519, "balance_loss_mlp": 1.02948415, "epoch": 0.11170564540371551, "flos": 19902304684800.0, "grad_norm": 1.8651536354149507, "language_loss": 0.88313007, "learning_rate": 3.930467125294116e-06, "loss": 0.90586102, "num_input_tokens_seen": 19716705, "step": 929, "time_per_iteration": 4.768381595611572 }, { "auxiliary_loss_clip": 0.01120911, "auxiliary_loss_mlp": 0.01007224, "balance_loss_clip": 1.02858377, "balance_loss_mlp": 1.00443399, "epoch": 0.1118258882943546, "flos": 64586239499520.0, "grad_norm": 0.9269180672902438, "language_loss": 0.60471392, "learning_rate": 3.930263364313458e-06, "loss": 0.62599528, "num_input_tokens_seen": 19767275, "step": 930, "time_per_iteration": 4.202829360961914 }, { "auxiliary_loss_clip": 0.01238497, "auxiliary_loss_mlp": 0.01048698, "balance_loss_clip": 1.07120061, "balance_loss_mlp": 1.03659213, "epoch": 0.11194613118499369, "flos": 17201965985280.0, "grad_norm": 1.89981533293993, "language_loss": 0.83035666, "learning_rate": 3.930059310513384e-06, "loss": 0.85322863, "num_input_tokens_seen": 19786315, "step": 931, "time_per_iteration": 2.875528335571289 }, { "auxiliary_loss_clip": 0.01230322, "auxiliary_loss_mlp": 0.01061872, "balance_loss_clip": 1.07236671, "balance_loss_mlp": 1.03274179, "epoch": 0.11206637407563277, "flos": 31863465728640.0, "grad_norm": 1.7289195121410603, "language_loss": 0.8379007, "learning_rate": 3.929854963924846e-06, "loss": 0.86082262, "num_input_tokens_seen": 19806580, "step": 932, "time_per_iteration": 3.8331427574157715 }, { "auxiliary_loss_clip": 0.0123897, "auxiliary_loss_mlp": 0.01044989, "balance_loss_clip": 1.06868458, "balance_loss_mlp": 1.03299046, "epoch": 0.11218661696627187, "flos": 21945621761280.0, "grad_norm": 4.321140242089656, "language_loss": 0.77262878, "learning_rate": 3.929650324578845e-06, "loss": 0.79546839, "num_input_tokens_seen": 19826045, "step": 933, "time_per_iteration": 2.7367255687713623 }, { "auxiliary_loss_clip": 0.01248335, "auxiliary_loss_mlp": 0.0105069, "balance_loss_clip": 1.07034731, "balance_loss_mlp": 1.03862023, "epoch": 0.11230685985691095, "flos": 25878198481920.0, "grad_norm": 8.2964576469411, "language_loss": 0.82202512, "learning_rate": 3.929445392506423e-06, "loss": 0.84501535, "num_input_tokens_seen": 19843985, "step": 934, "time_per_iteration": 2.8040616512298584 }, { "auxiliary_loss_clip": 0.01245265, "auxiliary_loss_mlp": 0.01046219, "balance_loss_clip": 1.0718044, "balance_loss_mlp": 1.03482306, "epoch": 0.11242710274755005, "flos": 22231506107520.0, "grad_norm": 2.094781409540121, "language_loss": 0.75866938, "learning_rate": 3.92924016773867e-06, "loss": 0.7815842, "num_input_tokens_seen": 19860480, "step": 935, "time_per_iteration": 2.725297451019287 }, { "auxiliary_loss_clip": 0.01241355, "auxiliary_loss_mlp": 0.01056293, "balance_loss_clip": 1.06956959, "balance_loss_mlp": 1.02799094, "epoch": 0.11254734563818915, "flos": 17712184723200.0, "grad_norm": 2.2269835295053686, "language_loss": 0.73666394, "learning_rate": 3.9290346503067175e-06, "loss": 0.75964034, "num_input_tokens_seen": 19877145, "step": 936, "time_per_iteration": 2.9228954315185547 }, { "auxiliary_loss_clip": 0.01251976, "auxiliary_loss_mlp": 0.0104128, "balance_loss_clip": 1.06969357, "balance_loss_mlp": 1.03036642, "epoch": 0.11266758852882823, "flos": 54930397334400.0, "grad_norm": 2.6139215292076607, "language_loss": 0.7867918, "learning_rate": 3.9288288402417415e-06, "loss": 0.80972445, "num_input_tokens_seen": 19903405, "step": 937, "time_per_iteration": 3.115772008895874 }, { "auxiliary_loss_clip": 0.01251488, "auxiliary_loss_mlp": 0.01037781, "balance_loss_clip": 1.07139897, "balance_loss_mlp": 1.02550292, "epoch": 0.11278783141946733, "flos": 18878132194560.0, "grad_norm": 2.8824024381074174, "language_loss": 0.70326543, "learning_rate": 3.928622737574964e-06, "loss": 0.72615808, "num_input_tokens_seen": 19918740, "step": 938, "time_per_iteration": 2.738718032836914 }, { "auxiliary_loss_clip": 0.0124597, "auxiliary_loss_mlp": 0.0103727, "balance_loss_clip": 1.07046914, "balance_loss_mlp": 1.02547991, "epoch": 0.11290807431010641, "flos": 26469252777600.0, "grad_norm": 2.809657987117608, "language_loss": 0.91156757, "learning_rate": 3.928416342337652e-06, "loss": 0.93439996, "num_input_tokens_seen": 19938475, "step": 939, "time_per_iteration": 2.8235549926757812 }, { "auxiliary_loss_clip": 0.01242083, "auxiliary_loss_mlp": 0.01048495, "balance_loss_clip": 1.07006955, "balance_loss_mlp": 1.03667498, "epoch": 0.1130283172007455, "flos": 22710590732160.0, "grad_norm": 1.8789148451253217, "language_loss": 0.82879412, "learning_rate": 3.928209654561113e-06, "loss": 0.85169983, "num_input_tokens_seen": 19959310, "step": 940, "time_per_iteration": 2.747596263885498 }, { "auxiliary_loss_clip": 0.01235416, "auxiliary_loss_mlp": 0.01037505, "balance_loss_clip": 1.06858182, "balance_loss_mlp": 1.0256083, "epoch": 0.1131485600913846, "flos": 23219911630080.0, "grad_norm": 1.9726117110754786, "language_loss": 0.81232756, "learning_rate": 3.928002674276703e-06, "loss": 0.83505678, "num_input_tokens_seen": 19978700, "step": 941, "time_per_iteration": 3.11974835395813 }, { "auxiliary_loss_clip": 0.01219149, "auxiliary_loss_mlp": 0.01039617, "balance_loss_clip": 1.06762719, "balance_loss_mlp": 1.02748704, "epoch": 0.11326880298202369, "flos": 14064271286400.0, "grad_norm": 3.7333700734404163, "language_loss": 0.75782174, "learning_rate": 3.92779540151582e-06, "loss": 0.78040934, "num_input_tokens_seen": 19995785, "step": 942, "time_per_iteration": 2.935824155807495 }, { "auxiliary_loss_clip": 0.01243694, "auxiliary_loss_mlp": 0.01038812, "balance_loss_clip": 1.0682317, "balance_loss_mlp": 1.02708793, "epoch": 0.11338904587266278, "flos": 16325386479360.0, "grad_norm": 1.8380723100637648, "language_loss": 0.85716319, "learning_rate": 3.927587836309907e-06, "loss": 0.87998825, "num_input_tokens_seen": 20013615, "step": 943, "time_per_iteration": 2.8071887493133545 }, { "auxiliary_loss_clip": 0.01236851, "auxiliary_loss_mlp": 0.0104144, "balance_loss_clip": 1.0691365, "balance_loss_mlp": 1.02953088, "epoch": 0.11350928876330187, "flos": 24426258923520.0, "grad_norm": 2.0838450948024225, "language_loss": 0.78246921, "learning_rate": 3.927379978690452e-06, "loss": 0.80525208, "num_input_tokens_seen": 20032880, "step": 944, "time_per_iteration": 2.7959675788879395 }, { "auxiliary_loss_clip": 0.01231711, "auxiliary_loss_mlp": 0.01037847, "balance_loss_clip": 1.06893444, "balance_loss_mlp": 1.02655172, "epoch": 0.11362953165394096, "flos": 24497074586880.0, "grad_norm": 2.3776566761457394, "language_loss": 0.87425458, "learning_rate": 3.927171828688987e-06, "loss": 0.89695013, "num_input_tokens_seen": 20052405, "step": 945, "time_per_iteration": 2.8315927982330322 }, { "auxiliary_loss_clip": 0.01252286, "auxiliary_loss_mlp": 0.01035977, "balance_loss_clip": 1.07168138, "balance_loss_mlp": 1.02384198, "epoch": 0.11374977454458005, "flos": 24060831909120.0, "grad_norm": 2.4152626147253744, "language_loss": 0.82471085, "learning_rate": 3.926963386337088e-06, "loss": 0.84759349, "num_input_tokens_seen": 20070635, "step": 946, "time_per_iteration": 2.8119490146636963 }, { "auxiliary_loss_clip": 0.0125679, "auxiliary_loss_mlp": 0.01038507, "balance_loss_clip": 1.06991374, "balance_loss_mlp": 1.02664018, "epoch": 0.11387001743521914, "flos": 39457638967680.0, "grad_norm": 2.171524735555395, "language_loss": 0.70211589, "learning_rate": 3.926754651666375e-06, "loss": 0.72506887, "num_input_tokens_seen": 20091195, "step": 947, "time_per_iteration": 2.9677817821502686 }, { "auxiliary_loss_clip": 0.01240398, "auxiliary_loss_mlp": 0.01048977, "balance_loss_clip": 1.07081771, "balance_loss_mlp": 1.03713918, "epoch": 0.11399026032585824, "flos": 25082454533760.0, "grad_norm": 3.2892817123405336, "language_loss": 0.78060538, "learning_rate": 3.926545624708513e-06, "loss": 0.8034991, "num_input_tokens_seen": 20110435, "step": 948, "time_per_iteration": 2.9191927909851074 }, { "auxiliary_loss_clip": 0.01236183, "auxiliary_loss_mlp": 0.01041617, "balance_loss_clip": 1.06761265, "balance_loss_mlp": 1.03076291, "epoch": 0.11411050321649732, "flos": 17961835224960.0, "grad_norm": 2.646619891600514, "language_loss": 0.85428071, "learning_rate": 3.926336305495213e-06, "loss": 0.87705874, "num_input_tokens_seen": 20128995, "step": 949, "time_per_iteration": 2.8020219802856445 }, { "auxiliary_loss_clip": 0.0122895, "auxiliary_loss_mlp": 0.01043899, "balance_loss_clip": 1.07221138, "balance_loss_mlp": 1.03295541, "epoch": 0.11423074610713642, "flos": 22455409536000.0, "grad_norm": 2.116072913923697, "language_loss": 0.89219528, "learning_rate": 3.926126694058226e-06, "loss": 0.91492373, "num_input_tokens_seen": 20148145, "step": 950, "time_per_iteration": 2.760474443435669 }, { "auxiliary_loss_clip": 0.01227338, "auxiliary_loss_mlp": 0.01039651, "balance_loss_clip": 1.06896067, "balance_loss_mlp": 1.02780759, "epoch": 0.1143509889977755, "flos": 19717687756800.0, "grad_norm": 1.492577169141709, "language_loss": 0.82135797, "learning_rate": 3.92591679042935e-06, "loss": 0.84402788, "num_input_tokens_seen": 20168035, "step": 951, "time_per_iteration": 2.7688140869140625 }, { "auxiliary_loss_clip": 0.01248778, "auxiliary_loss_mlp": 0.01041273, "balance_loss_clip": 1.07144785, "balance_loss_mlp": 1.02929282, "epoch": 0.1144712318884146, "flos": 19822869757440.0, "grad_norm": 1.7516340617099821, "language_loss": 0.82251048, "learning_rate": 3.92570659464043e-06, "loss": 0.84541106, "num_input_tokens_seen": 20186095, "step": 952, "time_per_iteration": 2.78524112701416 }, { "auxiliary_loss_clip": 0.01239007, "auxiliary_loss_mlp": 0.01058714, "balance_loss_clip": 1.06680989, "balance_loss_mlp": 1.03132927, "epoch": 0.1145914747790537, "flos": 14939198766720.0, "grad_norm": 1.8295328906072976, "language_loss": 0.7948662, "learning_rate": 3.925496106723349e-06, "loss": 0.81784344, "num_input_tokens_seen": 20203535, "step": 953, "time_per_iteration": 2.7968382835388184 }, { "auxiliary_loss_clip": 0.01247521, "auxiliary_loss_mlp": 0.01046275, "balance_loss_clip": 1.06901896, "balance_loss_mlp": 1.03484297, "epoch": 0.11471171766969278, "flos": 19865029345920.0, "grad_norm": 2.561394431605893, "language_loss": 0.84212261, "learning_rate": 3.9252853267100405e-06, "loss": 0.86506057, "num_input_tokens_seen": 20222780, "step": 954, "time_per_iteration": 2.768340826034546 }, { "auxiliary_loss_clip": 0.01236781, "auxiliary_loss_mlp": 0.01040421, "balance_loss_clip": 1.07301676, "balance_loss_mlp": 1.02860188, "epoch": 0.11483196056033187, "flos": 22526476594560.0, "grad_norm": 1.8391564436027854, "language_loss": 0.83668137, "learning_rate": 3.9250742546324786e-06, "loss": 0.85945338, "num_input_tokens_seen": 20243015, "step": 955, "time_per_iteration": 3.7590622901916504 }, { "auxiliary_loss_clip": 0.01239883, "auxiliary_loss_mlp": 0.01040173, "balance_loss_clip": 1.06754148, "balance_loss_mlp": 1.02839482, "epoch": 0.11495220345097096, "flos": 28220292887040.0, "grad_norm": 1.7285000047838852, "language_loss": 0.86919141, "learning_rate": 3.924862890522683e-06, "loss": 0.89199197, "num_input_tokens_seen": 20263025, "step": 956, "time_per_iteration": 2.766704797744751 }, { "auxiliary_loss_clip": 0.01248194, "auxiliary_loss_mlp": 0.0103743, "balance_loss_clip": 1.06987023, "balance_loss_mlp": 1.02519274, "epoch": 0.11507244634161005, "flos": 17492267704320.0, "grad_norm": 2.134915581587885, "language_loss": 0.8652522, "learning_rate": 3.9246512344127174e-06, "loss": 0.88810843, "num_input_tokens_seen": 20280685, "step": 957, "time_per_iteration": 3.752478837966919 }, { "auxiliary_loss_clip": 0.01213368, "auxiliary_loss_mlp": 0.01044039, "balance_loss_clip": 1.06797814, "balance_loss_mlp": 1.0316236, "epoch": 0.11519268923224914, "flos": 22564937082240.0, "grad_norm": 3.0069712701173197, "language_loss": 0.82395077, "learning_rate": 3.9244392863346895e-06, "loss": 0.84652483, "num_input_tokens_seen": 20300090, "step": 958, "time_per_iteration": 3.8366599082946777 }, { "auxiliary_loss_clip": 0.01242947, "auxiliary_loss_mlp": 0.01047163, "balance_loss_clip": 1.07062268, "balance_loss_mlp": 1.03367496, "epoch": 0.11531293212288823, "flos": 16982839065600.0, "grad_norm": 1.861575618103434, "language_loss": 0.92353058, "learning_rate": 3.9242270463207524e-06, "loss": 0.94643164, "num_input_tokens_seen": 20318480, "step": 959, "time_per_iteration": 2.7507383823394775 }, { "auxiliary_loss_clip": 0.01219851, "auxiliary_loss_mlp": 0.01039998, "balance_loss_clip": 1.06663275, "balance_loss_mlp": 1.02831531, "epoch": 0.11543317501352733, "flos": 12422004537600.0, "grad_norm": 2.6008267017433075, "language_loss": 0.85360152, "learning_rate": 3.924014514403102e-06, "loss": 0.87620008, "num_input_tokens_seen": 20334635, "step": 960, "time_per_iteration": 2.737813949584961 }, { "auxiliary_loss_clip": 0.01224426, "auxiliary_loss_mlp": 0.0104125, "balance_loss_clip": 1.06862009, "balance_loss_mlp": 1.02886438, "epoch": 0.11555341790416641, "flos": 19821648695040.0, "grad_norm": 3.1753846045143885, "language_loss": 0.91094482, "learning_rate": 3.92380169061398e-06, "loss": 0.93360156, "num_input_tokens_seen": 20352415, "step": 961, "time_per_iteration": 2.75221586227417 }, { "auxiliary_loss_clip": 0.01232146, "auxiliary_loss_mlp": 0.01053736, "balance_loss_clip": 1.06526756, "balance_loss_mlp": 1.02647853, "epoch": 0.11567366079480551, "flos": 25738865625600.0, "grad_norm": 2.0568985410044474, "language_loss": 0.84062314, "learning_rate": 3.9235885749856705e-06, "loss": 0.86348188, "num_input_tokens_seen": 20371095, "step": 962, "time_per_iteration": 2.8639252185821533 }, { "auxiliary_loss_clip": 0.01239061, "auxiliary_loss_mlp": 0.01042394, "balance_loss_clip": 1.06799436, "balance_loss_mlp": 1.03103292, "epoch": 0.1157939036854446, "flos": 18223301301120.0, "grad_norm": 1.9045081324216429, "language_loss": 0.83007926, "learning_rate": 3.9233751675505035e-06, "loss": 0.85289383, "num_input_tokens_seen": 20389805, "step": 963, "time_per_iteration": 2.767240524291992 }, { "auxiliary_loss_clip": 0.01236452, "auxiliary_loss_mlp": 0.01044012, "balance_loss_clip": 1.06995773, "balance_loss_mlp": 1.031394, "epoch": 0.11591414657608369, "flos": 23073755189760.0, "grad_norm": 1.9348563430076602, "language_loss": 0.84708416, "learning_rate": 3.923161468340853e-06, "loss": 0.86988878, "num_input_tokens_seen": 20409640, "step": 964, "time_per_iteration": 2.7915596961975098 }, { "auxiliary_loss_clip": 0.01223462, "auxiliary_loss_mlp": 0.010378, "balance_loss_clip": 1.06755066, "balance_loss_mlp": 1.02517605, "epoch": 0.11603438946672277, "flos": 19461716461440.0, "grad_norm": 2.113368536412823, "language_loss": 0.81813872, "learning_rate": 3.9229474773891374e-06, "loss": 0.84075129, "num_input_tokens_seen": 20428180, "step": 965, "time_per_iteration": 2.883780002593994 }, { "auxiliary_loss_clip": 0.01249015, "auxiliary_loss_mlp": 0.01042844, "balance_loss_clip": 1.06919336, "balance_loss_mlp": 1.03022575, "epoch": 0.11615463235736187, "flos": 26831986272000.0, "grad_norm": 1.8065449901535378, "language_loss": 0.83765203, "learning_rate": 3.922733194727818e-06, "loss": 0.86057061, "num_input_tokens_seen": 20447975, "step": 966, "time_per_iteration": 2.8221521377563477 }, { "auxiliary_loss_clip": 0.01249648, "auxiliary_loss_mlp": 0.01042494, "balance_loss_clip": 1.06865907, "balance_loss_mlp": 1.02958918, "epoch": 0.11627487524800097, "flos": 18580324533120.0, "grad_norm": 2.050819499327881, "language_loss": 0.87516212, "learning_rate": 3.922518620389402e-06, "loss": 0.89808357, "num_input_tokens_seen": 20464840, "step": 967, "time_per_iteration": 2.8257908821105957 }, { "auxiliary_loss_clip": 0.01195704, "auxiliary_loss_mlp": 0.01043114, "balance_loss_clip": 1.0677762, "balance_loss_mlp": 1.03122878, "epoch": 0.11639511813864005, "flos": 18150474476160.0, "grad_norm": 2.1839253959209186, "language_loss": 0.89455509, "learning_rate": 3.922303754406439e-06, "loss": 0.91694331, "num_input_tokens_seen": 20482680, "step": 968, "time_per_iteration": 2.9607906341552734 }, { "auxiliary_loss_clip": 0.01222994, "auxiliary_loss_mlp": 0.01041946, "balance_loss_clip": 1.06977654, "balance_loss_mlp": 1.02994752, "epoch": 0.11651536102927915, "flos": 20922023888640.0, "grad_norm": 1.9400988737002869, "language_loss": 0.79180235, "learning_rate": 3.922088596811526e-06, "loss": 0.81445175, "num_input_tokens_seen": 20501810, "step": 969, "time_per_iteration": 2.926133632659912 }, { "auxiliary_loss_clip": 0.01235141, "auxiliary_loss_mlp": 0.01046665, "balance_loss_clip": 1.06688023, "balance_loss_mlp": 1.03477371, "epoch": 0.11663560391991823, "flos": 16508602776960.0, "grad_norm": 2.099402971229422, "language_loss": 0.86852324, "learning_rate": 3.9218731476373e-06, "loss": 0.89134127, "num_input_tokens_seen": 20517995, "step": 970, "time_per_iteration": 2.765552043914795 }, { "auxiliary_loss_clip": 0.01250625, "auxiliary_loss_mlp": 0.01049007, "balance_loss_clip": 1.07037365, "balance_loss_mlp": 1.0367707, "epoch": 0.11675584681055733, "flos": 19865029345920.0, "grad_norm": 2.202945163437194, "language_loss": 0.84676754, "learning_rate": 3.9216574069164455e-06, "loss": 0.86976385, "num_input_tokens_seen": 20536970, "step": 971, "time_per_iteration": 2.7339611053466797 }, { "auxiliary_loss_clip": 0.01244718, "auxiliary_loss_mlp": 0.01037601, "balance_loss_clip": 1.06511903, "balance_loss_mlp": 1.02673471, "epoch": 0.11687608970119642, "flos": 21944364785280.0, "grad_norm": 1.7666037712453924, "language_loss": 0.80131865, "learning_rate": 3.921441374681691e-06, "loss": 0.82414186, "num_input_tokens_seen": 20557030, "step": 972, "time_per_iteration": 2.770988702774048 }, { "auxiliary_loss_clip": 0.01230556, "auxiliary_loss_mlp": 0.01035988, "balance_loss_clip": 1.06392038, "balance_loss_mlp": 1.02333999, "epoch": 0.1169963325918355, "flos": 24061155131520.0, "grad_norm": 1.9097456786021136, "language_loss": 0.65202719, "learning_rate": 3.921225050965808e-06, "loss": 0.67469263, "num_input_tokens_seen": 20576915, "step": 973, "time_per_iteration": 2.725677013397217 }, { "auxiliary_loss_clip": 0.01236346, "auxiliary_loss_mlp": 0.01040317, "balance_loss_clip": 1.0691731, "balance_loss_mlp": 1.02726388, "epoch": 0.1171165754824746, "flos": 23368151059200.0, "grad_norm": 4.007577247448941, "language_loss": 0.75147259, "learning_rate": 3.921008435801612e-06, "loss": 0.77423918, "num_input_tokens_seen": 20596000, "step": 974, "time_per_iteration": 2.764052629470825 }, { "auxiliary_loss_clip": 0.01234725, "auxiliary_loss_mlp": 0.01039812, "balance_loss_clip": 1.06663835, "balance_loss_mlp": 1.02829027, "epoch": 0.11723681837311369, "flos": 18552243075840.0, "grad_norm": 3.4243803416087744, "language_loss": 0.76091003, "learning_rate": 3.920791529221963e-06, "loss": 0.78365541, "num_input_tokens_seen": 20614675, "step": 975, "time_per_iteration": 2.8305413722991943 }, { "auxiliary_loss_clip": 0.01236534, "auxiliary_loss_mlp": 0.01063103, "balance_loss_clip": 1.06555212, "balance_loss_mlp": 1.03319287, "epoch": 0.11735706126375278, "flos": 23550541344000.0, "grad_norm": 1.959917374348214, "language_loss": 0.7664755, "learning_rate": 3.920574331259768e-06, "loss": 0.78947186, "num_input_tokens_seen": 20635875, "step": 976, "time_per_iteration": 2.8739848136901855 }, { "auxiliary_loss_clip": 0.01230617, "auxiliary_loss_mlp": 0.01045797, "balance_loss_clip": 1.06763387, "balance_loss_mlp": 1.03399551, "epoch": 0.11747730415439187, "flos": 22381541216640.0, "grad_norm": 3.3020209443762307, "language_loss": 0.79583287, "learning_rate": 3.9203568419479716e-06, "loss": 0.81859696, "num_input_tokens_seen": 20656430, "step": 977, "time_per_iteration": 2.8047420978546143 }, { "auxiliary_loss_clip": 0.01238686, "auxiliary_loss_mlp": 0.01043612, "balance_loss_clip": 1.06814349, "balance_loss_mlp": 1.03147602, "epoch": 0.11759754704503096, "flos": 22200731130240.0, "grad_norm": 2.0923002095546233, "language_loss": 0.75453997, "learning_rate": 3.92013906131957e-06, "loss": 0.77736294, "num_input_tokens_seen": 20675360, "step": 978, "time_per_iteration": 2.678025960922241 }, { "auxiliary_loss_clip": 0.01230494, "auxiliary_loss_mlp": 0.01039921, "balance_loss_clip": 1.06545115, "balance_loss_mlp": 1.02846539, "epoch": 0.11771778993567006, "flos": 22309755886080.0, "grad_norm": 2.4781642475810033, "language_loss": 0.82582575, "learning_rate": 3.9199209894076e-06, "loss": 0.84852993, "num_input_tokens_seen": 20695675, "step": 979, "time_per_iteration": 2.769765853881836 }, { "auxiliary_loss_clip": 0.01250883, "auxiliary_loss_mlp": 0.01042358, "balance_loss_clip": 1.06620538, "balance_loss_mlp": 1.03057384, "epoch": 0.11783803282630914, "flos": 21288169175040.0, "grad_norm": 1.9820194357315308, "language_loss": 0.90035212, "learning_rate": 3.919702626245142e-06, "loss": 0.92328441, "num_input_tokens_seen": 20715330, "step": 980, "time_per_iteration": 2.813004732131958 }, { "auxiliary_loss_clip": 0.01229054, "auxiliary_loss_mlp": 0.01044638, "balance_loss_clip": 1.06667733, "balance_loss_mlp": 1.03263962, "epoch": 0.11795827571694824, "flos": 25371535190400.0, "grad_norm": 4.9461932581063826, "language_loss": 0.66381311, "learning_rate": 3.919483971865322e-06, "loss": 0.68655008, "num_input_tokens_seen": 20735325, "step": 981, "time_per_iteration": 3.759364604949951 }, { "auxiliary_loss_clip": 0.01234495, "auxiliary_loss_mlp": 0.01034877, "balance_loss_clip": 1.06633854, "balance_loss_mlp": 1.02405322, "epoch": 0.11807851860758732, "flos": 23622218933760.0, "grad_norm": 2.0573068772600918, "language_loss": 0.87819076, "learning_rate": 3.91926502630131e-06, "loss": 0.90088451, "num_input_tokens_seen": 20755940, "step": 982, "time_per_iteration": 3.7916100025177 }, { "auxiliary_loss_clip": 0.0124711, "auxiliary_loss_mlp": 0.01046865, "balance_loss_clip": 1.07081509, "balance_loss_mlp": 1.03509855, "epoch": 0.11819876149822642, "flos": 24972496024320.0, "grad_norm": 2.640723257587569, "language_loss": 0.72150379, "learning_rate": 3.91904578958632e-06, "loss": 0.74444354, "num_input_tokens_seen": 20775355, "step": 983, "time_per_iteration": 2.7929999828338623 }, { "auxiliary_loss_clip": 0.01249634, "auxiliary_loss_mlp": 0.01031858, "balance_loss_clip": 1.06863976, "balance_loss_mlp": 1.02067602, "epoch": 0.11831900438886551, "flos": 23003226835200.0, "grad_norm": 2.096107257828546, "language_loss": 0.84294116, "learning_rate": 3.918826261753608e-06, "loss": 0.86575615, "num_input_tokens_seen": 20794935, "step": 984, "time_per_iteration": 2.7459876537323 }, { "auxiliary_loss_clip": 0.01238379, "auxiliary_loss_mlp": 0.0104178, "balance_loss_clip": 1.06601155, "balance_loss_mlp": 1.03096735, "epoch": 0.1184392472795046, "flos": 27965147604480.0, "grad_norm": 2.678347185097338, "language_loss": 0.71004701, "learning_rate": 3.918606442836478e-06, "loss": 0.73284864, "num_input_tokens_seen": 20817155, "step": 985, "time_per_iteration": 3.805248737335205 }, { "auxiliary_loss_clip": 0.01240441, "auxiliary_loss_mlp": 0.01048504, "balance_loss_clip": 1.06967151, "balance_loss_mlp": 1.03778148, "epoch": 0.1185594901701437, "flos": 19898497843200.0, "grad_norm": 1.9793237083055002, "language_loss": 0.77379918, "learning_rate": 3.918386332868277e-06, "loss": 0.79668868, "num_input_tokens_seen": 20835125, "step": 986, "time_per_iteration": 2.674849271774292 }, { "auxiliary_loss_clip": 0.01237995, "auxiliary_loss_mlp": 0.01042588, "balance_loss_clip": 1.0694865, "balance_loss_mlp": 1.03138876, "epoch": 0.11867973306078278, "flos": 18912354877440.0, "grad_norm": 1.8773425343035837, "language_loss": 0.9447149, "learning_rate": 3.918165931882394e-06, "loss": 0.96752083, "num_input_tokens_seen": 20853525, "step": 987, "time_per_iteration": 2.7732436656951904 }, { "auxiliary_loss_clip": 0.01219895, "auxiliary_loss_mlp": 0.01038878, "balance_loss_clip": 1.06931615, "balance_loss_mlp": 1.02740979, "epoch": 0.11879997595142187, "flos": 16982803152000.0, "grad_norm": 2.2827352898642017, "language_loss": 0.75535893, "learning_rate": 3.917945239912264e-06, "loss": 0.77794671, "num_input_tokens_seen": 20871000, "step": 988, "time_per_iteration": 2.7904882431030273 }, { "auxiliary_loss_clip": 0.01232668, "auxiliary_loss_mlp": 0.01040014, "balance_loss_clip": 1.0694046, "balance_loss_mlp": 1.02920771, "epoch": 0.11892021884206096, "flos": 17530369056000.0, "grad_norm": 2.1395929209081195, "language_loss": 0.75360358, "learning_rate": 3.917724256991367e-06, "loss": 0.77633035, "num_input_tokens_seen": 20889745, "step": 989, "time_per_iteration": 2.7888684272766113 }, { "auxiliary_loss_clip": 0.01228888, "auxiliary_loss_mlp": 0.01039666, "balance_loss_clip": 1.06970775, "balance_loss_mlp": 1.02871072, "epoch": 0.11904046173270005, "flos": 30955895763840.0, "grad_norm": 2.2464755362104785, "language_loss": 0.81229734, "learning_rate": 3.9175029831532245e-06, "loss": 0.83498287, "num_input_tokens_seen": 20909260, "step": 990, "time_per_iteration": 2.8977298736572266 }, { "auxiliary_loss_clip": 0.01238558, "auxiliary_loss_mlp": 0.01039933, "balance_loss_clip": 1.06925404, "balance_loss_mlp": 1.02906704, "epoch": 0.11916070462333915, "flos": 20157234485760.0, "grad_norm": 2.0694833207677537, "language_loss": 0.88956398, "learning_rate": 3.917281418431404e-06, "loss": 0.91234893, "num_input_tokens_seen": 20928305, "step": 991, "time_per_iteration": 2.8600940704345703 }, { "auxiliary_loss_clip": 0.01238409, "auxiliary_loss_mlp": 0.01039722, "balance_loss_clip": 1.07050538, "balance_loss_mlp": 1.02772343, "epoch": 0.11928094751397823, "flos": 23551115961600.0, "grad_norm": 2.1322343465911415, "language_loss": 0.77067828, "learning_rate": 3.917059562859516e-06, "loss": 0.79345953, "num_input_tokens_seen": 20947630, "step": 992, "time_per_iteration": 2.7803966999053955 }, { "auxiliary_loss_clip": 0.01233221, "auxiliary_loss_mlp": 0.01041157, "balance_loss_clip": 1.07082963, "balance_loss_mlp": 1.0307796, "epoch": 0.11940119040461733, "flos": 23908426502400.0, "grad_norm": 2.048086922727037, "language_loss": 0.88626635, "learning_rate": 3.916837416471218e-06, "loss": 0.90901017, "num_input_tokens_seen": 20964250, "step": 993, "time_per_iteration": 2.854881525039673 }, { "auxiliary_loss_clip": 0.01241389, "auxiliary_loss_mlp": 0.01039998, "balance_loss_clip": 1.06808197, "balance_loss_mlp": 1.02867913, "epoch": 0.11952143329525641, "flos": 13844533835520.0, "grad_norm": 2.6340258601340794, "language_loss": 0.7232163, "learning_rate": 3.916614979300207e-06, "loss": 0.74603015, "num_input_tokens_seen": 20979095, "step": 994, "time_per_iteration": 2.8394930362701416 }, { "auxiliary_loss_clip": 0.0122528, "auxiliary_loss_mlp": 0.01041111, "balance_loss_clip": 1.07290614, "balance_loss_mlp": 1.02951241, "epoch": 0.11964167618589551, "flos": 27015525792000.0, "grad_norm": 2.192563411030235, "language_loss": 0.78578925, "learning_rate": 3.9163922513802274e-06, "loss": 0.8084532, "num_input_tokens_seen": 21001430, "step": 995, "time_per_iteration": 2.986959457397461 }, { "auxiliary_loss_clip": 0.01252235, "auxiliary_loss_mlp": 0.01035647, "balance_loss_clip": 1.0692966, "balance_loss_mlp": 1.02358842, "epoch": 0.1197619190765346, "flos": 12567622273920.0, "grad_norm": 5.270231461940922, "language_loss": 0.8217684, "learning_rate": 3.916169232745067e-06, "loss": 0.84464717, "num_input_tokens_seen": 21019105, "step": 996, "time_per_iteration": 2.7174758911132812 }, { "auxiliary_loss_clip": 0.01234644, "auxiliary_loss_mlp": 0.01042578, "balance_loss_clip": 1.06876135, "balance_loss_mlp": 1.03022194, "epoch": 0.11988216196717369, "flos": 16909437623040.0, "grad_norm": 3.1877968624791846, "language_loss": 0.91901994, "learning_rate": 3.915945923428559e-06, "loss": 0.94179213, "num_input_tokens_seen": 21035630, "step": 997, "time_per_iteration": 2.7589271068573 }, { "auxiliary_loss_clip": 0.01244269, "auxiliary_loss_mlp": 0.01038359, "balance_loss_clip": 1.0696559, "balance_loss_mlp": 1.02676034, "epoch": 0.12000240485781279, "flos": 16216577205120.0, "grad_norm": 2.42786153970931, "language_loss": 0.83132994, "learning_rate": 3.915722323464577e-06, "loss": 0.85415626, "num_input_tokens_seen": 21054235, "step": 998, "time_per_iteration": 2.850278377532959 }, { "auxiliary_loss_clip": 0.01249283, "auxiliary_loss_mlp": 0.01036567, "balance_loss_clip": 1.07113504, "balance_loss_mlp": 1.02538538, "epoch": 0.12012264774845187, "flos": 49344887525760.0, "grad_norm": 2.662086166400138, "language_loss": 0.70505309, "learning_rate": 3.91549843288704e-06, "loss": 0.72791159, "num_input_tokens_seen": 21077915, "step": 999, "time_per_iteration": 3.098916530609131 }, { "auxiliary_loss_clip": 0.01236579, "auxiliary_loss_mlp": 0.01053682, "balance_loss_clip": 1.06755137, "balance_loss_mlp": 1.02638113, "epoch": 0.12024289063909097, "flos": 26979435601920.0, "grad_norm": 2.2826738987863333, "language_loss": 0.79398656, "learning_rate": 3.915274251729916e-06, "loss": 0.81688917, "num_input_tokens_seen": 21099205, "step": 1000, "time_per_iteration": 2.8959693908691406 }, { "auxiliary_loss_clip": 0.01238379, "auxiliary_loss_mlp": 0.01045651, "balance_loss_clip": 1.07034135, "balance_loss_mlp": 1.03365874, "epoch": 0.12036313352973005, "flos": 19537308633600.0, "grad_norm": 2.050315499292593, "language_loss": 0.90227908, "learning_rate": 3.91504978002721e-06, "loss": 0.9251194, "num_input_tokens_seen": 21118260, "step": 1001, "time_per_iteration": 2.8849761486053467 }, { "auxiliary_loss_clip": 0.0124507, "auxiliary_loss_mlp": 0.01051999, "balance_loss_clip": 1.06895804, "balance_loss_mlp": 1.02557313, "epoch": 0.12048337642036915, "flos": 17268256535040.0, "grad_norm": 2.1198806738544835, "language_loss": 0.76039016, "learning_rate": 3.914825017812974e-06, "loss": 0.78336084, "num_input_tokens_seen": 21134910, "step": 1002, "time_per_iteration": 2.6356964111328125 }, { "auxiliary_loss_clip": 0.01242703, "auxiliary_loss_mlp": 0.01040973, "balance_loss_clip": 1.07279718, "balance_loss_mlp": 1.02863503, "epoch": 0.12060361931100824, "flos": 22856962654080.0, "grad_norm": 2.383284954285103, "language_loss": 0.72361797, "learning_rate": 3.9145999651213065e-06, "loss": 0.74645472, "num_input_tokens_seen": 21154150, "step": 1003, "time_per_iteration": 2.7512366771698 }, { "auxiliary_loss_clip": 0.01244984, "auxiliary_loss_mlp": 0.01042058, "balance_loss_clip": 1.06898081, "balance_loss_mlp": 1.03106713, "epoch": 0.12072386220164733, "flos": 16726795943040.0, "grad_norm": 3.1243843260752406, "language_loss": 0.88652295, "learning_rate": 3.9143746219863465e-06, "loss": 0.90939331, "num_input_tokens_seen": 21171255, "step": 1004, "time_per_iteration": 2.663829803466797 }, { "auxiliary_loss_clip": 0.01147339, "auxiliary_loss_mlp": 0.01007331, "balance_loss_clip": 1.03605318, "balance_loss_mlp": 1.00460136, "epoch": 0.12084410509228642, "flos": 55144176105600.0, "grad_norm": 0.9462654785590084, "language_loss": 0.64730132, "learning_rate": 3.914148988442278e-06, "loss": 0.66884804, "num_input_tokens_seen": 21227045, "step": 1005, "time_per_iteration": 3.2330899238586426 }, { "auxiliary_loss_clip": 0.01232095, "auxiliary_loss_mlp": 0.01041449, "balance_loss_clip": 1.07035244, "balance_loss_mlp": 1.02951002, "epoch": 0.1209643479829255, "flos": 26760236855040.0, "grad_norm": 2.787701022468161, "language_loss": 0.95403272, "learning_rate": 3.91392306452333e-06, "loss": 0.9767682, "num_input_tokens_seen": 21244120, "step": 1006, "time_per_iteration": 2.827268600463867 }, { "auxiliary_loss_clip": 0.01253747, "auxiliary_loss_mlp": 0.01041372, "balance_loss_clip": 1.07165265, "balance_loss_mlp": 1.02946341, "epoch": 0.1210845908735646, "flos": 11035026725760.0, "grad_norm": 4.151120038721776, "language_loss": 0.66691613, "learning_rate": 3.913696850263774e-06, "loss": 0.68986738, "num_input_tokens_seen": 21258485, "step": 1007, "time_per_iteration": 3.807771682739258 }, { "auxiliary_loss_clip": 0.01246612, "auxiliary_loss_mlp": 0.01035857, "balance_loss_clip": 1.07163322, "balance_loss_mlp": 1.02404356, "epoch": 0.1212048337642037, "flos": 20484631975680.0, "grad_norm": 2.44840637966266, "language_loss": 0.79026592, "learning_rate": 3.913470345697929e-06, "loss": 0.81309056, "num_input_tokens_seen": 21277115, "step": 1008, "time_per_iteration": 3.7417845726013184 }, { "auxiliary_loss_clip": 0.01234416, "auxiliary_loss_mlp": 0.01042708, "balance_loss_clip": 1.07031894, "balance_loss_mlp": 1.03042936, "epoch": 0.12132507665484278, "flos": 22346061557760.0, "grad_norm": 2.113731039940211, "language_loss": 0.85414326, "learning_rate": 3.913243550860153e-06, "loss": 0.8769145, "num_input_tokens_seen": 21294880, "step": 1009, "time_per_iteration": 2.792998790740967 }, { "auxiliary_loss_clip": 0.01252863, "auxiliary_loss_mlp": 0.01039361, "balance_loss_clip": 1.07463217, "balance_loss_mlp": 1.02746356, "epoch": 0.12144531954548188, "flos": 29314957818240.0, "grad_norm": 2.2708614080317395, "language_loss": 0.76081622, "learning_rate": 3.913016465784852e-06, "loss": 0.78373843, "num_input_tokens_seen": 21315555, "step": 1010, "time_per_iteration": 2.8063178062438965 }, { "auxiliary_loss_clip": 0.01235142, "auxiliary_loss_mlp": 0.01040868, "balance_loss_clip": 1.07157302, "balance_loss_mlp": 1.02936983, "epoch": 0.12156556243612096, "flos": 20485242506880.0, "grad_norm": 3.540110899636048, "language_loss": 0.72299993, "learning_rate": 3.912789090506474e-06, "loss": 0.74576002, "num_input_tokens_seen": 21334815, "step": 1011, "time_per_iteration": 3.7459964752197266 }, { "auxiliary_loss_clip": 0.01241965, "auxiliary_loss_mlp": 0.01042693, "balance_loss_clip": 1.0664469, "balance_loss_mlp": 1.02954471, "epoch": 0.12168580532676006, "flos": 16472009796480.0, "grad_norm": 2.5147054444900117, "language_loss": 0.72011751, "learning_rate": 3.9125614250595114e-06, "loss": 0.74296415, "num_input_tokens_seen": 21351025, "step": 1012, "time_per_iteration": 2.768200159072876 }, { "auxiliary_loss_clip": 0.01248065, "auxiliary_loss_mlp": 0.01044475, "balance_loss_clip": 1.06789756, "balance_loss_mlp": 1.03248227, "epoch": 0.12180604821739914, "flos": 15341290588800.0, "grad_norm": 2.3732611694065775, "language_loss": 0.88865656, "learning_rate": 3.912333469478502e-06, "loss": 0.91158199, "num_input_tokens_seen": 21368990, "step": 1013, "time_per_iteration": 2.77782940864563 }, { "auxiliary_loss_clip": 0.01243039, "auxiliary_loss_mlp": 0.0103402, "balance_loss_clip": 1.06997347, "balance_loss_mlp": 1.02189696, "epoch": 0.12192629110803824, "flos": 19318038059520.0, "grad_norm": 3.030379408556437, "language_loss": 0.78037202, "learning_rate": 3.912105223798025e-06, "loss": 0.80314267, "num_input_tokens_seen": 21388410, "step": 1014, "time_per_iteration": 3.005843162536621 }, { "auxiliary_loss_clip": 0.01142409, "auxiliary_loss_mlp": 0.01015896, "balance_loss_clip": 1.03412318, "balance_loss_mlp": 1.01297498, "epoch": 0.12204653399867733, "flos": 47725354085760.0, "grad_norm": 1.0064095722729105, "language_loss": 0.67637968, "learning_rate": 3.9118766880527065e-06, "loss": 0.69796264, "num_input_tokens_seen": 21442845, "step": 1015, "time_per_iteration": 3.224456310272217 }, { "auxiliary_loss_clip": 0.01223106, "auxiliary_loss_mlp": 0.0104086, "balance_loss_clip": 1.06658006, "balance_loss_mlp": 1.02844429, "epoch": 0.12216677688931642, "flos": 18221936584320.0, "grad_norm": 2.0810230315911373, "language_loss": 0.7378028, "learning_rate": 3.9116478622772145e-06, "loss": 0.76044244, "num_input_tokens_seen": 21461420, "step": 1016, "time_per_iteration": 2.7758169174194336 }, { "auxiliary_loss_clip": 0.01240665, "auxiliary_loss_mlp": 0.01036694, "balance_loss_clip": 1.06731129, "balance_loss_mlp": 1.02394485, "epoch": 0.12228701977995551, "flos": 27525636789120.0, "grad_norm": 1.8857514051251048, "language_loss": 0.88255274, "learning_rate": 3.911418746506261e-06, "loss": 0.90532637, "num_input_tokens_seen": 21481550, "step": 1017, "time_per_iteration": 2.795165538787842 }, { "auxiliary_loss_clip": 0.01249868, "auxiliary_loss_mlp": 0.01036579, "balance_loss_clip": 1.0737735, "balance_loss_mlp": 1.02474797, "epoch": 0.1224072626705946, "flos": 21798136517760.0, "grad_norm": 1.6759447345009195, "language_loss": 0.78247452, "learning_rate": 3.911189340774604e-06, "loss": 0.80533898, "num_input_tokens_seen": 21501680, "step": 1018, "time_per_iteration": 2.8489575386047363 }, { "auxiliary_loss_clip": 0.01254032, "auxiliary_loss_mlp": 0.01039195, "balance_loss_clip": 1.07220948, "balance_loss_mlp": 1.02766109, "epoch": 0.1225275055612337, "flos": 20703758895360.0, "grad_norm": 1.7402560354934706, "language_loss": 0.79394788, "learning_rate": 3.910959645117043e-06, "loss": 0.81688011, "num_input_tokens_seen": 21521015, "step": 1019, "time_per_iteration": 2.8299965858459473 }, { "auxiliary_loss_clip": 0.01134642, "auxiliary_loss_mlp": 0.01011598, "balance_loss_clip": 1.03192067, "balance_loss_mlp": 0.99951547, "epoch": 0.12264774845187278, "flos": 57745294462080.0, "grad_norm": 0.819439598922522, "language_loss": 0.56747985, "learning_rate": 3.910729659568423e-06, "loss": 0.58894229, "num_input_tokens_seen": 21578200, "step": 1020, "time_per_iteration": 3.2173655033111572 }, { "auxiliary_loss_clip": 0.01243603, "auxiliary_loss_mlp": 0.01044594, "balance_loss_clip": 1.07236004, "balance_loss_mlp": 1.03354371, "epoch": 0.12276799134251187, "flos": 26396282298240.0, "grad_norm": 1.8654215298408032, "language_loss": 0.82554054, "learning_rate": 3.9104993841636344e-06, "loss": 0.84842253, "num_input_tokens_seen": 21598770, "step": 1021, "time_per_iteration": 2.8524696826934814 }, { "auxiliary_loss_clip": 0.0123741, "auxiliary_loss_mlp": 0.01062588, "balance_loss_clip": 1.0704397, "balance_loss_mlp": 1.03687501, "epoch": 0.12288823423315097, "flos": 21064193919360.0, "grad_norm": 1.9262791563702757, "language_loss": 0.80803156, "learning_rate": 3.910268818937608e-06, "loss": 0.83103156, "num_input_tokens_seen": 21616925, "step": 1022, "time_per_iteration": 2.9094247817993164 }, { "auxiliary_loss_clip": 0.0122959, "auxiliary_loss_mlp": 0.01048169, "balance_loss_clip": 1.06817079, "balance_loss_mlp": 1.03696311, "epoch": 0.12300847712379005, "flos": 12312441077760.0, "grad_norm": 2.845629331121213, "language_loss": 0.87600261, "learning_rate": 3.9100379639253196e-06, "loss": 0.89878023, "num_input_tokens_seen": 21633645, "step": 1023, "time_per_iteration": 2.8634033203125 }, { "auxiliary_loss_clip": 0.01239338, "auxiliary_loss_mlp": 0.01038705, "balance_loss_clip": 1.0659852, "balance_loss_mlp": 1.02646232, "epoch": 0.12312872001442915, "flos": 16762239688320.0, "grad_norm": 2.568050655145364, "language_loss": 0.86328483, "learning_rate": 3.909806819161791e-06, "loss": 0.88606524, "num_input_tokens_seen": 21649120, "step": 1024, "time_per_iteration": 2.6810197830200195 }, { "auxiliary_loss_clip": 0.01240925, "auxiliary_loss_mlp": 0.01038756, "balance_loss_clip": 1.06780088, "balance_loss_mlp": 1.02674007, "epoch": 0.12324896290506823, "flos": 18404937400320.0, "grad_norm": 2.3445183579780053, "language_loss": 0.8624593, "learning_rate": 3.909575384682086e-06, "loss": 0.88525605, "num_input_tokens_seen": 21668000, "step": 1025, "time_per_iteration": 2.832249879837036 }, { "auxiliary_loss_clip": 0.01251236, "auxiliary_loss_mlp": 0.01042077, "balance_loss_clip": 1.071172, "balance_loss_mlp": 1.03077042, "epoch": 0.12336920579570733, "flos": 18915407533440.0, "grad_norm": 1.8039861555151315, "language_loss": 0.69441265, "learning_rate": 3.9093436605213144e-06, "loss": 0.71734577, "num_input_tokens_seen": 21688500, "step": 1026, "time_per_iteration": 2.858882427215576 }, { "auxiliary_loss_clip": 0.01236276, "auxiliary_loss_mlp": 0.01036209, "balance_loss_clip": 1.06553912, "balance_loss_mlp": 1.02420461, "epoch": 0.12348944868634643, "flos": 23878369797120.0, "grad_norm": 2.2636908283707564, "language_loss": 0.79231739, "learning_rate": 3.909111646714627e-06, "loss": 0.81504226, "num_input_tokens_seen": 21709345, "step": 1027, "time_per_iteration": 2.9010512828826904 }, { "auxiliary_loss_clip": 0.01244965, "auxiliary_loss_mlp": 0.01033949, "balance_loss_clip": 1.06605792, "balance_loss_mlp": 1.02286243, "epoch": 0.12360969157698551, "flos": 19026084314880.0, "grad_norm": 2.3344857979517157, "language_loss": 0.72269005, "learning_rate": 3.9088793432972206e-06, "loss": 0.74547923, "num_input_tokens_seen": 21728165, "step": 1028, "time_per_iteration": 2.7453391551971436 }, { "auxiliary_loss_clip": 0.01231852, "auxiliary_loss_mlp": 0.01040797, "balance_loss_clip": 1.06811023, "balance_loss_mlp": 1.02858388, "epoch": 0.1237299344676246, "flos": 13224607983360.0, "grad_norm": 3.0395641730591096, "language_loss": 0.8226856, "learning_rate": 3.908646750304336e-06, "loss": 0.84541214, "num_input_tokens_seen": 21745850, "step": 1029, "time_per_iteration": 2.8606929779052734 }, { "auxiliary_loss_clip": 0.01244718, "auxiliary_loss_mlp": 0.01049334, "balance_loss_clip": 1.0716331, "balance_loss_mlp": 1.03731728, "epoch": 0.12385017735826369, "flos": 20485673470080.0, "grad_norm": 1.736640473803107, "language_loss": 0.87255943, "learning_rate": 3.908413867771257e-06, "loss": 0.89549994, "num_input_tokens_seen": 21764760, "step": 1030, "time_per_iteration": 2.8837008476257324 }, { "auxiliary_loss_clip": 0.01242233, "auxiliary_loss_mlp": 0.01036642, "balance_loss_clip": 1.06825459, "balance_loss_mlp": 1.02587199, "epoch": 0.12397042024890279, "flos": 17347835116800.0, "grad_norm": 2.1644110472156792, "language_loss": 0.80745691, "learning_rate": 3.908180695733311e-06, "loss": 0.83024567, "num_input_tokens_seen": 21784250, "step": 1031, "time_per_iteration": 2.827136516571045 }, { "auxiliary_loss_clip": 0.01213238, "auxiliary_loss_mlp": 0.01038294, "balance_loss_clip": 1.06765532, "balance_loss_mlp": 1.02596211, "epoch": 0.12409066313954187, "flos": 20412343854720.0, "grad_norm": 1.786554068410113, "language_loss": 0.82799238, "learning_rate": 3.907947234225871e-06, "loss": 0.85050768, "num_input_tokens_seen": 21803260, "step": 1032, "time_per_iteration": 2.9198124408721924 }, { "auxiliary_loss_clip": 0.01224456, "auxiliary_loss_mlp": 0.01037028, "balance_loss_clip": 1.0705111, "balance_loss_mlp": 1.02551246, "epoch": 0.12421090603018096, "flos": 20736688688640.0, "grad_norm": 2.0490625985113264, "language_loss": 0.87136763, "learning_rate": 3.907713483284352e-06, "loss": 0.89398241, "num_input_tokens_seen": 21822735, "step": 1033, "time_per_iteration": 3.869858503341675 }, { "auxiliary_loss_clip": 0.01215952, "auxiliary_loss_mlp": 0.01047766, "balance_loss_clip": 1.06756818, "balance_loss_mlp": 1.03590429, "epoch": 0.12433114892082006, "flos": 24498834353280.0, "grad_norm": 2.4945176474568203, "language_loss": 0.97551411, "learning_rate": 3.907479442944216e-06, "loss": 0.99815136, "num_input_tokens_seen": 21841140, "step": 1034, "time_per_iteration": 3.9980783462524414 }, { "auxiliary_loss_clip": 0.01244071, "auxiliary_loss_mlp": 0.01034627, "balance_loss_clip": 1.07078993, "balance_loss_mlp": 1.02393985, "epoch": 0.12445139181145914, "flos": 19682315838720.0, "grad_norm": 2.193374799251978, "language_loss": 0.92263448, "learning_rate": 3.907245113240963e-06, "loss": 0.94542146, "num_input_tokens_seen": 21859260, "step": 1035, "time_per_iteration": 2.880251884460449 }, { "auxiliary_loss_clip": 0.01240644, "auxiliary_loss_mlp": 0.01040992, "balance_loss_clip": 1.07173944, "balance_loss_mlp": 1.02875519, "epoch": 0.12457163470209824, "flos": 46423087522560.0, "grad_norm": 3.4255791817045123, "language_loss": 0.73489404, "learning_rate": 3.907010494210144e-06, "loss": 0.75771034, "num_input_tokens_seen": 21881920, "step": 1036, "time_per_iteration": 3.092686414718628 }, { "auxiliary_loss_clip": 0.01248112, "auxiliary_loss_mlp": 0.01041157, "balance_loss_clip": 1.06989384, "balance_loss_mlp": 1.02883625, "epoch": 0.12469187759273732, "flos": 20376289578240.0, "grad_norm": 6.552491352161927, "language_loss": 0.9193176, "learning_rate": 3.9067755858873495e-06, "loss": 0.94221032, "num_input_tokens_seen": 21898720, "step": 1037, "time_per_iteration": 3.7857377529144287 }, { "auxiliary_loss_clip": 0.01137114, "auxiliary_loss_mlp": 0.01009838, "balance_loss_clip": 1.03154421, "balance_loss_mlp": 1.00694144, "epoch": 0.12481212048337642, "flos": 69224641447680.0, "grad_norm": 0.8646650955470166, "language_loss": 0.62838149, "learning_rate": 3.906540388308214e-06, "loss": 0.64985102, "num_input_tokens_seen": 21958305, "step": 1038, "time_per_iteration": 3.3519763946533203 }, { "auxiliary_loss_clip": 0.01218008, "auxiliary_loss_mlp": 0.01036146, "balance_loss_clip": 1.06945598, "balance_loss_mlp": 1.02499986, "epoch": 0.12493236337401552, "flos": 18223696350720.0, "grad_norm": 3.010381533989509, "language_loss": 0.81446946, "learning_rate": 3.906304901508417e-06, "loss": 0.83701098, "num_input_tokens_seen": 21977205, "step": 1039, "time_per_iteration": 2.80419921875 }, { "auxiliary_loss_clip": 0.01249219, "auxiliary_loss_mlp": 0.01047515, "balance_loss_clip": 1.07430041, "balance_loss_mlp": 1.03560615, "epoch": 0.12505260626465461, "flos": 30044375303040.0, "grad_norm": 2.4323738876810963, "language_loss": 0.75860995, "learning_rate": 3.9060691255236835e-06, "loss": 0.78157735, "num_input_tokens_seen": 21997770, "step": 1040, "time_per_iteration": 2.7682290077209473 }, { "auxiliary_loss_clip": 0.01241193, "auxiliary_loss_mlp": 0.01041339, "balance_loss_clip": 1.06703925, "balance_loss_mlp": 1.02934051, "epoch": 0.1251728491552937, "flos": 24433980347520.0, "grad_norm": 2.3352559893386133, "language_loss": 0.80610073, "learning_rate": 3.905833060389778e-06, "loss": 0.82892609, "num_input_tokens_seen": 22021890, "step": 1041, "time_per_iteration": 2.87939453125 }, { "auxiliary_loss_clip": 0.01250469, "auxiliary_loss_mlp": 0.01050537, "balance_loss_clip": 1.07117283, "balance_loss_mlp": 1.02403533, "epoch": 0.12529309204593278, "flos": 27119809952640.0, "grad_norm": 2.959432792124691, "language_loss": 0.78651059, "learning_rate": 3.905596706142513e-06, "loss": 0.8095206, "num_input_tokens_seen": 22043300, "step": 1042, "time_per_iteration": 2.872035026550293 }, { "auxiliary_loss_clip": 0.01228977, "auxiliary_loss_mlp": 0.01037655, "balance_loss_clip": 1.06615067, "balance_loss_mlp": 1.02535248, "epoch": 0.12541333493657186, "flos": 30774151923840.0, "grad_norm": 2.8676062072292083, "language_loss": 0.85950476, "learning_rate": 3.9053600628177435e-06, "loss": 0.88217103, "num_input_tokens_seen": 22062910, "step": 1043, "time_per_iteration": 2.8519065380096436 }, { "auxiliary_loss_clip": 0.01243716, "auxiliary_loss_mlp": 0.01035326, "balance_loss_clip": 1.06498241, "balance_loss_mlp": 1.02411997, "epoch": 0.12553357782721097, "flos": 23659566099840.0, "grad_norm": 2.533765029488608, "language_loss": 0.84538871, "learning_rate": 3.905123130451367e-06, "loss": 0.8681792, "num_input_tokens_seen": 22084010, "step": 1044, "time_per_iteration": 2.7803080081939697 }, { "auxiliary_loss_clip": 0.01248722, "auxiliary_loss_mlp": 0.01041709, "balance_loss_clip": 1.06864047, "balance_loss_mlp": 1.0305388, "epoch": 0.12565382071785006, "flos": 24863758577280.0, "grad_norm": 1.7834185189896887, "language_loss": 0.79547596, "learning_rate": 3.904885909079326e-06, "loss": 0.8183803, "num_input_tokens_seen": 22102795, "step": 1045, "time_per_iteration": 2.784945249557495 }, { "auxiliary_loss_clip": 0.01241641, "auxiliary_loss_mlp": 0.01041127, "balance_loss_clip": 1.06777215, "balance_loss_mlp": 1.02979064, "epoch": 0.12577406360848914, "flos": 21360780518400.0, "grad_norm": 2.662659545971727, "language_loss": 0.78272516, "learning_rate": 3.904648398737607e-06, "loss": 0.80555296, "num_input_tokens_seen": 22121360, "step": 1046, "time_per_iteration": 2.955049753189087 }, { "auxiliary_loss_clip": 0.01244898, "auxiliary_loss_mlp": 0.01039647, "balance_loss_clip": 1.06642151, "balance_loss_mlp": 1.02821457, "epoch": 0.12589430649912825, "flos": 36138056774400.0, "grad_norm": 1.9602432692531115, "language_loss": 0.7827276, "learning_rate": 3.9044105994622406e-06, "loss": 0.80557311, "num_input_tokens_seen": 22142505, "step": 1047, "time_per_iteration": 2.923278331756592 }, { "auxiliary_loss_clip": 0.01241543, "auxiliary_loss_mlp": 0.01056123, "balance_loss_clip": 1.06888795, "balance_loss_mlp": 1.02859426, "epoch": 0.12601454938976733, "flos": 25337671643520.0, "grad_norm": 2.6062985914237315, "language_loss": 0.81450683, "learning_rate": 3.9041725112893005e-06, "loss": 0.83748347, "num_input_tokens_seen": 22163730, "step": 1048, "time_per_iteration": 2.9465808868408203 }, { "auxiliary_loss_clip": 0.01229704, "auxiliary_loss_mlp": 0.01041422, "balance_loss_clip": 1.06766105, "balance_loss_mlp": 1.02935755, "epoch": 0.12613479228040642, "flos": 15560094286080.0, "grad_norm": 1.8656992691999068, "language_loss": 0.74947774, "learning_rate": 3.903934134254904e-06, "loss": 0.77218902, "num_input_tokens_seen": 22181520, "step": 1049, "time_per_iteration": 2.834836006164551 }, { "auxiliary_loss_clip": 0.01249978, "auxiliary_loss_mlp": 0.01035381, "balance_loss_clip": 1.06871104, "balance_loss_mlp": 1.02415144, "epoch": 0.1262550351710455, "flos": 21470595373440.0, "grad_norm": 2.240899560879907, "language_loss": 0.85006511, "learning_rate": 3.903695468395213e-06, "loss": 0.87291873, "num_input_tokens_seen": 22199390, "step": 1050, "time_per_iteration": 2.750648260116577 }, { "auxiliary_loss_clip": 0.01245797, "auxiliary_loss_mlp": 0.01045334, "balance_loss_clip": 1.06820464, "balance_loss_mlp": 1.03273404, "epoch": 0.1263752780616846, "flos": 31576719456000.0, "grad_norm": 3.6468148924843904, "language_loss": 0.55667454, "learning_rate": 3.903456513746434e-06, "loss": 0.57958585, "num_input_tokens_seen": 22220365, "step": 1051, "time_per_iteration": 2.9932281970977783 }, { "auxiliary_loss_clip": 0.01244882, "auxiliary_loss_mlp": 0.01039008, "balance_loss_clip": 1.06665576, "balance_loss_mlp": 1.02824974, "epoch": 0.1264955209523237, "flos": 28768217927040.0, "grad_norm": 1.8060942344625417, "language_loss": 0.87563634, "learning_rate": 3.903217270344815e-06, "loss": 0.89847529, "num_input_tokens_seen": 22240615, "step": 1052, "time_per_iteration": 2.7813754081726074 }, { "auxiliary_loss_clip": 0.0122739, "auxiliary_loss_mlp": 0.01035628, "balance_loss_clip": 1.0655899, "balance_loss_mlp": 1.02353454, "epoch": 0.12661576384296278, "flos": 29241125412480.0, "grad_norm": 1.8125703305322516, "language_loss": 0.82665992, "learning_rate": 3.902977738226648e-06, "loss": 0.84929013, "num_input_tokens_seen": 22261350, "step": 1053, "time_per_iteration": 3.0360987186431885 }, { "auxiliary_loss_clip": 0.01246381, "auxiliary_loss_mlp": 0.01041314, "balance_loss_clip": 1.07008183, "balance_loss_mlp": 1.02824283, "epoch": 0.12673600673360189, "flos": 20850346298880.0, "grad_norm": 2.0790907010167223, "language_loss": 0.9121834, "learning_rate": 3.902737917428273e-06, "loss": 0.93506038, "num_input_tokens_seen": 22279515, "step": 1054, "time_per_iteration": 2.7994871139526367 }, { "auxiliary_loss_clip": 0.0124839, "auxiliary_loss_mlp": 0.01045895, "balance_loss_clip": 1.06903446, "balance_loss_mlp": 1.03410554, "epoch": 0.12685624962424097, "flos": 25263695583360.0, "grad_norm": 1.8806230099512533, "language_loss": 0.83960098, "learning_rate": 3.902497807986068e-06, "loss": 0.86254382, "num_input_tokens_seen": 22299535, "step": 1055, "time_per_iteration": 2.7601146697998047 }, { "auxiliary_loss_clip": 0.01241018, "auxiliary_loss_mlp": 0.01041133, "balance_loss_clip": 1.06843424, "balance_loss_mlp": 1.02930188, "epoch": 0.12697649251488005, "flos": 27527109246720.0, "grad_norm": 2.0250370479609625, "language_loss": 0.8416267, "learning_rate": 3.902257409936458e-06, "loss": 0.86444819, "num_input_tokens_seen": 22320300, "step": 1056, "time_per_iteration": 2.893678665161133 }, { "auxiliary_loss_clip": 0.01237532, "auxiliary_loss_mlp": 0.01037116, "balance_loss_clip": 1.06780684, "balance_loss_mlp": 1.02629173, "epoch": 0.12709673540551916, "flos": 21251863503360.0, "grad_norm": 3.1483106498191638, "language_loss": 0.8411901, "learning_rate": 3.902016723315912e-06, "loss": 0.86393654, "num_input_tokens_seen": 22338240, "step": 1057, "time_per_iteration": 2.770129919052124 }, { "auxiliary_loss_clip": 0.01242193, "auxiliary_loss_mlp": 0.01044277, "balance_loss_clip": 1.06814766, "balance_loss_mlp": 1.03296399, "epoch": 0.12721697829615825, "flos": 25337707557120.0, "grad_norm": 2.795641023324031, "language_loss": 0.69517517, "learning_rate": 3.901775748160941e-06, "loss": 0.71803987, "num_input_tokens_seen": 22357420, "step": 1058, "time_per_iteration": 2.7879726886749268 }, { "auxiliary_loss_clip": 0.01137385, "auxiliary_loss_mlp": 0.01004837, "balance_loss_clip": 1.03476799, "balance_loss_mlp": 1.00180948, "epoch": 0.12733722118679733, "flos": 61943287754880.0, "grad_norm": 0.7976142679667529, "language_loss": 0.60824287, "learning_rate": 3.901534484508101e-06, "loss": 0.62966514, "num_input_tokens_seen": 22420095, "step": 1059, "time_per_iteration": 4.317529916763306 }, { "auxiliary_loss_clip": 0.01225888, "auxiliary_loss_mlp": 0.01038749, "balance_loss_clip": 1.06456077, "balance_loss_mlp": 1.0267272, "epoch": 0.1274574640774364, "flos": 26976742081920.0, "grad_norm": 2.14195515955295, "language_loss": 0.74994427, "learning_rate": 3.901292932393991e-06, "loss": 0.77259064, "num_input_tokens_seen": 22438975, "step": 1060, "time_per_iteration": 3.9010426998138428 }, { "auxiliary_loss_clip": 0.01252805, "auxiliary_loss_mlp": 0.01047223, "balance_loss_clip": 1.07239962, "balance_loss_mlp": 1.035779, "epoch": 0.12757770696807552, "flos": 22236318529920.0, "grad_norm": 3.2278748636460914, "language_loss": 0.85058975, "learning_rate": 3.9010510918552555e-06, "loss": 0.87359005, "num_input_tokens_seen": 22458050, "step": 1061, "time_per_iteration": 2.790416717529297 }, { "auxiliary_loss_clip": 0.01239187, "auxiliary_loss_mlp": 0.01040223, "balance_loss_clip": 1.06971765, "balance_loss_mlp": 1.02749741, "epoch": 0.1276979498587146, "flos": 28547905858560.0, "grad_norm": 2.4027641530264927, "language_loss": 0.74699837, "learning_rate": 3.900808962928581e-06, "loss": 0.76979244, "num_input_tokens_seen": 22475665, "step": 1062, "time_per_iteration": 2.8573546409606934 }, { "auxiliary_loss_clip": 0.01244244, "auxiliary_loss_mlp": 0.0104513, "balance_loss_clip": 1.06786978, "balance_loss_mlp": 1.03421617, "epoch": 0.1278181927493537, "flos": 17420338719360.0, "grad_norm": 2.555423530116885, "language_loss": 0.89656198, "learning_rate": 3.900566545650698e-06, "loss": 0.91945571, "num_input_tokens_seen": 22493335, "step": 1063, "time_per_iteration": 3.6986396312713623 }, { "auxiliary_loss_clip": 0.01241016, "auxiliary_loss_mlp": 0.01042999, "balance_loss_clip": 1.06903195, "balance_loss_mlp": 1.03221631, "epoch": 0.1279384356399928, "flos": 21138636856320.0, "grad_norm": 2.1328185268712105, "language_loss": 0.81954902, "learning_rate": 3.900323840058381e-06, "loss": 0.84238917, "num_input_tokens_seen": 22511045, "step": 1064, "time_per_iteration": 2.806058168411255 }, { "auxiliary_loss_clip": 0.01241717, "auxiliary_loss_mlp": 0.0104182, "balance_loss_clip": 1.06677842, "balance_loss_mlp": 1.03089476, "epoch": 0.12805867853063188, "flos": 26576733248640.0, "grad_norm": 3.015482985554622, "language_loss": 0.8183825, "learning_rate": 3.900080846188449e-06, "loss": 0.84121788, "num_input_tokens_seen": 22529635, "step": 1065, "time_per_iteration": 2.854233980178833 }, { "auxiliary_loss_clip": 0.01244474, "auxiliary_loss_mlp": 0.01037048, "balance_loss_clip": 1.06602955, "balance_loss_mlp": 1.02550292, "epoch": 0.12817892142127096, "flos": 16436206915200.0, "grad_norm": 1.9453458206341971, "language_loss": 0.81137156, "learning_rate": 3.8998375640777625e-06, "loss": 0.83418679, "num_input_tokens_seen": 22547505, "step": 1066, "time_per_iteration": 2.7474677562713623 }, { "auxiliary_loss_clip": 0.01144398, "auxiliary_loss_mlp": 0.01006948, "balance_loss_clip": 1.03687191, "balance_loss_mlp": 1.00384831, "epoch": 0.12829916431191005, "flos": 60757049099520.0, "grad_norm": 0.7068335069966304, "language_loss": 0.52586722, "learning_rate": 3.899593993763229e-06, "loss": 0.54738069, "num_input_tokens_seen": 22608465, "step": 1067, "time_per_iteration": 3.210822105407715 }, { "auxiliary_loss_clip": 0.01227661, "auxiliary_loss_mlp": 0.01044705, "balance_loss_clip": 1.06899583, "balance_loss_mlp": 1.0324322, "epoch": 0.12841940720254916, "flos": 29786895636480.0, "grad_norm": 2.28931047964699, "language_loss": 0.81112742, "learning_rate": 3.899350135281796e-06, "loss": 0.8338511, "num_input_tokens_seen": 22629465, "step": 1068, "time_per_iteration": 2.8749122619628906 }, { "auxiliary_loss_clip": 0.01235734, "auxiliary_loss_mlp": 0.01039216, "balance_loss_clip": 1.06792521, "balance_loss_mlp": 1.02869034, "epoch": 0.12853965009318824, "flos": 25951851319680.0, "grad_norm": 1.9065654085811374, "language_loss": 0.79424983, "learning_rate": 3.8991059886704585e-06, "loss": 0.81699932, "num_input_tokens_seen": 22648970, "step": 1069, "time_per_iteration": 2.757014751434326 }, { "auxiliary_loss_clip": 0.01227167, "auxiliary_loss_mlp": 0.01039386, "balance_loss_clip": 1.06842184, "balance_loss_mlp": 1.02826977, "epoch": 0.12865989298382732, "flos": 30846871008000.0, "grad_norm": 2.444841940877716, "language_loss": 0.83071041, "learning_rate": 3.898861553966252e-06, "loss": 0.85337591, "num_input_tokens_seen": 22668620, "step": 1070, "time_per_iteration": 2.813030242919922 }, { "auxiliary_loss_clip": 0.01219534, "auxiliary_loss_mlp": 0.01040052, "balance_loss_clip": 1.0686686, "balance_loss_mlp": 1.02909696, "epoch": 0.12878013587446643, "flos": 25885776251520.0, "grad_norm": 1.7699503314123766, "language_loss": 0.88045144, "learning_rate": 3.898616831206257e-06, "loss": 0.90304726, "num_input_tokens_seen": 22689045, "step": 1071, "time_per_iteration": 2.9164304733276367 }, { "auxiliary_loss_clip": 0.01226101, "auxiliary_loss_mlp": 0.0104686, "balance_loss_clip": 1.06647563, "balance_loss_mlp": 1.03520703, "epoch": 0.12890037876510552, "flos": 23333138277120.0, "grad_norm": 2.782497265187845, "language_loss": 0.76819426, "learning_rate": 3.8983718204276e-06, "loss": 0.79092383, "num_input_tokens_seen": 22711265, "step": 1072, "time_per_iteration": 2.7668535709381104 }, { "auxiliary_loss_clip": 0.01233472, "auxiliary_loss_mlp": 0.01044666, "balance_loss_clip": 1.06728363, "balance_loss_mlp": 1.03282309, "epoch": 0.1290206216557446, "flos": 23587242065280.0, "grad_norm": 1.8558449591479709, "language_loss": 0.82802331, "learning_rate": 3.898126521667446e-06, "loss": 0.85080469, "num_input_tokens_seen": 22731420, "step": 1073, "time_per_iteration": 2.779758930206299 }, { "auxiliary_loss_clip": 0.01241949, "auxiliary_loss_mlp": 0.01044438, "balance_loss_clip": 1.06764793, "balance_loss_mlp": 1.0325228, "epoch": 0.12914086454638368, "flos": 24170610850560.0, "grad_norm": 2.338693404319486, "language_loss": 0.83350742, "learning_rate": 3.897880934963007e-06, "loss": 0.85637134, "num_input_tokens_seen": 22750970, "step": 1074, "time_per_iteration": 2.8295273780822754 }, { "auxiliary_loss_clip": 0.01233849, "auxiliary_loss_mlp": 0.01045363, "balance_loss_clip": 1.06652188, "balance_loss_mlp": 1.03375196, "epoch": 0.1292611074370228, "flos": 20267157081600.0, "grad_norm": 2.09152338616218, "language_loss": 0.78458774, "learning_rate": 3.89763506035154e-06, "loss": 0.80737984, "num_input_tokens_seen": 22768820, "step": 1075, "time_per_iteration": 2.837815761566162 }, { "auxiliary_loss_clip": 0.01227002, "auxiliary_loss_mlp": 0.01037647, "balance_loss_clip": 1.06437039, "balance_loss_mlp": 1.02699518, "epoch": 0.12938135032766188, "flos": 27377684668800.0, "grad_norm": 1.8493537408189513, "language_loss": 0.81440294, "learning_rate": 3.897388897870343e-06, "loss": 0.83704942, "num_input_tokens_seen": 22789460, "step": 1076, "time_per_iteration": 2.8388702869415283 }, { "auxiliary_loss_clip": 0.01244907, "auxiliary_loss_mlp": 0.01039414, "balance_loss_clip": 1.06536651, "balance_loss_mlp": 1.0270406, "epoch": 0.12950159321830096, "flos": 29277107861760.0, "grad_norm": 3.1703183422545766, "language_loss": 0.75078499, "learning_rate": 3.89714244755676e-06, "loss": 0.77362812, "num_input_tokens_seen": 22810820, "step": 1077, "time_per_iteration": 2.899547576904297 }, { "auxiliary_loss_clip": 0.01212858, "auxiliary_loss_mlp": 0.01053083, "balance_loss_clip": 1.06306291, "balance_loss_mlp": 1.04114413, "epoch": 0.12962183610894007, "flos": 24534888629760.0, "grad_norm": 15.759361221458642, "language_loss": 0.86297357, "learning_rate": 3.896895709448175e-06, "loss": 0.88563299, "num_input_tokens_seen": 22830570, "step": 1078, "time_per_iteration": 2.854064464569092 }, { "auxiliary_loss_clip": 0.01211895, "auxiliary_loss_mlp": 0.0103549, "balance_loss_clip": 1.06057024, "balance_loss_mlp": 1.02434349, "epoch": 0.12974207899957915, "flos": 11215944552960.0, "grad_norm": 2.766225449255973, "language_loss": 0.77112281, "learning_rate": 3.896648683582019e-06, "loss": 0.79359663, "num_input_tokens_seen": 22845905, "step": 1079, "time_per_iteration": 2.7366139888763428 }, { "auxiliary_loss_clip": 0.01218295, "auxiliary_loss_mlp": 0.01038412, "balance_loss_clip": 1.06362712, "balance_loss_mlp": 1.02826691, "epoch": 0.12986232189021824, "flos": 24717889445760.0, "grad_norm": 2.359414291900923, "language_loss": 0.809587, "learning_rate": 3.896401369995766e-06, "loss": 0.83215404, "num_input_tokens_seen": 22865710, "step": 1080, "time_per_iteration": 2.8602755069732666 }, { "auxiliary_loss_clip": 0.01241222, "auxiliary_loss_mlp": 0.01037135, "balance_loss_clip": 1.06542945, "balance_loss_mlp": 1.02604246, "epoch": 0.12998256478085732, "flos": 23915357827200.0, "grad_norm": 1.7819133691771791, "language_loss": 0.79335105, "learning_rate": 3.896153768726932e-06, "loss": 0.81613457, "num_input_tokens_seen": 22886020, "step": 1081, "time_per_iteration": 2.9008946418762207 }, { "auxiliary_loss_clip": 0.0123494, "auxiliary_loss_mlp": 0.0103356, "balance_loss_clip": 1.06531632, "balance_loss_mlp": 1.02297974, "epoch": 0.13010280767149643, "flos": 18624207974400.0, "grad_norm": 3.7758578295513825, "language_loss": 0.88158464, "learning_rate": 3.8959058798130806e-06, "loss": 0.90426964, "num_input_tokens_seen": 22903995, "step": 1082, "time_per_iteration": 2.6911263465881348 }, { "auxiliary_loss_clip": 0.01234256, "auxiliary_loss_mlp": 0.01054847, "balance_loss_clip": 1.06761205, "balance_loss_mlp": 1.02508831, "epoch": 0.1302230505621355, "flos": 22783992174720.0, "grad_norm": 1.6504098811864072, "language_loss": 0.7509042, "learning_rate": 3.895657703291814e-06, "loss": 0.77379525, "num_input_tokens_seen": 22924100, "step": 1083, "time_per_iteration": 2.7975292205810547 }, { "auxiliary_loss_clip": 0.01242103, "auxiliary_loss_mlp": 0.01037049, "balance_loss_clip": 1.06351876, "balance_loss_mlp": 1.0258255, "epoch": 0.1303432934527746, "flos": 21323612920320.0, "grad_norm": 3.5291261575157935, "language_loss": 0.79754841, "learning_rate": 3.895409239200781e-06, "loss": 0.82033992, "num_input_tokens_seen": 22939985, "step": 1084, "time_per_iteration": 3.7529098987579346 }, { "auxiliary_loss_clip": 0.01228969, "auxiliary_loss_mlp": 0.01033541, "balance_loss_clip": 1.06349194, "balance_loss_mlp": 1.02286017, "epoch": 0.1304635363434137, "flos": 20922490765440.0, "grad_norm": 3.1015978407796423, "language_loss": 0.91220975, "learning_rate": 3.895160487577673e-06, "loss": 0.93483496, "num_input_tokens_seen": 22957555, "step": 1085, "time_per_iteration": 3.6477057933807373 }, { "auxiliary_loss_clip": 0.01128373, "auxiliary_loss_mlp": 0.01006889, "balance_loss_clip": 1.02396107, "balance_loss_mlp": 1.0042429, "epoch": 0.1305837792340528, "flos": 63245659080960.0, "grad_norm": 0.7910573973565805, "language_loss": 0.60875106, "learning_rate": 3.894911448460226e-06, "loss": 0.63010371, "num_input_tokens_seen": 23016870, "step": 1086, "time_per_iteration": 4.123334884643555 }, { "auxiliary_loss_clip": 0.01196726, "auxiliary_loss_mlp": 0.01042808, "balance_loss_clip": 1.06463218, "balance_loss_mlp": 1.03117371, "epoch": 0.13070402212469187, "flos": 26428852955520.0, "grad_norm": 2.0405633014836706, "language_loss": 0.72787267, "learning_rate": 3.8946621218862195e-06, "loss": 0.75026798, "num_input_tokens_seen": 23037870, "step": 1087, "time_per_iteration": 2.9683868885040283 }, { "auxiliary_loss_clip": 0.0122916, "auxiliary_loss_mlp": 0.01031924, "balance_loss_clip": 1.06616926, "balance_loss_mlp": 1.0213623, "epoch": 0.13082426501533098, "flos": 27673409341440.0, "grad_norm": 2.268794603046158, "language_loss": 0.88703722, "learning_rate": 3.894412507893475e-06, "loss": 0.90964806, "num_input_tokens_seen": 23058150, "step": 1088, "time_per_iteration": 2.8740756511688232 }, { "auxiliary_loss_clip": 0.01234018, "auxiliary_loss_mlp": 0.01042281, "balance_loss_clip": 1.06841052, "balance_loss_mlp": 1.03116441, "epoch": 0.13094450790597006, "flos": 24826770547200.0, "grad_norm": 2.2008917402461767, "language_loss": 0.72157294, "learning_rate": 3.894162606519859e-06, "loss": 0.74433595, "num_input_tokens_seen": 23077100, "step": 1089, "time_per_iteration": 3.860114812850952 }, { "auxiliary_loss_clip": 0.01221409, "auxiliary_loss_mlp": 0.01037594, "balance_loss_clip": 1.06477439, "balance_loss_mlp": 1.02669823, "epoch": 0.13106475079660915, "flos": 19062605468160.0, "grad_norm": 2.0402628149450206, "language_loss": 0.76930809, "learning_rate": 3.893912417803282e-06, "loss": 0.79189813, "num_input_tokens_seen": 23096815, "step": 1090, "time_per_iteration": 2.76599383354187 }, { "auxiliary_loss_clip": 0.01221259, "auxiliary_loss_mlp": 0.01036545, "balance_loss_clip": 1.06255507, "balance_loss_mlp": 1.02540493, "epoch": 0.13118499368724823, "flos": 28913189218560.0, "grad_norm": 1.7362706685488893, "language_loss": 0.76698673, "learning_rate": 3.8936619417816975e-06, "loss": 0.78956479, "num_input_tokens_seen": 23117145, "step": 1091, "time_per_iteration": 2.9165890216827393 }, { "auxiliary_loss_clip": 0.01226946, "auxiliary_loss_mlp": 0.01042564, "balance_loss_clip": 1.06611037, "balance_loss_mlp": 1.03133488, "epoch": 0.13130523657788734, "flos": 14283398206080.0, "grad_norm": 2.3263079883143973, "language_loss": 0.71939361, "learning_rate": 3.8934111784931015e-06, "loss": 0.74208879, "num_input_tokens_seen": 23134595, "step": 1092, "time_per_iteration": 2.838902235031128 }, { "auxiliary_loss_clip": 0.01129876, "auxiliary_loss_mlp": 0.01002856, "balance_loss_clip": 1.02389634, "balance_loss_mlp": 0.99998301, "epoch": 0.13142547946852642, "flos": 70174155519360.0, "grad_norm": 0.9337592149081884, "language_loss": 0.59082615, "learning_rate": 3.893160127975535e-06, "loss": 0.61215341, "num_input_tokens_seen": 23195285, "step": 1093, "time_per_iteration": 3.491894483566284 }, { "auxiliary_loss_clip": 0.01231447, "auxiliary_loss_mlp": 0.01039285, "balance_loss_clip": 1.06649888, "balance_loss_mlp": 1.02866316, "epoch": 0.1315457223591655, "flos": 45805998844800.0, "grad_norm": 2.3366294492373743, "language_loss": 0.81328863, "learning_rate": 3.8929087902670826e-06, "loss": 0.83599591, "num_input_tokens_seen": 23216915, "step": 1094, "time_per_iteration": 2.960214853286743 }, { "auxiliary_loss_clip": 0.01132873, "auxiliary_loss_mlp": 0.010028, "balance_loss_clip": 1.02314019, "balance_loss_mlp": 0.99996233, "epoch": 0.13166596524980462, "flos": 62881165820160.0, "grad_norm": 0.926666800462546, "language_loss": 0.60745966, "learning_rate": 3.8926571654058715e-06, "loss": 0.62881643, "num_input_tokens_seen": 23273560, "step": 1095, "time_per_iteration": 3.1818108558654785 }, { "auxiliary_loss_clip": 0.01223977, "auxiliary_loss_mlp": 0.01050988, "balance_loss_clip": 1.06377447, "balance_loss_mlp": 1.03975224, "epoch": 0.1317862081404437, "flos": 23586523793280.0, "grad_norm": 2.2544055151967193, "language_loss": 0.76643479, "learning_rate": 3.892405253430074e-06, "loss": 0.78918433, "num_input_tokens_seen": 23291080, "step": 1096, "time_per_iteration": 2.8486361503601074 }, { "auxiliary_loss_clip": 0.01236677, "auxiliary_loss_mlp": 0.01053802, "balance_loss_clip": 1.06599903, "balance_loss_mlp": 1.02423835, "epoch": 0.13190645103108278, "flos": 20260764460800.0, "grad_norm": 1.922063926904009, "language_loss": 0.82720435, "learning_rate": 3.892153054377904e-06, "loss": 0.85010916, "num_input_tokens_seen": 23308485, "step": 1097, "time_per_iteration": 2.825011968612671 }, { "auxiliary_loss_clip": 0.01119823, "auxiliary_loss_mlp": 0.01028929, "balance_loss_clip": 1.03004313, "balance_loss_mlp": 1.02622306, "epoch": 0.13202669392172187, "flos": 53455440136320.0, "grad_norm": 0.9362778718611668, "language_loss": 0.59428835, "learning_rate": 3.891900568287619e-06, "loss": 0.61577588, "num_input_tokens_seen": 23360870, "step": 1098, "time_per_iteration": 3.199488401412964 }, { "auxiliary_loss_clip": 0.01231961, "auxiliary_loss_mlp": 0.01039105, "balance_loss_clip": 1.06534982, "balance_loss_mlp": 1.0276078, "epoch": 0.13214693681236098, "flos": 15851293845120.0, "grad_norm": 2.899715260881156, "language_loss": 0.72302562, "learning_rate": 3.891647795197523e-06, "loss": 0.74573636, "num_input_tokens_seen": 23376910, "step": 1099, "time_per_iteration": 2.7459216117858887 }, { "auxiliary_loss_clip": 0.01233985, "auxiliary_loss_mlp": 0.01038177, "balance_loss_clip": 1.06330848, "balance_loss_mlp": 1.02608347, "epoch": 0.13226717970300006, "flos": 19353840940800.0, "grad_norm": 2.259721534991545, "language_loss": 0.69050515, "learning_rate": 3.8913947351459605e-06, "loss": 0.7132268, "num_input_tokens_seen": 23394450, "step": 1100, "time_per_iteration": 2.801035165786743 }, { "auxiliary_loss_clip": 0.01236652, "auxiliary_loss_mlp": 0.01040594, "balance_loss_clip": 1.06231999, "balance_loss_mlp": 1.03015089, "epoch": 0.13238742259363914, "flos": 20698084546560.0, "grad_norm": 2.192632067011124, "language_loss": 0.67539287, "learning_rate": 3.89114138817132e-06, "loss": 0.6981653, "num_input_tokens_seen": 23411115, "step": 1101, "time_per_iteration": 2.8167455196380615 }, { "auxiliary_loss_clip": 0.01234079, "auxiliary_loss_mlp": 0.01036087, "balance_loss_clip": 1.06643724, "balance_loss_mlp": 1.02477455, "epoch": 0.13250766548427825, "flos": 21032449274880.0, "grad_norm": 1.8186952729691337, "language_loss": 0.84345675, "learning_rate": 3.890887754312035e-06, "loss": 0.86615843, "num_input_tokens_seen": 23429360, "step": 1102, "time_per_iteration": 2.73189377784729 }, { "auxiliary_loss_clip": 0.01224369, "auxiliary_loss_mlp": 0.01039596, "balance_loss_clip": 1.06067622, "balance_loss_mlp": 1.02759194, "epoch": 0.13262790837491734, "flos": 22637871648000.0, "grad_norm": 1.7931476993494122, "language_loss": 0.8781637, "learning_rate": 3.890633833606581e-06, "loss": 0.90080333, "num_input_tokens_seen": 23449050, "step": 1103, "time_per_iteration": 2.761997938156128 }, { "auxiliary_loss_clip": 0.01234204, "auxiliary_loss_mlp": 0.010347, "balance_loss_clip": 1.06657243, "balance_loss_mlp": 1.02413225, "epoch": 0.13274815126555642, "flos": 19683141851520.0, "grad_norm": 1.9165271734379792, "language_loss": 0.69652784, "learning_rate": 3.890379626093477e-06, "loss": 0.71921694, "num_input_tokens_seen": 23468800, "step": 1104, "time_per_iteration": 2.798722267150879 }, { "auxiliary_loss_clip": 0.01209333, "auxiliary_loss_mlp": 0.0103746, "balance_loss_clip": 1.06205654, "balance_loss_mlp": 1.02563429, "epoch": 0.1328683941561955, "flos": 21317687176320.0, "grad_norm": 2.055116959404213, "language_loss": 0.92210835, "learning_rate": 3.890125131811287e-06, "loss": 0.94457632, "num_input_tokens_seen": 23486850, "step": 1105, "time_per_iteration": 2.858773708343506 }, { "auxiliary_loss_clip": 0.01221333, "auxiliary_loss_mlp": 0.01034935, "balance_loss_clip": 1.06178403, "balance_loss_mlp": 1.0244683, "epoch": 0.1329886370468346, "flos": 13699131580800.0, "grad_norm": 1.9176670166563272, "language_loss": 0.7531209, "learning_rate": 3.889870350798618e-06, "loss": 0.77568364, "num_input_tokens_seen": 23504195, "step": 1106, "time_per_iteration": 2.911362409591675 }, { "auxiliary_loss_clip": 0.0123824, "auxiliary_loss_mlp": 0.01036128, "balance_loss_clip": 1.06227434, "balance_loss_mlp": 1.02520239, "epoch": 0.1331088799374737, "flos": 21032413361280.0, "grad_norm": 1.5882104334162577, "language_loss": 0.78612721, "learning_rate": 3.889615283094119e-06, "loss": 0.80887085, "num_input_tokens_seen": 23523385, "step": 1107, "time_per_iteration": 2.934587001800537 }, { "auxiliary_loss_clip": 0.01243917, "auxiliary_loss_mlp": 0.01037985, "balance_loss_clip": 1.06319559, "balance_loss_mlp": 1.02612364, "epoch": 0.13322912282811278, "flos": 18260432985600.0, "grad_norm": 2.440865123490901, "language_loss": 0.84867221, "learning_rate": 3.889359928736485e-06, "loss": 0.87149119, "num_input_tokens_seen": 23541330, "step": 1108, "time_per_iteration": 2.761753559112549 }, { "auxiliary_loss_clip": 0.01227214, "auxiliary_loss_mlp": 0.01054841, "balance_loss_clip": 1.06465054, "balance_loss_mlp": 1.02525806, "epoch": 0.1333493657187519, "flos": 24460876656000.0, "grad_norm": 2.692105287118973, "language_loss": 0.91270256, "learning_rate": 3.889104287764451e-06, "loss": 0.93552309, "num_input_tokens_seen": 23561705, "step": 1109, "time_per_iteration": 2.7253310680389404 }, { "auxiliary_loss_clip": 0.01225193, "auxiliary_loss_mlp": 0.01037571, "balance_loss_clip": 1.0637145, "balance_loss_mlp": 1.02724183, "epoch": 0.13346960860939097, "flos": 22158930677760.0, "grad_norm": 2.058048608683929, "language_loss": 0.90640616, "learning_rate": 3.888848360216798e-06, "loss": 0.92903376, "num_input_tokens_seen": 23579350, "step": 1110, "time_per_iteration": 3.79683518409729 }, { "auxiliary_loss_clip": 0.0113139, "auxiliary_loss_mlp": 0.01020814, "balance_loss_clip": 1.02125359, "balance_loss_mlp": 1.01792955, "epoch": 0.13358985150003005, "flos": 67931212608000.0, "grad_norm": 0.8158345649411362, "language_loss": 0.56557655, "learning_rate": 3.888592146132351e-06, "loss": 0.5870986, "num_input_tokens_seen": 23640620, "step": 1111, "time_per_iteration": 4.344780445098877 }, { "auxiliary_loss_clip": 0.01235059, "auxiliary_loss_mlp": 0.01033568, "balance_loss_clip": 1.06469393, "balance_loss_mlp": 1.02276134, "epoch": 0.13371009439066917, "flos": 26834284742400.0, "grad_norm": 2.4252060000693736, "language_loss": 0.7831614, "learning_rate": 3.888335645549978e-06, "loss": 0.80584764, "num_input_tokens_seen": 23661040, "step": 1112, "time_per_iteration": 3.6668307781219482 }, { "auxiliary_loss_clip": 0.01238369, "auxiliary_loss_mlp": 0.01036757, "balance_loss_clip": 1.06297064, "balance_loss_mlp": 1.02556944, "epoch": 0.13383033728130825, "flos": 26322844942080.0, "grad_norm": 2.5659729883422937, "language_loss": 0.81789833, "learning_rate": 3.888078858508588e-06, "loss": 0.84064966, "num_input_tokens_seen": 23680900, "step": 1113, "time_per_iteration": 2.7173478603363037 }, { "auxiliary_loss_clip": 0.012273, "auxiliary_loss_mlp": 0.01037634, "balance_loss_clip": 1.06428409, "balance_loss_mlp": 1.02698874, "epoch": 0.13395058017194733, "flos": 22563931501440.0, "grad_norm": 1.792039514596871, "language_loss": 0.84626925, "learning_rate": 3.8878217850471365e-06, "loss": 0.8689186, "num_input_tokens_seen": 23700815, "step": 1114, "time_per_iteration": 2.8225250244140625 }, { "auxiliary_loss_clip": 0.01246062, "auxiliary_loss_mlp": 0.01035926, "balance_loss_clip": 1.06793928, "balance_loss_mlp": 1.02580488, "epoch": 0.13407082306258641, "flos": 25810938264960.0, "grad_norm": 2.0291930176476836, "language_loss": 0.74001026, "learning_rate": 3.887564425204621e-06, "loss": 0.76283014, "num_input_tokens_seen": 23722500, "step": 1115, "time_per_iteration": 4.467906713485718 }, { "auxiliary_loss_clip": 0.01120196, "auxiliary_loss_mlp": 0.01002439, "balance_loss_clip": 1.0224334, "balance_loss_mlp": 0.99948233, "epoch": 0.13419106595322552, "flos": 68338365269760.0, "grad_norm": 0.8458411267824909, "language_loss": 0.54609323, "learning_rate": 3.887306779020083e-06, "loss": 0.56731957, "num_input_tokens_seen": 23777155, "step": 1116, "time_per_iteration": 3.3777780532836914 }, { "auxiliary_loss_clip": 0.01235654, "auxiliary_loss_mlp": 0.01033233, "balance_loss_clip": 1.06318915, "balance_loss_mlp": 1.02256978, "epoch": 0.1343113088438646, "flos": 20449080489600.0, "grad_norm": 2.1821591440351376, "language_loss": 0.69984603, "learning_rate": 3.887048846532608e-06, "loss": 0.72253489, "num_input_tokens_seen": 23794130, "step": 1117, "time_per_iteration": 2.834641456604004 }, { "auxiliary_loss_clip": 0.01119575, "auxiliary_loss_mlp": 0.01004609, "balance_loss_clip": 1.01865208, "balance_loss_mlp": 1.00162911, "epoch": 0.1344315517345037, "flos": 67389784951680.0, "grad_norm": 0.7599037618115193, "language_loss": 0.58098865, "learning_rate": 3.8867906277813224e-06, "loss": 0.60223043, "num_input_tokens_seen": 23852285, "step": 1118, "time_per_iteration": 3.3232855796813965 }, { "auxiliary_loss_clip": 0.01240103, "auxiliary_loss_mlp": 0.01062087, "balance_loss_clip": 1.06452036, "balance_loss_mlp": 1.03264976, "epoch": 0.1345517946251428, "flos": 40734442788480.0, "grad_norm": 2.1961264893836083, "language_loss": 0.7416302, "learning_rate": 3.886532122805399e-06, "loss": 0.76465201, "num_input_tokens_seen": 23874765, "step": 1119, "time_per_iteration": 2.9365949630737305 }, { "auxiliary_loss_clip": 0.01213658, "auxiliary_loss_mlp": 0.01044581, "balance_loss_clip": 1.06849909, "balance_loss_mlp": 1.03376234, "epoch": 0.13467203751578188, "flos": 22816850140800.0, "grad_norm": 3.610932723623974, "language_loss": 0.8987366, "learning_rate": 3.886273331644053e-06, "loss": 0.92131907, "num_input_tokens_seen": 23893635, "step": 1120, "time_per_iteration": 2.8547093868255615 }, { "auxiliary_loss_clip": 0.01216409, "auxiliary_loss_mlp": 0.01036286, "balance_loss_clip": 1.06302977, "balance_loss_mlp": 1.02505112, "epoch": 0.13479228040642097, "flos": 17091576512640.0, "grad_norm": 2.0822918325441275, "language_loss": 0.82419038, "learning_rate": 3.886014254336542e-06, "loss": 0.84671736, "num_input_tokens_seen": 23910110, "step": 1121, "time_per_iteration": 2.8441028594970703 }, { "auxiliary_loss_clip": 0.01234521, "auxiliary_loss_mlp": 0.01043105, "balance_loss_clip": 1.06527412, "balance_loss_mlp": 1.03268635, "epoch": 0.13491252329706005, "flos": 23730525417600.0, "grad_norm": 1.90419647237803, "language_loss": 0.92466736, "learning_rate": 3.885754890922168e-06, "loss": 0.9474436, "num_input_tokens_seen": 23930440, "step": 1122, "time_per_iteration": 2.8541336059570312 }, { "auxiliary_loss_clip": 0.01208293, "auxiliary_loss_mlp": 0.01039076, "balance_loss_clip": 1.06459904, "balance_loss_mlp": 1.02726269, "epoch": 0.13503276618769916, "flos": 34127058960000.0, "grad_norm": 1.7027893916036152, "language_loss": 0.78257877, "learning_rate": 3.885495241440277e-06, "loss": 0.8050524, "num_input_tokens_seen": 23954535, "step": 1123, "time_per_iteration": 2.993713617324829 }, { "auxiliary_loss_clip": 0.0123704, "auxiliary_loss_mlp": 0.0104177, "balance_loss_clip": 1.06288302, "balance_loss_mlp": 1.030743, "epoch": 0.13515300907833824, "flos": 17712328377600.0, "grad_norm": 1.798572505052952, "language_loss": 0.74180698, "learning_rate": 3.885235305930257e-06, "loss": 0.76459503, "num_input_tokens_seen": 23972735, "step": 1124, "time_per_iteration": 2.880706787109375 }, { "auxiliary_loss_clip": 0.01225206, "auxiliary_loss_mlp": 0.01045386, "balance_loss_clip": 1.06802011, "balance_loss_mlp": 1.03391218, "epoch": 0.13527325196897733, "flos": 20260872201600.0, "grad_norm": 2.0749598010581636, "language_loss": 0.8564651, "learning_rate": 3.884975084431539e-06, "loss": 0.87917107, "num_input_tokens_seen": 23987685, "step": 1125, "time_per_iteration": 2.8948769569396973 }, { "auxiliary_loss_clip": 0.01225384, "auxiliary_loss_mlp": 0.01060472, "balance_loss_clip": 1.06235516, "balance_loss_mlp": 1.03185081, "epoch": 0.13539349485961644, "flos": 18186492839040.0, "grad_norm": 4.375283616125121, "language_loss": 0.91741204, "learning_rate": 3.8847145769836e-06, "loss": 0.94027054, "num_input_tokens_seen": 24004105, "step": 1126, "time_per_iteration": 2.7978463172912598 }, { "auxiliary_loss_clip": 0.01245831, "auxiliary_loss_mlp": 0.01047559, "balance_loss_clip": 1.0659523, "balance_loss_mlp": 1.03646708, "epoch": 0.13551373775025552, "flos": 19317463441920.0, "grad_norm": 2.3393266925606606, "language_loss": 0.65931976, "learning_rate": 3.884453783625959e-06, "loss": 0.68225372, "num_input_tokens_seen": 24021715, "step": 1127, "time_per_iteration": 2.7614519596099854 }, { "auxiliary_loss_clip": 0.01227185, "auxiliary_loss_mlp": 0.01039476, "balance_loss_clip": 1.06381977, "balance_loss_mlp": 1.02930164, "epoch": 0.1356339806408946, "flos": 20850813175680.0, "grad_norm": 2.285499330550791, "language_loss": 0.84828544, "learning_rate": 3.884192704398176e-06, "loss": 0.87095207, "num_input_tokens_seen": 24038915, "step": 1128, "time_per_iteration": 2.8117988109588623 }, { "auxiliary_loss_clip": 0.01237315, "auxiliary_loss_mlp": 0.01042519, "balance_loss_clip": 1.06380427, "balance_loss_mlp": 1.03133154, "epoch": 0.13575422353153369, "flos": 50476037696640.0, "grad_norm": 1.6181975760010048, "language_loss": 0.74455726, "learning_rate": 3.883931339339858e-06, "loss": 0.76735562, "num_input_tokens_seen": 24063300, "step": 1129, "time_per_iteration": 3.0198352336883545 }, { "auxiliary_loss_clip": 0.01240883, "auxiliary_loss_mlp": 0.01038964, "balance_loss_clip": 1.06415272, "balance_loss_mlp": 1.02741861, "epoch": 0.1358744664221728, "flos": 18150797698560.0, "grad_norm": 2.1099270864054422, "language_loss": 0.78467774, "learning_rate": 3.883669688490654e-06, "loss": 0.80747616, "num_input_tokens_seen": 24081070, "step": 1130, "time_per_iteration": 2.6915442943573 }, { "auxiliary_loss_clip": 0.01217211, "auxiliary_loss_mlp": 0.01057405, "balance_loss_clip": 1.06277013, "balance_loss_mlp": 1.02649391, "epoch": 0.13599470931281188, "flos": 18442966924800.0, "grad_norm": 2.0758940599559708, "language_loss": 0.8562575, "learning_rate": 3.883407751890256e-06, "loss": 0.87900364, "num_input_tokens_seen": 24099675, "step": 1131, "time_per_iteration": 2.8219704627990723 }, { "auxiliary_loss_clip": 0.01222625, "auxiliary_loss_mlp": 0.01042646, "balance_loss_clip": 1.06106162, "balance_loss_mlp": 1.03159547, "epoch": 0.13611495220345096, "flos": 26680766014080.0, "grad_norm": 1.9391979398930153, "language_loss": 0.85807532, "learning_rate": 3.8831455295783994e-06, "loss": 0.88072807, "num_input_tokens_seen": 24118925, "step": 1132, "time_per_iteration": 2.869452953338623 }, { "auxiliary_loss_clip": 0.01225967, "auxiliary_loss_mlp": 0.01044717, "balance_loss_clip": 1.06488323, "balance_loss_mlp": 1.03323758, "epoch": 0.13623519509409007, "flos": 21686238673920.0, "grad_norm": 1.7816332308223881, "language_loss": 0.74101239, "learning_rate": 3.882883021594864e-06, "loss": 0.7637192, "num_input_tokens_seen": 24137065, "step": 1133, "time_per_iteration": 2.7656586170196533 }, { "auxiliary_loss_clip": 0.01219738, "auxiliary_loss_mlp": 0.01039932, "balance_loss_clip": 1.06490982, "balance_loss_mlp": 1.02968037, "epoch": 0.13635543798472916, "flos": 14830389492480.0, "grad_norm": 2.6148408659262876, "language_loss": 0.86930597, "learning_rate": 3.8826202279794705e-06, "loss": 0.89190269, "num_input_tokens_seen": 24154125, "step": 1134, "time_per_iteration": 2.8445992469787598 }, { "auxiliary_loss_clip": 0.01242036, "auxiliary_loss_mlp": 0.01044092, "balance_loss_clip": 1.064906, "balance_loss_mlp": 1.03235614, "epoch": 0.13647568087536824, "flos": 22890323410560.0, "grad_norm": 1.997055910214939, "language_loss": 0.7032119, "learning_rate": 3.882357148772085e-06, "loss": 0.72607315, "num_input_tokens_seen": 24171550, "step": 1135, "time_per_iteration": 2.744658946990967 }, { "auxiliary_loss_clip": 0.01217095, "auxiliary_loss_mlp": 0.01039808, "balance_loss_clip": 1.06625557, "balance_loss_mlp": 1.02932954, "epoch": 0.13659592376600732, "flos": 19937927998080.0, "grad_norm": 2.3881945319459232, "language_loss": 0.84778559, "learning_rate": 3.882093784012617e-06, "loss": 0.87035465, "num_input_tokens_seen": 24190190, "step": 1136, "time_per_iteration": 2.780503749847412 }, { "auxiliary_loss_clip": 0.01229633, "auxiliary_loss_mlp": 0.010348, "balance_loss_clip": 1.06472301, "balance_loss_mlp": 1.0235709, "epoch": 0.13671616665664643, "flos": 21428579439360.0, "grad_norm": 1.7084018946890256, "language_loss": 0.8409394, "learning_rate": 3.881830133741019e-06, "loss": 0.86358368, "num_input_tokens_seen": 24209055, "step": 1137, "time_per_iteration": 4.64448618888855 }, { "auxiliary_loss_clip": 0.01226867, "auxiliary_loss_mlp": 0.01042634, "balance_loss_clip": 1.06662369, "balance_loss_mlp": 1.03188729, "epoch": 0.13683640954728551, "flos": 22778138257920.0, "grad_norm": 2.0961580786691334, "language_loss": 0.7653451, "learning_rate": 3.881566197997285e-06, "loss": 0.78804004, "num_input_tokens_seen": 24225490, "step": 1138, "time_per_iteration": 3.704754114151001 }, { "auxiliary_loss_clip": 0.01223472, "auxiliary_loss_mlp": 0.01035743, "balance_loss_clip": 1.06348348, "balance_loss_mlp": 1.02572298, "epoch": 0.1369566524379246, "flos": 21725884310400.0, "grad_norm": 1.5766490366493107, "language_loss": 0.74978822, "learning_rate": 3.881301976821456e-06, "loss": 0.77238041, "num_input_tokens_seen": 24245520, "step": 1139, "time_per_iteration": 2.8349342346191406 }, { "auxiliary_loss_clip": 0.01230237, "auxiliary_loss_mlp": 0.01043642, "balance_loss_clip": 1.06380773, "balance_loss_mlp": 1.03304446, "epoch": 0.1370768953285637, "flos": 18624459369600.0, "grad_norm": 2.9439056680282576, "language_loss": 0.90676558, "learning_rate": 3.881037470253612e-06, "loss": 0.92950428, "num_input_tokens_seen": 24265035, "step": 1140, "time_per_iteration": 3.827817440032959 }, { "auxiliary_loss_clip": 0.01219189, "auxiliary_loss_mlp": 0.01044271, "balance_loss_clip": 1.06387496, "balance_loss_mlp": 1.03323781, "epoch": 0.1371971382192028, "flos": 14939521989120.0, "grad_norm": 2.5756557423476205, "language_loss": 0.80042952, "learning_rate": 3.88077267833388e-06, "loss": 0.82306415, "num_input_tokens_seen": 24281550, "step": 1141, "time_per_iteration": 2.721512794494629 }, { "auxiliary_loss_clip": 0.01213023, "auxiliary_loss_mlp": 0.01042347, "balance_loss_clip": 1.06365681, "balance_loss_mlp": 1.03056312, "epoch": 0.13731738110984187, "flos": 19023785844480.0, "grad_norm": 2.3014386554057715, "language_loss": 0.83905685, "learning_rate": 3.880507601102427e-06, "loss": 0.86161059, "num_input_tokens_seen": 24299485, "step": 1142, "time_per_iteration": 2.789883613586426 }, { "auxiliary_loss_clip": 0.01237029, "auxiliary_loss_mlp": 0.01039683, "balance_loss_clip": 1.06386304, "balance_loss_mlp": 1.02807832, "epoch": 0.13743762400048098, "flos": 18187462506240.0, "grad_norm": 1.8712231084488609, "language_loss": 0.82259202, "learning_rate": 3.880242238599467e-06, "loss": 0.84535921, "num_input_tokens_seen": 24316010, "step": 1143, "time_per_iteration": 2.6474220752716064 }, { "auxiliary_loss_clip": 0.01235948, "auxiliary_loss_mlp": 0.01037419, "balance_loss_clip": 1.06294668, "balance_loss_mlp": 1.02689266, "epoch": 0.13755786689112007, "flos": 21031982398080.0, "grad_norm": 6.496697151459419, "language_loss": 0.83315432, "learning_rate": 3.879976590865254e-06, "loss": 0.85588795, "num_input_tokens_seen": 24335465, "step": 1144, "time_per_iteration": 2.784237861633301 }, { "auxiliary_loss_clip": 0.01224502, "auxiliary_loss_mlp": 0.0103858, "balance_loss_clip": 1.06202173, "balance_loss_mlp": 1.02800584, "epoch": 0.13767810978175915, "flos": 21360636864000.0, "grad_norm": 2.1808958849060454, "language_loss": 0.87465727, "learning_rate": 3.879710657940087e-06, "loss": 0.89728808, "num_input_tokens_seen": 24354415, "step": 1145, "time_per_iteration": 2.762718915939331 }, { "auxiliary_loss_clip": 0.01234674, "auxiliary_loss_mlp": 0.01040228, "balance_loss_clip": 1.06431234, "balance_loss_mlp": 1.02908182, "epoch": 0.13779835267239823, "flos": 30592084861440.0, "grad_norm": 2.0761445633402107, "language_loss": 0.70057881, "learning_rate": 3.879444439864308e-06, "loss": 0.72332776, "num_input_tokens_seen": 24373990, "step": 1146, "time_per_iteration": 2.819509744644165 }, { "auxiliary_loss_clip": 0.01231079, "auxiliary_loss_mlp": 0.01060262, "balance_loss_clip": 1.06122875, "balance_loss_mlp": 1.02764344, "epoch": 0.13791859556303734, "flos": 22669867687680.0, "grad_norm": 1.8637070272267686, "language_loss": 0.86421669, "learning_rate": 3.879177936678301e-06, "loss": 0.88713014, "num_input_tokens_seen": 24392995, "step": 1147, "time_per_iteration": 2.8132259845733643 }, { "auxiliary_loss_clip": 0.01234803, "auxiliary_loss_mlp": 0.01049109, "balance_loss_clip": 1.06628323, "balance_loss_mlp": 1.03720641, "epoch": 0.13803883845367643, "flos": 35224166016000.0, "grad_norm": 2.397046202143032, "language_loss": 0.77062714, "learning_rate": 3.878911148422496e-06, "loss": 0.79346621, "num_input_tokens_seen": 24414470, "step": 1148, "time_per_iteration": 2.9172322750091553 }, { "auxiliary_loss_clip": 0.01236237, "auxiliary_loss_mlp": 0.01036656, "balance_loss_clip": 1.06472969, "balance_loss_mlp": 1.02555168, "epoch": 0.1381590813443155, "flos": 32014542332160.0, "grad_norm": 2.4000939173393494, "language_loss": 0.70573813, "learning_rate": 3.878644075137364e-06, "loss": 0.72846705, "num_input_tokens_seen": 24435120, "step": 1149, "time_per_iteration": 2.8991799354553223 }, { "auxiliary_loss_clip": 0.01208241, "auxiliary_loss_mlp": 0.01039729, "balance_loss_clip": 1.06347096, "balance_loss_mlp": 1.02857113, "epoch": 0.13827932423495462, "flos": 17821855923840.0, "grad_norm": 1.9841752982390315, "language_loss": 0.79517448, "learning_rate": 3.878376716863418e-06, "loss": 0.81765419, "num_input_tokens_seen": 24451420, "step": 1150, "time_per_iteration": 2.8363118171691895 }, { "auxiliary_loss_clip": 0.01228025, "auxiliary_loss_mlp": 0.01042793, "balance_loss_clip": 1.06296372, "balance_loss_mlp": 1.03112817, "epoch": 0.1383995671255937, "flos": 19427098728960.0, "grad_norm": 2.1435816939114436, "language_loss": 0.71578234, "learning_rate": 3.878109073641219e-06, "loss": 0.73849058, "num_input_tokens_seen": 24470450, "step": 1151, "time_per_iteration": 2.8171470165252686 }, { "auxiliary_loss_clip": 0.01215343, "auxiliary_loss_mlp": 0.01040479, "balance_loss_clip": 1.06251383, "balance_loss_mlp": 1.02906442, "epoch": 0.13851981001623279, "flos": 28296603331200.0, "grad_norm": 1.654925357926291, "language_loss": 0.81090987, "learning_rate": 3.877841145511366e-06, "loss": 0.83346808, "num_input_tokens_seen": 24493190, "step": 1152, "time_per_iteration": 2.973691463470459 }, { "auxiliary_loss_clip": 0.0124113, "auxiliary_loss_mlp": 0.0104296, "balance_loss_clip": 1.06665754, "balance_loss_mlp": 1.03093827, "epoch": 0.13864005290687187, "flos": 21213079793280.0, "grad_norm": 1.8701386474428872, "language_loss": 0.82753241, "learning_rate": 3.8775729325145035e-06, "loss": 0.85037333, "num_input_tokens_seen": 24512425, "step": 1153, "time_per_iteration": 2.8113152980804443 }, { "auxiliary_loss_clip": 0.01110408, "auxiliary_loss_mlp": 0.01025149, "balance_loss_clip": 1.02201295, "balance_loss_mlp": 1.02263403, "epoch": 0.13876029579751098, "flos": 71653389413760.0, "grad_norm": 0.7893258587095688, "language_loss": 0.6469565, "learning_rate": 3.877304434691321e-06, "loss": 0.66831213, "num_input_tokens_seen": 24579275, "step": 1154, "time_per_iteration": 3.458279609680176 }, { "auxiliary_loss_clip": 0.01226725, "auxiliary_loss_mlp": 0.0103787, "balance_loss_clip": 1.06502473, "balance_loss_mlp": 1.02604449, "epoch": 0.13888053868815006, "flos": 21941348042880.0, "grad_norm": 1.9287258201476234, "language_loss": 0.79775929, "learning_rate": 3.877035652082548e-06, "loss": 0.82040524, "num_input_tokens_seen": 24598720, "step": 1155, "time_per_iteration": 2.786724090576172 }, { "auxiliary_loss_clip": 0.01225279, "auxiliary_loss_mlp": 0.01043884, "balance_loss_clip": 1.06810284, "balance_loss_mlp": 1.0320642, "epoch": 0.13900078157878915, "flos": 19608627087360.0, "grad_norm": 1.936039308081606, "language_loss": 0.85604835, "learning_rate": 3.87676658472896e-06, "loss": 0.87873995, "num_input_tokens_seen": 24617530, "step": 1156, "time_per_iteration": 2.933985710144043 }, { "auxiliary_loss_clip": 0.01238029, "auxiliary_loss_mlp": 0.01042551, "balance_loss_clip": 1.0650835, "balance_loss_mlp": 1.02954483, "epoch": 0.13912102446942826, "flos": 22638051216000.0, "grad_norm": 1.8914924934649886, "language_loss": 0.85534239, "learning_rate": 3.876497232671372e-06, "loss": 0.8781482, "num_input_tokens_seen": 24637485, "step": 1157, "time_per_iteration": 2.882725954055786 }, { "auxiliary_loss_clip": 0.01225082, "auxiliary_loss_mlp": 0.01033938, "balance_loss_clip": 1.06251335, "balance_loss_mlp": 1.02277386, "epoch": 0.13924126736006734, "flos": 29643324975360.0, "grad_norm": 2.783341555784092, "language_loss": 0.83851016, "learning_rate": 3.876227595950647e-06, "loss": 0.86110038, "num_input_tokens_seen": 24656915, "step": 1158, "time_per_iteration": 2.9172275066375732 }, { "auxiliary_loss_clip": 0.01239282, "auxiliary_loss_mlp": 0.0104313, "balance_loss_clip": 1.06421328, "balance_loss_mlp": 1.03195989, "epoch": 0.13936151025070642, "flos": 27417653527680.0, "grad_norm": 1.7284625449046507, "language_loss": 0.79015541, "learning_rate": 3.875957674607686e-06, "loss": 0.81297952, "num_input_tokens_seen": 24679190, "step": 1159, "time_per_iteration": 2.9713406562805176 }, { "auxiliary_loss_clip": 0.0122557, "auxiliary_loss_mlp": 0.01062258, "balance_loss_clip": 1.0613203, "balance_loss_mlp": 1.02946198, "epoch": 0.1394817531413455, "flos": 16399326625920.0, "grad_norm": 1.747869907303408, "language_loss": 0.88012004, "learning_rate": 3.8756874686834386e-06, "loss": 0.90299833, "num_input_tokens_seen": 24697405, "step": 1160, "time_per_iteration": 2.759495735168457 }, { "auxiliary_loss_clip": 0.01238371, "auxiliary_loss_mlp": 0.01062533, "balance_loss_clip": 1.06285882, "balance_loss_mlp": 1.03053212, "epoch": 0.13960199603198462, "flos": 30922319525760.0, "grad_norm": 1.8128807498848833, "language_loss": 0.80383217, "learning_rate": 3.875416978218893e-06, "loss": 0.82684124, "num_input_tokens_seen": 24720600, "step": 1161, "time_per_iteration": 2.9098422527313232 }, { "auxiliary_loss_clip": 0.01239832, "auxiliary_loss_mlp": 0.01044523, "balance_loss_clip": 1.06578565, "balance_loss_mlp": 1.03195834, "epoch": 0.1397222389226237, "flos": 18113773754880.0, "grad_norm": 4.510795889884032, "language_loss": 0.82805443, "learning_rate": 3.8751462032550835e-06, "loss": 0.85089803, "num_input_tokens_seen": 24737605, "step": 1162, "time_per_iteration": 2.852797031402588 }, { "auxiliary_loss_clip": 0.01224939, "auxiliary_loss_mlp": 0.01032505, "balance_loss_clip": 1.06500638, "balance_loss_mlp": 1.0222652, "epoch": 0.13984248181326278, "flos": 16872772815360.0, "grad_norm": 3.0468969409275597, "language_loss": 0.83084726, "learning_rate": 3.874875143833085e-06, "loss": 0.85342169, "num_input_tokens_seen": 24755845, "step": 1163, "time_per_iteration": 3.8258557319641113 }, { "auxiliary_loss_clip": 0.01232173, "auxiliary_loss_mlp": 0.01039174, "balance_loss_clip": 1.06359863, "balance_loss_mlp": 1.02659702, "epoch": 0.1399627247039019, "flos": 54121401267840.0, "grad_norm": 3.4738399201485044, "language_loss": 0.68930662, "learning_rate": 3.874603799994019e-06, "loss": 0.71202016, "num_input_tokens_seen": 24779380, "step": 1164, "time_per_iteration": 4.919122934341431 }, { "auxiliary_loss_clip": 0.01219195, "auxiliary_loss_mlp": 0.01038798, "balance_loss_clip": 1.06522155, "balance_loss_mlp": 1.02807558, "epoch": 0.14008296759454097, "flos": 11765521618560.0, "grad_norm": 1.925270573498952, "language_loss": 0.86923444, "learning_rate": 3.874332171779046e-06, "loss": 0.89181441, "num_input_tokens_seen": 24794260, "step": 1165, "time_per_iteration": 2.8027212619781494 }, { "auxiliary_loss_clip": 0.01219982, "auxiliary_loss_mlp": 0.01040177, "balance_loss_clip": 1.06001985, "balance_loss_mlp": 1.02825046, "epoch": 0.14020321048518006, "flos": 22017514832640.0, "grad_norm": 1.8864748844589516, "language_loss": 0.75883663, "learning_rate": 3.874060259229373e-06, "loss": 0.78143823, "num_input_tokens_seen": 24815835, "step": 1166, "time_per_iteration": 4.324167966842651 }, { "auxiliary_loss_clip": 0.0123604, "auxiliary_loss_mlp": 0.01040388, "balance_loss_clip": 1.06528509, "balance_loss_mlp": 1.02945018, "epoch": 0.14032345337581917, "flos": 23404313076480.0, "grad_norm": 2.088637725069223, "language_loss": 0.94227529, "learning_rate": 3.873788062386249e-06, "loss": 0.96503955, "num_input_tokens_seen": 24834095, "step": 1167, "time_per_iteration": 2.7384774684906006 }, { "auxiliary_loss_clip": 0.0123212, "auxiliary_loss_mlp": 0.01044845, "balance_loss_clip": 1.06815124, "balance_loss_mlp": 1.03384793, "epoch": 0.14044369626645825, "flos": 29645767100160.0, "grad_norm": 1.8837797914788519, "language_loss": 0.82196629, "learning_rate": 3.873515581290965e-06, "loss": 0.84473598, "num_input_tokens_seen": 24858900, "step": 1168, "time_per_iteration": 2.9164774417877197 }, { "auxiliary_loss_clip": 0.01225036, "auxiliary_loss_mlp": 0.01043916, "balance_loss_clip": 1.06658912, "balance_loss_mlp": 1.03204274, "epoch": 0.14056393915709733, "flos": 18332972501760.0, "grad_norm": 2.1582710832778913, "language_loss": 0.76028824, "learning_rate": 3.8732428159848575e-06, "loss": 0.78297776, "num_input_tokens_seen": 24877875, "step": 1169, "time_per_iteration": 2.7574081420898438 }, { "auxiliary_loss_clip": 0.01235707, "auxiliary_loss_mlp": 0.01046049, "balance_loss_clip": 1.06513822, "balance_loss_mlp": 1.03494501, "epoch": 0.14068418204773642, "flos": 26687517770880.0, "grad_norm": 1.9829149101270924, "language_loss": 0.78172106, "learning_rate": 3.872969766509304e-06, "loss": 0.80453861, "num_input_tokens_seen": 24898430, "step": 1170, "time_per_iteration": 2.8548216819763184 }, { "auxiliary_loss_clip": 0.01108222, "auxiliary_loss_mlp": 0.01003853, "balance_loss_clip": 1.01974154, "balance_loss_mlp": 1.00112331, "epoch": 0.14080442493837553, "flos": 65259314501760.0, "grad_norm": 0.76360018798034, "language_loss": 0.55655789, "learning_rate": 3.872696432905726e-06, "loss": 0.57767868, "num_input_tokens_seen": 24959250, "step": 1171, "time_per_iteration": 3.338840961456299 }, { "auxiliary_loss_clip": 0.01237366, "auxiliary_loss_mlp": 0.01037675, "balance_loss_clip": 1.06058574, "balance_loss_mlp": 1.02625453, "epoch": 0.1409246678290146, "flos": 25776715582080.0, "grad_norm": 2.509658657435637, "language_loss": 0.71445078, "learning_rate": 3.872422815215589e-06, "loss": 0.73720115, "num_input_tokens_seen": 24978330, "step": 1172, "time_per_iteration": 2.8217272758483887 }, { "auxiliary_loss_clip": 0.01231446, "auxiliary_loss_mlp": 0.01039569, "balance_loss_clip": 1.062801, "balance_loss_mlp": 1.02796948, "epoch": 0.1410449107196537, "flos": 21868521217920.0, "grad_norm": 2.1545181680321153, "language_loss": 0.74168199, "learning_rate": 3.8721489134803994e-06, "loss": 0.76439214, "num_input_tokens_seen": 24997120, "step": 1173, "time_per_iteration": 2.8207085132598877 }, { "auxiliary_loss_clip": 0.01230573, "auxiliary_loss_mlp": 0.0104535, "balance_loss_clip": 1.06258559, "balance_loss_mlp": 1.03369772, "epoch": 0.1411651536102928, "flos": 16684133564160.0, "grad_norm": 2.2154757427792955, "language_loss": 0.72506332, "learning_rate": 3.871874727741707e-06, "loss": 0.74782252, "num_input_tokens_seen": 25014350, "step": 1174, "time_per_iteration": 2.760079860687256 }, { "auxiliary_loss_clip": 0.01226397, "auxiliary_loss_mlp": 0.01042622, "balance_loss_clip": 1.06151283, "balance_loss_mlp": 1.03207755, "epoch": 0.1412853965009319, "flos": 20992264934400.0, "grad_norm": 1.8033132445252908, "language_loss": 0.96685612, "learning_rate": 3.871600258041108e-06, "loss": 0.9895463, "num_input_tokens_seen": 25033875, "step": 1175, "time_per_iteration": 2.7806379795074463 }, { "auxiliary_loss_clip": 0.01225699, "auxiliary_loss_mlp": 0.01037411, "balance_loss_clip": 1.06368554, "balance_loss_mlp": 1.02616978, "epoch": 0.14140563939157097, "flos": 20335279224960.0, "grad_norm": 2.569698463008965, "language_loss": 0.85785693, "learning_rate": 3.871325504420238e-06, "loss": 0.88048804, "num_input_tokens_seen": 25052865, "step": 1176, "time_per_iteration": 2.8042714595794678 }, { "auxiliary_loss_clip": 0.01237941, "auxiliary_loss_mlp": 0.01036736, "balance_loss_clip": 1.06263006, "balance_loss_mlp": 1.02607274, "epoch": 0.14152588228221005, "flos": 21068826773760.0, "grad_norm": 2.015933739973137, "language_loss": 0.8200804, "learning_rate": 3.871050466920776e-06, "loss": 0.8428272, "num_input_tokens_seen": 25072770, "step": 1177, "time_per_iteration": 2.7048542499542236 }, { "auxiliary_loss_clip": 0.01219046, "auxiliary_loss_mlp": 0.01042447, "balance_loss_clip": 1.06404114, "balance_loss_mlp": 1.03169465, "epoch": 0.14164612517284916, "flos": 18223157646720.0, "grad_norm": 1.8406100640420504, "language_loss": 0.79853737, "learning_rate": 3.870775145584447e-06, "loss": 0.82115233, "num_input_tokens_seen": 25090550, "step": 1178, "time_per_iteration": 2.6948320865631104 }, { "auxiliary_loss_clip": 0.0124002, "auxiliary_loss_mlp": 0.01045464, "balance_loss_clip": 1.06708288, "balance_loss_mlp": 1.03359115, "epoch": 0.14176636806348825, "flos": 22744454279040.0, "grad_norm": 3.059629772879455, "language_loss": 0.65474856, "learning_rate": 3.8704995404530145e-06, "loss": 0.67760348, "num_input_tokens_seen": 25106175, "step": 1179, "time_per_iteration": 2.7817537784576416 }, { "auxiliary_loss_clip": 0.01234019, "auxiliary_loss_mlp": 0.01040283, "balance_loss_clip": 1.06217539, "balance_loss_mlp": 1.02960205, "epoch": 0.14188661095412733, "flos": 22091095843200.0, "grad_norm": 1.9279481518934496, "language_loss": 0.85229921, "learning_rate": 3.87022365156829e-06, "loss": 0.87504232, "num_input_tokens_seen": 25126890, "step": 1180, "time_per_iteration": 2.782261610031128 }, { "auxiliary_loss_clip": 0.01210154, "auxiliary_loss_mlp": 0.01041394, "balance_loss_clip": 1.06264663, "balance_loss_mlp": 1.03002143, "epoch": 0.14200685384476644, "flos": 24352390604160.0, "grad_norm": 2.2620309093102997, "language_loss": 0.80944538, "learning_rate": 3.869947478972123e-06, "loss": 0.83196086, "num_input_tokens_seen": 25147915, "step": 1181, "time_per_iteration": 2.9120163917541504 }, { "auxiliary_loss_clip": 0.01226701, "auxiliary_loss_mlp": 0.01041188, "balance_loss_clip": 1.06333804, "balance_loss_mlp": 1.02920794, "epoch": 0.14212709673540552, "flos": 24022048199040.0, "grad_norm": 2.4658176039588424, "language_loss": 0.82478583, "learning_rate": 3.869671022706412e-06, "loss": 0.84746474, "num_input_tokens_seen": 25166645, "step": 1182, "time_per_iteration": 2.8207433223724365 }, { "auxiliary_loss_clip": 0.0120787, "auxiliary_loss_mlp": 0.01041204, "balance_loss_clip": 1.06444883, "balance_loss_mlp": 1.03007567, "epoch": 0.1422473396260446, "flos": 26431797870720.0, "grad_norm": 1.7753730100435317, "language_loss": 0.65175831, "learning_rate": 3.869394282813092e-06, "loss": 0.67424911, "num_input_tokens_seen": 25185845, "step": 1183, "time_per_iteration": 2.8623480796813965 }, { "auxiliary_loss_clip": 0.01234026, "auxiliary_loss_mlp": 0.01038063, "balance_loss_clip": 1.0632565, "balance_loss_mlp": 1.02674973, "epoch": 0.1423675825166837, "flos": 17055306754560.0, "grad_norm": 5.546268519776335, "language_loss": 0.89438075, "learning_rate": 3.869117259334147e-06, "loss": 0.91710168, "num_input_tokens_seen": 25203770, "step": 1184, "time_per_iteration": 2.8022570610046387 }, { "auxiliary_loss_clip": 0.01232377, "auxiliary_loss_mlp": 0.01045035, "balance_loss_clip": 1.06480479, "balance_loss_mlp": 1.0343895, "epoch": 0.1424878254073228, "flos": 17929480049280.0, "grad_norm": 1.746908745456139, "language_loss": 0.81889164, "learning_rate": 3.868839952311599e-06, "loss": 0.84166574, "num_input_tokens_seen": 25221725, "step": 1185, "time_per_iteration": 2.7508156299591064 }, { "auxiliary_loss_clip": 0.012264, "auxiliary_loss_mlp": 0.01040382, "balance_loss_clip": 1.06377506, "balance_loss_mlp": 1.02891374, "epoch": 0.14260806829796188, "flos": 20303606407680.0, "grad_norm": 3.1303419414891116, "language_loss": 0.80546343, "learning_rate": 3.868562361787516e-06, "loss": 0.8281312, "num_input_tokens_seen": 25240855, "step": 1186, "time_per_iteration": 2.791846513748169 }, { "auxiliary_loss_clip": 0.01207594, "auxiliary_loss_mlp": 0.01036022, "balance_loss_clip": 1.06260264, "balance_loss_mlp": 1.0254004, "epoch": 0.14272831118860096, "flos": 23185724860800.0, "grad_norm": 2.0808076506173614, "language_loss": 0.69087207, "learning_rate": 3.868284487804009e-06, "loss": 0.71330822, "num_input_tokens_seen": 25260085, "step": 1187, "time_per_iteration": 2.826266050338745 }, { "auxiliary_loss_clip": 0.0123728, "auxiliary_loss_mlp": 0.01033227, "balance_loss_clip": 1.06553912, "balance_loss_mlp": 1.0217948, "epoch": 0.14284855407924008, "flos": 27232210586880.0, "grad_norm": 1.7843191346141445, "language_loss": 0.7808125, "learning_rate": 3.86800633040323e-06, "loss": 0.80351758, "num_input_tokens_seen": 25280675, "step": 1188, "time_per_iteration": 3.8560101985931396 }, { "auxiliary_loss_clip": 0.01228474, "auxiliary_loss_mlp": 0.01060416, "balance_loss_clip": 1.06434906, "balance_loss_mlp": 1.02844715, "epoch": 0.14296879696987916, "flos": 28184202696960.0, "grad_norm": 2.2848981121712346, "language_loss": 0.78180057, "learning_rate": 3.867727889627376e-06, "loss": 0.80468941, "num_input_tokens_seen": 25300290, "step": 1189, "time_per_iteration": 3.806222915649414 }, { "auxiliary_loss_clip": 0.01216884, "auxiliary_loss_mlp": 0.01038472, "balance_loss_clip": 1.06518364, "balance_loss_mlp": 1.02765369, "epoch": 0.14308903986051824, "flos": 19390290266880.0, "grad_norm": 2.546435083848999, "language_loss": 0.77989346, "learning_rate": 3.867449165518687e-06, "loss": 0.80244702, "num_input_tokens_seen": 25316760, "step": 1190, "time_per_iteration": 3.821986436843872 }, { "auxiliary_loss_clip": 0.01239925, "auxiliary_loss_mlp": 0.01063861, "balance_loss_clip": 1.06287134, "balance_loss_mlp": 1.03167284, "epoch": 0.14320928275115732, "flos": 17457506317440.0, "grad_norm": 1.6689675574180063, "language_loss": 0.70970523, "learning_rate": 3.867170158119444e-06, "loss": 0.73274302, "num_input_tokens_seen": 25335760, "step": 1191, "time_per_iteration": 2.785083293914795 }, { "auxiliary_loss_clip": 0.01239595, "auxiliary_loss_mlp": 0.01044935, "balance_loss_clip": 1.06328309, "balance_loss_mlp": 1.03407538, "epoch": 0.14332952564179643, "flos": 21466070259840.0, "grad_norm": 1.883969867855859, "language_loss": 0.75388873, "learning_rate": 3.866890867471972e-06, "loss": 0.77673399, "num_input_tokens_seen": 25354230, "step": 1192, "time_per_iteration": 3.916444778442383 }, { "auxiliary_loss_clip": 0.01230482, "auxiliary_loss_mlp": 0.01039483, "balance_loss_clip": 1.06608212, "balance_loss_mlp": 1.02718091, "epoch": 0.14344976853243552, "flos": 16396992241920.0, "grad_norm": 7.080085164369446, "language_loss": 0.89669758, "learning_rate": 3.86661129361864e-06, "loss": 0.91939723, "num_input_tokens_seen": 25368720, "step": 1193, "time_per_iteration": 2.8625569343566895 }, { "auxiliary_loss_clip": 0.0122958, "auxiliary_loss_mlp": 0.01045876, "balance_loss_clip": 1.06347823, "balance_loss_mlp": 1.03391981, "epoch": 0.1435700114230746, "flos": 18916736336640.0, "grad_norm": 2.569651236467224, "language_loss": 0.8621583, "learning_rate": 3.866331436601859e-06, "loss": 0.88491285, "num_input_tokens_seen": 25386715, "step": 1194, "time_per_iteration": 2.805701971054077 }, { "auxiliary_loss_clip": 0.01242085, "auxiliary_loss_mlp": 0.01046689, "balance_loss_clip": 1.06683064, "balance_loss_mlp": 1.03450596, "epoch": 0.1436902543137137, "flos": 19755394058880.0, "grad_norm": 2.05896714984891, "language_loss": 0.74132812, "learning_rate": 3.866051296464083e-06, "loss": 0.76421589, "num_input_tokens_seen": 25405550, "step": 1195, "time_per_iteration": 2.7217605113983154 }, { "auxiliary_loss_clip": 0.01242486, "auxiliary_loss_mlp": 0.01066974, "balance_loss_clip": 1.06312478, "balance_loss_mlp": 1.03475916, "epoch": 0.1438104972043528, "flos": 14684807669760.0, "grad_norm": 2.271990497973448, "language_loss": 0.85301995, "learning_rate": 3.86577087324781e-06, "loss": 0.87611455, "num_input_tokens_seen": 25422040, "step": 1196, "time_per_iteration": 2.7251133918762207 }, { "auxiliary_loss_clip": 0.01229052, "auxiliary_loss_mlp": 0.01045251, "balance_loss_clip": 1.06473136, "balance_loss_mlp": 1.03436112, "epoch": 0.14393074009499188, "flos": 17092330698240.0, "grad_norm": 3.1644699852675355, "language_loss": 0.77182138, "learning_rate": 3.865490166995578e-06, "loss": 0.79456437, "num_input_tokens_seen": 25440270, "step": 1197, "time_per_iteration": 2.7536940574645996 }, { "auxiliary_loss_clip": 0.01238155, "auxiliary_loss_mlp": 0.01047756, "balance_loss_clip": 1.06649745, "balance_loss_mlp": 1.03656828, "epoch": 0.144050982985631, "flos": 30476200608000.0, "grad_norm": 3.09109521552726, "language_loss": 0.8422026, "learning_rate": 3.86520917774997e-06, "loss": 0.8650617, "num_input_tokens_seen": 25459705, "step": 1198, "time_per_iteration": 2.752945899963379 }, { "auxiliary_loss_clip": 0.01230354, "auxiliary_loss_mlp": 0.01039448, "balance_loss_clip": 1.0629257, "balance_loss_mlp": 1.02845657, "epoch": 0.14417122587627007, "flos": 17858484817920.0, "grad_norm": 2.6646302311470915, "language_loss": 0.75380743, "learning_rate": 3.864927905553614e-06, "loss": 0.77650547, "num_input_tokens_seen": 25477615, "step": 1199, "time_per_iteration": 2.7891359329223633 }, { "auxiliary_loss_clip": 0.01227392, "auxiliary_loss_mlp": 0.01045261, "balance_loss_clip": 1.0657289, "balance_loss_mlp": 1.03446066, "epoch": 0.14429146876690915, "flos": 21613914639360.0, "grad_norm": 1.794653560809112, "language_loss": 0.88763571, "learning_rate": 3.8646463504491765e-06, "loss": 0.91036224, "num_input_tokens_seen": 25497750, "step": 1200, "time_per_iteration": 2.718611240386963 }, { "auxiliary_loss_clip": 0.01236947, "auxiliary_loss_mlp": 0.01047408, "balance_loss_clip": 1.06758213, "balance_loss_mlp": 1.03556514, "epoch": 0.14441171165754824, "flos": 23258120722560.0, "grad_norm": 1.8131990277513037, "language_loss": 0.83414459, "learning_rate": 3.8643645124793705e-06, "loss": 0.85698813, "num_input_tokens_seen": 25516650, "step": 1201, "time_per_iteration": 2.756627321243286 }, { "auxiliary_loss_clip": 0.01231605, "auxiliary_loss_mlp": 0.01042644, "balance_loss_clip": 1.06425142, "balance_loss_mlp": 1.03034115, "epoch": 0.14453195454818735, "flos": 42854213963520.0, "grad_norm": 1.6475715833373996, "language_loss": 0.74971175, "learning_rate": 3.8640823916869515e-06, "loss": 0.7724542, "num_input_tokens_seen": 25540960, "step": 1202, "time_per_iteration": 2.9341397285461426 }, { "auxiliary_loss_clip": 0.0124033, "auxiliary_loss_mlp": 0.01038024, "balance_loss_clip": 1.06548464, "balance_loss_mlp": 1.02652025, "epoch": 0.14465219743882643, "flos": 27235873774080.0, "grad_norm": 1.8287254947201717, "language_loss": 0.78580874, "learning_rate": 3.863799988114714e-06, "loss": 0.80859232, "num_input_tokens_seen": 25562990, "step": 1203, "time_per_iteration": 2.766430616378784 }, { "auxiliary_loss_clip": 0.01243079, "auxiliary_loss_mlp": 0.01042527, "balance_loss_clip": 1.066046, "balance_loss_mlp": 1.03121436, "epoch": 0.1447724403294655, "flos": 16690705752960.0, "grad_norm": 3.8967851536240845, "language_loss": 0.70628488, "learning_rate": 3.863517301805502e-06, "loss": 0.72914088, "num_input_tokens_seen": 25581380, "step": 1204, "time_per_iteration": 2.8355603218078613 }, { "auxiliary_loss_clip": 0.01232599, "auxiliary_loss_mlp": 0.01042303, "balance_loss_clip": 1.06868911, "balance_loss_mlp": 1.03083491, "epoch": 0.14489268322010462, "flos": 20073741321600.0, "grad_norm": 2.6314276657247717, "language_loss": 0.97133148, "learning_rate": 3.863234332802196e-06, "loss": 0.99408048, "num_input_tokens_seen": 25593585, "step": 1205, "time_per_iteration": 2.7161054611206055 }, { "auxiliary_loss_clip": 0.0122965, "auxiliary_loss_mlp": 0.01053549, "balance_loss_clip": 1.06629181, "balance_loss_mlp": 1.04147315, "epoch": 0.1450129261107437, "flos": 27125627955840.0, "grad_norm": 2.19323264745064, "language_loss": 0.74551862, "learning_rate": 3.862951081147723e-06, "loss": 0.7683506, "num_input_tokens_seen": 25613750, "step": 1206, "time_per_iteration": 2.843705415725708 }, { "auxiliary_loss_clip": 0.01236412, "auxiliary_loss_mlp": 0.01035647, "balance_loss_clip": 1.06667066, "balance_loss_mlp": 1.02456045, "epoch": 0.1451331690013828, "flos": 25702344472320.0, "grad_norm": 2.9619082888758284, "language_loss": 0.78168714, "learning_rate": 3.862667546885053e-06, "loss": 0.80440772, "num_input_tokens_seen": 25632300, "step": 1207, "time_per_iteration": 2.9034018516540527 }, { "auxiliary_loss_clip": 0.01231418, "auxiliary_loss_mlp": 0.01039797, "balance_loss_clip": 1.0624553, "balance_loss_mlp": 1.02769756, "epoch": 0.14525341189202187, "flos": 25737393168000.0, "grad_norm": 2.2404068696556583, "language_loss": 0.73409605, "learning_rate": 3.8623837300571965e-06, "loss": 0.75680822, "num_input_tokens_seen": 25651285, "step": 1208, "time_per_iteration": 2.9257848262786865 }, { "auxiliary_loss_clip": 0.01240975, "auxiliary_loss_mlp": 0.01043445, "balance_loss_clip": 1.06463182, "balance_loss_mlp": 1.03151822, "epoch": 0.14537365478266098, "flos": 23073898844160.0, "grad_norm": 2.0337219010582674, "language_loss": 0.83984083, "learning_rate": 3.8620996307072085e-06, "loss": 0.86268508, "num_input_tokens_seen": 25671990, "step": 1209, "time_per_iteration": 2.8231754302978516 }, { "auxiliary_loss_clip": 0.01232443, "auxiliary_loss_mlp": 0.01038774, "balance_loss_clip": 1.06488872, "balance_loss_mlp": 1.02729416, "epoch": 0.14549389767330007, "flos": 20595021448320.0, "grad_norm": 3.098643081171537, "language_loss": 0.64461046, "learning_rate": 3.861815248878188e-06, "loss": 0.66732264, "num_input_tokens_seen": 25689475, "step": 1210, "time_per_iteration": 2.778979539871216 }, { "auxiliary_loss_clip": 0.01221814, "auxiliary_loss_mlp": 0.01035718, "balance_loss_clip": 1.06351805, "balance_loss_mlp": 1.0244168, "epoch": 0.14561414056393915, "flos": 15121804533120.0, "grad_norm": 2.5559561858839235, "language_loss": 0.8002584, "learning_rate": 3.861530584613274e-06, "loss": 0.82283378, "num_input_tokens_seen": 25707475, "step": 1211, "time_per_iteration": 2.843096971511841 }, { "auxiliary_loss_clip": 0.01240249, "auxiliary_loss_mlp": 0.01063401, "balance_loss_clip": 1.06925881, "balance_loss_mlp": 1.03130412, "epoch": 0.14573438345457826, "flos": 19427493778560.0, "grad_norm": 4.98530460609806, "language_loss": 0.82051015, "learning_rate": 3.86124563795565e-06, "loss": 0.84354663, "num_input_tokens_seen": 25726290, "step": 1212, "time_per_iteration": 2.76157808303833 }, { "auxiliary_loss_clip": 0.01242309, "auxiliary_loss_mlp": 0.01045495, "balance_loss_clip": 1.06752396, "balance_loss_mlp": 1.03390169, "epoch": 0.14585462634521734, "flos": 24828422572800.0, "grad_norm": 1.770817197280975, "language_loss": 0.70210546, "learning_rate": 3.860960408948543e-06, "loss": 0.72498345, "num_input_tokens_seen": 25748040, "step": 1213, "time_per_iteration": 2.8380889892578125 }, { "auxiliary_loss_clip": 0.01225159, "auxiliary_loss_mlp": 0.01033719, "balance_loss_clip": 1.06430936, "balance_loss_mlp": 1.02331817, "epoch": 0.14597486923585642, "flos": 15448627405440.0, "grad_norm": 2.3698947356847224, "language_loss": 0.89655536, "learning_rate": 3.860674897635222e-06, "loss": 0.91914415, "num_input_tokens_seen": 25764525, "step": 1214, "time_per_iteration": 2.7435290813446045 }, { "auxiliary_loss_clip": 0.01241226, "auxiliary_loss_mlp": 0.0104454, "balance_loss_clip": 1.0717144, "balance_loss_mlp": 1.03307784, "epoch": 0.1460951121264955, "flos": 16655154266880.0, "grad_norm": 2.0735698472105812, "language_loss": 0.83879668, "learning_rate": 3.860389104058998e-06, "loss": 0.86165434, "num_input_tokens_seen": 25782755, "step": 1215, "time_per_iteration": 3.7949161529541016 }, { "auxiliary_loss_clip": 0.01230182, "auxiliary_loss_mlp": 0.01037481, "balance_loss_clip": 1.06544137, "balance_loss_mlp": 1.02546477, "epoch": 0.14621535501713462, "flos": 24863291700480.0, "grad_norm": 3.0033664024818005, "language_loss": 0.72801536, "learning_rate": 3.860103028263227e-06, "loss": 0.75069195, "num_input_tokens_seen": 25805860, "step": 1216, "time_per_iteration": 4.0492236614227295 }, { "auxiliary_loss_clip": 0.01219673, "auxiliary_loss_mlp": 0.01039502, "balance_loss_clip": 1.06565583, "balance_loss_mlp": 1.02807045, "epoch": 0.1463355979077737, "flos": 25228000442880.0, "grad_norm": 3.075390525045765, "language_loss": 0.69734132, "learning_rate": 3.859816670291304e-06, "loss": 0.71993309, "num_input_tokens_seen": 25824955, "step": 1217, "time_per_iteration": 2.94364595413208 }, { "auxiliary_loss_clip": 0.01213039, "auxiliary_loss_mlp": 0.01041683, "balance_loss_clip": 1.0668962, "balance_loss_mlp": 1.0302633, "epoch": 0.14645584079841278, "flos": 22054143726720.0, "grad_norm": 3.363326764052608, "language_loss": 0.90086973, "learning_rate": 3.859530030186672e-06, "loss": 0.92341697, "num_input_tokens_seen": 25841965, "step": 1218, "time_per_iteration": 4.234528064727783 }, { "auxiliary_loss_clip": 0.01240872, "auxiliary_loss_mlp": 0.01042319, "balance_loss_clip": 1.07095337, "balance_loss_mlp": 1.03023684, "epoch": 0.1465760836890519, "flos": 23623870959360.0, "grad_norm": 6.7150855536360865, "language_loss": 0.8244074, "learning_rate": 3.859243107992813e-06, "loss": 0.84723926, "num_input_tokens_seen": 25860770, "step": 1219, "time_per_iteration": 2.7754387855529785 }, { "auxiliary_loss_clip": 0.01236096, "auxiliary_loss_mlp": 0.0103292, "balance_loss_clip": 1.06564713, "balance_loss_mlp": 1.02069521, "epoch": 0.14669632657969098, "flos": 37407893356800.0, "grad_norm": 2.700609439224727, "language_loss": 0.77937424, "learning_rate": 3.858955903753252e-06, "loss": 0.80206454, "num_input_tokens_seen": 25879410, "step": 1220, "time_per_iteration": 3.0560250282287598 }, { "auxiliary_loss_clip": 0.01236554, "auxiliary_loss_mlp": 0.01040755, "balance_loss_clip": 1.06559682, "balance_loss_mlp": 1.02974045, "epoch": 0.14681656947033006, "flos": 28365910623360.0, "grad_norm": 1.6033126679496852, "language_loss": 0.83677274, "learning_rate": 3.858668417511559e-06, "loss": 0.85954583, "num_input_tokens_seen": 25902160, "step": 1221, "time_per_iteration": 2.881399154663086 }, { "auxiliary_loss_clip": 0.01235982, "auxiliary_loss_mlp": 0.01044565, "balance_loss_clip": 1.06771946, "balance_loss_mlp": 1.03293562, "epoch": 0.14693681236096917, "flos": 18479488078080.0, "grad_norm": 2.5538223217265674, "language_loss": 0.76552105, "learning_rate": 3.8583806493113445e-06, "loss": 0.7883265, "num_input_tokens_seen": 25920505, "step": 1222, "time_per_iteration": 2.87839674949646 }, { "auxiliary_loss_clip": 0.01232888, "auxiliary_loss_mlp": 0.01041422, "balance_loss_clip": 1.06481433, "balance_loss_mlp": 1.03041911, "epoch": 0.14705705525160825, "flos": 20777806782720.0, "grad_norm": 3.1691252983387384, "language_loss": 0.82323945, "learning_rate": 3.858092599196263e-06, "loss": 0.84598255, "num_input_tokens_seen": 25938460, "step": 1223, "time_per_iteration": 2.745532751083374 }, { "auxiliary_loss_clip": 0.01233283, "auxiliary_loss_mlp": 0.01040155, "balance_loss_clip": 1.06556082, "balance_loss_mlp": 1.0280726, "epoch": 0.14717729814224734, "flos": 29932944336000.0, "grad_norm": 2.624261694800994, "language_loss": 0.8256461, "learning_rate": 3.857804267210012e-06, "loss": 0.84838045, "num_input_tokens_seen": 25957760, "step": 1224, "time_per_iteration": 2.8986313343048096 }, { "auxiliary_loss_clip": 0.01219612, "auxiliary_loss_mlp": 0.01032114, "balance_loss_clip": 1.06586361, "balance_loss_mlp": 1.02142739, "epoch": 0.14729754103288642, "flos": 20047491457920.0, "grad_norm": 5.0589552276555, "language_loss": 0.88433963, "learning_rate": 3.857515653396331e-06, "loss": 0.90685689, "num_input_tokens_seen": 25974970, "step": 1225, "time_per_iteration": 2.839731216430664 }, { "auxiliary_loss_clip": 0.01226088, "auxiliary_loss_mlp": 0.01037715, "balance_loss_clip": 1.0647192, "balance_loss_mlp": 1.02674735, "epoch": 0.14741778392352553, "flos": 19281516906240.0, "grad_norm": 2.3306755895544384, "language_loss": 0.86854684, "learning_rate": 3.857226757799002e-06, "loss": 0.89118487, "num_input_tokens_seen": 25992525, "step": 1226, "time_per_iteration": 2.874885320663452 }, { "auxiliary_loss_clip": 0.01234572, "auxiliary_loss_mlp": 0.01052767, "balance_loss_clip": 1.06664753, "balance_loss_mlp": 1.04033351, "epoch": 0.1475380268141646, "flos": 25411108999680.0, "grad_norm": 2.4659621617245264, "language_loss": 0.741889, "learning_rate": 3.85693758046185e-06, "loss": 0.7647624, "num_input_tokens_seen": 26010815, "step": 1227, "time_per_iteration": 2.8183133602142334 }, { "auxiliary_loss_clip": 0.01242173, "auxiliary_loss_mlp": 0.01048657, "balance_loss_clip": 1.07003796, "balance_loss_mlp": 1.03754663, "epoch": 0.1476582697048037, "flos": 20847652778880.0, "grad_norm": 4.585474237262582, "language_loss": 0.82937187, "learning_rate": 3.8566481214287435e-06, "loss": 0.85228014, "num_input_tokens_seen": 26028935, "step": 1228, "time_per_iteration": 2.666693925857544 }, { "auxiliary_loss_clip": 0.01228209, "auxiliary_loss_mlp": 0.01057148, "balance_loss_clip": 1.07102621, "balance_loss_mlp": 1.04485154, "epoch": 0.1477785125954428, "flos": 14028109269120.0, "grad_norm": 1.9428331654475521, "language_loss": 0.90688205, "learning_rate": 3.8563583807435935e-06, "loss": 0.92973566, "num_input_tokens_seen": 26045080, "step": 1229, "time_per_iteration": 2.8782596588134766 }, { "auxiliary_loss_clip": 0.01239004, "auxiliary_loss_mlp": 0.01058426, "balance_loss_clip": 1.06769502, "balance_loss_mlp": 1.02708769, "epoch": 0.1478987554860819, "flos": 20516699842560.0, "grad_norm": 1.910156838306197, "language_loss": 0.77217007, "learning_rate": 3.856068358450353e-06, "loss": 0.79514432, "num_input_tokens_seen": 26065030, "step": 1230, "time_per_iteration": 2.815002679824829 }, { "auxiliary_loss_clip": 0.0122503, "auxiliary_loss_mlp": 0.01038401, "balance_loss_clip": 1.06729102, "balance_loss_mlp": 1.0267539, "epoch": 0.14801899837672097, "flos": 17857012360320.0, "grad_norm": 1.769395047559881, "language_loss": 0.86048871, "learning_rate": 3.8557780545930186e-06, "loss": 0.88312304, "num_input_tokens_seen": 26083445, "step": 1231, "time_per_iteration": 2.8468499183654785 }, { "auxiliary_loss_clip": 0.01228052, "auxiliary_loss_mlp": 0.01038045, "balance_loss_clip": 1.06707299, "balance_loss_mlp": 1.02670169, "epoch": 0.14813924126736006, "flos": 20881408584960.0, "grad_norm": 1.8049906542780305, "language_loss": 0.79275811, "learning_rate": 3.855487469215628e-06, "loss": 0.81541908, "num_input_tokens_seen": 26102375, "step": 1232, "time_per_iteration": 2.861294746398926 }, { "auxiliary_loss_clip": 0.0122627, "auxiliary_loss_mlp": 0.01034834, "balance_loss_clip": 1.0671308, "balance_loss_mlp": 1.02393281, "epoch": 0.14825948415799917, "flos": 37414070496000.0, "grad_norm": 2.662777400490631, "language_loss": 0.72257197, "learning_rate": 3.855196602362264e-06, "loss": 0.74518299, "num_input_tokens_seen": 26125295, "step": 1233, "time_per_iteration": 3.029935121536255 }, { "auxiliary_loss_clip": 0.01235614, "auxiliary_loss_mlp": 0.01046772, "balance_loss_clip": 1.06394935, "balance_loss_mlp": 1.03506565, "epoch": 0.14837972704863825, "flos": 22014641744640.0, "grad_norm": 2.1685486119773256, "language_loss": 0.94327068, "learning_rate": 3.854905454077051e-06, "loss": 0.96609449, "num_input_tokens_seen": 26142905, "step": 1234, "time_per_iteration": 2.8618297576904297 }, { "auxiliary_loss_clip": 0.01210118, "auxiliary_loss_mlp": 0.01037576, "balance_loss_clip": 1.06342912, "balance_loss_mlp": 1.02641845, "epoch": 0.14849996993927733, "flos": 20996323171200.0, "grad_norm": 2.0212591879530515, "language_loss": 0.88505399, "learning_rate": 3.854614024404155e-06, "loss": 0.90753084, "num_input_tokens_seen": 26161215, "step": 1235, "time_per_iteration": 2.9385266304016113 }, { "auxiliary_loss_clip": 0.01218445, "auxiliary_loss_mlp": 0.01030511, "balance_loss_clip": 1.06395864, "balance_loss_mlp": 1.01970458, "epoch": 0.14862021282991644, "flos": 20047994248320.0, "grad_norm": 1.847067906236435, "language_loss": 0.89120591, "learning_rate": 3.8543223133877865e-06, "loss": 0.91369545, "num_input_tokens_seen": 26179810, "step": 1236, "time_per_iteration": 2.923892021179199 }, { "auxiliary_loss_clip": 0.01215466, "auxiliary_loss_mlp": 0.01046998, "balance_loss_clip": 1.06608677, "balance_loss_mlp": 1.03513098, "epoch": 0.14874045572055553, "flos": 22712027276160.0, "grad_norm": 1.7582052865679385, "language_loss": 0.88015151, "learning_rate": 3.854030321072198e-06, "loss": 0.90277618, "num_input_tokens_seen": 26199715, "step": 1237, "time_per_iteration": 2.8874216079711914 }, { "auxiliary_loss_clip": 0.01232677, "auxiliary_loss_mlp": 0.01042742, "balance_loss_clip": 1.06606627, "balance_loss_mlp": 1.03220391, "epoch": 0.1488606986111946, "flos": 25411288567680.0, "grad_norm": 1.9918350918258454, "language_loss": 0.73343432, "learning_rate": 3.853738047501682e-06, "loss": 0.75618851, "num_input_tokens_seen": 26220275, "step": 1238, "time_per_iteration": 2.9521071910858154 }, { "auxiliary_loss_clip": 0.01235891, "auxiliary_loss_mlp": 0.01039898, "balance_loss_clip": 1.06912732, "balance_loss_mlp": 1.02847815, "epoch": 0.1489809415018337, "flos": 17018749687680.0, "grad_norm": 2.180001540277059, "language_loss": 0.77628911, "learning_rate": 3.85344549272058e-06, "loss": 0.79904699, "num_input_tokens_seen": 26238255, "step": 1239, "time_per_iteration": 2.803828239440918 }, { "auxiliary_loss_clip": 0.01233264, "auxiliary_loss_mlp": 0.01045978, "balance_loss_clip": 1.06806695, "balance_loss_mlp": 1.03453982, "epoch": 0.1491011843924728, "flos": 33659394860160.0, "grad_norm": 1.849874860080262, "language_loss": 0.82611585, "learning_rate": 3.853152656773269e-06, "loss": 0.84890831, "num_input_tokens_seen": 26259690, "step": 1240, "time_per_iteration": 2.885289430618286 }, { "auxiliary_loss_clip": 0.01226951, "auxiliary_loss_mlp": 0.01041192, "balance_loss_clip": 1.06651044, "balance_loss_mlp": 1.02971268, "epoch": 0.14922142728311188, "flos": 21179000764800.0, "grad_norm": 1.6596713915245702, "language_loss": 0.84617412, "learning_rate": 3.852859539704174e-06, "loss": 0.8688556, "num_input_tokens_seen": 26278990, "step": 1241, "time_per_iteration": 4.793601036071777 }, { "auxiliary_loss_clip": 0.01223113, "auxiliary_loss_mlp": 0.01039579, "balance_loss_clip": 1.06563354, "balance_loss_mlp": 1.02889824, "epoch": 0.14934167017375097, "flos": 29860548474240.0, "grad_norm": 1.7987446960467892, "language_loss": 0.76382852, "learning_rate": 3.85256614155776e-06, "loss": 0.78645551, "num_input_tokens_seen": 26299120, "step": 1242, "time_per_iteration": 3.921685218811035 }, { "auxiliary_loss_clip": 0.01232526, "auxiliary_loss_mlp": 0.01036675, "balance_loss_clip": 1.06356239, "balance_loss_mlp": 1.02481961, "epoch": 0.14946191306439008, "flos": 17019216564480.0, "grad_norm": 2.0736933958334043, "language_loss": 0.74231267, "learning_rate": 3.852272462378535e-06, "loss": 0.76500463, "num_input_tokens_seen": 26316995, "step": 1243, "time_per_iteration": 2.844831705093384 }, { "auxiliary_loss_clip": 0.01230948, "auxiliary_loss_mlp": 0.01040783, "balance_loss_clip": 1.06621885, "balance_loss_mlp": 1.0302453, "epoch": 0.14958215595502916, "flos": 15669047214720.0, "grad_norm": 2.0090240232849497, "language_loss": 0.77740192, "learning_rate": 3.85197850221105e-06, "loss": 0.80011928, "num_input_tokens_seen": 26333295, "step": 1244, "time_per_iteration": 4.310895681381226 }, { "auxiliary_loss_clip": 0.01230323, "auxiliary_loss_mlp": 0.01035892, "balance_loss_clip": 1.06589484, "balance_loss_mlp": 1.02522826, "epoch": 0.14970239884566824, "flos": 33108560818560.0, "grad_norm": 1.8803644264057902, "language_loss": 0.7571066, "learning_rate": 3.851684261099899e-06, "loss": 0.77976871, "num_input_tokens_seen": 26355035, "step": 1245, "time_per_iteration": 2.881381034851074 }, { "auxiliary_loss_clip": 0.01229504, "auxiliary_loss_mlp": 0.01042137, "balance_loss_clip": 1.06777596, "balance_loss_mlp": 1.03063941, "epoch": 0.14982264173630733, "flos": 17821245392640.0, "grad_norm": 2.031676776197224, "language_loss": 0.86665124, "learning_rate": 3.851389739089718e-06, "loss": 0.88936764, "num_input_tokens_seen": 26371655, "step": 1246, "time_per_iteration": 2.8369839191436768 }, { "auxiliary_loss_clip": 0.0123449, "auxiliary_loss_mlp": 0.01049208, "balance_loss_clip": 1.06777358, "balance_loss_mlp": 1.0386641, "epoch": 0.14994288462694644, "flos": 32409559175040.0, "grad_norm": 2.7166643862181026, "language_loss": 0.80331904, "learning_rate": 3.851094936225186e-06, "loss": 0.82615602, "num_input_tokens_seen": 26392540, "step": 1247, "time_per_iteration": 2.9756805896759033 }, { "auxiliary_loss_clip": 0.01225122, "auxiliary_loss_mlp": 0.01038275, "balance_loss_clip": 1.06589222, "balance_loss_mlp": 1.02734971, "epoch": 0.15006312751758552, "flos": 31794661226880.0, "grad_norm": 1.4169797921168021, "language_loss": 0.76658392, "learning_rate": 3.850799852551024e-06, "loss": 0.78921789, "num_input_tokens_seen": 26414960, "step": 1248, "time_per_iteration": 2.844627857208252 }, { "auxiliary_loss_clip": 0.01229993, "auxiliary_loss_mlp": 0.01039425, "balance_loss_clip": 1.0665791, "balance_loss_mlp": 1.02914906, "epoch": 0.1501833704082246, "flos": 16618022582400.0, "grad_norm": 2.3002774625663047, "language_loss": 0.85859871, "learning_rate": 3.850504488111995e-06, "loss": 0.88129288, "num_input_tokens_seen": 26431635, "step": 1249, "time_per_iteration": 2.846977472305298 }, { "auxiliary_loss_clip": 0.01223268, "auxiliary_loss_mlp": 0.0103187, "balance_loss_clip": 1.06365895, "balance_loss_mlp": 1.02113557, "epoch": 0.15030361329886371, "flos": 23471178243840.0, "grad_norm": 2.1002857185280934, "language_loss": 0.82678413, "learning_rate": 3.850208842952907e-06, "loss": 0.84933555, "num_input_tokens_seen": 26450440, "step": 1250, "time_per_iteration": 2.9246253967285156 }, { "auxiliary_loss_clip": 0.01230702, "auxiliary_loss_mlp": 0.01037877, "balance_loss_clip": 1.06560838, "balance_loss_mlp": 1.02732134, "epoch": 0.1504238561895028, "flos": 25629409906560.0, "grad_norm": 1.817671047360191, "language_loss": 0.79314488, "learning_rate": 3.849912917118608e-06, "loss": 0.81583071, "num_input_tokens_seen": 26471480, "step": 1251, "time_per_iteration": 2.869830369949341 }, { "auxiliary_loss_clip": 0.0114414, "auxiliary_loss_mlp": 0.01002986, "balance_loss_clip": 1.03763747, "balance_loss_mlp": 1.00007713, "epoch": 0.15054409908014188, "flos": 52095146129280.0, "grad_norm": 0.8845480978329124, "language_loss": 0.59300715, "learning_rate": 3.849616710653992e-06, "loss": 0.61447841, "num_input_tokens_seen": 26532950, "step": 1252, "time_per_iteration": 3.3204333782196045 }, { "auxiliary_loss_clip": 0.01235023, "auxiliary_loss_mlp": 0.01039487, "balance_loss_clip": 1.06649375, "balance_loss_mlp": 1.02764952, "epoch": 0.150664341970781, "flos": 18880251096960.0, "grad_norm": 1.8510229146212787, "language_loss": 0.74923086, "learning_rate": 3.84932022360399e-06, "loss": 0.77197599, "num_input_tokens_seen": 26551615, "step": 1253, "time_per_iteration": 2.6907174587249756 }, { "auxiliary_loss_clip": 0.01224479, "auxiliary_loss_mlp": 0.01040977, "balance_loss_clip": 1.0669682, "balance_loss_mlp": 1.02982497, "epoch": 0.15078458486142007, "flos": 22163240309760.0, "grad_norm": 4.875118684915888, "language_loss": 0.84252179, "learning_rate": 3.849023456013581e-06, "loss": 0.86517632, "num_input_tokens_seen": 26569175, "step": 1254, "time_per_iteration": 2.8861682415008545 }, { "auxiliary_loss_clip": 0.01238527, "auxiliary_loss_mlp": 0.01037532, "balance_loss_clip": 1.06519818, "balance_loss_mlp": 1.02686858, "epoch": 0.15090482775205916, "flos": 26651894457600.0, "grad_norm": 2.6009922120483044, "language_loss": 0.6255725, "learning_rate": 3.848726407927784e-06, "loss": 0.64833307, "num_input_tokens_seen": 26589560, "step": 1255, "time_per_iteration": 2.8298115730285645 }, { "auxiliary_loss_clip": 0.01230508, "auxiliary_loss_mlp": 0.01039123, "balance_loss_clip": 1.066113, "balance_loss_mlp": 1.02859139, "epoch": 0.15102507064269824, "flos": 21798998444160.0, "grad_norm": 3.079478332473729, "language_loss": 0.86336887, "learning_rate": 3.84842907939166e-06, "loss": 0.88606519, "num_input_tokens_seen": 26608785, "step": 1256, "time_per_iteration": 2.8781583309173584 }, { "auxiliary_loss_clip": 0.01219171, "auxiliary_loss_mlp": 0.01037427, "balance_loss_clip": 1.06485939, "balance_loss_mlp": 1.02677584, "epoch": 0.15114531353333735, "flos": 22820908377600.0, "grad_norm": 2.871071776225546, "language_loss": 0.71028554, "learning_rate": 3.8481314704503146e-06, "loss": 0.73285151, "num_input_tokens_seen": 26628615, "step": 1257, "time_per_iteration": 2.820554733276367 }, { "auxiliary_loss_clip": 0.01231598, "auxiliary_loss_mlp": 0.01049912, "balance_loss_clip": 1.06789827, "balance_loss_mlp": 1.03856277, "epoch": 0.15126555642397643, "flos": 19682674974720.0, "grad_norm": 2.8447085934202283, "language_loss": 0.87966859, "learning_rate": 3.847833581148895e-06, "loss": 0.90248364, "num_input_tokens_seen": 26647525, "step": 1258, "time_per_iteration": 2.7016336917877197 }, { "auxiliary_loss_clip": 0.01235616, "auxiliary_loss_mlp": 0.01035421, "balance_loss_clip": 1.06364596, "balance_loss_mlp": 1.02420306, "epoch": 0.15138579931461552, "flos": 28726022424960.0, "grad_norm": 3.32448759687929, "language_loss": 0.80895686, "learning_rate": 3.84753541153259e-06, "loss": 0.83166718, "num_input_tokens_seen": 26667095, "step": 1259, "time_per_iteration": 2.8156723976135254 }, { "auxiliary_loss_clip": 0.01231232, "auxiliary_loss_mlp": 0.01042848, "balance_loss_clip": 1.06529498, "balance_loss_mlp": 1.0322144, "epoch": 0.15150604220525463, "flos": 22127006465280.0, "grad_norm": 1.9703136013834903, "language_loss": 0.83123124, "learning_rate": 3.847236961646633e-06, "loss": 0.85397208, "num_input_tokens_seen": 26686075, "step": 1260, "time_per_iteration": 2.7974050045013428 }, { "auxiliary_loss_clip": 0.01225969, "auxiliary_loss_mlp": 0.0103858, "balance_loss_clip": 1.06731057, "balance_loss_mlp": 1.02769649, "epoch": 0.1516262850958937, "flos": 12968708515200.0, "grad_norm": 3.3995597891907923, "language_loss": 0.78275967, "learning_rate": 3.846938231536296e-06, "loss": 0.80540514, "num_input_tokens_seen": 26701695, "step": 1261, "time_per_iteration": 2.8403420448303223 }, { "auxiliary_loss_clip": 0.01236411, "auxiliary_loss_mlp": 0.01040164, "balance_loss_clip": 1.06792617, "balance_loss_mlp": 1.02970362, "epoch": 0.1517465279865328, "flos": 21797130936960.0, "grad_norm": 2.197698856418652, "language_loss": 0.80660123, "learning_rate": 3.8466392212468995e-06, "loss": 0.82936698, "num_input_tokens_seen": 26721885, "step": 1262, "time_per_iteration": 2.836879014968872 }, { "auxiliary_loss_clip": 0.01140191, "auxiliary_loss_mlp": 0.01004855, "balance_loss_clip": 1.03638351, "balance_loss_mlp": 1.00165987, "epoch": 0.15186677087717187, "flos": 58174569901440.0, "grad_norm": 0.818115824597668, "language_loss": 0.61925024, "learning_rate": 3.8463399308238e-06, "loss": 0.6407007, "num_input_tokens_seen": 26780990, "step": 1263, "time_per_iteration": 3.258000373840332 }, { "auxiliary_loss_clip": 0.01233758, "auxiliary_loss_mlp": 0.01039707, "balance_loss_clip": 1.06743455, "balance_loss_mlp": 1.02916861, "epoch": 0.15198701376781099, "flos": 32669696448000.0, "grad_norm": 2.1942157709383032, "language_loss": 0.64071196, "learning_rate": 3.846040360312402e-06, "loss": 0.66344661, "num_input_tokens_seen": 26804250, "step": 1264, "time_per_iteration": 2.934662342071533 }, { "auxiliary_loss_clip": 0.0123508, "auxiliary_loss_mlp": 0.01033033, "balance_loss_clip": 1.06469226, "balance_loss_mlp": 1.02169609, "epoch": 0.15210725665845007, "flos": 28402575431040.0, "grad_norm": 2.1738002739223763, "language_loss": 0.81228721, "learning_rate": 3.8457405097581485e-06, "loss": 0.83496833, "num_input_tokens_seen": 26823240, "step": 1265, "time_per_iteration": 2.8101816177368164 }, { "auxiliary_loss_clip": 0.01224163, "auxiliary_loss_mlp": 0.01036808, "balance_loss_clip": 1.06494403, "balance_loss_mlp": 1.02662754, "epoch": 0.15222749954908915, "flos": 19938179393280.0, "grad_norm": 3.952495447683461, "language_loss": 0.78037214, "learning_rate": 3.8454403792065275e-06, "loss": 0.80298185, "num_input_tokens_seen": 26842060, "step": 1266, "time_per_iteration": 2.90946626663208 }, { "auxiliary_loss_clip": 0.01216245, "auxiliary_loss_mlp": 0.01033876, "balance_loss_clip": 1.06572735, "balance_loss_mlp": 1.02423763, "epoch": 0.15234774243972826, "flos": 21324223451520.0, "grad_norm": 2.3318739633712062, "language_loss": 0.85628504, "learning_rate": 3.845139968703068e-06, "loss": 0.87878621, "num_input_tokens_seen": 26859580, "step": 1267, "time_per_iteration": 3.743361473083496 }, { "auxiliary_loss_clip": 0.01219573, "auxiliary_loss_mlp": 0.01041213, "balance_loss_clip": 1.06595564, "balance_loss_mlp": 1.03097939, "epoch": 0.15246798533036734, "flos": 25957812977280.0, "grad_norm": 2.087062721518415, "language_loss": 0.83061767, "learning_rate": 3.844839278293342e-06, "loss": 0.85322553, "num_input_tokens_seen": 26880430, "step": 1268, "time_per_iteration": 3.8079745769500732 }, { "auxiliary_loss_clip": 0.01237103, "auxiliary_loss_mlp": 0.01034352, "balance_loss_clip": 1.06625128, "balance_loss_mlp": 1.02385604, "epoch": 0.15258822822100643, "flos": 25811907932160.0, "grad_norm": 2.4378262949825595, "language_loss": 0.77279508, "learning_rate": 3.8445383080229654e-06, "loss": 0.7955097, "num_input_tokens_seen": 26896445, "step": 1269, "time_per_iteration": 2.7223219871520996 }, { "auxiliary_loss_clip": 0.01223331, "auxiliary_loss_mlp": 0.01043111, "balance_loss_clip": 1.06616604, "balance_loss_mlp": 1.03229308, "epoch": 0.1527084711116455, "flos": 25265455349760.0, "grad_norm": 2.424390956464769, "language_loss": 0.73670483, "learning_rate": 3.844237057937593e-06, "loss": 0.75936919, "num_input_tokens_seen": 26915450, "step": 1270, "time_per_iteration": 4.398253440856934 }, { "auxiliary_loss_clip": 0.01234027, "auxiliary_loss_mlp": 0.01036301, "balance_loss_clip": 1.06204367, "balance_loss_mlp": 1.02469587, "epoch": 0.15282871400228462, "flos": 29240227572480.0, "grad_norm": 2.5788994743069904, "language_loss": 0.77698225, "learning_rate": 3.843935528082926e-06, "loss": 0.79968554, "num_input_tokens_seen": 26936475, "step": 1271, "time_per_iteration": 2.864910364151001 }, { "auxiliary_loss_clip": 0.01235349, "auxiliary_loss_mlp": 0.01038859, "balance_loss_clip": 1.06600022, "balance_loss_mlp": 1.02741528, "epoch": 0.1529489568929237, "flos": 20882952869760.0, "grad_norm": 1.7830649353833214, "language_loss": 0.85221267, "learning_rate": 3.843633718504704e-06, "loss": 0.87495476, "num_input_tokens_seen": 26954920, "step": 1272, "time_per_iteration": 2.834385395050049 }, { "auxiliary_loss_clip": 0.01225729, "auxiliary_loss_mlp": 0.01044288, "balance_loss_clip": 1.06277156, "balance_loss_mlp": 1.03360677, "epoch": 0.1530691997835628, "flos": 20083833043200.0, "grad_norm": 3.28575940310357, "language_loss": 0.90194869, "learning_rate": 3.843331629248715e-06, "loss": 0.92464882, "num_input_tokens_seen": 26972520, "step": 1273, "time_per_iteration": 2.951704263687134 }, { "auxiliary_loss_clip": 0.01235007, "auxiliary_loss_mlp": 0.01043184, "balance_loss_clip": 1.06527126, "balance_loss_mlp": 1.03258622, "epoch": 0.1531894426742019, "flos": 28759814144640.0, "grad_norm": 2.5404101238629804, "language_loss": 0.76341456, "learning_rate": 3.843029260360782e-06, "loss": 0.78619653, "num_input_tokens_seen": 26990890, "step": 1274, "time_per_iteration": 2.76232647895813 }, { "auxiliary_loss_clip": 0.01230463, "auxiliary_loss_mlp": 0.01043714, "balance_loss_clip": 1.06669497, "balance_loss_mlp": 1.03155446, "epoch": 0.15330968556484098, "flos": 22236282616320.0, "grad_norm": 1.8631299827614023, "language_loss": 0.78844976, "learning_rate": 3.8427266118867755e-06, "loss": 0.81119156, "num_input_tokens_seen": 27010640, "step": 1275, "time_per_iteration": 2.8557627201080322 }, { "auxiliary_loss_clip": 0.01224262, "auxiliary_loss_mlp": 0.01038599, "balance_loss_clip": 1.06534171, "balance_loss_mlp": 1.02680373, "epoch": 0.15342992845548006, "flos": 27527504296320.0, "grad_norm": 2.270157225195513, "language_loss": 0.82508969, "learning_rate": 3.842423683872608e-06, "loss": 0.84771836, "num_input_tokens_seen": 27031215, "step": 1276, "time_per_iteration": 2.8370821475982666 }, { "auxiliary_loss_clip": 0.01230931, "auxiliary_loss_mlp": 0.01034493, "balance_loss_clip": 1.06466508, "balance_loss_mlp": 1.02428889, "epoch": 0.15355017134611917, "flos": 19609596754560.0, "grad_norm": 2.8375846005897114, "language_loss": 0.77880841, "learning_rate": 3.842120476364232e-06, "loss": 0.80146265, "num_input_tokens_seen": 27049665, "step": 1277, "time_per_iteration": 2.830836534500122 }, { "auxiliary_loss_clip": 0.01237874, "auxiliary_loss_mlp": 0.01035991, "balance_loss_clip": 1.06462622, "balance_loss_mlp": 1.02424288, "epoch": 0.15367041423675826, "flos": 18478590238080.0, "grad_norm": 1.9840917215986775, "language_loss": 0.8351953, "learning_rate": 3.841816989407644e-06, "loss": 0.85793394, "num_input_tokens_seen": 27065155, "step": 1278, "time_per_iteration": 2.727531909942627 }, { "auxiliary_loss_clip": 0.01216561, "auxiliary_loss_mlp": 0.01038424, "balance_loss_clip": 1.06337929, "balance_loss_mlp": 1.02731943, "epoch": 0.15379065712739734, "flos": 41427662342400.0, "grad_norm": 2.0366482129011403, "language_loss": 0.76656312, "learning_rate": 3.841513223048884e-06, "loss": 0.78911293, "num_input_tokens_seen": 27085840, "step": 1279, "time_per_iteration": 3.1792640686035156 }, { "auxiliary_loss_clip": 0.01224595, "auxiliary_loss_mlp": 0.01046797, "balance_loss_clip": 1.0654633, "balance_loss_mlp": 1.03477454, "epoch": 0.15391090001803642, "flos": 22054215553920.0, "grad_norm": 3.1054439822252777, "language_loss": 0.78490168, "learning_rate": 3.841209177334031e-06, "loss": 0.80761564, "num_input_tokens_seen": 27104200, "step": 1280, "time_per_iteration": 2.865095376968384 }, { "auxiliary_loss_clip": 0.01226733, "auxiliary_loss_mlp": 0.01039374, "balance_loss_clip": 1.0630914, "balance_loss_mlp": 1.02838266, "epoch": 0.15403114290867553, "flos": 15450351258240.0, "grad_norm": 1.9210623003327068, "language_loss": 0.7466073, "learning_rate": 3.84090485230921e-06, "loss": 0.76926833, "num_input_tokens_seen": 27122440, "step": 1281, "time_per_iteration": 2.7508769035339355 }, { "auxiliary_loss_clip": 0.01233425, "auxiliary_loss_mlp": 0.01036829, "balance_loss_clip": 1.06427896, "balance_loss_mlp": 1.02601087, "epoch": 0.15415138579931462, "flos": 17929156826880.0, "grad_norm": 2.5966997054512544, "language_loss": 0.76515365, "learning_rate": 3.840600248020588e-06, "loss": 0.78785622, "num_input_tokens_seen": 27139380, "step": 1282, "time_per_iteration": 2.884328842163086 }, { "auxiliary_loss_clip": 0.0123561, "auxiliary_loss_mlp": 0.01042447, "balance_loss_clip": 1.06551158, "balance_loss_mlp": 1.03059745, "epoch": 0.1542716286899537, "flos": 11429325296640.0, "grad_norm": 2.408335860576239, "language_loss": 0.8014316, "learning_rate": 3.840295364514371e-06, "loss": 0.82421219, "num_input_tokens_seen": 27156760, "step": 1283, "time_per_iteration": 2.8298091888427734 }, { "auxiliary_loss_clip": 0.01232801, "auxiliary_loss_mlp": 0.01040951, "balance_loss_clip": 1.06825197, "balance_loss_mlp": 1.03071654, "epoch": 0.1543918715805928, "flos": 17420338719360.0, "grad_norm": 5.824665899389845, "language_loss": 0.78306305, "learning_rate": 3.83999020183681e-06, "loss": 0.80580056, "num_input_tokens_seen": 27175455, "step": 1284, "time_per_iteration": 2.7679831981658936 }, { "auxiliary_loss_clip": 0.01214848, "auxiliary_loss_mlp": 0.01037996, "balance_loss_clip": 1.0656426, "balance_loss_mlp": 1.02730322, "epoch": 0.1545121144712319, "flos": 17786376264960.0, "grad_norm": 2.0771926071380595, "language_loss": 0.78745759, "learning_rate": 3.839684760034199e-06, "loss": 0.809986, "num_input_tokens_seen": 27193660, "step": 1285, "time_per_iteration": 2.786858558654785 }, { "auxiliary_loss_clip": 0.01223653, "auxiliary_loss_mlp": 0.01034113, "balance_loss_clip": 1.07021821, "balance_loss_mlp": 1.02273417, "epoch": 0.15463235736187098, "flos": 28220185146240.0, "grad_norm": 2.370177654429271, "language_loss": 0.65264148, "learning_rate": 3.8393790391528716e-06, "loss": 0.67521918, "num_input_tokens_seen": 27214355, "step": 1286, "time_per_iteration": 2.910996675491333 }, { "auxiliary_loss_clip": 0.01225222, "auxiliary_loss_mlp": 0.01039282, "balance_loss_clip": 1.06137192, "balance_loss_mlp": 1.02882171, "epoch": 0.15475260025251006, "flos": 22856890826880.0, "grad_norm": 1.9969203046671964, "language_loss": 0.89295745, "learning_rate": 3.8390730392392075e-06, "loss": 0.91560256, "num_input_tokens_seen": 27234335, "step": 1287, "time_per_iteration": 2.8492040634155273 }, { "auxiliary_loss_clip": 0.01236519, "auxiliary_loss_mlp": 0.0104362, "balance_loss_clip": 1.06532717, "balance_loss_mlp": 1.03233695, "epoch": 0.15487284314314917, "flos": 17602872658560.0, "grad_norm": 2.4159594203387313, "language_loss": 0.79123878, "learning_rate": 3.838766760339626e-06, "loss": 0.81404024, "num_input_tokens_seen": 27252860, "step": 1288, "time_per_iteration": 2.8710806369781494 }, { "auxiliary_loss_clip": 0.01204327, "auxiliary_loss_mlp": 0.01031248, "balance_loss_clip": 1.06211734, "balance_loss_mlp": 1.01974404, "epoch": 0.15499308603378825, "flos": 20082037363200.0, "grad_norm": 2.805828438883823, "language_loss": 0.7941308, "learning_rate": 3.838460202500587e-06, "loss": 0.8164866, "num_input_tokens_seen": 27268650, "step": 1289, "time_per_iteration": 2.8310623168945312 }, { "auxiliary_loss_clip": 0.01222354, "auxiliary_loss_mlp": 0.01041077, "balance_loss_clip": 1.06948566, "balance_loss_mlp": 1.02995491, "epoch": 0.15511332892442733, "flos": 15918051271680.0, "grad_norm": 3.6463487868086952, "language_loss": 0.7433269, "learning_rate": 3.838153365768599e-06, "loss": 0.76596123, "num_input_tokens_seen": 27285160, "step": 1290, "time_per_iteration": 2.7784435749053955 }, { "auxiliary_loss_clip": 0.01222155, "auxiliary_loss_mlp": 0.01036719, "balance_loss_clip": 1.06940913, "balance_loss_mlp": 1.02635384, "epoch": 0.15523357181506645, "flos": 41282475569280.0, "grad_norm": 2.5469039711772377, "language_loss": 0.75211161, "learning_rate": 3.837846250190206e-06, "loss": 0.77470034, "num_input_tokens_seen": 27308025, "step": 1291, "time_per_iteration": 2.992082357406616 }, { "auxiliary_loss_clip": 0.01215448, "auxiliary_loss_mlp": 0.01060872, "balance_loss_clip": 1.06630576, "balance_loss_mlp": 1.02687263, "epoch": 0.15535381470570553, "flos": 18478769806080.0, "grad_norm": 2.2491064932242884, "language_loss": 0.77161562, "learning_rate": 3.837538855811998e-06, "loss": 0.79437882, "num_input_tokens_seen": 27326200, "step": 1292, "time_per_iteration": 2.8533358573913574 }, { "auxiliary_loss_clip": 0.01228748, "auxiliary_loss_mlp": 0.01037139, "balance_loss_clip": 1.06601262, "balance_loss_mlp": 1.0263027, "epoch": 0.1554740575963446, "flos": 13918150759680.0, "grad_norm": 2.643789061546489, "language_loss": 0.70883989, "learning_rate": 3.837231182680606e-06, "loss": 0.73149872, "num_input_tokens_seen": 27344165, "step": 1293, "time_per_iteration": 4.815764904022217 }, { "auxiliary_loss_clip": 0.01234526, "auxiliary_loss_mlp": 0.01034792, "balance_loss_clip": 1.06574917, "balance_loss_mlp": 1.02414024, "epoch": 0.1555943004869837, "flos": 20847078161280.0, "grad_norm": 1.7386083094530047, "language_loss": 0.75984085, "learning_rate": 3.836923230842706e-06, "loss": 0.782534, "num_input_tokens_seen": 27363280, "step": 1294, "time_per_iteration": 3.8172531127929688 }, { "auxiliary_loss_clip": 0.01225167, "auxiliary_loss_mlp": 0.01042131, "balance_loss_clip": 1.06540251, "balance_loss_mlp": 1.03161645, "epoch": 0.1557145433776228, "flos": 22085888371200.0, "grad_norm": 2.357213828306866, "language_loss": 0.806292, "learning_rate": 3.836615000345011e-06, "loss": 0.82896501, "num_input_tokens_seen": 27381460, "step": 1295, "time_per_iteration": 2.9127540588378906 }, { "auxiliary_loss_clip": 0.0123128, "auxiliary_loss_mlp": 0.0103273, "balance_loss_clip": 1.06355596, "balance_loss_mlp": 1.02280021, "epoch": 0.1558347862682619, "flos": 19791987039360.0, "grad_norm": 2.1177271112172558, "language_loss": 0.7788465, "learning_rate": 3.836306491234282e-06, "loss": 0.80148661, "num_input_tokens_seen": 27399310, "step": 1296, "time_per_iteration": 4.022518873214722 }, { "auxiliary_loss_clip": 0.01219427, "auxiliary_loss_mlp": 0.01040873, "balance_loss_clip": 1.06541264, "balance_loss_mlp": 1.0303762, "epoch": 0.15595502915890097, "flos": 17237086508160.0, "grad_norm": 2.1669868731626476, "language_loss": 0.75361454, "learning_rate": 3.835997703557317e-06, "loss": 0.77621758, "num_input_tokens_seen": 27416050, "step": 1297, "time_per_iteration": 2.8661272525787354 }, { "auxiliary_loss_clip": 0.01220855, "auxiliary_loss_mlp": 0.0103554, "balance_loss_clip": 1.0652256, "balance_loss_mlp": 1.02513313, "epoch": 0.15607527204954008, "flos": 19719519350400.0, "grad_norm": 1.8474110265224089, "language_loss": 0.80240917, "learning_rate": 3.83568863736096e-06, "loss": 0.82497311, "num_input_tokens_seen": 27434920, "step": 1298, "time_per_iteration": 2.9132585525512695 }, { "auxiliary_loss_clip": 0.01225443, "auxiliary_loss_mlp": 0.01037195, "balance_loss_clip": 1.06479526, "balance_loss_mlp": 1.02702069, "epoch": 0.15619551494017916, "flos": 18515650095360.0, "grad_norm": 2.5435775104518403, "language_loss": 0.88983309, "learning_rate": 3.8353792926920975e-06, "loss": 0.91245949, "num_input_tokens_seen": 27453570, "step": 1299, "time_per_iteration": 2.940373182296753 }, { "auxiliary_loss_clip": 0.01236357, "auxiliary_loss_mlp": 0.01043784, "balance_loss_clip": 1.06577802, "balance_loss_mlp": 1.03300738, "epoch": 0.15631575783081825, "flos": 19902125116800.0, "grad_norm": 2.1767730803816705, "language_loss": 0.81499606, "learning_rate": 3.835069669597655e-06, "loss": 0.83779752, "num_input_tokens_seen": 27471960, "step": 1300, "time_per_iteration": 2.9574155807495117 }, { "auxiliary_loss_clip": 0.01235134, "auxiliary_loss_mlp": 0.01058392, "balance_loss_clip": 1.06604075, "balance_loss_mlp": 1.02460229, "epoch": 0.15643600072145733, "flos": 20777663128320.0, "grad_norm": 2.2864561045324137, "language_loss": 0.79663986, "learning_rate": 3.834759768124603e-06, "loss": 0.81957507, "num_input_tokens_seen": 27490835, "step": 1301, "time_per_iteration": 2.810648202896118 }, { "auxiliary_loss_clip": 0.01225433, "auxiliary_loss_mlp": 0.01036258, "balance_loss_clip": 1.06722832, "balance_loss_mlp": 1.02472484, "epoch": 0.15655624361209644, "flos": 18546389159040.0, "grad_norm": 4.3601073510885815, "language_loss": 0.76320189, "learning_rate": 3.834449588319953e-06, "loss": 0.78581882, "num_input_tokens_seen": 27508870, "step": 1302, "time_per_iteration": 2.8149542808532715 }, { "auxiliary_loss_clip": 0.01224738, "auxiliary_loss_mlp": 0.01034592, "balance_loss_clip": 1.06550431, "balance_loss_mlp": 1.02505517, "epoch": 0.15667648650273552, "flos": 25229544727680.0, "grad_norm": 3.03514624523405, "language_loss": 0.85451192, "learning_rate": 3.834139130230758e-06, "loss": 0.87710524, "num_input_tokens_seen": 27528175, "step": 1303, "time_per_iteration": 2.8549411296844482 }, { "auxiliary_loss_clip": 0.0122443, "auxiliary_loss_mlp": 0.01036085, "balance_loss_clip": 1.05944395, "balance_loss_mlp": 1.0254755, "epoch": 0.1567967293933746, "flos": 24827093769600.0, "grad_norm": 2.0085200025426104, "language_loss": 0.81289554, "learning_rate": 3.833828393904117e-06, "loss": 0.83550066, "num_input_tokens_seen": 27548455, "step": 1304, "time_per_iteration": 2.883582353591919 }, { "auxiliary_loss_clip": 0.01214612, "auxiliary_loss_mlp": 0.01040866, "balance_loss_clip": 1.06407237, "balance_loss_mlp": 1.03115678, "epoch": 0.15691697228401372, "flos": 19164555244800.0, "grad_norm": 2.317489880891033, "language_loss": 0.77680027, "learning_rate": 3.833517379387165e-06, "loss": 0.79935515, "num_input_tokens_seen": 27564910, "step": 1305, "time_per_iteration": 2.8859732151031494 }, { "auxiliary_loss_clip": 0.01235367, "auxiliary_loss_mlp": 0.01034389, "balance_loss_clip": 1.06778467, "balance_loss_mlp": 1.02385056, "epoch": 0.1570372151746528, "flos": 24790931752320.0, "grad_norm": 2.449065674835158, "language_loss": 0.88837272, "learning_rate": 3.833206086727085e-06, "loss": 0.91107023, "num_input_tokens_seen": 27584260, "step": 1306, "time_per_iteration": 2.755089044570923 }, { "auxiliary_loss_clip": 0.01225097, "auxiliary_loss_mlp": 0.01035641, "balance_loss_clip": 1.06324863, "balance_loss_mlp": 1.02581811, "epoch": 0.15715745806529188, "flos": 24863650836480.0, "grad_norm": 6.401934042922084, "language_loss": 0.70820343, "learning_rate": 3.8328945159710994e-06, "loss": 0.73081076, "num_input_tokens_seen": 27604440, "step": 1307, "time_per_iteration": 2.946864128112793 }, { "auxiliary_loss_clip": 0.01236244, "auxiliary_loss_mlp": 0.01064032, "balance_loss_clip": 1.0680604, "balance_loss_mlp": 1.0316186, "epoch": 0.157277700955931, "flos": 21872148491520.0, "grad_norm": 2.3357432884951788, "language_loss": 0.886563, "learning_rate": 3.832582667166473e-06, "loss": 0.90956575, "num_input_tokens_seen": 27624250, "step": 1308, "time_per_iteration": 2.860837936401367 }, { "auxiliary_loss_clip": 0.01223623, "auxiliary_loss_mlp": 0.01031999, "balance_loss_clip": 1.06436455, "balance_loss_mlp": 1.02136612, "epoch": 0.15739794384657008, "flos": 24533344344960.0, "grad_norm": 1.7944477278180073, "language_loss": 0.81612951, "learning_rate": 3.8322705403605125e-06, "loss": 0.83868575, "num_input_tokens_seen": 27644595, "step": 1309, "time_per_iteration": 2.996676206588745 }, { "auxiliary_loss_clip": 0.01213792, "auxiliary_loss_mlp": 0.01028433, "balance_loss_clip": 1.06059575, "balance_loss_mlp": 1.01858044, "epoch": 0.15751818673720916, "flos": 17745329998080.0, "grad_norm": 2.0632701017860953, "language_loss": 0.8125549, "learning_rate": 3.831958135600568e-06, "loss": 0.83497715, "num_input_tokens_seen": 27662145, "step": 1310, "time_per_iteration": 2.8411898612976074 }, { "auxiliary_loss_clip": 0.01226516, "auxiliary_loss_mlp": 0.01037834, "balance_loss_clip": 1.0617516, "balance_loss_mlp": 1.02811837, "epoch": 0.15763842962784824, "flos": 17858520731520.0, "grad_norm": 2.272770262425784, "language_loss": 0.7998777, "learning_rate": 3.831645452934032e-06, "loss": 0.82252121, "num_input_tokens_seen": 27680575, "step": 1311, "time_per_iteration": 2.7290749549865723 }, { "auxiliary_loss_clip": 0.01232692, "auxiliary_loss_mlp": 0.01040201, "balance_loss_clip": 1.06512284, "balance_loss_mlp": 1.030146, "epoch": 0.15775867251848735, "flos": 26980908059520.0, "grad_norm": 1.820341745697119, "language_loss": 0.79826605, "learning_rate": 3.831332492408336e-06, "loss": 0.82099497, "num_input_tokens_seen": 27701985, "step": 1312, "time_per_iteration": 2.7274999618530273 }, { "auxiliary_loss_clip": 0.01217772, "auxiliary_loss_mlp": 0.01032994, "balance_loss_clip": 1.06148171, "balance_loss_mlp": 1.02293289, "epoch": 0.15787891540912644, "flos": 19240398812160.0, "grad_norm": 1.8224365795036956, "language_loss": 0.69314468, "learning_rate": 3.831019254070957e-06, "loss": 0.71565235, "num_input_tokens_seen": 27719770, "step": 1313, "time_per_iteration": 2.6746110916137695 }, { "auxiliary_loss_clip": 0.01216518, "auxiliary_loss_mlp": 0.01036283, "balance_loss_clip": 1.06171513, "balance_loss_mlp": 1.02668023, "epoch": 0.15799915829976552, "flos": 27271102037760.0, "grad_norm": 4.169927663609019, "language_loss": 0.94965684, "learning_rate": 3.8307057379694135e-06, "loss": 0.9721849, "num_input_tokens_seen": 27739105, "step": 1314, "time_per_iteration": 2.9338607788085938 }, { "auxiliary_loss_clip": 0.01231354, "auxiliary_loss_mlp": 0.0103608, "balance_loss_clip": 1.0613507, "balance_loss_mlp": 1.0257268, "epoch": 0.15811940119040463, "flos": 20405520270720.0, "grad_norm": 2.2809685720048907, "language_loss": 0.82245666, "learning_rate": 3.830391944151264e-06, "loss": 0.84513104, "num_input_tokens_seen": 27754985, "step": 1315, "time_per_iteration": 2.7213926315307617 }, { "auxiliary_loss_clip": 0.01223093, "auxiliary_loss_mlp": 0.01040527, "balance_loss_clip": 1.06247139, "balance_loss_mlp": 1.0301795, "epoch": 0.1582396440810437, "flos": 32599347661440.0, "grad_norm": 1.9802126852806057, "language_loss": 0.67419553, "learning_rate": 3.830077872664114e-06, "loss": 0.6968317, "num_input_tokens_seen": 27776110, "step": 1316, "time_per_iteration": 2.8491697311401367 }, { "auxiliary_loss_clip": 0.01210656, "auxiliary_loss_mlp": 0.01039213, "balance_loss_clip": 1.06017673, "balance_loss_mlp": 1.02884221, "epoch": 0.1583598869716828, "flos": 33800559310080.0, "grad_norm": 1.784002120064088, "language_loss": 0.72964799, "learning_rate": 3.829763523555604e-06, "loss": 0.75214666, "num_input_tokens_seen": 27796510, "step": 1317, "time_per_iteration": 2.9084694385528564 }, { "auxiliary_loss_clip": 0.01218368, "auxiliary_loss_mlp": 0.01031655, "balance_loss_clip": 1.06181324, "balance_loss_mlp": 1.02285171, "epoch": 0.15848012986232188, "flos": 24681332378880.0, "grad_norm": 2.5255087024175924, "language_loss": 0.77860856, "learning_rate": 3.829448896873423e-06, "loss": 0.80110884, "num_input_tokens_seen": 27815610, "step": 1318, "time_per_iteration": 2.8862829208374023 }, { "auxiliary_loss_clip": 0.01202701, "auxiliary_loss_mlp": 0.01063267, "balance_loss_clip": 1.06233263, "balance_loss_mlp": 1.03098655, "epoch": 0.158600372752961, "flos": 22602068766720.0, "grad_norm": 1.834431435055217, "language_loss": 0.79187983, "learning_rate": 3.829133992665299e-06, "loss": 0.81453949, "num_input_tokens_seen": 27834735, "step": 1319, "time_per_iteration": 4.826078176498413 }, { "auxiliary_loss_clip": 0.01215918, "auxiliary_loss_mlp": 0.01034269, "balance_loss_clip": 1.06101203, "balance_loss_mlp": 1.02454722, "epoch": 0.15872061564360007, "flos": 27927944092800.0, "grad_norm": 2.3794642269651285, "language_loss": 0.89193618, "learning_rate": 3.828818810979002e-06, "loss": 0.91443807, "num_input_tokens_seen": 27853065, "step": 1320, "time_per_iteration": 3.710981607437134 }, { "auxiliary_loss_clip": 0.01227135, "auxiliary_loss_mlp": 0.01037185, "balance_loss_clip": 1.06241155, "balance_loss_mlp": 1.02750516, "epoch": 0.15884085853423915, "flos": 23696805525120.0, "grad_norm": 1.882480983776891, "language_loss": 0.80028713, "learning_rate": 3.8285033518623454e-06, "loss": 0.82293034, "num_input_tokens_seen": 27873315, "step": 1321, "time_per_iteration": 2.7296297550201416 }, { "auxiliary_loss_clip": 0.01230717, "auxiliary_loss_mlp": 0.01037853, "balance_loss_clip": 1.06243086, "balance_loss_mlp": 1.02696955, "epoch": 0.15896110142487826, "flos": 23112359331840.0, "grad_norm": 3.029007455116237, "language_loss": 0.80916548, "learning_rate": 3.8281876153631845e-06, "loss": 0.83185118, "num_input_tokens_seen": 27890070, "step": 1322, "time_per_iteration": 3.770887613296509 }, { "auxiliary_loss_clip": 0.01212431, "auxiliary_loss_mlp": 0.01040348, "balance_loss_clip": 1.0610429, "balance_loss_mlp": 1.02996516, "epoch": 0.15908134431551735, "flos": 14685238632960.0, "grad_norm": 2.130355459791097, "language_loss": 0.64986956, "learning_rate": 3.827871601529416e-06, "loss": 0.67239738, "num_input_tokens_seen": 27908590, "step": 1323, "time_per_iteration": 2.930760622024536 }, { "auxiliary_loss_clip": 0.01214696, "auxiliary_loss_mlp": 0.01029915, "balance_loss_clip": 1.06242573, "balance_loss_mlp": 1.02028263, "epoch": 0.15920158720615643, "flos": 20193611984640.0, "grad_norm": 1.762939564677405, "language_loss": 0.80944979, "learning_rate": 3.827555310408979e-06, "loss": 0.83189595, "num_input_tokens_seen": 27927985, "step": 1324, "time_per_iteration": 2.8078737258911133 }, { "auxiliary_loss_clip": 0.01210841, "auxiliary_loss_mlp": 0.01032235, "balance_loss_clip": 1.06114125, "balance_loss_mlp": 1.02251399, "epoch": 0.1593218300967955, "flos": 24826626892800.0, "grad_norm": 2.405300511066256, "language_loss": 0.82941043, "learning_rate": 3.827238742049854e-06, "loss": 0.85184115, "num_input_tokens_seen": 27948280, "step": 1325, "time_per_iteration": 2.829782247543335 }, { "auxiliary_loss_clip": 0.01226814, "auxiliary_loss_mlp": 0.01032464, "balance_loss_clip": 1.06034136, "balance_loss_mlp": 1.02173495, "epoch": 0.15944207298743462, "flos": 28328707111680.0, "grad_norm": 2.0147820876688716, "language_loss": 0.51721323, "learning_rate": 3.826921896500066e-06, "loss": 0.53980601, "num_input_tokens_seen": 27969565, "step": 1326, "time_per_iteration": 2.8544676303863525 }, { "auxiliary_loss_clip": 0.01223858, "auxiliary_loss_mlp": 0.01033264, "balance_loss_clip": 1.0629158, "balance_loss_mlp": 1.02297604, "epoch": 0.1595623158780737, "flos": 22964838174720.0, "grad_norm": 2.1113921700375125, "language_loss": 0.78412747, "learning_rate": 3.826604773807678e-06, "loss": 0.80669868, "num_input_tokens_seen": 27987540, "step": 1327, "time_per_iteration": 2.8781704902648926 }, { "auxiliary_loss_clip": 0.01225089, "auxiliary_loss_mlp": 0.01035906, "balance_loss_clip": 1.06247091, "balance_loss_mlp": 1.02552307, "epoch": 0.1596825587687128, "flos": 19710540950400.0, "grad_norm": 2.6365087265606975, "language_loss": 0.73365521, "learning_rate": 3.826287374020798e-06, "loss": 0.75626516, "num_input_tokens_seen": 28002345, "step": 1328, "time_per_iteration": 2.7435190677642822 }, { "auxiliary_loss_clip": 0.01232322, "auxiliary_loss_mlp": 0.01039931, "balance_loss_clip": 1.06475973, "balance_loss_mlp": 1.02940452, "epoch": 0.1598028016593519, "flos": 22637727993600.0, "grad_norm": 2.398619010949612, "language_loss": 0.82048965, "learning_rate": 3.825969697187575e-06, "loss": 0.84321213, "num_input_tokens_seen": 28021675, "step": 1329, "time_per_iteration": 2.758148670196533 }, { "auxiliary_loss_clip": 0.01215963, "auxiliary_loss_mlp": 0.01038885, "balance_loss_clip": 1.06204391, "balance_loss_mlp": 1.02868056, "epoch": 0.15992304454999098, "flos": 20482908122880.0, "grad_norm": 1.7689340088803582, "language_loss": 0.69603062, "learning_rate": 3.8256517433562015e-06, "loss": 0.71857911, "num_input_tokens_seen": 28039615, "step": 1330, "time_per_iteration": 2.810176134109497 }, { "auxiliary_loss_clip": 0.01229376, "auxiliary_loss_mlp": 0.01032003, "balance_loss_clip": 1.06266642, "balance_loss_mlp": 1.02180421, "epoch": 0.16004328744063007, "flos": 17676094533120.0, "grad_norm": 2.254414959131903, "language_loss": 0.91670936, "learning_rate": 3.82533351257491e-06, "loss": 0.93932319, "num_input_tokens_seen": 28057565, "step": 1331, "time_per_iteration": 2.7785375118255615 }, { "auxiliary_loss_clip": 0.01222271, "auxiliary_loss_mlp": 0.01035622, "balance_loss_clip": 1.06351507, "balance_loss_mlp": 1.02570963, "epoch": 0.16016353033126918, "flos": 24098717779200.0, "grad_norm": 1.974508951655355, "language_loss": 0.88533032, "learning_rate": 3.825015004891975e-06, "loss": 0.90790915, "num_input_tokens_seen": 28076305, "step": 1332, "time_per_iteration": 2.8863608837127686 }, { "auxiliary_loss_clip": 0.01220647, "auxiliary_loss_mlp": 0.01031289, "balance_loss_clip": 1.06207514, "balance_loss_mlp": 1.02162743, "epoch": 0.16028377322190826, "flos": 27634841112960.0, "grad_norm": 3.3676836821638947, "language_loss": 0.75519854, "learning_rate": 3.824696220355716e-06, "loss": 0.77771795, "num_input_tokens_seen": 28097895, "step": 1333, "time_per_iteration": 2.8631606101989746 }, { "auxiliary_loss_clip": 0.0121294, "auxiliary_loss_mlp": 0.01031149, "balance_loss_clip": 1.05972624, "balance_loss_mlp": 1.0207417, "epoch": 0.16040401611254734, "flos": 20961202648320.0, "grad_norm": 1.927591138761728, "language_loss": 0.7891897, "learning_rate": 3.824377159014491e-06, "loss": 0.81163061, "num_input_tokens_seen": 28118790, "step": 1334, "time_per_iteration": 2.7778728008270264 }, { "auxiliary_loss_clip": 0.01217736, "auxiliary_loss_mlp": 0.01037893, "balance_loss_clip": 1.06026268, "balance_loss_mlp": 1.02768219, "epoch": 0.16052425900318643, "flos": 21247051080960.0, "grad_norm": 2.259298458209039, "language_loss": 0.85206693, "learning_rate": 3.824057820916702e-06, "loss": 0.87462318, "num_input_tokens_seen": 28135995, "step": 1335, "time_per_iteration": 2.7805631160736084 }, { "auxiliary_loss_clip": 0.01223717, "auxiliary_loss_mlp": 0.01032379, "balance_loss_clip": 1.06243396, "balance_loss_mlp": 1.02144718, "epoch": 0.16064450189382554, "flos": 15524004096000.0, "grad_norm": 2.4455321769599143, "language_loss": 0.72085369, "learning_rate": 3.8237382061107904e-06, "loss": 0.74341464, "num_input_tokens_seen": 28152715, "step": 1336, "time_per_iteration": 2.744434356689453 }, { "auxiliary_loss_clip": 0.01194811, "auxiliary_loss_mlp": 0.01043354, "balance_loss_clip": 1.05811334, "balance_loss_mlp": 1.03329253, "epoch": 0.16076474478446462, "flos": 21178497974400.0, "grad_norm": 2.5056994454858903, "language_loss": 0.78671759, "learning_rate": 3.823418314645243e-06, "loss": 0.80909926, "num_input_tokens_seen": 28171590, "step": 1337, "time_per_iteration": 2.915332317352295 }, { "auxiliary_loss_clip": 0.01194611, "auxiliary_loss_mlp": 0.01028464, "balance_loss_clip": 1.0609237, "balance_loss_mlp": 1.01966047, "epoch": 0.1608849876751037, "flos": 18366476912640.0, "grad_norm": 2.8396824711710837, "language_loss": 0.75268686, "learning_rate": 3.823098146568588e-06, "loss": 0.7749176, "num_input_tokens_seen": 28191295, "step": 1338, "time_per_iteration": 2.8115038871765137 }, { "auxiliary_loss_clip": 0.0122385, "auxiliary_loss_mlp": 0.01035082, "balance_loss_clip": 1.06220603, "balance_loss_mlp": 1.02521718, "epoch": 0.1610052305657428, "flos": 29497024880640.0, "grad_norm": 2.340072080802891, "language_loss": 0.71242428, "learning_rate": 3.822777701929394e-06, "loss": 0.7350136, "num_input_tokens_seen": 28213120, "step": 1339, "time_per_iteration": 2.8517510890960693 }, { "auxiliary_loss_clip": 0.01213354, "auxiliary_loss_mlp": 0.01033625, "balance_loss_clip": 1.05886436, "balance_loss_mlp": 1.02388, "epoch": 0.1611254734563819, "flos": 26797871329920.0, "grad_norm": 2.150796141428633, "language_loss": 0.73460793, "learning_rate": 3.8224569807762714e-06, "loss": 0.75707775, "num_input_tokens_seen": 28232440, "step": 1340, "time_per_iteration": 2.917647123336792 }, { "auxiliary_loss_clip": 0.01197095, "auxiliary_loss_mlp": 0.0103042, "balance_loss_clip": 1.06098175, "balance_loss_mlp": 1.02041256, "epoch": 0.16124571634702098, "flos": 22419570741120.0, "grad_norm": 3.843096863719526, "language_loss": 0.76384497, "learning_rate": 3.822135983157873e-06, "loss": 0.78612006, "num_input_tokens_seen": 28251715, "step": 1341, "time_per_iteration": 2.877815008163452 }, { "auxiliary_loss_clip": 0.01223694, "auxiliary_loss_mlp": 0.01050958, "balance_loss_clip": 1.05988121, "balance_loss_mlp": 1.02004027, "epoch": 0.16136595923766006, "flos": 10999116103680.0, "grad_norm": 2.038899695815094, "language_loss": 0.84172487, "learning_rate": 3.821814709122896e-06, "loss": 0.86447144, "num_input_tokens_seen": 28269765, "step": 1342, "time_per_iteration": 2.904167413711548 }, { "auxiliary_loss_clip": 0.01218434, "auxiliary_loss_mlp": 0.01033858, "balance_loss_clip": 1.06117415, "balance_loss_mlp": 1.02474427, "epoch": 0.16148620212829917, "flos": 21214983214080.0, "grad_norm": 2.1432845176392856, "language_loss": 0.84815663, "learning_rate": 3.821493158720076e-06, "loss": 0.87067956, "num_input_tokens_seen": 28288870, "step": 1343, "time_per_iteration": 2.919891119003296 }, { "auxiliary_loss_clip": 0.01218808, "auxiliary_loss_mlp": 0.01031254, "balance_loss_clip": 1.06129718, "balance_loss_mlp": 1.02054346, "epoch": 0.16160644501893826, "flos": 16758468760320.0, "grad_norm": 3.7886257674604207, "language_loss": 0.73182762, "learning_rate": 3.821171331998191e-06, "loss": 0.75432825, "num_input_tokens_seen": 28305400, "step": 1344, "time_per_iteration": 2.9227874279022217 }, { "auxiliary_loss_clip": 0.01144073, "auxiliary_loss_mlp": 0.01011832, "balance_loss_clip": 1.04725862, "balance_loss_mlp": 1.00950706, "epoch": 0.16172668790957734, "flos": 64444967308800.0, "grad_norm": 0.7124121112978155, "language_loss": 0.54467177, "learning_rate": 3.820849229006064e-06, "loss": 0.56623089, "num_input_tokens_seen": 28373150, "step": 1345, "time_per_iteration": 6.871700286865234 }, { "auxiliary_loss_clip": 0.01228549, "auxiliary_loss_mlp": 0.01047408, "balance_loss_clip": 1.06242514, "balance_loss_mlp": 1.03758526, "epoch": 0.16184693080021645, "flos": 23257689759360.0, "grad_norm": 1.925843665273008, "language_loss": 0.70867443, "learning_rate": 3.8205268497925564e-06, "loss": 0.73143399, "num_input_tokens_seen": 28393620, "step": 1346, "time_per_iteration": 2.8203530311584473 }, { "auxiliary_loss_clip": 0.01226518, "auxiliary_loss_mlp": 0.01033981, "balance_loss_clip": 1.06189799, "balance_loss_mlp": 1.02454019, "epoch": 0.16196717369085553, "flos": 17451113696640.0, "grad_norm": 3.0714529313947505, "language_loss": 0.78275859, "learning_rate": 3.8202041944065725e-06, "loss": 0.80536354, "num_input_tokens_seen": 28409440, "step": 1347, "time_per_iteration": 4.232586145401001 }, { "auxiliary_loss_clip": 0.01225195, "auxiliary_loss_mlp": 0.01035617, "balance_loss_clip": 1.06135023, "balance_loss_mlp": 1.02564526, "epoch": 0.16208741658149461, "flos": 23873377806720.0, "grad_norm": 2.080538065398988, "language_loss": 0.73770458, "learning_rate": 3.819881262897061e-06, "loss": 0.76031268, "num_input_tokens_seen": 28427575, "step": 1348, "time_per_iteration": 2.8017525672912598 }, { "auxiliary_loss_clip": 0.01213699, "auxiliary_loss_mlp": 0.01036217, "balance_loss_clip": 1.06223512, "balance_loss_mlp": 1.02668571, "epoch": 0.1622076594721337, "flos": 25884806584320.0, "grad_norm": 2.275993110196664, "language_loss": 0.73595941, "learning_rate": 3.819558055313008e-06, "loss": 0.75845855, "num_input_tokens_seen": 28448260, "step": 1349, "time_per_iteration": 2.8601324558258057 }, { "auxiliary_loss_clip": 0.0122676, "auxiliary_loss_mlp": 0.01034327, "balance_loss_clip": 1.06055772, "balance_loss_mlp": 1.02477276, "epoch": 0.1623279023627728, "flos": 21539759011200.0, "grad_norm": 2.566628592644802, "language_loss": 0.77352601, "learning_rate": 3.819234571703444e-06, "loss": 0.79613686, "num_input_tokens_seen": 28467085, "step": 1350, "time_per_iteration": 2.7866218090057373 }, { "auxiliary_loss_clip": 0.01217474, "auxiliary_loss_mlp": 0.01036762, "balance_loss_clip": 1.060588, "balance_loss_mlp": 1.02681446, "epoch": 0.1624481452534119, "flos": 22085421494400.0, "grad_norm": 2.529328112312667, "language_loss": 0.85846746, "learning_rate": 3.8189108121174435e-06, "loss": 0.88100982, "num_input_tokens_seen": 28486850, "step": 1351, "time_per_iteration": 2.740689516067505 }, { "auxiliary_loss_clip": 0.0120726, "auxiliary_loss_mlp": 0.01031858, "balance_loss_clip": 1.05918896, "balance_loss_mlp": 1.02169561, "epoch": 0.16256838814405097, "flos": 27087490690560.0, "grad_norm": 2.1375195616134697, "language_loss": 0.83462715, "learning_rate": 3.818586776604118e-06, "loss": 0.85701835, "num_input_tokens_seen": 28507490, "step": 1352, "time_per_iteration": 2.7670469284057617 }, { "auxiliary_loss_clip": 0.01215692, "auxiliary_loss_mlp": 0.01046462, "balance_loss_clip": 1.06272936, "balance_loss_mlp": 1.03669894, "epoch": 0.16268863103469008, "flos": 20120354196480.0, "grad_norm": 2.161958622565141, "language_loss": 0.61765993, "learning_rate": 3.818262465212625e-06, "loss": 0.64028144, "num_input_tokens_seen": 28527615, "step": 1353, "time_per_iteration": 2.750986337661743 }, { "auxiliary_loss_clip": 0.01215279, "auxiliary_loss_mlp": 0.01035605, "balance_loss_clip": 1.06188393, "balance_loss_mlp": 1.02478075, "epoch": 0.16280887392532917, "flos": 18332792933760.0, "grad_norm": 2.84085739687824, "language_loss": 0.7722069, "learning_rate": 3.817937877992161e-06, "loss": 0.79471564, "num_input_tokens_seen": 28544910, "step": 1354, "time_per_iteration": 2.836672306060791 }, { "auxiliary_loss_clip": 0.01217583, "auxiliary_loss_mlp": 0.01053458, "balance_loss_clip": 1.06339395, "balance_loss_mlp": 1.02220249, "epoch": 0.16292911681596825, "flos": 11874330892800.0, "grad_norm": 2.454991489371379, "language_loss": 0.8535918, "learning_rate": 3.817613014991967e-06, "loss": 0.87630212, "num_input_tokens_seen": 28561050, "step": 1355, "time_per_iteration": 2.8455915451049805 }, { "auxiliary_loss_clip": 0.01210505, "auxiliary_loss_mlp": 0.01036954, "balance_loss_clip": 1.06310165, "balance_loss_mlp": 1.02730429, "epoch": 0.16304935970660733, "flos": 26103466627200.0, "grad_norm": 3.3354085460902967, "language_loss": 0.76232147, "learning_rate": 3.817287876261323e-06, "loss": 0.78479612, "num_input_tokens_seen": 28581385, "step": 1356, "time_per_iteration": 2.848179817199707 }, { "auxiliary_loss_clip": 0.01217841, "auxiliary_loss_mlp": 0.0103083, "balance_loss_clip": 1.06337214, "balance_loss_mlp": 1.02113199, "epoch": 0.16316960259724644, "flos": 29351945848320.0, "grad_norm": 2.3338582190909163, "language_loss": 0.79978514, "learning_rate": 3.816962461849553e-06, "loss": 0.82227182, "num_input_tokens_seen": 28603255, "step": 1357, "time_per_iteration": 2.9051318168640137 }, { "auxiliary_loss_clip": 0.01211367, "auxiliary_loss_mlp": 0.01040285, "balance_loss_clip": 1.0619911, "balance_loss_mlp": 1.03051579, "epoch": 0.16328984548788553, "flos": 20886759711360.0, "grad_norm": 2.420239116040607, "language_loss": 0.84396815, "learning_rate": 3.8166367718060235e-06, "loss": 0.8664847, "num_input_tokens_seen": 28623145, "step": 1358, "time_per_iteration": 2.833662748336792 }, { "auxiliary_loss_clip": 0.01219399, "auxiliary_loss_mlp": 0.01040259, "balance_loss_clip": 1.05875981, "balance_loss_mlp": 1.03040051, "epoch": 0.1634100883785246, "flos": 18041090584320.0, "grad_norm": 3.9911746185147345, "language_loss": 0.76953709, "learning_rate": 3.816310806180139e-06, "loss": 0.79213369, "num_input_tokens_seen": 28641555, "step": 1359, "time_per_iteration": 2.8036997318267822 }, { "auxiliary_loss_clip": 0.01219371, "auxiliary_loss_mlp": 0.01039416, "balance_loss_clip": 1.06254578, "balance_loss_mlp": 1.028216, "epoch": 0.16353033126916372, "flos": 24572128055040.0, "grad_norm": 1.6561927501929132, "language_loss": 0.80832553, "learning_rate": 3.81598456502135e-06, "loss": 0.83091342, "num_input_tokens_seen": 28661575, "step": 1360, "time_per_iteration": 2.848877429962158 }, { "auxiliary_loss_clip": 0.0121593, "auxiliary_loss_mlp": 0.010401, "balance_loss_clip": 1.06216073, "balance_loss_mlp": 1.03018212, "epoch": 0.1636505741598028, "flos": 19892895321600.0, "grad_norm": 2.081273733688539, "language_loss": 0.86967731, "learning_rate": 3.8156580483791455e-06, "loss": 0.89223766, "num_input_tokens_seen": 28676765, "step": 1361, "time_per_iteration": 2.827965497970581 }, { "auxiliary_loss_clip": 0.01229391, "auxiliary_loss_mlp": 0.01035428, "balance_loss_clip": 1.06329858, "balance_loss_mlp": 1.02527153, "epoch": 0.16377081705044189, "flos": 28402611344640.0, "grad_norm": 2.3618065988109347, "language_loss": 0.7738148, "learning_rate": 3.815331256303059e-06, "loss": 0.79646295, "num_input_tokens_seen": 28696795, "step": 1362, "time_per_iteration": 2.8553378582000732 }, { "auxiliary_loss_clip": 0.01211374, "auxiliary_loss_mlp": 0.01041825, "balance_loss_clip": 1.06325579, "balance_loss_mlp": 1.03157294, "epoch": 0.163891059941081, "flos": 21908059113600.0, "grad_norm": 2.3766117596683727, "language_loss": 0.77240765, "learning_rate": 3.815004188842665e-06, "loss": 0.79493964, "num_input_tokens_seen": 28714835, "step": 1363, "time_per_iteration": 2.7878992557525635 }, { "auxiliary_loss_clip": 0.0121379, "auxiliary_loss_mlp": 0.01038253, "balance_loss_clip": 1.05891478, "balance_loss_mlp": 1.0277983, "epoch": 0.16401130283172008, "flos": 26797619934720.0, "grad_norm": 1.5506899466870094, "language_loss": 0.79527116, "learning_rate": 3.814676846047578e-06, "loss": 0.81779158, "num_input_tokens_seen": 28735710, "step": 1364, "time_per_iteration": 2.924954414367676 }, { "auxiliary_loss_clip": 0.01224504, "auxiliary_loss_mlp": 0.01044788, "balance_loss_clip": 1.06524169, "balance_loss_mlp": 1.03424382, "epoch": 0.16413154572235916, "flos": 32997417160320.0, "grad_norm": 1.788591803194986, "language_loss": 0.69686437, "learning_rate": 3.8143492279674565e-06, "loss": 0.71955729, "num_input_tokens_seen": 28758405, "step": 1365, "time_per_iteration": 2.953822135925293 }, { "auxiliary_loss_clip": 0.01127825, "auxiliary_loss_mlp": 0.01012195, "balance_loss_clip": 1.03631258, "balance_loss_mlp": 1.00977516, "epoch": 0.16425178861299825, "flos": 40113622074240.0, "grad_norm": 0.8444476431839159, "language_loss": 0.58419973, "learning_rate": 3.8140213346519997e-06, "loss": 0.60559994, "num_input_tokens_seen": 28809000, "step": 1366, "time_per_iteration": 3.0614771842956543 }, { "auxiliary_loss_clip": 0.01208087, "auxiliary_loss_mlp": 0.01034494, "balance_loss_clip": 1.06006372, "balance_loss_mlp": 1.02428365, "epoch": 0.16437203150363736, "flos": 25447486498560.0, "grad_norm": 2.3070335349305804, "language_loss": 0.76298839, "learning_rate": 3.813693166150948e-06, "loss": 0.78541422, "num_input_tokens_seen": 28829210, "step": 1367, "time_per_iteration": 2.8473219871520996 }, { "auxiliary_loss_clip": 0.01206948, "auxiliary_loss_mlp": 0.01034907, "balance_loss_clip": 1.06274152, "balance_loss_mlp": 1.02451766, "epoch": 0.16449227439427644, "flos": 23476888506240.0, "grad_norm": 2.2429024669449635, "language_loss": 0.85278654, "learning_rate": 3.813364722514086e-06, "loss": 0.87520504, "num_input_tokens_seen": 28847545, "step": 1368, "time_per_iteration": 2.8820159435272217 }, { "auxiliary_loss_clip": 0.01224349, "auxiliary_loss_mlp": 0.01037447, "balance_loss_clip": 1.06290054, "balance_loss_mlp": 1.02684903, "epoch": 0.16461251728491552, "flos": 13545217802880.0, "grad_norm": 4.375829361214872, "language_loss": 0.80791599, "learning_rate": 3.8130360037912368e-06, "loss": 0.83053398, "num_input_tokens_seen": 28863990, "step": 1369, "time_per_iteration": 2.853238821029663 }, { "auxiliary_loss_clip": 0.01222278, "auxiliary_loss_mlp": 0.01035081, "balance_loss_clip": 1.05915976, "balance_loss_mlp": 1.0251627, "epoch": 0.16473276017555463, "flos": 23003298662400.0, "grad_norm": 2.258470623078606, "language_loss": 0.81715339, "learning_rate": 3.812707010032268e-06, "loss": 0.83972692, "num_input_tokens_seen": 28883045, "step": 1370, "time_per_iteration": 2.7752208709716797 }, { "auxiliary_loss_clip": 0.01228675, "auxiliary_loss_mlp": 0.01035457, "balance_loss_clip": 1.06492972, "balance_loss_mlp": 1.02544343, "epoch": 0.16485300306619372, "flos": 24790680357120.0, "grad_norm": 2.761105405473958, "language_loss": 0.79246545, "learning_rate": 3.8123777412870863e-06, "loss": 0.81510675, "num_input_tokens_seen": 28902545, "step": 1371, "time_per_iteration": 4.703889846801758 }, { "auxiliary_loss_clip": 0.01224047, "auxiliary_loss_mlp": 0.0103856, "balance_loss_clip": 1.06070876, "balance_loss_mlp": 1.02821302, "epoch": 0.1649732459568328, "flos": 21106497162240.0, "grad_norm": 2.1998706984891565, "language_loss": 0.78227341, "learning_rate": 3.812048197605643e-06, "loss": 0.80489945, "num_input_tokens_seen": 28921440, "step": 1372, "time_per_iteration": 3.885763645172119 }, { "auxiliary_loss_clip": 0.01221718, "auxiliary_loss_mlp": 0.01030907, "balance_loss_clip": 1.05913711, "balance_loss_mlp": 1.02049971, "epoch": 0.16509348884747188, "flos": 20266726118400.0, "grad_norm": 2.1639891319628477, "language_loss": 0.81420624, "learning_rate": 3.8117183790379277e-06, "loss": 0.83673251, "num_input_tokens_seen": 28939890, "step": 1373, "time_per_iteration": 4.010171175003052 }, { "auxiliary_loss_clip": 0.01226911, "auxiliary_loss_mlp": 0.01030204, "balance_loss_clip": 1.06008172, "balance_loss_mlp": 1.01992834, "epoch": 0.165213731738111, "flos": 11035493602560.0, "grad_norm": 4.03796783014347, "language_loss": 0.94162536, "learning_rate": 3.811388285633976e-06, "loss": 0.96419656, "num_input_tokens_seen": 28955875, "step": 1374, "time_per_iteration": 2.789804220199585 }, { "auxiliary_loss_clip": 0.01215947, "auxiliary_loss_mlp": 0.0104099, "balance_loss_clip": 1.0617379, "balance_loss_mlp": 1.02965319, "epoch": 0.16533397462875007, "flos": 29972051268480.0, "grad_norm": 2.0932871611628934, "language_loss": 0.61874151, "learning_rate": 3.811057917443861e-06, "loss": 0.64131093, "num_input_tokens_seen": 28975140, "step": 1375, "time_per_iteration": 2.8164055347442627 }, { "auxiliary_loss_clip": 0.01136698, "auxiliary_loss_mlp": 0.01003269, "balance_loss_clip": 1.03642678, "balance_loss_mlp": 1.00069451, "epoch": 0.16545421751938916, "flos": 65556763027200.0, "grad_norm": 0.863031950409504, "language_loss": 0.68290377, "learning_rate": 3.8107272745177e-06, "loss": 0.7043035, "num_input_tokens_seen": 29047470, "step": 1376, "time_per_iteration": 3.4688117504119873 }, { "auxiliary_loss_clip": 0.0121888, "auxiliary_loss_mlp": 0.01040265, "balance_loss_clip": 1.06289446, "balance_loss_mlp": 1.03011417, "epoch": 0.16557446041002827, "flos": 22492361652480.0, "grad_norm": 1.9733073489496689, "language_loss": 0.78820801, "learning_rate": 3.8103963569056513e-06, "loss": 0.81079948, "num_input_tokens_seen": 29066605, "step": 1377, "time_per_iteration": 2.737182378768921 }, { "auxiliary_loss_clip": 0.01215049, "auxiliary_loss_mlp": 0.01034675, "balance_loss_clip": 1.06135178, "balance_loss_mlp": 1.02442932, "epoch": 0.16569470330066735, "flos": 24602723464320.0, "grad_norm": 2.235627793901815, "language_loss": 0.88054311, "learning_rate": 3.8100651646579146e-06, "loss": 0.90304035, "num_input_tokens_seen": 29085815, "step": 1378, "time_per_iteration": 2.787557363510132 }, { "auxiliary_loss_clip": 0.01217153, "auxiliary_loss_mlp": 0.01039648, "balance_loss_clip": 1.06145, "balance_loss_mlp": 1.02944922, "epoch": 0.16581494619130643, "flos": 15006207588480.0, "grad_norm": 2.543727071655484, "language_loss": 0.92756641, "learning_rate": 3.8097336978247317e-06, "loss": 0.95013446, "num_input_tokens_seen": 29102520, "step": 1379, "time_per_iteration": 2.6486153602600098 }, { "auxiliary_loss_clip": 0.01211951, "auxiliary_loss_mlp": 0.01034275, "balance_loss_clip": 1.0618124, "balance_loss_mlp": 1.0243504, "epoch": 0.16593518908194552, "flos": 17420338719360.0, "grad_norm": 2.2371911858085265, "language_loss": 0.89061368, "learning_rate": 3.8094019564563854e-06, "loss": 0.91307592, "num_input_tokens_seen": 29119450, "step": 1380, "time_per_iteration": 2.7728347778320312 }, { "auxiliary_loss_clip": 0.01227496, "auxiliary_loss_mlp": 0.01060069, "balance_loss_clip": 1.06014848, "balance_loss_mlp": 1.02419364, "epoch": 0.16605543197258463, "flos": 20412631163520.0, "grad_norm": 2.4968073921313385, "language_loss": 0.75293624, "learning_rate": 3.809069940603201e-06, "loss": 0.77581185, "num_input_tokens_seen": 29137405, "step": 1381, "time_per_iteration": 2.7718183994293213 }, { "auxiliary_loss_clip": 0.01210322, "auxiliary_loss_mlp": 0.01034464, "balance_loss_clip": 1.06141043, "balance_loss_mlp": 1.02338362, "epoch": 0.1661756748632237, "flos": 14209745368320.0, "grad_norm": 2.4004017046401915, "language_loss": 0.77520096, "learning_rate": 3.8087376503155452e-06, "loss": 0.79764879, "num_input_tokens_seen": 29154890, "step": 1382, "time_per_iteration": 2.9296507835388184 }, { "auxiliary_loss_clip": 0.01126984, "auxiliary_loss_mlp": 0.01005732, "balance_loss_clip": 1.03131795, "balance_loss_mlp": 1.00343156, "epoch": 0.1662959177538628, "flos": 66080877350400.0, "grad_norm": 0.9012706102157101, "language_loss": 0.56264293, "learning_rate": 3.808405085643826e-06, "loss": 0.58397007, "num_input_tokens_seen": 29219770, "step": 1383, "time_per_iteration": 3.448625087738037 }, { "auxiliary_loss_clip": 0.01228207, "auxiliary_loss_mlp": 0.01068659, "balance_loss_clip": 1.06148744, "balance_loss_mlp": 1.03349757, "epoch": 0.1664161606445019, "flos": 20740567357440.0, "grad_norm": 6.739026882316982, "language_loss": 0.88656569, "learning_rate": 3.8080722466384925e-06, "loss": 0.90953434, "num_input_tokens_seen": 29237620, "step": 1384, "time_per_iteration": 2.7615652084350586 }, { "auxiliary_loss_clip": 0.01229562, "auxiliary_loss_mlp": 0.01037379, "balance_loss_clip": 1.05888236, "balance_loss_mlp": 1.02635801, "epoch": 0.166536403535141, "flos": 25260930236160.0, "grad_norm": 2.41867840890956, "language_loss": 0.70727682, "learning_rate": 3.8077391333500376e-06, "loss": 0.72994626, "num_input_tokens_seen": 29256760, "step": 1385, "time_per_iteration": 2.8937666416168213 }, { "auxiliary_loss_clip": 0.01222095, "auxiliary_loss_mlp": 0.01032259, "balance_loss_clip": 1.06480277, "balance_loss_mlp": 1.02252007, "epoch": 0.16665664642578007, "flos": 25447450584960.0, "grad_norm": 1.7504114053407673, "language_loss": 0.76581466, "learning_rate": 3.8074057458289934e-06, "loss": 0.78835821, "num_input_tokens_seen": 29277450, "step": 1386, "time_per_iteration": 2.8396880626678467 }, { "auxiliary_loss_clip": 0.01223639, "auxiliary_loss_mlp": 0.01036415, "balance_loss_clip": 1.06140995, "balance_loss_mlp": 1.02576947, "epoch": 0.16677688931641918, "flos": 22200767043840.0, "grad_norm": 2.7861950800184254, "language_loss": 0.82691884, "learning_rate": 3.807072084125934e-06, "loss": 0.84951931, "num_input_tokens_seen": 29299300, "step": 1387, "time_per_iteration": 2.8125596046447754 }, { "auxiliary_loss_clip": 0.01216861, "auxiliary_loss_mlp": 0.01036197, "balance_loss_clip": 1.06153762, "balance_loss_mlp": 1.02619529, "epoch": 0.16689713220705826, "flos": 16945958776320.0, "grad_norm": 3.1440091097500553, "language_loss": 0.81125385, "learning_rate": 3.806738148291477e-06, "loss": 0.8337844, "num_input_tokens_seen": 29316125, "step": 1388, "time_per_iteration": 2.7864625453948975 }, { "auxiliary_loss_clip": 0.01213054, "auxiliary_loss_mlp": 0.01039191, "balance_loss_clip": 1.06370521, "balance_loss_mlp": 1.02879608, "epoch": 0.16701737509769735, "flos": 36244423923840.0, "grad_norm": 2.954710788973764, "language_loss": 0.71079928, "learning_rate": 3.8064039383762793e-06, "loss": 0.73332179, "num_input_tokens_seen": 29338490, "step": 1389, "time_per_iteration": 2.863373279571533 }, { "auxiliary_loss_clip": 0.01222011, "auxiliary_loss_mlp": 0.01038901, "balance_loss_clip": 1.06206036, "balance_loss_mlp": 1.02850616, "epoch": 0.16713761798833643, "flos": 23258659426560.0, "grad_norm": 2.462010388080146, "language_loss": 0.76865286, "learning_rate": 3.8060694544310396e-06, "loss": 0.79126191, "num_input_tokens_seen": 29357000, "step": 1390, "time_per_iteration": 2.7238879203796387 }, { "auxiliary_loss_clip": 0.01231639, "auxiliary_loss_mlp": 0.01043616, "balance_loss_clip": 1.06247902, "balance_loss_mlp": 1.03258896, "epoch": 0.16725786087897554, "flos": 25302515207040.0, "grad_norm": 1.7076330726554818, "language_loss": 0.78450537, "learning_rate": 3.8057346965065006e-06, "loss": 0.80725801, "num_input_tokens_seen": 29378230, "step": 1391, "time_per_iteration": 2.7405455112457275 }, { "auxiliary_loss_clip": 0.01220447, "auxiliary_loss_mlp": 0.01037778, "balance_loss_clip": 1.06253505, "balance_loss_mlp": 1.02812827, "epoch": 0.16737810376961462, "flos": 31831541516160.0, "grad_norm": 2.3068642414337446, "language_loss": 0.84330875, "learning_rate": 3.805399664653443e-06, "loss": 0.86589098, "num_input_tokens_seen": 29400370, "step": 1392, "time_per_iteration": 2.895733594894409 }, { "auxiliary_loss_clip": 0.01230708, "auxiliary_loss_mlp": 0.01036296, "balance_loss_clip": 1.06322908, "balance_loss_mlp": 1.02631783, "epoch": 0.1674983466602537, "flos": 27961843553280.0, "grad_norm": 3.104761126655898, "language_loss": 0.748312, "learning_rate": 3.805064358922692e-06, "loss": 0.77098197, "num_input_tokens_seen": 29418660, "step": 1393, "time_per_iteration": 2.7926065921783447 }, { "auxiliary_loss_clip": 0.01229976, "auxiliary_loss_mlp": 0.01043113, "balance_loss_clip": 1.06321692, "balance_loss_mlp": 1.03227711, "epoch": 0.16761858955089282, "flos": 21762656858880.0, "grad_norm": 2.7035044115363247, "language_loss": 0.81081057, "learning_rate": 3.8047287793651136e-06, "loss": 0.83354151, "num_input_tokens_seen": 29440105, "step": 1394, "time_per_iteration": 2.8169898986816406 }, { "auxiliary_loss_clip": 0.01220842, "auxiliary_loss_mlp": 0.0103297, "balance_loss_clip": 1.0618825, "balance_loss_mlp": 1.022331, "epoch": 0.1677388324415319, "flos": 23805507058560.0, "grad_norm": 1.7874670784903859, "language_loss": 0.88697249, "learning_rate": 3.8043929260316137e-06, "loss": 0.90951061, "num_input_tokens_seen": 29458260, "step": 1395, "time_per_iteration": 2.8692164421081543 }, { "auxiliary_loss_clip": 0.01224077, "auxiliary_loss_mlp": 0.01041838, "balance_loss_clip": 1.06754732, "balance_loss_mlp": 1.0307281, "epoch": 0.16785907533217098, "flos": 20558859431040.0, "grad_norm": 8.4074553068032, "language_loss": 0.83523321, "learning_rate": 3.8040567989731417e-06, "loss": 0.85789233, "num_input_tokens_seen": 29476205, "step": 1396, "time_per_iteration": 3.7341115474700928 }, { "auxiliary_loss_clip": 0.01218559, "auxiliary_loss_mlp": 0.01030905, "balance_loss_clip": 1.062397, "balance_loss_mlp": 1.0212791, "epoch": 0.16797931822281006, "flos": 15669657745920.0, "grad_norm": 5.59593032484463, "language_loss": 0.80015874, "learning_rate": 3.8037203982406876e-06, "loss": 0.82265329, "num_input_tokens_seen": 29494370, "step": 1397, "time_per_iteration": 2.831805467605591 }, { "auxiliary_loss_clip": 0.01228754, "auxiliary_loss_mlp": 0.01037533, "balance_loss_clip": 1.06195092, "balance_loss_mlp": 1.02665567, "epoch": 0.16809956111344918, "flos": 16541101607040.0, "grad_norm": 2.0380388195339445, "language_loss": 0.72994173, "learning_rate": 3.8033837238852835e-06, "loss": 0.7526046, "num_input_tokens_seen": 29511070, "step": 1398, "time_per_iteration": 4.695532321929932 }, { "auxiliary_loss_clip": 0.01211535, "auxiliary_loss_mlp": 0.01028817, "balance_loss_clip": 1.06207144, "balance_loss_mlp": 1.01910186, "epoch": 0.16821980400408826, "flos": 23258084808960.0, "grad_norm": 1.7188660208605084, "language_loss": 0.69792658, "learning_rate": 3.8030467759580017e-06, "loss": 0.72033012, "num_input_tokens_seen": 29531990, "step": 1399, "time_per_iteration": 4.109954833984375 }, { "auxiliary_loss_clip": 0.0122591, "auxiliary_loss_mlp": 0.01037542, "balance_loss_clip": 1.06174016, "balance_loss_mlp": 1.02656913, "epoch": 0.16834004689472734, "flos": 20774754126720.0, "grad_norm": 2.764932477806941, "language_loss": 0.86901808, "learning_rate": 3.802709554509958e-06, "loss": 0.89165264, "num_input_tokens_seen": 29549790, "step": 1400, "time_per_iteration": 2.8088958263397217 }, { "auxiliary_loss_clip": 0.01222434, "auxiliary_loss_mlp": 0.01035568, "balance_loss_clip": 1.06340957, "balance_loss_mlp": 1.02564394, "epoch": 0.16846028978536645, "flos": 26687302289280.0, "grad_norm": 2.3437471216008126, "language_loss": 0.79157245, "learning_rate": 3.8023720595923083e-06, "loss": 0.81415248, "num_input_tokens_seen": 29569045, "step": 1401, "time_per_iteration": 2.843254327774048 }, { "auxiliary_loss_clip": 0.01212322, "auxiliary_loss_mlp": 0.01040182, "balance_loss_clip": 1.06134534, "balance_loss_mlp": 1.03014505, "epoch": 0.16858053267600553, "flos": 18843298980480.0, "grad_norm": 3.634625506705171, "language_loss": 0.87148744, "learning_rate": 3.80203429125625e-06, "loss": 0.89401245, "num_input_tokens_seen": 29587220, "step": 1402, "time_per_iteration": 2.8445558547973633 }, { "auxiliary_loss_clip": 0.0119978, "auxiliary_loss_mlp": 0.01037798, "balance_loss_clip": 1.06212831, "balance_loss_mlp": 1.02768874, "epoch": 0.16870077556664462, "flos": 27744548227200.0, "grad_norm": 1.9346617487809705, "language_loss": 0.70227218, "learning_rate": 3.8016962495530225e-06, "loss": 0.72464794, "num_input_tokens_seen": 29606410, "step": 1403, "time_per_iteration": 2.9410903453826904 }, { "auxiliary_loss_clip": 0.01229842, "auxiliary_loss_mlp": 0.01040148, "balance_loss_clip": 1.06186843, "balance_loss_mlp": 1.03005648, "epoch": 0.1688210184572837, "flos": 13730768484480.0, "grad_norm": 2.586539233105944, "language_loss": 0.77169514, "learning_rate": 3.8013579345339063e-06, "loss": 0.79439503, "num_input_tokens_seen": 29621275, "step": 1404, "time_per_iteration": 2.716564416885376 }, { "auxiliary_loss_clip": 0.01221086, "auxiliary_loss_mlp": 0.01042314, "balance_loss_clip": 1.06306458, "balance_loss_mlp": 1.03115559, "epoch": 0.1689412613479228, "flos": 26468785900800.0, "grad_norm": 1.8823686827048725, "language_loss": 0.6939317, "learning_rate": 3.801019346250224e-06, "loss": 0.71656573, "num_input_tokens_seen": 29641420, "step": 1405, "time_per_iteration": 2.839477300643921 }, { "auxiliary_loss_clip": 0.0122417, "auxiliary_loss_mlp": 0.01034793, "balance_loss_clip": 1.06200874, "balance_loss_mlp": 1.02473772, "epoch": 0.1690615042385619, "flos": 21138852337920.0, "grad_norm": 2.660041427766537, "language_loss": 0.83469832, "learning_rate": 3.8006804847533395e-06, "loss": 0.857288, "num_input_tokens_seen": 29660935, "step": 1406, "time_per_iteration": 2.75211763381958 }, { "auxiliary_loss_clip": 0.01227561, "auxiliary_loss_mlp": 0.01033853, "balance_loss_clip": 1.06186855, "balance_loss_mlp": 1.02363038, "epoch": 0.16918174712920098, "flos": 20849340718080.0, "grad_norm": 1.9237575642632707, "language_loss": 0.85128158, "learning_rate": 3.8003413500946556e-06, "loss": 0.8738957, "num_input_tokens_seen": 29681045, "step": 1407, "time_per_iteration": 2.7448556423187256 }, { "auxiliary_loss_clip": 0.01223162, "auxiliary_loss_mlp": 0.01038518, "balance_loss_clip": 1.06301057, "balance_loss_mlp": 1.02811146, "epoch": 0.1693019900198401, "flos": 16983270028800.0, "grad_norm": 4.686863137653643, "language_loss": 0.82789564, "learning_rate": 3.8000019423256216e-06, "loss": 0.85051239, "num_input_tokens_seen": 29698810, "step": 1408, "time_per_iteration": 2.8281240463256836 }, { "auxiliary_loss_clip": 0.01211759, "auxiliary_loss_mlp": 0.01033351, "balance_loss_clip": 1.06367922, "balance_loss_mlp": 1.02299726, "epoch": 0.16942223291047917, "flos": 26796901662720.0, "grad_norm": 1.9241195984670723, "language_loss": 0.87948561, "learning_rate": 3.7996622614977234e-06, "loss": 0.90193665, "num_input_tokens_seen": 29720000, "step": 1409, "time_per_iteration": 2.7994415760040283 }, { "auxiliary_loss_clip": 0.01220383, "auxiliary_loss_mlp": 0.01034102, "balance_loss_clip": 1.06436133, "balance_loss_mlp": 1.02358234, "epoch": 0.16954247580111825, "flos": 18583700411520.0, "grad_norm": 2.2518249433676667, "language_loss": 0.78812444, "learning_rate": 3.799322307662492e-06, "loss": 0.81066924, "num_input_tokens_seen": 29737820, "step": 1410, "time_per_iteration": 2.765540599822998 }, { "auxiliary_loss_clip": 0.01217088, "auxiliary_loss_mlp": 0.01034538, "balance_loss_clip": 1.06294978, "balance_loss_mlp": 1.02389801, "epoch": 0.16966271869175734, "flos": 13983651210240.0, "grad_norm": 2.988031791831924, "language_loss": 0.83685899, "learning_rate": 3.798982080871496e-06, "loss": 0.85937524, "num_input_tokens_seen": 29752960, "step": 1411, "time_per_iteration": 2.7567155361175537 }, { "auxiliary_loss_clip": 0.01228951, "auxiliary_loss_mlp": 0.01038578, "balance_loss_clip": 1.05989265, "balance_loss_mlp": 1.02821898, "epoch": 0.16978296158239645, "flos": 37487328284160.0, "grad_norm": 2.256936558709874, "language_loss": 0.67711258, "learning_rate": 3.798641581176349e-06, "loss": 0.69978786, "num_input_tokens_seen": 29775240, "step": 1412, "time_per_iteration": 2.8664042949676514 }, { "auxiliary_loss_clip": 0.01223106, "auxiliary_loss_mlp": 0.01038577, "balance_loss_clip": 1.06211734, "balance_loss_mlp": 1.02792609, "epoch": 0.16990320447303553, "flos": 28328958506880.0, "grad_norm": 1.8848157768208946, "language_loss": 0.74614823, "learning_rate": 3.7983008086287044e-06, "loss": 0.76876509, "num_input_tokens_seen": 29796560, "step": 1413, "time_per_iteration": 2.940293550491333 }, { "auxiliary_loss_clip": 0.01224943, "auxiliary_loss_mlp": 0.01033393, "balance_loss_clip": 1.06616664, "balance_loss_mlp": 1.02259254, "epoch": 0.1700234473636746, "flos": 20188189031040.0, "grad_norm": 2.192820191787642, "language_loss": 0.79358166, "learning_rate": 3.797959763280257e-06, "loss": 0.81616497, "num_input_tokens_seen": 29815245, "step": 1414, "time_per_iteration": 2.836406707763672 }, { "auxiliary_loss_clip": 0.01228625, "auxiliary_loss_mlp": 0.01044327, "balance_loss_clip": 1.06287253, "balance_loss_mlp": 1.03348529, "epoch": 0.17014369025431372, "flos": 24858658846080.0, "grad_norm": 2.2957536903778046, "language_loss": 0.78895247, "learning_rate": 3.797618445182743e-06, "loss": 0.81168199, "num_input_tokens_seen": 29836640, "step": 1415, "time_per_iteration": 2.9101405143737793 }, { "auxiliary_loss_clip": 0.01212006, "auxiliary_loss_mlp": 0.01034948, "balance_loss_clip": 1.06326485, "balance_loss_mlp": 1.02406394, "epoch": 0.1702639331449528, "flos": 16467233287680.0, "grad_norm": 2.096742241936264, "language_loss": 0.8517983, "learning_rate": 3.79727685438794e-06, "loss": 0.87426782, "num_input_tokens_seen": 29850830, "step": 1416, "time_per_iteration": 2.741779327392578 }, { "auxiliary_loss_clip": 0.01127315, "auxiliary_loss_mlp": 0.01000927, "balance_loss_clip": 1.02823389, "balance_loss_mlp": 0.99859065, "epoch": 0.1703841760355919, "flos": 52508870979840.0, "grad_norm": 0.8369328241641939, "language_loss": 0.61617839, "learning_rate": 3.796934990947667e-06, "loss": 0.63746083, "num_input_tokens_seen": 29912515, "step": 1417, "time_per_iteration": 3.3064658641815186 }, { "auxiliary_loss_clip": 0.01126718, "auxiliary_loss_mlp": 0.01003893, "balance_loss_clip": 1.02750301, "balance_loss_mlp": 1.00129414, "epoch": 0.170504418926231, "flos": 49370637576960.0, "grad_norm": 0.8716084744695076, "language_loss": 0.62394893, "learning_rate": 3.7965928549137854e-06, "loss": 0.64525503, "num_input_tokens_seen": 29969330, "step": 1418, "time_per_iteration": 3.2987003326416016 }, { "auxiliary_loss_clip": 0.01224858, "auxiliary_loss_mlp": 0.01034493, "balance_loss_clip": 1.06104565, "balance_loss_mlp": 1.0233587, "epoch": 0.17062466181687008, "flos": 25849219184640.0, "grad_norm": 2.8152703617700383, "language_loss": 0.77643144, "learning_rate": 3.7962504463381953e-06, "loss": 0.79902494, "num_input_tokens_seen": 29990820, "step": 1419, "time_per_iteration": 2.7774245738983154 }, { "auxiliary_loss_clip": 0.01216633, "auxiliary_loss_mlp": 0.01069044, "balance_loss_clip": 1.0656451, "balance_loss_mlp": 1.03287172, "epoch": 0.17074490470750917, "flos": 20960412549120.0, "grad_norm": 1.7603970073747897, "language_loss": 0.78501225, "learning_rate": 3.7959077652728412e-06, "loss": 0.80786896, "num_input_tokens_seen": 30009275, "step": 1420, "time_per_iteration": 2.7224483489990234 }, { "auxiliary_loss_clip": 0.0122252, "auxiliary_loss_mlp": 0.01039343, "balance_loss_clip": 1.06082761, "balance_loss_mlp": 1.02730334, "epoch": 0.17086514759814825, "flos": 20959766104320.0, "grad_norm": 1.8736681880924926, "language_loss": 0.77174652, "learning_rate": 3.795564811769707e-06, "loss": 0.79436517, "num_input_tokens_seen": 30027630, "step": 1421, "time_per_iteration": 2.651724338531494 }, { "auxiliary_loss_clip": 0.01223743, "auxiliary_loss_mlp": 0.01040412, "balance_loss_clip": 1.06570292, "balance_loss_mlp": 1.0297904, "epoch": 0.17098539048878736, "flos": 28474073452800.0, "grad_norm": 2.0915182455778765, "language_loss": 0.78152645, "learning_rate": 3.795221585880818e-06, "loss": 0.80416799, "num_input_tokens_seen": 30048310, "step": 1422, "time_per_iteration": 2.8651955127716064 }, { "auxiliary_loss_clip": 0.01221958, "auxiliary_loss_mlp": 0.01045668, "balance_loss_clip": 1.06731832, "balance_loss_mlp": 1.03546953, "epoch": 0.17110563337942644, "flos": 16290014561280.0, "grad_norm": 2.126207400870213, "language_loss": 0.91317999, "learning_rate": 3.794878087658242e-06, "loss": 0.93585628, "num_input_tokens_seen": 30066080, "step": 1423, "time_per_iteration": 3.658323049545288 }, { "auxiliary_loss_clip": 0.01229589, "auxiliary_loss_mlp": 0.01037338, "balance_loss_clip": 1.06346166, "balance_loss_mlp": 1.02638865, "epoch": 0.17122587627006552, "flos": 29674207693440.0, "grad_norm": 3.802751335030676, "language_loss": 0.78792739, "learning_rate": 3.7945343171540873e-06, "loss": 0.8105967, "num_input_tokens_seen": 30086955, "step": 1424, "time_per_iteration": 4.844858169555664 }, { "auxiliary_loss_clip": 0.01232604, "auxiliary_loss_mlp": 0.01035643, "balance_loss_clip": 1.0634582, "balance_loss_mlp": 1.02518809, "epoch": 0.17134611916070464, "flos": 25338389915520.0, "grad_norm": 2.0471403988198076, "language_loss": 0.792974, "learning_rate": 3.7941902744205033e-06, "loss": 0.81565642, "num_input_tokens_seen": 30107990, "step": 1425, "time_per_iteration": 3.686161756515503 }, { "auxiliary_loss_clip": 0.01228517, "auxiliary_loss_mlp": 0.01036593, "balance_loss_clip": 1.06293559, "balance_loss_mlp": 1.02518439, "epoch": 0.17146636205134372, "flos": 13953845900160.0, "grad_norm": 2.5608962014449053, "language_loss": 0.83550739, "learning_rate": 3.7938459595096817e-06, "loss": 0.85815847, "num_input_tokens_seen": 30126535, "step": 1426, "time_per_iteration": 2.7846179008483887 }, { "auxiliary_loss_clip": 0.01235011, "auxiliary_loss_mlp": 0.01045045, "balance_loss_clip": 1.06448805, "balance_loss_mlp": 1.03332114, "epoch": 0.1715866049419828, "flos": 23915214172800.0, "grad_norm": 1.7476531941334772, "language_loss": 0.8625834, "learning_rate": 3.7935013724738545e-06, "loss": 0.8853839, "num_input_tokens_seen": 30147035, "step": 1427, "time_per_iteration": 2.7297909259796143 }, { "auxiliary_loss_clip": 0.01221519, "auxiliary_loss_mlp": 0.01034487, "balance_loss_clip": 1.06184983, "balance_loss_mlp": 1.02394927, "epoch": 0.17170684783262188, "flos": 22709369669760.0, "grad_norm": 1.9075232560294377, "language_loss": 0.77859843, "learning_rate": 3.7931565133652945e-06, "loss": 0.80115849, "num_input_tokens_seen": 30167110, "step": 1428, "time_per_iteration": 2.7309300899505615 }, { "auxiliary_loss_clip": 0.01230357, "auxiliary_loss_mlp": 0.01038342, "balance_loss_clip": 1.06253886, "balance_loss_mlp": 1.02718377, "epoch": 0.171827090723261, "flos": 26613290315520.0, "grad_norm": 2.358711171841132, "language_loss": 0.67957526, "learning_rate": 3.792811382236317e-06, "loss": 0.70226222, "num_input_tokens_seen": 30185620, "step": 1429, "time_per_iteration": 2.5638439655303955 }, { "auxiliary_loss_clip": 0.01231588, "auxiliary_loss_mlp": 0.01043096, "balance_loss_clip": 1.06323218, "balance_loss_mlp": 1.03260589, "epoch": 0.17194733361390008, "flos": 28148507556480.0, "grad_norm": 2.517325538019033, "language_loss": 0.78222275, "learning_rate": 3.792465979139279e-06, "loss": 0.80496961, "num_input_tokens_seen": 30208225, "step": 1430, "time_per_iteration": 2.6657817363739014 }, { "auxiliary_loss_clip": 0.011162, "auxiliary_loss_mlp": 0.01013057, "balance_loss_clip": 1.02724481, "balance_loss_mlp": 1.01070869, "epoch": 0.17206757650453916, "flos": 65530689753600.0, "grad_norm": 1.0591395560971602, "language_loss": 0.65652651, "learning_rate": 3.792120304126576e-06, "loss": 0.67781907, "num_input_tokens_seen": 30271600, "step": 1431, "time_per_iteration": 3.242316722869873 }, { "auxiliary_loss_clip": 0.01208579, "auxiliary_loss_mlp": 0.01029529, "balance_loss_clip": 1.0604949, "balance_loss_mlp": 1.01939654, "epoch": 0.17218781939517827, "flos": 22273486128000.0, "grad_norm": 1.8590502351370726, "language_loss": 0.83877558, "learning_rate": 3.791774357250649e-06, "loss": 0.86115664, "num_input_tokens_seen": 30290430, "step": 1432, "time_per_iteration": 2.6880970001220703 }, { "auxiliary_loss_clip": 0.01220882, "auxiliary_loss_mlp": 0.01036159, "balance_loss_clip": 1.06279778, "balance_loss_mlp": 1.02470851, "epoch": 0.17230806228581735, "flos": 14137313592960.0, "grad_norm": 2.9452730079180163, "language_loss": 0.7907843, "learning_rate": 3.7914281385639757e-06, "loss": 0.81335473, "num_input_tokens_seen": 30308305, "step": 1433, "time_per_iteration": 2.611133575439453 }, { "auxiliary_loss_clip": 0.01226607, "auxiliary_loss_mlp": 0.01038801, "balance_loss_clip": 1.0608629, "balance_loss_mlp": 1.02859092, "epoch": 0.17242830517645644, "flos": 20704836303360.0, "grad_norm": 1.8538230014795318, "language_loss": 0.79333174, "learning_rate": 3.7910816481190784e-06, "loss": 0.8159858, "num_input_tokens_seen": 30328120, "step": 1434, "time_per_iteration": 2.5718510150909424 }, { "auxiliary_loss_clip": 0.0121254, "auxiliary_loss_mlp": 0.01034397, "balance_loss_clip": 1.06069303, "balance_loss_mlp": 1.02424622, "epoch": 0.17254854806709552, "flos": 30774582887040.0, "grad_norm": 2.409054735572215, "language_loss": 0.75009477, "learning_rate": 3.7907348859685193e-06, "loss": 0.77256411, "num_input_tokens_seen": 30349825, "step": 1435, "time_per_iteration": 2.8038690090179443 }, { "auxiliary_loss_clip": 0.01219706, "auxiliary_loss_mlp": 0.01033715, "balance_loss_clip": 1.06168389, "balance_loss_mlp": 1.02325428, "epoch": 0.17266879095773463, "flos": 26614726859520.0, "grad_norm": 2.7232720953470424, "language_loss": 0.8083421, "learning_rate": 3.790387852164902e-06, "loss": 0.83087635, "num_input_tokens_seen": 30370555, "step": 1436, "time_per_iteration": 2.9197094440460205 }, { "auxiliary_loss_clip": 0.01227298, "auxiliary_loss_mlp": 0.01033215, "balance_loss_clip": 1.06075609, "balance_loss_mlp": 1.02289724, "epoch": 0.1727890338483737, "flos": 20266295155200.0, "grad_norm": 2.1303190269566685, "language_loss": 0.76715708, "learning_rate": 3.7900405467608707e-06, "loss": 0.7897622, "num_input_tokens_seen": 30390100, "step": 1437, "time_per_iteration": 2.902184247970581 }, { "auxiliary_loss_clip": 0.01205284, "auxiliary_loss_mlp": 0.01042607, "balance_loss_clip": 1.06247926, "balance_loss_mlp": 1.03187251, "epoch": 0.1729092767390128, "flos": 18179812909440.0, "grad_norm": 3.1435672987092587, "language_loss": 0.79629755, "learning_rate": 3.7896929698091114e-06, "loss": 0.81877649, "num_input_tokens_seen": 30402915, "step": 1438, "time_per_iteration": 2.968712568283081 }, { "auxiliary_loss_clip": 0.01237047, "auxiliary_loss_mlp": 0.01039216, "balance_loss_clip": 1.06840992, "balance_loss_mlp": 1.02833855, "epoch": 0.1730295196296519, "flos": 26759518583040.0, "grad_norm": 2.878999776456814, "language_loss": 0.68324852, "learning_rate": 3.7893451213623518e-06, "loss": 0.70601118, "num_input_tokens_seen": 30420145, "step": 1439, "time_per_iteration": 2.907750368118286 }, { "auxiliary_loss_clip": 0.01225467, "auxiliary_loss_mlp": 0.01062416, "balance_loss_clip": 1.06454265, "balance_loss_mlp": 1.02576363, "epoch": 0.173149762520291, "flos": 23842531002240.0, "grad_norm": 3.387769561143603, "language_loss": 0.82177508, "learning_rate": 3.7889970014733606e-06, "loss": 0.84465384, "num_input_tokens_seen": 30439250, "step": 1440, "time_per_iteration": 2.8703548908233643 }, { "auxiliary_loss_clip": 0.01203217, "auxiliary_loss_mlp": 0.01035096, "balance_loss_clip": 1.06424868, "balance_loss_mlp": 1.02369952, "epoch": 0.17327000541093007, "flos": 23368186972800.0, "grad_norm": 1.702181045031367, "language_loss": 0.77963698, "learning_rate": 3.7886486101949463e-06, "loss": 0.80202007, "num_input_tokens_seen": 30460430, "step": 1441, "time_per_iteration": 2.8585360050201416 }, { "auxiliary_loss_clip": 0.01207111, "auxiliary_loss_mlp": 0.0104579, "balance_loss_clip": 1.06570649, "balance_loss_mlp": 1.03453088, "epoch": 0.17339024830156918, "flos": 18221290139520.0, "grad_norm": 2.023247268101338, "language_loss": 0.87957895, "learning_rate": 3.7882999475799594e-06, "loss": 0.90210795, "num_input_tokens_seen": 30478465, "step": 1442, "time_per_iteration": 2.6931793689727783 }, { "auxiliary_loss_clip": 0.01198696, "auxiliary_loss_mlp": 0.01038924, "balance_loss_clip": 1.06607103, "balance_loss_mlp": 1.02730095, "epoch": 0.17351049119220827, "flos": 23332024955520.0, "grad_norm": 1.714050036840366, "language_loss": 0.8169837, "learning_rate": 3.787951013681293e-06, "loss": 0.83935988, "num_input_tokens_seen": 30496510, "step": 1443, "time_per_iteration": 2.805694103240967 }, { "auxiliary_loss_clip": 0.01228174, "auxiliary_loss_mlp": 0.01039804, "balance_loss_clip": 1.06435657, "balance_loss_mlp": 1.02907515, "epoch": 0.17363073408284735, "flos": 23803495896960.0, "grad_norm": 2.298913365009891, "language_loss": 0.77280247, "learning_rate": 3.787601808551879e-06, "loss": 0.79548222, "num_input_tokens_seen": 30516325, "step": 1444, "time_per_iteration": 2.7740399837493896 }, { "auxiliary_loss_clip": 0.01220207, "auxiliary_loss_mlp": 0.0103946, "balance_loss_clip": 1.06400013, "balance_loss_mlp": 1.028391, "epoch": 0.17375097697348643, "flos": 18515290959360.0, "grad_norm": 2.4808312777855583, "language_loss": 0.84183431, "learning_rate": 3.7872523322446926e-06, "loss": 0.86443096, "num_input_tokens_seen": 30535210, "step": 1445, "time_per_iteration": 2.737060546875 }, { "auxiliary_loss_clip": 0.01218678, "auxiliary_loss_mlp": 0.01034572, "balance_loss_clip": 1.06126451, "balance_loss_mlp": 1.0239861, "epoch": 0.17387121986412554, "flos": 38877897456000.0, "grad_norm": 1.7298152340177801, "language_loss": 0.60476083, "learning_rate": 3.7869025848127478e-06, "loss": 0.62729329, "num_input_tokens_seen": 30559405, "step": 1446, "time_per_iteration": 3.0490713119506836 }, { "auxiliary_loss_clip": 0.01228855, "auxiliary_loss_mlp": 0.01034008, "balance_loss_clip": 1.0632571, "balance_loss_mlp": 1.02274239, "epoch": 0.17399146275476463, "flos": 20375714960640.0, "grad_norm": 4.858704303122008, "language_loss": 0.80485964, "learning_rate": 3.786552566309102e-06, "loss": 0.8274883, "num_input_tokens_seen": 30577615, "step": 1447, "time_per_iteration": 2.7661426067352295 }, { "auxiliary_loss_clip": 0.01221641, "auxiliary_loss_mlp": 0.01057987, "balance_loss_clip": 1.06566787, "balance_loss_mlp": 1.02226484, "epoch": 0.1741117056454037, "flos": 19164339763200.0, "grad_norm": 2.7831620718968533, "language_loss": 0.86182916, "learning_rate": 3.7862022767868517e-06, "loss": 0.88462543, "num_input_tokens_seen": 30595205, "step": 1448, "time_per_iteration": 2.689361333847046 }, { "auxiliary_loss_clip": 0.01214244, "auxiliary_loss_mlp": 0.01039879, "balance_loss_clip": 1.06514835, "balance_loss_mlp": 1.03000879, "epoch": 0.17423194853604282, "flos": 25374300537600.0, "grad_norm": 2.311448853517574, "language_loss": 0.84352183, "learning_rate": 3.7858517162991367e-06, "loss": 0.86606306, "num_input_tokens_seen": 30615280, "step": 1449, "time_per_iteration": 3.7030234336853027 }, { "auxiliary_loss_clip": 0.01220662, "auxiliary_loss_mlp": 0.0103755, "balance_loss_clip": 1.06433833, "balance_loss_mlp": 1.0268867, "epoch": 0.1743521914266819, "flos": 25191874339200.0, "grad_norm": 2.509028902005309, "language_loss": 0.61568439, "learning_rate": 3.7855008848991363e-06, "loss": 0.63826656, "num_input_tokens_seen": 30633485, "step": 1450, "time_per_iteration": 4.771718502044678 }, { "auxiliary_loss_clip": 0.01217929, "auxiliary_loss_mlp": 0.01034523, "balance_loss_clip": 1.06376565, "balance_loss_mlp": 1.02492082, "epoch": 0.17447243431732098, "flos": 25666577504640.0, "grad_norm": 2.188915729632192, "language_loss": 0.77604645, "learning_rate": 3.7851497826400714e-06, "loss": 0.79857099, "num_input_tokens_seen": 30653625, "step": 1451, "time_per_iteration": 3.881272315979004 }, { "auxiliary_loss_clip": 0.01230982, "auxiliary_loss_mlp": 0.01036954, "balance_loss_clip": 1.06497586, "balance_loss_mlp": 1.02626061, "epoch": 0.17459267720796007, "flos": 36281950657920.0, "grad_norm": 1.8976008898382861, "language_loss": 0.76057625, "learning_rate": 3.7847984095752034e-06, "loss": 0.78325558, "num_input_tokens_seen": 30677080, "step": 1452, "time_per_iteration": 2.8309106826782227 }, { "auxiliary_loss_clip": 0.01227223, "auxiliary_loss_mlp": 0.01032149, "balance_loss_clip": 1.06110191, "balance_loss_mlp": 1.02096736, "epoch": 0.17471292009859918, "flos": 20011113959040.0, "grad_norm": 1.8720073200919414, "language_loss": 0.80377388, "learning_rate": 3.784446765757836e-06, "loss": 0.82636762, "num_input_tokens_seen": 30695725, "step": 1453, "time_per_iteration": 2.7521049976348877 }, { "auxiliary_loss_clip": 0.01199767, "auxiliary_loss_mlp": 0.0104338, "balance_loss_clip": 1.05883288, "balance_loss_mlp": 1.03261578, "epoch": 0.17483316298923826, "flos": 27819242559360.0, "grad_norm": 2.245060130922031, "language_loss": 0.78040987, "learning_rate": 3.7840948512413133e-06, "loss": 0.80284137, "num_input_tokens_seen": 30713310, "step": 1454, "time_per_iteration": 2.8863439559936523 }, { "auxiliary_loss_clip": 0.01209976, "auxiliary_loss_mlp": 0.01037157, "balance_loss_clip": 1.06122875, "balance_loss_mlp": 1.02539122, "epoch": 0.17495340587987734, "flos": 44017934791680.0, "grad_norm": 1.8951833076189986, "language_loss": 0.78936547, "learning_rate": 3.7837426660790196e-06, "loss": 0.81183684, "num_input_tokens_seen": 30734725, "step": 1455, "time_per_iteration": 3.0299932956695557 }, { "auxiliary_loss_clip": 0.01227524, "auxiliary_loss_mlp": 0.01044464, "balance_loss_clip": 1.06199193, "balance_loss_mlp": 1.03361654, "epoch": 0.17507364877051645, "flos": 20885825957760.0, "grad_norm": 2.025279525286116, "language_loss": 0.819309, "learning_rate": 3.783390210324382e-06, "loss": 0.84202886, "num_input_tokens_seen": 30754450, "step": 1456, "time_per_iteration": 2.8951330184936523 }, { "auxiliary_loss_clip": 0.01215895, "auxiliary_loss_mlp": 0.01034754, "balance_loss_clip": 1.06376696, "balance_loss_mlp": 1.02380419, "epoch": 0.17519389166115554, "flos": 24717602136960.0, "grad_norm": 1.9886997277435094, "language_loss": 0.72505915, "learning_rate": 3.7830374840308676e-06, "loss": 0.74756569, "num_input_tokens_seen": 30774605, "step": 1457, "time_per_iteration": 2.919145107269287 }, { "auxiliary_loss_clip": 0.01227329, "auxiliary_loss_mlp": 0.01044019, "balance_loss_clip": 1.06407833, "balance_loss_mlp": 1.03304017, "epoch": 0.17531413455179462, "flos": 23798144770560.0, "grad_norm": 2.480243056684342, "language_loss": 0.82024848, "learning_rate": 3.7826844872519842e-06, "loss": 0.84296191, "num_input_tokens_seen": 30792460, "step": 1458, "time_per_iteration": 2.8994996547698975 }, { "auxiliary_loss_clip": 0.01220458, "auxiliary_loss_mlp": 0.0103378, "balance_loss_clip": 1.06548727, "balance_loss_mlp": 1.02416563, "epoch": 0.1754343774424337, "flos": 24572379450240.0, "grad_norm": 2.0512851442350057, "language_loss": 0.71969521, "learning_rate": 3.782331220041282e-06, "loss": 0.74223757, "num_input_tokens_seen": 30812525, "step": 1459, "time_per_iteration": 2.8228187561035156 }, { "auxiliary_loss_clip": 0.01225258, "auxiliary_loss_mlp": 0.01034752, "balance_loss_clip": 1.0643599, "balance_loss_mlp": 1.02401745, "epoch": 0.17555462033307281, "flos": 18114599767680.0, "grad_norm": 1.9650112611316022, "language_loss": 0.82782012, "learning_rate": 3.7819776824523504e-06, "loss": 0.85042024, "num_input_tokens_seen": 30830390, "step": 1460, "time_per_iteration": 2.8882789611816406 }, { "auxiliary_loss_clip": 0.01230602, "auxiliary_loss_mlp": 0.01038884, "balance_loss_clip": 1.06349277, "balance_loss_mlp": 1.02850044, "epoch": 0.1756748632237119, "flos": 28366018364160.0, "grad_norm": 1.8895688721827288, "language_loss": 0.83516288, "learning_rate": 3.7816238745388213e-06, "loss": 0.8578577, "num_input_tokens_seen": 30849935, "step": 1461, "time_per_iteration": 2.836350440979004 }, { "auxiliary_loss_clip": 0.01224935, "auxiliary_loss_mlp": 0.01039631, "balance_loss_clip": 1.06098664, "balance_loss_mlp": 1.02959371, "epoch": 0.17579510611435098, "flos": 25732939881600.0, "grad_norm": 3.926024778629578, "language_loss": 0.86964589, "learning_rate": 3.781269796354367e-06, "loss": 0.89229155, "num_input_tokens_seen": 30869555, "step": 1462, "time_per_iteration": 2.9401743412017822 }, { "auxiliary_loss_clip": 0.01223536, "auxiliary_loss_mlp": 0.01046557, "balance_loss_clip": 1.06330419, "balance_loss_mlp": 1.03535151, "epoch": 0.1759153490049901, "flos": 18588081870720.0, "grad_norm": 1.7877022492793988, "language_loss": 0.86068535, "learning_rate": 3.7809154479527006e-06, "loss": 0.88338625, "num_input_tokens_seen": 30888760, "step": 1463, "time_per_iteration": 2.742443323135376 }, { "auxiliary_loss_clip": 0.0121266, "auxiliary_loss_mlp": 0.01040935, "balance_loss_clip": 1.06517375, "balance_loss_mlp": 1.03058159, "epoch": 0.17603559189562917, "flos": 18619323724800.0, "grad_norm": 3.7741978855286855, "language_loss": 0.84590602, "learning_rate": 3.780560829387577e-06, "loss": 0.86844194, "num_input_tokens_seen": 30907260, "step": 1464, "time_per_iteration": 2.8042519092559814 }, { "auxiliary_loss_clip": 0.0112204, "auxiliary_loss_mlp": 0.01006914, "balance_loss_clip": 1.02576327, "balance_loss_mlp": 1.00451803, "epoch": 0.17615583478626826, "flos": 60530775373440.0, "grad_norm": 0.8578761286524681, "language_loss": 0.57877874, "learning_rate": 3.7802059407127915e-06, "loss": 0.60006833, "num_input_tokens_seen": 30965810, "step": 1465, "time_per_iteration": 3.2621145248413086 }, { "auxiliary_loss_clip": 0.01215174, "auxiliary_loss_mlp": 0.01039805, "balance_loss_clip": 1.06242347, "balance_loss_mlp": 1.02964246, "epoch": 0.17627607767690734, "flos": 23616221362560.0, "grad_norm": 2.2255758175793225, "language_loss": 0.8594476, "learning_rate": 3.7798507819821797e-06, "loss": 0.88199741, "num_input_tokens_seen": 30982935, "step": 1466, "time_per_iteration": 2.7574920654296875 }, { "auxiliary_loss_clip": 0.01209946, "auxiliary_loss_mlp": 0.01043264, "balance_loss_clip": 1.06511116, "balance_loss_mlp": 1.03251147, "epoch": 0.17639632056754645, "flos": 17639070589440.0, "grad_norm": 2.2340208947795515, "language_loss": 0.78689349, "learning_rate": 3.7794953532496197e-06, "loss": 0.80942559, "num_input_tokens_seen": 30998840, "step": 1467, "time_per_iteration": 2.891672372817993 }, { "auxiliary_loss_clip": 0.01110493, "auxiliary_loss_mlp": 0.01030914, "balance_loss_clip": 1.03175688, "balance_loss_mlp": 1.00530005, "epoch": 0.17651656345818553, "flos": 57932604910080.0, "grad_norm": 0.864154081863958, "language_loss": 0.5792405, "learning_rate": 3.7791396545690295e-06, "loss": 0.6006546, "num_input_tokens_seen": 31060075, "step": 1468, "time_per_iteration": 3.310800313949585 }, { "auxiliary_loss_clip": 0.0122706, "auxiliary_loss_mlp": 0.01037455, "balance_loss_clip": 1.06538069, "balance_loss_mlp": 1.02607632, "epoch": 0.17663680634882462, "flos": 22929502170240.0, "grad_norm": 2.2277700189500194, "language_loss": 0.80568999, "learning_rate": 3.7787836859943685e-06, "loss": 0.82833517, "num_input_tokens_seen": 31078800, "step": 1469, "time_per_iteration": 2.8841593265533447 }, { "auxiliary_loss_clip": 0.01225262, "auxiliary_loss_mlp": 0.01035941, "balance_loss_clip": 1.06312966, "balance_loss_mlp": 1.02520049, "epoch": 0.17675704923946373, "flos": 22637979388800.0, "grad_norm": 2.9210575292375967, "language_loss": 0.78649294, "learning_rate": 3.7784274475796363e-06, "loss": 0.80910492, "num_input_tokens_seen": 31097430, "step": 1470, "time_per_iteration": 2.8148014545440674 }, { "auxiliary_loss_clip": 0.01216862, "auxiliary_loss_mlp": 0.01041379, "balance_loss_clip": 1.06281388, "balance_loss_mlp": 1.0304116, "epoch": 0.1768772921301028, "flos": 27126525795840.0, "grad_norm": 2.823360909373466, "language_loss": 0.76252937, "learning_rate": 3.7780709393788745e-06, "loss": 0.78511178, "num_input_tokens_seen": 31117905, "step": 1471, "time_per_iteration": 2.8934664726257324 }, { "auxiliary_loss_clip": 0.01227083, "auxiliary_loss_mlp": 0.01032509, "balance_loss_clip": 1.06214392, "balance_loss_mlp": 1.0216136, "epoch": 0.1769975350207419, "flos": 19172133014400.0, "grad_norm": 2.154084532374619, "language_loss": 0.753631, "learning_rate": 3.777714161446165e-06, "loss": 0.776227, "num_input_tokens_seen": 31137610, "step": 1472, "time_per_iteration": 2.796740770339966 }, { "auxiliary_loss_clip": 0.01224766, "auxiliary_loss_mlp": 0.01036402, "balance_loss_clip": 1.06337547, "balance_loss_mlp": 1.02529216, "epoch": 0.177117777911381, "flos": 36134932291200.0, "grad_norm": 2.2464180661272493, "language_loss": 0.69229579, "learning_rate": 3.7773571138356304e-06, "loss": 0.71490753, "num_input_tokens_seen": 31157780, "step": 1473, "time_per_iteration": 2.9186534881591797 }, { "auxiliary_loss_clip": 0.01201523, "auxiliary_loss_mlp": 0.01038755, "balance_loss_clip": 1.06366146, "balance_loss_mlp": 1.02823448, "epoch": 0.17723802080202009, "flos": 22090593052800.0, "grad_norm": 2.3541369872680824, "language_loss": 0.88909489, "learning_rate": 3.776999796601435e-06, "loss": 0.91149771, "num_input_tokens_seen": 31176540, "step": 1474, "time_per_iteration": 2.787398338317871 }, { "auxiliary_loss_clip": 0.01228683, "auxiliary_loss_mlp": 0.01043112, "balance_loss_clip": 1.06342173, "balance_loss_mlp": 1.03277063, "epoch": 0.17735826369265917, "flos": 30222671437440.0, "grad_norm": 2.6343754214616437, "language_loss": 0.73014468, "learning_rate": 3.776642209797783e-06, "loss": 0.75286257, "num_input_tokens_seen": 31198370, "step": 1475, "time_per_iteration": 3.8076181411743164 }, { "auxiliary_loss_clip": 0.01221421, "auxiliary_loss_mlp": 0.0103926, "balance_loss_clip": 1.06466663, "balance_loss_mlp": 1.02792287, "epoch": 0.17747850658329825, "flos": 21397588980480.0, "grad_norm": 3.0725982047189717, "language_loss": 0.77900863, "learning_rate": 3.7762843534789205e-06, "loss": 0.80161548, "num_input_tokens_seen": 31217120, "step": 1476, "time_per_iteration": 3.8222522735595703 }, { "auxiliary_loss_clip": 0.01229706, "auxiliary_loss_mlp": 0.01040422, "balance_loss_clip": 1.06469941, "balance_loss_mlp": 1.02944911, "epoch": 0.17759874947393736, "flos": 16983341856000.0, "grad_norm": 2.285298404089208, "language_loss": 0.88159645, "learning_rate": 3.7759262276991343e-06, "loss": 0.90429783, "num_input_tokens_seen": 31234730, "step": 1477, "time_per_iteration": 3.9630849361419678 }, { "auxiliary_loss_clip": 0.01226779, "auxiliary_loss_mlp": 0.01042231, "balance_loss_clip": 1.06465745, "balance_loss_mlp": 1.03194284, "epoch": 0.17771899236457644, "flos": 11546107390080.0, "grad_norm": 3.8485737256095978, "language_loss": 0.81273341, "learning_rate": 3.7755678325127506e-06, "loss": 0.83542347, "num_input_tokens_seen": 31252410, "step": 1478, "time_per_iteration": 2.9032695293426514 }, { "auxiliary_loss_clip": 0.01199075, "auxiliary_loss_mlp": 0.01038725, "balance_loss_clip": 1.05981159, "balance_loss_mlp": 1.02862751, "epoch": 0.17783923525521553, "flos": 18807747494400.0, "grad_norm": 2.1506969114672385, "language_loss": 0.75640911, "learning_rate": 3.7752091679741393e-06, "loss": 0.77878708, "num_input_tokens_seen": 31270200, "step": 1479, "time_per_iteration": 2.7763123512268066 }, { "auxiliary_loss_clip": 0.0122171, "auxiliary_loss_mlp": 0.0103574, "balance_loss_clip": 1.06227386, "balance_loss_mlp": 1.02458203, "epoch": 0.17795947814585464, "flos": 30408365773440.0, "grad_norm": 3.73603515335092, "language_loss": 0.77862883, "learning_rate": 3.774850234137708e-06, "loss": 0.80120331, "num_input_tokens_seen": 31287495, "step": 1480, "time_per_iteration": 2.7630865573883057 }, { "auxiliary_loss_clip": 0.01225275, "auxiliary_loss_mlp": 0.01038767, "balance_loss_clip": 1.06429863, "balance_loss_mlp": 1.02816927, "epoch": 0.17807972103649372, "flos": 24389055411840.0, "grad_norm": 2.4227038536640997, "language_loss": 0.82489073, "learning_rate": 3.7744910310579076e-06, "loss": 0.8475312, "num_input_tokens_seen": 31306420, "step": 1481, "time_per_iteration": 2.81874942779541 }, { "auxiliary_loss_clip": 0.01227775, "auxiliary_loss_mlp": 0.0104013, "balance_loss_clip": 1.06452298, "balance_loss_mlp": 1.02969289, "epoch": 0.1781999639271328, "flos": 20301559332480.0, "grad_norm": 1.9188605819217797, "language_loss": 0.85299248, "learning_rate": 3.774131558789229e-06, "loss": 0.87567157, "num_input_tokens_seen": 31325750, "step": 1482, "time_per_iteration": 2.796914577484131 }, { "auxiliary_loss_clip": 0.01230985, "auxiliary_loss_mlp": 0.01058892, "balance_loss_clip": 1.0658114, "balance_loss_mlp": 1.02245605, "epoch": 0.1783202068177719, "flos": 15924479806080.0, "grad_norm": 2.9155817775239057, "language_loss": 0.69854593, "learning_rate": 3.773771817386203e-06, "loss": 0.72144473, "num_input_tokens_seen": 31343080, "step": 1483, "time_per_iteration": 2.6892929077148438 }, { "auxiliary_loss_clip": 0.01220882, "auxiliary_loss_mlp": 0.01038841, "balance_loss_clip": 1.06316876, "balance_loss_mlp": 1.02838612, "epoch": 0.178440449708411, "flos": 20631758083200.0, "grad_norm": 1.6226688122761495, "language_loss": 0.79202574, "learning_rate": 3.773411806903403e-06, "loss": 0.814623, "num_input_tokens_seen": 31362160, "step": 1484, "time_per_iteration": 2.7669661045074463 }, { "auxiliary_loss_clip": 0.01215987, "auxiliary_loss_mlp": 0.01035917, "balance_loss_clip": 1.06727123, "balance_loss_mlp": 1.02585566, "epoch": 0.17856069259905008, "flos": 21686059105920.0, "grad_norm": 2.1433598463413084, "language_loss": 0.94733745, "learning_rate": 3.7730515273954415e-06, "loss": 0.96985644, "num_input_tokens_seen": 31380770, "step": 1485, "time_per_iteration": 2.7754976749420166 }, { "auxiliary_loss_clip": 0.01226915, "auxiliary_loss_mlp": 0.01036997, "balance_loss_clip": 1.06556475, "balance_loss_mlp": 1.02648866, "epoch": 0.17868093548968916, "flos": 26572962320640.0, "grad_norm": 1.8712984094531506, "language_loss": 0.85231131, "learning_rate": 3.772690978916973e-06, "loss": 0.87495041, "num_input_tokens_seen": 31400525, "step": 1486, "time_per_iteration": 2.7167890071868896 }, { "auxiliary_loss_clip": 0.0122269, "auxiliary_loss_mlp": 0.01035607, "balance_loss_clip": 1.06252372, "balance_loss_mlp": 1.02562904, "epoch": 0.17880117838032827, "flos": 18581006891520.0, "grad_norm": 3.0061871958900426, "language_loss": 0.86584592, "learning_rate": 3.772330161522693e-06, "loss": 0.88842881, "num_input_tokens_seen": 31418435, "step": 1487, "time_per_iteration": 2.7924416065216064 }, { "auxiliary_loss_clip": 0.01219316, "auxiliary_loss_mlp": 0.01038958, "balance_loss_clip": 1.06679869, "balance_loss_mlp": 1.02816916, "epoch": 0.17892142127096736, "flos": 26541217676160.0, "grad_norm": 1.95929244566604, "language_loss": 0.79670143, "learning_rate": 3.7719690752673365e-06, "loss": 0.8192842, "num_input_tokens_seen": 31439230, "step": 1488, "time_per_iteration": 2.8532326221466064 }, { "auxiliary_loss_clip": 0.01220614, "auxiliary_loss_mlp": 0.01034994, "balance_loss_clip": 1.06619751, "balance_loss_mlp": 1.02534413, "epoch": 0.17904166416160644, "flos": 23872623621120.0, "grad_norm": 1.8955117451930463, "language_loss": 0.78034902, "learning_rate": 3.7716077202056796e-06, "loss": 0.80290508, "num_input_tokens_seen": 31457705, "step": 1489, "time_per_iteration": 2.9344518184661865 }, { "auxiliary_loss_clip": 0.01210799, "auxiliary_loss_mlp": 0.01039484, "balance_loss_clip": 1.06506538, "balance_loss_mlp": 1.02887452, "epoch": 0.17916190705224552, "flos": 19134426712320.0, "grad_norm": 2.578968518864626, "language_loss": 0.93646055, "learning_rate": 3.7712460963925404e-06, "loss": 0.95896333, "num_input_tokens_seen": 31473645, "step": 1490, "time_per_iteration": 2.821519374847412 }, { "auxiliary_loss_clip": 0.0121558, "auxiliary_loss_mlp": 0.01031873, "balance_loss_clip": 1.06427431, "balance_loss_mlp": 1.02235448, "epoch": 0.17928214994288463, "flos": 25152120961920.0, "grad_norm": 3.404952927588053, "language_loss": 0.7561264, "learning_rate": 3.7708842038827775e-06, "loss": 0.77860093, "num_input_tokens_seen": 31492605, "step": 1491, "time_per_iteration": 2.94356107711792 }, { "auxiliary_loss_clip": 0.01225608, "auxiliary_loss_mlp": 0.01030472, "balance_loss_clip": 1.06480026, "balance_loss_mlp": 1.02024376, "epoch": 0.17940239283352372, "flos": 22384629786240.0, "grad_norm": 1.992639659606811, "language_loss": 0.85990763, "learning_rate": 3.770522042731288e-06, "loss": 0.88246846, "num_input_tokens_seen": 31514500, "step": 1492, "time_per_iteration": 2.76045560836792 }, { "auxiliary_loss_clip": 0.01207886, "auxiliary_loss_mlp": 0.01038592, "balance_loss_clip": 1.06627512, "balance_loss_mlp": 1.02790451, "epoch": 0.1795226357241628, "flos": 23178685795200.0, "grad_norm": 3.1589535422912354, "language_loss": 0.87803382, "learning_rate": 3.7701596129930122e-06, "loss": 0.90049857, "num_input_tokens_seen": 31533225, "step": 1493, "time_per_iteration": 2.911254644393921 }, { "auxiliary_loss_clip": 0.01222194, "auxiliary_loss_mlp": 0.01036632, "balance_loss_clip": 1.06367826, "balance_loss_mlp": 1.02615356, "epoch": 0.1796428786148019, "flos": 22090413484800.0, "grad_norm": 2.1608566879532165, "language_loss": 0.73314381, "learning_rate": 3.7697969147229315e-06, "loss": 0.75573206, "num_input_tokens_seen": 31551385, "step": 1494, "time_per_iteration": 2.930765390396118 }, { "auxiliary_loss_clip": 0.01223379, "auxiliary_loss_mlp": 0.01037828, "balance_loss_clip": 1.06474161, "balance_loss_mlp": 1.02752876, "epoch": 0.179763121505441, "flos": 21324618501120.0, "grad_norm": 11.880794627229713, "language_loss": 0.85159218, "learning_rate": 3.7694339479760647e-06, "loss": 0.87420428, "num_input_tokens_seen": 31570415, "step": 1495, "time_per_iteration": 2.707777261734009 }, { "auxiliary_loss_clip": 0.01112261, "auxiliary_loss_mlp": 0.01007421, "balance_loss_clip": 1.02226377, "balance_loss_mlp": 1.00491762, "epoch": 0.17988336439608008, "flos": 68161864815360.0, "grad_norm": 0.7813582796823806, "language_loss": 0.57306826, "learning_rate": 3.769070712807476e-06, "loss": 0.59426498, "num_input_tokens_seen": 31632445, "step": 1496, "time_per_iteration": 3.3474724292755127 }, { "auxiliary_loss_clip": 0.0119803, "auxiliary_loss_mlp": 0.01036479, "balance_loss_clip": 1.06337821, "balance_loss_mlp": 1.02630424, "epoch": 0.18000360728671919, "flos": 21945047143680.0, "grad_norm": 1.8399751345827773, "language_loss": 0.78901601, "learning_rate": 3.768707209272266e-06, "loss": 0.81136113, "num_input_tokens_seen": 31652575, "step": 1497, "time_per_iteration": 2.8208625316619873 }, { "auxiliary_loss_clip": 0.01219373, "auxiliary_loss_mlp": 0.01035515, "balance_loss_clip": 1.06260061, "balance_loss_mlp": 1.02516806, "epoch": 0.18012385017735827, "flos": 18986330937600.0, "grad_norm": 2.26818088554851, "language_loss": 0.76488388, "learning_rate": 3.768343437425579e-06, "loss": 0.78743273, "num_input_tokens_seen": 31671145, "step": 1498, "time_per_iteration": 2.7637736797332764 }, { "auxiliary_loss_clip": 0.01196934, "auxiliary_loss_mlp": 0.01039679, "balance_loss_clip": 1.06034708, "balance_loss_mlp": 1.02853251, "epoch": 0.18024409306799735, "flos": 19748103598080.0, "grad_norm": 2.8205903322506622, "language_loss": 0.86486399, "learning_rate": 3.7679793973225987e-06, "loss": 0.8872301, "num_input_tokens_seen": 31686955, "step": 1499, "time_per_iteration": 2.9017884731292725 }, { "auxiliary_loss_clip": 0.01108896, "auxiliary_loss_mlp": 0.01007077, "balance_loss_clip": 1.0243783, "balance_loss_mlp": 1.0045979, "epoch": 0.18036433595863643, "flos": 67227183060480.0, "grad_norm": 0.8674075482892012, "language_loss": 0.61642629, "learning_rate": 3.767615089018549e-06, "loss": 0.63758606, "num_input_tokens_seen": 31749300, "step": 1500, "time_per_iteration": 4.264541149139404 }, { "auxiliary_loss_clip": 0.01213697, "auxiliary_loss_mlp": 0.01039062, "balance_loss_clip": 1.06266522, "balance_loss_mlp": 1.02786815, "epoch": 0.18048457884927555, "flos": 18181464935040.0, "grad_norm": 2.137021330804048, "language_loss": 0.86249137, "learning_rate": 3.7672505125686966e-06, "loss": 0.88501894, "num_input_tokens_seen": 31765665, "step": 1501, "time_per_iteration": 2.752525806427002 }, { "auxiliary_loss_clip": 0.01213987, "auxiliary_loss_mlp": 0.01043163, "balance_loss_clip": 1.0646776, "balance_loss_mlp": 1.03239202, "epoch": 0.18060482173991463, "flos": 15813767111040.0, "grad_norm": 3.7312719323435077, "language_loss": 0.8436234, "learning_rate": 3.7668856680283455e-06, "loss": 0.86619484, "num_input_tokens_seen": 31782690, "step": 1502, "time_per_iteration": 4.849093198776245 }, { "auxiliary_loss_clip": 0.01226803, "auxiliary_loss_mlp": 0.01041638, "balance_loss_clip": 1.06364238, "balance_loss_mlp": 1.03081417, "epoch": 0.1807250646305537, "flos": 18587399512320.0, "grad_norm": 3.8649430863498218, "language_loss": 0.82778525, "learning_rate": 3.7665205554528437e-06, "loss": 0.85046965, "num_input_tokens_seen": 31802045, "step": 1503, "time_per_iteration": 3.766719341278076 }, { "auxiliary_loss_clip": 0.01218483, "auxiliary_loss_mlp": 0.01042678, "balance_loss_clip": 1.06378984, "balance_loss_mlp": 1.03198504, "epoch": 0.18084530752119282, "flos": 23149131880320.0, "grad_norm": 1.7139280494412614, "language_loss": 0.74450648, "learning_rate": 3.7661551748975782e-06, "loss": 0.7671181, "num_input_tokens_seen": 31820220, "step": 1504, "time_per_iteration": 2.762366533279419 }, { "auxiliary_loss_clip": 0.01112474, "auxiliary_loss_mlp": 0.01003622, "balance_loss_clip": 1.02083564, "balance_loss_mlp": 1.00111818, "epoch": 0.1809655504118319, "flos": 59803153568640.0, "grad_norm": 0.8146570466767272, "language_loss": 0.60462606, "learning_rate": 3.7657895264179772e-06, "loss": 0.62578702, "num_input_tokens_seen": 31876195, "step": 1505, "time_per_iteration": 3.349100351333618 }, { "auxiliary_loss_clip": 0.01212995, "auxiliary_loss_mlp": 0.01045819, "balance_loss_clip": 1.06305861, "balance_loss_mlp": 1.03539407, "epoch": 0.181085793302471, "flos": 44201941188480.0, "grad_norm": 1.8608322624455134, "language_loss": 0.74317491, "learning_rate": 3.765423610069509e-06, "loss": 0.7657631, "num_input_tokens_seen": 31901585, "step": 1506, "time_per_iteration": 2.9732091426849365 }, { "auxiliary_loss_clip": 0.01220464, "auxiliary_loss_mlp": 0.01034185, "balance_loss_clip": 1.0651505, "balance_loss_mlp": 1.02407026, "epoch": 0.18120603619311007, "flos": 34898384638080.0, "grad_norm": 1.9469443693745008, "language_loss": 0.72705644, "learning_rate": 3.765057425907683e-06, "loss": 0.74960291, "num_input_tokens_seen": 31923045, "step": 1507, "time_per_iteration": 2.9713327884674072 }, { "auxiliary_loss_clip": 0.01225078, "auxiliary_loss_mlp": 0.01038836, "balance_loss_clip": 1.06165957, "balance_loss_mlp": 1.02768946, "epoch": 0.18132627908374918, "flos": 21506757390720.0, "grad_norm": 1.8357189267409784, "language_loss": 0.78425652, "learning_rate": 3.764690973988048e-06, "loss": 0.80689567, "num_input_tokens_seen": 31943385, "step": 1508, "time_per_iteration": 2.823683500289917 }, { "auxiliary_loss_clip": 0.01215833, "auxiliary_loss_mlp": 0.01037726, "balance_loss_clip": 1.06074715, "balance_loss_mlp": 1.02802205, "epoch": 0.18144652197438826, "flos": 29057693633280.0, "grad_norm": 1.7849447244772587, "language_loss": 0.73770356, "learning_rate": 3.7643242543661967e-06, "loss": 0.76023912, "num_input_tokens_seen": 31966045, "step": 1509, "time_per_iteration": 2.9121761322021484 }, { "auxiliary_loss_clip": 0.01103648, "auxiliary_loss_mlp": 0.01006096, "balance_loss_clip": 1.02030134, "balance_loss_mlp": 1.00383079, "epoch": 0.18156676486502735, "flos": 68675064382080.0, "grad_norm": 0.8148293513773504, "language_loss": 0.60497469, "learning_rate": 3.7639572670977573e-06, "loss": 0.62607211, "num_input_tokens_seen": 32021540, "step": 1510, "time_per_iteration": 3.2195374965667725 }, { "auxiliary_loss_clip": 0.01218277, "auxiliary_loss_mlp": 0.01042665, "balance_loss_clip": 1.06126523, "balance_loss_mlp": 1.0313406, "epoch": 0.18168700775566646, "flos": 26471515334400.0, "grad_norm": 1.6285063470368777, "language_loss": 0.76836586, "learning_rate": 3.7635900122384042e-06, "loss": 0.79097527, "num_input_tokens_seen": 32044535, "step": 1511, "time_per_iteration": 2.8508799076080322 }, { "auxiliary_loss_clip": 0.0122245, "auxiliary_loss_mlp": 0.0104312, "balance_loss_clip": 1.06182909, "balance_loss_mlp": 1.03218246, "epoch": 0.18180725064630554, "flos": 15005668884480.0, "grad_norm": 3.932320738531143, "language_loss": 0.87426901, "learning_rate": 3.7632224898438477e-06, "loss": 0.89692467, "num_input_tokens_seen": 32061010, "step": 1512, "time_per_iteration": 2.841834545135498 }, { "auxiliary_loss_clip": 0.01217333, "auxiliary_loss_mlp": 0.01031377, "balance_loss_clip": 1.06007969, "balance_loss_mlp": 1.02105916, "epoch": 0.18192749353694462, "flos": 19682387665920.0, "grad_norm": 1.6859241229380921, "language_loss": 0.79207873, "learning_rate": 3.762854699969842e-06, "loss": 0.8145659, "num_input_tokens_seen": 32081520, "step": 1513, "time_per_iteration": 2.8878843784332275 }, { "auxiliary_loss_clip": 0.01217838, "auxiliary_loss_mlp": 0.01041673, "balance_loss_clip": 1.0605762, "balance_loss_mlp": 1.03138483, "epoch": 0.1820477364275837, "flos": 20702717400960.0, "grad_norm": 2.2305181890655446, "language_loss": 0.73199236, "learning_rate": 3.762486642672179e-06, "loss": 0.75458753, "num_input_tokens_seen": 32098460, "step": 1514, "time_per_iteration": 2.70786190032959 }, { "auxiliary_loss_clip": 0.0122322, "auxiliary_loss_mlp": 0.01038598, "balance_loss_clip": 1.06251991, "balance_loss_mlp": 1.02805436, "epoch": 0.18216797931822282, "flos": 17128708197120.0, "grad_norm": 1.9776312321906857, "language_loss": 0.86712891, "learning_rate": 3.7621183180066946e-06, "loss": 0.88974708, "num_input_tokens_seen": 32116420, "step": 1515, "time_per_iteration": 2.7418203353881836 }, { "auxiliary_loss_clip": 0.01218566, "auxiliary_loss_mlp": 0.01044484, "balance_loss_clip": 1.06084681, "balance_loss_mlp": 1.03382051, "epoch": 0.1822882222088619, "flos": 29242561956480.0, "grad_norm": 1.6771475026478917, "language_loss": 0.73810518, "learning_rate": 3.7617497260292625e-06, "loss": 0.76073563, "num_input_tokens_seen": 32138475, "step": 1516, "time_per_iteration": 2.8586246967315674 }, { "auxiliary_loss_clip": 0.01212026, "auxiliary_loss_mlp": 0.01041107, "balance_loss_clip": 1.05956221, "balance_loss_mlp": 1.03012204, "epoch": 0.18240846509950098, "flos": 17702739446400.0, "grad_norm": 2.8043446237324128, "language_loss": 0.78749919, "learning_rate": 3.7613808667957967e-06, "loss": 0.81003046, "num_input_tokens_seen": 32151165, "step": 1517, "time_per_iteration": 2.8402159214019775 }, { "auxiliary_loss_clip": 0.01219635, "auxiliary_loss_mlp": 0.01039674, "balance_loss_clip": 1.06176901, "balance_loss_mlp": 1.02894521, "epoch": 0.1825287079901401, "flos": 14790025584000.0, "grad_norm": 1.855735967397975, "language_loss": 0.91201234, "learning_rate": 3.7610117403622547e-06, "loss": 0.93460548, "num_input_tokens_seen": 32167725, "step": 1518, "time_per_iteration": 2.7975802421569824 }, { "auxiliary_loss_clip": 0.01208944, "auxiliary_loss_mlp": 0.01036045, "balance_loss_clip": 1.0607245, "balance_loss_mlp": 1.02502394, "epoch": 0.18264895088077918, "flos": 21946232292480.0, "grad_norm": 1.688347502749891, "language_loss": 0.89931679, "learning_rate": 3.7606423467846313e-06, "loss": 0.9217667, "num_input_tokens_seen": 32187330, "step": 1519, "time_per_iteration": 2.883141279220581 }, { "auxiliary_loss_clip": 0.0122235, "auxiliary_loss_mlp": 0.01038656, "balance_loss_clip": 1.06514859, "balance_loss_mlp": 1.02774215, "epoch": 0.18276919377141826, "flos": 20886759711360.0, "grad_norm": 1.5622675105908133, "language_loss": 0.79613185, "learning_rate": 3.760272686118964e-06, "loss": 0.81874186, "num_input_tokens_seen": 32205550, "step": 1520, "time_per_iteration": 2.777048349380493 }, { "auxiliary_loss_clip": 0.01218234, "auxiliary_loss_mlp": 0.0104154, "balance_loss_clip": 1.05780792, "balance_loss_mlp": 1.03158641, "epoch": 0.18288943666205737, "flos": 21469877101440.0, "grad_norm": 3.2424097348445, "language_loss": 0.92848003, "learning_rate": 3.7599027584213297e-06, "loss": 0.9510777, "num_input_tokens_seen": 32224430, "step": 1521, "time_per_iteration": 2.8279314041137695 }, { "auxiliary_loss_clip": 0.01227204, "auxiliary_loss_mlp": 0.01044165, "balance_loss_clip": 1.06109452, "balance_loss_mlp": 1.03329945, "epoch": 0.18300967955269645, "flos": 21539363961600.0, "grad_norm": 1.9996024144975346, "language_loss": 0.78142333, "learning_rate": 3.7595325637478465e-06, "loss": 0.80413705, "num_input_tokens_seen": 32242455, "step": 1522, "time_per_iteration": 2.7605419158935547 }, { "auxiliary_loss_clip": 0.01213135, "auxiliary_loss_mlp": 0.01040229, "balance_loss_clip": 1.06502461, "balance_loss_mlp": 1.02891064, "epoch": 0.18312992244333554, "flos": 28876237102080.0, "grad_norm": 2.102114838067775, "language_loss": 0.81756699, "learning_rate": 3.7591621021546723e-06, "loss": 0.84010065, "num_input_tokens_seen": 32264450, "step": 1523, "time_per_iteration": 2.7894093990325928 }, { "auxiliary_loss_clip": 0.01215769, "auxiliary_loss_mlp": 0.01046435, "balance_loss_clip": 1.06060278, "balance_loss_mlp": 1.03466296, "epoch": 0.18325016533397462, "flos": 20120102801280.0, "grad_norm": 1.74597638833608, "language_loss": 0.81497771, "learning_rate": 3.7587913736980062e-06, "loss": 0.83759975, "num_input_tokens_seen": 32284090, "step": 1524, "time_per_iteration": 2.747797727584839 }, { "auxiliary_loss_clip": 0.01194258, "auxiliary_loss_mlp": 0.01042976, "balance_loss_clip": 1.06014395, "balance_loss_mlp": 1.03183031, "epoch": 0.18337040822461373, "flos": 23329187781120.0, "grad_norm": 1.7803571067759412, "language_loss": 0.84381598, "learning_rate": 3.7584203784340865e-06, "loss": 0.86618829, "num_input_tokens_seen": 32303260, "step": 1525, "time_per_iteration": 2.8073270320892334 }, { "auxiliary_loss_clip": 0.01215729, "auxiliary_loss_mlp": 0.01032668, "balance_loss_clip": 1.06008434, "balance_loss_mlp": 1.02195716, "epoch": 0.1834906511152528, "flos": 25009555881600.0, "grad_norm": 2.30901536184522, "language_loss": 0.85893738, "learning_rate": 3.7580491164191938e-06, "loss": 0.88142133, "num_input_tokens_seen": 32321570, "step": 1526, "time_per_iteration": 3.732551336288452 }, { "auxiliary_loss_clip": 0.01113107, "auxiliary_loss_mlp": 0.0100367, "balance_loss_clip": 1.01927853, "balance_loss_mlp": 1.00164318, "epoch": 0.1836108940058919, "flos": 67251493589760.0, "grad_norm": 0.7583024772692878, "language_loss": 0.61261624, "learning_rate": 3.757677587709648e-06, "loss": 0.633784, "num_input_tokens_seen": 32384835, "step": 1527, "time_per_iteration": 3.3934261798858643 }, { "auxiliary_loss_clip": 0.01210115, "auxiliary_loss_mlp": 0.01036745, "balance_loss_clip": 1.06217241, "balance_loss_mlp": 1.02635002, "epoch": 0.183731136896531, "flos": 25738721971200.0, "grad_norm": 1.9463321873032764, "language_loss": 0.7568503, "learning_rate": 3.7573057923618095e-06, "loss": 0.77931893, "num_input_tokens_seen": 32404930, "step": 1528, "time_per_iteration": 4.681435823440552 }, { "auxiliary_loss_clip": 0.01205158, "auxiliary_loss_mlp": 0.01042781, "balance_loss_clip": 1.05857098, "balance_loss_mlp": 1.03205812, "epoch": 0.1838513797871701, "flos": 20449403712000.0, "grad_norm": 1.9389713480313875, "language_loss": 0.74409926, "learning_rate": 3.7569337304320793e-06, "loss": 0.76657867, "num_input_tokens_seen": 32424515, "step": 1529, "time_per_iteration": 3.7481205463409424 }, { "auxiliary_loss_clip": 0.01108986, "auxiliary_loss_mlp": 0.01001918, "balance_loss_clip": 1.0188396, "balance_loss_mlp": 0.99991506, "epoch": 0.18397162267780917, "flos": 68565141786240.0, "grad_norm": 0.8563462813425065, "language_loss": 0.64493746, "learning_rate": 3.756561401976899e-06, "loss": 0.6660465, "num_input_tokens_seen": 32484220, "step": 1530, "time_per_iteration": 3.217953681945801 }, { "auxiliary_loss_clip": 0.01229437, "auxiliary_loss_mlp": 0.01038472, "balance_loss_clip": 1.06307006, "balance_loss_mlp": 1.02827954, "epoch": 0.18409186556844825, "flos": 31941104976000.0, "grad_norm": 5.046613946898311, "language_loss": 0.82265067, "learning_rate": 3.7561888070527514e-06, "loss": 0.84532976, "num_input_tokens_seen": 32506260, "step": 1531, "time_per_iteration": 2.8910653591156006 }, { "auxiliary_loss_clip": 0.0120161, "auxiliary_loss_mlp": 0.01074311, "balance_loss_clip": 1.06079233, "balance_loss_mlp": 1.02875566, "epoch": 0.18421210845908736, "flos": 20120533764480.0, "grad_norm": 2.347758423546371, "language_loss": 0.79925728, "learning_rate": 3.7558159457161577e-06, "loss": 0.82201648, "num_input_tokens_seen": 32524225, "step": 1532, "time_per_iteration": 2.7974624633789062 }, { "auxiliary_loss_clip": 0.01218045, "auxiliary_loss_mlp": 0.01063837, "balance_loss_clip": 1.05970097, "balance_loss_mlp": 1.0209887, "epoch": 0.18433235134972645, "flos": 23110491824640.0, "grad_norm": 2.294943929120978, "language_loss": 0.78007215, "learning_rate": 3.755442818023681e-06, "loss": 0.80289096, "num_input_tokens_seen": 32543850, "step": 1533, "time_per_iteration": 2.797168254852295 }, { "auxiliary_loss_clip": 0.01213725, "auxiliary_loss_mlp": 0.01038974, "balance_loss_clip": 1.06037045, "balance_loss_mlp": 1.02777994, "epoch": 0.18445259424036553, "flos": 18291351617280.0, "grad_norm": 2.3769739547528004, "language_loss": 0.76336199, "learning_rate": 3.7550694240319246e-06, "loss": 0.78588903, "num_input_tokens_seen": 32561725, "step": 1534, "time_per_iteration": 2.8650283813476562 }, { "auxiliary_loss_clip": 0.012226, "auxiliary_loss_mlp": 0.01038218, "balance_loss_clip": 1.05794239, "balance_loss_mlp": 1.02757263, "epoch": 0.18457283713100464, "flos": 21324079797120.0, "grad_norm": 2.7564896023745584, "language_loss": 0.76504099, "learning_rate": 3.7546957637975326e-06, "loss": 0.78764915, "num_input_tokens_seen": 32579135, "step": 1535, "time_per_iteration": 2.7537803649902344 }, { "auxiliary_loss_clip": 0.01201355, "auxiliary_loss_mlp": 0.01040655, "balance_loss_clip": 1.06035042, "balance_loss_mlp": 1.03055763, "epoch": 0.18469308002164372, "flos": 20375679047040.0, "grad_norm": 1.6013369091613738, "language_loss": 0.74048704, "learning_rate": 3.7543218373771873e-06, "loss": 0.76290721, "num_input_tokens_seen": 32598460, "step": 1536, "time_per_iteration": 2.990959882736206 }, { "auxiliary_loss_clip": 0.01199727, "auxiliary_loss_mlp": 0.01074214, "balance_loss_clip": 1.06209517, "balance_loss_mlp": 1.02776933, "epoch": 0.1848133229122828, "flos": 26435892021120.0, "grad_norm": 1.4471129505854106, "language_loss": 0.78314304, "learning_rate": 3.753947644827615e-06, "loss": 0.80588245, "num_input_tokens_seen": 32621920, "step": 1537, "time_per_iteration": 2.9200596809387207 }, { "auxiliary_loss_clip": 0.01110176, "auxiliary_loss_mlp": 0.01006314, "balance_loss_clip": 1.01774096, "balance_loss_mlp": 1.00425184, "epoch": 0.1849335658029219, "flos": 70547447612160.0, "grad_norm": 0.9317564091818809, "language_loss": 0.57198983, "learning_rate": 3.753573186205579e-06, "loss": 0.59315473, "num_input_tokens_seen": 32690040, "step": 1538, "time_per_iteration": 3.478015422821045 }, { "auxiliary_loss_clip": 0.01209173, "auxiliary_loss_mlp": 0.01068425, "balance_loss_clip": 1.06022382, "balance_loss_mlp": 1.02733719, "epoch": 0.185053808693561, "flos": 17384140788480.0, "grad_norm": 2.2330413180444473, "language_loss": 0.77657056, "learning_rate": 3.753198461567885e-06, "loss": 0.79934651, "num_input_tokens_seen": 32707285, "step": 1539, "time_per_iteration": 2.755385160446167 }, { "auxiliary_loss_clip": 0.01201004, "auxiliary_loss_mlp": 0.01037422, "balance_loss_clip": 1.05910492, "balance_loss_mlp": 1.02728939, "epoch": 0.18517405158420008, "flos": 28986159697920.0, "grad_norm": 1.8456510016399157, "language_loss": 0.92175686, "learning_rate": 3.7528234709713783e-06, "loss": 0.94414115, "num_input_tokens_seen": 32730030, "step": 1540, "time_per_iteration": 2.908426523208618 }, { "auxiliary_loss_clip": 0.01222418, "auxiliary_loss_mlp": 0.01037831, "balance_loss_clip": 1.06145453, "balance_loss_mlp": 1.02739429, "epoch": 0.18529429447483917, "flos": 26794962328320.0, "grad_norm": 2.413859589353817, "language_loss": 0.83989328, "learning_rate": 3.7524482144729447e-06, "loss": 0.86249578, "num_input_tokens_seen": 32749485, "step": 1541, "time_per_iteration": 2.938439130783081 }, { "auxiliary_loss_clip": 0.01211029, "auxiliary_loss_mlp": 0.01032866, "balance_loss_clip": 1.06316638, "balance_loss_mlp": 1.02226257, "epoch": 0.18541453736547828, "flos": 13581595301760.0, "grad_norm": 2.722259027100719, "language_loss": 0.83922935, "learning_rate": 3.7520726921295106e-06, "loss": 0.86166829, "num_input_tokens_seen": 32766205, "step": 1542, "time_per_iteration": 2.87503719329834 }, { "auxiliary_loss_clip": 0.01217304, "auxiliary_loss_mlp": 0.01032619, "balance_loss_clip": 1.0591867, "balance_loss_mlp": 1.02198577, "epoch": 0.18553478025611736, "flos": 24025424077440.0, "grad_norm": 2.1970170268185516, "language_loss": 0.72502476, "learning_rate": 3.751696903998042e-06, "loss": 0.74752396, "num_input_tokens_seen": 32784840, "step": 1543, "time_per_iteration": 2.8494200706481934 }, { "auxiliary_loss_clip": 0.01218094, "auxiliary_loss_mlp": 0.01041703, "balance_loss_clip": 1.06123281, "balance_loss_mlp": 1.03143895, "epoch": 0.18565502314675644, "flos": 25885165720320.0, "grad_norm": 1.7710403672931683, "language_loss": 0.7021355, "learning_rate": 3.7513208501355456e-06, "loss": 0.72473347, "num_input_tokens_seen": 32805945, "step": 1544, "time_per_iteration": 2.842942237854004 }, { "auxiliary_loss_clip": 0.01212613, "auxiliary_loss_mlp": 0.01036159, "balance_loss_clip": 1.05896735, "balance_loss_mlp": 1.02577567, "epoch": 0.18577526603739553, "flos": 19610063631360.0, "grad_norm": 2.158746030391138, "language_loss": 0.83765757, "learning_rate": 3.750944530599069e-06, "loss": 0.86014521, "num_input_tokens_seen": 32825515, "step": 1545, "time_per_iteration": 2.82243275642395 }, { "auxiliary_loss_clip": 0.01231355, "auxiliary_loss_mlp": 0.01038997, "balance_loss_clip": 1.06349516, "balance_loss_mlp": 1.02863145, "epoch": 0.18589550892803464, "flos": 18474891137280.0, "grad_norm": 2.3360161239487245, "language_loss": 0.81266773, "learning_rate": 3.7505679454456992e-06, "loss": 0.83537126, "num_input_tokens_seen": 32842125, "step": 1546, "time_per_iteration": 2.6508147716522217 }, { "auxiliary_loss_clip": 0.0119487, "auxiliary_loss_mlp": 0.01035607, "balance_loss_clip": 1.06064749, "balance_loss_mlp": 1.025123, "epoch": 0.18601575181867372, "flos": 23549966726400.0, "grad_norm": 2.1781847928099105, "language_loss": 0.69898337, "learning_rate": 3.750191094732564e-06, "loss": 0.72128808, "num_input_tokens_seen": 32862990, "step": 1547, "time_per_iteration": 3.0335795879364014 }, { "auxiliary_loss_clip": 0.01199985, "auxiliary_loss_mlp": 0.01062479, "balance_loss_clip": 1.06192756, "balance_loss_mlp": 1.01934862, "epoch": 0.1861359947093128, "flos": 26360192108160.0, "grad_norm": 1.8232876786400334, "language_loss": 0.75391668, "learning_rate": 3.7498139785168313e-06, "loss": 0.77654135, "num_input_tokens_seen": 32883595, "step": 1548, "time_per_iteration": 2.920185089111328 }, { "auxiliary_loss_clip": 0.01220388, "auxiliary_loss_mlp": 0.01035901, "balance_loss_clip": 1.06253386, "balance_loss_mlp": 1.0255357, "epoch": 0.1862562375999519, "flos": 23331198942720.0, "grad_norm": 1.891518445402486, "language_loss": 0.77293336, "learning_rate": 3.749436596855709e-06, "loss": 0.79549623, "num_input_tokens_seen": 32902895, "step": 1549, "time_per_iteration": 2.773587226867676 }, { "auxiliary_loss_clip": 0.01214098, "auxiliary_loss_mlp": 0.01037262, "balance_loss_clip": 1.06034589, "balance_loss_mlp": 1.02694416, "epoch": 0.186376480490591, "flos": 16648222942080.0, "grad_norm": 2.0245501340333116, "language_loss": 0.90631062, "learning_rate": 3.749058949806446e-06, "loss": 0.92882425, "num_input_tokens_seen": 32919620, "step": 1550, "time_per_iteration": 2.7250442504882812 }, { "auxiliary_loss_clip": 0.01219216, "auxiliary_loss_mlp": 0.0103824, "balance_loss_clip": 1.05783641, "balance_loss_mlp": 1.02737427, "epoch": 0.18649672338123008, "flos": 21468656039040.0, "grad_norm": 1.7452233572231808, "language_loss": 0.84516859, "learning_rate": 3.748681037426331e-06, "loss": 0.86774319, "num_input_tokens_seen": 32938830, "step": 1551, "time_per_iteration": 2.8508846759796143 }, { "auxiliary_loss_clip": 0.01225695, "auxiliary_loss_mlp": 0.01039952, "balance_loss_clip": 1.06179643, "balance_loss_mlp": 1.02983105, "epoch": 0.1866169662718692, "flos": 12312728386560.0, "grad_norm": 2.5034276177145554, "language_loss": 0.91399646, "learning_rate": 3.7483028597726936e-06, "loss": 0.93665296, "num_input_tokens_seen": 32955600, "step": 1552, "time_per_iteration": 3.5829484462738037 }, { "auxiliary_loss_clip": 0.01216299, "auxiliary_loss_mlp": 0.01039639, "balance_loss_clip": 1.06291866, "balance_loss_mlp": 1.02858841, "epoch": 0.18673720916250827, "flos": 23581280407680.0, "grad_norm": 2.1194666456853626, "language_loss": 0.63206613, "learning_rate": 3.7479244169029017e-06, "loss": 0.65462548, "num_input_tokens_seen": 32975390, "step": 1553, "time_per_iteration": 2.908609390258789 }, { "auxiliary_loss_clip": 0.01221691, "auxiliary_loss_mlp": 0.01031977, "balance_loss_clip": 1.05876553, "balance_loss_mlp": 1.02108741, "epoch": 0.18685745205314735, "flos": 19718370115200.0, "grad_norm": 2.5789276197857016, "language_loss": 0.73058343, "learning_rate": 3.7475457088743658e-06, "loss": 0.75312012, "num_input_tokens_seen": 32992640, "step": 1554, "time_per_iteration": 4.716689348220825 }, { "auxiliary_loss_clip": 0.01206994, "auxiliary_loss_mlp": 0.01031393, "balance_loss_clip": 1.06022525, "balance_loss_mlp": 1.02065849, "epoch": 0.18697769494378644, "flos": 34204123589760.0, "grad_norm": 1.9810770613233752, "language_loss": 0.75018883, "learning_rate": 3.7471667357445348e-06, "loss": 0.7725727, "num_input_tokens_seen": 33012470, "step": 1555, "time_per_iteration": 3.8924760818481445 }, { "auxiliary_loss_clip": 0.01201042, "auxiliary_loss_mlp": 0.0103855, "balance_loss_clip": 1.05694413, "balance_loss_mlp": 1.0277077, "epoch": 0.18709793783442555, "flos": 34241327101440.0, "grad_norm": 2.712908485692449, "language_loss": 0.7239387, "learning_rate": 3.7467874975709e-06, "loss": 0.74633455, "num_input_tokens_seen": 33033275, "step": 1556, "time_per_iteration": 2.88730525970459 }, { "auxiliary_loss_clip": 0.01228539, "auxiliary_loss_mlp": 0.0103563, "balance_loss_clip": 1.06513, "balance_loss_mlp": 1.02517533, "epoch": 0.18721818072506463, "flos": 40734550529280.0, "grad_norm": 2.178904034674306, "language_loss": 0.78408712, "learning_rate": 3.7464079944109904e-06, "loss": 0.80672884, "num_input_tokens_seen": 33055135, "step": 1557, "time_per_iteration": 2.901276111602783 }, { "auxiliary_loss_clip": 0.0121704, "auxiliary_loss_mlp": 0.0103032, "balance_loss_clip": 1.06218147, "balance_loss_mlp": 1.02074111, "epoch": 0.18733842361570371, "flos": 22157386392960.0, "grad_norm": 2.287008515802667, "language_loss": 0.77439713, "learning_rate": 3.746028226322376e-06, "loss": 0.79687071, "num_input_tokens_seen": 33071015, "step": 1558, "time_per_iteration": 2.7777860164642334 }, { "auxiliary_loss_clip": 0.01211959, "auxiliary_loss_mlp": 0.0103923, "balance_loss_clip": 1.06039858, "balance_loss_mlp": 1.02898383, "epoch": 0.18745866650634282, "flos": 18914940656640.0, "grad_norm": 1.9219303399403769, "language_loss": 0.74969935, "learning_rate": 3.745648193362669e-06, "loss": 0.77221125, "num_input_tokens_seen": 33090370, "step": 1559, "time_per_iteration": 2.7661805152893066 }, { "auxiliary_loss_clip": 0.01216016, "auxiliary_loss_mlp": 0.01035131, "balance_loss_clip": 1.06193364, "balance_loss_mlp": 1.02552879, "epoch": 0.1875789093969819, "flos": 19314626267520.0, "grad_norm": 2.0483121634513366, "language_loss": 0.7214148, "learning_rate": 3.745267895589518e-06, "loss": 0.74392629, "num_input_tokens_seen": 33108910, "step": 1560, "time_per_iteration": 2.7058491706848145 }, { "auxiliary_loss_clip": 0.01217766, "auxiliary_loss_mlp": 0.01034693, "balance_loss_clip": 1.06128812, "balance_loss_mlp": 1.02469707, "epoch": 0.187699152287621, "flos": 17018965169280.0, "grad_norm": 2.022617222952523, "language_loss": 0.82055491, "learning_rate": 3.7448873330606154e-06, "loss": 0.84307957, "num_input_tokens_seen": 33126680, "step": 1561, "time_per_iteration": 2.725571393966675 }, { "auxiliary_loss_clip": 0.01208164, "auxiliary_loss_mlp": 0.01042264, "balance_loss_clip": 1.0641911, "balance_loss_mlp": 1.03179789, "epoch": 0.18781939517826007, "flos": 22346384780160.0, "grad_norm": 2.136832349678852, "language_loss": 0.87380886, "learning_rate": 3.7445065058336914e-06, "loss": 0.89631319, "num_input_tokens_seen": 33145550, "step": 1562, "time_per_iteration": 2.7656655311584473 }, { "auxiliary_loss_clip": 0.01201522, "auxiliary_loss_mlp": 0.01036401, "balance_loss_clip": 1.06211782, "balance_loss_mlp": 1.02686477, "epoch": 0.18793963806889918, "flos": 14611478054400.0, "grad_norm": 2.2760683743853476, "language_loss": 0.86698908, "learning_rate": 3.7441254139665176e-06, "loss": 0.8893683, "num_input_tokens_seen": 33161735, "step": 1563, "time_per_iteration": 2.8061158657073975 }, { "auxiliary_loss_clip": 0.01229529, "auxiliary_loss_mlp": 0.01038691, "balance_loss_clip": 1.06669211, "balance_loss_mlp": 1.02840877, "epoch": 0.18805988095953827, "flos": 17457075354240.0, "grad_norm": 1.8200534276622673, "language_loss": 0.82516146, "learning_rate": 3.743744057516905e-06, "loss": 0.84784365, "num_input_tokens_seen": 33179795, "step": 1564, "time_per_iteration": 2.797281503677368 }, { "auxiliary_loss_clip": 0.01212197, "auxiliary_loss_mlp": 0.01043302, "balance_loss_clip": 1.06357527, "balance_loss_mlp": 1.03300238, "epoch": 0.18818012385017735, "flos": 15043877976960.0, "grad_norm": 3.203012314740725, "language_loss": 0.87281036, "learning_rate": 3.743362436542706e-06, "loss": 0.89536536, "num_input_tokens_seen": 33194485, "step": 1565, "time_per_iteration": 2.8271493911743164 }, { "auxiliary_loss_clip": 0.01223437, "auxiliary_loss_mlp": 0.01031172, "balance_loss_clip": 1.06089866, "balance_loss_mlp": 1.02130747, "epoch": 0.18830036674081646, "flos": 47551975136640.0, "grad_norm": 1.781493316989938, "language_loss": 0.76718765, "learning_rate": 3.7429805511018115e-06, "loss": 0.78973377, "num_input_tokens_seen": 33216145, "step": 1566, "time_per_iteration": 2.9638831615448 }, { "auxiliary_loss_clip": 0.01216208, "auxiliary_loss_mlp": 0.01074456, "balance_loss_clip": 1.06820512, "balance_loss_mlp": 1.02987838, "epoch": 0.18842060963145554, "flos": 30044626698240.0, "grad_norm": 2.185107874599688, "language_loss": 0.77955395, "learning_rate": 3.7425984012521524e-06, "loss": 0.80246061, "num_input_tokens_seen": 33236345, "step": 1567, "time_per_iteration": 2.8843863010406494 }, { "auxiliary_loss_clip": 0.01105846, "auxiliary_loss_mlp": 0.01037487, "balance_loss_clip": 1.02245653, "balance_loss_mlp": 1.00008142, "epoch": 0.18854085252209463, "flos": 70318372625280.0, "grad_norm": 0.9154503614935997, "language_loss": 0.60382771, "learning_rate": 3.7422159870517025e-06, "loss": 0.62526107, "num_input_tokens_seen": 33301600, "step": 1568, "time_per_iteration": 3.381448268890381 }, { "auxiliary_loss_clip": 0.01215871, "auxiliary_loss_mlp": 0.0103393, "balance_loss_clip": 1.06191337, "balance_loss_mlp": 1.023839, "epoch": 0.1886610954127337, "flos": 21289318410240.0, "grad_norm": 1.946061993439021, "language_loss": 0.78950822, "learning_rate": 3.7418333085584717e-06, "loss": 0.81200624, "num_input_tokens_seen": 33322785, "step": 1569, "time_per_iteration": 2.7682180404663086 }, { "auxiliary_loss_clip": 0.012185, "auxiliary_loss_mlp": 0.01040417, "balance_loss_clip": 1.0653317, "balance_loss_mlp": 1.02933681, "epoch": 0.18878133830337282, "flos": 17266819991040.0, "grad_norm": 2.330783463765769, "language_loss": 0.90823305, "learning_rate": 3.7414503658305128e-06, "loss": 0.93082225, "num_input_tokens_seen": 33340020, "step": 1570, "time_per_iteration": 2.8198299407958984 }, { "auxiliary_loss_clip": 0.01215346, "auxiliary_loss_mlp": 0.01034792, "balance_loss_clip": 1.0596354, "balance_loss_mlp": 1.02434337, "epoch": 0.1889015811940119, "flos": 25775207210880.0, "grad_norm": 2.8472052796232052, "language_loss": 0.77286214, "learning_rate": 3.7410671589259185e-06, "loss": 0.79536343, "num_input_tokens_seen": 33358620, "step": 1571, "time_per_iteration": 2.7832858562469482 }, { "auxiliary_loss_clip": 0.01230313, "auxiliary_loss_mlp": 0.01032176, "balance_loss_clip": 1.06576467, "balance_loss_mlp": 1.02179873, "epoch": 0.18902182408465099, "flos": 21032197879680.0, "grad_norm": 4.5324284421739565, "language_loss": 0.79723155, "learning_rate": 3.7406836879028205e-06, "loss": 0.81985641, "num_input_tokens_seen": 33378845, "step": 1572, "time_per_iteration": 2.6894490718841553 }, { "auxiliary_loss_clip": 0.01220728, "auxiliary_loss_mlp": 0.01037095, "balance_loss_clip": 1.06417942, "balance_loss_mlp": 1.02740347, "epoch": 0.1891420669752901, "flos": 22272121411200.0, "grad_norm": 2.8415242354108146, "language_loss": 0.76621091, "learning_rate": 3.7402999528193907e-06, "loss": 0.78878915, "num_input_tokens_seen": 33398345, "step": 1573, "time_per_iteration": 2.7276084423065186 }, { "auxiliary_loss_clip": 0.01204683, "auxiliary_loss_mlp": 0.01072093, "balance_loss_clip": 1.06524837, "balance_loss_mlp": 1.02757621, "epoch": 0.18926230986592918, "flos": 22017802141440.0, "grad_norm": 2.597622017121826, "language_loss": 0.85738885, "learning_rate": 3.739915953733842e-06, "loss": 0.88015658, "num_input_tokens_seen": 33416390, "step": 1574, "time_per_iteration": 2.880479097366333 }, { "auxiliary_loss_clip": 0.01222804, "auxiliary_loss_mlp": 0.01039538, "balance_loss_clip": 1.06029534, "balance_loss_mlp": 1.02972662, "epoch": 0.18938255275656826, "flos": 24462672336000.0, "grad_norm": 1.9789661715196638, "language_loss": 0.81810689, "learning_rate": 3.7395316907044264e-06, "loss": 0.84073031, "num_input_tokens_seen": 33437175, "step": 1575, "time_per_iteration": 2.8647477626800537 }, { "auxiliary_loss_clip": 0.01218593, "auxiliary_loss_mlp": 0.01044782, "balance_loss_clip": 1.05916178, "balance_loss_mlp": 1.0332067, "epoch": 0.18950279564720737, "flos": 24427049022720.0, "grad_norm": 1.5727949569396882, "language_loss": 0.79212064, "learning_rate": 3.7391471637894364e-06, "loss": 0.81475437, "num_input_tokens_seen": 33459440, "step": 1576, "time_per_iteration": 2.78277850151062 }, { "auxiliary_loss_clip": 0.01215413, "auxiliary_loss_mlp": 0.01034472, "balance_loss_clip": 1.06011558, "balance_loss_mlp": 1.02420223, "epoch": 0.18962303853784646, "flos": 19756291898880.0, "grad_norm": 1.8046075364901586, "language_loss": 0.85529631, "learning_rate": 3.738762373047205e-06, "loss": 0.8777951, "num_input_tokens_seen": 33479360, "step": 1577, "time_per_iteration": 2.865567445755005 }, { "auxiliary_loss_clip": 0.01214423, "auxiliary_loss_mlp": 0.01044452, "balance_loss_clip": 1.06279337, "balance_loss_mlp": 1.0348016, "epoch": 0.18974328142848554, "flos": 21032054225280.0, "grad_norm": 2.003934707218376, "language_loss": 0.83494788, "learning_rate": 3.738377318536103e-06, "loss": 0.85753667, "num_input_tokens_seen": 33499245, "step": 1578, "time_per_iteration": 3.6784424781799316 }, { "auxiliary_loss_clip": 0.01218153, "auxiliary_loss_mlp": 0.01034637, "balance_loss_clip": 1.06070948, "balance_loss_mlp": 1.02490997, "epoch": 0.18986352431912462, "flos": 12966122736000.0, "grad_norm": 3.2975560997459734, "language_loss": 0.71216893, "learning_rate": 3.7379920003145447e-06, "loss": 0.73469687, "num_input_tokens_seen": 33513520, "step": 1579, "time_per_iteration": 2.7461307048797607 }, { "auxiliary_loss_clip": 0.01213593, "auxiliary_loss_mlp": 0.0104257, "balance_loss_clip": 1.06824732, "balance_loss_mlp": 1.03241336, "epoch": 0.18998376720976373, "flos": 23767908497280.0, "grad_norm": 2.3697185591925707, "language_loss": 0.83529675, "learning_rate": 3.7376064184409817e-06, "loss": 0.85785842, "num_input_tokens_seen": 33533100, "step": 1580, "time_per_iteration": 4.957995414733887 }, { "auxiliary_loss_clip": 0.01214217, "auxiliary_loss_mlp": 0.0103492, "balance_loss_clip": 1.06459069, "balance_loss_mlp": 1.02467942, "epoch": 0.19010401010040281, "flos": 22966023323520.0, "grad_norm": 1.4209822496657472, "language_loss": 0.86884803, "learning_rate": 3.7372205729739063e-06, "loss": 0.89133936, "num_input_tokens_seen": 33554915, "step": 1581, "time_per_iteration": 3.787034511566162 }, { "auxiliary_loss_clip": 0.01225821, "auxiliary_loss_mlp": 0.01038541, "balance_loss_clip": 1.06318581, "balance_loss_mlp": 1.02846813, "epoch": 0.1902242529910419, "flos": 19135647774720.0, "grad_norm": 13.728945768322882, "language_loss": 0.71838802, "learning_rate": 3.7368344639718514e-06, "loss": 0.74103165, "num_input_tokens_seen": 33572850, "step": 1582, "time_per_iteration": 2.7225332260131836 }, { "auxiliary_loss_clip": 0.0122235, "auxiliary_loss_mlp": 0.01033276, "balance_loss_clip": 1.06178677, "balance_loss_mlp": 1.02363217, "epoch": 0.190344495881681, "flos": 25483935824640.0, "grad_norm": 1.8174915020463933, "language_loss": 0.80283582, "learning_rate": 3.7364480914933895e-06, "loss": 0.82539213, "num_input_tokens_seen": 33593090, "step": 1583, "time_per_iteration": 2.9708104133605957 }, { "auxiliary_loss_clip": 0.01206156, "auxiliary_loss_mlp": 0.01064479, "balance_loss_clip": 1.06104684, "balance_loss_mlp": 1.01805007, "epoch": 0.1904647387723201, "flos": 26792843425920.0, "grad_norm": 3.290047823697397, "language_loss": 0.80803686, "learning_rate": 3.7360614555971325e-06, "loss": 0.83074319, "num_input_tokens_seen": 33612745, "step": 1584, "time_per_iteration": 2.9749326705932617 }, { "auxiliary_loss_clip": 0.01216797, "auxiliary_loss_mlp": 0.01072908, "balance_loss_clip": 1.06072044, "balance_loss_mlp": 1.02661705, "epoch": 0.19058498166295917, "flos": 23987753688960.0, "grad_norm": 1.9540420013618804, "language_loss": 0.85101265, "learning_rate": 3.735674556341733e-06, "loss": 0.87390971, "num_input_tokens_seen": 33632360, "step": 1585, "time_per_iteration": 2.8969967365264893 }, { "auxiliary_loss_clip": 0.01210723, "auxiliary_loss_mlp": 0.01042256, "balance_loss_clip": 1.06137061, "balance_loss_mlp": 1.03168774, "epoch": 0.19070522455359826, "flos": 28293299280000.0, "grad_norm": 2.245002704765353, "language_loss": 0.83000058, "learning_rate": 3.7352873937858835e-06, "loss": 0.85253036, "num_input_tokens_seen": 33653895, "step": 1586, "time_per_iteration": 2.8955798149108887 }, { "auxiliary_loss_clip": 0.01203403, "auxiliary_loss_mlp": 0.01071685, "balance_loss_clip": 1.05998564, "balance_loss_mlp": 1.02650309, "epoch": 0.19082546744423737, "flos": 25660220797440.0, "grad_norm": 2.0781978067649147, "language_loss": 0.71974784, "learning_rate": 3.734899967988316e-06, "loss": 0.7424987, "num_input_tokens_seen": 33672075, "step": 1587, "time_per_iteration": 2.80763840675354 }, { "auxiliary_loss_clip": 0.01206093, "auxiliary_loss_mlp": 0.01029276, "balance_loss_clip": 1.06086683, "balance_loss_mlp": 1.01997733, "epoch": 0.19094571033487645, "flos": 19719483436800.0, "grad_norm": 2.0311782826765827, "language_loss": 0.83897901, "learning_rate": 3.7345122790078026e-06, "loss": 0.86133265, "num_input_tokens_seen": 33689640, "step": 1588, "time_per_iteration": 2.8046646118164062 }, { "auxiliary_loss_clip": 0.0121798, "auxiliary_loss_mlp": 0.01037262, "balance_loss_clip": 1.06089163, "balance_loss_mlp": 1.02665854, "epoch": 0.19106595322551553, "flos": 21616320850560.0, "grad_norm": 7.526766882430357, "language_loss": 0.92493987, "learning_rate": 3.7341243269031556e-06, "loss": 0.94749236, "num_input_tokens_seen": 33708630, "step": 1589, "time_per_iteration": 2.679655075073242 }, { "auxiliary_loss_clip": 0.01210737, "auxiliary_loss_mlp": 0.0103138, "balance_loss_clip": 1.06245971, "balance_loss_mlp": 1.02195668, "epoch": 0.19118619611615464, "flos": 29896890059520.0, "grad_norm": 1.9075744945971305, "language_loss": 0.77581394, "learning_rate": 3.7337361117332275e-06, "loss": 0.79823506, "num_input_tokens_seen": 33730370, "step": 1590, "time_per_iteration": 2.7987053394317627 }, { "auxiliary_loss_clip": 0.0121447, "auxiliary_loss_mlp": 0.01031301, "balance_loss_clip": 1.0600282, "balance_loss_mlp": 1.02103686, "epoch": 0.19130643900679373, "flos": 17273428093440.0, "grad_norm": 3.2513408236095422, "language_loss": 0.77151692, "learning_rate": 3.7333476335569087e-06, "loss": 0.79397464, "num_input_tokens_seen": 33748370, "step": 1591, "time_per_iteration": 2.7224931716918945 }, { "auxiliary_loss_clip": 0.01220848, "auxiliary_loss_mlp": 0.01037694, "balance_loss_clip": 1.0655036, "balance_loss_mlp": 1.02825809, "epoch": 0.1914266818974328, "flos": 24826339584000.0, "grad_norm": 4.312776194206639, "language_loss": 0.66941983, "learning_rate": 3.7329588924331325e-06, "loss": 0.69200522, "num_input_tokens_seen": 33769575, "step": 1592, "time_per_iteration": 2.82220721244812 }, { "auxiliary_loss_clip": 0.01205538, "auxiliary_loss_mlp": 0.01030879, "balance_loss_clip": 1.06027055, "balance_loss_mlp": 1.02117503, "epoch": 0.1915469247880719, "flos": 18952467390720.0, "grad_norm": 2.200530731574606, "language_loss": 0.82591641, "learning_rate": 3.732569888420871e-06, "loss": 0.84828055, "num_input_tokens_seen": 33789110, "step": 1593, "time_per_iteration": 2.7417244911193848 }, { "auxiliary_loss_clip": 0.01220899, "auxiliary_loss_mlp": 0.01041313, "balance_loss_clip": 1.05775106, "balance_loss_mlp": 1.03113866, "epoch": 0.191667167678711, "flos": 21032952065280.0, "grad_norm": 2.2892978048785024, "language_loss": 0.82346487, "learning_rate": 3.732180621579134e-06, "loss": 0.84608698, "num_input_tokens_seen": 33808325, "step": 1594, "time_per_iteration": 2.7214858531951904 }, { "auxiliary_loss_clip": 0.0122609, "auxiliary_loss_mlp": 0.01046475, "balance_loss_clip": 1.06562352, "balance_loss_mlp": 1.03638363, "epoch": 0.1917874105693501, "flos": 34237663914240.0, "grad_norm": 1.9492320595233141, "language_loss": 0.81131476, "learning_rate": 3.7317910919669745e-06, "loss": 0.8340404, "num_input_tokens_seen": 33829520, "step": 1595, "time_per_iteration": 2.9888720512390137 }, { "auxiliary_loss_clip": 0.01220034, "auxiliary_loss_mlp": 0.01037071, "balance_loss_clip": 1.06319737, "balance_loss_mlp": 1.02731395, "epoch": 0.19190765345998917, "flos": 23550613171200.0, "grad_norm": 2.125427176897367, "language_loss": 0.76320088, "learning_rate": 3.7314012996434826e-06, "loss": 0.78577185, "num_input_tokens_seen": 33848250, "step": 1596, "time_per_iteration": 2.7429966926574707 }, { "auxiliary_loss_clip": 0.01215265, "auxiliary_loss_mlp": 0.01037114, "balance_loss_clip": 1.06131339, "balance_loss_mlp": 1.02760744, "epoch": 0.19202789635062828, "flos": 19861330245120.0, "grad_norm": 2.161377520613993, "language_loss": 0.81088638, "learning_rate": 3.7310112446677907e-06, "loss": 0.83341014, "num_input_tokens_seen": 33866160, "step": 1597, "time_per_iteration": 2.8790762424468994 }, { "auxiliary_loss_clip": 0.01225438, "auxiliary_loss_mlp": 0.01037587, "balance_loss_clip": 1.06317163, "balance_loss_mlp": 1.02733469, "epoch": 0.19214813924126736, "flos": 20922957642240.0, "grad_norm": 2.54944868964411, "language_loss": 0.69348609, "learning_rate": 3.7306209270990695e-06, "loss": 0.71611637, "num_input_tokens_seen": 33884165, "step": 1598, "time_per_iteration": 2.66438627243042 }, { "auxiliary_loss_clip": 0.01217807, "auxiliary_loss_mlp": 0.01036633, "balance_loss_clip": 1.06278098, "balance_loss_mlp": 1.02699447, "epoch": 0.19226838213190645, "flos": 26359725231360.0, "grad_norm": 2.3276176053318105, "language_loss": 0.86891782, "learning_rate": 3.7302303469965292e-06, "loss": 0.89146221, "num_input_tokens_seen": 33903705, "step": 1599, "time_per_iteration": 2.775125026702881 }, { "auxiliary_loss_clip": 0.01216908, "auxiliary_loss_mlp": 0.01035935, "balance_loss_clip": 1.06142998, "balance_loss_mlp": 1.0251888, "epoch": 0.19238862502254553, "flos": 20850525866880.0, "grad_norm": 2.0105905122100696, "language_loss": 0.70811236, "learning_rate": 3.7298395044194206e-06, "loss": 0.73064083, "num_input_tokens_seen": 33922515, "step": 1600, "time_per_iteration": 2.786820650100708 }, { "auxiliary_loss_clip": 0.01223076, "auxiliary_loss_mlp": 0.01037378, "balance_loss_clip": 1.0621922, "balance_loss_mlp": 1.02734053, "epoch": 0.19250886791318464, "flos": 21726063878400.0, "grad_norm": 1.8908160232590991, "language_loss": 0.94161773, "learning_rate": 3.7294483994270356e-06, "loss": 0.96422219, "num_input_tokens_seen": 33940840, "step": 1601, "time_per_iteration": 2.734833240509033 }, { "auxiliary_loss_clip": 0.01201144, "auxiliary_loss_mlp": 0.01033922, "balance_loss_clip": 1.0612421, "balance_loss_mlp": 1.02489161, "epoch": 0.19262911080382372, "flos": 23367827836800.0, "grad_norm": 2.1742264664642037, "language_loss": 0.77897441, "learning_rate": 3.7290570320787033e-06, "loss": 0.80132508, "num_input_tokens_seen": 33960420, "step": 1602, "time_per_iteration": 2.774101972579956 }, { "auxiliary_loss_clip": 0.01214312, "auxiliary_loss_mlp": 0.0103271, "balance_loss_clip": 1.05925822, "balance_loss_mlp": 1.02272069, "epoch": 0.1927493536944628, "flos": 21943502858880.0, "grad_norm": 1.9789119457554407, "language_loss": 0.71019149, "learning_rate": 3.728665402433793e-06, "loss": 0.73266166, "num_input_tokens_seen": 33978990, "step": 1603, "time_per_iteration": 2.7125656604766846 }, { "auxiliary_loss_clip": 0.01213795, "auxiliary_loss_mlp": 0.01039674, "balance_loss_clip": 1.06304121, "balance_loss_mlp": 1.03017306, "epoch": 0.19286959658510192, "flos": 16545590807040.0, "grad_norm": 2.6373037582612056, "language_loss": 0.86199391, "learning_rate": 3.7282735105517164e-06, "loss": 0.88452864, "num_input_tokens_seen": 33997115, "step": 1604, "time_per_iteration": 3.642531156539917 }, { "auxiliary_loss_clip": 0.01215346, "auxiliary_loss_mlp": 0.01032417, "balance_loss_clip": 1.06134522, "balance_loss_mlp": 1.02204573, "epoch": 0.192989839475741, "flos": 21616967295360.0, "grad_norm": 2.303384256313871, "language_loss": 0.67495537, "learning_rate": 3.727881356491922e-06, "loss": 0.69743299, "num_input_tokens_seen": 34015525, "step": 1605, "time_per_iteration": 2.745689630508423 }, { "auxiliary_loss_clip": 0.01220497, "auxiliary_loss_mlp": 0.01036884, "balance_loss_clip": 1.06156933, "balance_loss_mlp": 1.02725244, "epoch": 0.19311008236638008, "flos": 19281516906240.0, "grad_norm": 2.2364271406199836, "language_loss": 0.75464588, "learning_rate": 3.7274889403139002e-06, "loss": 0.77721965, "num_input_tokens_seen": 34033150, "step": 1606, "time_per_iteration": 2.676513433456421 }, { "auxiliary_loss_clip": 0.01199516, "auxiliary_loss_mlp": 0.01025659, "balance_loss_clip": 1.06003451, "balance_loss_mlp": 1.01630139, "epoch": 0.1932303252570192, "flos": 28652369587200.0, "grad_norm": 3.5768499706343464, "language_loss": 0.78358138, "learning_rate": 3.727096262077179e-06, "loss": 0.8058331, "num_input_tokens_seen": 34052145, "step": 1607, "time_per_iteration": 4.712014198303223 }, { "auxiliary_loss_clip": 0.01213452, "auxiliary_loss_mlp": 0.01040833, "balance_loss_clip": 1.05731463, "balance_loss_mlp": 1.03076625, "epoch": 0.19335056814765827, "flos": 18368990864640.0, "grad_norm": 1.7733300548888753, "language_loss": 0.85125369, "learning_rate": 3.7267033218413285e-06, "loss": 0.87379658, "num_input_tokens_seen": 34069940, "step": 1608, "time_per_iteration": 2.682288885116577 }, { "auxiliary_loss_clip": 0.01206841, "auxiliary_loss_mlp": 0.01040185, "balance_loss_clip": 1.06078219, "balance_loss_mlp": 1.03040981, "epoch": 0.19347081103829736, "flos": 13260877741440.0, "grad_norm": 2.094780997212621, "language_loss": 0.81233883, "learning_rate": 3.726310119665957e-06, "loss": 0.83480906, "num_input_tokens_seen": 34086275, "step": 1609, "time_per_iteration": 2.821974754333496 }, { "auxiliary_loss_clip": 0.01217736, "auxiliary_loss_mlp": 0.0103885, "balance_loss_clip": 1.06239462, "balance_loss_mlp": 1.02900887, "epoch": 0.19359105392893644, "flos": 20300122788480.0, "grad_norm": 1.7822915714100789, "language_loss": 0.85609829, "learning_rate": 3.725916655610713e-06, "loss": 0.87866414, "num_input_tokens_seen": 34105605, "step": 1610, "time_per_iteration": 2.7234909534454346 }, { "auxiliary_loss_clip": 0.01213207, "auxiliary_loss_mlp": 0.01035636, "balance_loss_clip": 1.06354785, "balance_loss_mlp": 1.02500224, "epoch": 0.19371129681957555, "flos": 20484596062080.0, "grad_norm": 2.463099565382263, "language_loss": 0.75327557, "learning_rate": 3.725522929735284e-06, "loss": 0.77576399, "num_input_tokens_seen": 34122540, "step": 1611, "time_per_iteration": 2.8103692531585693 }, { "auxiliary_loss_clip": 0.01223109, "auxiliary_loss_mlp": 0.01033553, "balance_loss_clip": 1.06157053, "balance_loss_mlp": 1.02445769, "epoch": 0.19383153971021463, "flos": 30445497457920.0, "grad_norm": 2.3525850324270703, "language_loss": 0.74058515, "learning_rate": 3.725128942099399e-06, "loss": 0.76315176, "num_input_tokens_seen": 34142940, "step": 1612, "time_per_iteration": 2.8090732097625732 }, { "auxiliary_loss_clip": 0.01206348, "auxiliary_loss_mlp": 0.01028363, "balance_loss_clip": 1.0593766, "balance_loss_mlp": 1.01871264, "epoch": 0.19395178260085372, "flos": 24569937325440.0, "grad_norm": 1.8708522809682204, "language_loss": 0.80076331, "learning_rate": 3.7247346927628245e-06, "loss": 0.82311046, "num_input_tokens_seen": 34162875, "step": 1613, "time_per_iteration": 2.8200466632843018 }, { "auxiliary_loss_clip": 0.01217328, "auxiliary_loss_mlp": 0.01080773, "balance_loss_clip": 1.0636884, "balance_loss_mlp": 1.0336287, "epoch": 0.19407202549149283, "flos": 28950608211840.0, "grad_norm": 1.8764178755343894, "language_loss": 0.78997743, "learning_rate": 3.7243401817853694e-06, "loss": 0.81295836, "num_input_tokens_seen": 34183565, "step": 1614, "time_per_iteration": 2.8154380321502686 }, { "auxiliary_loss_clip": 0.01211099, "auxiliary_loss_mlp": 0.01037655, "balance_loss_clip": 1.060413, "balance_loss_mlp": 1.02797484, "epoch": 0.1941922683821319, "flos": 18004497603840.0, "grad_norm": 1.9431862195628404, "language_loss": 0.71776402, "learning_rate": 3.723945409226879e-06, "loss": 0.74025154, "num_input_tokens_seen": 34202055, "step": 1615, "time_per_iteration": 2.7910401821136475 }, { "auxiliary_loss_clip": 0.01217899, "auxiliary_loss_mlp": 0.01027279, "balance_loss_clip": 1.06006968, "balance_loss_mlp": 1.01758695, "epoch": 0.194312511272771, "flos": 9720337034880.0, "grad_norm": 2.551859630305529, "language_loss": 0.80186701, "learning_rate": 3.723550375147241e-06, "loss": 0.82431883, "num_input_tokens_seen": 34216830, "step": 1616, "time_per_iteration": 2.7749834060668945 }, { "auxiliary_loss_clip": 0.01203063, "auxiliary_loss_mlp": 0.01033961, "balance_loss_clip": 1.06064618, "balance_loss_mlp": 1.02381682, "epoch": 0.19443275416341008, "flos": 27016208150400.0, "grad_norm": 1.8691217929315063, "language_loss": 0.79924935, "learning_rate": 3.7231550796063816e-06, "loss": 0.82161963, "num_input_tokens_seen": 34236840, "step": 1617, "time_per_iteration": 2.8988046646118164 }, { "auxiliary_loss_clip": 0.01222592, "auxiliary_loss_mlp": 0.01032085, "balance_loss_clip": 1.06490254, "balance_loss_mlp": 1.02129602, "epoch": 0.1945529970540492, "flos": 15846625077120.0, "grad_norm": 2.1832238548057505, "language_loss": 0.64995354, "learning_rate": 3.722759522664266e-06, "loss": 0.67250031, "num_input_tokens_seen": 34254140, "step": 1618, "time_per_iteration": 3.0115230083465576 }, { "auxiliary_loss_clip": 0.01210693, "auxiliary_loss_mlp": 0.01031729, "balance_loss_clip": 1.06264579, "balance_loss_mlp": 1.02141702, "epoch": 0.19467323994468827, "flos": 19314985403520.0, "grad_norm": 1.84225970160816, "language_loss": 0.81717873, "learning_rate": 3.7223637043809016e-06, "loss": 0.83960295, "num_input_tokens_seen": 34273120, "step": 1619, "time_per_iteration": 2.8107173442840576 }, { "auxiliary_loss_clip": 0.01213215, "auxiliary_loss_mlp": 0.01033369, "balance_loss_clip": 1.06111264, "balance_loss_mlp": 1.02335572, "epoch": 0.19479348283532735, "flos": 24133227770880.0, "grad_norm": 2.1114205073448984, "language_loss": 0.86763799, "learning_rate": 3.7219676248163322e-06, "loss": 0.89010382, "num_input_tokens_seen": 34290285, "step": 1620, "time_per_iteration": 2.831143856048584 }, { "auxiliary_loss_clip": 0.01222848, "auxiliary_loss_mlp": 0.01036102, "balance_loss_clip": 1.06163025, "balance_loss_mlp": 1.02627277, "epoch": 0.19491372572596646, "flos": 25775638174080.0, "grad_norm": 1.8647916584228317, "language_loss": 0.93265951, "learning_rate": 3.721571284030643e-06, "loss": 0.95524901, "num_input_tokens_seen": 34310095, "step": 1621, "time_per_iteration": 2.7918872833251953 }, { "auxiliary_loss_clip": 0.01223109, "auxiliary_loss_mlp": 0.01031618, "balance_loss_clip": 1.06182134, "balance_loss_mlp": 1.02175379, "epoch": 0.19503396861660555, "flos": 19645220067840.0, "grad_norm": 2.213818231324747, "language_loss": 0.78807127, "learning_rate": 3.7211746820839587e-06, "loss": 0.81061852, "num_input_tokens_seen": 34327190, "step": 1622, "time_per_iteration": 2.8233089447021484 }, { "auxiliary_loss_clip": 0.01188583, "auxiliary_loss_mlp": 0.01038991, "balance_loss_clip": 1.06193793, "balance_loss_mlp": 1.029037, "epoch": 0.19515421150724463, "flos": 21033023892480.0, "grad_norm": 2.4729990535826674, "language_loss": 0.81126773, "learning_rate": 3.7207778190364437e-06, "loss": 0.83354342, "num_input_tokens_seen": 34345615, "step": 1623, "time_per_iteration": 2.8423871994018555 }, { "auxiliary_loss_clip": 0.01191873, "auxiliary_loss_mlp": 0.01040727, "balance_loss_clip": 1.06361341, "balance_loss_mlp": 1.03095186, "epoch": 0.1952744543978837, "flos": 32961255143040.0, "grad_norm": 1.9699176694538456, "language_loss": 0.73859912, "learning_rate": 3.720380694948302e-06, "loss": 0.76092517, "num_input_tokens_seen": 34368500, "step": 1624, "time_per_iteration": 2.8897852897644043 }, { "auxiliary_loss_clip": 0.01117552, "auxiliary_loss_mlp": 0.01004613, "balance_loss_clip": 1.02732396, "balance_loss_mlp": 1.00251448, "epoch": 0.19539469728852282, "flos": 64044312030720.0, "grad_norm": 1.0392509717219072, "language_loss": 0.71170831, "learning_rate": 3.719983309879777e-06, "loss": 0.73292994, "num_input_tokens_seen": 34428280, "step": 1625, "time_per_iteration": 3.3264262676239014 }, { "auxiliary_loss_clip": 0.01205509, "auxiliary_loss_mlp": 0.01040695, "balance_loss_clip": 1.06655848, "balance_loss_mlp": 1.03090239, "epoch": 0.1955149401791619, "flos": 13370908078080.0, "grad_norm": 1.748277344629602, "language_loss": 0.7725392, "learning_rate": 3.719585663891151e-06, "loss": 0.79500127, "num_input_tokens_seen": 34445815, "step": 1626, "time_per_iteration": 2.8549115657806396 }, { "auxiliary_loss_clip": 0.01199965, "auxiliary_loss_mlp": 0.01042018, "balance_loss_clip": 1.06461096, "balance_loss_mlp": 1.03147388, "epoch": 0.195635183069801, "flos": 18728887184640.0, "grad_norm": 2.7696847238393856, "language_loss": 0.78785455, "learning_rate": 3.719187757042747e-06, "loss": 0.81027436, "num_input_tokens_seen": 34463635, "step": 1627, "time_per_iteration": 2.836839437484741 }, { "auxiliary_loss_clip": 0.01113378, "auxiliary_loss_mlp": 0.01003308, "balance_loss_clip": 1.02602482, "balance_loss_mlp": 1.00135279, "epoch": 0.1957554259604401, "flos": 69313952615040.0, "grad_norm": 0.7483410267618045, "language_loss": 0.5490911, "learning_rate": 3.7187895893949275e-06, "loss": 0.5702579, "num_input_tokens_seen": 34530105, "step": 1628, "time_per_iteration": 3.497142791748047 }, { "auxiliary_loss_clip": 0.01195678, "auxiliary_loss_mlp": 0.01036637, "balance_loss_clip": 1.06093514, "balance_loss_mlp": 1.02662301, "epoch": 0.19587566885107918, "flos": 21069257736960.0, "grad_norm": 2.4141064286792764, "language_loss": 0.75635087, "learning_rate": 3.7183911610080937e-06, "loss": 0.77867401, "num_input_tokens_seen": 34546970, "step": 1629, "time_per_iteration": 2.7764604091644287 }, { "auxiliary_loss_clip": 0.01212985, "auxiliary_loss_mlp": 0.0103607, "balance_loss_clip": 1.06211209, "balance_loss_mlp": 1.0252521, "epoch": 0.19599591174171827, "flos": 22194661731840.0, "grad_norm": 2.8866976557942325, "language_loss": 0.74996412, "learning_rate": 3.7179924719426872e-06, "loss": 0.77245474, "num_input_tokens_seen": 34564865, "step": 1630, "time_per_iteration": 3.754629135131836 }, { "auxiliary_loss_clip": 0.01221617, "auxiliary_loss_mlp": 0.01035374, "balance_loss_clip": 1.06333923, "balance_loss_mlp": 1.02503276, "epoch": 0.19611615463235738, "flos": 23768375374080.0, "grad_norm": 4.520062086116575, "language_loss": 0.75612831, "learning_rate": 3.7175935222591885e-06, "loss": 0.77869833, "num_input_tokens_seen": 34584165, "step": 1631, "time_per_iteration": 2.7913200855255127 }, { "auxiliary_loss_clip": 0.01220916, "auxiliary_loss_mlp": 0.01041781, "balance_loss_clip": 1.06768703, "balance_loss_mlp": 1.03114188, "epoch": 0.19623639752299646, "flos": 28618218731520.0, "grad_norm": 2.3089372582548666, "language_loss": 0.74630988, "learning_rate": 3.717194312018118e-06, "loss": 0.76893687, "num_input_tokens_seen": 34603150, "step": 1632, "time_per_iteration": 4.7387707233428955 }, { "auxiliary_loss_clip": 0.01219021, "auxiliary_loss_mlp": 0.01040969, "balance_loss_clip": 1.060992, "balance_loss_mlp": 1.02971005, "epoch": 0.19635664041363554, "flos": 21032700670080.0, "grad_norm": 2.1586563596069466, "language_loss": 0.76069373, "learning_rate": 3.716794841280036e-06, "loss": 0.7832936, "num_input_tokens_seen": 34621855, "step": 1633, "time_per_iteration": 3.726792573928833 }, { "auxiliary_loss_clip": 0.01229192, "auxiliary_loss_mlp": 0.01032924, "balance_loss_clip": 1.06450295, "balance_loss_mlp": 1.0221777, "epoch": 0.19647688330427462, "flos": 18879748306560.0, "grad_norm": 2.1718383513741335, "language_loss": 0.77851927, "learning_rate": 3.7163951101055407e-06, "loss": 0.80114043, "num_input_tokens_seen": 34639915, "step": 1634, "time_per_iteration": 2.72133207321167 }, { "auxiliary_loss_clip": 0.01214034, "auxiliary_loss_mlp": 0.01044479, "balance_loss_clip": 1.06623387, "balance_loss_mlp": 1.03409576, "epoch": 0.19659712619491373, "flos": 24242503921920.0, "grad_norm": 1.8341940870176647, "language_loss": 0.79059744, "learning_rate": 3.715995118555273e-06, "loss": 0.81318253, "num_input_tokens_seen": 34659890, "step": 1635, "time_per_iteration": 2.8338334560394287 }, { "auxiliary_loss_clip": 0.01208968, "auxiliary_loss_mlp": 0.01040262, "balance_loss_clip": 1.06452513, "balance_loss_mlp": 1.02930045, "epoch": 0.19671736908555282, "flos": 24717422568960.0, "grad_norm": 2.176291692244178, "language_loss": 0.86058438, "learning_rate": 3.71559486668991e-06, "loss": 0.88307667, "num_input_tokens_seen": 34678750, "step": 1636, "time_per_iteration": 2.9192888736724854 }, { "auxiliary_loss_clip": 0.01228242, "auxiliary_loss_mlp": 0.01064601, "balance_loss_clip": 1.06534374, "balance_loss_mlp": 1.01997674, "epoch": 0.1968376119761919, "flos": 23842279607040.0, "grad_norm": 1.6043408392315446, "language_loss": 0.77738035, "learning_rate": 3.715194354570169e-06, "loss": 0.80030876, "num_input_tokens_seen": 34698755, "step": 1637, "time_per_iteration": 2.8213369846343994 }, { "auxiliary_loss_clip": 0.01218091, "auxiliary_loss_mlp": 0.01040776, "balance_loss_clip": 1.06332612, "balance_loss_mlp": 1.03062582, "epoch": 0.196957854866831, "flos": 18113917409280.0, "grad_norm": 2.0709834457999112, "language_loss": 0.83166772, "learning_rate": 3.714793582256809e-06, "loss": 0.85425639, "num_input_tokens_seen": 34715820, "step": 1638, "time_per_iteration": 2.709219455718994 }, { "auxiliary_loss_clip": 0.01221213, "auxiliary_loss_mlp": 0.01040735, "balance_loss_clip": 1.06044078, "balance_loss_mlp": 1.03103161, "epoch": 0.1970780977574701, "flos": 21653129312640.0, "grad_norm": 2.6205188829220494, "language_loss": 0.84921861, "learning_rate": 3.7143925498106253e-06, "loss": 0.87183815, "num_input_tokens_seen": 34734360, "step": 1639, "time_per_iteration": 2.759735345840454 }, { "auxiliary_loss_clip": 0.01217587, "auxiliary_loss_mlp": 0.01034071, "balance_loss_clip": 1.06117702, "balance_loss_mlp": 1.02340198, "epoch": 0.19719834064810918, "flos": 20811813984000.0, "grad_norm": 2.2786929916952396, "language_loss": 0.79481429, "learning_rate": 3.7139912572924558e-06, "loss": 0.81733084, "num_input_tokens_seen": 34753390, "step": 1640, "time_per_iteration": 2.9121389389038086 }, { "auxiliary_loss_clip": 0.01217335, "auxiliary_loss_mlp": 0.01037376, "balance_loss_clip": 1.06026173, "balance_loss_mlp": 1.02720141, "epoch": 0.19731858353874826, "flos": 23434800744960.0, "grad_norm": 2.6116568347715114, "language_loss": 0.80682838, "learning_rate": 3.7135897047631744e-06, "loss": 0.82937551, "num_input_tokens_seen": 34771275, "step": 1641, "time_per_iteration": 2.8709959983825684 }, { "auxiliary_loss_clip": 0.0121334, "auxiliary_loss_mlp": 0.0103247, "balance_loss_clip": 1.05865073, "balance_loss_mlp": 1.02254581, "epoch": 0.19743882642938737, "flos": 23988184652160.0, "grad_norm": 2.095115959733709, "language_loss": 0.76251423, "learning_rate": 3.713187892283698e-06, "loss": 0.78497243, "num_input_tokens_seen": 34790885, "step": 1642, "time_per_iteration": 2.813415765762329 }, { "auxiliary_loss_clip": 0.01208056, "auxiliary_loss_mlp": 0.01039962, "balance_loss_clip": 1.0625, "balance_loss_mlp": 1.029549, "epoch": 0.19755906932002645, "flos": 15004340081280.0, "grad_norm": 2.1903902599590555, "language_loss": 0.87196642, "learning_rate": 3.71278581991498e-06, "loss": 0.89444661, "num_input_tokens_seen": 34806745, "step": 1643, "time_per_iteration": 2.8895881175994873 }, { "auxiliary_loss_clip": 0.0121511, "auxiliary_loss_mlp": 0.01069356, "balance_loss_clip": 1.06389987, "balance_loss_mlp": 1.02350569, "epoch": 0.19767931221066554, "flos": 19494466686720.0, "grad_norm": 1.91855941724868, "language_loss": 0.79363465, "learning_rate": 3.712383487718015e-06, "loss": 0.81647927, "num_input_tokens_seen": 34824985, "step": 1644, "time_per_iteration": 2.818697452545166 }, { "auxiliary_loss_clip": 0.01195695, "auxiliary_loss_mlp": 0.01035998, "balance_loss_clip": 1.06281519, "balance_loss_mlp": 1.02641988, "epoch": 0.19779955510130465, "flos": 25737895958400.0, "grad_norm": 3.0344373607678228, "language_loss": 0.86634272, "learning_rate": 3.7119808957538365e-06, "loss": 0.88865966, "num_input_tokens_seen": 34843980, "step": 1645, "time_per_iteration": 2.8967227935791016 }, { "auxiliary_loss_clip": 0.01211659, "auxiliary_loss_mlp": 0.01036601, "balance_loss_clip": 1.0628649, "balance_loss_mlp": 1.02627158, "epoch": 0.19791979799194373, "flos": 20777699041920.0, "grad_norm": 2.346423597621551, "language_loss": 0.80147547, "learning_rate": 3.711578044083517e-06, "loss": 0.82395804, "num_input_tokens_seen": 34860780, "step": 1646, "time_per_iteration": 2.7776854038238525 }, { "auxiliary_loss_clip": 0.01216374, "auxiliary_loss_mlp": 0.01030999, "balance_loss_clip": 1.06149673, "balance_loss_mlp": 1.0209322, "epoch": 0.1980400408825828, "flos": 25589010084480.0, "grad_norm": 1.985915046244363, "language_loss": 0.74887395, "learning_rate": 3.7111749327681698e-06, "loss": 0.77134764, "num_input_tokens_seen": 34880815, "step": 1647, "time_per_iteration": 2.7633116245269775 }, { "auxiliary_loss_clip": 0.01222275, "auxiliary_loss_mlp": 0.01031174, "balance_loss_clip": 1.06266904, "balance_loss_mlp": 1.02159548, "epoch": 0.1981602837732219, "flos": 23513840622720.0, "grad_norm": 2.8339673655747, "language_loss": 0.86089796, "learning_rate": 3.7107715618689455e-06, "loss": 0.88343251, "num_input_tokens_seen": 34899790, "step": 1648, "time_per_iteration": 2.8681693077087402 }, { "auxiliary_loss_clip": 0.01218614, "auxiliary_loss_mlp": 0.01030652, "balance_loss_clip": 1.06265521, "balance_loss_mlp": 1.02093673, "epoch": 0.198280526663861, "flos": 23185365724800.0, "grad_norm": 1.4147183062281956, "language_loss": 0.8310442, "learning_rate": 3.710367931447035e-06, "loss": 0.85353684, "num_input_tokens_seen": 34921570, "step": 1649, "time_per_iteration": 2.8269214630126953 }, { "auxiliary_loss_clip": 0.01226438, "auxiliary_loss_mlp": 0.01039763, "balance_loss_clip": 1.06319273, "balance_loss_mlp": 1.02952266, "epoch": 0.1984007695545001, "flos": 21689470897920.0, "grad_norm": 2.313201270523849, "language_loss": 0.86743474, "learning_rate": 3.70996404156367e-06, "loss": 0.89009666, "num_input_tokens_seen": 34941205, "step": 1650, "time_per_iteration": 2.76649808883667 }, { "auxiliary_loss_clip": 0.01198767, "auxiliary_loss_mlp": 0.0103254, "balance_loss_clip": 1.06327581, "balance_loss_mlp": 1.02288365, "epoch": 0.19852101244513917, "flos": 36064008887040.0, "grad_norm": 1.6567231066851056, "language_loss": 0.72881758, "learning_rate": 3.7095598922801187e-06, "loss": 0.75113058, "num_input_tokens_seen": 34963280, "step": 1651, "time_per_iteration": 2.9264473915100098 }, { "auxiliary_loss_clip": 0.01223721, "auxiliary_loss_mlp": 0.01035051, "balance_loss_clip": 1.0608691, "balance_loss_mlp": 1.02536559, "epoch": 0.19864125533577828, "flos": 23105894883840.0, "grad_norm": 2.8505681019488813, "language_loss": 0.76734948, "learning_rate": 3.7091554836576914e-06, "loss": 0.78993714, "num_input_tokens_seen": 34979955, "step": 1652, "time_per_iteration": 2.7183141708374023 }, { "auxiliary_loss_clip": 0.01218336, "auxiliary_loss_mlp": 0.01067659, "balance_loss_clip": 1.06285024, "balance_loss_mlp": 1.0227294, "epoch": 0.19876149822641737, "flos": 24608505553920.0, "grad_norm": 1.6252999809734325, "language_loss": 0.82774252, "learning_rate": 3.708750815757736e-06, "loss": 0.85060251, "num_input_tokens_seen": 35000725, "step": 1653, "time_per_iteration": 2.8501336574554443 }, { "auxiliary_loss_clip": 0.01220503, "auxiliary_loss_mlp": 0.01034015, "balance_loss_clip": 1.0607574, "balance_loss_mlp": 1.02378118, "epoch": 0.19888174111705645, "flos": 32196645308160.0, "grad_norm": 4.131352192895713, "language_loss": 0.73759866, "learning_rate": 3.7083458886416407e-06, "loss": 0.76014388, "num_input_tokens_seen": 35019920, "step": 1654, "time_per_iteration": 2.7919905185699463 }, { "auxiliary_loss_clip": 0.01205012, "auxiliary_loss_mlp": 0.01037751, "balance_loss_clip": 1.06172419, "balance_loss_mlp": 1.02779686, "epoch": 0.19900198400769553, "flos": 24608469640320.0, "grad_norm": 2.8165859516134013, "language_loss": 0.88450217, "learning_rate": 3.707940702370832e-06, "loss": 0.90692979, "num_input_tokens_seen": 35040765, "step": 1655, "time_per_iteration": 2.8785324096679688 }, { "auxiliary_loss_clip": 0.01105978, "auxiliary_loss_mlp": 0.01004575, "balance_loss_clip": 1.02028561, "balance_loss_mlp": 1.00282311, "epoch": 0.19912222689833464, "flos": 67915805673600.0, "grad_norm": 0.8058742818453501, "language_loss": 0.58215559, "learning_rate": 3.707535257006777e-06, "loss": 0.60326111, "num_input_tokens_seen": 35106390, "step": 1656, "time_per_iteration": 4.286026954650879 }, { "auxiliary_loss_clip": 0.01217697, "auxiliary_loss_mlp": 0.01037699, "balance_loss_clip": 1.06250978, "balance_loss_mlp": 1.02733409, "epoch": 0.19924246978897373, "flos": 15742340916480.0, "grad_norm": 3.148876943861073, "language_loss": 0.87954736, "learning_rate": 3.707129552610981e-06, "loss": 0.90210128, "num_input_tokens_seen": 35125040, "step": 1657, "time_per_iteration": 2.841768980026245 }, { "auxiliary_loss_clip": 0.01210609, "auxiliary_loss_mlp": 0.01034842, "balance_loss_clip": 1.06414461, "balance_loss_mlp": 1.02408957, "epoch": 0.1993627126796128, "flos": 17566566986880.0, "grad_norm": 1.892715835206746, "language_loss": 0.73733783, "learning_rate": 3.70672358924499e-06, "loss": 0.75979233, "num_input_tokens_seen": 35144280, "step": 1658, "time_per_iteration": 4.756134748458862 }, { "auxiliary_loss_clip": 0.01210488, "auxiliary_loss_mlp": 0.01033791, "balance_loss_clip": 1.06463671, "balance_loss_mlp": 1.02459955, "epoch": 0.19948295557025192, "flos": 40843826680320.0, "grad_norm": 1.88615045558848, "language_loss": 0.7840957, "learning_rate": 3.706317366970386e-06, "loss": 0.80653846, "num_input_tokens_seen": 35165280, "step": 1659, "time_per_iteration": 3.879854679107666 }, { "auxiliary_loss_clip": 0.01223886, "auxiliary_loss_mlp": 0.01065471, "balance_loss_clip": 1.06003892, "balance_loss_mlp": 1.02042007, "epoch": 0.199603198460891, "flos": 25082418620160.0, "grad_norm": 3.766255872186801, "language_loss": 0.8366437, "learning_rate": 3.705910885848795e-06, "loss": 0.8595373, "num_input_tokens_seen": 35183655, "step": 1660, "time_per_iteration": 2.7761244773864746 }, { "auxiliary_loss_clip": 0.01216404, "auxiliary_loss_mlp": 0.01030848, "balance_loss_clip": 1.06069052, "balance_loss_mlp": 1.01987481, "epoch": 0.19972344135153008, "flos": 20084120352000.0, "grad_norm": 1.872934812596742, "language_loss": 0.84774637, "learning_rate": 3.705504145941879e-06, "loss": 0.87021887, "num_input_tokens_seen": 35201825, "step": 1661, "time_per_iteration": 2.7734029293060303 }, { "auxiliary_loss_clip": 0.01222031, "auxiliary_loss_mlp": 0.01040071, "balance_loss_clip": 1.06114674, "balance_loss_mlp": 1.02977717, "epoch": 0.1998436842421692, "flos": 23727472761600.0, "grad_norm": 1.860274878727329, "language_loss": 0.78971756, "learning_rate": 3.7050971473113403e-06, "loss": 0.81233859, "num_input_tokens_seen": 35221600, "step": 1662, "time_per_iteration": 2.7524235248565674 }, { "auxiliary_loss_clip": 0.01217138, "auxiliary_loss_mlp": 0.01073778, "balance_loss_clip": 1.06100535, "balance_loss_mlp": 1.02869153, "epoch": 0.19996392713280828, "flos": 36102361633920.0, "grad_norm": 2.317572691047653, "language_loss": 0.79941243, "learning_rate": 3.7046898900189196e-06, "loss": 0.82232159, "num_input_tokens_seen": 35245935, "step": 1663, "time_per_iteration": 2.8072826862335205 }, { "auxiliary_loss_clip": 0.01216349, "auxiliary_loss_mlp": 0.0104103, "balance_loss_clip": 1.06306231, "balance_loss_mlp": 1.03086746, "epoch": 0.20008417002344736, "flos": 23657662679040.0, "grad_norm": 1.626237443162751, "language_loss": 0.82976961, "learning_rate": 3.704282374126398e-06, "loss": 0.85234344, "num_input_tokens_seen": 35265615, "step": 1664, "time_per_iteration": 2.7730190753936768 }, { "auxiliary_loss_clip": 0.01211427, "auxiliary_loss_mlp": 0.01035805, "balance_loss_clip": 1.06265569, "balance_loss_mlp": 1.02528501, "epoch": 0.20020441291408644, "flos": 21872076664320.0, "grad_norm": 3.352239680268019, "language_loss": 0.87656051, "learning_rate": 3.7038745996955954e-06, "loss": 0.89903283, "num_input_tokens_seen": 35284960, "step": 1665, "time_per_iteration": 305.52595806121826 }, { "auxiliary_loss_clip": 0.01219754, "auxiliary_loss_mlp": 0.01033336, "balance_loss_clip": 1.06329679, "balance_loss_mlp": 1.02214181, "epoch": 0.20032465580472555, "flos": 23179691376000.0, "grad_norm": 2.799956904178113, "language_loss": 0.7200222, "learning_rate": 3.703466566788371e-06, "loss": 0.74255311, "num_input_tokens_seen": 35304090, "step": 1666, "time_per_iteration": 2.81030535697937 }, { "auxiliary_loss_clip": 0.01207297, "auxiliary_loss_mlp": 0.01039306, "balance_loss_clip": 1.06192231, "balance_loss_mlp": 1.02897644, "epoch": 0.20044489869536464, "flos": 23873521461120.0, "grad_norm": 1.7538909872761927, "language_loss": 0.74400485, "learning_rate": 3.703058275466622e-06, "loss": 0.76647091, "num_input_tokens_seen": 35323325, "step": 1667, "time_per_iteration": 2.7765300273895264 }, { "auxiliary_loss_clip": 0.01216356, "auxiliary_loss_mlp": 0.010437, "balance_loss_clip": 1.06103015, "balance_loss_mlp": 1.03224361, "epoch": 0.20056514158600372, "flos": 21945226711680.0, "grad_norm": 2.23071800795143, "language_loss": 0.77865601, "learning_rate": 3.7026497257922877e-06, "loss": 0.80125654, "num_input_tokens_seen": 35343635, "step": 1668, "time_per_iteration": 2.7688827514648438 }, { "auxiliary_loss_clip": 0.01204333, "auxiliary_loss_mlp": 0.01038375, "balance_loss_clip": 1.0616827, "balance_loss_mlp": 1.02739024, "epoch": 0.20068538447664283, "flos": 23879159896320.0, "grad_norm": 1.6396275049580158, "language_loss": 0.85094899, "learning_rate": 3.7022409178273436e-06, "loss": 0.87337607, "num_input_tokens_seen": 35364615, "step": 1669, "time_per_iteration": 2.7777833938598633 }, { "auxiliary_loss_clip": 0.01217155, "auxiliary_loss_mlp": 0.01031371, "balance_loss_clip": 1.06050611, "balance_loss_mlp": 1.02201271, "epoch": 0.2008056273672819, "flos": 18442823270400.0, "grad_norm": 2.0682606951726332, "language_loss": 0.78577662, "learning_rate": 3.7018318516338054e-06, "loss": 0.80826193, "num_input_tokens_seen": 35383775, "step": 1670, "time_per_iteration": 2.7763760089874268 }, { "auxiliary_loss_clip": 0.01220297, "auxiliary_loss_mlp": 0.01028581, "balance_loss_clip": 1.05979228, "balance_loss_mlp": 1.01801348, "epoch": 0.200925870257921, "flos": 23659530186240.0, "grad_norm": 4.131856004311266, "language_loss": 0.81505084, "learning_rate": 3.7014225272737284e-06, "loss": 0.83753955, "num_input_tokens_seen": 35403000, "step": 1671, "time_per_iteration": 2.7672135829925537 }, { "auxiliary_loss_clip": 0.01211805, "auxiliary_loss_mlp": 0.01035254, "balance_loss_clip": 1.06002295, "balance_loss_mlp": 1.02472138, "epoch": 0.20104611314856008, "flos": 16217115909120.0, "grad_norm": 2.245124237283204, "language_loss": 0.74262989, "learning_rate": 3.701012944809207e-06, "loss": 0.76510048, "num_input_tokens_seen": 35420115, "step": 1672, "time_per_iteration": 2.760533571243286 }, { "auxiliary_loss_clip": 0.01213668, "auxiliary_loss_mlp": 0.01070154, "balance_loss_clip": 1.06099367, "balance_loss_mlp": 1.02371037, "epoch": 0.2011663560391992, "flos": 21397373498880.0, "grad_norm": 2.6239836840813564, "language_loss": 0.79086053, "learning_rate": 3.700603104302374e-06, "loss": 0.81369877, "num_input_tokens_seen": 35439925, "step": 1673, "time_per_iteration": 2.739731550216675 }, { "auxiliary_loss_clip": 0.01102709, "auxiliary_loss_mlp": 0.01002203, "balance_loss_clip": 1.0244863, "balance_loss_mlp": 1.00026035, "epoch": 0.20128659892983827, "flos": 62229459409920.0, "grad_norm": 0.9155481374473694, "language_loss": 0.56009543, "learning_rate": 3.7001930058154027e-06, "loss": 0.58114457, "num_input_tokens_seen": 35504885, "step": 1674, "time_per_iteration": 3.354590892791748 }, { "auxiliary_loss_clip": 0.01210684, "auxiliary_loss_mlp": 0.01034337, "balance_loss_clip": 1.06301641, "balance_loss_mlp": 1.02390635, "epoch": 0.20140684182047736, "flos": 28438737448320.0, "grad_norm": 6.349995253762938, "language_loss": 0.80160642, "learning_rate": 3.6997826494105037e-06, "loss": 0.82405663, "num_input_tokens_seen": 35525330, "step": 1675, "time_per_iteration": 2.8093388080596924 }, { "auxiliary_loss_clip": 0.01214888, "auxiliary_loss_mlp": 0.01039061, "balance_loss_clip": 1.06010342, "balance_loss_mlp": 1.02916694, "epoch": 0.20152708471111647, "flos": 28074064619520.0, "grad_norm": 2.2449380524583007, "language_loss": 0.69303238, "learning_rate": 3.6993720351499286e-06, "loss": 0.71557188, "num_input_tokens_seen": 35546455, "step": 1676, "time_per_iteration": 2.7946152687072754 }, { "auxiliary_loss_clip": 0.01208817, "auxiliary_loss_mlp": 0.01038678, "balance_loss_clip": 1.06240535, "balance_loss_mlp": 1.02874827, "epoch": 0.20164732760175555, "flos": 23549751244800.0, "grad_norm": 1.9811426541397446, "language_loss": 0.76911747, "learning_rate": 3.6989611630959666e-06, "loss": 0.79159248, "num_input_tokens_seen": 35565010, "step": 1677, "time_per_iteration": 2.8185036182403564 }, { "auxiliary_loss_clip": 0.0110695, "auxiliary_loss_mlp": 0.00999613, "balance_loss_clip": 1.01756644, "balance_loss_mlp": 0.99765837, "epoch": 0.20176757049239463, "flos": 71100616037760.0, "grad_norm": 0.803831546157282, "language_loss": 0.58265448, "learning_rate": 3.6985500333109474e-06, "loss": 0.60372007, "num_input_tokens_seen": 35633340, "step": 1678, "time_per_iteration": 3.381561517715454 }, { "auxiliary_loss_clip": 0.01206483, "auxiliary_loss_mlp": 0.01034704, "balance_loss_clip": 1.06267846, "balance_loss_mlp": 1.0248754, "epoch": 0.20188781338303372, "flos": 21430159637760.0, "grad_norm": 2.5548435257787063, "language_loss": 0.76474267, "learning_rate": 3.6981386458572385e-06, "loss": 0.78715456, "num_input_tokens_seen": 35651315, "step": 1679, "time_per_iteration": 2.8382728099823 }, { "auxiliary_loss_clip": 0.01207298, "auxiliary_loss_mlp": 0.01036253, "balance_loss_clip": 1.0613879, "balance_loss_mlp": 1.02473795, "epoch": 0.20200805627367283, "flos": 11546215130880.0, "grad_norm": 2.251434178334769, "language_loss": 0.75886172, "learning_rate": 3.6977270007972468e-06, "loss": 0.78129721, "num_input_tokens_seen": 35668850, "step": 1680, "time_per_iteration": 2.8201053142547607 }, { "auxiliary_loss_clip": 0.01219916, "auxiliary_loss_mlp": 0.01031208, "balance_loss_clip": 1.0635947, "balance_loss_mlp": 1.02151632, "epoch": 0.2021282991643119, "flos": 28545391906560.0, "grad_norm": 2.4731477422192216, "language_loss": 0.72222257, "learning_rate": 3.6973150981934196e-06, "loss": 0.74473381, "num_input_tokens_seen": 35690080, "step": 1681, "time_per_iteration": 3.717083692550659 }, { "auxiliary_loss_clip": 0.01227511, "auxiliary_loss_mlp": 0.01033342, "balance_loss_clip": 1.06204391, "balance_loss_mlp": 1.02313828, "epoch": 0.202248542054951, "flos": 17923446564480.0, "grad_norm": 2.4803527851862235, "language_loss": 0.83598554, "learning_rate": 3.6969029381082415e-06, "loss": 0.85859412, "num_input_tokens_seen": 35706075, "step": 1682, "time_per_iteration": 2.705199718475342 }, { "auxiliary_loss_clip": 0.01211741, "auxiliary_loss_mlp": 0.01038741, "balance_loss_clip": 1.06058085, "balance_loss_mlp": 1.02928758, "epoch": 0.2023687849455901, "flos": 19864634296320.0, "grad_norm": 1.7112312333257078, "language_loss": 0.79411495, "learning_rate": 3.696490520604237e-06, "loss": 0.81661975, "num_input_tokens_seen": 35724765, "step": 1683, "time_per_iteration": 3.8525595664978027 }, { "auxiliary_loss_clip": 0.01216566, "auxiliary_loss_mlp": 0.01028019, "balance_loss_clip": 1.06105685, "balance_loss_mlp": 1.01876271, "epoch": 0.20248902783622919, "flos": 22564721600640.0, "grad_norm": 28.601330853884942, "language_loss": 0.8073734, "learning_rate": 3.696077845743968e-06, "loss": 0.82981932, "num_input_tokens_seen": 35744355, "step": 1684, "time_per_iteration": 3.6348094940185547 }, { "auxiliary_loss_clip": 0.01225011, "auxiliary_loss_mlp": 0.0104133, "balance_loss_clip": 1.06132507, "balance_loss_mlp": 1.03058314, "epoch": 0.20260927072686827, "flos": 22709728805760.0, "grad_norm": 2.694306301055445, "language_loss": 0.72927827, "learning_rate": 3.69566491359004e-06, "loss": 0.75194168, "num_input_tokens_seen": 35761000, "step": 1685, "time_per_iteration": 3.659092664718628 }, { "auxiliary_loss_clip": 0.01216134, "auxiliary_loss_mlp": 0.01032367, "balance_loss_clip": 1.06264389, "balance_loss_mlp": 1.02223384, "epoch": 0.20272951361750738, "flos": 51023998650240.0, "grad_norm": 1.9691185609152897, "language_loss": 0.69368827, "learning_rate": 3.695251724205092e-06, "loss": 0.71617329, "num_input_tokens_seen": 35785360, "step": 1686, "time_per_iteration": 2.9795315265655518 }, { "auxiliary_loss_clip": 0.01224148, "auxiliary_loss_mlp": 0.01034443, "balance_loss_clip": 1.06181765, "balance_loss_mlp": 1.0242151, "epoch": 0.20284975650814646, "flos": 26578133879040.0, "grad_norm": 1.6421988425890515, "language_loss": 0.86604977, "learning_rate": 3.6948382776518054e-06, "loss": 0.88863564, "num_input_tokens_seen": 35806065, "step": 1687, "time_per_iteration": 2.743063449859619 }, { "auxiliary_loss_clip": 0.0122059, "auxiliary_loss_mlp": 0.01039202, "balance_loss_clip": 1.06178629, "balance_loss_mlp": 1.0286634, "epoch": 0.20296999939878554, "flos": 16034222833920.0, "grad_norm": 2.068797559692194, "language_loss": 0.79659134, "learning_rate": 3.6944245739929e-06, "loss": 0.81918925, "num_input_tokens_seen": 35822225, "step": 1688, "time_per_iteration": 2.805424690246582 }, { "auxiliary_loss_clip": 0.01220435, "auxiliary_loss_mlp": 0.01038511, "balance_loss_clip": 1.0635097, "balance_loss_mlp": 1.02864003, "epoch": 0.20309024228942463, "flos": 19203374868480.0, "grad_norm": 2.189033791445448, "language_loss": 0.71657032, "learning_rate": 3.6940106132911332e-06, "loss": 0.73915976, "num_input_tokens_seen": 35839410, "step": 1689, "time_per_iteration": 2.7731423377990723 }, { "auxiliary_loss_clip": 0.01220366, "auxiliary_loss_mlp": 0.01033929, "balance_loss_clip": 1.06118202, "balance_loss_mlp": 1.02350402, "epoch": 0.20321048518006374, "flos": 22821087945600.0, "grad_norm": 1.9932114670072403, "language_loss": 0.88802171, "learning_rate": 3.6935963956093037e-06, "loss": 0.91056472, "num_input_tokens_seen": 35859495, "step": 1690, "time_per_iteration": 3.031801223754883 }, { "auxiliary_loss_clip": 0.0121261, "auxiliary_loss_mlp": 0.01039855, "balance_loss_clip": 1.06194735, "balance_loss_mlp": 1.02946568, "epoch": 0.20333072807070282, "flos": 19096397187840.0, "grad_norm": 1.8573969827127126, "language_loss": 0.68784815, "learning_rate": 3.6931819210102474e-06, "loss": 0.71037281, "num_input_tokens_seen": 35878890, "step": 1691, "time_per_iteration": 2.6535351276397705 }, { "auxiliary_loss_clip": 0.01227756, "auxiliary_loss_mlp": 0.01036946, "balance_loss_clip": 1.06315982, "balance_loss_mlp": 1.02631843, "epoch": 0.2034509709613419, "flos": 18180962144640.0, "grad_norm": 2.0758433881389227, "language_loss": 0.84395814, "learning_rate": 3.6927671895568402e-06, "loss": 0.86660516, "num_input_tokens_seen": 35897950, "step": 1692, "time_per_iteration": 2.694671392440796 }, { "auxiliary_loss_clip": 0.01226764, "auxiliary_loss_mlp": 0.01038015, "balance_loss_clip": 1.06363153, "balance_loss_mlp": 1.0271256, "epoch": 0.20357121385198101, "flos": 22923899648640.0, "grad_norm": 5.805381890674552, "language_loss": 0.86649001, "learning_rate": 3.692352201311996e-06, "loss": 0.88913774, "num_input_tokens_seen": 35916800, "step": 1693, "time_per_iteration": 2.67447829246521 }, { "auxiliary_loss_clip": 0.012115, "auxiliary_loss_mlp": 0.01037025, "balance_loss_clip": 1.06472552, "balance_loss_mlp": 1.02684426, "epoch": 0.2036914567426201, "flos": 20922131629440.0, "grad_norm": 4.661447041364881, "language_loss": 0.76532364, "learning_rate": 3.6919369563386687e-06, "loss": 0.78780884, "num_input_tokens_seen": 35936600, "step": 1694, "time_per_iteration": 2.7792251110076904 }, { "auxiliary_loss_clip": 0.01210689, "auxiliary_loss_mlp": 0.01034923, "balance_loss_clip": 1.06211841, "balance_loss_mlp": 1.02564824, "epoch": 0.20381169963325918, "flos": 15519155760000.0, "grad_norm": 2.2031641815494045, "language_loss": 0.7938025, "learning_rate": 3.69152145469985e-06, "loss": 0.81625867, "num_input_tokens_seen": 35953645, "step": 1695, "time_per_iteration": 2.729132652282715 }, { "auxiliary_loss_clip": 0.01213883, "auxiliary_loss_mlp": 0.01036265, "balance_loss_clip": 1.06346273, "balance_loss_mlp": 1.02542901, "epoch": 0.20393194252389826, "flos": 28833143760000.0, "grad_norm": 1.9489034003572772, "language_loss": 0.82318729, "learning_rate": 3.691105696458572e-06, "loss": 0.84568876, "num_input_tokens_seen": 35970940, "step": 1696, "time_per_iteration": 2.8816237449645996 }, { "auxiliary_loss_clip": 0.01223195, "auxiliary_loss_mlp": 0.01033301, "balance_loss_clip": 1.06326842, "balance_loss_mlp": 1.02369237, "epoch": 0.20405218541453737, "flos": 22488554810880.0, "grad_norm": 4.227945977048187, "language_loss": 0.67821777, "learning_rate": 3.690689681677904e-06, "loss": 0.70078278, "num_input_tokens_seen": 35989410, "step": 1697, "time_per_iteration": 2.7125084400177 }, { "auxiliary_loss_clip": 0.01217948, "auxiliary_loss_mlp": 0.01034772, "balance_loss_clip": 1.06331098, "balance_loss_mlp": 1.02404368, "epoch": 0.20417242830517646, "flos": 25374408278400.0, "grad_norm": 1.9981137776002773, "language_loss": 0.8873418, "learning_rate": 3.690273410420956e-06, "loss": 0.90986907, "num_input_tokens_seen": 36009175, "step": 1698, "time_per_iteration": 2.8170180320739746 }, { "auxiliary_loss_clip": 0.01216927, "auxiliary_loss_mlp": 0.01030303, "balance_loss_clip": 1.06051064, "balance_loss_mlp": 1.02108836, "epoch": 0.20429267119581554, "flos": 14793078240000.0, "grad_norm": 2.6457487392486625, "language_loss": 0.76602685, "learning_rate": 3.689856882750875e-06, "loss": 0.78849918, "num_input_tokens_seen": 36024375, "step": 1699, "time_per_iteration": 2.672466278076172 }, { "auxiliary_loss_clip": 0.01213981, "auxiliary_loss_mlp": 0.01037478, "balance_loss_clip": 1.06063342, "balance_loss_mlp": 1.02726173, "epoch": 0.20441291408645465, "flos": 17781851151360.0, "grad_norm": 1.8207388236319255, "language_loss": 0.78781539, "learning_rate": 3.6894400987308486e-06, "loss": 0.81033003, "num_input_tokens_seen": 36041895, "step": 1700, "time_per_iteration": 2.8008370399475098 }, { "auxiliary_loss_clip": 0.01224029, "auxiliary_loss_mlp": 0.01033094, "balance_loss_clip": 1.06277966, "balance_loss_mlp": 1.02265072, "epoch": 0.20453315697709373, "flos": 16435668211200.0, "grad_norm": 2.5450669887054675, "language_loss": 0.85064673, "learning_rate": 3.6890230584241024e-06, "loss": 0.87321806, "num_input_tokens_seen": 36058825, "step": 1701, "time_per_iteration": 2.6495468616485596 }, { "auxiliary_loss_clip": 0.0111256, "auxiliary_loss_mlp": 0.01011579, "balance_loss_clip": 1.02100873, "balance_loss_mlp": 1.00966024, "epoch": 0.20465339986773282, "flos": 66713085653760.0, "grad_norm": 1.1196132457682473, "language_loss": 0.66375935, "learning_rate": 3.6886057618939016e-06, "loss": 0.68500078, "num_input_tokens_seen": 36121645, "step": 1702, "time_per_iteration": 3.2919323444366455 }, { "auxiliary_loss_clip": 0.01211235, "auxiliary_loss_mlp": 0.01035167, "balance_loss_clip": 1.06574726, "balance_loss_mlp": 1.0255115, "epoch": 0.2047736427583719, "flos": 41974114924800.0, "grad_norm": 2.0186669705975193, "language_loss": 0.69862902, "learning_rate": 3.6881882092035492e-06, "loss": 0.72109306, "num_input_tokens_seen": 36143030, "step": 1703, "time_per_iteration": 2.9242498874664307 }, { "auxiliary_loss_clip": 0.0111437, "auxiliary_loss_mlp": 0.01053664, "balance_loss_clip": 1.02096879, "balance_loss_mlp": 1.01082468, "epoch": 0.204893885649011, "flos": 69940878641280.0, "grad_norm": 0.9319574932241553, "language_loss": 0.6119864, "learning_rate": 3.6877704004163873e-06, "loss": 0.63366675, "num_input_tokens_seen": 36203435, "step": 1704, "time_per_iteration": 3.4501547813415527 }, { "auxiliary_loss_clip": 0.01228399, "auxiliary_loss_mlp": 0.01035928, "balance_loss_clip": 1.06392145, "balance_loss_mlp": 1.02582526, "epoch": 0.2050141285396501, "flos": 22200012858240.0, "grad_norm": 2.213359177171052, "language_loss": 0.77767253, "learning_rate": 3.6873523355957984e-06, "loss": 0.80031586, "num_input_tokens_seen": 36222435, "step": 1705, "time_per_iteration": 2.6899337768554688 }, { "auxiliary_loss_clip": 0.01111482, "auxiliary_loss_mlp": 0.01001819, "balance_loss_clip": 1.0206356, "balance_loss_mlp": 0.99966091, "epoch": 0.20513437143028918, "flos": 46283721730560.0, "grad_norm": 1.0115516009263088, "language_loss": 0.64083898, "learning_rate": 3.686934014805201e-06, "loss": 0.66197193, "num_input_tokens_seen": 36273065, "step": 1706, "time_per_iteration": 3.149104595184326 }, { "auxiliary_loss_clip": 0.01218148, "auxiliary_loss_mlp": 0.01034249, "balance_loss_clip": 1.0622623, "balance_loss_mlp": 1.02386022, "epoch": 0.20525461432092829, "flos": 21904324099200.0, "grad_norm": 1.901247384348491, "language_loss": 0.81041795, "learning_rate": 3.6865154381080552e-06, "loss": 0.83294189, "num_input_tokens_seen": 36293750, "step": 1707, "time_per_iteration": 3.735645294189453 }, { "auxiliary_loss_clip": 0.01201871, "auxiliary_loss_mlp": 0.01033718, "balance_loss_clip": 1.06181633, "balance_loss_mlp": 1.02347183, "epoch": 0.20537485721156737, "flos": 21214264942080.0, "grad_norm": 2.625630557066218, "language_loss": 0.82473218, "learning_rate": 3.6860966055678585e-06, "loss": 0.8470881, "num_input_tokens_seen": 36310105, "step": 1708, "time_per_iteration": 2.88771653175354 }, { "auxiliary_loss_clip": 0.01220047, "auxiliary_loss_mlp": 0.01037689, "balance_loss_clip": 1.06252599, "balance_loss_mlp": 1.02606046, "epoch": 0.20549510010220645, "flos": 20191205773440.0, "grad_norm": 1.9237839201973848, "language_loss": 0.86436653, "learning_rate": 3.685677517248147e-06, "loss": 0.88694394, "num_input_tokens_seen": 36328995, "step": 1709, "time_per_iteration": 3.7771365642547607 }, { "auxiliary_loss_clip": 0.01217351, "auxiliary_loss_mlp": 0.01072898, "balance_loss_clip": 1.06701779, "balance_loss_mlp": 1.0269506, "epoch": 0.20561534299284553, "flos": 17016702612480.0, "grad_norm": 2.037872364846002, "language_loss": 0.80163634, "learning_rate": 3.6852581732124967e-06, "loss": 0.82453889, "num_input_tokens_seen": 36346340, "step": 1710, "time_per_iteration": 3.7331249713897705 }, { "auxiliary_loss_clip": 0.01222096, "auxiliary_loss_mlp": 0.01035731, "balance_loss_clip": 1.06346476, "balance_loss_mlp": 1.02456689, "epoch": 0.20573558588348465, "flos": 22890467064960.0, "grad_norm": 2.1941085877466344, "language_loss": 0.76225281, "learning_rate": 3.6848385735245213e-06, "loss": 0.78483111, "num_input_tokens_seen": 36365430, "step": 1711, "time_per_iteration": 3.7642555236816406 }, { "auxiliary_loss_clip": 0.01207632, "auxiliary_loss_mlp": 0.01041179, "balance_loss_clip": 1.05956459, "balance_loss_mlp": 1.03124893, "epoch": 0.20585582877412373, "flos": 24643123286400.0, "grad_norm": 3.174964445666008, "language_loss": 0.85748744, "learning_rate": 3.6844187182478734e-06, "loss": 0.87997556, "num_input_tokens_seen": 36386285, "step": 1712, "time_per_iteration": 2.7705540657043457 }, { "auxiliary_loss_clip": 0.01203781, "auxiliary_loss_mlp": 0.01034539, "balance_loss_clip": 1.0581286, "balance_loss_mlp": 1.02378654, "epoch": 0.2059760716647628, "flos": 24206952435840.0, "grad_norm": 1.7122416343919684, "language_loss": 0.74795091, "learning_rate": 3.683998607446246e-06, "loss": 0.77033412, "num_input_tokens_seen": 36404935, "step": 1713, "time_per_iteration": 2.7675764560699463 }, { "auxiliary_loss_clip": 0.01223017, "auxiliary_loss_mlp": 0.01031156, "balance_loss_clip": 1.06513345, "balance_loss_mlp": 1.02177978, "epoch": 0.20609631455540192, "flos": 20229522606720.0, "grad_norm": 2.0996727315809123, "language_loss": 0.74717051, "learning_rate": 3.6835782411833686e-06, "loss": 0.76971227, "num_input_tokens_seen": 36424455, "step": 1714, "time_per_iteration": 2.703925848007202 }, { "auxiliary_loss_clip": 0.0120009, "auxiliary_loss_mlp": 0.01037638, "balance_loss_clip": 1.06102192, "balance_loss_mlp": 1.02756464, "epoch": 0.206216557446041, "flos": 19864957518720.0, "grad_norm": 1.7442244623931438, "language_loss": 0.74277449, "learning_rate": 3.68315761952301e-06, "loss": 0.7651518, "num_input_tokens_seen": 36441685, "step": 1715, "time_per_iteration": 2.855215549468994 }, { "auxiliary_loss_clip": 0.01224255, "auxiliary_loss_mlp": 0.0104176, "balance_loss_clip": 1.06233311, "balance_loss_mlp": 1.03117418, "epoch": 0.2063368003366801, "flos": 24096311568000.0, "grad_norm": 2.190998002046023, "language_loss": 0.82918596, "learning_rate": 3.6827367425289797e-06, "loss": 0.8518461, "num_input_tokens_seen": 36461460, "step": 1716, "time_per_iteration": 2.7393980026245117 }, { "auxiliary_loss_clip": 0.01214719, "auxiliary_loss_mlp": 0.01036725, "balance_loss_clip": 1.06149769, "balance_loss_mlp": 1.02608585, "epoch": 0.2064570432273192, "flos": 20340163474560.0, "grad_norm": 2.7037176691763776, "language_loss": 0.72454846, "learning_rate": 3.6823156102651225e-06, "loss": 0.74706286, "num_input_tokens_seen": 36479615, "step": 1717, "time_per_iteration": 2.950064182281494 }, { "auxiliary_loss_clip": 0.0119349, "auxiliary_loss_mlp": 0.01032091, "balance_loss_clip": 1.06251383, "balance_loss_mlp": 1.02247691, "epoch": 0.20657728611795828, "flos": 20520363029760.0, "grad_norm": 1.8823680565311942, "language_loss": 0.71276492, "learning_rate": 3.6818942227953257e-06, "loss": 0.73502076, "num_input_tokens_seen": 36500160, "step": 1718, "time_per_iteration": 2.868786096572876 }, { "auxiliary_loss_clip": 0.01209184, "auxiliary_loss_mlp": 0.01037237, "balance_loss_clip": 1.0596509, "balance_loss_mlp": 1.02709246, "epoch": 0.20669752900859736, "flos": 21799285752960.0, "grad_norm": 2.250081782577679, "language_loss": 0.69038033, "learning_rate": 3.681472580183512e-06, "loss": 0.71284449, "num_input_tokens_seen": 36518810, "step": 1719, "time_per_iteration": 2.7569689750671387 }, { "auxiliary_loss_clip": 0.0121523, "auxiliary_loss_mlp": 0.01034204, "balance_loss_clip": 1.06261623, "balance_loss_mlp": 1.02427423, "epoch": 0.20681777189923645, "flos": 15122020014720.0, "grad_norm": 2.2025160376779955, "language_loss": 0.86438769, "learning_rate": 3.6810506824936455e-06, "loss": 0.88688201, "num_input_tokens_seen": 36536890, "step": 1720, "time_per_iteration": 2.695614814758301 }, { "auxiliary_loss_clip": 0.0110433, "auxiliary_loss_mlp": 0.01013057, "balance_loss_clip": 1.01781332, "balance_loss_mlp": 1.01078022, "epoch": 0.20693801478987556, "flos": 56481021509760.0, "grad_norm": 1.0566648757514454, "language_loss": 0.62551057, "learning_rate": 3.680628529789726e-06, "loss": 0.64668441, "num_input_tokens_seen": 36589300, "step": 1721, "time_per_iteration": 3.200901746749878 }, { "auxiliary_loss_clip": 0.01231111, "auxiliary_loss_mlp": 0.01042368, "balance_loss_clip": 1.06562281, "balance_loss_mlp": 1.03032792, "epoch": 0.20705825768051464, "flos": 21614201948160.0, "grad_norm": 3.3692027369246995, "language_loss": 0.86347854, "learning_rate": 3.680206122135796e-06, "loss": 0.8862133, "num_input_tokens_seen": 36609905, "step": 1722, "time_per_iteration": 2.89663028717041 }, { "auxiliary_loss_clip": 0.01214303, "auxiliary_loss_mlp": 0.01040616, "balance_loss_clip": 1.06399393, "balance_loss_mlp": 1.03066742, "epoch": 0.20717850057115372, "flos": 25848895962240.0, "grad_norm": 1.8252696044489714, "language_loss": 0.7852357, "learning_rate": 3.6797834595959323e-06, "loss": 0.80778491, "num_input_tokens_seen": 36629805, "step": 1723, "time_per_iteration": 2.895401954650879 }, { "auxiliary_loss_clip": 0.01198134, "auxiliary_loss_mlp": 0.01040048, "balance_loss_clip": 1.05853701, "balance_loss_mlp": 1.02886641, "epoch": 0.20729874346179283, "flos": 29130807767040.0, "grad_norm": 3.7073362393355813, "language_loss": 0.7777949, "learning_rate": 3.679360542234254e-06, "loss": 0.80017674, "num_input_tokens_seen": 36649150, "step": 1724, "time_per_iteration": 2.827329397201538 }, { "auxiliary_loss_clip": 0.01209966, "auxiliary_loss_mlp": 0.01068716, "balance_loss_clip": 1.06157649, "balance_loss_mlp": 1.02697134, "epoch": 0.20741898635243192, "flos": 29023363209600.0, "grad_norm": 1.9547478079098304, "language_loss": 0.72328889, "learning_rate": 3.678937370114916e-06, "loss": 0.74607575, "num_input_tokens_seen": 36668955, "step": 1725, "time_per_iteration": 2.8657734394073486 }, { "auxiliary_loss_clip": 0.01210093, "auxiliary_loss_mlp": 0.01030003, "balance_loss_clip": 1.06241989, "balance_loss_mlp": 1.02043664, "epoch": 0.207539229243071, "flos": 15559447841280.0, "grad_norm": 2.849187121230309, "language_loss": 0.79082453, "learning_rate": 3.678513943302114e-06, "loss": 0.81322557, "num_input_tokens_seen": 36685730, "step": 1726, "time_per_iteration": 2.7352280616760254 }, { "auxiliary_loss_clip": 0.01220145, "auxiliary_loss_mlp": 0.01039346, "balance_loss_clip": 1.05995035, "balance_loss_mlp": 1.02918363, "epoch": 0.20765947213371008, "flos": 20521081301760.0, "grad_norm": 2.0277145522022555, "language_loss": 0.84977639, "learning_rate": 3.678090261860082e-06, "loss": 0.87237132, "num_input_tokens_seen": 36705460, "step": 1727, "time_per_iteration": 2.831930637359619 }, { "auxiliary_loss_clip": 0.01212308, "auxiliary_loss_mlp": 0.01040278, "balance_loss_clip": 1.058447, "balance_loss_mlp": 1.02961516, "epoch": 0.2077797150243492, "flos": 19354415558400.0, "grad_norm": 2.0048474317729474, "language_loss": 0.77501708, "learning_rate": 3.6776663258530906e-06, "loss": 0.79754299, "num_input_tokens_seen": 36724110, "step": 1728, "time_per_iteration": 2.8101799488067627 }, { "auxiliary_loss_clip": 0.01221265, "auxiliary_loss_mlp": 0.0103588, "balance_loss_clip": 1.06135809, "balance_loss_mlp": 1.02558053, "epoch": 0.20789995791498828, "flos": 21829952989440.0, "grad_norm": 2.785909409733621, "language_loss": 0.71221852, "learning_rate": 3.6772421353454516e-06, "loss": 0.73478997, "num_input_tokens_seen": 36742705, "step": 1729, "time_per_iteration": 2.7651002407073975 }, { "auxiliary_loss_clip": 0.01217936, "auxiliary_loss_mlp": 0.01035013, "balance_loss_clip": 1.06268716, "balance_loss_mlp": 1.0242424, "epoch": 0.20802020080562736, "flos": 23148844571520.0, "grad_norm": 2.4332954539796106, "language_loss": 0.8834908, "learning_rate": 3.6768176904015153e-06, "loss": 0.90602028, "num_input_tokens_seen": 36762510, "step": 1730, "time_per_iteration": 2.697389841079712 }, { "auxiliary_loss_clip": 0.01215999, "auxiliary_loss_mlp": 0.01033023, "balance_loss_clip": 1.05813766, "balance_loss_mlp": 1.02272916, "epoch": 0.20814044369626647, "flos": 23072677781760.0, "grad_norm": 3.217689132821011, "language_loss": 0.60052764, "learning_rate": 3.6763929910856674e-06, "loss": 0.62301791, "num_input_tokens_seen": 36780960, "step": 1731, "time_per_iteration": 2.8200716972351074 }, { "auxiliary_loss_clip": 0.01216453, "auxiliary_loss_mlp": 0.01044298, "balance_loss_clip": 1.06254804, "balance_loss_mlp": 1.0345701, "epoch": 0.20826068658690555, "flos": 19608016556160.0, "grad_norm": 2.841747580786085, "language_loss": 0.77956605, "learning_rate": 3.6759680374623365e-06, "loss": 0.8021735, "num_input_tokens_seen": 36798875, "step": 1732, "time_per_iteration": 2.710690498352051 }, { "auxiliary_loss_clip": 0.0121925, "auxiliary_loss_mlp": 0.01041861, "balance_loss_clip": 1.06101978, "balance_loss_mlp": 1.03240788, "epoch": 0.20838092947754464, "flos": 25374049142400.0, "grad_norm": 2.5558343868178084, "language_loss": 0.75475919, "learning_rate": 3.675542829595986e-06, "loss": 0.77737033, "num_input_tokens_seen": 36818540, "step": 1733, "time_per_iteration": 3.6283676624298096 }, { "auxiliary_loss_clip": 0.01213094, "auxiliary_loss_mlp": 0.0103237, "balance_loss_clip": 1.05921161, "balance_loss_mlp": 1.02304196, "epoch": 0.20850117236818372, "flos": 24061729749120.0, "grad_norm": 1.6506108537500526, "language_loss": 0.79133677, "learning_rate": 3.6751173675511213e-06, "loss": 0.81379139, "num_input_tokens_seen": 36840585, "step": 1734, "time_per_iteration": 2.7836129665374756 }, { "auxiliary_loss_clip": 0.01210531, "auxiliary_loss_mlp": 0.01033166, "balance_loss_clip": 1.05643535, "balance_loss_mlp": 1.02235949, "epoch": 0.20862141525882283, "flos": 20077799558400.0, "grad_norm": 2.630156595026203, "language_loss": 0.87461501, "learning_rate": 3.674691651392283e-06, "loss": 0.89705205, "num_input_tokens_seen": 36858255, "step": 1735, "time_per_iteration": 3.6849513053894043 }, { "auxiliary_loss_clip": 0.01218845, "auxiliary_loss_mlp": 0.01032721, "balance_loss_clip": 1.06227851, "balance_loss_mlp": 1.02313638, "epoch": 0.2087416581494619, "flos": 39015183237120.0, "grad_norm": 1.9491355210451558, "language_loss": 0.75669348, "learning_rate": 3.674265681184053e-06, "loss": 0.77920914, "num_input_tokens_seen": 36881515, "step": 1736, "time_per_iteration": 3.9251933097839355 }, { "auxiliary_loss_clip": 0.01216752, "auxiliary_loss_mlp": 0.01034935, "balance_loss_clip": 1.06194162, "balance_loss_mlp": 1.02515948, "epoch": 0.208861901040101, "flos": 26101994169600.0, "grad_norm": 1.858714564255426, "language_loss": 0.86534524, "learning_rate": 3.6738394569910504e-06, "loss": 0.88786215, "num_input_tokens_seen": 36902055, "step": 1737, "time_per_iteration": 3.868346929550171 }, { "auxiliary_loss_clip": 0.01214329, "auxiliary_loss_mlp": 0.01033785, "balance_loss_clip": 1.0597229, "balance_loss_mlp": 1.02307415, "epoch": 0.2089821439307401, "flos": 28398732675840.0, "grad_norm": 2.356850434530327, "language_loss": 0.82490504, "learning_rate": 3.6734129788779333e-06, "loss": 0.84738618, "num_input_tokens_seen": 36921230, "step": 1738, "time_per_iteration": 2.901776075363159 }, { "auxiliary_loss_clip": 0.0120711, "auxiliary_loss_mlp": 0.01038338, "balance_loss_clip": 1.06099737, "balance_loss_mlp": 1.0280745, "epoch": 0.2091023868213792, "flos": 21069616872960.0, "grad_norm": 1.9395676484460813, "language_loss": 0.90345252, "learning_rate": 3.6729862469093976e-06, "loss": 0.92590702, "num_input_tokens_seen": 36940325, "step": 1739, "time_per_iteration": 2.7205302715301514 }, { "auxiliary_loss_clip": 0.01203428, "auxiliary_loss_mlp": 0.01036652, "balance_loss_clip": 1.05999458, "balance_loss_mlp": 1.02685928, "epoch": 0.20922262971201827, "flos": 22455481363200.0, "grad_norm": 2.5593507250802414, "language_loss": 0.83121443, "learning_rate": 3.6725592611501782e-06, "loss": 0.85361516, "num_input_tokens_seen": 36959000, "step": 1740, "time_per_iteration": 2.779477834701538 }, { "auxiliary_loss_clip": 0.01216905, "auxiliary_loss_mlp": 0.01037152, "balance_loss_clip": 1.06014562, "balance_loss_mlp": 1.02704883, "epoch": 0.20934287260265738, "flos": 27852244179840.0, "grad_norm": 1.9654454817394198, "language_loss": 0.7605108, "learning_rate": 3.6721320216650496e-06, "loss": 0.78305137, "num_input_tokens_seen": 36979615, "step": 1741, "time_per_iteration": 2.7696754932403564 }, { "auxiliary_loss_clip": 0.01210465, "auxiliary_loss_mlp": 0.01034189, "balance_loss_clip": 1.05923891, "balance_loss_mlp": 1.02344227, "epoch": 0.20946311549329646, "flos": 16435309075200.0, "grad_norm": 2.1528634995126934, "language_loss": 0.83527654, "learning_rate": 3.6717045285188215e-06, "loss": 0.85772312, "num_input_tokens_seen": 36997310, "step": 1742, "time_per_iteration": 2.799894094467163 }, { "auxiliary_loss_clip": 0.01192694, "auxiliary_loss_mlp": 0.0103362, "balance_loss_clip": 1.05835843, "balance_loss_mlp": 1.02282596, "epoch": 0.20958335838393555, "flos": 22492720788480.0, "grad_norm": 2.0890600758837663, "language_loss": 0.86843669, "learning_rate": 3.671276781776346e-06, "loss": 0.89069986, "num_input_tokens_seen": 37015965, "step": 1743, "time_per_iteration": 2.8070690631866455 }, { "auxiliary_loss_clip": 0.01216875, "auxiliary_loss_mlp": 0.01030731, "balance_loss_clip": 1.05918074, "balance_loss_mlp": 1.02095616, "epoch": 0.20970360127457463, "flos": 25224768218880.0, "grad_norm": 2.104152123806125, "language_loss": 0.66967785, "learning_rate": 3.6708487815025128e-06, "loss": 0.69215393, "num_input_tokens_seen": 37036545, "step": 1744, "time_per_iteration": 2.7819983959198 }, { "auxiliary_loss_clip": 0.01208753, "auxiliary_loss_mlp": 0.01033822, "balance_loss_clip": 1.05810595, "balance_loss_mlp": 1.02402306, "epoch": 0.20982384416521374, "flos": 18479164855680.0, "grad_norm": 2.604132322466177, "language_loss": 0.74510694, "learning_rate": 3.6704205277622463e-06, "loss": 0.76753271, "num_input_tokens_seen": 37054985, "step": 1745, "time_per_iteration": 2.715513229370117 }, { "auxiliary_loss_clip": 0.01216208, "auxiliary_loss_mlp": 0.01032683, "balance_loss_clip": 1.05863047, "balance_loss_mlp": 1.02198982, "epoch": 0.20994408705585282, "flos": 25373546352000.0, "grad_norm": 1.9556152142893803, "language_loss": 0.80725229, "learning_rate": 3.6699920206205146e-06, "loss": 0.82974118, "num_input_tokens_seen": 37075725, "step": 1746, "time_per_iteration": 2.783463478088379 }, { "auxiliary_loss_clip": 0.01215392, "auxiliary_loss_mlp": 0.01033306, "balance_loss_clip": 1.05698919, "balance_loss_mlp": 1.02384067, "epoch": 0.2100643299464919, "flos": 21320955313920.0, "grad_norm": 1.784301093796309, "language_loss": 0.82075524, "learning_rate": 3.669563260142321e-06, "loss": 0.84324229, "num_input_tokens_seen": 37094615, "step": 1747, "time_per_iteration": 2.758795738220215 }, { "auxiliary_loss_clip": 0.01211148, "auxiliary_loss_mlp": 0.01035658, "balance_loss_clip": 1.06155765, "balance_loss_mlp": 1.02576327, "epoch": 0.21018457283713102, "flos": 19354379644800.0, "grad_norm": 3.0045676155996452, "language_loss": 0.84437597, "learning_rate": 3.6691342463927083e-06, "loss": 0.86684406, "num_input_tokens_seen": 37113610, "step": 1748, "time_per_iteration": 2.8131794929504395 }, { "auxiliary_loss_clip": 0.01217346, "auxiliary_loss_mlp": 0.01034823, "balance_loss_clip": 1.06249893, "balance_loss_mlp": 1.0244813, "epoch": 0.2103048157277701, "flos": 28330035914880.0, "grad_norm": 1.7752705987015427, "language_loss": 0.8173703, "learning_rate": 3.668704979436758e-06, "loss": 0.83989197, "num_input_tokens_seen": 37133705, "step": 1749, "time_per_iteration": 2.8520925045013428 }, { "auxiliary_loss_clip": 0.01209915, "auxiliary_loss_mlp": 0.01035127, "balance_loss_clip": 1.06042624, "balance_loss_mlp": 1.02494669, "epoch": 0.21042505861840918, "flos": 17457290835840.0, "grad_norm": 2.283242210562656, "language_loss": 0.78526843, "learning_rate": 3.668275459339588e-06, "loss": 0.80771887, "num_input_tokens_seen": 37152185, "step": 1750, "time_per_iteration": 2.783346176147461 }, { "auxiliary_loss_clip": 0.01219655, "auxiliary_loss_mlp": 0.01035882, "balance_loss_clip": 1.0603807, "balance_loss_mlp": 1.02577305, "epoch": 0.21054530150904827, "flos": 14209817195520.0, "grad_norm": 2.097776112805244, "language_loss": 0.80074388, "learning_rate": 3.667845686166358e-06, "loss": 0.82329917, "num_input_tokens_seen": 37169110, "step": 1751, "time_per_iteration": 2.7226786613464355 }, { "auxiliary_loss_clip": 0.01199369, "auxiliary_loss_mlp": 0.01033118, "balance_loss_clip": 1.06392217, "balance_loss_mlp": 1.02296114, "epoch": 0.21066554439968738, "flos": 18618210403200.0, "grad_norm": 1.9562893516664401, "language_loss": 0.85926092, "learning_rate": 3.6674156599822634e-06, "loss": 0.88158578, "num_input_tokens_seen": 37184905, "step": 1752, "time_per_iteration": 2.7409887313842773 }, { "auxiliary_loss_clip": 0.01212242, "auxiliary_loss_mlp": 0.01035773, "balance_loss_clip": 1.06272101, "balance_loss_mlp": 1.02441823, "epoch": 0.21078578729032646, "flos": 23658883741440.0, "grad_norm": 2.3311088764290817, "language_loss": 0.81488168, "learning_rate": 3.666985380852539e-06, "loss": 0.83736181, "num_input_tokens_seen": 37203910, "step": 1753, "time_per_iteration": 2.9680089950561523 }, { "auxiliary_loss_clip": 0.01212774, "auxiliary_loss_mlp": 0.01033892, "balance_loss_clip": 1.05932045, "balance_loss_mlp": 1.02412283, "epoch": 0.21090603018096554, "flos": 29346379240320.0, "grad_norm": 3.7415108118995284, "language_loss": 0.74522114, "learning_rate": 3.6665548488424576e-06, "loss": 0.7676878, "num_input_tokens_seen": 37222670, "step": 1754, "time_per_iteration": 2.8648509979248047 }, { "auxiliary_loss_clip": 0.01221691, "auxiliary_loss_mlp": 0.01033526, "balance_loss_clip": 1.06023824, "balance_loss_mlp": 1.02326775, "epoch": 0.21102627307160465, "flos": 23261245205760.0, "grad_norm": 1.7956839344515998, "language_loss": 0.87974089, "learning_rate": 3.6661240640173307e-06, "loss": 0.90229309, "num_input_tokens_seen": 37244140, "step": 1755, "time_per_iteration": 2.8128557205200195 }, { "auxiliary_loss_clip": 0.01112339, "auxiliary_loss_mlp": 0.01004061, "balance_loss_clip": 1.02436948, "balance_loss_mlp": 1.00189173, "epoch": 0.21114651596224374, "flos": 54633454577280.0, "grad_norm": 0.9484259604420701, "language_loss": 0.57837415, "learning_rate": 3.6656930264425085e-06, "loss": 0.59953821, "num_input_tokens_seen": 37308185, "step": 1756, "time_per_iteration": 3.41463565826416 }, { "auxiliary_loss_clip": 0.01222119, "auxiliary_loss_mlp": 0.01032162, "balance_loss_clip": 1.06088459, "balance_loss_mlp": 1.02273262, "epoch": 0.21126675885288282, "flos": 21543314457600.0, "grad_norm": 2.136635878967333, "language_loss": 0.75464225, "learning_rate": 3.665261736183378e-06, "loss": 0.77718508, "num_input_tokens_seen": 37328220, "step": 1757, "time_per_iteration": 2.7490508556365967 }, { "auxiliary_loss_clip": 0.0121339, "auxiliary_loss_mlp": 0.01032479, "balance_loss_clip": 1.06328869, "balance_loss_mlp": 1.02213216, "epoch": 0.2113870017435219, "flos": 10961876678400.0, "grad_norm": 3.5418963201907823, "language_loss": 0.88744587, "learning_rate": 3.664830193305366e-06, "loss": 0.90990454, "num_input_tokens_seen": 37345995, "step": 1758, "time_per_iteration": 2.88525128364563 }, { "auxiliary_loss_clip": 0.01207278, "auxiliary_loss_mlp": 0.01034923, "balance_loss_clip": 1.0613606, "balance_loss_mlp": 1.02479625, "epoch": 0.211507244634161, "flos": 16653825463680.0, "grad_norm": 7.325805619572033, "language_loss": 0.77592653, "learning_rate": 3.6643983978739373e-06, "loss": 0.79834855, "num_input_tokens_seen": 37362610, "step": 1759, "time_per_iteration": 3.685502052307129 }, { "auxiliary_loss_clip": 0.01207742, "auxiliary_loss_mlp": 0.01038275, "balance_loss_clip": 1.06260681, "balance_loss_mlp": 1.02834535, "epoch": 0.2116274875248001, "flos": 20954091755520.0, "grad_norm": 1.6936763387554836, "language_loss": 0.81841803, "learning_rate": 3.663966349954596e-06, "loss": 0.84087819, "num_input_tokens_seen": 37382790, "step": 1760, "time_per_iteration": 2.9343085289001465 }, { "auxiliary_loss_clip": 0.01112693, "auxiliary_loss_mlp": 0.01000259, "balance_loss_clip": 1.02283919, "balance_loss_mlp": 0.99800605, "epoch": 0.21174773041543918, "flos": 68196949424640.0, "grad_norm": 0.7908041404671265, "language_loss": 0.59757644, "learning_rate": 3.6635340496128816e-06, "loss": 0.61870599, "num_input_tokens_seen": 37439720, "step": 1761, "time_per_iteration": 4.284596920013428 }, { "auxiliary_loss_clip": 0.01200442, "auxiliary_loss_mlp": 0.01039133, "balance_loss_clip": 1.05849159, "balance_loss_mlp": 1.02849936, "epoch": 0.2118679733060783, "flos": 20668315150080.0, "grad_norm": 1.8013810381086934, "language_loss": 0.92967522, "learning_rate": 3.6631014969143747e-06, "loss": 0.95207095, "num_input_tokens_seen": 37459410, "step": 1762, "time_per_iteration": 2.7853739261627197 }, { "auxiliary_loss_clip": 0.01219545, "auxiliary_loss_mlp": 0.01038819, "balance_loss_clip": 1.06262481, "balance_loss_mlp": 1.02806616, "epoch": 0.21198821619671737, "flos": 23223431162880.0, "grad_norm": 2.085428170038623, "language_loss": 0.88806033, "learning_rate": 3.662668691924693e-06, "loss": 0.91064405, "num_input_tokens_seen": 37480460, "step": 1763, "time_per_iteration": 4.569253206253052 }, { "auxiliary_loss_clip": 0.01208055, "auxiliary_loss_mlp": 0.01036559, "balance_loss_clip": 1.06002212, "balance_loss_mlp": 1.0254724, "epoch": 0.21210845908735645, "flos": 24498547044480.0, "grad_norm": 2.164675865494348, "language_loss": 0.71314335, "learning_rate": 3.6622356347094927e-06, "loss": 0.7355895, "num_input_tokens_seen": 37502025, "step": 1764, "time_per_iteration": 2.692283868789673 }, { "auxiliary_loss_clip": 0.01212218, "auxiliary_loss_mlp": 0.01033889, "balance_loss_clip": 1.06083632, "balance_loss_mlp": 1.02289772, "epoch": 0.21222870197799554, "flos": 27089789160960.0, "grad_norm": 2.0390082477715343, "language_loss": 0.78387487, "learning_rate": 3.6618023253344684e-06, "loss": 0.80633605, "num_input_tokens_seen": 37520885, "step": 1765, "time_per_iteration": 2.9102113246917725 }, { "auxiliary_loss_clip": 0.01217194, "auxiliary_loss_mlp": 0.01032949, "balance_loss_clip": 1.06004775, "balance_loss_mlp": 1.02218997, "epoch": 0.21234894486863465, "flos": 16873850223360.0, "grad_norm": 1.65645925281019, "language_loss": 0.83520186, "learning_rate": 3.6613687638653527e-06, "loss": 0.85770333, "num_input_tokens_seen": 37539055, "step": 1766, "time_per_iteration": 2.703200101852417 }, { "auxiliary_loss_clip": 0.01206377, "auxiliary_loss_mlp": 0.01032665, "balance_loss_clip": 1.05882394, "balance_loss_mlp": 1.02253234, "epoch": 0.21246918775927373, "flos": 23474949171840.0, "grad_norm": 2.0058509359608445, "language_loss": 0.77795619, "learning_rate": 3.660934950367916e-06, "loss": 0.80034661, "num_input_tokens_seen": 37558300, "step": 1767, "time_per_iteration": 2.725558280944824 }, { "auxiliary_loss_clip": 0.01215157, "auxiliary_loss_mlp": 0.01035759, "balance_loss_clip": 1.05774069, "balance_loss_mlp": 1.02525711, "epoch": 0.21258943064991281, "flos": 22382295402240.0, "grad_norm": 3.4394979341348164, "language_loss": 0.83470309, "learning_rate": 3.660500884907968e-06, "loss": 0.85721219, "num_input_tokens_seen": 37579040, "step": 1768, "time_per_iteration": 2.7797577381134033 }, { "auxiliary_loss_clip": 0.01097749, "auxiliary_loss_mlp": 0.01008869, "balance_loss_clip": 1.02146232, "balance_loss_mlp": 1.0066874, "epoch": 0.21270967354055192, "flos": 59440168679040.0, "grad_norm": 0.8205767501051239, "language_loss": 0.60029149, "learning_rate": 3.660066567551356e-06, "loss": 0.62135768, "num_input_tokens_seen": 37639185, "step": 1769, "time_per_iteration": 3.325444221496582 }, { "auxiliary_loss_clip": 0.01209906, "auxiliary_loss_mlp": 0.01061994, "balance_loss_clip": 1.05545211, "balance_loss_mlp": 1.02355039, "epoch": 0.212829916431191, "flos": 21544032729600.0, "grad_norm": 4.864840447864339, "language_loss": 0.84487551, "learning_rate": 3.6596319983639657e-06, "loss": 0.86759448, "num_input_tokens_seen": 37657765, "step": 1770, "time_per_iteration": 2.796480894088745 }, { "auxiliary_loss_clip": 0.01205883, "auxiliary_loss_mlp": 0.0107227, "balance_loss_clip": 1.05930829, "balance_loss_mlp": 1.0326364, "epoch": 0.2129501593218301, "flos": 28987739896320.0, "grad_norm": 1.835615221771805, "language_loss": 0.86273777, "learning_rate": 3.6591971774117214e-06, "loss": 0.88551927, "num_input_tokens_seen": 37680740, "step": 1771, "time_per_iteration": 2.8051633834838867 }, { "auxiliary_loss_clip": 0.01220561, "auxiliary_loss_mlp": 0.01034244, "balance_loss_clip": 1.06103253, "balance_loss_mlp": 1.02401567, "epoch": 0.2130704022124692, "flos": 18806993308800.0, "grad_norm": 2.6368950051398143, "language_loss": 0.80295497, "learning_rate": 3.6587621047605833e-06, "loss": 0.82550299, "num_input_tokens_seen": 37697910, "step": 1772, "time_per_iteration": 2.736590623855591 }, { "auxiliary_loss_clip": 0.0121177, "auxiliary_loss_mlp": 0.01032893, "balance_loss_clip": 1.05714905, "balance_loss_mlp": 1.02286756, "epoch": 0.21319064510310828, "flos": 13918150759680.0, "grad_norm": 2.050611232254323, "language_loss": 0.86822045, "learning_rate": 3.6583267804765542e-06, "loss": 0.89066708, "num_input_tokens_seen": 37712245, "step": 1773, "time_per_iteration": 2.640172004699707 }, { "auxiliary_loss_clip": 0.01215369, "auxiliary_loss_mlp": 0.01036338, "balance_loss_clip": 1.05999756, "balance_loss_mlp": 1.02499485, "epoch": 0.21331088799374737, "flos": 20959694277120.0, "grad_norm": 2.179840311756926, "language_loss": 0.8567183, "learning_rate": 3.6578912046256702e-06, "loss": 0.87923539, "num_input_tokens_seen": 37730765, "step": 1774, "time_per_iteration": 2.79758620262146 }, { "auxiliary_loss_clip": 0.01205153, "auxiliary_loss_mlp": 0.01035636, "balance_loss_clip": 1.05916548, "balance_loss_mlp": 1.02496731, "epoch": 0.21343113088438645, "flos": 18624638937600.0, "grad_norm": 19.94363114746645, "language_loss": 0.76404387, "learning_rate": 3.6574553772740083e-06, "loss": 0.78645176, "num_input_tokens_seen": 37748695, "step": 1775, "time_per_iteration": 2.7505571842193604 }, { "auxiliary_loss_clip": 0.01109548, "auxiliary_loss_mlp": 0.01006636, "balance_loss_clip": 1.0276773, "balance_loss_mlp": 1.00474072, "epoch": 0.21355137377502556, "flos": 67413128791680.0, "grad_norm": 0.8541386715226884, "language_loss": 0.61846262, "learning_rate": 3.657019298487684e-06, "loss": 0.63962448, "num_input_tokens_seen": 37813705, "step": 1776, "time_per_iteration": 3.371685266494751 }, { "auxiliary_loss_clip": 0.01217188, "auxiliary_loss_mlp": 0.010608, "balance_loss_clip": 1.05718637, "balance_loss_mlp": 1.02189517, "epoch": 0.21367161666566464, "flos": 34532095697280.0, "grad_norm": 1.9331656760763567, "language_loss": 0.83666086, "learning_rate": 3.6565829683328495e-06, "loss": 0.8594408, "num_input_tokens_seen": 37836330, "step": 1777, "time_per_iteration": 2.829339027404785 }, { "auxiliary_loss_clip": 0.01210393, "auxiliary_loss_mlp": 0.01034044, "balance_loss_clip": 1.05921113, "balance_loss_mlp": 1.02441216, "epoch": 0.21379185955630373, "flos": 18989347680000.0, "grad_norm": 1.8030143559138845, "language_loss": 0.8585977, "learning_rate": 3.6561463868756965e-06, "loss": 0.88104212, "num_input_tokens_seen": 37855030, "step": 1778, "time_per_iteration": 2.8003454208374023 }, { "auxiliary_loss_clip": 0.0121313, "auxiliary_loss_mlp": 0.01035371, "balance_loss_clip": 1.05912769, "balance_loss_mlp": 1.02433801, "epoch": 0.21391210244694284, "flos": 28218497207040.0, "grad_norm": 1.5843817320333422, "language_loss": 0.78060365, "learning_rate": 3.655709554182452e-06, "loss": 0.80308861, "num_input_tokens_seen": 37875370, "step": 1779, "time_per_iteration": 2.8455355167388916 }, { "auxiliary_loss_clip": 0.0121956, "auxiliary_loss_mlp": 0.010326, "balance_loss_clip": 1.06037474, "balance_loss_mlp": 1.0229497, "epoch": 0.21403234533758192, "flos": 17455064192640.0, "grad_norm": 1.9837012470661994, "language_loss": 0.84700108, "learning_rate": 3.6552724703193855e-06, "loss": 0.86952269, "num_input_tokens_seen": 37892560, "step": 1780, "time_per_iteration": 2.673391819000244 }, { "auxiliary_loss_clip": 0.01106304, "auxiliary_loss_mlp": 0.01002937, "balance_loss_clip": 1.0246675, "balance_loss_mlp": 1.0008868, "epoch": 0.214152588228221, "flos": 51637606686720.0, "grad_norm": 0.824062560658655, "language_loss": 0.55971992, "learning_rate": 3.654835135352801e-06, "loss": 0.58081234, "num_input_tokens_seen": 37947370, "step": 1781, "time_per_iteration": 3.2377121448516846 }, { "auxiliary_loss_clip": 0.0120681, "auxiliary_loss_mlp": 0.0103643, "balance_loss_clip": 1.05898559, "balance_loss_mlp": 1.02650571, "epoch": 0.21427283111886009, "flos": 19496154625920.0, "grad_norm": 2.5445583572003025, "language_loss": 0.87473041, "learning_rate": 3.654397549349043e-06, "loss": 0.89716285, "num_input_tokens_seen": 37964745, "step": 1782, "time_per_iteration": 2.7906570434570312 }, { "auxiliary_loss_clip": 0.01209003, "auxiliary_loss_mlp": 0.01035045, "balance_loss_clip": 1.06098306, "balance_loss_mlp": 1.02436411, "epoch": 0.2143930740094992, "flos": 20084802710400.0, "grad_norm": 1.8314743421678186, "language_loss": 0.75277323, "learning_rate": 3.653959712374491e-06, "loss": 0.77521372, "num_input_tokens_seen": 37982850, "step": 1783, "time_per_iteration": 2.874361038208008 }, { "auxiliary_loss_clip": 0.01200247, "auxiliary_loss_mlp": 0.01032857, "balance_loss_clip": 1.05971396, "balance_loss_mlp": 1.02286148, "epoch": 0.21451331690013828, "flos": 21798603394560.0, "grad_norm": 1.7208023054375525, "language_loss": 0.82454985, "learning_rate": 3.6535216244955663e-06, "loss": 0.84688091, "num_input_tokens_seen": 38002745, "step": 1784, "time_per_iteration": 2.8036632537841797 }, { "auxiliary_loss_clip": 0.01209814, "auxiliary_loss_mlp": 0.01033079, "balance_loss_clip": 1.0593133, "balance_loss_mlp": 1.02308297, "epoch": 0.21463355979077736, "flos": 32853882412800.0, "grad_norm": 1.9049678801031398, "language_loss": 0.70893043, "learning_rate": 3.653083285778726e-06, "loss": 0.73135936, "num_input_tokens_seen": 38024115, "step": 1785, "time_per_iteration": 2.8404412269592285 }, { "auxiliary_loss_clip": 0.01221521, "auxiliary_loss_mlp": 0.01037276, "balance_loss_clip": 1.05992508, "balance_loss_mlp": 1.02620697, "epoch": 0.21475380268141647, "flos": 21543817248000.0, "grad_norm": 2.236889348249806, "language_loss": 0.80877268, "learning_rate": 3.6526446962904653e-06, "loss": 0.8313607, "num_input_tokens_seen": 38042830, "step": 1786, "time_per_iteration": 3.7088961601257324 }, { "auxiliary_loss_clip": 0.01213494, "auxiliary_loss_mlp": 0.01033682, "balance_loss_clip": 1.06112325, "balance_loss_mlp": 1.02272105, "epoch": 0.21487404557205556, "flos": 32159082660480.0, "grad_norm": 1.6025218604887463, "language_loss": 0.74146891, "learning_rate": 3.652205856097318e-06, "loss": 0.76394063, "num_input_tokens_seen": 38066015, "step": 1787, "time_per_iteration": 3.967348337173462 }, { "auxiliary_loss_clip": 0.01218372, "auxiliary_loss_mlp": 0.01057329, "balance_loss_clip": 1.05999327, "balance_loss_mlp": 1.01825213, "epoch": 0.21499428846269464, "flos": 12673091583360.0, "grad_norm": 10.182371335969487, "language_loss": 0.79183328, "learning_rate": 3.651766765265856e-06, "loss": 0.81459033, "num_input_tokens_seen": 38083025, "step": 1788, "time_per_iteration": 3.66292667388916 }, { "auxiliary_loss_clip": 0.01207738, "auxiliary_loss_mlp": 0.0103353, "balance_loss_clip": 1.05661392, "balance_loss_mlp": 1.02268231, "epoch": 0.21511453135333372, "flos": 23471573293440.0, "grad_norm": 2.479401765407621, "language_loss": 0.80621374, "learning_rate": 3.65132742386269e-06, "loss": 0.82862645, "num_input_tokens_seen": 38098245, "step": 1789, "time_per_iteration": 3.724473714828491 }, { "auxiliary_loss_clip": 0.01220156, "auxiliary_loss_mlp": 0.01038536, "balance_loss_clip": 1.06020045, "balance_loss_mlp": 1.02824819, "epoch": 0.21523477424397283, "flos": 26943560893440.0, "grad_norm": 4.350069907170668, "language_loss": 0.84772074, "learning_rate": 3.6508878319544656e-06, "loss": 0.87030768, "num_input_tokens_seen": 38118460, "step": 1790, "time_per_iteration": 2.7575340270996094 }, { "auxiliary_loss_clip": 0.01207475, "auxiliary_loss_mlp": 0.01036656, "balance_loss_clip": 1.06207728, "balance_loss_mlp": 1.02579594, "epoch": 0.21535501713461191, "flos": 18916161719040.0, "grad_norm": 3.8074965518199093, "language_loss": 0.8154639, "learning_rate": 3.65044798960787e-06, "loss": 0.83790517, "num_input_tokens_seen": 38136800, "step": 1791, "time_per_iteration": 2.81597900390625 }, { "auxiliary_loss_clip": 0.01201753, "auxiliary_loss_mlp": 0.01033283, "balance_loss_clip": 1.05848646, "balance_loss_mlp": 1.0230248, "epoch": 0.215475260025251, "flos": 17895113712000.0, "grad_norm": 1.9855445639346827, "language_loss": 0.78191268, "learning_rate": 3.650007896889627e-06, "loss": 0.80426306, "num_input_tokens_seen": 38155380, "step": 1792, "time_per_iteration": 2.860294818878174 }, { "auxiliary_loss_clip": 0.01219045, "auxiliary_loss_mlp": 0.01039027, "balance_loss_clip": 1.06136453, "balance_loss_mlp": 1.02815509, "epoch": 0.2155955029158901, "flos": 16654292340480.0, "grad_norm": 1.8895259437839802, "language_loss": 0.80500597, "learning_rate": 3.6495675538664974e-06, "loss": 0.82758665, "num_input_tokens_seen": 38174395, "step": 1793, "time_per_iteration": 2.6706786155700684 }, { "auxiliary_loss_clip": 0.01215865, "auxiliary_loss_mlp": 0.01035854, "balance_loss_clip": 1.06043625, "balance_loss_mlp": 1.02541113, "epoch": 0.2157157458065292, "flos": 23621213352960.0, "grad_norm": 2.125481145448336, "language_loss": 0.82673907, "learning_rate": 3.649126960605282e-06, "loss": 0.84925622, "num_input_tokens_seen": 38195380, "step": 1794, "time_per_iteration": 2.7251882553100586 }, { "auxiliary_loss_clip": 0.01208699, "auxiliary_loss_mlp": 0.01032064, "balance_loss_clip": 1.06018734, "balance_loss_mlp": 1.02159107, "epoch": 0.21583598869716827, "flos": 22127078292480.0, "grad_norm": 9.428566099221177, "language_loss": 0.83698821, "learning_rate": 3.6486861171728174e-06, "loss": 0.85939586, "num_input_tokens_seen": 38213775, "step": 1795, "time_per_iteration": 2.658876657485962 }, { "auxiliary_loss_clip": 0.01212085, "auxiliary_loss_mlp": 0.01036418, "balance_loss_clip": 1.05987883, "balance_loss_mlp": 1.0256598, "epoch": 0.21595623158780738, "flos": 23441229279360.0, "grad_norm": 1.650955953043855, "language_loss": 0.78191471, "learning_rate": 3.6482450236359803e-06, "loss": 0.80439973, "num_input_tokens_seen": 38235630, "step": 1796, "time_per_iteration": 2.7785704135894775 }, { "auxiliary_loss_clip": 0.01213343, "auxiliary_loss_mlp": 0.01037383, "balance_loss_clip": 1.06052315, "balance_loss_mlp": 1.02701187, "epoch": 0.21607647447844647, "flos": 26906501036160.0, "grad_norm": 2.3335350554810153, "language_loss": 0.7772783, "learning_rate": 3.647803680061683e-06, "loss": 0.79978561, "num_input_tokens_seen": 38256045, "step": 1797, "time_per_iteration": 2.77480149269104 }, { "auxiliary_loss_clip": 0.01210888, "auxiliary_loss_mlp": 0.01040032, "balance_loss_clip": 1.06063068, "balance_loss_mlp": 1.02890372, "epoch": 0.21619671736908555, "flos": 14495378319360.0, "grad_norm": 2.2837315224244814, "language_loss": 0.74295866, "learning_rate": 3.6473620865168776e-06, "loss": 0.76546788, "num_input_tokens_seen": 38272915, "step": 1798, "time_per_iteration": 2.6830098628997803 }, { "auxiliary_loss_clip": 0.01212277, "auxiliary_loss_mlp": 0.0103654, "balance_loss_clip": 1.06227195, "balance_loss_mlp": 1.02757525, "epoch": 0.21631696025972463, "flos": 17931096161280.0, "grad_norm": 2.2946581491055085, "language_loss": 0.81580317, "learning_rate": 3.646920243068554e-06, "loss": 0.83829129, "num_input_tokens_seen": 38290810, "step": 1799, "time_per_iteration": 2.7137291431427 }, { "auxiliary_loss_clip": 0.01204562, "auxiliary_loss_mlp": 0.01033697, "balance_loss_clip": 1.06286788, "balance_loss_mlp": 1.02373683, "epoch": 0.21643720315036374, "flos": 24462385027200.0, "grad_norm": 2.4483788237941315, "language_loss": 0.74740243, "learning_rate": 3.6464781497837384e-06, "loss": 0.76978499, "num_input_tokens_seen": 38312785, "step": 1800, "time_per_iteration": 2.7270610332489014 }, { "auxiliary_loss_clip": 0.01217783, "auxiliary_loss_mlp": 0.01038866, "balance_loss_clip": 1.06027865, "balance_loss_mlp": 1.02846456, "epoch": 0.21655744604100283, "flos": 28474432588800.0, "grad_norm": 1.6805008527292145, "language_loss": 0.72795892, "learning_rate": 3.6460358067294965e-06, "loss": 0.75052536, "num_input_tokens_seen": 38334015, "step": 1801, "time_per_iteration": 2.78402042388916 }, { "auxiliary_loss_clip": 0.01224315, "auxiliary_loss_mlp": 0.01027844, "balance_loss_clip": 1.0610851, "balance_loss_mlp": 1.01694226, "epoch": 0.2166776889316419, "flos": 20152960767360.0, "grad_norm": 2.150448010868598, "language_loss": 0.78251421, "learning_rate": 3.645593213972932e-06, "loss": 0.80503571, "num_input_tokens_seen": 38352920, "step": 1802, "time_per_iteration": 2.696321964263916 }, { "auxiliary_loss_clip": 0.0121531, "auxiliary_loss_mlp": 0.01038613, "balance_loss_clip": 1.06303668, "balance_loss_mlp": 1.02774131, "epoch": 0.21679793182228102, "flos": 15193482122880.0, "grad_norm": 2.5153318339357864, "language_loss": 0.79805517, "learning_rate": 3.6451503715811852e-06, "loss": 0.82059437, "num_input_tokens_seen": 38371230, "step": 1803, "time_per_iteration": 2.7935070991516113 }, { "auxiliary_loss_clip": 0.01211185, "auxiliary_loss_mlp": 0.01034843, "balance_loss_clip": 1.06276369, "balance_loss_mlp": 1.02575922, "epoch": 0.2169181747129201, "flos": 17384464010880.0, "grad_norm": 2.0688365278472727, "language_loss": 0.79905033, "learning_rate": 3.6447072796214345e-06, "loss": 0.82151055, "num_input_tokens_seen": 38389795, "step": 1804, "time_per_iteration": 2.8373045921325684 }, { "auxiliary_loss_clip": 0.0109686, "auxiliary_loss_mlp": 0.01005466, "balance_loss_clip": 1.01815677, "balance_loss_mlp": 1.00327229, "epoch": 0.21703841760355919, "flos": 58760955429120.0, "grad_norm": 0.9076647441912769, "language_loss": 0.63080227, "learning_rate": 3.644263938160898e-06, "loss": 0.65182555, "num_input_tokens_seen": 38445760, "step": 1805, "time_per_iteration": 3.226926803588867 }, { "auxiliary_loss_clip": 0.01210106, "auxiliary_loss_mlp": 0.01036836, "balance_loss_clip": 1.06423044, "balance_loss_mlp": 1.02608979, "epoch": 0.21715866049419827, "flos": 22418457419520.0, "grad_norm": 2.261151425829317, "language_loss": 0.71947479, "learning_rate": 3.6438203472668293e-06, "loss": 0.74194419, "num_input_tokens_seen": 38465405, "step": 1806, "time_per_iteration": 2.7988648414611816 }, { "auxiliary_loss_clip": 0.01214929, "auxiliary_loss_mlp": 0.0103034, "balance_loss_clip": 1.06336558, "balance_loss_mlp": 1.02097011, "epoch": 0.21727890338483738, "flos": 17237732952960.0, "grad_norm": 9.461068438586139, "language_loss": 0.82019842, "learning_rate": 3.6433765070065206e-06, "loss": 0.84265113, "num_input_tokens_seen": 38483195, "step": 1807, "time_per_iteration": 2.7591664791107178 }, { "auxiliary_loss_clip": 0.01219934, "auxiliary_loss_mlp": 0.01036349, "balance_loss_clip": 1.05926228, "balance_loss_mlp": 1.02608514, "epoch": 0.21739914627547646, "flos": 13434792416640.0, "grad_norm": 2.5113864025905133, "language_loss": 0.87477088, "learning_rate": 3.6429324174473025e-06, "loss": 0.89733374, "num_input_tokens_seen": 38496735, "step": 1808, "time_per_iteration": 2.648237943649292 }, { "auxiliary_loss_clip": 0.01218161, "auxiliary_loss_mlp": 0.01028573, "balance_loss_clip": 1.0611062, "balance_loss_mlp": 1.01903677, "epoch": 0.21751938916611555, "flos": 20959514709120.0, "grad_norm": 2.2680119585103413, "language_loss": 0.85258305, "learning_rate": 3.6424880786565425e-06, "loss": 0.87505037, "num_input_tokens_seen": 38512880, "step": 1809, "time_per_iteration": 2.6718335151672363 }, { "auxiliary_loss_clip": 0.0120589, "auxiliary_loss_mlp": 0.01041093, "balance_loss_clip": 1.06452036, "balance_loss_mlp": 1.03001845, "epoch": 0.21763963205675466, "flos": 27599936071680.0, "grad_norm": 2.772990754021015, "language_loss": 0.79885268, "learning_rate": 3.6420434907016482e-06, "loss": 0.8213225, "num_input_tokens_seen": 38532570, "step": 1810, "time_per_iteration": 2.8555972576141357 }, { "auxiliary_loss_clip": 0.01216974, "auxiliary_loss_mlp": 0.01033911, "balance_loss_clip": 1.06210887, "balance_loss_mlp": 1.02409983, "epoch": 0.21775987494739374, "flos": 21430411032960.0, "grad_norm": 2.908945531103808, "language_loss": 0.80992478, "learning_rate": 3.6415986536500606e-06, "loss": 0.83243358, "num_input_tokens_seen": 38550900, "step": 1811, "time_per_iteration": 2.7322285175323486 }, { "auxiliary_loss_clip": 0.01199213, "auxiliary_loss_mlp": 0.01035268, "balance_loss_clip": 1.06550717, "balance_loss_mlp": 1.02526021, "epoch": 0.21788011783803282, "flos": 18332972501760.0, "grad_norm": 1.8106578349917424, "language_loss": 0.80889297, "learning_rate": 3.641153567569263e-06, "loss": 0.83123779, "num_input_tokens_seen": 38569215, "step": 1812, "time_per_iteration": 3.6994097232818604 }, { "auxiliary_loss_clip": 0.01206718, "auxiliary_loss_mlp": 0.01034908, "balance_loss_clip": 1.0565412, "balance_loss_mlp": 1.02522802, "epoch": 0.2180003607286719, "flos": 30262748037120.0, "grad_norm": 2.496884618259209, "language_loss": 0.96022189, "learning_rate": 3.640708232526774e-06, "loss": 0.98263812, "num_input_tokens_seen": 38587870, "step": 1813, "time_per_iteration": 3.843369722366333 }, { "auxiliary_loss_clip": 0.01202203, "auxiliary_loss_mlp": 0.01036193, "balance_loss_clip": 1.06052923, "balance_loss_mlp": 1.02620363, "epoch": 0.21812060361931102, "flos": 25480272637440.0, "grad_norm": 2.260518952587816, "language_loss": 0.78456748, "learning_rate": 3.6402626485901504e-06, "loss": 0.8069514, "num_input_tokens_seen": 38606965, "step": 1814, "time_per_iteration": 4.533259630203247 }, { "auxiliary_loss_clip": 0.0121159, "auxiliary_loss_mlp": 0.01037025, "balance_loss_clip": 1.06018806, "balance_loss_mlp": 1.0270412, "epoch": 0.2182408465099501, "flos": 21908166854400.0, "grad_norm": 2.552627339462762, "language_loss": 0.78105557, "learning_rate": 3.639816815826988e-06, "loss": 0.80354166, "num_input_tokens_seen": 38626290, "step": 1815, "time_per_iteration": 3.8097198009490967 }, { "auxiliary_loss_clip": 0.01209977, "auxiliary_loss_mlp": 0.01030574, "balance_loss_clip": 1.06079233, "balance_loss_mlp": 1.02093625, "epoch": 0.21836108940058918, "flos": 23657339456640.0, "grad_norm": 1.9189086547927612, "language_loss": 0.78109765, "learning_rate": 3.6393707343049176e-06, "loss": 0.80350316, "num_input_tokens_seen": 38646620, "step": 1816, "time_per_iteration": 2.8471713066101074 }, { "auxiliary_loss_clip": 0.01219676, "auxiliary_loss_mlp": 0.01038764, "balance_loss_clip": 1.05990481, "balance_loss_mlp": 1.0289948, "epoch": 0.2184813322912283, "flos": 24681009156480.0, "grad_norm": 2.9207500811967417, "language_loss": 0.73504961, "learning_rate": 3.6389244040916104e-06, "loss": 0.7576341, "num_input_tokens_seen": 38665695, "step": 1817, "time_per_iteration": 2.9249656200408936 }, { "auxiliary_loss_clip": 0.01204013, "auxiliary_loss_mlp": 0.01062325, "balance_loss_clip": 1.05895519, "balance_loss_mlp": 1.02320409, "epoch": 0.21860157518186737, "flos": 26574650259840.0, "grad_norm": 2.108042570839482, "language_loss": 0.79212499, "learning_rate": 3.6384778252547747e-06, "loss": 0.81478834, "num_input_tokens_seen": 38681575, "step": 1818, "time_per_iteration": 2.917170286178589 }, { "auxiliary_loss_clip": 0.01210397, "auxiliary_loss_mlp": 0.01062139, "balance_loss_clip": 1.06292546, "balance_loss_mlp": 1.02128911, "epoch": 0.21872181807250646, "flos": 20886292834560.0, "grad_norm": 3.0636145524691503, "language_loss": 0.77737767, "learning_rate": 3.638030997862155e-06, "loss": 0.80010301, "num_input_tokens_seen": 38700510, "step": 1819, "time_per_iteration": 2.847414970397949 }, { "auxiliary_loss_clip": 0.01097524, "auxiliary_loss_mlp": 0.01004077, "balance_loss_clip": 1.0184617, "balance_loss_mlp": 1.00215816, "epoch": 0.21884206096314554, "flos": 61209452897280.0, "grad_norm": 0.768596821667704, "language_loss": 0.59378272, "learning_rate": 3.6375839219815356e-06, "loss": 0.61479867, "num_input_tokens_seen": 38758310, "step": 1820, "time_per_iteration": 3.3209424018859863 }, { "auxiliary_loss_clip": 0.01218415, "auxiliary_loss_mlp": 0.01031225, "balance_loss_clip": 1.05934548, "balance_loss_mlp": 1.02124107, "epoch": 0.21896230385378465, "flos": 23473835850240.0, "grad_norm": 2.4876052332474337, "language_loss": 0.82618368, "learning_rate": 3.6371365976807375e-06, "loss": 0.84868002, "num_input_tokens_seen": 38778705, "step": 1821, "time_per_iteration": 2.7240586280822754 }, { "auxiliary_loss_clip": 0.01195337, "auxiliary_loss_mlp": 0.01037233, "balance_loss_clip": 1.06025589, "balance_loss_mlp": 1.02712953, "epoch": 0.21908254674442373, "flos": 25081915829760.0, "grad_norm": 2.0771464324089295, "language_loss": 0.83771431, "learning_rate": 3.6366890250276185e-06, "loss": 0.86004001, "num_input_tokens_seen": 38799660, "step": 1822, "time_per_iteration": 2.876056671142578 }, { "auxiliary_loss_clip": 0.01216598, "auxiliary_loss_mlp": 0.01037334, "balance_loss_clip": 1.06037915, "balance_loss_mlp": 1.02633727, "epoch": 0.21920278963506282, "flos": 23513768795520.0, "grad_norm": 3.547389029447437, "language_loss": 0.89982474, "learning_rate": 3.6362412040900764e-06, "loss": 0.922364, "num_input_tokens_seen": 38819450, "step": 1823, "time_per_iteration": 2.8242835998535156 }, { "auxiliary_loss_clip": 0.01214796, "auxiliary_loss_mlp": 0.01034255, "balance_loss_clip": 1.05772948, "balance_loss_mlp": 1.02467096, "epoch": 0.21932303252570193, "flos": 29242238734080.0, "grad_norm": 2.0256055800275266, "language_loss": 0.80850267, "learning_rate": 3.635793134936044e-06, "loss": 0.83099324, "num_input_tokens_seen": 38840460, "step": 1824, "time_per_iteration": 2.897989511489868 }, { "auxiliary_loss_clip": 0.01210682, "auxiliary_loss_mlp": 0.01037647, "balance_loss_clip": 1.05950141, "balance_loss_mlp": 1.02798498, "epoch": 0.219443275416341, "flos": 20806857907200.0, "grad_norm": 3.130012646340852, "language_loss": 0.73006845, "learning_rate": 3.635344817633494e-06, "loss": 0.75255179, "num_input_tokens_seen": 38859775, "step": 1825, "time_per_iteration": 2.744328498840332 }, { "auxiliary_loss_clip": 0.01208746, "auxiliary_loss_mlp": 0.0103491, "balance_loss_clip": 1.05813265, "balance_loss_mlp": 1.02509356, "epoch": 0.2195635183069801, "flos": 14501555458560.0, "grad_norm": 2.009684957340271, "language_loss": 0.75250769, "learning_rate": 3.634896252250436e-06, "loss": 0.77494425, "num_input_tokens_seen": 38876540, "step": 1826, "time_per_iteration": 2.7824723720550537 }, { "auxiliary_loss_clip": 0.01222256, "auxiliary_loss_mlp": 0.01038914, "balance_loss_clip": 1.06218863, "balance_loss_mlp": 1.0287869, "epoch": 0.2196837611976192, "flos": 24243473589120.0, "grad_norm": 1.7995172161434294, "language_loss": 0.82194281, "learning_rate": 3.6344474388549157e-06, "loss": 0.84455448, "num_input_tokens_seen": 38896195, "step": 1827, "time_per_iteration": 2.802060842514038 }, { "auxiliary_loss_clip": 0.01215713, "auxiliary_loss_mlp": 0.01031849, "balance_loss_clip": 1.0631938, "balance_loss_mlp": 1.02177, "epoch": 0.2198040040882583, "flos": 18074523168000.0, "grad_norm": 1.9984837723722364, "language_loss": 0.7988466, "learning_rate": 3.6339983775150183e-06, "loss": 0.8213222, "num_input_tokens_seen": 38912755, "step": 1828, "time_per_iteration": 2.8143956661224365 }, { "auxiliary_loss_clip": 0.01216219, "auxiliary_loss_mlp": 0.01032446, "balance_loss_clip": 1.06257796, "balance_loss_mlp": 1.02241492, "epoch": 0.21992424697889737, "flos": 17784185535360.0, "grad_norm": 2.861167186804514, "language_loss": 0.83996797, "learning_rate": 3.6335490682988664e-06, "loss": 0.86245459, "num_input_tokens_seen": 38928365, "step": 1829, "time_per_iteration": 2.7926833629608154 }, { "auxiliary_loss_clip": 0.01196982, "auxiliary_loss_mlp": 0.01041077, "balance_loss_clip": 1.06255102, "balance_loss_mlp": 1.0310992, "epoch": 0.22004448986953645, "flos": 17638495971840.0, "grad_norm": 2.0670278698613114, "language_loss": 0.82954067, "learning_rate": 3.63309951127462e-06, "loss": 0.85192126, "num_input_tokens_seen": 38945275, "step": 1830, "time_per_iteration": 2.786417245864868 }, { "auxiliary_loss_clip": 0.01205232, "auxiliary_loss_mlp": 0.01031846, "balance_loss_clip": 1.06024647, "balance_loss_mlp": 1.02165985, "epoch": 0.22016473276017556, "flos": 22275533203200.0, "grad_norm": 2.1688173369894126, "language_loss": 0.75349319, "learning_rate": 3.6326497065104757e-06, "loss": 0.77586401, "num_input_tokens_seen": 38965740, "step": 1831, "time_per_iteration": 2.836690664291382 }, { "auxiliary_loss_clip": 0.01218789, "auxiliary_loss_mlp": 0.01038404, "balance_loss_clip": 1.06088591, "balance_loss_mlp": 1.02802646, "epoch": 0.22028497565081465, "flos": 25556259859200.0, "grad_norm": 3.158653197698435, "language_loss": 0.78213865, "learning_rate": 3.6321996540746697e-06, "loss": 0.80471063, "num_input_tokens_seen": 38984815, "step": 1832, "time_per_iteration": 2.9138364791870117 }, { "auxiliary_loss_clip": 0.0120798, "auxiliary_loss_mlp": 0.01037327, "balance_loss_clip": 1.06184828, "balance_loss_mlp": 1.02627659, "epoch": 0.22040521854145373, "flos": 36247332925440.0, "grad_norm": 1.9146187166552833, "language_loss": 0.80440986, "learning_rate": 3.6317493540354733e-06, "loss": 0.82686293, "num_input_tokens_seen": 39008230, "step": 1833, "time_per_iteration": 2.9615206718444824 }, { "auxiliary_loss_clip": 0.01210931, "auxiliary_loss_mlp": 0.01037639, "balance_loss_clip": 1.06092787, "balance_loss_mlp": 1.02575362, "epoch": 0.22052546143209284, "flos": 11838420270720.0, "grad_norm": 4.240019802031053, "language_loss": 0.76852787, "learning_rate": 3.6312988064611976e-06, "loss": 0.7910136, "num_input_tokens_seen": 39026540, "step": 1834, "time_per_iteration": 2.7224302291870117 }, { "auxiliary_loss_clip": 0.01211309, "auxiliary_loss_mlp": 0.01036656, "balance_loss_clip": 1.05894351, "balance_loss_mlp": 1.02737594, "epoch": 0.22064570432273192, "flos": 24209250906240.0, "grad_norm": 2.5079805537532907, "language_loss": 0.81304252, "learning_rate": 3.6308480114201896e-06, "loss": 0.83552212, "num_input_tokens_seen": 39048460, "step": 1835, "time_per_iteration": 2.8210461139678955 }, { "auxiliary_loss_clip": 0.01221428, "auxiliary_loss_mlp": 0.01035427, "balance_loss_clip": 1.06331396, "balance_loss_mlp": 1.02585411, "epoch": 0.220765947213371, "flos": 17931347556480.0, "grad_norm": 58.767054403864925, "language_loss": 0.76661468, "learning_rate": 3.630396968980835e-06, "loss": 0.78918326, "num_input_tokens_seen": 39066335, "step": 1836, "time_per_iteration": 2.763704776763916 }, { "auxiliary_loss_clip": 0.01211135, "auxiliary_loss_mlp": 0.01035953, "balance_loss_clip": 1.06013083, "balance_loss_mlp": 1.02601719, "epoch": 0.2208861901040101, "flos": 26757040544640.0, "grad_norm": 3.6601393663501707, "language_loss": 0.83756059, "learning_rate": 3.6299456792115575e-06, "loss": 0.86003149, "num_input_tokens_seen": 39087590, "step": 1837, "time_per_iteration": 2.8320655822753906 }, { "auxiliary_loss_clip": 0.01181086, "auxiliary_loss_mlp": 0.01031181, "balance_loss_clip": 1.05877197, "balance_loss_mlp": 1.02171016, "epoch": 0.2210064329946492, "flos": 17817977255040.0, "grad_norm": 2.4986657347532355, "language_loss": 0.8082993, "learning_rate": 3.629494142180815e-06, "loss": 0.83042204, "num_input_tokens_seen": 39106335, "step": 1838, "time_per_iteration": 3.865345001220703 }, { "auxiliary_loss_clip": 0.01218683, "auxiliary_loss_mlp": 0.01032906, "balance_loss_clip": 1.06066775, "balance_loss_mlp": 1.02264225, "epoch": 0.22112667588528828, "flos": 17967401832960.0, "grad_norm": 2.4958470556855903, "language_loss": 0.85175276, "learning_rate": 3.6290423579571075e-06, "loss": 0.87426865, "num_input_tokens_seen": 39122875, "step": 1839, "time_per_iteration": 3.696209192276001 }, { "auxiliary_loss_clip": 0.01211741, "auxiliary_loss_mlp": 0.01031687, "balance_loss_clip": 1.06020045, "balance_loss_mlp": 1.02155972, "epoch": 0.22124691877592736, "flos": 18369206346240.0, "grad_norm": 1.7756493473512271, "language_loss": 0.80359721, "learning_rate": 3.6285903266089694e-06, "loss": 0.82603151, "num_input_tokens_seen": 39142150, "step": 1840, "time_per_iteration": 4.065831661224365 }, { "auxiliary_loss_clip": 0.01210365, "auxiliary_loss_mlp": 0.01035374, "balance_loss_clip": 1.05937564, "balance_loss_mlp": 1.02491355, "epoch": 0.22136716166656648, "flos": 20813286441600.0, "grad_norm": 5.170685642191666, "language_loss": 0.77540481, "learning_rate": 3.628138048204974e-06, "loss": 0.79786223, "num_input_tokens_seen": 39162835, "step": 1841, "time_per_iteration": 3.905057668685913 }, { "auxiliary_loss_clip": 0.01196293, "auxiliary_loss_mlp": 0.01033737, "balance_loss_clip": 1.06085753, "balance_loss_mlp": 1.02301383, "epoch": 0.22148740455720556, "flos": 17675699483520.0, "grad_norm": 1.6914975622561852, "language_loss": 0.76154095, "learning_rate": 3.6276855228137304e-06, "loss": 0.78384125, "num_input_tokens_seen": 39181040, "step": 1842, "time_per_iteration": 2.8150672912597656 }, { "auxiliary_loss_clip": 0.01220126, "auxiliary_loss_mlp": 0.01067009, "balance_loss_clip": 1.06262612, "balance_loss_mlp": 1.02680278, "epoch": 0.22160764744784464, "flos": 21726710323200.0, "grad_norm": 2.4066509079297944, "language_loss": 0.81790799, "learning_rate": 3.6272327505038874e-06, "loss": 0.8407793, "num_input_tokens_seen": 39197505, "step": 1843, "time_per_iteration": 2.725430727005005 }, { "auxiliary_loss_clip": 0.0120824, "auxiliary_loss_mlp": 0.01033367, "balance_loss_clip": 1.05984449, "balance_loss_mlp": 1.02450991, "epoch": 0.22172789033848372, "flos": 23764712186880.0, "grad_norm": 2.0789711398910646, "language_loss": 0.78478122, "learning_rate": 3.626779731344131e-06, "loss": 0.80719721, "num_input_tokens_seen": 39217295, "step": 1844, "time_per_iteration": 2.887949228286743 }, { "auxiliary_loss_clip": 0.01214398, "auxiliary_loss_mlp": 0.01031417, "balance_loss_clip": 1.05950332, "balance_loss_mlp": 1.02177882, "epoch": 0.22184813322912283, "flos": 16982300361600.0, "grad_norm": 2.7320180714537337, "language_loss": 0.85357243, "learning_rate": 3.6263264654031814e-06, "loss": 0.87603056, "num_input_tokens_seen": 39234195, "step": 1845, "time_per_iteration": 2.7858893871307373 }, { "auxiliary_loss_clip": 0.01102079, "auxiliary_loss_mlp": 0.01009146, "balance_loss_clip": 1.02583277, "balance_loss_mlp": 1.00728607, "epoch": 0.22196837611976192, "flos": 61823740314240.0, "grad_norm": 0.7251255713642204, "language_loss": 0.59156537, "learning_rate": 3.6258729527498008e-06, "loss": 0.61267769, "num_input_tokens_seen": 39295040, "step": 1846, "time_per_iteration": 3.3372435569763184 }, { "auxiliary_loss_clip": 0.01215709, "auxiliary_loss_mlp": 0.01033509, "balance_loss_clip": 1.06229329, "balance_loss_mlp": 1.02357864, "epoch": 0.222088619010401, "flos": 25558019625600.0, "grad_norm": 2.423716080138699, "language_loss": 0.64409781, "learning_rate": 3.6254191934527854e-06, "loss": 0.66658998, "num_input_tokens_seen": 39314395, "step": 1847, "time_per_iteration": 2.80780029296875 }, { "auxiliary_loss_clip": 0.01208607, "auxiliary_loss_mlp": 0.01034447, "balance_loss_clip": 1.06386185, "balance_loss_mlp": 1.02443361, "epoch": 0.2222088619010401, "flos": 19318612677120.0, "grad_norm": 2.133044709885436, "language_loss": 0.65049422, "learning_rate": 3.6249651875809715e-06, "loss": 0.67292476, "num_input_tokens_seen": 39334275, "step": 1848, "time_per_iteration": 2.769829750061035 }, { "auxiliary_loss_clip": 0.01209071, "auxiliary_loss_mlp": 0.0103469, "balance_loss_clip": 1.0631088, "balance_loss_mlp": 1.02449203, "epoch": 0.2223291047916792, "flos": 19099342103040.0, "grad_norm": 2.3480158373134783, "language_loss": 0.89688057, "learning_rate": 3.62451093520323e-06, "loss": 0.9193182, "num_input_tokens_seen": 39352180, "step": 1849, "time_per_iteration": 2.8422608375549316 }, { "auxiliary_loss_clip": 0.0119959, "auxiliary_loss_mlp": 0.01030147, "balance_loss_clip": 1.06003892, "balance_loss_mlp": 1.02097952, "epoch": 0.22244934768231828, "flos": 20850418126080.0, "grad_norm": 2.2673758537360498, "language_loss": 0.90736133, "learning_rate": 3.6240564363884714e-06, "loss": 0.92965877, "num_input_tokens_seen": 39372125, "step": 1850, "time_per_iteration": 2.9449260234832764 }, { "auxiliary_loss_clip": 0.01222204, "auxiliary_loss_mlp": 0.01033917, "balance_loss_clip": 1.06235623, "balance_loss_mlp": 1.02325928, "epoch": 0.2225695905729574, "flos": 15632921111040.0, "grad_norm": 2.047213526013096, "language_loss": 0.70157045, "learning_rate": 3.623601691205643e-06, "loss": 0.72413164, "num_input_tokens_seen": 39391200, "step": 1851, "time_per_iteration": 2.8378546237945557 }, { "auxiliary_loss_clip": 0.01213793, "auxiliary_loss_mlp": 0.01032844, "balance_loss_clip": 1.06125498, "balance_loss_mlp": 1.02284873, "epoch": 0.22268983346359647, "flos": 25373582265600.0, "grad_norm": 2.851961213812617, "language_loss": 0.81930137, "learning_rate": 3.623146699723729e-06, "loss": 0.84176773, "num_input_tokens_seen": 39410660, "step": 1852, "time_per_iteration": 2.8170382976531982 }, { "auxiliary_loss_clip": 0.01208916, "auxiliary_loss_mlp": 0.01032495, "balance_loss_clip": 1.06303132, "balance_loss_mlp": 1.02295256, "epoch": 0.22281007635423555, "flos": 13261452359040.0, "grad_norm": 1.9621121500786909, "language_loss": 0.77525717, "learning_rate": 3.6226914620117507e-06, "loss": 0.79767132, "num_input_tokens_seen": 39429280, "step": 1853, "time_per_iteration": 2.790602684020996 }, { "auxiliary_loss_clip": 0.01208969, "auxiliary_loss_mlp": 0.01034911, "balance_loss_clip": 1.05906153, "balance_loss_mlp": 1.02559507, "epoch": 0.22293031924487464, "flos": 15340536403200.0, "grad_norm": 2.4614724350102115, "language_loss": 0.80898988, "learning_rate": 3.622235978138768e-06, "loss": 0.83142871, "num_input_tokens_seen": 39446905, "step": 1854, "time_per_iteration": 2.8666250705718994 }, { "auxiliary_loss_clip": 0.0121516, "auxiliary_loss_mlp": 0.01035645, "balance_loss_clip": 1.06300211, "balance_loss_mlp": 1.02616227, "epoch": 0.22305056213551375, "flos": 22564649773440.0, "grad_norm": 2.247119725516732, "language_loss": 0.812693, "learning_rate": 3.621780248173877e-06, "loss": 0.83520103, "num_input_tokens_seen": 39465105, "step": 1855, "time_per_iteration": 2.8779852390289307 }, { "auxiliary_loss_clip": 0.01114104, "auxiliary_loss_mlp": 0.01006622, "balance_loss_clip": 1.02527571, "balance_loss_mlp": 1.00461888, "epoch": 0.22317080502615283, "flos": 64880419887360.0, "grad_norm": 0.8328906966958178, "language_loss": 0.61041403, "learning_rate": 3.6213242721862125e-06, "loss": 0.6316213, "num_input_tokens_seen": 39523560, "step": 1856, "time_per_iteration": 3.5153443813323975 }, { "auxiliary_loss_clip": 0.01204537, "auxiliary_loss_mlp": 0.01036066, "balance_loss_clip": 1.06408393, "balance_loss_mlp": 1.02528393, "epoch": 0.2232910479167919, "flos": 25775997310080.0, "grad_norm": 1.9384864814684966, "language_loss": 0.75077748, "learning_rate": 3.620868050244945e-06, "loss": 0.77318352, "num_input_tokens_seen": 39544040, "step": 1857, "time_per_iteration": 2.822598695755005 }, { "auxiliary_loss_clip": 0.01208519, "auxiliary_loss_mlp": 0.01029987, "balance_loss_clip": 1.06187367, "balance_loss_mlp": 1.01899552, "epoch": 0.22341129080743102, "flos": 23251799928960.0, "grad_norm": 2.277570982039106, "language_loss": 0.7761929, "learning_rate": 3.6204115824192817e-06, "loss": 0.79857802, "num_input_tokens_seen": 39561515, "step": 1858, "time_per_iteration": 2.907454490661621 }, { "auxiliary_loss_clip": 0.01205133, "auxiliary_loss_mlp": 0.01039865, "balance_loss_clip": 1.06052756, "balance_loss_mlp": 1.03000093, "epoch": 0.2235315336980701, "flos": 21214552250880.0, "grad_norm": 2.8450960199169946, "language_loss": 0.76732242, "learning_rate": 3.619954868778471e-06, "loss": 0.78977239, "num_input_tokens_seen": 39578210, "step": 1859, "time_per_iteration": 2.965156316757202 }, { "auxiliary_loss_clip": 0.01212852, "auxiliary_loss_mlp": 0.01040016, "balance_loss_clip": 1.06319308, "balance_loss_mlp": 1.02972269, "epoch": 0.2236517765887092, "flos": 19901945548800.0, "grad_norm": 2.1968079038672954, "language_loss": 0.83149749, "learning_rate": 3.6194979093917944e-06, "loss": 0.8540262, "num_input_tokens_seen": 39597625, "step": 1860, "time_per_iteration": 2.8006601333618164 }, { "auxiliary_loss_clip": 0.01205189, "auxiliary_loss_mlp": 0.01042184, "balance_loss_clip": 1.06200981, "balance_loss_mlp": 1.03264177, "epoch": 0.22377201947934827, "flos": 23214847812480.0, "grad_norm": 1.9263726785459299, "language_loss": 0.87113076, "learning_rate": 3.6190407043285724e-06, "loss": 0.89360452, "num_input_tokens_seen": 39615360, "step": 1861, "time_per_iteration": 2.7821595668792725 }, { "auxiliary_loss_clip": 0.01221103, "auxiliary_loss_mlp": 0.01039017, "balance_loss_clip": 1.06107569, "balance_loss_mlp": 1.02897406, "epoch": 0.22389226236998738, "flos": 26794244056320.0, "grad_norm": 2.052573835411191, "language_loss": 0.75858736, "learning_rate": 3.618583253658163e-06, "loss": 0.78118855, "num_input_tokens_seen": 39635460, "step": 1862, "time_per_iteration": 2.815534830093384 }, { "auxiliary_loss_clip": 0.01205844, "auxiliary_loss_mlp": 0.01066881, "balance_loss_clip": 1.06203282, "balance_loss_mlp": 1.02413726, "epoch": 0.22401250526062647, "flos": 24170359455360.0, "grad_norm": 2.053943088400196, "language_loss": 0.86649781, "learning_rate": 3.618125557449961e-06, "loss": 0.88922507, "num_input_tokens_seen": 39653515, "step": 1863, "time_per_iteration": 2.8790223598480225 }, { "auxiliary_loss_clip": 0.01212011, "auxiliary_loss_mlp": 0.01033243, "balance_loss_clip": 1.0599432, "balance_loss_mlp": 1.02328897, "epoch": 0.22413274815126555, "flos": 16759761649920.0, "grad_norm": 2.3052832199041116, "language_loss": 0.8296448, "learning_rate": 3.6176676157733983e-06, "loss": 0.85209733, "num_input_tokens_seen": 39668525, "step": 1864, "time_per_iteration": 3.656712055206299 }, { "auxiliary_loss_clip": 0.01205836, "auxiliary_loss_mlp": 0.01039192, "balance_loss_clip": 1.06471729, "balance_loss_mlp": 1.02851725, "epoch": 0.22425299104190466, "flos": 21360205900800.0, "grad_norm": 2.6361951043954766, "language_loss": 0.75978583, "learning_rate": 3.6172094286979443e-06, "loss": 0.7822361, "num_input_tokens_seen": 39685895, "step": 1865, "time_per_iteration": 2.915928840637207 }, { "auxiliary_loss_clip": 0.01208332, "auxiliary_loss_mlp": 0.01033093, "balance_loss_clip": 1.05968189, "balance_loss_mlp": 1.02293038, "epoch": 0.22437323393254374, "flos": 32165547108480.0, "grad_norm": 1.5646832793107168, "language_loss": 0.81125194, "learning_rate": 3.6167509962931064e-06, "loss": 0.83366621, "num_input_tokens_seen": 39711595, "step": 1866, "time_per_iteration": 5.069075584411621 }, { "auxiliary_loss_clip": 0.01209292, "auxiliary_loss_mlp": 0.01035302, "balance_loss_clip": 1.06320775, "balance_loss_mlp": 1.02433515, "epoch": 0.22449347682318282, "flos": 18002809664640.0, "grad_norm": 3.8960652187363136, "language_loss": 0.77364981, "learning_rate": 3.6162923186284276e-06, "loss": 0.79609585, "num_input_tokens_seen": 39727555, "step": 1867, "time_per_iteration": 3.847109794616699 }, { "auxiliary_loss_clip": 0.0121023, "auxiliary_loss_mlp": 0.01032397, "balance_loss_clip": 1.06080532, "balance_loss_mlp": 1.02223468, "epoch": 0.2246137197138219, "flos": 18697286194560.0, "grad_norm": 2.1945901353102504, "language_loss": 0.85585672, "learning_rate": 3.6158333957734888e-06, "loss": 0.87828302, "num_input_tokens_seen": 39746145, "step": 1868, "time_per_iteration": 2.767998695373535 }, { "auxiliary_loss_clip": 0.01211167, "auxiliary_loss_mlp": 0.01038372, "balance_loss_clip": 1.06017923, "balance_loss_mlp": 1.02876997, "epoch": 0.22473396260446102, "flos": 15590653781760.0, "grad_norm": 2.7061282755014235, "language_loss": 0.8305077, "learning_rate": 3.6153742277979088e-06, "loss": 0.85300308, "num_input_tokens_seen": 39763575, "step": 1869, "time_per_iteration": 2.777839422225952 }, { "auxiliary_loss_clip": 0.01211787, "auxiliary_loss_mlp": 0.01035058, "balance_loss_clip": 1.06099916, "balance_loss_mlp": 1.02567649, "epoch": 0.2248542054951001, "flos": 14465501182080.0, "grad_norm": 3.127307749344642, "language_loss": 0.78334105, "learning_rate": 3.6149148147713434e-06, "loss": 0.8058095, "num_input_tokens_seen": 39781810, "step": 1870, "time_per_iteration": 2.8505747318267822 }, { "auxiliary_loss_clip": 0.01219936, "auxiliary_loss_mlp": 0.01039455, "balance_loss_clip": 1.06412995, "balance_loss_mlp": 1.02967358, "epoch": 0.22497444838573918, "flos": 19243882431360.0, "grad_norm": 3.6954313263040834, "language_loss": 0.86432374, "learning_rate": 3.614455156763484e-06, "loss": 0.88691765, "num_input_tokens_seen": 39800115, "step": 1871, "time_per_iteration": 2.873762607574463 }, { "auxiliary_loss_clip": 0.01201008, "auxiliary_loss_mlp": 0.01036627, "balance_loss_clip": 1.06105447, "balance_loss_mlp": 1.02686369, "epoch": 0.2250946912763783, "flos": 16910299549440.0, "grad_norm": 2.069590789725167, "language_loss": 0.71294129, "learning_rate": 3.613995253844061e-06, "loss": 0.73531765, "num_input_tokens_seen": 39817795, "step": 1872, "time_per_iteration": 2.9200081825256348 }, { "auxiliary_loss_clip": 0.01207646, "auxiliary_loss_mlp": 0.01035811, "balance_loss_clip": 1.05823135, "balance_loss_mlp": 1.02555919, "epoch": 0.22521493416701738, "flos": 24681368292480.0, "grad_norm": 2.176973765254701, "language_loss": 0.8106187, "learning_rate": 3.6135351060828414e-06, "loss": 0.83305335, "num_input_tokens_seen": 39838270, "step": 1873, "time_per_iteration": 2.819871187210083 }, { "auxiliary_loss_clip": 0.01225898, "auxiliary_loss_mlp": 0.01029495, "balance_loss_clip": 1.06506562, "balance_loss_mlp": 1.01900446, "epoch": 0.22533517705765646, "flos": 17821963664640.0, "grad_norm": 2.3590551184098056, "language_loss": 0.69895059, "learning_rate": 3.6130747135496285e-06, "loss": 0.72150451, "num_input_tokens_seen": 39857270, "step": 1874, "time_per_iteration": 2.7636609077453613 }, { "auxiliary_loss_clip": 0.01218363, "auxiliary_loss_mlp": 0.01032857, "balance_loss_clip": 1.06134772, "balance_loss_mlp": 1.02233052, "epoch": 0.22545541994829554, "flos": 33691390899840.0, "grad_norm": 2.1347709398118027, "language_loss": 0.66024494, "learning_rate": 3.6126140763142646e-06, "loss": 0.68275714, "num_input_tokens_seen": 39882300, "step": 1875, "time_per_iteration": 2.8313539028167725 }, { "auxiliary_loss_clip": 0.01217566, "auxiliary_loss_mlp": 0.01035622, "balance_loss_clip": 1.06022906, "balance_loss_mlp": 1.02561486, "epoch": 0.22557566283893465, "flos": 19171594310400.0, "grad_norm": 2.9353188178386223, "language_loss": 0.86004853, "learning_rate": 3.6121531944466275e-06, "loss": 0.8825804, "num_input_tokens_seen": 39899625, "step": 1876, "time_per_iteration": 2.793774366378784 }, { "auxiliary_loss_clip": 0.01209993, "auxiliary_loss_mlp": 0.01032086, "balance_loss_clip": 1.06043959, "balance_loss_mlp": 1.02209663, "epoch": 0.22569590572957374, "flos": 20773281669120.0, "grad_norm": 3.54460484512041, "language_loss": 0.78141969, "learning_rate": 3.611692068016633e-06, "loss": 0.80384046, "num_input_tokens_seen": 39915955, "step": 1877, "time_per_iteration": 2.860092878341675 }, { "auxiliary_loss_clip": 0.01204891, "auxiliary_loss_mlp": 0.01034038, "balance_loss_clip": 1.06093621, "balance_loss_mlp": 1.02349412, "epoch": 0.22581614862021282, "flos": 18442715529600.0, "grad_norm": 14.196306437969003, "language_loss": 0.75186664, "learning_rate": 3.611230697094233e-06, "loss": 0.77425593, "num_input_tokens_seen": 39932655, "step": 1878, "time_per_iteration": 2.8713483810424805 }, { "auxiliary_loss_clip": 0.01213412, "auxiliary_loss_mlp": 0.01040304, "balance_loss_clip": 1.06076443, "balance_loss_mlp": 1.03030252, "epoch": 0.22593639151085193, "flos": 20048389297920.0, "grad_norm": 2.02271292496886, "language_loss": 0.8705135, "learning_rate": 3.6107690817494173e-06, "loss": 0.89305055, "num_input_tokens_seen": 39952875, "step": 1879, "time_per_iteration": 2.833364248275757 }, { "auxiliary_loss_clip": 0.01197053, "auxiliary_loss_mlp": 0.01031468, "balance_loss_clip": 1.06011391, "balance_loss_mlp": 1.02170444, "epoch": 0.226056634401491, "flos": 13115116350720.0, "grad_norm": 2.2430413960547173, "language_loss": 0.70899487, "learning_rate": 3.6103072220522117e-06, "loss": 0.73128003, "num_input_tokens_seen": 39968405, "step": 1880, "time_per_iteration": 2.818053960800171 }, { "auxiliary_loss_clip": 0.01211669, "auxiliary_loss_mlp": 0.01034692, "balance_loss_clip": 1.06072474, "balance_loss_mlp": 1.02492249, "epoch": 0.2261768772921301, "flos": 18988378012800.0, "grad_norm": 1.7290527956476305, "language_loss": 0.91692883, "learning_rate": 3.609845118072682e-06, "loss": 0.93939245, "num_input_tokens_seen": 39987075, "step": 1881, "time_per_iteration": 2.924936294555664 }, { "auxiliary_loss_clip": 0.01220651, "auxiliary_loss_mlp": 0.01063402, "balance_loss_clip": 1.0618794, "balance_loss_mlp": 1.02191842, "epoch": 0.2262971201827692, "flos": 19974054101760.0, "grad_norm": 1.7985024208141567, "language_loss": 0.79906005, "learning_rate": 3.6093827698809276e-06, "loss": 0.82190061, "num_input_tokens_seen": 40006175, "step": 1882, "time_per_iteration": 2.8728575706481934 }, { "auxiliary_loss_clip": 0.01212977, "auxiliary_loss_mlp": 0.01040385, "balance_loss_clip": 1.05829668, "balance_loss_mlp": 1.03012729, "epoch": 0.2264173630734083, "flos": 16654543735680.0, "grad_norm": 2.4303518961272523, "language_loss": 0.85019588, "learning_rate": 3.6089201775470864e-06, "loss": 0.87272948, "num_input_tokens_seen": 40021630, "step": 1883, "time_per_iteration": 2.819094657897949 }, { "auxiliary_loss_clip": 0.01193534, "auxiliary_loss_mlp": 0.01029755, "balance_loss_clip": 1.05864632, "balance_loss_mlp": 1.01973534, "epoch": 0.22653760596404737, "flos": 24389809597440.0, "grad_norm": 1.7014617467668367, "language_loss": 0.77453256, "learning_rate": 3.6084573411413334e-06, "loss": 0.79676545, "num_input_tokens_seen": 40041025, "step": 1884, "time_per_iteration": 2.893017530441284 }, { "auxiliary_loss_clip": 0.01206825, "auxiliary_loss_mlp": 0.01037225, "balance_loss_clip": 1.06301379, "balance_loss_mlp": 1.02565622, "epoch": 0.22665784885468646, "flos": 18332541538560.0, "grad_norm": 2.3426729958890866, "language_loss": 0.80889416, "learning_rate": 3.607994260733881e-06, "loss": 0.83133471, "num_input_tokens_seen": 40060265, "step": 1885, "time_per_iteration": 2.8866543769836426 }, { "auxiliary_loss_clip": 0.01203835, "auxiliary_loss_mlp": 0.01033151, "balance_loss_clip": 1.05896616, "balance_loss_mlp": 1.02393031, "epoch": 0.22677809174532557, "flos": 24058102475520.0, "grad_norm": 1.6149760390900723, "language_loss": 0.74491245, "learning_rate": 3.6075309363949776e-06, "loss": 0.76728237, "num_input_tokens_seen": 40079435, "step": 1886, "time_per_iteration": 2.9208948612213135 }, { "auxiliary_loss_clip": 0.01217177, "auxiliary_loss_mlp": 0.01033564, "balance_loss_clip": 1.06037426, "balance_loss_mlp": 1.0228467, "epoch": 0.22689833463596465, "flos": 20374242503040.0, "grad_norm": 1.9788737358240587, "language_loss": 0.81140041, "learning_rate": 3.6070673681949094e-06, "loss": 0.83390784, "num_input_tokens_seen": 40097800, "step": 1887, "time_per_iteration": 2.868058204650879 }, { "auxiliary_loss_clip": 0.01213571, "auxiliary_loss_mlp": 0.01068714, "balance_loss_clip": 1.06153131, "balance_loss_mlp": 1.02628827, "epoch": 0.22701857752660373, "flos": 30120398438400.0, "grad_norm": 1.6212655844169892, "language_loss": 0.81333804, "learning_rate": 3.606603556203999e-06, "loss": 0.8361609, "num_input_tokens_seen": 40122745, "step": 1888, "time_per_iteration": 3.018045663833618 }, { "auxiliary_loss_clip": 0.01217637, "auxiliary_loss_mlp": 0.01034652, "balance_loss_clip": 1.06000423, "balance_loss_mlp": 1.02466249, "epoch": 0.22713882041724284, "flos": 22492182084480.0, "grad_norm": 1.7428810950463687, "language_loss": 0.83377528, "learning_rate": 3.6061395004926066e-06, "loss": 0.85629815, "num_input_tokens_seen": 40141680, "step": 1889, "time_per_iteration": 2.9243898391723633 }, { "auxiliary_loss_clip": 0.01217574, "auxiliary_loss_mlp": 0.01035423, "balance_loss_clip": 1.05795574, "balance_loss_mlp": 1.02478933, "epoch": 0.22725906330788193, "flos": 20521548178560.0, "grad_norm": 3.5732164126796255, "language_loss": 0.84984076, "learning_rate": 3.605675201131129e-06, "loss": 0.87237072, "num_input_tokens_seen": 40160140, "step": 1890, "time_per_iteration": 3.658029079437256 }, { "auxiliary_loss_clip": 0.01218682, "auxiliary_loss_mlp": 0.01038141, "balance_loss_clip": 1.06239629, "balance_loss_mlp": 1.02818096, "epoch": 0.227379306198521, "flos": 18989922297600.0, "grad_norm": 2.1947226150618167, "language_loss": 0.7948705, "learning_rate": 3.60521065819e-06, "loss": 0.81743872, "num_input_tokens_seen": 40177450, "step": 1891, "time_per_iteration": 2.956021308898926 }, { "auxiliary_loss_clip": 0.01211998, "auxiliary_loss_mlp": 0.01036638, "balance_loss_clip": 1.05870628, "balance_loss_mlp": 1.02628458, "epoch": 0.2274995490891601, "flos": 21798351999360.0, "grad_norm": 1.8019052696624938, "language_loss": 0.87925124, "learning_rate": 3.60474587173969e-06, "loss": 0.90173763, "num_input_tokens_seen": 40195935, "step": 1892, "time_per_iteration": 3.796438455581665 }, { "auxiliary_loss_clip": 0.01212796, "auxiliary_loss_mlp": 0.01046556, "balance_loss_clip": 1.06092691, "balance_loss_mlp": 1.03616118, "epoch": 0.2276197919797992, "flos": 19058654972160.0, "grad_norm": 2.419991949798145, "language_loss": 0.84230268, "learning_rate": 3.6042808418507084e-06, "loss": 0.86489618, "num_input_tokens_seen": 40213620, "step": 1893, "time_per_iteration": 4.6840291023254395 }, { "auxiliary_loss_clip": 0.01214005, "auxiliary_loss_mlp": 0.01046202, "balance_loss_clip": 1.06178188, "balance_loss_mlp": 1.03525865, "epoch": 0.22774003487043828, "flos": 18806777827200.0, "grad_norm": 2.092349217194576, "language_loss": 0.76851773, "learning_rate": 3.6038155685935976e-06, "loss": 0.79111981, "num_input_tokens_seen": 40230190, "step": 1894, "time_per_iteration": 2.7233903408050537 }, { "auxiliary_loss_clip": 0.01211882, "auxiliary_loss_mlp": 0.01041112, "balance_loss_clip": 1.05998492, "balance_loss_mlp": 1.03096795, "epoch": 0.22786027776107737, "flos": 23002544476800.0, "grad_norm": 2.4733774617524014, "language_loss": 0.70383805, "learning_rate": 3.6033500520389404e-06, "loss": 0.72636801, "num_input_tokens_seen": 40246860, "step": 1895, "time_per_iteration": 2.800041675567627 }, { "auxiliary_loss_clip": 0.01108903, "auxiliary_loss_mlp": 0.0101181, "balance_loss_clip": 1.02428091, "balance_loss_mlp": 1.00972426, "epoch": 0.22798052065171648, "flos": 66706872600960.0, "grad_norm": 0.880193168430626, "language_loss": 0.64825106, "learning_rate": 3.6028842922573553e-06, "loss": 0.66945815, "num_input_tokens_seen": 40311005, "step": 1896, "time_per_iteration": 3.3939809799194336 }, { "auxiliary_loss_clip": 0.01110521, "auxiliary_loss_mlp": 0.01040292, "balance_loss_clip": 1.02358031, "balance_loss_mlp": 1.00415134, "epoch": 0.22810076354235556, "flos": 62080896758400.0, "grad_norm": 0.8569051434085891, "language_loss": 0.62957555, "learning_rate": 3.602418289319497e-06, "loss": 0.65108371, "num_input_tokens_seen": 40369560, "step": 1897, "time_per_iteration": 3.2991690635681152 }, { "auxiliary_loss_clip": 0.01206059, "auxiliary_loss_mlp": 0.01034257, "balance_loss_clip": 1.06438518, "balance_loss_mlp": 1.02462447, "epoch": 0.22822100643299464, "flos": 23876358635520.0, "grad_norm": 1.9257879695567999, "language_loss": 0.73320937, "learning_rate": 3.601952043296059e-06, "loss": 0.75561261, "num_input_tokens_seen": 40389555, "step": 1898, "time_per_iteration": 2.857922315597534 }, { "auxiliary_loss_clip": 0.01217849, "auxiliary_loss_mlp": 0.01033534, "balance_loss_clip": 1.06113601, "balance_loss_mlp": 1.02347255, "epoch": 0.22834124932363373, "flos": 20991331180800.0, "grad_norm": 2.308186834809963, "language_loss": 0.80947745, "learning_rate": 3.6014855542577696e-06, "loss": 0.83199131, "num_input_tokens_seen": 40406765, "step": 1899, "time_per_iteration": 2.7258517742156982 }, { "auxiliary_loss_clip": 0.01210721, "auxiliary_loss_mlp": 0.01031042, "balance_loss_clip": 1.06098592, "balance_loss_mlp": 1.02007484, "epoch": 0.22846149221427284, "flos": 24901572620160.0, "grad_norm": 1.7632542264082254, "language_loss": 0.84090418, "learning_rate": 3.6010188222753943e-06, "loss": 0.86332178, "num_input_tokens_seen": 40427535, "step": 1900, "time_per_iteration": 2.8569610118865967 }, { "auxiliary_loss_clip": 0.01103169, "auxiliary_loss_mlp": 0.01004612, "balance_loss_clip": 1.02284193, "balance_loss_mlp": 1.00271654, "epoch": 0.22858173510491192, "flos": 56132294319360.0, "grad_norm": 0.8996069446340567, "language_loss": 0.64157534, "learning_rate": 3.6005518474197372e-06, "loss": 0.66265321, "num_input_tokens_seen": 40479580, "step": 1901, "time_per_iteration": 3.2682416439056396 }, { "auxiliary_loss_clip": 0.01216523, "auxiliary_loss_mlp": 0.01034335, "balance_loss_clip": 1.06247997, "balance_loss_mlp": 1.02250385, "epoch": 0.228701977995551, "flos": 24170826332160.0, "grad_norm": 2.0259066742996317, "language_loss": 0.78625834, "learning_rate": 3.6000846297616373e-06, "loss": 0.80876696, "num_input_tokens_seen": 40497880, "step": 1902, "time_per_iteration": 2.9372119903564453 }, { "auxiliary_loss_clip": 0.01227565, "auxiliary_loss_mlp": 0.01034779, "balance_loss_clip": 1.06607985, "balance_loss_mlp": 1.02410412, "epoch": 0.22882222088619011, "flos": 21387892308480.0, "grad_norm": 3.6951819477846257, "language_loss": 0.72244006, "learning_rate": 3.5996171693719717e-06, "loss": 0.74506354, "num_input_tokens_seen": 40513975, "step": 1903, "time_per_iteration": 2.662292718887329 }, { "auxiliary_loss_clip": 0.0110659, "auxiliary_loss_mlp": 0.01002913, "balance_loss_clip": 1.020257, "balance_loss_mlp": 1.00093424, "epoch": 0.2289424637768292, "flos": 64589615377920.0, "grad_norm": 0.8485614232266915, "language_loss": 0.64801764, "learning_rate": 3.5991494663216528e-06, "loss": 0.66911268, "num_input_tokens_seen": 40576960, "step": 1904, "time_per_iteration": 3.377476930618286 }, { "auxiliary_loss_clip": 0.01219816, "auxiliary_loss_mlp": 0.01039188, "balance_loss_clip": 1.06180429, "balance_loss_mlp": 1.02894211, "epoch": 0.22906270666746828, "flos": 22163419877760.0, "grad_norm": 2.1444581646451337, "language_loss": 0.8737911, "learning_rate": 3.5986815206816314e-06, "loss": 0.89638114, "num_input_tokens_seen": 40595780, "step": 1905, "time_per_iteration": 2.8227367401123047 }, { "auxiliary_loss_clip": 0.01218255, "auxiliary_loss_mlp": 0.0103349, "balance_loss_clip": 1.06047893, "balance_loss_mlp": 1.02310085, "epoch": 0.2291829495581074, "flos": 25772334122880.0, "grad_norm": 3.992028341739906, "language_loss": 0.74643528, "learning_rate": 3.598213332522895e-06, "loss": 0.76895273, "num_input_tokens_seen": 40615810, "step": 1906, "time_per_iteration": 3.0314693450927734 }, { "auxiliary_loss_clip": 0.01216262, "auxiliary_loss_mlp": 0.01035408, "balance_loss_clip": 1.06307411, "balance_loss_mlp": 1.02565718, "epoch": 0.22930319244874647, "flos": 31172760126720.0, "grad_norm": 1.801353232316309, "language_loss": 0.77494568, "learning_rate": 3.597744901916466e-06, "loss": 0.79746234, "num_input_tokens_seen": 40637095, "step": 1907, "time_per_iteration": 2.9821009635925293 }, { "auxiliary_loss_clip": 0.01218975, "auxiliary_loss_mlp": 0.01042266, "balance_loss_clip": 1.0571965, "balance_loss_mlp": 1.03146565, "epoch": 0.22942343533938556, "flos": 23254098399360.0, "grad_norm": 4.113810901471038, "language_loss": 0.76669818, "learning_rate": 3.5972762289334058e-06, "loss": 0.78931057, "num_input_tokens_seen": 40656725, "step": 1908, "time_per_iteration": 2.85280442237854 }, { "auxiliary_loss_clip": 0.01197216, "auxiliary_loss_mlp": 0.01034993, "balance_loss_clip": 1.06183875, "balance_loss_mlp": 1.02503288, "epoch": 0.22954367823002464, "flos": 14610903436800.0, "grad_norm": 1.8017449238488303, "language_loss": 0.84961295, "learning_rate": 3.5968073136448116e-06, "loss": 0.87193501, "num_input_tokens_seen": 40674745, "step": 1909, "time_per_iteration": 2.9633474349975586 }, { "auxiliary_loss_clip": 0.01218556, "auxiliary_loss_mlp": 0.01032551, "balance_loss_clip": 1.06118703, "balance_loss_mlp": 1.02062428, "epoch": 0.22966392112066375, "flos": 16763604405120.0, "grad_norm": 1.838539763582809, "language_loss": 0.91175216, "learning_rate": 3.596338156121818e-06, "loss": 0.93426323, "num_input_tokens_seen": 40693630, "step": 1910, "time_per_iteration": 2.8614609241485596 }, { "auxiliary_loss_clip": 0.0110148, "auxiliary_loss_mlp": 0.01001484, "balance_loss_clip": 1.0192945, "balance_loss_mlp": 0.99962395, "epoch": 0.22978416401130283, "flos": 67474247783040.0, "grad_norm": 0.7416748801144013, "language_loss": 0.59324437, "learning_rate": 3.595868756435595e-06, "loss": 0.61427402, "num_input_tokens_seen": 40761310, "step": 1911, "time_per_iteration": 3.550468921661377 }, { "auxiliary_loss_clip": 0.01213536, "auxiliary_loss_mlp": 0.01038283, "balance_loss_clip": 1.06518865, "balance_loss_mlp": 1.02728021, "epoch": 0.22990440690194192, "flos": 19865137086720.0, "grad_norm": 2.0996563034829183, "language_loss": 0.80323672, "learning_rate": 3.5953991146573504e-06, "loss": 0.825755, "num_input_tokens_seen": 40779955, "step": 1912, "time_per_iteration": 2.8873727321624756 }, { "auxiliary_loss_clip": 0.01220983, "auxiliary_loss_mlp": 0.01043894, "balance_loss_clip": 1.06171405, "balance_loss_mlp": 1.0329746, "epoch": 0.23002464979258103, "flos": 13289246507520.0, "grad_norm": 2.7241909676215097, "language_loss": 0.834602, "learning_rate": 3.5949292308583294e-06, "loss": 0.85725069, "num_input_tokens_seen": 40793200, "step": 1913, "time_per_iteration": 2.8569653034210205 }, { "auxiliary_loss_clip": 0.01219568, "auxiliary_loss_mlp": 0.01037962, "balance_loss_clip": 1.06223559, "balance_loss_mlp": 1.02757871, "epoch": 0.2301448926832201, "flos": 22163779013760.0, "grad_norm": 2.2122342582837735, "language_loss": 0.81210792, "learning_rate": 3.594459105109811e-06, "loss": 0.83468324, "num_input_tokens_seen": 40812380, "step": 1914, "time_per_iteration": 2.858181953430176 }, { "auxiliary_loss_clip": 0.01216598, "auxiliary_loss_mlp": 0.01036722, "balance_loss_clip": 1.06198907, "balance_loss_mlp": 1.0271914, "epoch": 0.2302651355738592, "flos": 20704477167360.0, "grad_norm": 1.7804956797746634, "language_loss": 0.81133491, "learning_rate": 3.593988737483115e-06, "loss": 0.83386815, "num_input_tokens_seen": 40832320, "step": 1915, "time_per_iteration": 2.9304795265197754 }, { "auxiliary_loss_clip": 0.01211762, "auxiliary_loss_mlp": 0.01033081, "balance_loss_clip": 1.059847, "balance_loss_mlp": 1.02254295, "epoch": 0.23038537846449827, "flos": 18588943797120.0, "grad_norm": 2.2988243723696593, "language_loss": 0.78138316, "learning_rate": 3.5935181280495947e-06, "loss": 0.80383164, "num_input_tokens_seen": 40850900, "step": 1916, "time_per_iteration": 3.7256855964660645 }, { "auxiliary_loss_clip": 0.01098603, "auxiliary_loss_mlp": 0.01001383, "balance_loss_clip": 1.01896012, "balance_loss_mlp": 0.99970192, "epoch": 0.23050562135513739, "flos": 64224260190720.0, "grad_norm": 1.6504294250335843, "language_loss": 0.54258358, "learning_rate": 3.5930472768806412e-06, "loss": 0.56358337, "num_input_tokens_seen": 40909570, "step": 1917, "time_per_iteration": 3.3911423683166504 }, { "auxiliary_loss_clip": 0.01218051, "auxiliary_loss_mlp": 0.01039195, "balance_loss_clip": 1.06287718, "balance_loss_mlp": 1.02875781, "epoch": 0.23062586424577647, "flos": 17313396952320.0, "grad_norm": 2.0495266302826236, "language_loss": 0.77068347, "learning_rate": 3.5925761840476826e-06, "loss": 0.79325593, "num_input_tokens_seen": 40928180, "step": 1918, "time_per_iteration": 5.296905994415283 }, { "auxiliary_loss_clip": 0.01207667, "auxiliary_loss_mlp": 0.01039909, "balance_loss_clip": 1.06322443, "balance_loss_mlp": 1.02945995, "epoch": 0.23074610713641555, "flos": 27855979194240.0, "grad_norm": 2.1656237944444645, "language_loss": 0.81614834, "learning_rate": 3.592104849622183e-06, "loss": 0.83862412, "num_input_tokens_seen": 40950435, "step": 1919, "time_per_iteration": 2.9399900436401367 }, { "auxiliary_loss_clip": 0.01196516, "auxiliary_loss_mlp": 0.01038223, "balance_loss_clip": 1.06229532, "balance_loss_mlp": 1.02776194, "epoch": 0.23086635002705466, "flos": 28841798937600.0, "grad_norm": 2.1375725942812496, "language_loss": 0.73273826, "learning_rate": 3.591633273675644e-06, "loss": 0.75508571, "num_input_tokens_seen": 40972670, "step": 1920, "time_per_iteration": 3.9375438690185547 }, { "auxiliary_loss_clip": 0.01091728, "auxiliary_loss_mlp": 0.01004275, "balance_loss_clip": 1.02444863, "balance_loss_mlp": 1.00217664, "epoch": 0.23098659291769374, "flos": 62923681566720.0, "grad_norm": 0.9074420799164824, "language_loss": 0.58179891, "learning_rate": 3.591161456279602e-06, "loss": 0.60275894, "num_input_tokens_seen": 41018215, "step": 1921, "time_per_iteration": 3.1961545944213867 }, { "auxiliary_loss_clip": 0.01214138, "auxiliary_loss_mlp": 0.0103511, "balance_loss_clip": 1.06023753, "balance_loss_mlp": 1.02460122, "epoch": 0.23110683580833283, "flos": 23476816679040.0, "grad_norm": 1.51298792725626, "language_loss": 0.80325294, "learning_rate": 3.590689397505633e-06, "loss": 0.82574546, "num_input_tokens_seen": 41039125, "step": 1922, "time_per_iteration": 2.865826368331909 }, { "auxiliary_loss_clip": 0.01213971, "auxiliary_loss_mlp": 0.01032592, "balance_loss_clip": 1.06029058, "balance_loss_mlp": 1.02270365, "epoch": 0.2312270786989719, "flos": 27271066124160.0, "grad_norm": 1.8568621246819832, "language_loss": 0.86682165, "learning_rate": 3.590217097425347e-06, "loss": 0.88928729, "num_input_tokens_seen": 41059025, "step": 1923, "time_per_iteration": 2.786952257156372 }, { "auxiliary_loss_clip": 0.01222212, "auxiliary_loss_mlp": 0.01033242, "balance_loss_clip": 1.06378126, "balance_loss_mlp": 1.02284133, "epoch": 0.23134732158961102, "flos": 13261344618240.0, "grad_norm": 2.052241985370075, "language_loss": 0.70995653, "learning_rate": 3.589744556110391e-06, "loss": 0.73251116, "num_input_tokens_seen": 41077015, "step": 1924, "time_per_iteration": 2.788850784301758 }, { "auxiliary_loss_clip": 0.01209725, "auxiliary_loss_mlp": 0.01036773, "balance_loss_clip": 1.06209064, "balance_loss_mlp": 1.02577639, "epoch": 0.2314675644802501, "flos": 36977648250240.0, "grad_norm": 2.0243661717483707, "language_loss": 0.8439759, "learning_rate": 3.58927177363245e-06, "loss": 0.86644083, "num_input_tokens_seen": 41099840, "step": 1925, "time_per_iteration": 2.8689422607421875 }, { "auxiliary_loss_clip": 0.01203544, "auxiliary_loss_mlp": 0.01039367, "balance_loss_clip": 1.06089664, "balance_loss_mlp": 1.02841163, "epoch": 0.2315878073708892, "flos": 23842207779840.0, "grad_norm": 2.045308201225201, "language_loss": 0.72583079, "learning_rate": 3.5887987500632447e-06, "loss": 0.7482599, "num_input_tokens_seen": 41117845, "step": 1926, "time_per_iteration": 2.8089308738708496 }, { "auxiliary_loss_clip": 0.01212506, "auxiliary_loss_mlp": 0.01036129, "balance_loss_clip": 1.06232345, "balance_loss_mlp": 1.02630007, "epoch": 0.2317080502615283, "flos": 23039424766080.0, "grad_norm": 1.9791322291100153, "language_loss": 0.84459615, "learning_rate": 3.5883254854745325e-06, "loss": 0.86708248, "num_input_tokens_seen": 41136235, "step": 1927, "time_per_iteration": 2.9154109954833984 }, { "auxiliary_loss_clip": 0.01219049, "auxiliary_loss_mlp": 0.01032776, "balance_loss_clip": 1.06087232, "balance_loss_mlp": 1.02159464, "epoch": 0.23182829315216738, "flos": 11254656435840.0, "grad_norm": 2.2292133393915052, "language_loss": 0.75492972, "learning_rate": 3.587851979938107e-06, "loss": 0.77744794, "num_input_tokens_seen": 41153125, "step": 1928, "time_per_iteration": 2.893308401107788 }, { "auxiliary_loss_clip": 0.01209993, "auxiliary_loss_mlp": 0.01038231, "balance_loss_clip": 1.059237, "balance_loss_mlp": 1.02786028, "epoch": 0.23194853604280646, "flos": 19828939155840.0, "grad_norm": 2.014476928494808, "language_loss": 0.78132588, "learning_rate": 3.5873782335257985e-06, "loss": 0.80380809, "num_input_tokens_seen": 41171290, "step": 1929, "time_per_iteration": 2.827355146408081 }, { "auxiliary_loss_clip": 0.01207754, "auxiliary_loss_mlp": 0.01031696, "balance_loss_clip": 1.06347859, "balance_loss_mlp": 1.02041841, "epoch": 0.23206877893344555, "flos": 15305020830720.0, "grad_norm": 2.167084934251934, "language_loss": 0.78382897, "learning_rate": 3.5869042463094744e-06, "loss": 0.80622345, "num_input_tokens_seen": 41189005, "step": 1930, "time_per_iteration": 2.918362855911255 }, { "auxiliary_loss_clip": 0.01189219, "auxiliary_loss_mlp": 0.0104039, "balance_loss_clip": 1.060853, "balance_loss_mlp": 1.02944064, "epoch": 0.23218902182408466, "flos": 22711488572160.0, "grad_norm": 1.9715097341560628, "language_loss": 0.77271807, "learning_rate": 3.586430018361038e-06, "loss": 0.7950142, "num_input_tokens_seen": 41208775, "step": 1931, "time_per_iteration": 2.9284329414367676 }, { "auxiliary_loss_clip": 0.01202586, "auxiliary_loss_mlp": 0.01035284, "balance_loss_clip": 1.0627923, "balance_loss_mlp": 1.02459121, "epoch": 0.23230926471472374, "flos": 22710734386560.0, "grad_norm": 2.1440918202355874, "language_loss": 0.76307166, "learning_rate": 3.5859555497524283e-06, "loss": 0.78545034, "num_input_tokens_seen": 41226010, "step": 1932, "time_per_iteration": 3.015392303466797 }, { "auxiliary_loss_clip": 0.01214842, "auxiliary_loss_mlp": 0.01036578, "balance_loss_clip": 1.06281066, "balance_loss_mlp": 1.02606392, "epoch": 0.23242950760536282, "flos": 20375499479040.0, "grad_norm": 2.181831871405265, "language_loss": 0.92388064, "learning_rate": 3.5854808405556237e-06, "loss": 0.9463948, "num_input_tokens_seen": 41245245, "step": 1933, "time_per_iteration": 2.8636162281036377 }, { "auxiliary_loss_clip": 0.01206427, "auxiliary_loss_mlp": 0.01034654, "balance_loss_clip": 1.05975962, "balance_loss_mlp": 1.02481937, "epoch": 0.23254975049600193, "flos": 16908324301440.0, "grad_norm": 2.1569489889767275, "language_loss": 0.75306123, "learning_rate": 3.5850058908426355e-06, "loss": 0.77547204, "num_input_tokens_seen": 41263795, "step": 1934, "time_per_iteration": 2.8697872161865234 }, { "auxiliary_loss_clip": 0.01215466, "auxiliary_loss_mlp": 0.01036352, "balance_loss_clip": 1.06151855, "balance_loss_mlp": 1.02622521, "epoch": 0.23266999338664102, "flos": 23294821443840.0, "grad_norm": 2.3048609116236634, "language_loss": 0.85515511, "learning_rate": 3.584530700685514e-06, "loss": 0.87767327, "num_input_tokens_seen": 41284055, "step": 1935, "time_per_iteration": 2.85498046875 }, { "auxiliary_loss_clip": 0.01204316, "auxiliary_loss_mlp": 0.01043384, "balance_loss_clip": 1.06254983, "balance_loss_mlp": 1.03289986, "epoch": 0.2327902362772801, "flos": 19569987031680.0, "grad_norm": 2.3503445149249518, "language_loss": 0.8885954, "learning_rate": 3.5840552701563448e-06, "loss": 0.91107249, "num_input_tokens_seen": 41300255, "step": 1936, "time_per_iteration": 2.7488510608673096 }, { "auxiliary_loss_clip": 0.01214281, "auxiliary_loss_mlp": 0.01039782, "balance_loss_clip": 1.05986667, "balance_loss_mlp": 1.03044212, "epoch": 0.2329104791679192, "flos": 16727514215040.0, "grad_norm": 2.288708555606501, "language_loss": 0.81661141, "learning_rate": 3.5835795993272513e-06, "loss": 0.83915198, "num_input_tokens_seen": 41318540, "step": 1937, "time_per_iteration": 2.6487796306610107 }, { "auxiliary_loss_clip": 0.0119042, "auxiliary_loss_mlp": 0.01036528, "balance_loss_clip": 1.06285131, "balance_loss_mlp": 1.02640104, "epoch": 0.2330307220585583, "flos": 22163743100160.0, "grad_norm": 2.0522193876252355, "language_loss": 0.71036321, "learning_rate": 3.583103688270391e-06, "loss": 0.7326327, "num_input_tokens_seen": 41338320, "step": 1938, "time_per_iteration": 2.8676297664642334 }, { "auxiliary_loss_clip": 0.01198879, "auxiliary_loss_mlp": 0.01036491, "balance_loss_clip": 1.06286311, "balance_loss_mlp": 1.02657866, "epoch": 0.23315096494919738, "flos": 19317319787520.0, "grad_norm": 2.16219242238128, "language_loss": 0.89633441, "learning_rate": 3.58262753705796e-06, "loss": 0.91868812, "num_input_tokens_seen": 41353210, "step": 1939, "time_per_iteration": 2.9002859592437744 }, { "auxiliary_loss_clip": 0.01091338, "auxiliary_loss_mlp": 0.01017707, "balance_loss_clip": 1.01500225, "balance_loss_mlp": 1.01587152, "epoch": 0.23327120783983646, "flos": 53031048946560.0, "grad_norm": 0.861937242656712, "language_loss": 0.55532867, "learning_rate": 3.5821511457621902e-06, "loss": 0.57641912, "num_input_tokens_seen": 41410510, "step": 1940, "time_per_iteration": 3.3002727031707764 }, { "auxiliary_loss_clip": 0.0120408, "auxiliary_loss_mlp": 0.01040294, "balance_loss_clip": 1.06180561, "balance_loss_mlp": 1.02935696, "epoch": 0.23339145073047557, "flos": 17126984344320.0, "grad_norm": 3.5955425161102887, "language_loss": 0.81350732, "learning_rate": 3.5816745144553497e-06, "loss": 0.83595109, "num_input_tokens_seen": 41425830, "step": 1941, "time_per_iteration": 2.8120434284210205 }, { "auxiliary_loss_clip": 0.01192216, "auxiliary_loss_mlp": 0.01037487, "balance_loss_clip": 1.06035817, "balance_loss_mlp": 1.02751493, "epoch": 0.23351169362111465, "flos": 13078918419840.0, "grad_norm": 2.0339513864943384, "language_loss": 0.75541908, "learning_rate": 3.5811976432097424e-06, "loss": 0.7777161, "num_input_tokens_seen": 41443500, "step": 1942, "time_per_iteration": 3.8515827655792236 }, { "auxiliary_loss_clip": 0.01214143, "auxiliary_loss_mlp": 0.0107167, "balance_loss_clip": 1.06381178, "balance_loss_mlp": 1.02645469, "epoch": 0.23363193651175373, "flos": 15851257931520.0, "grad_norm": 2.2409453633309506, "language_loss": 0.84794664, "learning_rate": 3.58072053209771e-06, "loss": 0.87080479, "num_input_tokens_seen": 41460055, "step": 1943, "time_per_iteration": 2.8943142890930176 }, { "auxiliary_loss_clip": 0.01203721, "auxiliary_loss_mlp": 0.01035186, "balance_loss_clip": 1.0604471, "balance_loss_mlp": 1.02406991, "epoch": 0.23375217940239285, "flos": 21025769345280.0, "grad_norm": 2.8705244584603906, "language_loss": 0.79044127, "learning_rate": 3.5802431811916296e-06, "loss": 0.81283039, "num_input_tokens_seen": 41476665, "step": 1944, "time_per_iteration": 4.306621313095093 }, { "auxiliary_loss_clip": 0.01206325, "auxiliary_loss_mlp": 0.01033342, "balance_loss_clip": 1.06049633, "balance_loss_mlp": 1.02307796, "epoch": 0.23387242229303193, "flos": 20594698225920.0, "grad_norm": 1.7162135610221203, "language_loss": 0.80467749, "learning_rate": 3.579765590563916e-06, "loss": 0.82707417, "num_input_tokens_seen": 41496065, "step": 1945, "time_per_iteration": 3.9056146144866943 }, { "auxiliary_loss_clip": 0.01204933, "auxiliary_loss_mlp": 0.01039391, "balance_loss_clip": 1.05986226, "balance_loss_mlp": 1.02966976, "epoch": 0.233992665183671, "flos": 24279491952000.0, "grad_norm": 8.310361091273577, "language_loss": 0.81545144, "learning_rate": 3.579287760287017e-06, "loss": 0.83789468, "num_input_tokens_seen": 41516815, "step": 1946, "time_per_iteration": 3.8420145511627197 }, { "auxiliary_loss_clip": 0.0120919, "auxiliary_loss_mlp": 0.01032686, "balance_loss_clip": 1.06105089, "balance_loss_mlp": 1.02258277, "epoch": 0.2341129080743101, "flos": 30154621121280.0, "grad_norm": 2.4218438401681315, "language_loss": 0.72915375, "learning_rate": 3.578809690433421e-06, "loss": 0.75157249, "num_input_tokens_seen": 41538525, "step": 1947, "time_per_iteration": 2.9432239532470703 }, { "auxiliary_loss_clip": 0.01219561, "auxiliary_loss_mlp": 0.01041993, "balance_loss_clip": 1.06218362, "balance_loss_mlp": 1.03121662, "epoch": 0.2342331509649492, "flos": 22784135829120.0, "grad_norm": 2.069363760510242, "language_loss": 0.81477755, "learning_rate": 3.578331381075651e-06, "loss": 0.83739305, "num_input_tokens_seen": 41559025, "step": 1948, "time_per_iteration": 2.874664306640625 }, { "auxiliary_loss_clip": 0.01212668, "auxiliary_loss_mlp": 0.01035462, "balance_loss_clip": 1.06059456, "balance_loss_mlp": 1.02494812, "epoch": 0.2343533938555883, "flos": 23623152687360.0, "grad_norm": 2.2526093908670193, "language_loss": 0.69567597, "learning_rate": 3.5778528322862646e-06, "loss": 0.71815729, "num_input_tokens_seen": 41577845, "step": 1949, "time_per_iteration": 2.7719500064849854 }, { "auxiliary_loss_clip": 0.01213993, "auxiliary_loss_mlp": 0.0103119, "balance_loss_clip": 1.05930996, "balance_loss_mlp": 1.02087855, "epoch": 0.23447363674622737, "flos": 24570332375040.0, "grad_norm": 1.5011625291824968, "language_loss": 0.86629349, "learning_rate": 3.5773740441378585e-06, "loss": 0.88874531, "num_input_tokens_seen": 41598600, "step": 1950, "time_per_iteration": 2.7063710689544678 }, { "auxiliary_loss_clip": 0.01209201, "auxiliary_loss_mlp": 0.01033102, "balance_loss_clip": 1.05935431, "balance_loss_mlp": 1.02363038, "epoch": 0.23459387963686648, "flos": 53140322119680.0, "grad_norm": 1.672945076527158, "language_loss": 0.73713362, "learning_rate": 3.5768950167030633e-06, "loss": 0.75955665, "num_input_tokens_seen": 41623300, "step": 1951, "time_per_iteration": 2.9041671752929688 }, { "auxiliary_loss_clip": 0.01200077, "auxiliary_loss_mlp": 0.01034143, "balance_loss_clip": 1.06075823, "balance_loss_mlp": 1.02354503, "epoch": 0.23471412252750556, "flos": 23951412103680.0, "grad_norm": 2.3179115881556895, "language_loss": 0.78614771, "learning_rate": 3.576415750054548e-06, "loss": 0.80848992, "num_input_tokens_seen": 41643420, "step": 1952, "time_per_iteration": 2.666114568710327 }, { "auxiliary_loss_clip": 0.01201272, "auxiliary_loss_mlp": 0.01034594, "balance_loss_clip": 1.06046772, "balance_loss_mlp": 1.0250392, "epoch": 0.23483436541814465, "flos": 15706573948800.0, "grad_norm": 1.8694059009874004, "language_loss": 0.86021078, "learning_rate": 3.5759362442650172e-06, "loss": 0.88256943, "num_input_tokens_seen": 41660170, "step": 1953, "time_per_iteration": 2.5603065490722656 }, { "auxiliary_loss_clip": 0.01209633, "auxiliary_loss_mlp": 0.01038531, "balance_loss_clip": 1.06047702, "balance_loss_mlp": 1.02888715, "epoch": 0.23495460830878373, "flos": 24936262179840.0, "grad_norm": 2.4606904104529983, "language_loss": 0.85177112, "learning_rate": 3.5754564994072113e-06, "loss": 0.8742528, "num_input_tokens_seen": 41679010, "step": 1954, "time_per_iteration": 2.6796329021453857 }, { "auxiliary_loss_clip": 0.01203054, "auxiliary_loss_mlp": 0.01032959, "balance_loss_clip": 1.05874491, "balance_loss_mlp": 1.02301729, "epoch": 0.23507485119942284, "flos": 30482665056000.0, "grad_norm": 2.3535290289832376, "language_loss": 0.60049152, "learning_rate": 3.5749765155539067e-06, "loss": 0.62285167, "num_input_tokens_seen": 41699495, "step": 1955, "time_per_iteration": 2.672914743423462 }, { "auxiliary_loss_clip": 0.01200941, "auxiliary_loss_mlp": 0.01039023, "balance_loss_clip": 1.06061745, "balance_loss_mlp": 1.02880716, "epoch": 0.23519509409006192, "flos": 18329129746560.0, "grad_norm": 2.210217256110073, "language_loss": 0.92585468, "learning_rate": 3.574496292777917e-06, "loss": 0.94825435, "num_input_tokens_seen": 41717705, "step": 1956, "time_per_iteration": 2.6917128562927246 }, { "auxiliary_loss_clip": 0.01211868, "auxiliary_loss_mlp": 0.01038033, "balance_loss_clip": 1.06115794, "balance_loss_mlp": 1.02690458, "epoch": 0.235315336980701, "flos": 29643217234560.0, "grad_norm": 2.548570796401478, "language_loss": 0.71769404, "learning_rate": 3.574015831152092e-06, "loss": 0.74019301, "num_input_tokens_seen": 41738120, "step": 1957, "time_per_iteration": 2.6959028244018555 }, { "auxiliary_loss_clip": 0.01198357, "auxiliary_loss_mlp": 0.01033699, "balance_loss_clip": 1.06062102, "balance_loss_mlp": 1.02381015, "epoch": 0.23543557987134012, "flos": 18551704371840.0, "grad_norm": 2.166461431305008, "language_loss": 0.83818734, "learning_rate": 3.573535130749316e-06, "loss": 0.86050785, "num_input_tokens_seen": 41756070, "step": 1958, "time_per_iteration": 2.9747066497802734 }, { "auxiliary_loss_clip": 0.01201827, "auxiliary_loss_mlp": 0.01033587, "balance_loss_clip": 1.06266356, "balance_loss_mlp": 1.02405, "epoch": 0.2355558227619792, "flos": 24679033908480.0, "grad_norm": 1.707766518803487, "language_loss": 0.74268889, "learning_rate": 3.5730541916425127e-06, "loss": 0.76504302, "num_input_tokens_seen": 41777550, "step": 1959, "time_per_iteration": 2.9125545024871826 }, { "auxiliary_loss_clip": 0.01206357, "auxiliary_loss_mlp": 0.01032439, "balance_loss_clip": 1.06111503, "balance_loss_mlp": 1.02218652, "epoch": 0.23567606565261828, "flos": 21944795748480.0, "grad_norm": 1.8807082613221209, "language_loss": 0.86417186, "learning_rate": 3.572573013904639e-06, "loss": 0.88655984, "num_input_tokens_seen": 41797460, "step": 1960, "time_per_iteration": 3.1079866886138916 }, { "auxiliary_loss_clip": 0.01212821, "auxiliary_loss_mlp": 0.01036565, "balance_loss_clip": 1.05931425, "balance_loss_mlp": 1.02602673, "epoch": 0.2357963085432574, "flos": 13589352639360.0, "grad_norm": 1.8169726169651106, "language_loss": 0.92354012, "learning_rate": 3.572091597608689e-06, "loss": 0.94603401, "num_input_tokens_seen": 41815585, "step": 1961, "time_per_iteration": 3.0785322189331055 }, { "auxiliary_loss_clip": 0.01212904, "auxiliary_loss_mlp": 0.01036788, "balance_loss_clip": 1.06131589, "balance_loss_mlp": 1.02699542, "epoch": 0.23591655143389648, "flos": 22088689632000.0, "grad_norm": 2.0879128891528858, "language_loss": 0.73457766, "learning_rate": 3.571609942827694e-06, "loss": 0.75707465, "num_input_tokens_seen": 41834700, "step": 1962, "time_per_iteration": 2.8652186393737793 }, { "auxiliary_loss_clip": 0.01207078, "auxiliary_loss_mlp": 0.0103698, "balance_loss_clip": 1.05980897, "balance_loss_mlp": 1.02609587, "epoch": 0.23603679432453556, "flos": 17017349057280.0, "grad_norm": 1.7755412748149086, "language_loss": 0.88470918, "learning_rate": 3.57112804963472e-06, "loss": 0.90714979, "num_input_tokens_seen": 41852915, "step": 1963, "time_per_iteration": 2.8818349838256836 }, { "auxiliary_loss_clip": 0.01192679, "auxiliary_loss_mlp": 0.01038542, "balance_loss_clip": 1.05777478, "balance_loss_mlp": 1.02907062, "epoch": 0.23615703721517464, "flos": 19171307001600.0, "grad_norm": 1.7041474098729332, "language_loss": 0.76490504, "learning_rate": 3.57064591810287e-06, "loss": 0.78721726, "num_input_tokens_seen": 41870415, "step": 1964, "time_per_iteration": 2.9419994354248047 }, { "auxiliary_loss_clip": 0.0121348, "auxiliary_loss_mlp": 0.01071716, "balance_loss_clip": 1.0605129, "balance_loss_mlp": 1.02721179, "epoch": 0.23627728010581375, "flos": 19098803399040.0, "grad_norm": 2.1845847443822812, "language_loss": 0.80946016, "learning_rate": 3.570163548305284e-06, "loss": 0.83231211, "num_input_tokens_seen": 41889345, "step": 1965, "time_per_iteration": 2.7564332485198975 }, { "auxiliary_loss_clip": 0.01208242, "auxiliary_loss_mlp": 0.01039195, "balance_loss_clip": 1.06017125, "balance_loss_mlp": 1.02909207, "epoch": 0.23639752299645284, "flos": 14282213057280.0, "grad_norm": 2.3249660329012203, "language_loss": 0.69943613, "learning_rate": 3.569680940315135e-06, "loss": 0.72191048, "num_input_tokens_seen": 41905745, "step": 1966, "time_per_iteration": 2.824997901916504 }, { "auxiliary_loss_clip": 0.01210474, "auxiliary_loss_mlp": 0.01036308, "balance_loss_clip": 1.06074584, "balance_loss_mlp": 1.02551329, "epoch": 0.23651776588709192, "flos": 22893411980160.0, "grad_norm": 2.010048198539654, "language_loss": 0.82335794, "learning_rate": 3.5691980942056356e-06, "loss": 0.84582579, "num_input_tokens_seen": 41925115, "step": 1967, "time_per_iteration": 2.82653546333313 }, { "auxiliary_loss_clip": 0.01212386, "auxiliary_loss_mlp": 0.01036713, "balance_loss_clip": 1.05713177, "balance_loss_mlp": 1.02699149, "epoch": 0.23663800877773103, "flos": 18624531196800.0, "grad_norm": 2.490588679778054, "language_loss": 0.79828912, "learning_rate": 3.5687150100500332e-06, "loss": 0.82078004, "num_input_tokens_seen": 41944815, "step": 1968, "time_per_iteration": 3.764960765838623 }, { "auxiliary_loss_clip": 0.01212645, "auxiliary_loss_mlp": 0.01040529, "balance_loss_clip": 1.05867004, "balance_loss_mlp": 1.0301578, "epoch": 0.2367582516683701, "flos": 25555828896000.0, "grad_norm": 2.0006205674160533, "language_loss": 0.74262667, "learning_rate": 3.568231687921611e-06, "loss": 0.76515841, "num_input_tokens_seen": 41964990, "step": 1969, "time_per_iteration": 2.920900344848633 }, { "auxiliary_loss_clip": 0.0121242, "auxiliary_loss_mlp": 0.01040775, "balance_loss_clip": 1.05912495, "balance_loss_mlp": 1.03131533, "epoch": 0.2368784945590092, "flos": 23295072839040.0, "grad_norm": 1.538937163670766, "language_loss": 0.80523753, "learning_rate": 3.5677481278936883e-06, "loss": 0.8277694, "num_input_tokens_seen": 41984570, "step": 1970, "time_per_iteration": 2.7490546703338623 }, { "auxiliary_loss_clip": 0.01098238, "auxiliary_loss_mlp": 0.01011975, "balance_loss_clip": 1.01694322, "balance_loss_mlp": 1.01006746, "epoch": 0.23699873744964828, "flos": 69859291875840.0, "grad_norm": 0.829349353575977, "language_loss": 0.57819724, "learning_rate": 3.5672643300396214e-06, "loss": 0.59929931, "num_input_tokens_seen": 42053715, "step": 1971, "time_per_iteration": 5.939958095550537 }, { "auxiliary_loss_clip": 0.01201231, "auxiliary_loss_mlp": 0.01033179, "balance_loss_clip": 1.05886424, "balance_loss_mlp": 1.02436376, "epoch": 0.2371189803402874, "flos": 21835052720640.0, "grad_norm": 2.1062521330927684, "language_loss": 0.67334944, "learning_rate": 3.566780294432802e-06, "loss": 0.69569355, "num_input_tokens_seen": 42070890, "step": 1972, "time_per_iteration": 3.8600263595581055 }, { "auxiliary_loss_clip": 0.01213504, "auxiliary_loss_mlp": 0.01034066, "balance_loss_clip": 1.0593766, "balance_loss_mlp": 1.02470183, "epoch": 0.23723922323092647, "flos": 21908490076800.0, "grad_norm": 2.760381188235578, "language_loss": 0.74745554, "learning_rate": 3.566296021146657e-06, "loss": 0.76993126, "num_input_tokens_seen": 42090270, "step": 1973, "time_per_iteration": 2.7773096561431885 }, { "auxiliary_loss_clip": 0.01218181, "auxiliary_loss_mlp": 0.01034086, "balance_loss_clip": 1.06184876, "balance_loss_mlp": 1.02462125, "epoch": 0.23735946612156555, "flos": 32708803380480.0, "grad_norm": 1.760058578209318, "language_loss": 0.73379725, "learning_rate": 3.565811510254652e-06, "loss": 0.75631994, "num_input_tokens_seen": 42111150, "step": 1974, "time_per_iteration": 2.9001100063323975 }, { "auxiliary_loss_clip": 0.010981, "auxiliary_loss_mlp": 0.01004342, "balance_loss_clip": 1.01942396, "balance_loss_mlp": 1.0024823, "epoch": 0.23747970901220466, "flos": 70546944821760.0, "grad_norm": 0.8131934340019639, "language_loss": 0.58212018, "learning_rate": 3.5653267618302845e-06, "loss": 0.60314465, "num_input_tokens_seen": 42178730, "step": 1975, "time_per_iteration": 3.374183416366577 }, { "auxiliary_loss_clip": 0.01211076, "auxiliary_loss_mlp": 0.01032218, "balance_loss_clip": 1.05708754, "balance_loss_mlp": 1.02210903, "epoch": 0.23759995190284375, "flos": 20849807594880.0, "grad_norm": 2.5961319876696205, "language_loss": 0.86072975, "learning_rate": 3.564841775947093e-06, "loss": 0.88316268, "num_input_tokens_seen": 42199620, "step": 1976, "time_per_iteration": 2.744259834289551 }, { "auxiliary_loss_clip": 0.01200257, "auxiliary_loss_mlp": 0.0103086, "balance_loss_clip": 1.05708504, "balance_loss_mlp": 1.02117431, "epoch": 0.23772019479348283, "flos": 32921645420160.0, "grad_norm": 2.5429576469544726, "language_loss": 0.76022881, "learning_rate": 3.5643565526786475e-06, "loss": 0.78254002, "num_input_tokens_seen": 42219560, "step": 1977, "time_per_iteration": 2.8906502723693848 }, { "auxiliary_loss_clip": 0.01215475, "auxiliary_loss_mlp": 0.01040585, "balance_loss_clip": 1.05959082, "balance_loss_mlp": 1.03019023, "epoch": 0.2378404376841219, "flos": 32342765834880.0, "grad_norm": 1.5524903938981571, "language_loss": 0.77287889, "learning_rate": 3.5638710920985574e-06, "loss": 0.79543942, "num_input_tokens_seen": 42241020, "step": 1978, "time_per_iteration": 2.8611905574798584 }, { "auxiliary_loss_clip": 0.01211545, "auxiliary_loss_mlp": 0.01062942, "balance_loss_clip": 1.05483603, "balance_loss_mlp": 1.01926756, "epoch": 0.23796068057476102, "flos": 22997624313600.0, "grad_norm": 2.1917250752181205, "language_loss": 0.82409239, "learning_rate": 3.5633853942804655e-06, "loss": 0.84683722, "num_input_tokens_seen": 42259345, "step": 1979, "time_per_iteration": 2.7268974781036377 }, { "auxiliary_loss_clip": 0.01205263, "auxiliary_loss_mlp": 0.01031264, "balance_loss_clip": 1.05858147, "balance_loss_mlp": 1.02141762, "epoch": 0.2380809234654001, "flos": 13480938414720.0, "grad_norm": 4.570718775279468, "language_loss": 0.76746428, "learning_rate": 3.5628994592980527e-06, "loss": 0.78982955, "num_input_tokens_seen": 42277250, "step": 1980, "time_per_iteration": 2.730574369430542 }, { "auxiliary_loss_clip": 0.01211819, "auxiliary_loss_mlp": 0.01030402, "balance_loss_clip": 1.05614007, "balance_loss_mlp": 1.02072823, "epoch": 0.2382011663560392, "flos": 16871803148160.0, "grad_norm": 3.0231740252456896, "language_loss": 0.70189285, "learning_rate": 3.562413287225034e-06, "loss": 0.72431511, "num_input_tokens_seen": 42295360, "step": 1981, "time_per_iteration": 2.7704219818115234 }, { "auxiliary_loss_clip": 0.01206153, "auxiliary_loss_mlp": 0.01028785, "balance_loss_clip": 1.06040072, "balance_loss_mlp": 1.01880705, "epoch": 0.2383214092466783, "flos": 18441135331200.0, "grad_norm": 2.3996750406800493, "language_loss": 0.89539599, "learning_rate": 3.5619268781351623e-06, "loss": 0.91774535, "num_input_tokens_seen": 42313430, "step": 1982, "time_per_iteration": 2.830697536468506 }, { "auxiliary_loss_clip": 0.0120069, "auxiliary_loss_mlp": 0.01037933, "balance_loss_clip": 1.06037092, "balance_loss_mlp": 1.02844381, "epoch": 0.23844165213731738, "flos": 19755717281280.0, "grad_norm": 1.7834871061677735, "language_loss": 0.77111852, "learning_rate": 3.5614402321022256e-06, "loss": 0.79350483, "num_input_tokens_seen": 42331260, "step": 1983, "time_per_iteration": 2.777411937713623 }, { "auxiliary_loss_clip": 0.01190551, "auxiliary_loss_mlp": 0.01034499, "balance_loss_clip": 1.05812335, "balance_loss_mlp": 1.02514088, "epoch": 0.23856189502795647, "flos": 23367360960000.0, "grad_norm": 2.0883008335885527, "language_loss": 0.87112355, "learning_rate": 3.5609533492000463e-06, "loss": 0.89337403, "num_input_tokens_seen": 42350150, "step": 1984, "time_per_iteration": 2.9002628326416016 }, { "auxiliary_loss_clip": 0.01203879, "auxiliary_loss_mlp": 0.01030752, "balance_loss_clip": 1.06110692, "balance_loss_mlp": 1.02013648, "epoch": 0.23868213791859555, "flos": 23475056912640.0, "grad_norm": 2.156887839668613, "language_loss": 0.78485215, "learning_rate": 3.560466229502485e-06, "loss": 0.80719841, "num_input_tokens_seen": 42369495, "step": 1985, "time_per_iteration": 2.88285231590271 }, { "auxiliary_loss_clip": 0.01204085, "auxiliary_loss_mlp": 0.01072198, "balance_loss_clip": 1.06162953, "balance_loss_mlp": 1.02934837, "epoch": 0.23880238080923466, "flos": 16617340224000.0, "grad_norm": 2.4556454947469697, "language_loss": 0.89601111, "learning_rate": 3.5599788730834384e-06, "loss": 0.91877395, "num_input_tokens_seen": 42387455, "step": 1986, "time_per_iteration": 2.7696542739868164 }, { "auxiliary_loss_clip": 0.012094, "auxiliary_loss_mlp": 0.01036182, "balance_loss_clip": 1.05610251, "balance_loss_mlp": 1.02618587, "epoch": 0.23892262369987374, "flos": 17348409734400.0, "grad_norm": 4.585088733970655, "language_loss": 0.79091376, "learning_rate": 3.559491280016836e-06, "loss": 0.81336957, "num_input_tokens_seen": 42405400, "step": 1987, "time_per_iteration": 2.7521564960479736 }, { "auxiliary_loss_clip": 0.01203973, "auxiliary_loss_mlp": 0.01033112, "balance_loss_clip": 1.05699325, "balance_loss_mlp": 1.02294374, "epoch": 0.23904286659051283, "flos": 22309899540480.0, "grad_norm": 1.8564259018664393, "language_loss": 0.71331549, "learning_rate": 3.5590034503766465e-06, "loss": 0.7356863, "num_input_tokens_seen": 42425065, "step": 1988, "time_per_iteration": 2.9485063552856445 }, { "auxiliary_loss_clip": 0.01213338, "auxiliary_loss_mlp": 0.01034867, "balance_loss_clip": 1.05908298, "balance_loss_mlp": 1.02513349, "epoch": 0.23916310948115194, "flos": 21178246579200.0, "grad_norm": 2.06043786289832, "language_loss": 0.81206268, "learning_rate": 3.558515384236874e-06, "loss": 0.83454472, "num_input_tokens_seen": 42442495, "step": 1989, "time_per_iteration": 2.803835868835449 }, { "auxiliary_loss_clip": 0.01194435, "auxiliary_loss_mlp": 0.01066463, "balance_loss_clip": 1.06055021, "balance_loss_mlp": 1.02366793, "epoch": 0.23928335237179102, "flos": 14137349506560.0, "grad_norm": 1.7820516849814296, "language_loss": 0.83782959, "learning_rate": 3.558027081671556e-06, "loss": 0.86043859, "num_input_tokens_seen": 42459480, "step": 1990, "time_per_iteration": 2.7608747482299805 }, { "auxiliary_loss_clip": 0.01212533, "auxiliary_loss_mlp": 0.0103492, "balance_loss_clip": 1.05897343, "balance_loss_mlp": 1.0249064, "epoch": 0.2394035952624301, "flos": 23769596436480.0, "grad_norm": 2.312078541184106, "language_loss": 0.68903649, "learning_rate": 3.557538542754769e-06, "loss": 0.71151102, "num_input_tokens_seen": 42479175, "step": 1991, "time_per_iteration": 2.718815803527832 }, { "auxiliary_loss_clip": 0.01213707, "auxiliary_loss_mlp": 0.0103573, "balance_loss_clip": 1.05974209, "balance_loss_mlp": 1.02584183, "epoch": 0.2395238381530692, "flos": 24206198250240.0, "grad_norm": 1.8074528282157143, "language_loss": 0.66777456, "learning_rate": 3.557049767560623e-06, "loss": 0.69026887, "num_input_tokens_seen": 42498090, "step": 1992, "time_per_iteration": 2.7276742458343506 }, { "auxiliary_loss_clip": 0.01197604, "auxiliary_loss_mlp": 0.01032644, "balance_loss_clip": 1.0606041, "balance_loss_mlp": 1.02354789, "epoch": 0.2396440810437083, "flos": 25295763450240.0, "grad_norm": 1.9153574460767746, "language_loss": 0.86412442, "learning_rate": 3.5565607561632655e-06, "loss": 0.88642687, "num_input_tokens_seen": 42516930, "step": 1993, "time_per_iteration": 2.773810863494873 }, { "auxiliary_loss_clip": 0.01200505, "auxiliary_loss_mlp": 0.01028068, "balance_loss_clip": 1.0564158, "balance_loss_mlp": 1.01869845, "epoch": 0.23976432393434738, "flos": 28543093436160.0, "grad_norm": 2.0874838775154183, "language_loss": 0.80165768, "learning_rate": 3.5560715086368787e-06, "loss": 0.82394338, "num_input_tokens_seen": 42534800, "step": 1994, "time_per_iteration": 3.868670701980591 }, { "auxiliary_loss_clip": 0.01200158, "auxiliary_loss_mlp": 0.0103599, "balance_loss_clip": 1.05732512, "balance_loss_mlp": 1.02619076, "epoch": 0.23988456682498646, "flos": 19494358945920.0, "grad_norm": 2.4694925876570646, "language_loss": 0.8207956, "learning_rate": 3.5555820250556816e-06, "loss": 0.84315705, "num_input_tokens_seen": 42552000, "step": 1995, "time_per_iteration": 2.860914468765259 }, { "auxiliary_loss_clip": 0.01208095, "auxiliary_loss_mlp": 0.01030117, "balance_loss_clip": 1.06132686, "balance_loss_mlp": 1.02032411, "epoch": 0.24000480971562557, "flos": 20266331068800.0, "grad_norm": 2.1603254616214635, "language_loss": 0.69186306, "learning_rate": 3.5550923054939278e-06, "loss": 0.7142452, "num_input_tokens_seen": 42571455, "step": 1996, "time_per_iteration": 2.849883794784546 }, { "auxiliary_loss_clip": 0.01189418, "auxiliary_loss_mlp": 0.01035362, "balance_loss_clip": 1.05600977, "balance_loss_mlp": 1.026016, "epoch": 0.24012505260626466, "flos": 25443176866560.0, "grad_norm": 1.841261244124633, "language_loss": 0.7394352, "learning_rate": 3.5546023500259083e-06, "loss": 0.76168299, "num_input_tokens_seen": 42592550, "step": 1997, "time_per_iteration": 5.347617864608765 }, { "auxiliary_loss_clip": 0.01202793, "auxiliary_loss_mlp": 0.01028422, "balance_loss_clip": 1.05934262, "balance_loss_mlp": 1.01859903, "epoch": 0.24024529549690374, "flos": 15553342529280.0, "grad_norm": 3.8057724457959687, "language_loss": 0.80911052, "learning_rate": 3.5541121587259477e-06, "loss": 0.83142263, "num_input_tokens_seen": 42610385, "step": 1998, "time_per_iteration": 3.9059579372406006 }, { "auxiliary_loss_clip": 0.01096466, "auxiliary_loss_mlp": 0.01006176, "balance_loss_clip": 1.01944423, "balance_loss_mlp": 1.00442398, "epoch": 0.24036553838754285, "flos": 57122351867520.0, "grad_norm": 0.8307571746954224, "language_loss": 0.5789035, "learning_rate": 3.553621731668408e-06, "loss": 0.59992993, "num_input_tokens_seen": 42673595, "step": 1999, "time_per_iteration": 3.378833293914795 }, { "auxiliary_loss_clip": 0.01202167, "auxiliary_loss_mlp": 0.01036652, "balance_loss_clip": 1.05699432, "balance_loss_mlp": 1.02714491, "epoch": 0.24048578127818193, "flos": 24969946158720.0, "grad_norm": 1.8800275983317183, "language_loss": 0.83397079, "learning_rate": 3.553131068927688e-06, "loss": 0.856359, "num_input_tokens_seen": 42692000, "step": 2000, "time_per_iteration": 2.865724802017212 }, { "auxiliary_loss_clip": 0.01198457, "auxiliary_loss_mlp": 0.01034748, "balance_loss_clip": 1.05788898, "balance_loss_mlp": 1.02525866, "epoch": 0.24060602416882101, "flos": 23330947547520.0, "grad_norm": 1.728873236152605, "language_loss": 0.80351138, "learning_rate": 3.552640170578219e-06, "loss": 0.82584339, "num_input_tokens_seen": 42712250, "step": 2001, "time_per_iteration": 2.924213409423828 }, { "auxiliary_loss_clip": 0.01206739, "auxiliary_loss_mlp": 0.01032516, "balance_loss_clip": 1.05832493, "balance_loss_mlp": 1.0225029, "epoch": 0.2407262670594601, "flos": 14173260128640.0, "grad_norm": 2.366100567121622, "language_loss": 0.77794123, "learning_rate": 3.5521490366944703e-06, "loss": 0.80033374, "num_input_tokens_seen": 42729900, "step": 2002, "time_per_iteration": 2.8879384994506836 }, { "auxiliary_loss_clip": 0.01201953, "auxiliary_loss_mlp": 0.01034187, "balance_loss_clip": 1.05933738, "balance_loss_mlp": 1.02470934, "epoch": 0.2408465099500992, "flos": 13663113217920.0, "grad_norm": 2.2089119259179473, "language_loss": 0.80483353, "learning_rate": 3.5516576673509474e-06, "loss": 0.82719493, "num_input_tokens_seen": 42747900, "step": 2003, "time_per_iteration": 2.960411787033081 }, { "auxiliary_loss_clip": 0.0121153, "auxiliary_loss_mlp": 0.01040964, "balance_loss_clip": 1.05757856, "balance_loss_mlp": 1.03102803, "epoch": 0.2409667528407383, "flos": 31248029076480.0, "grad_norm": 2.0970668154146903, "language_loss": 0.86478662, "learning_rate": 3.5511660626221896e-06, "loss": 0.88731152, "num_input_tokens_seen": 42768540, "step": 2004, "time_per_iteration": 2.766444206237793 }, { "auxiliary_loss_clip": 0.01205109, "auxiliary_loss_mlp": 0.01065893, "balance_loss_clip": 1.0596683, "balance_loss_mlp": 1.02555823, "epoch": 0.24108699573137737, "flos": 22199941031040.0, "grad_norm": 2.3084056514394007, "language_loss": 0.89811599, "learning_rate": 3.5506742225827744e-06, "loss": 0.92082596, "num_input_tokens_seen": 42785395, "step": 2005, "time_per_iteration": 2.84765887260437 }, { "auxiliary_loss_clip": 0.01199272, "auxiliary_loss_mlp": 0.010422, "balance_loss_clip": 1.05984211, "balance_loss_mlp": 1.03250217, "epoch": 0.24120723862201648, "flos": 26103035664000.0, "grad_norm": 2.8572990802147653, "language_loss": 0.90686405, "learning_rate": 3.5501821473073116e-06, "loss": 0.92927873, "num_input_tokens_seen": 42801980, "step": 2006, "time_per_iteration": 2.899564027786255 }, { "auxiliary_loss_clip": 0.01198028, "auxiliary_loss_mlp": 0.01035009, "balance_loss_clip": 1.06050563, "balance_loss_mlp": 1.02505529, "epoch": 0.24132748151265557, "flos": 18624926246400.0, "grad_norm": 2.9662270894421856, "language_loss": 0.86911726, "learning_rate": 3.54968983687045e-06, "loss": 0.8914476, "num_input_tokens_seen": 42818850, "step": 2007, "time_per_iteration": 2.89289927482605 }, { "auxiliary_loss_clip": 0.01208659, "auxiliary_loss_mlp": 0.01031721, "balance_loss_clip": 1.05970216, "balance_loss_mlp": 1.0217669, "epoch": 0.24144772440329465, "flos": 15267673664640.0, "grad_norm": 2.6082271619348036, "language_loss": 0.8964113, "learning_rate": 3.549197291346872e-06, "loss": 0.91881514, "num_input_tokens_seen": 42835375, "step": 2008, "time_per_iteration": 2.847398042678833 }, { "auxiliary_loss_clip": 0.01208705, "auxiliary_loss_mlp": 0.01036138, "balance_loss_clip": 1.05695105, "balance_loss_mlp": 1.02612996, "epoch": 0.24156796729393373, "flos": 24024274842240.0, "grad_norm": 2.356100144386199, "language_loss": 0.79677916, "learning_rate": 3.548704510811297e-06, "loss": 0.81922758, "num_input_tokens_seen": 42854570, "step": 2009, "time_per_iteration": 2.807258129119873 }, { "auxiliary_loss_clip": 0.01202058, "auxiliary_loss_mlp": 0.01038352, "balance_loss_clip": 1.05726171, "balance_loss_mlp": 1.0277307, "epoch": 0.24168821018457284, "flos": 26286790665600.0, "grad_norm": 6.168365912787901, "language_loss": 0.74408889, "learning_rate": 3.5482114953384787e-06, "loss": 0.76649302, "num_input_tokens_seen": 42873800, "step": 2010, "time_per_iteration": 2.8927419185638428 }, { "auxiliary_loss_clip": 0.01212244, "auxiliary_loss_mlp": 0.01038392, "balance_loss_clip": 1.05865288, "balance_loss_mlp": 1.02828276, "epoch": 0.24180845307521193, "flos": 18223193560320.0, "grad_norm": 2.0150784591358843, "language_loss": 0.84222859, "learning_rate": 3.5477182450032077e-06, "loss": 0.86473501, "num_input_tokens_seen": 42892400, "step": 2011, "time_per_iteration": 2.699922800064087 }, { "auxiliary_loss_clip": 0.01207211, "auxiliary_loss_mlp": 0.01042562, "balance_loss_clip": 1.05800092, "balance_loss_mlp": 1.03205991, "epoch": 0.241928695965851, "flos": 20449260057600.0, "grad_norm": 2.014529268722692, "language_loss": 0.83509564, "learning_rate": 3.5472247598803097e-06, "loss": 0.85759342, "num_input_tokens_seen": 42911745, "step": 2012, "time_per_iteration": 2.831334114074707 }, { "auxiliary_loss_clip": 0.01212851, "auxiliary_loss_mlp": 0.01039313, "balance_loss_clip": 1.05662704, "balance_loss_mlp": 1.02849531, "epoch": 0.24204893885649012, "flos": 25556475340800.0, "grad_norm": 6.015304761449703, "language_loss": 0.85308695, "learning_rate": 3.546731040044645e-06, "loss": 0.87560862, "num_input_tokens_seen": 42926915, "step": 2013, "time_per_iteration": 2.817866086959839 }, { "auxiliary_loss_clip": 0.01213771, "auxiliary_loss_mlp": 0.01037769, "balance_loss_clip": 1.0599494, "balance_loss_mlp": 1.02790403, "epoch": 0.2421691817471292, "flos": 30660207004800.0, "grad_norm": 2.221828385597625, "language_loss": 0.75224751, "learning_rate": 3.546237085571112e-06, "loss": 0.77476287, "num_input_tokens_seen": 42945350, "step": 2014, "time_per_iteration": 2.8808696269989014 }, { "auxiliary_loss_clip": 0.01207543, "auxiliary_loss_mlp": 0.01040115, "balance_loss_clip": 1.05753589, "balance_loss_mlp": 1.02983284, "epoch": 0.24228942463776829, "flos": 21945011230080.0, "grad_norm": 4.020968352155436, "language_loss": 0.7264936, "learning_rate": 3.5457428965346425e-06, "loss": 0.74897015, "num_input_tokens_seen": 42964290, "step": 2015, "time_per_iteration": 2.7832694053649902 }, { "auxiliary_loss_clip": 0.0119585, "auxiliary_loss_mlp": 0.01034061, "balance_loss_clip": 1.06008601, "balance_loss_mlp": 1.02401125, "epoch": 0.2424096675284074, "flos": 33984493879680.0, "grad_norm": 1.5456262383115091, "language_loss": 0.74830937, "learning_rate": 3.545248473010205e-06, "loss": 0.77060848, "num_input_tokens_seen": 42987095, "step": 2016, "time_per_iteration": 3.0598597526550293 }, { "auxiliary_loss_clip": 0.01217796, "auxiliary_loss_mlp": 0.01066118, "balance_loss_clip": 1.0607537, "balance_loss_mlp": 1.026142, "epoch": 0.24252991041904648, "flos": 21653416621440.0, "grad_norm": 1.7986786824453875, "language_loss": 0.88036561, "learning_rate": 3.544753815072802e-06, "loss": 0.90320474, "num_input_tokens_seen": 43005750, "step": 2017, "time_per_iteration": 2.8929944038391113 }, { "auxiliary_loss_clip": 0.01187101, "auxiliary_loss_mlp": 0.01037375, "balance_loss_clip": 1.05821586, "balance_loss_mlp": 1.02713478, "epoch": 0.24265015330968556, "flos": 21870065502720.0, "grad_norm": 2.0015219576586754, "language_loss": 0.88436735, "learning_rate": 3.544258922797474e-06, "loss": 0.90661216, "num_input_tokens_seen": 43023870, "step": 2018, "time_per_iteration": 2.9206252098083496 }, { "auxiliary_loss_clip": 0.0121397, "auxiliary_loss_mlp": 0.01038157, "balance_loss_clip": 1.05999553, "balance_loss_mlp": 1.0288167, "epoch": 0.24277039620032465, "flos": 25628260671360.0, "grad_norm": 1.662067754058353, "language_loss": 0.78277797, "learning_rate": 3.543763796259295e-06, "loss": 0.80529928, "num_input_tokens_seen": 43043825, "step": 2019, "time_per_iteration": 2.8386759757995605 }, { "auxiliary_loss_clip": 0.01208193, "auxiliary_loss_mlp": 0.01029726, "balance_loss_clip": 1.05752182, "balance_loss_mlp": 1.01998615, "epoch": 0.24289063909096376, "flos": 26286575184000.0, "grad_norm": 1.79994987342564, "language_loss": 0.90838313, "learning_rate": 3.5432684355333754e-06, "loss": 0.93076229, "num_input_tokens_seen": 43062480, "step": 2020, "time_per_iteration": 3.8150312900543213 }, { "auxiliary_loss_clip": 0.01210717, "auxiliary_loss_mlp": 0.01035346, "balance_loss_clip": 1.05872023, "balance_loss_mlp": 1.02604795, "epoch": 0.24301088198160284, "flos": 25075056332160.0, "grad_norm": 2.5983468798315847, "language_loss": 0.76674122, "learning_rate": 3.5427728406948613e-06, "loss": 0.7892018, "num_input_tokens_seen": 43081595, "step": 2021, "time_per_iteration": 2.819132089614868 }, { "auxiliary_loss_clip": 0.01102211, "auxiliary_loss_mlp": 0.0100911, "balance_loss_clip": 1.01916528, "balance_loss_mlp": 1.00735784, "epoch": 0.24313112487224192, "flos": 69900948673920.0, "grad_norm": 0.7591460936498707, "language_loss": 0.57937992, "learning_rate": 3.542277011818934e-06, "loss": 0.60049307, "num_input_tokens_seen": 43145430, "step": 2022, "time_per_iteration": 4.521865606307983 }, { "auxiliary_loss_clip": 0.0120548, "auxiliary_loss_mlp": 0.01031922, "balance_loss_clip": 1.0601387, "balance_loss_mlp": 1.02224791, "epoch": 0.24325136776288103, "flos": 40662334235520.0, "grad_norm": 2.0320080015348054, "language_loss": 0.74212229, "learning_rate": 3.5417809489808104e-06, "loss": 0.76449627, "num_input_tokens_seen": 43167040, "step": 2023, "time_per_iteration": 3.037010431289673 }, { "auxiliary_loss_clip": 0.01213156, "auxiliary_loss_mlp": 0.01034555, "balance_loss_clip": 1.05867267, "balance_loss_mlp": 1.02461314, "epoch": 0.24337161065352012, "flos": 25046400257280.0, "grad_norm": 1.733675064398704, "language_loss": 0.72451359, "learning_rate": 3.5412846522557422e-06, "loss": 0.74699068, "num_input_tokens_seen": 43187930, "step": 2024, "time_per_iteration": 3.801563024520874 }, { "auxiliary_loss_clip": 0.01214297, "auxiliary_loss_mlp": 0.01031819, "balance_loss_clip": 1.05996442, "balance_loss_mlp": 1.02212155, "epoch": 0.2434918535441592, "flos": 18661160090880.0, "grad_norm": 2.289962349123274, "language_loss": 0.74069172, "learning_rate": 3.540788121719018e-06, "loss": 0.7631529, "num_input_tokens_seen": 43206350, "step": 2025, "time_per_iteration": 2.710552453994751 }, { "auxiliary_loss_clip": 0.01195257, "auxiliary_loss_mlp": 0.01032887, "balance_loss_clip": 1.06010413, "balance_loss_mlp": 1.02253354, "epoch": 0.24361209643479828, "flos": 23915142345600.0, "grad_norm": 1.8877263473743793, "language_loss": 0.81973159, "learning_rate": 3.5402913574459604e-06, "loss": 0.84201306, "num_input_tokens_seen": 43226255, "step": 2026, "time_per_iteration": 2.8199303150177 }, { "auxiliary_loss_clip": 0.01191248, "auxiliary_loss_mlp": 0.01030479, "balance_loss_clip": 1.05787027, "balance_loss_mlp": 1.02102518, "epoch": 0.2437323393254374, "flos": 28657505232000.0, "grad_norm": 1.901442583277541, "language_loss": 0.8603096, "learning_rate": 3.5397943595119297e-06, "loss": 0.88252687, "num_input_tokens_seen": 43247675, "step": 2027, "time_per_iteration": 2.852743625640869 }, { "auxiliary_loss_clip": 0.01204379, "auxiliary_loss_mlp": 0.01035086, "balance_loss_clip": 1.0598433, "balance_loss_mlp": 1.02576399, "epoch": 0.24385258221607647, "flos": 23550325862400.0, "grad_norm": 7.229661160411872, "language_loss": 0.77470052, "learning_rate": 3.5392971279923177e-06, "loss": 0.79709518, "num_input_tokens_seen": 43265895, "step": 2028, "time_per_iteration": 2.7988734245300293 }, { "auxiliary_loss_clip": 0.0120031, "auxiliary_loss_mlp": 0.0103746, "balance_loss_clip": 1.06006694, "balance_loss_mlp": 1.02646339, "epoch": 0.24397282510671556, "flos": 25336091445120.0, "grad_norm": 2.279618768854197, "language_loss": 0.83091164, "learning_rate": 3.5387996629625557e-06, "loss": 0.85328937, "num_input_tokens_seen": 43283485, "step": 2029, "time_per_iteration": 2.800400972366333 }, { "auxiliary_loss_clip": 0.01100935, "auxiliary_loss_mlp": 0.01003249, "balance_loss_clip": 1.01602066, "balance_loss_mlp": 1.00143671, "epoch": 0.24409306799735467, "flos": 65187421430400.0, "grad_norm": 0.8134199494205475, "language_loss": 0.54953057, "learning_rate": 3.5383019644981083e-06, "loss": 0.57057244, "num_input_tokens_seen": 43347180, "step": 2030, "time_per_iteration": 3.3491785526275635 }, { "auxiliary_loss_clip": 0.01206478, "auxiliary_loss_mlp": 0.01030771, "balance_loss_clip": 1.06045055, "balance_loss_mlp": 1.02053094, "epoch": 0.24421331088799375, "flos": 19537093152000.0, "grad_norm": 2.1701623511413946, "language_loss": 0.73011488, "learning_rate": 3.5378040326744763e-06, "loss": 0.7524873, "num_input_tokens_seen": 43366665, "step": 2031, "time_per_iteration": 2.8205771446228027 }, { "auxiliary_loss_clip": 0.0119841, "auxiliary_loss_mlp": 0.01031251, "balance_loss_clip": 1.05950665, "balance_loss_mlp": 1.02212572, "epoch": 0.24433355377863283, "flos": 21068575378560.0, "grad_norm": 2.1066173655822547, "language_loss": 0.85386539, "learning_rate": 3.5373058675671946e-06, "loss": 0.87616193, "num_input_tokens_seen": 43384670, "step": 2032, "time_per_iteration": 2.9259119033813477 }, { "auxiliary_loss_clip": 0.01192056, "auxiliary_loss_mlp": 0.01032857, "balance_loss_clip": 1.0583601, "balance_loss_mlp": 1.02311206, "epoch": 0.24445379666927192, "flos": 22637189289600.0, "grad_norm": 1.8691896134275858, "language_loss": 0.72206897, "learning_rate": 3.536807469251836e-06, "loss": 0.74431813, "num_input_tokens_seen": 43403825, "step": 2033, "time_per_iteration": 2.9842889308929443 }, { "auxiliary_loss_clip": 0.01205098, "auxiliary_loss_mlp": 0.01033924, "balance_loss_clip": 1.05600286, "balance_loss_mlp": 1.02404714, "epoch": 0.24457403955991103, "flos": 21251612108160.0, "grad_norm": 1.9061652974173113, "language_loss": 0.82812285, "learning_rate": 3.5363088378040055e-06, "loss": 0.8505131, "num_input_tokens_seen": 43422715, "step": 2034, "time_per_iteration": 3.000535726547241 }, { "auxiliary_loss_clip": 0.01099433, "auxiliary_loss_mlp": 0.01031421, "balance_loss_clip": 1.01455641, "balance_loss_mlp": 1.00103521, "epoch": 0.2446942824505501, "flos": 66997820764800.0, "grad_norm": 0.7629449144920618, "language_loss": 0.64446199, "learning_rate": 3.5358099732993463e-06, "loss": 0.66577053, "num_input_tokens_seen": 43481825, "step": 2035, "time_per_iteration": 3.2129223346710205 }, { "auxiliary_loss_clip": 0.01212465, "auxiliary_loss_mlp": 0.01031656, "balance_loss_clip": 1.05933535, "balance_loss_mlp": 1.02239954, "epoch": 0.2448145253411892, "flos": 20411122792320.0, "grad_norm": 4.047572187809506, "language_loss": 0.89599258, "learning_rate": 3.535310875813535e-06, "loss": 0.91843379, "num_input_tokens_seen": 43500220, "step": 2036, "time_per_iteration": 2.8141911029815674 }, { "auxiliary_loss_clip": 0.0120593, "auxiliary_loss_mlp": 0.01034985, "balance_loss_clip": 1.05766761, "balance_loss_mlp": 1.02537704, "epoch": 0.2449347682318283, "flos": 28804739080320.0, "grad_norm": 1.6791440252846588, "language_loss": 0.81306207, "learning_rate": 3.5348115454222843e-06, "loss": 0.83547115, "num_input_tokens_seen": 43522805, "step": 2037, "time_per_iteration": 2.850869655609131 }, { "auxiliary_loss_clip": 0.012036, "auxiliary_loss_mlp": 0.01032615, "balance_loss_clip": 1.05933869, "balance_loss_mlp": 1.02244091, "epoch": 0.2450550111224674, "flos": 22528990546560.0, "grad_norm": 1.957042563291599, "language_loss": 0.86442113, "learning_rate": 3.5343119822013425e-06, "loss": 0.88678324, "num_input_tokens_seen": 43541915, "step": 2038, "time_per_iteration": 2.7876381874084473 }, { "auxiliary_loss_clip": 0.01218239, "auxiliary_loss_mlp": 0.01038107, "balance_loss_clip": 1.06168342, "balance_loss_mlp": 1.02839148, "epoch": 0.24517525401310647, "flos": 21759137326080.0, "grad_norm": 1.9029525226125739, "language_loss": 0.7746489, "learning_rate": 3.533812186226493e-06, "loss": 0.79721236, "num_input_tokens_seen": 43562625, "step": 2039, "time_per_iteration": 2.7979111671447754 }, { "auxiliary_loss_clip": 0.01207558, "auxiliary_loss_mlp": 0.01035069, "balance_loss_clip": 1.05537546, "balance_loss_mlp": 1.02526999, "epoch": 0.24529549690374555, "flos": 25043311687680.0, "grad_norm": 1.7221878289613701, "language_loss": 0.7583887, "learning_rate": 3.5333121575735545e-06, "loss": 0.78081501, "num_input_tokens_seen": 43582265, "step": 2040, "time_per_iteration": 2.756361722946167 }, { "auxiliary_loss_clip": 0.0120313, "auxiliary_loss_mlp": 0.01030145, "balance_loss_clip": 1.05806684, "balance_loss_mlp": 1.02154994, "epoch": 0.24541573979438466, "flos": 32123638915200.0, "grad_norm": 1.939935916499192, "language_loss": 0.75784969, "learning_rate": 3.532811896318381e-06, "loss": 0.78018248, "num_input_tokens_seen": 43604335, "step": 2041, "time_per_iteration": 2.8684537410736084 }, { "auxiliary_loss_clip": 0.01199613, "auxiliary_loss_mlp": 0.01035342, "balance_loss_clip": 1.05687571, "balance_loss_mlp": 1.02538776, "epoch": 0.24553598268502375, "flos": 31357556622720.0, "grad_norm": 2.3355275809996283, "language_loss": 0.81894809, "learning_rate": 3.5323114025368615e-06, "loss": 0.84129775, "num_input_tokens_seen": 43619400, "step": 2042, "time_per_iteration": 2.8846538066864014 }, { "auxiliary_loss_clip": 0.01203591, "auxiliary_loss_mlp": 0.01028294, "balance_loss_clip": 1.05616093, "balance_loss_mlp": 1.01891816, "epoch": 0.24565622557566283, "flos": 14027462824320.0, "grad_norm": 2.1911167812750167, "language_loss": 0.81498212, "learning_rate": 3.53181067630492e-06, "loss": 0.83730102, "num_input_tokens_seen": 43636870, "step": 2043, "time_per_iteration": 2.773181915283203 }, { "auxiliary_loss_clip": 0.01196743, "auxiliary_loss_mlp": 0.01037105, "balance_loss_clip": 1.05712342, "balance_loss_mlp": 1.02755666, "epoch": 0.24577646846630194, "flos": 16581465515520.0, "grad_norm": 1.7238327595198653, "language_loss": 0.76053572, "learning_rate": 3.5313097176985175e-06, "loss": 0.78287423, "num_input_tokens_seen": 43655180, "step": 2044, "time_per_iteration": 2.782444477081299 }, { "auxiliary_loss_clip": 0.0120359, "auxiliary_loss_mlp": 0.01030599, "balance_loss_clip": 1.05476308, "balance_loss_mlp": 1.02100277, "epoch": 0.24589671135694102, "flos": 18807424272000.0, "grad_norm": 2.4848806188404136, "language_loss": 0.80968714, "learning_rate": 3.5308085267936482e-06, "loss": 0.83202904, "num_input_tokens_seen": 43672895, "step": 2045, "time_per_iteration": 2.8435418605804443 }, { "auxiliary_loss_clip": 0.01194738, "auxiliary_loss_mlp": 0.0106438, "balance_loss_clip": 1.05718279, "balance_loss_mlp": 1.02930009, "epoch": 0.2460169542475801, "flos": 19938538529280.0, "grad_norm": 1.7032664473466816, "language_loss": 0.89606035, "learning_rate": 3.530307103666342e-06, "loss": 0.91865152, "num_input_tokens_seen": 43691975, "step": 2046, "time_per_iteration": 3.803926944732666 }, { "auxiliary_loss_clip": 0.01204913, "auxiliary_loss_mlp": 0.01038035, "balance_loss_clip": 1.05851388, "balance_loss_mlp": 1.02817035, "epoch": 0.24613719713821922, "flos": 24171221381760.0, "grad_norm": 1.8595230869589996, "language_loss": 0.80065393, "learning_rate": 3.5298054483926658e-06, "loss": 0.8230834, "num_input_tokens_seen": 43712670, "step": 2047, "time_per_iteration": 2.892366886138916 }, { "auxiliary_loss_clip": 0.0121663, "auxiliary_loss_mlp": 0.01035988, "balance_loss_clip": 1.06080103, "balance_loss_mlp": 1.02616549, "epoch": 0.2462574400288583, "flos": 30221055325440.0, "grad_norm": 3.926200938062796, "language_loss": 0.83194989, "learning_rate": 3.5293035610487187e-06, "loss": 0.85447609, "num_input_tokens_seen": 43732035, "step": 2048, "time_per_iteration": 4.065930366516113 }, { "auxiliary_loss_clip": 0.01092526, "auxiliary_loss_mlp": 0.01004563, "balance_loss_clip": 1.01505995, "balance_loss_mlp": 1.00269115, "epoch": 0.24637768291949738, "flos": 68943030819840.0, "grad_norm": 0.7307820663148775, "language_loss": 0.61962533, "learning_rate": 3.5288014417106374e-06, "loss": 0.64059615, "num_input_tokens_seen": 43798055, "step": 2049, "time_per_iteration": 4.429203510284424 }, { "auxiliary_loss_clip": 0.01200565, "auxiliary_loss_mlp": 0.01037184, "balance_loss_clip": 1.058882, "balance_loss_mlp": 1.02730787, "epoch": 0.24649792581013646, "flos": 34383999922560.0, "grad_norm": 1.800666810248906, "language_loss": 0.75706846, "learning_rate": 3.528299090454593e-06, "loss": 0.77944601, "num_input_tokens_seen": 43818590, "step": 2050, "time_per_iteration": 3.8367104530334473 }, { "auxiliary_loss_clip": 0.01210754, "auxiliary_loss_mlp": 0.01036895, "balance_loss_clip": 1.05766046, "balance_loss_mlp": 1.02710187, "epoch": 0.24661816870077558, "flos": 19680448331520.0, "grad_norm": 2.527794963375439, "language_loss": 0.82928598, "learning_rate": 3.527796507356792e-06, "loss": 0.85176253, "num_input_tokens_seen": 43832480, "step": 2051, "time_per_iteration": 2.8300509452819824 }, { "auxiliary_loss_clip": 0.01211904, "auxiliary_loss_mlp": 0.0103143, "balance_loss_clip": 1.05858481, "balance_loss_mlp": 1.02169657, "epoch": 0.24673841159141466, "flos": 20002279213440.0, "grad_norm": 3.0200147183537878, "language_loss": 0.90505415, "learning_rate": 3.527293692493475e-06, "loss": 0.92748749, "num_input_tokens_seen": 43848345, "step": 2052, "time_per_iteration": 2.805985450744629 }, { "auxiliary_loss_clip": 0.01213652, "auxiliary_loss_mlp": 0.01034708, "balance_loss_clip": 1.05856895, "balance_loss_mlp": 1.02404475, "epoch": 0.24685865448205374, "flos": 21646593037440.0, "grad_norm": 2.446812862851139, "language_loss": 0.73290366, "learning_rate": 3.52679064594092e-06, "loss": 0.75538731, "num_input_tokens_seen": 43865685, "step": 2053, "time_per_iteration": 2.766519546508789 }, { "auxiliary_loss_clip": 0.01189053, "auxiliary_loss_mlp": 0.01033279, "balance_loss_clip": 1.05683923, "balance_loss_mlp": 1.02389145, "epoch": 0.24697889737269285, "flos": 17960470508160.0, "grad_norm": 2.3337570245693127, "language_loss": 0.7529223, "learning_rate": 3.5262873677754375e-06, "loss": 0.77514565, "num_input_tokens_seen": 43883690, "step": 2054, "time_per_iteration": 2.927088499069214 }, { "auxiliary_loss_clip": 0.01209389, "auxiliary_loss_mlp": 0.01037047, "balance_loss_clip": 1.05701637, "balance_loss_mlp": 1.02764153, "epoch": 0.24709914026333193, "flos": 27344611221120.0, "grad_norm": 1.687584306784854, "language_loss": 0.8053875, "learning_rate": 3.5257838580733745e-06, "loss": 0.82785189, "num_input_tokens_seen": 43903295, "step": 2055, "time_per_iteration": 2.8818819522857666 }, { "auxiliary_loss_clip": 0.01212122, "auxiliary_loss_mlp": 0.01037544, "balance_loss_clip": 1.05907083, "balance_loss_mlp": 1.02807856, "epoch": 0.24721938315397102, "flos": 19275519335040.0, "grad_norm": 1.937899035515247, "language_loss": 0.87310755, "learning_rate": 3.5252801169111138e-06, "loss": 0.89560425, "num_input_tokens_seen": 43920960, "step": 2056, "time_per_iteration": 2.751681327819824 }, { "auxiliary_loss_clip": 0.01200967, "auxiliary_loss_mlp": 0.01038381, "balance_loss_clip": 1.05908227, "balance_loss_mlp": 1.02852809, "epoch": 0.2473396260446101, "flos": 23185796688000.0, "grad_norm": 1.9693783484044787, "language_loss": 0.80092502, "learning_rate": 3.524776144365072e-06, "loss": 0.82331848, "num_input_tokens_seen": 43939415, "step": 2057, "time_per_iteration": 2.958500623703003 }, { "auxiliary_loss_clip": 0.01198585, "auxiliary_loss_mlp": 0.01034608, "balance_loss_clip": 1.05976439, "balance_loss_mlp": 1.02442765, "epoch": 0.2474598689352492, "flos": 21142443697920.0, "grad_norm": 1.6545057375906227, "language_loss": 0.79444784, "learning_rate": 3.5242719405117016e-06, "loss": 0.81677973, "num_input_tokens_seen": 43959220, "step": 2058, "time_per_iteration": 2.9856088161468506 }, { "auxiliary_loss_clip": 0.0120462, "auxiliary_loss_mlp": 0.010693, "balance_loss_clip": 1.05793834, "balance_loss_mlp": 1.0312413, "epoch": 0.2475801118258883, "flos": 21648352803840.0, "grad_norm": 3.090627231353954, "language_loss": 0.75575757, "learning_rate": 3.5237675054274893e-06, "loss": 0.77849674, "num_input_tokens_seen": 43978420, "step": 2059, "time_per_iteration": 2.939532518386841 }, { "auxiliary_loss_clip": 0.0121153, "auxiliary_loss_mlp": 0.01033531, "balance_loss_clip": 1.06172752, "balance_loss_mlp": 1.02317202, "epoch": 0.24770035471652738, "flos": 22674500542080.0, "grad_norm": 1.888213248125545, "language_loss": 0.80338961, "learning_rate": 3.5232628391889584e-06, "loss": 0.82584023, "num_input_tokens_seen": 43996710, "step": 2060, "time_per_iteration": 2.893012285232544 }, { "auxiliary_loss_clip": 0.01192679, "auxiliary_loss_mlp": 0.01037972, "balance_loss_clip": 1.05678999, "balance_loss_mlp": 1.02903152, "epoch": 0.2478205976071665, "flos": 22163814927360.0, "grad_norm": 2.369529479201006, "language_loss": 0.64281422, "learning_rate": 3.522757941872666e-06, "loss": 0.66512072, "num_input_tokens_seen": 44014865, "step": 2061, "time_per_iteration": 2.902465343475342 }, { "auxiliary_loss_clip": 0.0121395, "auxiliary_loss_mlp": 0.01062928, "balance_loss_clip": 1.06077933, "balance_loss_mlp": 1.02595019, "epoch": 0.24794084049780557, "flos": 24973106555520.0, "grad_norm": 1.7086514797181693, "language_loss": 0.8255533, "learning_rate": 3.5222528135552042e-06, "loss": 0.84832209, "num_input_tokens_seen": 44036325, "step": 2062, "time_per_iteration": 2.8900818824768066 }, { "auxiliary_loss_clip": 0.01208252, "auxiliary_loss_mlp": 0.01039778, "balance_loss_clip": 1.06052351, "balance_loss_mlp": 1.03032494, "epoch": 0.24806108338844465, "flos": 18296379521280.0, "grad_norm": 1.9549079961909615, "language_loss": 0.80637181, "learning_rate": 3.521747454313201e-06, "loss": 0.82885212, "num_input_tokens_seen": 44055005, "step": 2063, "time_per_iteration": 2.8259222507476807 }, { "auxiliary_loss_clip": 0.01195827, "auxiliary_loss_mlp": 0.01038749, "balance_loss_clip": 1.05892897, "balance_loss_mlp": 1.02815163, "epoch": 0.24818132627908374, "flos": 19282163351040.0, "grad_norm": 2.2250127198721237, "language_loss": 0.66649151, "learning_rate": 3.521241864223319e-06, "loss": 0.68883729, "num_input_tokens_seen": 44073965, "step": 2064, "time_per_iteration": 2.80892276763916 }, { "auxiliary_loss_clip": 0.01101726, "auxiliary_loss_mlp": 0.01005076, "balance_loss_clip": 1.01850843, "balance_loss_mlp": 1.00319278, "epoch": 0.24830156916972285, "flos": 70285837881600.0, "grad_norm": 0.7931598020360386, "language_loss": 0.61945212, "learning_rate": 3.5207360433622552e-06, "loss": 0.64052016, "num_input_tokens_seen": 44135965, "step": 2065, "time_per_iteration": 3.370144844055176 }, { "auxiliary_loss_clip": 0.01198884, "auxiliary_loss_mlp": 0.01033998, "balance_loss_clip": 1.05978012, "balance_loss_mlp": 1.02445579, "epoch": 0.24842181206036193, "flos": 40409128287360.0, "grad_norm": 1.719493996214331, "language_loss": 0.74636567, "learning_rate": 3.5202299918067437e-06, "loss": 0.76869452, "num_input_tokens_seen": 44159560, "step": 2066, "time_per_iteration": 2.9954006671905518 }, { "auxiliary_loss_clip": 0.01207671, "auxiliary_loss_mlp": 0.01036579, "balance_loss_clip": 1.05922675, "balance_loss_mlp": 1.02736402, "epoch": 0.248542054951001, "flos": 20082432412800.0, "grad_norm": 2.927146401228489, "language_loss": 0.70155483, "learning_rate": 3.519723709633551e-06, "loss": 0.72399735, "num_input_tokens_seen": 44178320, "step": 2067, "time_per_iteration": 2.8142247200012207 }, { "auxiliary_loss_clip": 0.01196443, "auxiliary_loss_mlp": 0.01035523, "balance_loss_clip": 1.05803502, "balance_loss_mlp": 1.02570653, "epoch": 0.24866229784164012, "flos": 23513948363520.0, "grad_norm": 1.7941803998350032, "language_loss": 0.835976, "learning_rate": 3.519217196919479e-06, "loss": 0.85829562, "num_input_tokens_seen": 44197305, "step": 2068, "time_per_iteration": 2.840486526489258 }, { "auxiliary_loss_clip": 0.01206616, "auxiliary_loss_mlp": 0.01031643, "balance_loss_clip": 1.05985558, "balance_loss_mlp": 1.02209997, "epoch": 0.2487825407322792, "flos": 19865101173120.0, "grad_norm": 1.7681881659406427, "language_loss": 0.73311913, "learning_rate": 3.518710453741367e-06, "loss": 0.75550169, "num_input_tokens_seen": 44216505, "step": 2069, "time_per_iteration": 2.8311893939971924 }, { "auxiliary_loss_clip": 0.01197572, "auxiliary_loss_mlp": 0.01054492, "balance_loss_clip": 1.05999231, "balance_loss_mlp": 1.01899791, "epoch": 0.2489027836229183, "flos": 22017622573440.0, "grad_norm": 3.162107757160438, "language_loss": 0.68012017, "learning_rate": 3.518203480176086e-06, "loss": 0.70264077, "num_input_tokens_seen": 44235435, "step": 2070, "time_per_iteration": 2.9906537532806396 }, { "auxiliary_loss_clip": 0.01192393, "auxiliary_loss_mlp": 0.01037733, "balance_loss_clip": 1.05879748, "balance_loss_mlp": 1.02827346, "epoch": 0.2490230265135574, "flos": 23294354567040.0, "grad_norm": 1.756243529452079, "language_loss": 0.81022179, "learning_rate": 3.517696276300545e-06, "loss": 0.83252305, "num_input_tokens_seen": 44256975, "step": 2071, "time_per_iteration": 3.0441815853118896 }, { "auxiliary_loss_clip": 0.0121091, "auxiliary_loss_mlp": 0.01035085, "balance_loss_clip": 1.06290495, "balance_loss_mlp": 1.0257988, "epoch": 0.24914326940419648, "flos": 19826784339840.0, "grad_norm": 3.679349226586698, "language_loss": 0.69083571, "learning_rate": 3.517188842191685e-06, "loss": 0.7132957, "num_input_tokens_seen": 44275125, "step": 2072, "time_per_iteration": 3.8799047470092773 }, { "auxiliary_loss_clip": 0.01206269, "auxiliary_loss_mlp": 0.010375, "balance_loss_clip": 1.05898666, "balance_loss_mlp": 1.02795124, "epoch": 0.24926351229483557, "flos": 20229271211520.0, "grad_norm": 1.5423048309778844, "language_loss": 0.73836237, "learning_rate": 3.5166811779264837e-06, "loss": 0.7608, "num_input_tokens_seen": 44295445, "step": 2073, "time_per_iteration": 2.8767621517181396 }, { "auxiliary_loss_clip": 0.01210088, "auxiliary_loss_mlp": 0.0103225, "balance_loss_clip": 1.05779946, "balance_loss_mlp": 1.02279699, "epoch": 0.24938375518547465, "flos": 23294570048640.0, "grad_norm": 2.139535936206723, "language_loss": 0.78067935, "learning_rate": 3.5161732835819545e-06, "loss": 0.80310273, "num_input_tokens_seen": 44314755, "step": 2074, "time_per_iteration": 4.053346633911133 }, { "auxiliary_loss_clip": 0.01209841, "auxiliary_loss_mlp": 0.01039512, "balance_loss_clip": 1.05877805, "balance_loss_mlp": 1.02920675, "epoch": 0.24950399807611376, "flos": 17311673099520.0, "grad_norm": 14.498597400462343, "language_loss": 0.8325454, "learning_rate": 3.515665159235143e-06, "loss": 0.85503888, "num_input_tokens_seen": 44333640, "step": 2075, "time_per_iteration": 3.9099621772766113 }, { "auxiliary_loss_clip": 0.01203794, "auxiliary_loss_mlp": 0.01031921, "balance_loss_clip": 1.05864787, "balance_loss_mlp": 1.02295089, "epoch": 0.24962424096675284, "flos": 19024863252480.0, "grad_norm": 1.5421831947080349, "language_loss": 0.74766988, "learning_rate": 3.5151568049631318e-06, "loss": 0.77002704, "num_input_tokens_seen": 44352355, "step": 2076, "time_per_iteration": 3.7615766525268555 }, { "auxiliary_loss_clip": 0.01210742, "auxiliary_loss_mlp": 0.01034108, "balance_loss_clip": 1.05860162, "balance_loss_mlp": 1.02451777, "epoch": 0.24974448385739192, "flos": 33398790710400.0, "grad_norm": 2.804124013358194, "language_loss": 0.80348611, "learning_rate": 3.5146482208430385e-06, "loss": 0.82593465, "num_input_tokens_seen": 44374185, "step": 2077, "time_per_iteration": 2.930952310562134 }, { "auxiliary_loss_clip": 0.01184283, "auxiliary_loss_mlp": 0.01040219, "balance_loss_clip": 1.05895388, "balance_loss_mlp": 1.02974689, "epoch": 0.24986472674803104, "flos": 30007279532160.0, "grad_norm": 3.3863149345581736, "language_loss": 0.68098068, "learning_rate": 3.514139406952014e-06, "loss": 0.70322573, "num_input_tokens_seen": 44396210, "step": 2078, "time_per_iteration": 2.909465789794922 }, { "auxiliary_loss_clip": 0.01204351, "auxiliary_loss_mlp": 0.01035432, "balance_loss_clip": 1.05691838, "balance_loss_mlp": 1.02580035, "epoch": 0.24998496963867012, "flos": 26613074833920.0, "grad_norm": 2.0081230441000355, "language_loss": 0.83814335, "learning_rate": 3.5136303633672454e-06, "loss": 0.86054122, "num_input_tokens_seen": 44416340, "step": 2079, "time_per_iteration": 2.786970376968384 }, { "auxiliary_loss_clip": 0.01207596, "auxiliary_loss_mlp": 0.01060897, "balance_loss_clip": 1.06047308, "balance_loss_mlp": 1.02557206, "epoch": 0.25010521252930923, "flos": 23553989049600.0, "grad_norm": 1.651932425615973, "language_loss": 0.74385059, "learning_rate": 3.5131210901659544e-06, "loss": 0.76653546, "num_input_tokens_seen": 44438095, "step": 2080, "time_per_iteration": 2.8729116916656494 }, { "auxiliary_loss_clip": 0.01196073, "auxiliary_loss_mlp": 0.01035826, "balance_loss_clip": 1.05775309, "balance_loss_mlp": 1.0265274, "epoch": 0.2502254554199483, "flos": 23441193365760.0, "grad_norm": 2.284495146253505, "language_loss": 0.82279158, "learning_rate": 3.5126115874253967e-06, "loss": 0.84511054, "num_input_tokens_seen": 44457650, "step": 2081, "time_per_iteration": 2.801931142807007 }, { "auxiliary_loss_clip": 0.01200727, "auxiliary_loss_mlp": 0.01030098, "balance_loss_clip": 1.05949092, "balance_loss_mlp": 1.02103841, "epoch": 0.2503456983105874, "flos": 28761681651840.0, "grad_norm": 1.8459226532647037, "language_loss": 0.81196201, "learning_rate": 3.5121018552228644e-06, "loss": 0.83427024, "num_input_tokens_seen": 44476155, "step": 2082, "time_per_iteration": 2.87222957611084 }, { "auxiliary_loss_clip": 0.01199492, "auxiliary_loss_mlp": 0.01029382, "balance_loss_clip": 1.05905545, "balance_loss_mlp": 1.02032769, "epoch": 0.2504659412012265, "flos": 18770256673920.0, "grad_norm": 2.298059078704271, "language_loss": 0.76255614, "learning_rate": 3.5115918936356827e-06, "loss": 0.78484482, "num_input_tokens_seen": 44492910, "step": 2083, "time_per_iteration": 2.833953857421875 }, { "auxiliary_loss_clip": 0.01185706, "auxiliary_loss_mlp": 0.01034593, "balance_loss_clip": 1.06158566, "balance_loss_mlp": 1.02507448, "epoch": 0.25058618409186556, "flos": 16873383346560.0, "grad_norm": 2.3641254665682703, "language_loss": 0.78680432, "learning_rate": 3.5110817027412123e-06, "loss": 0.80900729, "num_input_tokens_seen": 44512000, "step": 2084, "time_per_iteration": 2.7241575717926025 }, { "auxiliary_loss_clip": 0.01195631, "auxiliary_loss_mlp": 0.01030934, "balance_loss_clip": 1.05860043, "balance_loss_mlp": 1.02196348, "epoch": 0.25070642698250467, "flos": 24425540651520.0, "grad_norm": 2.2512544835625143, "language_loss": 0.69186032, "learning_rate": 3.5105712826168493e-06, "loss": 0.71412599, "num_input_tokens_seen": 44531650, "step": 2085, "time_per_iteration": 2.8150224685668945 }, { "auxiliary_loss_clip": 0.01203841, "auxiliary_loss_mlp": 0.01057423, "balance_loss_clip": 1.05655205, "balance_loss_mlp": 1.02189028, "epoch": 0.2508266698731437, "flos": 20260944028800.0, "grad_norm": 2.482464270383857, "language_loss": 0.70766521, "learning_rate": 3.5100606333400235e-06, "loss": 0.7302779, "num_input_tokens_seen": 44548785, "step": 2086, "time_per_iteration": 2.843451499938965 }, { "auxiliary_loss_clip": 0.01216214, "auxiliary_loss_mlp": 0.01029584, "balance_loss_clip": 1.06144738, "balance_loss_mlp": 1.02038121, "epoch": 0.25094691276378284, "flos": 19245318975360.0, "grad_norm": 2.4979435417803124, "language_loss": 0.76860851, "learning_rate": 3.5095497549882006e-06, "loss": 0.79106647, "num_input_tokens_seen": 44567230, "step": 2087, "time_per_iteration": 2.8181276321411133 }, { "auxiliary_loss_clip": 0.01209954, "auxiliary_loss_mlp": 0.01037073, "balance_loss_clip": 1.06217766, "balance_loss_mlp": 1.02754819, "epoch": 0.25106715565442195, "flos": 26943237671040.0, "grad_norm": 1.9193798543050253, "language_loss": 0.7276547, "learning_rate": 3.50903864763888e-06, "loss": 0.75012499, "num_input_tokens_seen": 44588020, "step": 2088, "time_per_iteration": 2.820049524307251 }, { "auxiliary_loss_clip": 0.0120941, "auxiliary_loss_mlp": 0.01029714, "balance_loss_clip": 1.0579567, "balance_loss_mlp": 1.02023053, "epoch": 0.251187398545061, "flos": 48359570572800.0, "grad_norm": 2.023380158720194, "language_loss": 0.75833291, "learning_rate": 3.5085273113695965e-06, "loss": 0.78072411, "num_input_tokens_seen": 44612590, "step": 2089, "time_per_iteration": 2.9851510524749756 }, { "auxiliary_loss_clip": 0.01212257, "auxiliary_loss_mlp": 0.01033006, "balance_loss_clip": 1.06017268, "balance_loss_mlp": 1.02379668, "epoch": 0.2513076414357001, "flos": 27016100409600.0, "grad_norm": 1.7592404937370278, "language_loss": 0.78191918, "learning_rate": 3.508015746257919e-06, "loss": 0.80437177, "num_input_tokens_seen": 44631630, "step": 2090, "time_per_iteration": 2.836881399154663 }, { "auxiliary_loss_clip": 0.01202025, "auxiliary_loss_mlp": 0.01031874, "balance_loss_clip": 1.05876696, "balance_loss_mlp": 1.02208138, "epoch": 0.2514278843263392, "flos": 19463619882240.0, "grad_norm": 1.9652800695182469, "language_loss": 0.83318037, "learning_rate": 3.5075039523814518e-06, "loss": 0.85551941, "num_input_tokens_seen": 44650820, "step": 2091, "time_per_iteration": 2.8333334922790527 }, { "auxiliary_loss_clip": 0.0121123, "auxiliary_loss_mlp": 0.01030572, "balance_loss_clip": 1.05773985, "balance_loss_mlp": 1.01969433, "epoch": 0.2515481272169783, "flos": 16866092885760.0, "grad_norm": 2.0580714551985366, "language_loss": 0.81876016, "learning_rate": 3.506991929817834e-06, "loss": 0.84117818, "num_input_tokens_seen": 44667540, "step": 2092, "time_per_iteration": 2.7334110736846924 }, { "auxiliary_loss_clip": 0.01210641, "auxiliary_loss_mlp": 0.01038084, "balance_loss_clip": 1.06208527, "balance_loss_mlp": 1.0296793, "epoch": 0.2516683701076174, "flos": 23732464752000.0, "grad_norm": 1.735950205427012, "language_loss": 0.82451224, "learning_rate": 3.506479678644738e-06, "loss": 0.84699947, "num_input_tokens_seen": 44687935, "step": 2093, "time_per_iteration": 2.7489356994628906 }, { "auxiliary_loss_clip": 0.011864, "auxiliary_loss_mlp": 0.01037273, "balance_loss_clip": 1.05648255, "balance_loss_mlp": 1.02798057, "epoch": 0.2517886129982565, "flos": 27635954434560.0, "grad_norm": 2.785142363987121, "language_loss": 0.73792237, "learning_rate": 3.505967198939873e-06, "loss": 0.76015913, "num_input_tokens_seen": 44704975, "step": 2094, "time_per_iteration": 2.855348587036133 }, { "auxiliary_loss_clip": 0.01199525, "auxiliary_loss_mlp": 0.01030803, "balance_loss_clip": 1.05556464, "balance_loss_mlp": 1.0217967, "epoch": 0.25190885588889556, "flos": 38104596529920.0, "grad_norm": 2.0011683138870926, "language_loss": 0.78175658, "learning_rate": 3.5054544907809813e-06, "loss": 0.80405986, "num_input_tokens_seen": 44725475, "step": 2095, "time_per_iteration": 2.9325802326202393 }, { "auxiliary_loss_clip": 0.01197747, "auxiliary_loss_mlp": 0.0105778, "balance_loss_clip": 1.05921638, "balance_loss_mlp": 1.02086389, "epoch": 0.25202909877953467, "flos": 22269894768000.0, "grad_norm": 2.0648995173713525, "language_loss": 0.80455095, "learning_rate": 3.50494155424584e-06, "loss": 0.82710624, "num_input_tokens_seen": 44744380, "step": 2096, "time_per_iteration": 2.927882194519043 }, { "auxiliary_loss_clip": 0.01208535, "auxiliary_loss_mlp": 0.0103429, "balance_loss_clip": 1.05860806, "balance_loss_mlp": 1.02481258, "epoch": 0.2521493416701738, "flos": 21761759018880.0, "grad_norm": 2.567076380431348, "language_loss": 0.83064902, "learning_rate": 3.504428389412262e-06, "loss": 0.85307729, "num_input_tokens_seen": 44765190, "step": 2097, "time_per_iteration": 2.807185173034668 }, { "auxiliary_loss_clip": 0.01206375, "auxiliary_loss_mlp": 0.01030113, "balance_loss_clip": 1.05912042, "balance_loss_mlp": 1.02022421, "epoch": 0.25226958456081283, "flos": 27746738956800.0, "grad_norm": 2.00368935021317, "language_loss": 0.73064601, "learning_rate": 3.5039149963580927e-06, "loss": 0.75301087, "num_input_tokens_seen": 44785210, "step": 2098, "time_per_iteration": 3.722954750061035 }, { "auxiliary_loss_clip": 0.01199555, "auxiliary_loss_mlp": 0.01029132, "balance_loss_clip": 1.06001782, "balance_loss_mlp": 1.01994085, "epoch": 0.25238982745145194, "flos": 30732171903360.0, "grad_norm": 2.2814279263345614, "language_loss": 0.70841956, "learning_rate": 3.503401375161215e-06, "loss": 0.73070645, "num_input_tokens_seen": 44804955, "step": 2099, "time_per_iteration": 2.811668634414673 }, { "auxiliary_loss_clip": 0.01206831, "auxiliary_loss_mlp": 0.01032879, "balance_loss_clip": 1.05714655, "balance_loss_mlp": 1.02310991, "epoch": 0.252510070342091, "flos": 20266331068800.0, "grad_norm": 1.4687798768401372, "language_loss": 0.83669066, "learning_rate": 3.502887525899544e-06, "loss": 0.85908777, "num_input_tokens_seen": 44823935, "step": 2100, "time_per_iteration": 3.7207412719726562 }, { "auxiliary_loss_clip": 0.01203189, "auxiliary_loss_mlp": 0.01041477, "balance_loss_clip": 1.0592258, "balance_loss_mlp": 1.03092694, "epoch": 0.2526303132327301, "flos": 22747399194240.0, "grad_norm": 1.8267088607925022, "language_loss": 0.82969767, "learning_rate": 3.50237344865103e-06, "loss": 0.8521443, "num_input_tokens_seen": 44844935, "step": 2101, "time_per_iteration": 2.835127830505371 }, { "auxiliary_loss_clip": 0.01210621, "auxiliary_loss_mlp": 0.01032775, "balance_loss_clip": 1.05886531, "balance_loss_mlp": 1.02330971, "epoch": 0.2527505561233692, "flos": 30263466309120.0, "grad_norm": 2.625090633889195, "language_loss": 0.76319355, "learning_rate": 3.501859143493658e-06, "loss": 0.78562748, "num_input_tokens_seen": 44865565, "step": 2102, "time_per_iteration": 4.793001174926758 }, { "auxiliary_loss_clip": 0.01104961, "auxiliary_loss_mlp": 0.01003847, "balance_loss_clip": 1.02356601, "balance_loss_mlp": 1.00204659, "epoch": 0.2528707990140083, "flos": 58492917164160.0, "grad_norm": 0.9220671772043545, "language_loss": 0.60536301, "learning_rate": 3.5013446105054488e-06, "loss": 0.62645113, "num_input_tokens_seen": 44918485, "step": 2103, "time_per_iteration": 3.057772159576416 }, { "auxiliary_loss_clip": 0.01188266, "auxiliary_loss_mlp": 0.01038297, "balance_loss_clip": 1.06035638, "balance_loss_mlp": 1.02945137, "epoch": 0.2529910419046474, "flos": 24645134448000.0, "grad_norm": 1.7171222497439618, "language_loss": 0.75001431, "learning_rate": 3.5008298497644555e-06, "loss": 0.77227998, "num_input_tokens_seen": 44937530, "step": 2104, "time_per_iteration": 2.9498579502105713 }, { "auxiliary_loss_clip": 0.01199869, "auxiliary_loss_mlp": 0.01035224, "balance_loss_clip": 1.05879557, "balance_loss_mlp": 1.02548444, "epoch": 0.2531112847952865, "flos": 23842135952640.0, "grad_norm": 2.284042948583063, "language_loss": 0.8809613, "learning_rate": 3.500314861348767e-06, "loss": 0.90331221, "num_input_tokens_seen": 44958165, "step": 2105, "time_per_iteration": 2.93096661567688 }, { "auxiliary_loss_clip": 0.01190044, "auxiliary_loss_mlp": 0.01031956, "balance_loss_clip": 1.05886436, "balance_loss_mlp": 1.02302766, "epoch": 0.25323152768592555, "flos": 16143822207360.0, "grad_norm": 2.025311897314839, "language_loss": 0.77075815, "learning_rate": 3.499799645336507e-06, "loss": 0.79297817, "num_input_tokens_seen": 44975060, "step": 2106, "time_per_iteration": 2.7817516326904297 }, { "auxiliary_loss_clip": 0.01211651, "auxiliary_loss_mlp": 0.01030125, "balance_loss_clip": 1.06442773, "balance_loss_mlp": 1.02142906, "epoch": 0.25335177057656466, "flos": 28405161210240.0, "grad_norm": 1.3361339305449653, "language_loss": 0.86916828, "learning_rate": 3.4992842018058336e-06, "loss": 0.89158607, "num_input_tokens_seen": 44997960, "step": 2107, "time_per_iteration": 2.9731218814849854 }, { "auxiliary_loss_clip": 0.01206112, "auxiliary_loss_mlp": 0.01029258, "balance_loss_clip": 1.06004035, "balance_loss_mlp": 1.02006125, "epoch": 0.25347201346720377, "flos": 18799666934400.0, "grad_norm": 2.099014088767429, "language_loss": 0.88455701, "learning_rate": 3.4987685308349384e-06, "loss": 0.90691072, "num_input_tokens_seen": 45015690, "step": 2108, "time_per_iteration": 2.8921682834625244 }, { "auxiliary_loss_clip": 0.01198819, "auxiliary_loss_mlp": 0.01033476, "balance_loss_clip": 1.05766881, "balance_loss_mlp": 1.02426147, "epoch": 0.2535922563578428, "flos": 15815490963840.0, "grad_norm": 2.2731192339157906, "language_loss": 0.61238575, "learning_rate": 3.4982526325020497e-06, "loss": 0.63470864, "num_input_tokens_seen": 45032660, "step": 2109, "time_per_iteration": 2.8486077785491943 }, { "auxiliary_loss_clip": 0.01205435, "auxiliary_loss_mlp": 0.01029791, "balance_loss_clip": 1.06045067, "balance_loss_mlp": 1.02046263, "epoch": 0.25371249924848194, "flos": 16318922031360.0, "grad_norm": 2.4441735679367196, "language_loss": 0.8193866, "learning_rate": 3.4977365068854273e-06, "loss": 0.84173888, "num_input_tokens_seen": 45048280, "step": 2110, "time_per_iteration": 2.7317421436309814 }, { "auxiliary_loss_clip": 0.01194234, "auxiliary_loss_mlp": 0.01032455, "balance_loss_clip": 1.0595268, "balance_loss_mlp": 1.0227747, "epoch": 0.25383274213912105, "flos": 21761615364480.0, "grad_norm": 1.6486609595183166, "language_loss": 0.73377061, "learning_rate": 3.4972201540633676e-06, "loss": 0.75603753, "num_input_tokens_seen": 45067635, "step": 2111, "time_per_iteration": 2.7207210063934326 }, { "auxiliary_loss_clip": 0.01194276, "auxiliary_loss_mlp": 0.010396, "balance_loss_clip": 1.05778778, "balance_loss_mlp": 1.02994955, "epoch": 0.2539529850297601, "flos": 21396870708480.0, "grad_norm": 1.7883605092892294, "language_loss": 0.85583806, "learning_rate": 3.4967035741142008e-06, "loss": 0.87817681, "num_input_tokens_seen": 45086455, "step": 2112, "time_per_iteration": 2.7959775924682617 }, { "auxiliary_loss_clip": 0.01192662, "auxiliary_loss_mlp": 0.01035082, "balance_loss_clip": 1.05964518, "balance_loss_mlp": 1.02648067, "epoch": 0.2540732279203992, "flos": 25228467319680.0, "grad_norm": 1.856610929759295, "language_loss": 0.82080495, "learning_rate": 3.4961867671162917e-06, "loss": 0.84308237, "num_input_tokens_seen": 45106385, "step": 2113, "time_per_iteration": 2.791088819503784 }, { "auxiliary_loss_clip": 0.0121078, "auxiliary_loss_mlp": 0.01034862, "balance_loss_clip": 1.05879259, "balance_loss_mlp": 1.02474689, "epoch": 0.2541934708110383, "flos": 19427386037760.0, "grad_norm": 2.8737113283975044, "language_loss": 0.77868319, "learning_rate": 3.4956697331480402e-06, "loss": 0.80113959, "num_input_tokens_seen": 45124955, "step": 2114, "time_per_iteration": 2.707597255706787 }, { "auxiliary_loss_clip": 0.0120513, "auxiliary_loss_mlp": 0.01032989, "balance_loss_clip": 1.05925512, "balance_loss_mlp": 1.02369082, "epoch": 0.2543137137016774, "flos": 23949436855680.0, "grad_norm": 1.7721994546944575, "language_loss": 0.80158222, "learning_rate": 3.495152472287879e-06, "loss": 0.8239634, "num_input_tokens_seen": 45145665, "step": 2115, "time_per_iteration": 2.7247660160064697 }, { "auxiliary_loss_clip": 0.01198561, "auxiliary_loss_mlp": 0.01033752, "balance_loss_clip": 1.0582571, "balance_loss_mlp": 1.02434063, "epoch": 0.2544339565923165, "flos": 25593283802880.0, "grad_norm": 2.176503968203904, "language_loss": 0.74006498, "learning_rate": 3.4946349846142766e-06, "loss": 0.76238817, "num_input_tokens_seen": 45164805, "step": 2116, "time_per_iteration": 2.8984506130218506 }, { "auxiliary_loss_clip": 0.01208453, "auxiliary_loss_mlp": 0.01034918, "balance_loss_clip": 1.05689287, "balance_loss_mlp": 1.0255481, "epoch": 0.25455419948295555, "flos": 21689470897920.0, "grad_norm": 1.918132869567011, "language_loss": 0.76003021, "learning_rate": 3.4941172702057353e-06, "loss": 0.78246391, "num_input_tokens_seen": 45184865, "step": 2117, "time_per_iteration": 2.847450017929077 }, { "auxiliary_loss_clip": 0.01201755, "auxiliary_loss_mlp": 0.01039334, "balance_loss_clip": 1.05871606, "balance_loss_mlp": 1.02945161, "epoch": 0.25467444237359466, "flos": 26250341339520.0, "grad_norm": 2.0764585195250342, "language_loss": 0.80638832, "learning_rate": 3.4935993291407924e-06, "loss": 0.82879919, "num_input_tokens_seen": 45203690, "step": 2118, "time_per_iteration": 2.9104807376861572 }, { "auxiliary_loss_clip": 0.01200718, "auxiliary_loss_mlp": 0.01033555, "balance_loss_clip": 1.06003332, "balance_loss_mlp": 1.02382767, "epoch": 0.25479468526423377, "flos": 26979686997120.0, "grad_norm": 2.9104995240329163, "language_loss": 0.71986198, "learning_rate": 3.4930811614980183e-06, "loss": 0.74220467, "num_input_tokens_seen": 45225385, "step": 2119, "time_per_iteration": 2.8022239208221436 }, { "auxiliary_loss_clip": 0.01201471, "auxiliary_loss_mlp": 0.01037775, "balance_loss_clip": 1.06009352, "balance_loss_mlp": 1.02831006, "epoch": 0.2549149281548728, "flos": 23475811098240.0, "grad_norm": 1.719949288327079, "language_loss": 0.79120493, "learning_rate": 3.4925627673560198e-06, "loss": 0.81359732, "num_input_tokens_seen": 45246045, "step": 2120, "time_per_iteration": 2.7973103523254395 }, { "auxiliary_loss_clip": 0.01195099, "auxiliary_loss_mlp": 0.01029763, "balance_loss_clip": 1.05722523, "balance_loss_mlp": 1.02106071, "epoch": 0.25503517104551193, "flos": 25812302981760.0, "grad_norm": 1.7986301690684345, "language_loss": 0.88416922, "learning_rate": 3.4920441467934357e-06, "loss": 0.90641785, "num_input_tokens_seen": 45266560, "step": 2121, "time_per_iteration": 3.094005584716797 }, { "auxiliary_loss_clip": 0.01192368, "auxiliary_loss_mlp": 0.0102865, "balance_loss_clip": 1.05906188, "balance_loss_mlp": 1.01848197, "epoch": 0.25515541393615104, "flos": 26645106787200.0, "grad_norm": 1.7693165837026945, "language_loss": 0.83016777, "learning_rate": 3.491525299888941e-06, "loss": 0.85237801, "num_input_tokens_seen": 45285405, "step": 2122, "time_per_iteration": 2.9538228511810303 }, { "auxiliary_loss_clip": 0.01099274, "auxiliary_loss_mlp": 0.0102862, "balance_loss_clip": 1.02790713, "balance_loss_mlp": 1.00096214, "epoch": 0.2552756568267901, "flos": 65955945847680.0, "grad_norm": 0.8787535845134538, "language_loss": 0.62706649, "learning_rate": 3.491006226721244e-06, "loss": 0.64834547, "num_input_tokens_seen": 45349615, "step": 2123, "time_per_iteration": 3.4148764610290527 }, { "auxiliary_loss_clip": 0.01204742, "auxiliary_loss_mlp": 0.01062848, "balance_loss_clip": 1.0614531, "balance_loss_mlp": 1.02599525, "epoch": 0.2553958997174292, "flos": 17931096161280.0, "grad_norm": 2.739514629305108, "language_loss": 0.77741182, "learning_rate": 3.4904869273690882e-06, "loss": 0.80008775, "num_input_tokens_seen": 45367505, "step": 2124, "time_per_iteration": 3.6742565631866455 }, { "auxiliary_loss_clip": 0.01208145, "auxiliary_loss_mlp": 0.01033509, "balance_loss_clip": 1.05794203, "balance_loss_mlp": 1.02387047, "epoch": 0.2555161426080683, "flos": 23367791923200.0, "grad_norm": 1.823253161036928, "language_loss": 0.88769674, "learning_rate": 3.489967401911251e-06, "loss": 0.91011333, "num_input_tokens_seen": 45386805, "step": 2125, "time_per_iteration": 2.6948888301849365 }, { "auxiliary_loss_clip": 0.01215574, "auxiliary_loss_mlp": 0.01034998, "balance_loss_clip": 1.06174004, "balance_loss_mlp": 1.02450776, "epoch": 0.2556363854987074, "flos": 40625130723840.0, "grad_norm": 1.853795683901748, "language_loss": 0.69675618, "learning_rate": 3.4894476504265428e-06, "loss": 0.71926194, "num_input_tokens_seen": 45411045, "step": 2126, "time_per_iteration": 3.82967209815979 }, { "auxiliary_loss_clip": 0.01101122, "auxiliary_loss_mlp": 0.01003581, "balance_loss_clip": 1.02365923, "balance_loss_mlp": 1.00168598, "epoch": 0.2557566283893465, "flos": 68019443389440.0, "grad_norm": 0.8649150410225716, "language_loss": 0.54475796, "learning_rate": 3.4889276729938104e-06, "loss": 0.56580496, "num_input_tokens_seen": 45469575, "step": 2127, "time_per_iteration": 3.1456775665283203 }, { "auxiliary_loss_clip": 0.01199463, "auxiliary_loss_mlp": 0.01030892, "balance_loss_clip": 1.05943704, "balance_loss_mlp": 1.02160585, "epoch": 0.2558768712799856, "flos": 22635645004800.0, "grad_norm": 2.137693036974967, "language_loss": 0.80343056, "learning_rate": 3.488407469691934e-06, "loss": 0.82573414, "num_input_tokens_seen": 45490270, "step": 2128, "time_per_iteration": 4.918376922607422 }, { "auxiliary_loss_clip": 0.01203821, "auxiliary_loss_mlp": 0.01032665, "balance_loss_clip": 1.06161118, "balance_loss_mlp": 1.02325952, "epoch": 0.25599711417062465, "flos": 26396354125440.0, "grad_norm": 2.469135791726126, "language_loss": 0.80986714, "learning_rate": 3.487887040599828e-06, "loss": 0.832232, "num_input_tokens_seen": 45510070, "step": 2129, "time_per_iteration": 2.831876754760742 }, { "auxiliary_loss_clip": 0.01211805, "auxiliary_loss_mlp": 0.01033565, "balance_loss_clip": 1.06055355, "balance_loss_mlp": 1.02359366, "epoch": 0.25611735706126376, "flos": 22852042490880.0, "grad_norm": 3.325242399859175, "language_loss": 0.76844692, "learning_rate": 3.4873663857964407e-06, "loss": 0.79090065, "num_input_tokens_seen": 45527285, "step": 2130, "time_per_iteration": 2.738274097442627 }, { "auxiliary_loss_clip": 0.0119223, "auxiliary_loss_mlp": 0.01035179, "balance_loss_clip": 1.05718398, "balance_loss_mlp": 1.02596402, "epoch": 0.2562375999519028, "flos": 23367863750400.0, "grad_norm": 1.7799753310489579, "language_loss": 0.66821313, "learning_rate": 3.4868455053607556e-06, "loss": 0.69048727, "num_input_tokens_seen": 45546900, "step": 2131, "time_per_iteration": 2.7871830463409424 }, { "auxiliary_loss_clip": 0.01213156, "auxiliary_loss_mlp": 0.01037082, "balance_loss_clip": 1.06043828, "balance_loss_mlp": 1.02734244, "epoch": 0.2563578428425419, "flos": 22856962654080.0, "grad_norm": 2.005737203804408, "language_loss": 0.71806788, "learning_rate": 3.486324399371789e-06, "loss": 0.74057019, "num_input_tokens_seen": 45566200, "step": 2132, "time_per_iteration": 2.805065155029297 }, { "auxiliary_loss_clip": 0.01194977, "auxiliary_loss_mlp": 0.01032628, "balance_loss_clip": 1.0599401, "balance_loss_mlp": 1.02343154, "epoch": 0.25647808573318104, "flos": 21653883498240.0, "grad_norm": 1.8485893290822886, "language_loss": 0.78307819, "learning_rate": 3.485803067908593e-06, "loss": 0.80535424, "num_input_tokens_seen": 45585710, "step": 2133, "time_per_iteration": 2.8661468029022217 }, { "auxiliary_loss_clip": 0.01171696, "auxiliary_loss_mlp": 0.01036542, "balance_loss_clip": 1.05839729, "balance_loss_mlp": 1.02652228, "epoch": 0.2565983286238201, "flos": 33730569659520.0, "grad_norm": 1.7994489645892118, "language_loss": 0.79718441, "learning_rate": 3.485281511050253e-06, "loss": 0.8192668, "num_input_tokens_seen": 45607845, "step": 2134, "time_per_iteration": 2.989682674407959 }, { "auxiliary_loss_clip": 0.01204921, "auxiliary_loss_mlp": 0.01025828, "balance_loss_clip": 1.05606842, "balance_loss_mlp": 1.01639867, "epoch": 0.2567185715144592, "flos": 16216002587520.0, "grad_norm": 7.4019257423239395, "language_loss": 0.89879763, "learning_rate": 3.484759728875889e-06, "loss": 0.92110515, "num_input_tokens_seen": 45623210, "step": 2135, "time_per_iteration": 2.74493408203125 }, { "auxiliary_loss_clip": 0.01183557, "auxiliary_loss_mlp": 0.01034082, "balance_loss_clip": 1.05701995, "balance_loss_mlp": 1.02480209, "epoch": 0.2568388144050983, "flos": 17458475984640.0, "grad_norm": 1.8436406093286812, "language_loss": 0.80977416, "learning_rate": 3.4842377214646543e-06, "loss": 0.83195055, "num_input_tokens_seen": 45641505, "step": 2136, "time_per_iteration": 2.7590420246124268 }, { "auxiliary_loss_clip": 0.01204606, "auxiliary_loss_mlp": 0.01035918, "balance_loss_clip": 1.05606997, "balance_loss_mlp": 1.02687049, "epoch": 0.25695905729573737, "flos": 20887442069760.0, "grad_norm": 1.715721281354692, "language_loss": 0.66778517, "learning_rate": 3.483715488895737e-06, "loss": 0.69019043, "num_input_tokens_seen": 45661835, "step": 2137, "time_per_iteration": 2.7285683155059814 }, { "auxiliary_loss_clip": 0.01194361, "auxiliary_loss_mlp": 0.01030811, "balance_loss_clip": 1.05667746, "balance_loss_mlp": 1.02076805, "epoch": 0.2570793001863765, "flos": 24717278914560.0, "grad_norm": 2.057998453912699, "language_loss": 0.78898954, "learning_rate": 3.48319303124836e-06, "loss": 0.81124127, "num_input_tokens_seen": 45682215, "step": 2138, "time_per_iteration": 2.842071056365967 }, { "auxiliary_loss_clip": 0.01195261, "auxiliary_loss_mlp": 0.01031818, "balance_loss_clip": 1.05826831, "balance_loss_mlp": 1.02265644, "epoch": 0.2571995430770156, "flos": 26906896085760.0, "grad_norm": 2.3759182647172916, "language_loss": 0.66999114, "learning_rate": 3.4826703486017798e-06, "loss": 0.69226193, "num_input_tokens_seen": 45701840, "step": 2139, "time_per_iteration": 2.8834071159362793 }, { "auxiliary_loss_clip": 0.01201987, "auxiliary_loss_mlp": 0.01030385, "balance_loss_clip": 1.05757809, "balance_loss_mlp": 1.02089548, "epoch": 0.25731978596765465, "flos": 19792561656960.0, "grad_norm": 1.650017539700142, "language_loss": 0.76596516, "learning_rate": 3.4821474410352867e-06, "loss": 0.78828883, "num_input_tokens_seen": 45720500, "step": 2140, "time_per_iteration": 2.6880762577056885 }, { "auxiliary_loss_clip": 0.01108716, "auxiliary_loss_mlp": 0.01025928, "balance_loss_clip": 1.02860725, "balance_loss_mlp": 1.02403295, "epoch": 0.25744002885829376, "flos": 70564970471040.0, "grad_norm": 0.9046174513118874, "language_loss": 0.62701422, "learning_rate": 3.481624308628205e-06, "loss": 0.64836067, "num_input_tokens_seen": 45781870, "step": 2141, "time_per_iteration": 3.3902618885040283 }, { "auxiliary_loss_clip": 0.01199524, "auxiliary_loss_mlp": 0.01034681, "balance_loss_clip": 1.05927312, "balance_loss_mlp": 1.02562726, "epoch": 0.25756027174893287, "flos": 18038181582720.0, "grad_norm": 2.7978101679649203, "language_loss": 1.00446475, "learning_rate": 3.481100951459893e-06, "loss": 1.02680683, "num_input_tokens_seen": 45794890, "step": 2142, "time_per_iteration": 2.743041753768921 }, { "auxiliary_loss_clip": 0.01197241, "auxiliary_loss_mlp": 0.01028172, "balance_loss_clip": 1.05536437, "balance_loss_mlp": 1.01904631, "epoch": 0.2576805146395719, "flos": 22674069578880.0, "grad_norm": 1.5608671991349874, "language_loss": 0.78807461, "learning_rate": 3.4805773696097453e-06, "loss": 0.81032872, "num_input_tokens_seen": 45815780, "step": 2143, "time_per_iteration": 2.806553602218628 }, { "auxiliary_loss_clip": 0.01192719, "auxiliary_loss_mlp": 0.01032246, "balance_loss_clip": 1.05686736, "balance_loss_mlp": 1.02172029, "epoch": 0.25780075753021103, "flos": 16472225278080.0, "grad_norm": 2.0063777547080965, "language_loss": 0.87882751, "learning_rate": 3.4800535631571874e-06, "loss": 0.90107715, "num_input_tokens_seen": 45831310, "step": 2144, "time_per_iteration": 2.814992666244507 }, { "auxiliary_loss_clip": 0.01205046, "auxiliary_loss_mlp": 0.01035822, "balance_loss_clip": 1.05816102, "balance_loss_mlp": 1.02568316, "epoch": 0.25792100042085014, "flos": 22820297846400.0, "grad_norm": 1.9610970803694412, "language_loss": 0.76238573, "learning_rate": 3.4795295321816804e-06, "loss": 0.78479445, "num_input_tokens_seen": 45850135, "step": 2145, "time_per_iteration": 2.7297821044921875 }, { "auxiliary_loss_clip": 0.01189875, "auxiliary_loss_mlp": 0.01029953, "balance_loss_clip": 1.05680501, "balance_loss_mlp": 1.02067256, "epoch": 0.2580412433114892, "flos": 18697286194560.0, "grad_norm": 3.5581763807671423, "language_loss": 0.91172063, "learning_rate": 3.47900527676272e-06, "loss": 0.93391889, "num_input_tokens_seen": 45868470, "step": 2146, "time_per_iteration": 2.6977927684783936 }, { "auxiliary_loss_clip": 0.01209969, "auxiliary_loss_mlp": 0.01033034, "balance_loss_clip": 1.05977201, "balance_loss_mlp": 1.02361047, "epoch": 0.2581614862021283, "flos": 14283146810880.0, "grad_norm": 2.04154509521346, "language_loss": 0.88256586, "learning_rate": 3.478480796979835e-06, "loss": 0.90499586, "num_input_tokens_seen": 45886355, "step": 2147, "time_per_iteration": 2.6268606185913086 }, { "auxiliary_loss_clip": 0.01196188, "auxiliary_loss_mlp": 0.01037026, "balance_loss_clip": 1.05764031, "balance_loss_mlp": 1.02738166, "epoch": 0.25828172909276736, "flos": 29498281856640.0, "grad_norm": 1.7508884208344313, "language_loss": 0.78029788, "learning_rate": 3.4779560929125894e-06, "loss": 0.80263007, "num_input_tokens_seen": 45907900, "step": 2148, "time_per_iteration": 2.8404595851898193 }, { "auxiliary_loss_clip": 0.01092238, "auxiliary_loss_mlp": 0.01011442, "balance_loss_clip": 1.02138698, "balance_loss_mlp": 1.00972509, "epoch": 0.2584019719834065, "flos": 67114387376640.0, "grad_norm": 0.6701050590614199, "language_loss": 0.56917238, "learning_rate": 3.4774311646405783e-06, "loss": 0.59020925, "num_input_tokens_seen": 45977805, "step": 2149, "time_per_iteration": 3.48331618309021 }, { "auxiliary_loss_clip": 0.01189839, "auxiliary_loss_mlp": 0.01032698, "balance_loss_clip": 1.05830598, "balance_loss_mlp": 1.02338791, "epoch": 0.2585222148740456, "flos": 22893555634560.0, "grad_norm": 2.329740803525044, "language_loss": 0.8366822, "learning_rate": 3.476906012243435e-06, "loss": 0.85890758, "num_input_tokens_seen": 45996715, "step": 2150, "time_per_iteration": 3.7187588214874268 }, { "auxiliary_loss_clip": 0.01196237, "auxiliary_loss_mlp": 0.01032411, "balance_loss_clip": 1.05978692, "balance_loss_mlp": 1.02314806, "epoch": 0.25864245776468464, "flos": 28909202808960.0, "grad_norm": 1.6593201324471805, "language_loss": 0.81431973, "learning_rate": 3.476380635800824e-06, "loss": 0.83660614, "num_input_tokens_seen": 46017915, "step": 2151, "time_per_iteration": 2.7364230155944824 }, { "auxiliary_loss_clip": 0.01195014, "auxiliary_loss_mlp": 0.010407, "balance_loss_clip": 1.05683208, "balance_loss_mlp": 1.03164566, "epoch": 0.25876270065532375, "flos": 14793185980800.0, "grad_norm": 2.1713615754613715, "language_loss": 0.86327392, "learning_rate": 3.475855035392444e-06, "loss": 0.88563108, "num_input_tokens_seen": 46033235, "step": 2152, "time_per_iteration": 2.6441590785980225 }, { "auxiliary_loss_clip": 0.01185546, "auxiliary_loss_mlp": 0.01030474, "balance_loss_clip": 1.05730236, "balance_loss_mlp": 1.02114546, "epoch": 0.25888294354596286, "flos": 60467821810560.0, "grad_norm": 1.8998338429592534, "language_loss": 0.71524197, "learning_rate": 3.475329211098029e-06, "loss": 0.7374022, "num_input_tokens_seen": 46056390, "step": 2153, "time_per_iteration": 5.009742021560669 }, { "auxiliary_loss_clip": 0.01194659, "auxiliary_loss_mlp": 0.01035682, "balance_loss_clip": 1.05884004, "balance_loss_mlp": 1.02621138, "epoch": 0.2590031864366019, "flos": 27851166771840.0, "grad_norm": 1.5571072584588874, "language_loss": 0.8253513, "learning_rate": 3.4748031629973453e-06, "loss": 0.84765476, "num_input_tokens_seen": 46077120, "step": 2154, "time_per_iteration": 3.830332040786743 }, { "auxiliary_loss_clip": 0.01088183, "auxiliary_loss_mlp": 0.01002702, "balance_loss_clip": 1.02148795, "balance_loss_mlp": 1.00088978, "epoch": 0.25912342932724103, "flos": 62422444206720.0, "grad_norm": 0.9127646302522853, "language_loss": 0.56537247, "learning_rate": 3.4742768911701944e-06, "loss": 0.5862813, "num_input_tokens_seen": 46139815, "step": 2155, "time_per_iteration": 3.473341226577759 }, { "auxiliary_loss_clip": 0.01209938, "auxiliary_loss_mlp": 0.01035506, "balance_loss_clip": 1.06178999, "balance_loss_mlp": 1.02478862, "epoch": 0.25924367221788014, "flos": 12378839368320.0, "grad_norm": 3.7746079902345135, "language_loss": 0.70371056, "learning_rate": 3.4737503956964113e-06, "loss": 0.726165, "num_input_tokens_seen": 46152120, "step": 2156, "time_per_iteration": 2.7746012210845947 }, { "auxiliary_loss_clip": 0.01198288, "auxiliary_loss_mlp": 0.01035971, "balance_loss_clip": 1.06015658, "balance_loss_mlp": 1.0256176, "epoch": 0.2593639151085192, "flos": 14575208296320.0, "grad_norm": 1.9673151954812003, "language_loss": 0.67230523, "learning_rate": 3.473223676655865e-06, "loss": 0.69464779, "num_input_tokens_seen": 46170120, "step": 2157, "time_per_iteration": 2.821122646331787 }, { "auxiliary_loss_clip": 0.01199333, "auxiliary_loss_mlp": 0.01037134, "balance_loss_clip": 1.05888033, "balance_loss_mlp": 1.0274663, "epoch": 0.2594841579991583, "flos": 15230937029760.0, "grad_norm": 1.721439540339833, "language_loss": 0.79756117, "learning_rate": 3.472696734128459e-06, "loss": 0.8199259, "num_input_tokens_seen": 46187985, "step": 2158, "time_per_iteration": 2.7990522384643555 }, { "auxiliary_loss_clip": 0.01202836, "auxiliary_loss_mlp": 0.01032202, "balance_loss_clip": 1.05832672, "balance_loss_mlp": 1.02223039, "epoch": 0.2596044008897974, "flos": 23623583650560.0, "grad_norm": 1.7168469626983285, "language_loss": 0.7580902, "learning_rate": 3.4721695681941286e-06, "loss": 0.78044057, "num_input_tokens_seen": 46207025, "step": 2159, "time_per_iteration": 2.8849167823791504 }, { "auxiliary_loss_clip": 0.01202616, "auxiliary_loss_mlp": 0.0106503, "balance_loss_clip": 1.0600307, "balance_loss_mlp": 1.02990341, "epoch": 0.25972464378043647, "flos": 13772281628160.0, "grad_norm": 2.068315547295325, "language_loss": 0.8216213, "learning_rate": 3.471642178932845e-06, "loss": 0.84429777, "num_input_tokens_seen": 46225670, "step": 2160, "time_per_iteration": 2.7628166675567627 }, { "auxiliary_loss_clip": 0.01202451, "auxiliary_loss_mlp": 0.01033814, "balance_loss_clip": 1.05773163, "balance_loss_mlp": 1.02439618, "epoch": 0.2598448866710756, "flos": 19573578391680.0, "grad_norm": 1.7261233931932367, "language_loss": 0.89185607, "learning_rate": 3.471114566424613e-06, "loss": 0.91421866, "num_input_tokens_seen": 46244130, "step": 2161, "time_per_iteration": 2.7098522186279297 }, { "auxiliary_loss_clip": 0.01197464, "auxiliary_loss_mlp": 0.01034831, "balance_loss_clip": 1.05885255, "balance_loss_mlp": 1.0252707, "epoch": 0.25996512956171464, "flos": 21653237053440.0, "grad_norm": 1.9904461071209478, "language_loss": 0.75579482, "learning_rate": 3.4705867307494715e-06, "loss": 0.77811778, "num_input_tokens_seen": 46263200, "step": 2162, "time_per_iteration": 2.8265976905822754 }, { "auxiliary_loss_clip": 0.01206089, "auxiliary_loss_mlp": 0.0103677, "balance_loss_clip": 1.05790997, "balance_loss_mlp": 1.02679205, "epoch": 0.26008537245235375, "flos": 18223480869120.0, "grad_norm": 2.1957563267317233, "language_loss": 0.84784174, "learning_rate": 3.470058671987492e-06, "loss": 0.87027037, "num_input_tokens_seen": 46281465, "step": 2163, "time_per_iteration": 2.720111608505249 }, { "auxiliary_loss_clip": 0.01210769, "auxiliary_loss_mlp": 0.0103315, "balance_loss_clip": 1.05955637, "balance_loss_mlp": 1.02346992, "epoch": 0.26020561534299286, "flos": 24645385843200.0, "grad_norm": 2.0204227183101824, "language_loss": 0.8443194, "learning_rate": 3.4695303902187805e-06, "loss": 0.86675858, "num_input_tokens_seen": 46301020, "step": 2164, "time_per_iteration": 2.7549242973327637 }, { "auxiliary_loss_clip": 0.01192747, "auxiliary_loss_mlp": 0.01037454, "balance_loss_clip": 1.05594313, "balance_loss_mlp": 1.02853703, "epoch": 0.2603258582336319, "flos": 25773662926080.0, "grad_norm": 1.9319366704683247, "language_loss": 0.79120517, "learning_rate": 3.469001885523478e-06, "loss": 0.81350714, "num_input_tokens_seen": 46321740, "step": 2165, "time_per_iteration": 2.8073267936706543 }, { "auxiliary_loss_clip": 0.01205562, "auxiliary_loss_mlp": 0.01027092, "balance_loss_clip": 1.05649579, "balance_loss_mlp": 1.01790643, "epoch": 0.260446101124271, "flos": 28766314506240.0, "grad_norm": 1.5470997018129906, "language_loss": 0.81103683, "learning_rate": 3.4684731579817568e-06, "loss": 0.83336329, "num_input_tokens_seen": 46342730, "step": 2166, "time_per_iteration": 2.7884058952331543 }, { "auxiliary_loss_clip": 0.01185366, "auxiliary_loss_mlp": 0.01031254, "balance_loss_clip": 1.05842376, "balance_loss_mlp": 1.02150846, "epoch": 0.26056634401491013, "flos": 25666757072640.0, "grad_norm": 1.5083831667188619, "language_loss": 0.7656132, "learning_rate": 3.4679442076738247e-06, "loss": 0.78777945, "num_input_tokens_seen": 46362445, "step": 2167, "time_per_iteration": 2.964049816131592 }, { "auxiliary_loss_clip": 0.01209466, "auxiliary_loss_mlp": 0.01032824, "balance_loss_clip": 1.05928624, "balance_loss_mlp": 1.0233407, "epoch": 0.2606865869055492, "flos": 27052765217280.0, "grad_norm": 2.043371685988747, "language_loss": 0.83296299, "learning_rate": 3.4674150346799245e-06, "loss": 0.8553859, "num_input_tokens_seen": 46382145, "step": 2168, "time_per_iteration": 2.741645336151123 }, { "auxiliary_loss_clip": 0.01202578, "auxiliary_loss_mlp": 0.01037556, "balance_loss_clip": 1.05942607, "balance_loss_mlp": 1.02800727, "epoch": 0.2608068297961883, "flos": 17712615686400.0, "grad_norm": 2.1198436440242014, "language_loss": 0.80059516, "learning_rate": 3.4668856390803295e-06, "loss": 0.8229965, "num_input_tokens_seen": 46400025, "step": 2169, "time_per_iteration": 2.783857822418213 }, { "auxiliary_loss_clip": 0.01193975, "auxiliary_loss_mlp": 0.0103308, "balance_loss_clip": 1.05819774, "balance_loss_mlp": 1.02371025, "epoch": 0.2609270726868274, "flos": 18551632544640.0, "grad_norm": 2.1512305440266917, "language_loss": 0.8982712, "learning_rate": 3.4663560209553495e-06, "loss": 0.92054176, "num_input_tokens_seen": 46418090, "step": 2170, "time_per_iteration": 2.856830596923828 }, { "auxiliary_loss_clip": 0.01191505, "auxiliary_loss_mlp": 0.01028861, "balance_loss_clip": 1.05721521, "balance_loss_mlp": 1.01950312, "epoch": 0.26104731557746647, "flos": 21835699165440.0, "grad_norm": 1.63954133154922, "language_loss": 0.79215121, "learning_rate": 3.4658261803853267e-06, "loss": 0.81435484, "num_input_tokens_seen": 46436015, "step": 2171, "time_per_iteration": 2.8195478916168213 }, { "auxiliary_loss_clip": 0.01196344, "auxiliary_loss_mlp": 0.01030167, "balance_loss_clip": 1.06009507, "balance_loss_mlp": 1.02155995, "epoch": 0.2611675584681056, "flos": 21689650465920.0, "grad_norm": 2.330553322637291, "language_loss": 0.81042045, "learning_rate": 3.4652961174506383e-06, "loss": 0.83268559, "num_input_tokens_seen": 46455885, "step": 2172, "time_per_iteration": 2.7874207496643066 }, { "auxiliary_loss_clip": 0.01092681, "auxiliary_loss_mlp": 0.01011361, "balance_loss_clip": 1.02007341, "balance_loss_mlp": 1.00987649, "epoch": 0.2612878013587447, "flos": 71862101389440.0, "grad_norm": 0.9605082473803771, "language_loss": 0.58103561, "learning_rate": 3.464765832231694e-06, "loss": 0.60207593, "num_input_tokens_seen": 46510050, "step": 2173, "time_per_iteration": 3.340725898742676 }, { "auxiliary_loss_clip": 0.01206316, "auxiliary_loss_mlp": 0.01033119, "balance_loss_clip": 1.06113482, "balance_loss_mlp": 1.02336812, "epoch": 0.26140804424938374, "flos": 20227511445120.0, "grad_norm": 1.7055492888918242, "language_loss": 0.70899856, "learning_rate": 3.4642353248089373e-06, "loss": 0.73139292, "num_input_tokens_seen": 46528810, "step": 2174, "time_per_iteration": 2.8670735359191895 }, { "auxiliary_loss_clip": 0.01196206, "auxiliary_loss_mlp": 0.01028539, "balance_loss_clip": 1.06100106, "balance_loss_mlp": 1.01887083, "epoch": 0.26152828714002285, "flos": 25557085872000.0, "grad_norm": 1.990818813096105, "language_loss": 0.80483752, "learning_rate": 3.463704595262846e-06, "loss": 0.82708496, "num_input_tokens_seen": 46549690, "step": 2175, "time_per_iteration": 2.976209878921509 }, { "auxiliary_loss_clip": 0.0119508, "auxiliary_loss_mlp": 0.01033549, "balance_loss_clip": 1.05870318, "balance_loss_mlp": 1.02420866, "epoch": 0.26164853003066196, "flos": 25446516831360.0, "grad_norm": 2.063778011029724, "language_loss": 0.7053715, "learning_rate": 3.463173643673931e-06, "loss": 0.72765779, "num_input_tokens_seen": 46572215, "step": 2176, "time_per_iteration": 3.6752710342407227 }, { "auxiliary_loss_clip": 0.01097768, "auxiliary_loss_mlp": 0.01008202, "balance_loss_clip": 1.01790905, "balance_loss_mlp": 1.00648534, "epoch": 0.261768772921301, "flos": 53944580568960.0, "grad_norm": 0.9117398618199399, "language_loss": 0.63500565, "learning_rate": 3.4626424701227387e-06, "loss": 0.65606534, "num_input_tokens_seen": 46627275, "step": 2177, "time_per_iteration": 3.223889112472534 }, { "auxiliary_loss_clip": 0.01097789, "auxiliary_loss_mlp": 0.01006438, "balance_loss_clip": 1.01801169, "balance_loss_mlp": 1.00470984, "epoch": 0.26188901581194013, "flos": 70687606481280.0, "grad_norm": 0.8825077303515815, "language_loss": 0.55813289, "learning_rate": 3.4621110746898452e-06, "loss": 0.57917511, "num_input_tokens_seen": 46695135, "step": 2178, "time_per_iteration": 4.3833606243133545 }, { "auxiliary_loss_clip": 0.01206452, "auxiliary_loss_mlp": 0.01029014, "balance_loss_clip": 1.06026065, "balance_loss_mlp": 1.02031827, "epoch": 0.2620092587025792, "flos": 21069580959360.0, "grad_norm": 1.621681473507248, "language_loss": 0.74313873, "learning_rate": 3.4615794574558654e-06, "loss": 0.76549339, "num_input_tokens_seen": 46714145, "step": 2179, "time_per_iteration": 3.7323384284973145 }, { "auxiliary_loss_clip": 0.01197641, "auxiliary_loss_mlp": 0.0103016, "balance_loss_clip": 1.05740881, "balance_loss_mlp": 1.02048039, "epoch": 0.2621295015932183, "flos": 18369601395840.0, "grad_norm": 3.1420578595476414, "language_loss": 0.83992982, "learning_rate": 3.4610476185014436e-06, "loss": 0.86220789, "num_input_tokens_seen": 46731405, "step": 2180, "time_per_iteration": 3.942735433578491 }, { "auxiliary_loss_clip": 0.01209705, "auxiliary_loss_mlp": 0.0103218, "balance_loss_clip": 1.0596776, "balance_loss_mlp": 1.02304816, "epoch": 0.2622497444838574, "flos": 23659997063040.0, "grad_norm": 3.542674550607896, "language_loss": 0.79082966, "learning_rate": 3.4605155579072597e-06, "loss": 0.81324852, "num_input_tokens_seen": 46751260, "step": 2181, "time_per_iteration": 2.718674898147583 }, { "auxiliary_loss_clip": 0.01186432, "auxiliary_loss_mlp": 0.01031929, "balance_loss_clip": 1.0575738, "balance_loss_mlp": 1.02282155, "epoch": 0.26236998737449646, "flos": 22123810154880.0, "grad_norm": 1.869660450986636, "language_loss": 0.71872455, "learning_rate": 3.459983275754027e-06, "loss": 0.74090815, "num_input_tokens_seen": 46770155, "step": 2182, "time_per_iteration": 2.752704620361328 }, { "auxiliary_loss_clip": 0.01209362, "auxiliary_loss_mlp": 0.01031136, "balance_loss_clip": 1.05980134, "balance_loss_mlp": 1.02194548, "epoch": 0.26249023026513557, "flos": 17895185539200.0, "grad_norm": 3.576163035696768, "language_loss": 0.80154705, "learning_rate": 3.4594507721224918e-06, "loss": 0.82395208, "num_input_tokens_seen": 46788805, "step": 2183, "time_per_iteration": 2.7243199348449707 }, { "auxiliary_loss_clip": 0.0120552, "auxiliary_loss_mlp": 0.01037313, "balance_loss_clip": 1.06213331, "balance_loss_mlp": 1.02793705, "epoch": 0.2626104731557747, "flos": 18332936588160.0, "grad_norm": 2.0614671653734917, "language_loss": 0.8213563, "learning_rate": 3.4589180470934353e-06, "loss": 0.84378469, "num_input_tokens_seen": 46808670, "step": 2184, "time_per_iteration": 2.780942916870117 }, { "auxiliary_loss_clip": 0.01208025, "auxiliary_loss_mlp": 0.0102816, "balance_loss_clip": 1.05742121, "balance_loss_mlp": 1.01829541, "epoch": 0.26273071604641374, "flos": 19317714837120.0, "grad_norm": 1.9582162813049926, "language_loss": 0.76969612, "learning_rate": 3.4583851007476713e-06, "loss": 0.79205799, "num_input_tokens_seen": 46827140, "step": 2185, "time_per_iteration": 2.7331631183624268 }, { "auxiliary_loss_clip": 0.01200559, "auxiliary_loss_mlp": 0.01031205, "balance_loss_clip": 1.06110954, "balance_loss_mlp": 1.02138853, "epoch": 0.26285095893705285, "flos": 18327477720960.0, "grad_norm": 4.3009971246568135, "language_loss": 0.68618059, "learning_rate": 3.4578519331660464e-06, "loss": 0.70849824, "num_input_tokens_seen": 46844135, "step": 2186, "time_per_iteration": 2.729804277420044 }, { "auxiliary_loss_clip": 0.01199539, "auxiliary_loss_mlp": 0.01035483, "balance_loss_clip": 1.05988085, "balance_loss_mlp": 1.02701294, "epoch": 0.26297120182769196, "flos": 20193827466240.0, "grad_norm": 1.9049905301500407, "language_loss": 0.8213113, "learning_rate": 3.4573185444294426e-06, "loss": 0.84366149, "num_input_tokens_seen": 46862500, "step": 2187, "time_per_iteration": 2.755441427230835 }, { "auxiliary_loss_clip": 0.01202775, "auxiliary_loss_mlp": 0.01061734, "balance_loss_clip": 1.06181836, "balance_loss_mlp": 1.02750492, "epoch": 0.263091444718331, "flos": 22418421505920.0, "grad_norm": 2.325148883485377, "language_loss": 0.78783602, "learning_rate": 3.456784934618774e-06, "loss": 0.81048113, "num_input_tokens_seen": 46883665, "step": 2188, "time_per_iteration": 2.7695562839508057 }, { "auxiliary_loss_clip": 0.01201457, "auxiliary_loss_mlp": 0.01030604, "balance_loss_clip": 1.06089425, "balance_loss_mlp": 1.02132308, "epoch": 0.2632116876089701, "flos": 19024827338880.0, "grad_norm": 1.9199008945153726, "language_loss": 0.79733235, "learning_rate": 3.4562511038149897e-06, "loss": 0.81965297, "num_input_tokens_seen": 46899160, "step": 2189, "time_per_iteration": 2.7332935333251953 }, { "auxiliary_loss_clip": 0.01084975, "auxiliary_loss_mlp": 0.01013668, "balance_loss_clip": 1.01510119, "balance_loss_mlp": 1.01168907, "epoch": 0.26333193049960923, "flos": 67308054531840.0, "grad_norm": 0.8817124872457472, "language_loss": 0.5778569, "learning_rate": 3.4557170520990705e-06, "loss": 0.59884322, "num_input_tokens_seen": 46959835, "step": 2190, "time_per_iteration": 3.330531597137451 }, { "auxiliary_loss_clip": 0.01200121, "auxiliary_loss_mlp": 0.01040344, "balance_loss_clip": 1.05905247, "balance_loss_mlp": 1.03154683, "epoch": 0.2634521733902483, "flos": 25048806468480.0, "grad_norm": 2.0503372767702657, "language_loss": 0.86683631, "learning_rate": 3.4551827795520324e-06, "loss": 0.88924098, "num_input_tokens_seen": 46982720, "step": 2191, "time_per_iteration": 2.7444443702697754 }, { "auxiliary_loss_clip": 0.01206231, "auxiliary_loss_mlp": 0.01032978, "balance_loss_clip": 1.05891323, "balance_loss_mlp": 1.02348268, "epoch": 0.2635724162808874, "flos": 20594985534720.0, "grad_norm": 1.8555211756024326, "language_loss": 0.84779298, "learning_rate": 3.4546482862549226e-06, "loss": 0.87018502, "num_input_tokens_seen": 47003035, "step": 2192, "time_per_iteration": 2.7697041034698486 }, { "auxiliary_loss_clip": 0.01196345, "auxiliary_loss_mlp": 0.01037891, "balance_loss_clip": 1.0620985, "balance_loss_mlp": 1.02826476, "epoch": 0.2636926591715265, "flos": 19244636616960.0, "grad_norm": 1.9490687126627022, "language_loss": 0.78907549, "learning_rate": 3.4541135722888253e-06, "loss": 0.81141782, "num_input_tokens_seen": 47019625, "step": 2193, "time_per_iteration": 2.918215751647949 }, { "auxiliary_loss_clip": 0.0120519, "auxiliary_loss_mlp": 0.010325, "balance_loss_clip": 1.05660295, "balance_loss_mlp": 1.02237284, "epoch": 0.26381290206216557, "flos": 28804882734720.0, "grad_norm": 1.608022421804392, "language_loss": 0.8028537, "learning_rate": 3.453578637734854e-06, "loss": 0.8252306, "num_input_tokens_seen": 47040815, "step": 2194, "time_per_iteration": 2.783221960067749 }, { "auxiliary_loss_clip": 0.01208858, "auxiliary_loss_mlp": 0.01034667, "balance_loss_clip": 1.06076813, "balance_loss_mlp": 1.02513027, "epoch": 0.2639331449528047, "flos": 25008909436800.0, "grad_norm": 1.760384372913697, "language_loss": 0.78671795, "learning_rate": 3.4530434826741605e-06, "loss": 0.8091532, "num_input_tokens_seen": 47061755, "step": 2195, "time_per_iteration": 2.693444013595581 }, { "auxiliary_loss_clip": 0.01195525, "auxiliary_loss_mlp": 0.01029355, "balance_loss_clip": 1.05933321, "balance_loss_mlp": 1.02003241, "epoch": 0.26405338784344373, "flos": 46535775465600.0, "grad_norm": 1.7283598146621286, "language_loss": 0.68897665, "learning_rate": 3.452508107187926e-06, "loss": 0.71122551, "num_input_tokens_seen": 47085130, "step": 2196, "time_per_iteration": 2.8869435787200928 }, { "auxiliary_loss_clip": 0.01191082, "auxiliary_loss_mlp": 0.01034418, "balance_loss_clip": 1.05819213, "balance_loss_mlp": 1.02454197, "epoch": 0.26417363073408284, "flos": 21179467641600.0, "grad_norm": 1.868670284366715, "language_loss": 0.77355123, "learning_rate": 3.451972511357366e-06, "loss": 0.79580623, "num_input_tokens_seen": 47104675, "step": 2197, "time_per_iteration": 2.7807834148406982 }, { "auxiliary_loss_clip": 0.01195796, "auxiliary_loss_mlp": 0.01030253, "balance_loss_clip": 1.05807137, "balance_loss_mlp": 1.02094316, "epoch": 0.26429387362472195, "flos": 22674751937280.0, "grad_norm": 1.7786543685692555, "language_loss": 0.85209078, "learning_rate": 3.45143669526373e-06, "loss": 0.87435126, "num_input_tokens_seen": 47124435, "step": 2198, "time_per_iteration": 2.7352802753448486 }, { "auxiliary_loss_clip": 0.01093916, "auxiliary_loss_mlp": 0.01005787, "balance_loss_clip": 1.01444936, "balance_loss_mlp": 1.00391579, "epoch": 0.264414116515361, "flos": 67180534272000.0, "grad_norm": 0.7839552345700292, "language_loss": 0.63196719, "learning_rate": 3.450900658988302e-06, "loss": 0.65296423, "num_input_tokens_seen": 47185985, "step": 2199, "time_per_iteration": 3.2708168029785156 }, { "auxiliary_loss_clip": 0.01192452, "auxiliary_loss_mlp": 0.01033493, "balance_loss_clip": 1.05957937, "balance_loss_mlp": 1.02362251, "epoch": 0.2645343594060001, "flos": 25664709997440.0, "grad_norm": 2.2761382549094202, "language_loss": 0.77720046, "learning_rate": 3.450364402612397e-06, "loss": 0.79945993, "num_input_tokens_seen": 47203140, "step": 2200, "time_per_iteration": 2.776158571243286 }, { "auxiliary_loss_clip": 0.01197538, "auxiliary_loss_mlp": 0.0103829, "balance_loss_clip": 1.05802226, "balance_loss_mlp": 1.02813363, "epoch": 0.26465460229663923, "flos": 22491822948480.0, "grad_norm": 1.8141085555110459, "language_loss": 0.83647394, "learning_rate": 3.449827926217366e-06, "loss": 0.85883224, "num_input_tokens_seen": 47222575, "step": 2201, "time_per_iteration": 2.777466297149658 }, { "auxiliary_loss_clip": 0.01199768, "auxiliary_loss_mlp": 0.01030775, "balance_loss_clip": 1.05313587, "balance_loss_mlp": 1.0219419, "epoch": 0.2647748451872783, "flos": 29388036038400.0, "grad_norm": 2.42545884169907, "language_loss": 0.80491221, "learning_rate": 3.449291229884591e-06, "loss": 0.8272177, "num_input_tokens_seen": 47243815, "step": 2202, "time_per_iteration": 3.694007158279419 }, { "auxiliary_loss_clip": 0.01201169, "auxiliary_loss_mlp": 0.01038002, "balance_loss_clip": 1.05890751, "balance_loss_mlp": 1.02869797, "epoch": 0.2648950880779174, "flos": 26797799502720.0, "grad_norm": 2.118929307746323, "language_loss": 0.87136173, "learning_rate": 3.4487543136954887e-06, "loss": 0.89375341, "num_input_tokens_seen": 47263435, "step": 2203, "time_per_iteration": 2.8351478576660156 }, { "auxiliary_loss_clip": 0.01193844, "auxiliary_loss_mlp": 0.010321, "balance_loss_clip": 1.0586822, "balance_loss_mlp": 1.02299237, "epoch": 0.2650153309685565, "flos": 28841008838400.0, "grad_norm": 1.598536038277342, "language_loss": 0.9128564, "learning_rate": 3.448217177731509e-06, "loss": 0.93511581, "num_input_tokens_seen": 47283920, "step": 2204, "time_per_iteration": 3.880108594894409 }, { "auxiliary_loss_clip": 0.01196213, "auxiliary_loss_mlp": 0.01038915, "balance_loss_clip": 1.05863297, "balance_loss_mlp": 1.02959323, "epoch": 0.26513557385919556, "flos": 20303247271680.0, "grad_norm": 1.8577129483639543, "language_loss": 0.77886593, "learning_rate": 3.4476798220741348e-06, "loss": 0.8012172, "num_input_tokens_seen": 47302800, "step": 2205, "time_per_iteration": 2.861955404281616 }, { "auxiliary_loss_clip": 0.01206143, "auxiliary_loss_mlp": 0.01033299, "balance_loss_clip": 1.05990052, "balance_loss_mlp": 1.02399445, "epoch": 0.26525581674983467, "flos": 17676274101120.0, "grad_norm": 1.8671676937441315, "language_loss": 0.78342533, "learning_rate": 3.4471422468048826e-06, "loss": 0.80581975, "num_input_tokens_seen": 47321525, "step": 2206, "time_per_iteration": 3.7081029415130615 }, { "auxiliary_loss_clip": 0.01199741, "auxiliary_loss_mlp": 0.01035808, "balance_loss_clip": 1.06237209, "balance_loss_mlp": 1.02602124, "epoch": 0.2653760596404738, "flos": 26833746038400.0, "grad_norm": 2.5360181005518476, "language_loss": 0.73185807, "learning_rate": 3.4466044520053022e-06, "loss": 0.75421363, "num_input_tokens_seen": 47340530, "step": 2207, "time_per_iteration": 2.8378350734710693 }, { "auxiliary_loss_clip": 0.01191729, "auxiliary_loss_mlp": 0.01025886, "balance_loss_clip": 1.06019652, "balance_loss_mlp": 1.0161767, "epoch": 0.26549630253111284, "flos": 22782160581120.0, "grad_norm": 1.7629156115205595, "language_loss": 0.60105014, "learning_rate": 3.446066437756977e-06, "loss": 0.62322628, "num_input_tokens_seen": 47359735, "step": 2208, "time_per_iteration": 2.8714234828948975 }, { "auxiliary_loss_clip": 0.01194644, "auxiliary_loss_mlp": 0.01031417, "balance_loss_clip": 1.05740905, "balance_loss_mlp": 1.02244663, "epoch": 0.26561654542175195, "flos": 23550002640000.0, "grad_norm": 1.961334396377742, "language_loss": 0.75072813, "learning_rate": 3.4455282041415224e-06, "loss": 0.7729888, "num_input_tokens_seen": 47378945, "step": 2209, "time_per_iteration": 2.8516957759857178 }, { "auxiliary_loss_clip": 0.0119364, "auxiliary_loss_mlp": 0.010312, "balance_loss_clip": 1.05663919, "balance_loss_mlp": 1.02162194, "epoch": 0.265736788312391, "flos": 26906680604160.0, "grad_norm": 2.2514663269630497, "language_loss": 0.8743006, "learning_rate": 3.4449897512405894e-06, "loss": 0.89654899, "num_input_tokens_seen": 47398095, "step": 2210, "time_per_iteration": 2.8127458095550537 }, { "auxiliary_loss_clip": 0.01186473, "auxiliary_loss_mlp": 0.01052499, "balance_loss_clip": 1.05987728, "balance_loss_mlp": 1.02062428, "epoch": 0.2658570312030301, "flos": 23477139901440.0, "grad_norm": 2.2252152353430255, "language_loss": 0.75681907, "learning_rate": 3.444451079135859e-06, "loss": 0.77920878, "num_input_tokens_seen": 47417605, "step": 2211, "time_per_iteration": 2.8897898197174072 }, { "auxiliary_loss_clip": 0.0118622, "auxiliary_loss_mlp": 0.01065144, "balance_loss_clip": 1.05839896, "balance_loss_mlp": 1.03063357, "epoch": 0.2659772740936692, "flos": 21866402315520.0, "grad_norm": 2.541230022230289, "language_loss": 0.74246687, "learning_rate": 3.4439121879090493e-06, "loss": 0.76498049, "num_input_tokens_seen": 47435385, "step": 2212, "time_per_iteration": 2.849283456802368 }, { "auxiliary_loss_clip": 0.01208056, "auxiliary_loss_mlp": 0.01032332, "balance_loss_clip": 1.06204665, "balance_loss_mlp": 1.02258086, "epoch": 0.2660975169843083, "flos": 19793100360960.0, "grad_norm": 1.9462406138157027, "language_loss": 0.83262086, "learning_rate": 3.4433730776419082e-06, "loss": 0.8550247, "num_input_tokens_seen": 47454310, "step": 2213, "time_per_iteration": 2.8976123332977295 }, { "auxiliary_loss_clip": 0.01208208, "auxiliary_loss_mlp": 0.01061456, "balance_loss_clip": 1.05901706, "balance_loss_mlp": 1.02570462, "epoch": 0.2662177598749474, "flos": 29018981750400.0, "grad_norm": 3.5282156466554793, "language_loss": 0.80653012, "learning_rate": 3.4428337484162183e-06, "loss": 0.82922673, "num_input_tokens_seen": 47475120, "step": 2214, "time_per_iteration": 2.8599109649658203 }, { "auxiliary_loss_clip": 0.01198618, "auxiliary_loss_mlp": 0.0102942, "balance_loss_clip": 1.05928862, "balance_loss_mlp": 1.01960278, "epoch": 0.2663380027655865, "flos": 21762549118080.0, "grad_norm": 1.9860862201752871, "language_loss": 0.84505326, "learning_rate": 3.442294200313797e-06, "loss": 0.86733365, "num_input_tokens_seen": 47493150, "step": 2215, "time_per_iteration": 2.7843899726867676 }, { "auxiliary_loss_clip": 0.01090929, "auxiliary_loss_mlp": 0.01008127, "balance_loss_clip": 1.01160407, "balance_loss_mlp": 1.00624323, "epoch": 0.26645824565622556, "flos": 66980333819520.0, "grad_norm": 0.7592008590124193, "language_loss": 0.52676702, "learning_rate": 3.4417544334164916e-06, "loss": 0.54775751, "num_input_tokens_seen": 47557295, "step": 2216, "time_per_iteration": 3.3397607803344727 }, { "auxiliary_loss_clip": 0.01194169, "auxiliary_loss_mlp": 0.01026225, "balance_loss_clip": 1.05821884, "balance_loss_mlp": 1.01646197, "epoch": 0.26657848854686467, "flos": 25264198373760.0, "grad_norm": 1.5614185803552307, "language_loss": 0.77621263, "learning_rate": 3.4412144478061854e-06, "loss": 0.79841655, "num_input_tokens_seen": 47579705, "step": 2217, "time_per_iteration": 2.8552966117858887 }, { "auxiliary_loss_clip": 0.01181103, "auxiliary_loss_mlp": 0.01036687, "balance_loss_clip": 1.06059122, "balance_loss_mlp": 1.02621484, "epoch": 0.2666987314375038, "flos": 23696769611520.0, "grad_norm": 1.9435059568151596, "language_loss": 0.7552309, "learning_rate": 3.4406742435647925e-06, "loss": 0.77740884, "num_input_tokens_seen": 47599770, "step": 2218, "time_per_iteration": 3.24810791015625 }, { "auxiliary_loss_clip": 0.01202673, "auxiliary_loss_mlp": 0.01031998, "balance_loss_clip": 1.06068492, "balance_loss_mlp": 1.02218103, "epoch": 0.26681897432814283, "flos": 27048958375680.0, "grad_norm": 2.1106818810745325, "language_loss": 0.78769648, "learning_rate": 3.440133820774263e-06, "loss": 0.81004316, "num_input_tokens_seen": 47619580, "step": 2219, "time_per_iteration": 3.2167723178863525 }, { "auxiliary_loss_clip": 0.01203264, "auxiliary_loss_mlp": 0.01033902, "balance_loss_clip": 1.05784845, "balance_loss_mlp": 1.02361429, "epoch": 0.26693921721878194, "flos": 28985944216320.0, "grad_norm": 1.977616755944786, "language_loss": 0.81656754, "learning_rate": 3.439593179516578e-06, "loss": 0.83893919, "num_input_tokens_seen": 47639490, "step": 2220, "time_per_iteration": 2.9045066833496094 }, { "auxiliary_loss_clip": 0.01202452, "auxiliary_loss_mlp": 0.01035064, "balance_loss_clip": 1.0587641, "balance_loss_mlp": 1.02515161, "epoch": 0.26705946010942105, "flos": 21507834798720.0, "grad_norm": 1.8423214020158, "language_loss": 0.80923402, "learning_rate": 3.4390523198737524e-06, "loss": 0.83160919, "num_input_tokens_seen": 47658650, "step": 2221, "time_per_iteration": 2.8992621898651123 }, { "auxiliary_loss_clip": 0.01207576, "auxiliary_loss_mlp": 0.01061125, "balance_loss_clip": 1.05867362, "balance_loss_mlp": 1.02515566, "epoch": 0.2671797030000601, "flos": 21471277731840.0, "grad_norm": 1.587026947436549, "language_loss": 0.73405504, "learning_rate": 3.4385112419278333e-06, "loss": 0.75674206, "num_input_tokens_seen": 47679875, "step": 2222, "time_per_iteration": 2.7790462970733643 }, { "auxiliary_loss_clip": 0.01091535, "auxiliary_loss_mlp": 0.0100372, "balance_loss_clip": 1.01210189, "balance_loss_mlp": 1.00184882, "epoch": 0.2672999458906992, "flos": 64189929767040.0, "grad_norm": 0.7854879266608306, "language_loss": 0.64768922, "learning_rate": 3.4379699457609033e-06, "loss": 0.66864175, "num_input_tokens_seen": 47737700, "step": 2223, "time_per_iteration": 3.1638855934143066 }, { "auxiliary_loss_clip": 0.01191992, "auxiliary_loss_mlp": 0.01038737, "balance_loss_clip": 1.05873132, "balance_loss_mlp": 1.02846718, "epoch": 0.26742018878133833, "flos": 16909042573440.0, "grad_norm": 1.9242128740103248, "language_loss": 0.90329766, "learning_rate": 3.4374284314550755e-06, "loss": 0.925605, "num_input_tokens_seen": 47756740, "step": 2224, "time_per_iteration": 2.7556838989257812 }, { "auxiliary_loss_clip": 0.01207495, "auxiliary_loss_mlp": 0.01030758, "balance_loss_clip": 1.0586102, "balance_loss_mlp": 1.02108407, "epoch": 0.2675404316719774, "flos": 20667560964480.0, "grad_norm": 4.011869526937454, "language_loss": 0.80902421, "learning_rate": 3.436886699092498e-06, "loss": 0.83140671, "num_input_tokens_seen": 47775255, "step": 2225, "time_per_iteration": 2.7214574813842773 }, { "auxiliary_loss_clip": 0.01212242, "auxiliary_loss_mlp": 0.01032056, "balance_loss_clip": 1.0603466, "balance_loss_mlp": 1.02211344, "epoch": 0.2676606745626165, "flos": 17485013157120.0, "grad_norm": 2.462477904786165, "language_loss": 0.71793121, "learning_rate": 3.4363447487553502e-06, "loss": 0.74037421, "num_input_tokens_seen": 47788570, "step": 2226, "time_per_iteration": 2.7647674083709717 }, { "auxiliary_loss_clip": 0.01198302, "auxiliary_loss_mlp": 0.01039441, "balance_loss_clip": 1.06043494, "balance_loss_mlp": 1.03053629, "epoch": 0.26778091745325555, "flos": 27852675143040.0, "grad_norm": 2.1633729981475773, "language_loss": 0.78040504, "learning_rate": 3.4358025805258455e-06, "loss": 0.80278242, "num_input_tokens_seen": 47808275, "step": 2227, "time_per_iteration": 2.797625780105591 }, { "auxiliary_loss_clip": 0.01194409, "auxiliary_loss_mlp": 0.010314, "balance_loss_clip": 1.05808592, "balance_loss_mlp": 1.02139187, "epoch": 0.26790116034389466, "flos": 20955995176320.0, "grad_norm": 1.7018047904885993, "language_loss": 0.83359045, "learning_rate": 3.435260194486232e-06, "loss": 0.85584849, "num_input_tokens_seen": 47826245, "step": 2228, "time_per_iteration": 3.7271008491516113 }, { "auxiliary_loss_clip": 0.01197142, "auxiliary_loss_mlp": 0.01043069, "balance_loss_clip": 1.05718923, "balance_loss_mlp": 1.03331757, "epoch": 0.2680214032345338, "flos": 18040659621120.0, "grad_norm": 2.332039690455256, "language_loss": 0.82071161, "learning_rate": 3.4347175907187875e-06, "loss": 0.84311378, "num_input_tokens_seen": 47843235, "step": 2229, "time_per_iteration": 2.8182084560394287 }, { "auxiliary_loss_clip": 0.01201075, "auxiliary_loss_mlp": 0.0103619, "balance_loss_clip": 1.05836451, "balance_loss_mlp": 1.02651596, "epoch": 0.26814164612517283, "flos": 22419427086720.0, "grad_norm": 1.8765857061822329, "language_loss": 0.88113666, "learning_rate": 3.4341747693058254e-06, "loss": 0.90350932, "num_input_tokens_seen": 47861710, "step": 2230, "time_per_iteration": 2.7283456325531006 }, { "auxiliary_loss_clip": 0.01174082, "auxiliary_loss_mlp": 0.01033525, "balance_loss_clip": 1.05796444, "balance_loss_mlp": 1.0235467, "epoch": 0.26826188901581194, "flos": 35627371159680.0, "grad_norm": 1.8051918933245976, "language_loss": 0.77330005, "learning_rate": 3.4336317303296916e-06, "loss": 0.79537606, "num_input_tokens_seen": 47882685, "step": 2231, "time_per_iteration": 4.909849643707275 }, { "auxiliary_loss_clip": 0.01197876, "auxiliary_loss_mlp": 0.01034195, "balance_loss_clip": 1.05650663, "balance_loss_mlp": 1.02509975, "epoch": 0.26838213190645105, "flos": 17639788861440.0, "grad_norm": 2.124807981832121, "language_loss": 0.75373363, "learning_rate": 3.4330884738727635e-06, "loss": 0.77605438, "num_input_tokens_seen": 47900860, "step": 2232, "time_per_iteration": 3.7133476734161377 }, { "auxiliary_loss_clip": 0.01184605, "auxiliary_loss_mlp": 0.0103101, "balance_loss_clip": 1.05832863, "balance_loss_mlp": 1.02163422, "epoch": 0.2685023747970901, "flos": 22674823764480.0, "grad_norm": 3.0768130599889574, "language_loss": 0.70751834, "learning_rate": 3.4325450000174535e-06, "loss": 0.72967446, "num_input_tokens_seen": 47917500, "step": 2233, "time_per_iteration": 2.9857726097106934 }, { "auxiliary_loss_clip": 0.01180104, "auxiliary_loss_mlp": 0.01037039, "balance_loss_clip": 1.05512667, "balance_loss_mlp": 1.02769279, "epoch": 0.2686226176877292, "flos": 20120533764480.0, "grad_norm": 1.7031590219176829, "language_loss": 0.74198943, "learning_rate": 3.4320013088462067e-06, "loss": 0.76416081, "num_input_tokens_seen": 47934860, "step": 2234, "time_per_iteration": 2.996934652328491 }, { "auxiliary_loss_clip": 0.01198073, "auxiliary_loss_mlp": 0.01031283, "balance_loss_clip": 1.05766797, "balance_loss_mlp": 1.02237844, "epoch": 0.2687428605783683, "flos": 21872040750720.0, "grad_norm": 1.4972227675757173, "language_loss": 0.81798971, "learning_rate": 3.431457400441499e-06, "loss": 0.84028322, "num_input_tokens_seen": 47955255, "step": 2235, "time_per_iteration": 2.8814151287078857 }, { "auxiliary_loss_clip": 0.010722, "auxiliary_loss_mlp": 0.01005154, "balance_loss_clip": 1.0135088, "balance_loss_mlp": 1.003425, "epoch": 0.2688631034690074, "flos": 69943320766080.0, "grad_norm": 0.9241807030572503, "language_loss": 0.60897565, "learning_rate": 3.4309132748858424e-06, "loss": 0.62974918, "num_input_tokens_seen": 48016245, "step": 2236, "time_per_iteration": 3.567798376083374 }, { "auxiliary_loss_clip": 0.0119984, "auxiliary_loss_mlp": 0.0103357, "balance_loss_clip": 1.060076, "balance_loss_mlp": 1.02473068, "epoch": 0.2689833463596465, "flos": 22856639431680.0, "grad_norm": 1.5885336337056215, "language_loss": 0.83829343, "learning_rate": 3.430368932261779e-06, "loss": 0.86062753, "num_input_tokens_seen": 48036600, "step": 2237, "time_per_iteration": 3.0680508613586426 }, { "auxiliary_loss_clip": 0.01195409, "auxiliary_loss_mlp": 0.01035025, "balance_loss_clip": 1.05773628, "balance_loss_mlp": 1.02605486, "epoch": 0.2691035892502856, "flos": 17200242132480.0, "grad_norm": 2.5501759625416747, "language_loss": 0.74726987, "learning_rate": 3.429824372651886e-06, "loss": 0.76957422, "num_input_tokens_seen": 48054750, "step": 2238, "time_per_iteration": 2.932992458343506 }, { "auxiliary_loss_clip": 0.01195424, "auxiliary_loss_mlp": 0.0103244, "balance_loss_clip": 1.05596995, "balance_loss_mlp": 1.02237916, "epoch": 0.26922383214092466, "flos": 17747484814080.0, "grad_norm": 3.8976751917268575, "language_loss": 0.83306342, "learning_rate": 3.4292795961387732e-06, "loss": 0.85534203, "num_input_tokens_seen": 48072650, "step": 2239, "time_per_iteration": 2.8800394535064697 }, { "auxiliary_loss_clip": 0.01206467, "auxiliary_loss_mlp": 0.01027998, "balance_loss_clip": 1.05827975, "balance_loss_mlp": 1.01861596, "epoch": 0.26934407503156377, "flos": 16173376122240.0, "grad_norm": 2.0213346922693955, "language_loss": 0.87605095, "learning_rate": 3.4287346028050818e-06, "loss": 0.8983956, "num_input_tokens_seen": 48088720, "step": 2240, "time_per_iteration": 2.760834217071533 }, { "auxiliary_loss_clip": 0.01194132, "auxiliary_loss_mlp": 0.01033742, "balance_loss_clip": 1.05540299, "balance_loss_mlp": 1.02519441, "epoch": 0.2694643179222028, "flos": 23732895715200.0, "grad_norm": 1.4520261121218698, "language_loss": 0.79645419, "learning_rate": 3.4281893927334866e-06, "loss": 0.81873298, "num_input_tokens_seen": 48108630, "step": 2241, "time_per_iteration": 2.9709928035736084 }, { "auxiliary_loss_clip": 0.01200885, "auxiliary_loss_mlp": 0.01036515, "balance_loss_clip": 1.05735111, "balance_loss_mlp": 1.02747262, "epoch": 0.26958456081284193, "flos": 24718140840960.0, "grad_norm": 1.8162047884427062, "language_loss": 0.75231683, "learning_rate": 3.4276439660066963e-06, "loss": 0.77469081, "num_input_tokens_seen": 48128330, "step": 2242, "time_per_iteration": 2.816131114959717 }, { "auxiliary_loss_clip": 0.01199559, "auxiliary_loss_mlp": 0.01034192, "balance_loss_clip": 1.05639672, "balance_loss_mlp": 1.02535224, "epoch": 0.26970480370348104, "flos": 18112588606080.0, "grad_norm": 1.9898604652910217, "language_loss": 0.84146559, "learning_rate": 3.427098322707452e-06, "loss": 0.86380303, "num_input_tokens_seen": 48144295, "step": 2243, "time_per_iteration": 2.902916431427002 }, { "auxiliary_loss_clip": 0.01201043, "auxiliary_loss_mlp": 0.01035437, "balance_loss_clip": 1.06070018, "balance_loss_mlp": 1.02629375, "epoch": 0.2698250465941201, "flos": 10816546250880.0, "grad_norm": 2.6756625346959857, "language_loss": 0.89731574, "learning_rate": 3.426552462918526e-06, "loss": 0.91968054, "num_input_tokens_seen": 48162230, "step": 2244, "time_per_iteration": 2.769817352294922 }, { "auxiliary_loss_clip": 0.01204232, "auxiliary_loss_mlp": 0.0103681, "balance_loss_clip": 1.06014812, "balance_loss_mlp": 1.02731538, "epoch": 0.2699452894847592, "flos": 17308117653120.0, "grad_norm": 2.8954737327834996, "language_loss": 0.72809619, "learning_rate": 3.426006386722726e-06, "loss": 0.75050664, "num_input_tokens_seen": 48180290, "step": 2245, "time_per_iteration": 2.7627766132354736 }, { "auxiliary_loss_clip": 0.01199024, "auxiliary_loss_mlp": 0.01028836, "balance_loss_clip": 1.06245041, "balance_loss_mlp": 1.01943064, "epoch": 0.2700655323753983, "flos": 18078150441600.0, "grad_norm": 1.8102354085386034, "language_loss": 0.92560017, "learning_rate": 3.4254600942028914e-06, "loss": 0.94787878, "num_input_tokens_seen": 48198165, "step": 2246, "time_per_iteration": 2.7845304012298584 }, { "auxiliary_loss_clip": 0.01193997, "auxiliary_loss_mlp": 0.01028694, "balance_loss_clip": 1.06004739, "balance_loss_mlp": 1.01953232, "epoch": 0.2701857752660374, "flos": 18186636493440.0, "grad_norm": 1.9470240824731349, "language_loss": 0.82623124, "learning_rate": 3.424913585441893e-06, "loss": 0.84845817, "num_input_tokens_seen": 48216000, "step": 2247, "time_per_iteration": 2.8088815212249756 }, { "auxiliary_loss_clip": 0.01197547, "auxiliary_loss_mlp": 0.01033887, "balance_loss_clip": 1.05814433, "balance_loss_mlp": 1.02467823, "epoch": 0.2703060181566765, "flos": 16319496648960.0, "grad_norm": 2.0379494633496242, "language_loss": 0.87377512, "learning_rate": 3.4243668605226374e-06, "loss": 0.89608949, "num_input_tokens_seen": 48233025, "step": 2248, "time_per_iteration": 2.858985424041748 }, { "auxiliary_loss_clip": 0.01193175, "auxiliary_loss_mlp": 0.01036704, "balance_loss_clip": 1.06012261, "balance_loss_mlp": 1.02702439, "epoch": 0.2704262610473156, "flos": 19572357329280.0, "grad_norm": 2.4149681834724586, "language_loss": 0.82876945, "learning_rate": 3.423819919528061e-06, "loss": 0.85106826, "num_input_tokens_seen": 48251110, "step": 2249, "time_per_iteration": 2.896864891052246 }, { "auxiliary_loss_clip": 0.01196787, "auxiliary_loss_mlp": 0.01030884, "balance_loss_clip": 1.05744183, "balance_loss_mlp": 1.0220747, "epoch": 0.27054650393795465, "flos": 20740746925440.0, "grad_norm": 1.7912374193321734, "language_loss": 0.78805572, "learning_rate": 3.4232727625411355e-06, "loss": 0.81033242, "num_input_tokens_seen": 48270215, "step": 2250, "time_per_iteration": 2.876620292663574 }, { "auxiliary_loss_clip": 0.01179598, "auxiliary_loss_mlp": 0.01030936, "balance_loss_clip": 1.05667019, "balance_loss_mlp": 1.02193022, "epoch": 0.27066674682859376, "flos": 18658322916480.0, "grad_norm": 1.77949839218201, "language_loss": 0.864573, "learning_rate": 3.4227253896448626e-06, "loss": 0.88667828, "num_input_tokens_seen": 48288075, "step": 2251, "time_per_iteration": 2.9416234493255615 }, { "auxiliary_loss_clip": 0.01201543, "auxiliary_loss_mlp": 0.01031302, "balance_loss_clip": 1.05669427, "balance_loss_mlp": 1.02271891, "epoch": 0.2707869897192329, "flos": 23002759958400.0, "grad_norm": 2.7721253120162386, "language_loss": 0.82943296, "learning_rate": 3.42217780092228e-06, "loss": 0.85176146, "num_input_tokens_seen": 48306415, "step": 2252, "time_per_iteration": 2.832672119140625 }, { "auxiliary_loss_clip": 0.01096855, "auxiliary_loss_mlp": 0.01004754, "balance_loss_clip": 1.017452, "balance_loss_mlp": 1.00292969, "epoch": 0.27090723260987193, "flos": 58323240293760.0, "grad_norm": 0.7914146311229543, "language_loss": 0.60315758, "learning_rate": 3.421629996456456e-06, "loss": 0.6241737, "num_input_tokens_seen": 48365035, "step": 2253, "time_per_iteration": 3.3188672065734863 }, { "auxiliary_loss_clip": 0.01199666, "auxiliary_loss_mlp": 0.01030771, "balance_loss_clip": 1.05909717, "balance_loss_mlp": 1.02127635, "epoch": 0.27102747550051104, "flos": 11984540797440.0, "grad_norm": 2.818140503347025, "language_loss": 0.82588524, "learning_rate": 3.421081976330491e-06, "loss": 0.84818959, "num_input_tokens_seen": 48383550, "step": 2254, "time_per_iteration": 3.6971559524536133 }, { "auxiliary_loss_clip": 0.01193925, "auxiliary_loss_mlp": 0.01040804, "balance_loss_clip": 1.05907583, "balance_loss_mlp": 1.03145826, "epoch": 0.27114771839115015, "flos": 19900401264000.0, "grad_norm": 1.9182500436216838, "language_loss": 0.87773031, "learning_rate": 3.4205337406275207e-06, "loss": 0.90007758, "num_input_tokens_seen": 48403670, "step": 2255, "time_per_iteration": 2.8892247676849365 }, { "auxiliary_loss_clip": 0.01203225, "auxiliary_loss_mlp": 0.01030332, "balance_loss_clip": 1.05665851, "balance_loss_mlp": 1.02167702, "epoch": 0.2712679612817892, "flos": 18331966920960.0, "grad_norm": 2.359976285022614, "language_loss": 0.75913119, "learning_rate": 3.4199852894307114e-06, "loss": 0.78146672, "num_input_tokens_seen": 48420420, "step": 2256, "time_per_iteration": 2.748802423477173 }, { "auxiliary_loss_clip": 0.01187972, "auxiliary_loss_mlp": 0.01027666, "balance_loss_clip": 1.05668998, "balance_loss_mlp": 1.01814115, "epoch": 0.2713882041724283, "flos": 24460302038400.0, "grad_norm": 2.676257325621373, "language_loss": 0.7890889, "learning_rate": 3.419436622823262e-06, "loss": 0.81124532, "num_input_tokens_seen": 48441140, "step": 2257, "time_per_iteration": 4.810990810394287 }, { "auxiliary_loss_clip": 0.01196061, "auxiliary_loss_mlp": 0.01030738, "balance_loss_clip": 1.06018376, "balance_loss_mlp": 1.021523, "epoch": 0.27150844706306737, "flos": 23039317025280.0, "grad_norm": 1.6948268860260876, "language_loss": 0.74187458, "learning_rate": 3.4188877408884063e-06, "loss": 0.76414251, "num_input_tokens_seen": 48461845, "step": 2258, "time_per_iteration": 3.8171892166137695 }, { "auxiliary_loss_clip": 0.01192375, "auxiliary_loss_mlp": 0.01032608, "balance_loss_clip": 1.05988789, "balance_loss_mlp": 1.02336311, "epoch": 0.2716286899537065, "flos": 22563644192640.0, "grad_norm": 2.2954933121470735, "language_loss": 0.6503582, "learning_rate": 3.4183386437094088e-06, "loss": 0.67260802, "num_input_tokens_seen": 48478510, "step": 2259, "time_per_iteration": 2.8142101764678955 }, { "auxiliary_loss_clip": 0.01195034, "auxiliary_loss_mlp": 0.01025922, "balance_loss_clip": 1.0549047, "balance_loss_mlp": 1.01624811, "epoch": 0.2717489328443456, "flos": 13115044523520.0, "grad_norm": 2.1879602033732626, "language_loss": 0.82007706, "learning_rate": 3.417789331369565e-06, "loss": 0.84228659, "num_input_tokens_seen": 48494300, "step": 2260, "time_per_iteration": 2.7718958854675293 }, { "auxiliary_loss_clip": 0.01207436, "auxiliary_loss_mlp": 0.01028604, "balance_loss_clip": 1.05935717, "balance_loss_mlp": 1.01828063, "epoch": 0.27186917573498465, "flos": 29278688060160.0, "grad_norm": 1.9134417828120853, "language_loss": 0.9158231, "learning_rate": 3.4172398039522088e-06, "loss": 0.93818349, "num_input_tokens_seen": 48515585, "step": 2261, "time_per_iteration": 2.7966089248657227 }, { "auxiliary_loss_clip": 0.0120392, "auxiliary_loss_mlp": 0.01036698, "balance_loss_clip": 1.06034827, "balance_loss_mlp": 1.02702403, "epoch": 0.27198941862562376, "flos": 26032220000640.0, "grad_norm": 1.5863056347516342, "language_loss": 0.79830968, "learning_rate": 3.4166900615407e-06, "loss": 0.8207159, "num_input_tokens_seen": 48533500, "step": 2262, "time_per_iteration": 2.7495734691619873 }, { "auxiliary_loss_clip": 0.01199376, "auxiliary_loss_mlp": 0.01033335, "balance_loss_clip": 1.05922985, "balance_loss_mlp": 1.0239712, "epoch": 0.27210966151626287, "flos": 32780983760640.0, "grad_norm": 1.9921974440281183, "language_loss": 0.74816847, "learning_rate": 3.416140104218436e-06, "loss": 0.77049565, "num_input_tokens_seen": 48552865, "step": 2263, "time_per_iteration": 2.85276460647583 }, { "auxiliary_loss_clip": 0.01092131, "auxiliary_loss_mlp": 0.01023468, "balance_loss_clip": 1.01587725, "balance_loss_mlp": 1.00467491, "epoch": 0.2722299044069019, "flos": 65471043219840.0, "grad_norm": 0.8362318893728069, "language_loss": 0.6965391, "learning_rate": 3.4155899320688437e-06, "loss": 0.71769506, "num_input_tokens_seen": 48618940, "step": 2264, "time_per_iteration": 3.4619638919830322 }, { "auxiliary_loss_clip": 0.01186873, "auxiliary_loss_mlp": 0.01042067, "balance_loss_clip": 1.05836892, "balance_loss_mlp": 1.03227985, "epoch": 0.27235014729754103, "flos": 15334143782400.0, "grad_norm": 3.0231894086806257, "language_loss": 0.74012804, "learning_rate": 3.415039545175384e-06, "loss": 0.76241744, "num_input_tokens_seen": 48634665, "step": 2265, "time_per_iteration": 2.9347476959228516 }, { "auxiliary_loss_clip": 0.01202372, "auxiliary_loss_mlp": 0.01034389, "balance_loss_clip": 1.05839396, "balance_loss_mlp": 1.02494168, "epoch": 0.27247039018818014, "flos": 21872363973120.0, "grad_norm": 2.251090135490892, "language_loss": 0.6517657, "learning_rate": 3.414488943621551e-06, "loss": 0.67413336, "num_input_tokens_seen": 48653330, "step": 2266, "time_per_iteration": 2.92720365524292 }, { "auxiliary_loss_clip": 0.01198568, "auxiliary_loss_mlp": 0.01035583, "balance_loss_clip": 1.05817592, "balance_loss_mlp": 1.02628517, "epoch": 0.2725906330788192, "flos": 18695490514560.0, "grad_norm": 1.828572193898631, "language_loss": 0.73893315, "learning_rate": 3.41393812749087e-06, "loss": 0.76127464, "num_input_tokens_seen": 48671375, "step": 2267, "time_per_iteration": 2.7982654571533203 }, { "auxiliary_loss_clip": 0.01196887, "auxiliary_loss_mlp": 0.0103534, "balance_loss_clip": 1.05988157, "balance_loss_mlp": 1.02610159, "epoch": 0.2727108759694583, "flos": 17886099398400.0, "grad_norm": 2.425257931506788, "language_loss": 0.72067058, "learning_rate": 3.4133870968668984e-06, "loss": 0.74299282, "num_input_tokens_seen": 48686175, "step": 2268, "time_per_iteration": 2.771148920059204 }, { "auxiliary_loss_clip": 0.01199455, "auxiliary_loss_mlp": 0.01041235, "balance_loss_clip": 1.05818129, "balance_loss_mlp": 1.03200257, "epoch": 0.2728311188600974, "flos": 24461666755200.0, "grad_norm": 1.6524185690043418, "language_loss": 0.78591853, "learning_rate": 3.412835851833229e-06, "loss": 0.80832541, "num_input_tokens_seen": 48708370, "step": 2269, "time_per_iteration": 2.891040325164795 }, { "auxiliary_loss_clip": 0.01195831, "auxiliary_loss_mlp": 0.01030324, "balance_loss_clip": 1.05816448, "balance_loss_mlp": 1.02125251, "epoch": 0.2729513617507365, "flos": 30993314757120.0, "grad_norm": 1.7730164440174991, "language_loss": 0.78073382, "learning_rate": 3.4122843924734834e-06, "loss": 0.80299532, "num_input_tokens_seen": 48730670, "step": 2270, "time_per_iteration": 2.8456389904022217 }, { "auxiliary_loss_clip": 0.01193561, "auxiliary_loss_mlp": 0.01030108, "balance_loss_clip": 1.05871916, "balance_loss_mlp": 1.0196712, "epoch": 0.2730716046413756, "flos": 19094637421440.0, "grad_norm": 2.832433768393988, "language_loss": 0.87773359, "learning_rate": 3.411732718871319e-06, "loss": 0.89997029, "num_input_tokens_seen": 48746510, "step": 2271, "time_per_iteration": 2.7304956912994385 }, { "auxiliary_loss_clip": 0.01200221, "auxiliary_loss_mlp": 0.01039731, "balance_loss_clip": 1.05822968, "balance_loss_mlp": 1.03076673, "epoch": 0.27319184753201464, "flos": 26944566474240.0, "grad_norm": 1.7901857308072806, "language_loss": 0.78697544, "learning_rate": 3.4111808311104227e-06, "loss": 0.80937493, "num_input_tokens_seen": 48768825, "step": 2272, "time_per_iteration": 2.741567611694336 }, { "auxiliary_loss_clip": 0.0120265, "auxiliary_loss_mlp": 0.01030865, "balance_loss_clip": 1.05727577, "balance_loss_mlp": 1.02078533, "epoch": 0.27331209042265375, "flos": 31759828012800.0, "grad_norm": 1.7128993892748465, "language_loss": 0.69543338, "learning_rate": 3.410628729274517e-06, "loss": 0.71776849, "num_input_tokens_seen": 48790345, "step": 2273, "time_per_iteration": 2.909356117248535 }, { "auxiliary_loss_clip": 0.01192953, "auxiliary_loss_mlp": 0.0104859, "balance_loss_clip": 1.05741429, "balance_loss_mlp": 1.01735198, "epoch": 0.27343233331329286, "flos": 25739081107200.0, "grad_norm": 1.8488628877367976, "language_loss": 0.82618088, "learning_rate": 3.4100764134473546e-06, "loss": 0.84859633, "num_input_tokens_seen": 48809630, "step": 2274, "time_per_iteration": 2.7717373371124268 }, { "auxiliary_loss_clip": 0.01199397, "auxiliary_loss_mlp": 0.01023447, "balance_loss_clip": 1.05666423, "balance_loss_mlp": 1.01502538, "epoch": 0.2735525762039319, "flos": 24389414547840.0, "grad_norm": 2.3846552808355126, "language_loss": 0.85294271, "learning_rate": 3.4095238837127215e-06, "loss": 0.87517118, "num_input_tokens_seen": 48828770, "step": 2275, "time_per_iteration": 2.8235278129577637 }, { "auxiliary_loss_clip": 0.01190028, "auxiliary_loss_mlp": 0.01027316, "balance_loss_clip": 1.05606508, "balance_loss_mlp": 1.01873279, "epoch": 0.27367281909457103, "flos": 14465357527680.0, "grad_norm": 2.0027813860907604, "language_loss": 0.79336292, "learning_rate": 3.4089711401544355e-06, "loss": 0.81553638, "num_input_tokens_seen": 48846365, "step": 2276, "time_per_iteration": 2.7988009452819824 }, { "auxiliary_loss_clip": 0.0120067, "auxiliary_loss_mlp": 0.01026801, "balance_loss_clip": 1.05855298, "balance_loss_mlp": 1.0174017, "epoch": 0.27379306198521014, "flos": 23476996247040.0, "grad_norm": 5.295099142562154, "language_loss": 0.67728812, "learning_rate": 3.4084181828563486e-06, "loss": 0.69956285, "num_input_tokens_seen": 48863085, "step": 2277, "time_per_iteration": 2.785341501235962 }, { "auxiliary_loss_clip": 0.01187081, "auxiliary_loss_mlp": 0.01029613, "balance_loss_clip": 1.0568279, "balance_loss_mlp": 1.02035058, "epoch": 0.2739133048758492, "flos": 17458152762240.0, "grad_norm": 1.7096290086368682, "language_loss": 0.70682359, "learning_rate": 3.4078650119023428e-06, "loss": 0.72899044, "num_input_tokens_seen": 48881400, "step": 2278, "time_per_iteration": 2.843019485473633 }, { "auxiliary_loss_clip": 0.0119037, "auxiliary_loss_mlp": 0.01031181, "balance_loss_clip": 1.05823183, "balance_loss_mlp": 1.02176392, "epoch": 0.2740335477664883, "flos": 19273113123840.0, "grad_norm": 2.2179520392543033, "language_loss": 0.74025971, "learning_rate": 3.4073116273763337e-06, "loss": 0.76247525, "num_input_tokens_seen": 48895845, "step": 2279, "time_per_iteration": 2.8624205589294434 }, { "auxiliary_loss_clip": 0.01198738, "auxiliary_loss_mlp": 0.01034925, "balance_loss_clip": 1.05678487, "balance_loss_mlp": 1.02485824, "epoch": 0.2741537906571274, "flos": 26104723603200.0, "grad_norm": 2.0155373227075257, "language_loss": 0.80913311, "learning_rate": 3.40675802936227e-06, "loss": 0.83146977, "num_input_tokens_seen": 48916630, "step": 2280, "time_per_iteration": 3.8526458740234375 }, { "auxiliary_loss_clip": 0.01190932, "auxiliary_loss_mlp": 0.01031355, "balance_loss_clip": 1.05865169, "balance_loss_mlp": 1.0216279, "epoch": 0.27427403354776647, "flos": 34164190644480.0, "grad_norm": 1.8726458204203613, "language_loss": 0.7184552, "learning_rate": 3.4062042179441318e-06, "loss": 0.74067813, "num_input_tokens_seen": 48937100, "step": 2281, "time_per_iteration": 2.8794753551483154 }, { "auxiliary_loss_clip": 0.01196024, "auxiliary_loss_mlp": 0.01028191, "balance_loss_clip": 1.05797458, "balance_loss_mlp": 1.01994729, "epoch": 0.2743942764384056, "flos": 18766988536320.0, "grad_norm": 2.133297606639588, "language_loss": 0.80359161, "learning_rate": 3.4056501932059314e-06, "loss": 0.8258338, "num_input_tokens_seen": 48955175, "step": 2282, "time_per_iteration": 2.899186849594116 }, { "auxiliary_loss_clip": 0.01095342, "auxiliary_loss_mlp": 0.01001666, "balance_loss_clip": 1.01821232, "balance_loss_mlp": 0.99994993, "epoch": 0.2745145193290447, "flos": 64904048058240.0, "grad_norm": 0.7651346983267902, "language_loss": 0.5816412, "learning_rate": 3.405095955231715e-06, "loss": 0.6026113, "num_input_tokens_seen": 49006830, "step": 2283, "time_per_iteration": 4.238837957382202 }, { "auxiliary_loss_clip": 0.01200545, "auxiliary_loss_mlp": 0.0102759, "balance_loss_clip": 1.05440664, "balance_loss_mlp": 1.01845276, "epoch": 0.27463476221968375, "flos": 16136926796160.0, "grad_norm": 2.9983576250956467, "language_loss": 0.94630986, "learning_rate": 3.4045415041055585e-06, "loss": 0.96859121, "num_input_tokens_seen": 49022470, "step": 2284, "time_per_iteration": 5.620631694793701 }, { "auxiliary_loss_clip": 0.01199933, "auxiliary_loss_mlp": 0.01034052, "balance_loss_clip": 1.05761003, "balance_loss_mlp": 1.02451539, "epoch": 0.27475500511032286, "flos": 10376712213120.0, "grad_norm": 2.111155862396606, "language_loss": 0.78088558, "learning_rate": 3.4039868399115728e-06, "loss": 0.8032254, "num_input_tokens_seen": 49037110, "step": 2285, "time_per_iteration": 2.8402645587921143 }, { "auxiliary_loss_clip": 0.01189426, "auxiliary_loss_mlp": 0.01039766, "balance_loss_clip": 1.06050658, "balance_loss_mlp": 1.02995491, "epoch": 0.27487524800096197, "flos": 17311062568320.0, "grad_norm": 1.8559583628107008, "language_loss": 0.80387592, "learning_rate": 3.4034319627339003e-06, "loss": 0.82616782, "num_input_tokens_seen": 49053975, "step": 2286, "time_per_iteration": 2.837334394454956 }, { "auxiliary_loss_clip": 0.01198255, "auxiliary_loss_mlp": 0.01033402, "balance_loss_clip": 1.05849457, "balance_loss_mlp": 1.02374589, "epoch": 0.274995490891601, "flos": 27120205002240.0, "grad_norm": 2.1463625938366437, "language_loss": 0.6972065, "learning_rate": 3.402876872656715e-06, "loss": 0.71952307, "num_input_tokens_seen": 49072295, "step": 2287, "time_per_iteration": 3.0012502670288086 }, { "auxiliary_loss_clip": 0.01194809, "auxiliary_loss_mlp": 0.01035556, "balance_loss_clip": 1.0589087, "balance_loss_mlp": 1.02749157, "epoch": 0.27511573378224013, "flos": 23436093634560.0, "grad_norm": 2.6510102515333998, "language_loss": 0.89458847, "learning_rate": 3.402321569764223e-06, "loss": 0.91689211, "num_input_tokens_seen": 49091600, "step": 2288, "time_per_iteration": 2.879101276397705 }, { "auxiliary_loss_clip": 0.01188766, "auxiliary_loss_mlp": 0.01061683, "balance_loss_clip": 1.05609429, "balance_loss_mlp": 1.02751577, "epoch": 0.2752359766728792, "flos": 16722019434240.0, "grad_norm": 1.7828713225897639, "language_loss": 0.83826876, "learning_rate": 3.4017660541406635e-06, "loss": 0.86077332, "num_input_tokens_seen": 49107665, "step": 2289, "time_per_iteration": 2.91243052482605 }, { "auxiliary_loss_clip": 0.01201632, "auxiliary_loss_mlp": 0.0103665, "balance_loss_clip": 1.05539274, "balance_loss_mlp": 1.02593875, "epoch": 0.2753562195635183, "flos": 25297738698240.0, "grad_norm": 1.7047309563571034, "language_loss": 0.74264008, "learning_rate": 3.4012103258703092e-06, "loss": 0.76502287, "num_input_tokens_seen": 49126420, "step": 2290, "time_per_iteration": 2.7771780490875244 }, { "auxiliary_loss_clip": 0.01190946, "auxiliary_loss_mlp": 0.01031739, "balance_loss_clip": 1.0570488, "balance_loss_mlp": 1.02265537, "epoch": 0.2754764624541574, "flos": 27338972785920.0, "grad_norm": 1.8376119919349072, "language_loss": 0.83027607, "learning_rate": 3.4006543850374616e-06, "loss": 0.85250294, "num_input_tokens_seen": 49141470, "step": 2291, "time_per_iteration": 2.856072425842285 }, { "auxiliary_loss_clip": 0.01204654, "auxiliary_loss_mlp": 0.01032124, "balance_loss_clip": 1.05869174, "balance_loss_mlp": 1.02341557, "epoch": 0.27559670534479647, "flos": 17238379397760.0, "grad_norm": 1.9557829852173465, "language_loss": 0.74716896, "learning_rate": 3.400098231726458e-06, "loss": 0.76953673, "num_input_tokens_seen": 49158570, "step": 2292, "time_per_iteration": 2.72622013092041 }, { "auxiliary_loss_clip": 0.01195524, "auxiliary_loss_mlp": 0.01029572, "balance_loss_clip": 1.05529845, "balance_loss_mlp": 1.02003551, "epoch": 0.2757169482354356, "flos": 21939085486080.0, "grad_norm": 1.8262197248337966, "language_loss": 0.87085384, "learning_rate": 3.3995418660216657e-06, "loss": 0.89310479, "num_input_tokens_seen": 49176025, "step": 2293, "time_per_iteration": 2.8213930130004883 }, { "auxiliary_loss_clip": 0.01212128, "auxiliary_loss_mlp": 0.01029493, "balance_loss_clip": 1.06052351, "balance_loss_mlp": 1.0191102, "epoch": 0.2758371911260747, "flos": 20850669521280.0, "grad_norm": 3.7491556859559063, "language_loss": 0.80781209, "learning_rate": 3.3989852880074848e-06, "loss": 0.83022833, "num_input_tokens_seen": 49197455, "step": 2294, "time_per_iteration": 2.7907443046569824 }, { "auxiliary_loss_clip": 0.01096038, "auxiliary_loss_mlp": 0.01003769, "balance_loss_clip": 1.0244503, "balance_loss_mlp": 1.00138462, "epoch": 0.27595743401671374, "flos": 69269063592960.0, "grad_norm": 0.7435911400840407, "language_loss": 0.60574573, "learning_rate": 3.398428497768348e-06, "loss": 0.62674379, "num_input_tokens_seen": 49262625, "step": 2295, "time_per_iteration": 3.4093315601348877 }, { "auxiliary_loss_clip": 0.01202805, "auxiliary_loss_mlp": 0.01031849, "balance_loss_clip": 1.05729818, "balance_loss_mlp": 1.02216339, "epoch": 0.27607767690735285, "flos": 21215019127680.0, "grad_norm": 1.7715774765501413, "language_loss": 0.72178519, "learning_rate": 3.3978714953887205e-06, "loss": 0.7441318, "num_input_tokens_seen": 49282380, "step": 2296, "time_per_iteration": 2.9364662170410156 }, { "auxiliary_loss_clip": 0.01179164, "auxiliary_loss_mlp": 0.01029427, "balance_loss_clip": 1.05556226, "balance_loss_mlp": 1.01969993, "epoch": 0.27619791979799196, "flos": 24825334003200.0, "grad_norm": 1.7251030217991519, "language_loss": 0.86075222, "learning_rate": 3.397314280953098e-06, "loss": 0.88283813, "num_input_tokens_seen": 49303205, "step": 2297, "time_per_iteration": 2.9573495388031006 }, { "auxiliary_loss_clip": 0.0119347, "auxiliary_loss_mlp": 0.01028987, "balance_loss_clip": 1.05789614, "balance_loss_mlp": 1.01964104, "epoch": 0.276318162688631, "flos": 24753548672640.0, "grad_norm": 1.8863324669417534, "language_loss": 0.80537814, "learning_rate": 3.3967568545460108e-06, "loss": 0.82760268, "num_input_tokens_seen": 49322745, "step": 2298, "time_per_iteration": 2.8635692596435547 }, { "auxiliary_loss_clip": 0.01196936, "auxiliary_loss_mlp": 0.01033597, "balance_loss_clip": 1.05611086, "balance_loss_mlp": 1.02409637, "epoch": 0.27643840557927013, "flos": 18150007599360.0, "grad_norm": 1.934533023377939, "language_loss": 0.80815566, "learning_rate": 3.3961992162520185e-06, "loss": 0.83046103, "num_input_tokens_seen": 49341370, "step": 2299, "time_per_iteration": 2.8003334999084473 }, { "auxiliary_loss_clip": 0.0120055, "auxiliary_loss_mlp": 0.01030004, "balance_loss_clip": 1.05662537, "balance_loss_mlp": 1.02027059, "epoch": 0.27655864846990924, "flos": 24823933372800.0, "grad_norm": 4.007457786833891, "language_loss": 0.71720546, "learning_rate": 3.3956413661557156e-06, "loss": 0.73951101, "num_input_tokens_seen": 49361545, "step": 2300, "time_per_iteration": 2.8105499744415283 }, { "auxiliary_loss_clip": 0.0119864, "auxiliary_loss_mlp": 0.01033468, "balance_loss_clip": 1.05555654, "balance_loss_mlp": 1.02372229, "epoch": 0.2766788913605483, "flos": 20266582464000.0, "grad_norm": 2.2251022968372647, "language_loss": 0.65676063, "learning_rate": 3.3950833043417273e-06, "loss": 0.67908168, "num_input_tokens_seen": 49379690, "step": 2301, "time_per_iteration": 2.7863848209381104 }, { "auxiliary_loss_clip": 0.01203857, "auxiliary_loss_mlp": 0.01033673, "balance_loss_clip": 1.06027007, "balance_loss_mlp": 1.02343893, "epoch": 0.2767991342511874, "flos": 21470272151040.0, "grad_norm": 3.5079279134550996, "language_loss": 0.73284161, "learning_rate": 3.3945250308947105e-06, "loss": 0.75521696, "num_input_tokens_seen": 49395995, "step": 2302, "time_per_iteration": 2.7558982372283936 }, { "auxiliary_loss_clip": 0.0109427, "auxiliary_loss_mlp": 0.01005476, "balance_loss_clip": 1.01686263, "balance_loss_mlp": 1.00365245, "epoch": 0.2769193771418265, "flos": 66002627571840.0, "grad_norm": 1.2283508066259328, "language_loss": 0.68355405, "learning_rate": 3.3939665458993556e-06, "loss": 0.70455152, "num_input_tokens_seen": 49450415, "step": 2303, "time_per_iteration": 3.308072805404663 }, { "auxiliary_loss_clip": 0.01197305, "auxiliary_loss_mlp": 0.01029413, "balance_loss_clip": 1.05570149, "balance_loss_mlp": 1.01996529, "epoch": 0.27703962003246557, "flos": 20704441253760.0, "grad_norm": 1.8832418838737857, "language_loss": 0.7678256, "learning_rate": 3.3934078494403843e-06, "loss": 0.79009283, "num_input_tokens_seen": 49469990, "step": 2304, "time_per_iteration": 2.7767951488494873 }, { "auxiliary_loss_clip": 0.01180077, "auxiliary_loss_mlp": 0.01058403, "balance_loss_clip": 1.05923843, "balance_loss_mlp": 1.02308202, "epoch": 0.2771598629231047, "flos": 22929897219840.0, "grad_norm": 2.1955529632859045, "language_loss": 0.81840283, "learning_rate": 3.3928489416025495e-06, "loss": 0.84078759, "num_input_tokens_seen": 49490835, "step": 2305, "time_per_iteration": 2.895704984664917 }, { "auxiliary_loss_clip": 0.01198641, "auxiliary_loss_mlp": 0.01033248, "balance_loss_clip": 1.05841708, "balance_loss_mlp": 1.02294242, "epoch": 0.27728010581374374, "flos": 18369457741440.0, "grad_norm": 2.0035388260213502, "language_loss": 0.79004955, "learning_rate": 3.392289822470638e-06, "loss": 0.81236845, "num_input_tokens_seen": 49508815, "step": 2306, "time_per_iteration": 3.72263765335083 }, { "auxiliary_loss_clip": 0.01195866, "auxiliary_loss_mlp": 0.0103438, "balance_loss_clip": 1.05757284, "balance_loss_mlp": 1.02451539, "epoch": 0.27740034870438285, "flos": 19427637432960.0, "grad_norm": 2.2290037873540696, "language_loss": 0.76172775, "learning_rate": 3.3917304921294674e-06, "loss": 0.7840302, "num_input_tokens_seen": 49526980, "step": 2307, "time_per_iteration": 2.7724945545196533 }, { "auxiliary_loss_clip": 0.01199552, "auxiliary_loss_mlp": 0.0103138, "balance_loss_clip": 1.05533075, "balance_loss_mlp": 1.02115822, "epoch": 0.27752059159502196, "flos": 21614776565760.0, "grad_norm": 1.6818720290604712, "language_loss": 0.81048751, "learning_rate": 3.3911709506638876e-06, "loss": 0.83279687, "num_input_tokens_seen": 49546290, "step": 2308, "time_per_iteration": 2.8663461208343506 }, { "auxiliary_loss_clip": 0.0118691, "auxiliary_loss_mlp": 0.01061334, "balance_loss_clip": 1.05734396, "balance_loss_mlp": 1.02550459, "epoch": 0.277640834485661, "flos": 26608011016320.0, "grad_norm": 1.949794714311563, "language_loss": 0.81110013, "learning_rate": 3.390611198158781e-06, "loss": 0.83358252, "num_input_tokens_seen": 49564165, "step": 2309, "time_per_iteration": 5.039456605911255 }, { "auxiliary_loss_clip": 0.01210246, "auxiliary_loss_mlp": 0.01031106, "balance_loss_clip": 1.06112981, "balance_loss_mlp": 1.02146196, "epoch": 0.2777610773763001, "flos": 19492814661120.0, "grad_norm": 2.0166839001038173, "language_loss": 0.89738911, "learning_rate": 3.3900512346990612e-06, "loss": 0.91980267, "num_input_tokens_seen": 49580155, "step": 2310, "time_per_iteration": 2.8395609855651855 }, { "auxiliary_loss_clip": 0.01194302, "auxiliary_loss_mlp": 0.01033022, "balance_loss_clip": 1.05823112, "balance_loss_mlp": 1.02294254, "epoch": 0.27788132026693924, "flos": 38290650001920.0, "grad_norm": 2.0655215575396353, "language_loss": 0.65735602, "learning_rate": 3.389491060369674e-06, "loss": 0.67962921, "num_input_tokens_seen": 49605830, "step": 2311, "time_per_iteration": 4.173294305801392 }, { "auxiliary_loss_clip": 0.0118138, "auxiliary_loss_mlp": 0.0103716, "balance_loss_clip": 1.05586433, "balance_loss_mlp": 1.02788544, "epoch": 0.2780015631575783, "flos": 22382546797440.0, "grad_norm": 1.909310939761063, "language_loss": 0.89230996, "learning_rate": 3.388930675255598e-06, "loss": 0.91449535, "num_input_tokens_seen": 49625680, "step": 2312, "time_per_iteration": 2.9545445442199707 }, { "auxiliary_loss_clip": 0.01202127, "auxiliary_loss_mlp": 0.01029048, "balance_loss_clip": 1.0572437, "balance_loss_mlp": 1.0190165, "epoch": 0.2781218060482174, "flos": 12203200840320.0, "grad_norm": 4.200764869947027, "language_loss": 0.79169309, "learning_rate": 3.388370079441843e-06, "loss": 0.81400484, "num_input_tokens_seen": 49641195, "step": 2313, "time_per_iteration": 2.8269529342651367 }, { "auxiliary_loss_clip": 0.01199209, "auxiliary_loss_mlp": 0.01034555, "balance_loss_clip": 1.06130219, "balance_loss_mlp": 1.02494681, "epoch": 0.2782420489388565, "flos": 18107632529280.0, "grad_norm": 1.9537288572784806, "language_loss": 0.93006152, "learning_rate": 3.3878092730134505e-06, "loss": 0.95239908, "num_input_tokens_seen": 49659180, "step": 2314, "time_per_iteration": 2.8835296630859375 }, { "auxiliary_loss_clip": 0.01197826, "auxiliary_loss_mlp": 0.01036637, "balance_loss_clip": 1.05948639, "balance_loss_mlp": 1.02660561, "epoch": 0.27836229182949557, "flos": 18514752255360.0, "grad_norm": 1.5466581256879108, "language_loss": 0.80397075, "learning_rate": 3.3872482560554947e-06, "loss": 0.8263154, "num_input_tokens_seen": 49677955, "step": 2315, "time_per_iteration": 2.7576420307159424 }, { "auxiliary_loss_clip": 0.01090859, "auxiliary_loss_mlp": 0.01000668, "balance_loss_clip": 1.01368666, "balance_loss_mlp": 0.99895114, "epoch": 0.2784825347201347, "flos": 67079230940160.0, "grad_norm": 0.7987535465295433, "language_loss": 0.56900257, "learning_rate": 3.386687028653082e-06, "loss": 0.58991784, "num_input_tokens_seen": 49740800, "step": 2316, "time_per_iteration": 3.333284616470337 }, { "auxiliary_loss_clip": 0.01191657, "auxiliary_loss_mlp": 0.01027796, "balance_loss_clip": 1.05790377, "balance_loss_mlp": 1.01834869, "epoch": 0.2786027776107738, "flos": 22631119891200.0, "grad_norm": 1.8171517347139183, "language_loss": 0.85047388, "learning_rate": 3.386125590891349e-06, "loss": 0.87266839, "num_input_tokens_seen": 49757675, "step": 2317, "time_per_iteration": 2.8484714031219482 }, { "auxiliary_loss_clip": 0.01189786, "auxiliary_loss_mlp": 0.01035279, "balance_loss_clip": 1.05735433, "balance_loss_mlp": 1.02605176, "epoch": 0.27872302050141284, "flos": 15778826156160.0, "grad_norm": 1.9596380545982979, "language_loss": 0.82874495, "learning_rate": 3.3855639428554657e-06, "loss": 0.8509956, "num_input_tokens_seen": 49775205, "step": 2318, "time_per_iteration": 2.8160793781280518 }, { "auxiliary_loss_clip": 0.01185067, "auxiliary_loss_mlp": 0.01035618, "balance_loss_clip": 1.05603671, "balance_loss_mlp": 1.0260576, "epoch": 0.27884326339205195, "flos": 22126970551680.0, "grad_norm": 1.7215216846481423, "language_loss": 0.80679262, "learning_rate": 3.385002084630635e-06, "loss": 0.82899946, "num_input_tokens_seen": 49794175, "step": 2319, "time_per_iteration": 2.7671310901641846 }, { "auxiliary_loss_clip": 0.01202181, "auxiliary_loss_mlp": 0.01030598, "balance_loss_clip": 1.05565894, "balance_loss_mlp": 1.02060211, "epoch": 0.278963506282691, "flos": 20558715776640.0, "grad_norm": 2.124387728876184, "language_loss": 0.84652328, "learning_rate": 3.384440016302088e-06, "loss": 0.86885107, "num_input_tokens_seen": 49812850, "step": 2320, "time_per_iteration": 2.7668607234954834 }, { "auxiliary_loss_clip": 0.01198711, "auxiliary_loss_mlp": 0.01031774, "balance_loss_clip": 1.0563643, "balance_loss_mlp": 1.0216589, "epoch": 0.2790837491733301, "flos": 21942928241280.0, "grad_norm": 2.4762540231897376, "language_loss": 0.62795675, "learning_rate": 3.3838777379550923e-06, "loss": 0.65026164, "num_input_tokens_seen": 49832295, "step": 2321, "time_per_iteration": 2.889153242111206 }, { "auxiliary_loss_clip": 0.01201427, "auxiliary_loss_mlp": 0.01039069, "balance_loss_clip": 1.05849957, "balance_loss_mlp": 1.02962184, "epoch": 0.27920399206396923, "flos": 26286790665600.0, "grad_norm": 1.9822328822117363, "language_loss": 0.78641444, "learning_rate": 3.383315249674944e-06, "loss": 0.80881935, "num_input_tokens_seen": 49850860, "step": 2322, "time_per_iteration": 2.786949872970581 }, { "auxiliary_loss_clip": 0.01196762, "auxiliary_loss_mlp": 0.01033592, "balance_loss_clip": 1.05783296, "balance_loss_mlp": 1.02403164, "epoch": 0.2793242349546083, "flos": 25400981364480.0, "grad_norm": 3.202528959614104, "language_loss": 0.86221457, "learning_rate": 3.3827525515469715e-06, "loss": 0.88451815, "num_input_tokens_seen": 49865765, "step": 2323, "time_per_iteration": 2.7462453842163086 }, { "auxiliary_loss_clip": 0.01191282, "auxiliary_loss_mlp": 0.01031358, "balance_loss_clip": 1.05803168, "balance_loss_mlp": 1.0205158, "epoch": 0.2794444778452474, "flos": 20850346298880.0, "grad_norm": 2.1400542754574348, "language_loss": 0.71482921, "learning_rate": 3.3821896436565367e-06, "loss": 0.73705566, "num_input_tokens_seen": 49885425, "step": 2324, "time_per_iteration": 2.8543643951416016 }, { "auxiliary_loss_clip": 0.01205275, "auxiliary_loss_mlp": 0.01033035, "balance_loss_clip": 1.06065369, "balance_loss_mlp": 1.02333713, "epoch": 0.2795647207358865, "flos": 21576244250880.0, "grad_norm": 3.4934252660050853, "language_loss": 0.70354223, "learning_rate": 3.381626526089032e-06, "loss": 0.72592533, "num_input_tokens_seen": 49904990, "step": 2325, "time_per_iteration": 2.7519612312316895 }, { "auxiliary_loss_clip": 0.01194983, "auxiliary_loss_mlp": 0.01034062, "balance_loss_clip": 1.05639398, "balance_loss_mlp": 1.02502632, "epoch": 0.27968496362652556, "flos": 21471744608640.0, "grad_norm": 2.0430208143060087, "language_loss": 0.79131794, "learning_rate": 3.3810631989298815e-06, "loss": 0.81360841, "num_input_tokens_seen": 49924600, "step": 2326, "time_per_iteration": 2.8010292053222656 }, { "auxiliary_loss_clip": 0.01196211, "auxiliary_loss_mlp": 0.01034798, "balance_loss_clip": 1.05897117, "balance_loss_mlp": 1.02423024, "epoch": 0.2798052065171647, "flos": 23258695340160.0, "grad_norm": 2.4993928428322776, "language_loss": 0.84475392, "learning_rate": 3.3804996622645423e-06, "loss": 0.867064, "num_input_tokens_seen": 49942600, "step": 2327, "time_per_iteration": 2.9313271045684814 }, { "auxiliary_loss_clip": 0.01204406, "auxiliary_loss_mlp": 0.01033463, "balance_loss_clip": 1.05738044, "balance_loss_mlp": 1.02370584, "epoch": 0.2799254494078038, "flos": 21539328048000.0, "grad_norm": 1.8330051327185117, "language_loss": 0.89587754, "learning_rate": 3.3799359161785015e-06, "loss": 0.91825622, "num_input_tokens_seen": 49962250, "step": 2328, "time_per_iteration": 2.737577199935913 }, { "auxiliary_loss_clip": 0.01196472, "auxiliary_loss_mlp": 0.0103175, "balance_loss_clip": 1.05444121, "balance_loss_mlp": 1.02189744, "epoch": 0.28004569229844284, "flos": 26393912000640.0, "grad_norm": 1.4895818910738803, "language_loss": 0.85670841, "learning_rate": 3.3793719607572798e-06, "loss": 0.87899053, "num_input_tokens_seen": 49983215, "step": 2329, "time_per_iteration": 2.807681083679199 }, { "auxiliary_loss_clip": 0.01185068, "auxiliary_loss_mlp": 0.01038652, "balance_loss_clip": 1.05813432, "balance_loss_mlp": 1.02975249, "epoch": 0.28016593518908195, "flos": 33547676584320.0, "grad_norm": 2.015619669958863, "language_loss": 0.77220452, "learning_rate": 3.378807796086428e-06, "loss": 0.7944417, "num_input_tokens_seen": 50006075, "step": 2330, "time_per_iteration": 2.9184470176696777 }, { "auxiliary_loss_clip": 0.01205727, "auxiliary_loss_mlp": 0.01034103, "balance_loss_clip": 1.05866933, "balance_loss_mlp": 1.02432799, "epoch": 0.28028617807972106, "flos": 15340823712000.0, "grad_norm": 1.9852545235701524, "language_loss": 0.77062565, "learning_rate": 3.37824342225153e-06, "loss": 0.79302394, "num_input_tokens_seen": 50022495, "step": 2331, "time_per_iteration": 2.7448506355285645 }, { "auxiliary_loss_clip": 0.01194012, "auxiliary_loss_mlp": 0.01034724, "balance_loss_clip": 1.06013811, "balance_loss_mlp": 1.02494311, "epoch": 0.2804064209703601, "flos": 25520277409920.0, "grad_norm": 1.9174499125273547, "language_loss": 0.77709192, "learning_rate": 3.3776788393382006e-06, "loss": 0.79937929, "num_input_tokens_seen": 50041975, "step": 2332, "time_per_iteration": 3.8128042221069336 }, { "auxiliary_loss_clip": 0.01206474, "auxiliary_loss_mlp": 0.01036852, "balance_loss_clip": 1.0582242, "balance_loss_mlp": 1.02729166, "epoch": 0.2805266638609992, "flos": 29351766280320.0, "grad_norm": 2.2676603785558234, "language_loss": 0.77147746, "learning_rate": 3.3771140474320872e-06, "loss": 0.79391074, "num_input_tokens_seen": 50061925, "step": 2333, "time_per_iteration": 2.802976131439209 }, { "auxiliary_loss_clip": 0.01197247, "auxiliary_loss_mlp": 0.01040236, "balance_loss_clip": 1.05622673, "balance_loss_mlp": 1.03003216, "epoch": 0.28064690675163834, "flos": 21463735875840.0, "grad_norm": 3.948207120843396, "language_loss": 0.79322827, "learning_rate": 3.3765490466188664e-06, "loss": 0.81560308, "num_input_tokens_seen": 50079325, "step": 2334, "time_per_iteration": 2.8386526107788086 }, { "auxiliary_loss_clip": 0.01194037, "auxiliary_loss_mlp": 0.01026896, "balance_loss_clip": 1.05917048, "balance_loss_mlp": 1.01709127, "epoch": 0.2807671496422774, "flos": 20995640812800.0, "grad_norm": 2.492195445204469, "language_loss": 0.73701537, "learning_rate": 3.3759838369842508e-06, "loss": 0.75922465, "num_input_tokens_seen": 50097400, "step": 2335, "time_per_iteration": 3.753452777862549 }, { "auxiliary_loss_clip": 0.01192104, "auxiliary_loss_mlp": 0.01035704, "balance_loss_clip": 1.05567729, "balance_loss_mlp": 1.02561247, "epoch": 0.2808873925329165, "flos": 21506577822720.0, "grad_norm": 2.0471732068563684, "language_loss": 0.73008919, "learning_rate": 3.375418418613981e-06, "loss": 0.75236726, "num_input_tokens_seen": 50116425, "step": 2336, "time_per_iteration": 3.8435816764831543 }, { "auxiliary_loss_clip": 0.01199057, "auxiliary_loss_mlp": 0.0102927, "balance_loss_clip": 1.05810475, "balance_loss_mlp": 1.02004886, "epoch": 0.28100763542355556, "flos": 16070815814400.0, "grad_norm": 2.403402429109567, "language_loss": 0.83956903, "learning_rate": 3.374852791593831e-06, "loss": 0.86185229, "num_input_tokens_seen": 50132625, "step": 2337, "time_per_iteration": 4.109964370727539 }, { "auxiliary_loss_clip": 0.01202723, "auxiliary_loss_mlp": 0.01034117, "balance_loss_clip": 1.05986881, "balance_loss_mlp": 1.02428818, "epoch": 0.28112787831419467, "flos": 19062605468160.0, "grad_norm": 3.805799330048743, "language_loss": 0.54090929, "learning_rate": 3.374286956009605e-06, "loss": 0.5632776, "num_input_tokens_seen": 50151190, "step": 2338, "time_per_iteration": 2.8464059829711914 }, { "auxiliary_loss_clip": 0.01199752, "auxiliary_loss_mlp": 0.0103268, "balance_loss_clip": 1.05962121, "balance_loss_mlp": 1.02378702, "epoch": 0.2812481212048338, "flos": 12823629482880.0, "grad_norm": 2.2772093991000144, "language_loss": 0.75320417, "learning_rate": 3.3737209119471405e-06, "loss": 0.77552855, "num_input_tokens_seen": 50167700, "step": 2339, "time_per_iteration": 2.9199793338775635 }, { "auxiliary_loss_clip": 0.01207705, "auxiliary_loss_mlp": 0.01040439, "balance_loss_clip": 1.05869234, "balance_loss_mlp": 1.03048503, "epoch": 0.28136836409547283, "flos": 15633064765440.0, "grad_norm": 2.7574495894042563, "language_loss": 0.6366055, "learning_rate": 3.373154659492306e-06, "loss": 0.65908694, "num_input_tokens_seen": 50185840, "step": 2340, "time_per_iteration": 2.890167713165283 }, { "auxiliary_loss_clip": 0.01201517, "auxiliary_loss_mlp": 0.01042164, "balance_loss_clip": 1.05974042, "balance_loss_mlp": 1.03226936, "epoch": 0.28148860698611194, "flos": 19933726106880.0, "grad_norm": 1.7271780948419613, "language_loss": 0.85049963, "learning_rate": 3.3725881987310016e-06, "loss": 0.87293649, "num_input_tokens_seen": 50203375, "step": 2341, "time_per_iteration": 2.752345085144043 }, { "auxiliary_loss_clip": 0.01193332, "auxiliary_loss_mlp": 0.0102945, "balance_loss_clip": 1.0546968, "balance_loss_mlp": 1.02012801, "epoch": 0.28160884987675106, "flos": 17457219008640.0, "grad_norm": 1.864066262134156, "language_loss": 0.87781596, "learning_rate": 3.372021529749159e-06, "loss": 0.90004379, "num_input_tokens_seen": 50222435, "step": 2342, "time_per_iteration": 2.8271889686584473 }, { "auxiliary_loss_clip": 0.01188375, "auxiliary_loss_mlp": 0.01032715, "balance_loss_clip": 1.05845141, "balance_loss_mlp": 1.02338743, "epoch": 0.2817290927673901, "flos": 16834743290880.0, "grad_norm": 1.8981817119516906, "language_loss": 0.92286456, "learning_rate": 3.3714546526327405e-06, "loss": 0.94507551, "num_input_tokens_seen": 50240435, "step": 2343, "time_per_iteration": 2.7766239643096924 }, { "auxiliary_loss_clip": 0.01197198, "auxiliary_loss_mlp": 0.0103347, "balance_loss_clip": 1.05599701, "balance_loss_mlp": 1.02319455, "epoch": 0.2818493356580292, "flos": 15414081500160.0, "grad_norm": 3.3731074474644815, "language_loss": 0.88020396, "learning_rate": 3.3708875674677423e-06, "loss": 0.9025107, "num_input_tokens_seen": 50258410, "step": 2344, "time_per_iteration": 2.8437445163726807 }, { "auxiliary_loss_clip": 0.01209001, "auxiliary_loss_mlp": 0.01031549, "balance_loss_clip": 1.06247914, "balance_loss_mlp": 1.02185738, "epoch": 0.28196957854866833, "flos": 20412451595520.0, "grad_norm": 2.390659979568907, "language_loss": 0.83525503, "learning_rate": 3.37032027434019e-06, "loss": 0.85766053, "num_input_tokens_seen": 50277930, "step": 2345, "time_per_iteration": 2.807892084121704 }, { "auxiliary_loss_clip": 0.01209528, "auxiliary_loss_mlp": 0.01035893, "balance_loss_clip": 1.05791855, "balance_loss_mlp": 1.02497375, "epoch": 0.2820898214393074, "flos": 19973120348160.0, "grad_norm": 2.0434456059580146, "language_loss": 0.83019006, "learning_rate": 3.369752773336141e-06, "loss": 0.85264426, "num_input_tokens_seen": 50297410, "step": 2346, "time_per_iteration": 2.8319475650787354 }, { "auxiliary_loss_clip": 0.01201139, "auxiliary_loss_mlp": 0.01038171, "balance_loss_clip": 1.05961478, "balance_loss_mlp": 1.02731669, "epoch": 0.2822100643299465, "flos": 22528308188160.0, "grad_norm": 1.7361764706529204, "language_loss": 0.78452861, "learning_rate": 3.3691850645416864e-06, "loss": 0.80692166, "num_input_tokens_seen": 50317120, "step": 2347, "time_per_iteration": 2.8129427433013916 }, { "auxiliary_loss_clip": 0.01206877, "auxiliary_loss_mlp": 0.01033251, "balance_loss_clip": 1.05862963, "balance_loss_mlp": 1.02398813, "epoch": 0.2823303072205856, "flos": 11546682007680.0, "grad_norm": 4.519292742800578, "language_loss": 0.83020604, "learning_rate": 3.368617148042945e-06, "loss": 0.85260725, "num_input_tokens_seen": 50334790, "step": 2348, "time_per_iteration": 2.6765503883361816 }, { "auxiliary_loss_clip": 0.01195308, "auxiliary_loss_mlp": 0.01037197, "balance_loss_clip": 1.05741036, "balance_loss_mlp": 1.02578318, "epoch": 0.28245055011122466, "flos": 18259894281600.0, "grad_norm": 1.7218761418502362, "language_loss": 0.84633303, "learning_rate": 3.368049023926071e-06, "loss": 0.86865807, "num_input_tokens_seen": 50353785, "step": 2349, "time_per_iteration": 2.846416473388672 }, { "auxiliary_loss_clip": 0.01198923, "auxiliary_loss_mlp": 0.01032195, "balance_loss_clip": 1.05858696, "balance_loss_mlp": 1.02324283, "epoch": 0.2825707930018638, "flos": 24608110504320.0, "grad_norm": 1.6185800132121, "language_loss": 0.83422405, "learning_rate": 3.3674806922772476e-06, "loss": 0.8565352, "num_input_tokens_seen": 50374670, "step": 2350, "time_per_iteration": 2.8348724842071533 }, { "auxiliary_loss_clip": 0.01196206, "auxiliary_loss_mlp": 0.0102782, "balance_loss_clip": 1.0561955, "balance_loss_mlp": 1.01781225, "epoch": 0.28269103589250283, "flos": 25226994862080.0, "grad_norm": 1.835958242656441, "language_loss": 0.7485798, "learning_rate": 3.3669121531826904e-06, "loss": 0.77082002, "num_input_tokens_seen": 50395650, "step": 2351, "time_per_iteration": 2.9145946502685547 }, { "auxiliary_loss_clip": 0.01191113, "auxiliary_loss_mlp": 0.01032306, "balance_loss_clip": 1.05996478, "balance_loss_mlp": 1.02266765, "epoch": 0.28281127878314194, "flos": 19281552819840.0, "grad_norm": 2.0185998768976967, "language_loss": 0.83345324, "learning_rate": 3.366343406728647e-06, "loss": 0.85568744, "num_input_tokens_seen": 50415100, "step": 2352, "time_per_iteration": 2.7658984661102295 }, { "auxiliary_loss_clip": 0.01197122, "auxiliary_loss_mlp": 0.01032014, "balance_loss_clip": 1.05709088, "balance_loss_mlp": 1.02240014, "epoch": 0.28293152167378105, "flos": 23878405710720.0, "grad_norm": 1.6104355037587432, "language_loss": 0.68626344, "learning_rate": 3.3657744530013946e-06, "loss": 0.70855474, "num_input_tokens_seen": 50434335, "step": 2353, "time_per_iteration": 2.7818963527679443 }, { "auxiliary_loss_clip": 0.01205596, "auxiliary_loss_mlp": 0.01029339, "balance_loss_clip": 1.05754769, "balance_loss_mlp": 1.0194025, "epoch": 0.2830517645644201, "flos": 43866965928960.0, "grad_norm": 2.361255222903452, "language_loss": 0.71627915, "learning_rate": 3.3652052920872437e-06, "loss": 0.73862851, "num_input_tokens_seen": 50457200, "step": 2354, "time_per_iteration": 3.0002989768981934 }, { "auxiliary_loss_clip": 0.01198887, "auxiliary_loss_mlp": 0.01035539, "balance_loss_clip": 1.05752206, "balance_loss_mlp": 1.02552569, "epoch": 0.2831720074550592, "flos": 26651750803200.0, "grad_norm": 1.9251992565420526, "language_loss": 0.85669339, "learning_rate": 3.3646359240725355e-06, "loss": 0.87903768, "num_input_tokens_seen": 50476390, "step": 2355, "time_per_iteration": 2.9021284580230713 }, { "auxiliary_loss_clip": 0.01197121, "auxiliary_loss_mlp": 0.01065608, "balance_loss_clip": 1.05842566, "balance_loss_mlp": 1.02920449, "epoch": 0.2832922503456983, "flos": 31029979564800.0, "grad_norm": 2.0908579620639585, "language_loss": 0.67842954, "learning_rate": 3.364066349043643e-06, "loss": 0.70105684, "num_input_tokens_seen": 50497595, "step": 2356, "time_per_iteration": 2.850151538848877 }, { "auxiliary_loss_clip": 0.01193045, "auxiliary_loss_mlp": 0.0102698, "balance_loss_clip": 1.0555346, "balance_loss_mlp": 1.0176878, "epoch": 0.2834124932363374, "flos": 20405699838720.0, "grad_norm": 1.754255906257202, "language_loss": 0.82102394, "learning_rate": 3.363496567086969e-06, "loss": 0.84322423, "num_input_tokens_seen": 50514690, "step": 2357, "time_per_iteration": 3.8411166667938232 }, { "auxiliary_loss_clip": 0.01203185, "auxiliary_loss_mlp": 0.0103728, "balance_loss_clip": 1.05615366, "balance_loss_mlp": 1.02743959, "epoch": 0.2835327361269765, "flos": 39384848056320.0, "grad_norm": 1.863534541505176, "language_loss": 0.75661534, "learning_rate": 3.3629265782889506e-06, "loss": 0.77901995, "num_input_tokens_seen": 50536515, "step": 2358, "time_per_iteration": 2.8791959285736084 }, { "auxiliary_loss_clip": 0.0119207, "auxiliary_loss_mlp": 0.01033299, "balance_loss_clip": 1.05734944, "balance_loss_mlp": 1.02283835, "epoch": 0.2836529790176156, "flos": 30261598801920.0, "grad_norm": 2.279177622206951, "language_loss": 0.72170246, "learning_rate": 3.362356382736054e-06, "loss": 0.74395609, "num_input_tokens_seen": 50557120, "step": 2359, "time_per_iteration": 2.954113245010376 }, { "auxiliary_loss_clip": 0.01192069, "auxiliary_loss_mlp": 0.01032573, "balance_loss_clip": 1.05232549, "balance_loss_mlp": 1.02296484, "epoch": 0.28377322190825466, "flos": 12677796264960.0, "grad_norm": 2.0187333434392243, "language_loss": 0.90960753, "learning_rate": 3.361785980514777e-06, "loss": 0.93185389, "num_input_tokens_seen": 50573320, "step": 2360, "time_per_iteration": 2.8893232345581055 }, { "auxiliary_loss_clip": 0.0117818, "auxiliary_loss_mlp": 0.01040051, "balance_loss_clip": 1.05645609, "balance_loss_mlp": 1.03038347, "epoch": 0.28389346479889377, "flos": 18296666830080.0, "grad_norm": 1.9325621103927473, "language_loss": 0.76950139, "learning_rate": 3.361215371711649e-06, "loss": 0.79168367, "num_input_tokens_seen": 50592415, "step": 2361, "time_per_iteration": 3.858900308609009 }, { "auxiliary_loss_clip": 0.01182633, "auxiliary_loss_mlp": 0.01028777, "balance_loss_clip": 1.05193043, "balance_loss_mlp": 1.01977134, "epoch": 0.2840137076895329, "flos": 20406992728320.0, "grad_norm": 1.8241084262703957, "language_loss": 0.83395106, "learning_rate": 3.3606445564132326e-06, "loss": 0.85606515, "num_input_tokens_seen": 50609710, "step": 2362, "time_per_iteration": 3.8324244022369385 }, { "auxiliary_loss_clip": 0.01206392, "auxiliary_loss_mlp": 0.010685, "balance_loss_clip": 1.0599668, "balance_loss_mlp": 1.0329411, "epoch": 0.28413395058017193, "flos": 20048030161920.0, "grad_norm": 2.056220248735175, "language_loss": 0.82049394, "learning_rate": 3.360073534706118e-06, "loss": 0.84324288, "num_input_tokens_seen": 50626865, "step": 2363, "time_per_iteration": 3.786370038986206 }, { "auxiliary_loss_clip": 0.01198982, "auxiliary_loss_mlp": 0.01037422, "balance_loss_clip": 1.05909681, "balance_loss_mlp": 1.02772987, "epoch": 0.28425419347081105, "flos": 37663613256960.0, "grad_norm": 2.1099742502210557, "language_loss": 0.75903362, "learning_rate": 3.35950230667693e-06, "loss": 0.78139764, "num_input_tokens_seen": 50648560, "step": 2364, "time_per_iteration": 2.906193494796753 }, { "auxiliary_loss_clip": 0.01197696, "auxiliary_loss_mlp": 0.0103568, "balance_loss_clip": 1.05335987, "balance_loss_mlp": 1.02633405, "epoch": 0.28437443636145016, "flos": 13845072539520.0, "grad_norm": 2.3540173510388294, "language_loss": 0.86222064, "learning_rate": 3.358930872412323e-06, "loss": 0.88455433, "num_input_tokens_seen": 50665725, "step": 2365, "time_per_iteration": 2.6782352924346924 }, { "auxiliary_loss_clip": 0.01196123, "auxiliary_loss_mlp": 0.01035626, "balance_loss_clip": 1.0576272, "balance_loss_mlp": 1.02616704, "epoch": 0.2844946792520892, "flos": 22747794243840.0, "grad_norm": 1.5050885448559188, "language_loss": 0.80939043, "learning_rate": 3.3583592319989825e-06, "loss": 0.83170795, "num_input_tokens_seen": 50685095, "step": 2366, "time_per_iteration": 2.716048002243042 }, { "auxiliary_loss_clip": 0.01209424, "auxiliary_loss_mlp": 0.01037677, "balance_loss_clip": 1.05992913, "balance_loss_mlp": 1.02744317, "epoch": 0.2846149221427283, "flos": 32415987709440.0, "grad_norm": 2.2459194423452535, "language_loss": 0.68755651, "learning_rate": 3.357787385523627e-06, "loss": 0.71002746, "num_input_tokens_seen": 50706500, "step": 2367, "time_per_iteration": 2.8044376373291016 }, { "auxiliary_loss_clip": 0.01192255, "auxiliary_loss_mlp": 0.01031095, "balance_loss_clip": 1.05542874, "balance_loss_mlp": 1.02174926, "epoch": 0.2847351650333674, "flos": 28475976873600.0, "grad_norm": 2.4511491430767265, "language_loss": 0.83063984, "learning_rate": 3.3572153330730048e-06, "loss": 0.85287333, "num_input_tokens_seen": 50727595, "step": 2368, "time_per_iteration": 2.865941047668457 }, { "auxiliary_loss_clip": 0.01094837, "auxiliary_loss_mlp": 0.01011738, "balance_loss_clip": 1.01902628, "balance_loss_mlp": 1.00971127, "epoch": 0.2848554079240065, "flos": 55753399704960.0, "grad_norm": 0.86032244528326, "language_loss": 0.64692056, "learning_rate": 3.3566430747338956e-06, "loss": 0.66798627, "num_input_tokens_seen": 50782800, "step": 2369, "time_per_iteration": 3.203667163848877 }, { "auxiliary_loss_clip": 0.01200056, "auxiliary_loss_mlp": 0.01029456, "balance_loss_clip": 1.05537081, "balance_loss_mlp": 1.0199852, "epoch": 0.2849756508146456, "flos": 11836875985920.0, "grad_norm": 2.1208599440057654, "language_loss": 0.86562741, "learning_rate": 3.35607061059311e-06, "loss": 0.88792253, "num_input_tokens_seen": 50797730, "step": 2370, "time_per_iteration": 2.7114603519439697 }, { "auxiliary_loss_clip": 0.01200902, "auxiliary_loss_mlp": 0.01032232, "balance_loss_clip": 1.05654824, "balance_loss_mlp": 1.02196229, "epoch": 0.28509589370528465, "flos": 25155209531520.0, "grad_norm": 2.3517148534268095, "language_loss": 0.74876857, "learning_rate": 3.3554979407374917e-06, "loss": 0.77109993, "num_input_tokens_seen": 50819840, "step": 2371, "time_per_iteration": 2.832796096801758 }, { "auxiliary_loss_clip": 0.01198991, "auxiliary_loss_mlp": 0.01035092, "balance_loss_clip": 1.05538034, "balance_loss_mlp": 1.02548385, "epoch": 0.28521613659592376, "flos": 19974808287360.0, "grad_norm": 1.5219466232623728, "language_loss": 0.73728055, "learning_rate": 3.3549250652539134e-06, "loss": 0.75962138, "num_input_tokens_seen": 50838935, "step": 2372, "time_per_iteration": 2.7776126861572266 }, { "auxiliary_loss_clip": 0.01198942, "auxiliary_loss_mlp": 0.01033759, "balance_loss_clip": 1.05799174, "balance_loss_mlp": 1.02438354, "epoch": 0.2853363794865629, "flos": 23367971491200.0, "grad_norm": 2.3019224533710925, "language_loss": 0.81406903, "learning_rate": 3.3543519842292794e-06, "loss": 0.83639598, "num_input_tokens_seen": 50858590, "step": 2373, "time_per_iteration": 2.798180103302002 }, { "auxiliary_loss_clip": 0.01204808, "auxiliary_loss_mlp": 0.01055644, "balance_loss_clip": 1.05874014, "balance_loss_mlp": 1.02128005, "epoch": 0.28545662237720193, "flos": 19861940776320.0, "grad_norm": 2.1952109966721376, "language_loss": 0.83694041, "learning_rate": 3.353778697750527e-06, "loss": 0.85954487, "num_input_tokens_seen": 50876995, "step": 2374, "time_per_iteration": 2.7505085468292236 }, { "auxiliary_loss_clip": 0.01190573, "auxiliary_loss_mlp": 0.01031256, "balance_loss_clip": 1.05724955, "balance_loss_mlp": 1.02176666, "epoch": 0.28557686526784104, "flos": 23879016241920.0, "grad_norm": 1.7795645094600745, "language_loss": 0.89401877, "learning_rate": 3.353205205904622e-06, "loss": 0.91623712, "num_input_tokens_seen": 50896105, "step": 2375, "time_per_iteration": 2.7508139610290527 }, { "auxiliary_loss_clip": 0.0119406, "auxiliary_loss_mlp": 0.01034718, "balance_loss_clip": 1.05602503, "balance_loss_mlp": 1.02513957, "epoch": 0.28569710815848015, "flos": 44890384233600.0, "grad_norm": 1.9441068746686405, "language_loss": 0.72245014, "learning_rate": 3.3526315087785637e-06, "loss": 0.74473792, "num_input_tokens_seen": 50917220, "step": 2376, "time_per_iteration": 2.963944911956787 }, { "auxiliary_loss_clip": 0.01175626, "auxiliary_loss_mlp": 0.0102987, "balance_loss_clip": 1.0573498, "balance_loss_mlp": 1.01993942, "epoch": 0.2858173510491192, "flos": 26829759628800.0, "grad_norm": 1.5680195516636732, "language_loss": 0.81016272, "learning_rate": 3.3520576064593805e-06, "loss": 0.83221769, "num_input_tokens_seen": 50937175, "step": 2377, "time_per_iteration": 2.841452121734619 }, { "auxiliary_loss_clip": 0.01205162, "auxiliary_loss_mlp": 0.01037943, "balance_loss_clip": 1.05802011, "balance_loss_mlp": 1.02844191, "epoch": 0.2859375939397583, "flos": 23148916398720.0, "grad_norm": 1.42008059489987, "language_loss": 0.81538343, "learning_rate": 3.3514834990341337e-06, "loss": 0.83781445, "num_input_tokens_seen": 50957500, "step": 2378, "time_per_iteration": 2.8606531620025635 }, { "auxiliary_loss_clip": 0.01200755, "auxiliary_loss_mlp": 0.01039749, "balance_loss_clip": 1.05683053, "balance_loss_mlp": 1.03073072, "epoch": 0.2860578368303974, "flos": 12129799397760.0, "grad_norm": 2.4551270377954917, "language_loss": 0.92950475, "learning_rate": 3.3509091865899144e-06, "loss": 0.95190972, "num_input_tokens_seen": 50972690, "step": 2379, "time_per_iteration": 2.741229772567749 }, { "auxiliary_loss_clip": 0.01204518, "auxiliary_loss_mlp": 0.01031487, "balance_loss_clip": 1.05760336, "balance_loss_mlp": 1.02135444, "epoch": 0.2861780797210365, "flos": 19938035738880.0, "grad_norm": 2.170525126278678, "language_loss": 0.70814335, "learning_rate": 3.350334669213846e-06, "loss": 0.73050344, "num_input_tokens_seen": 50990095, "step": 2380, "time_per_iteration": 2.7001919746398926 }, { "auxiliary_loss_clip": 0.01199179, "auxiliary_loss_mlp": 0.01038163, "balance_loss_clip": 1.06060147, "balance_loss_mlp": 1.0277797, "epoch": 0.2862983226116756, "flos": 27563127609600.0, "grad_norm": 2.188721617552355, "language_loss": 0.75739145, "learning_rate": 3.3497599469930816e-06, "loss": 0.77976483, "num_input_tokens_seen": 51008305, "step": 2381, "time_per_iteration": 2.7839980125427246 }, { "auxiliary_loss_clip": 0.01203816, "auxiliary_loss_mlp": 0.01031651, "balance_loss_clip": 1.05605829, "balance_loss_mlp": 1.022084, "epoch": 0.28641856550231465, "flos": 22053964158720.0, "grad_norm": 2.7547288779199546, "language_loss": 0.83146894, "learning_rate": 3.349185020014807e-06, "loss": 0.85382366, "num_input_tokens_seen": 51025570, "step": 2382, "time_per_iteration": 2.7800984382629395 }, { "auxiliary_loss_clip": 0.01203236, "auxiliary_loss_mlp": 0.01031396, "balance_loss_clip": 1.0572114, "balance_loss_mlp": 1.02154303, "epoch": 0.28653880839295376, "flos": 22378775869440.0, "grad_norm": 1.705390005991622, "language_loss": 0.74500161, "learning_rate": 3.348609888366237e-06, "loss": 0.76734793, "num_input_tokens_seen": 51044585, "step": 2383, "time_per_iteration": 2.956735849380493 }, { "auxiliary_loss_clip": 0.01180999, "auxiliary_loss_mlp": 0.01031084, "balance_loss_clip": 1.05488181, "balance_loss_mlp": 1.02146351, "epoch": 0.28665905128359287, "flos": 23367971491200.0, "grad_norm": 2.690424563524348, "language_loss": 0.63066411, "learning_rate": 3.348034552134619e-06, "loss": 0.65278494, "num_input_tokens_seen": 51063990, "step": 2384, "time_per_iteration": 3.748319387435913 }, { "auxiliary_loss_clip": 0.01181903, "auxiliary_loss_mlp": 0.01029863, "balance_loss_clip": 1.05709469, "balance_loss_mlp": 1.02043962, "epoch": 0.2867792941742319, "flos": 20881695893760.0, "grad_norm": 1.9684401008800356, "language_loss": 0.84553242, "learning_rate": 3.3474590114072316e-06, "loss": 0.86765003, "num_input_tokens_seen": 51081990, "step": 2385, "time_per_iteration": 2.7771761417388916 }, { "auxiliary_loss_clip": 0.01187742, "auxiliary_loss_mlp": 0.01031363, "balance_loss_clip": 1.05642176, "balance_loss_mlp": 1.0212307, "epoch": 0.28689953706487104, "flos": 20664005518080.0, "grad_norm": 2.064705583435299, "language_loss": 0.8314656, "learning_rate": 3.3468832662713836e-06, "loss": 0.85365671, "num_input_tokens_seen": 51100235, "step": 2386, "time_per_iteration": 2.8140859603881836 }, { "auxiliary_loss_clip": 0.01190899, "auxiliary_loss_mlp": 0.01037356, "balance_loss_clip": 1.05780339, "balance_loss_mlp": 1.02792025, "epoch": 0.28701977995551015, "flos": 12675533708160.0, "grad_norm": 2.548919934083795, "language_loss": 0.84228075, "learning_rate": 3.346307316814415e-06, "loss": 0.86456329, "num_input_tokens_seen": 51115405, "step": 2387, "time_per_iteration": 2.8612403869628906 }, { "auxiliary_loss_clip": 0.01199976, "auxiliary_loss_mlp": 0.01033954, "balance_loss_clip": 1.0585537, "balance_loss_mlp": 1.02363086, "epoch": 0.2871400228461492, "flos": 21252366293760.0, "grad_norm": 1.9532791105371474, "language_loss": 0.75592196, "learning_rate": 3.3457311631236965e-06, "loss": 0.7782613, "num_input_tokens_seen": 51136390, "step": 2388, "time_per_iteration": 4.9400246143341064 }, { "auxiliary_loss_clip": 0.01186968, "auxiliary_loss_mlp": 0.01034268, "balance_loss_clip": 1.05745053, "balance_loss_mlp": 1.02398014, "epoch": 0.2872602657367883, "flos": 25119262995840.0, "grad_norm": 2.0984602480850016, "language_loss": 0.84493601, "learning_rate": 3.345154805286631e-06, "loss": 0.86714828, "num_input_tokens_seen": 51156650, "step": 2389, "time_per_iteration": 3.9006528854370117 }, { "auxiliary_loss_clip": 0.01193039, "auxiliary_loss_mlp": 0.01031552, "balance_loss_clip": 1.0562762, "balance_loss_mlp": 1.02186584, "epoch": 0.2873805086274274, "flos": 16646606830080.0, "grad_norm": 2.473844875732605, "language_loss": 0.76544452, "learning_rate": 3.344578243390651e-06, "loss": 0.78769046, "num_input_tokens_seen": 51172210, "step": 2390, "time_per_iteration": 2.7417032718658447 }, { "auxiliary_loss_clip": 0.01194956, "auxiliary_loss_mlp": 0.01028193, "balance_loss_clip": 1.05700147, "balance_loss_mlp": 1.01882923, "epoch": 0.2875007515180665, "flos": 17420123237760.0, "grad_norm": 2.3651881859367094, "language_loss": 0.78378725, "learning_rate": 3.3440014775232206e-06, "loss": 0.80601883, "num_input_tokens_seen": 51190265, "step": 2391, "time_per_iteration": 2.904844284057617 }, { "auxiliary_loss_clip": 0.01194676, "auxiliary_loss_mlp": 0.01032298, "balance_loss_clip": 1.05569863, "balance_loss_mlp": 1.02342296, "epoch": 0.2876209944087056, "flos": 23434190213760.0, "grad_norm": 1.8531617382629457, "language_loss": 0.71151125, "learning_rate": 3.343424507771834e-06, "loss": 0.73378104, "num_input_tokens_seen": 51208475, "step": 2392, "time_per_iteration": 2.8302879333496094 }, { "auxiliary_loss_clip": 0.01188504, "auxiliary_loss_mlp": 0.01035468, "balance_loss_clip": 1.05525672, "balance_loss_mlp": 1.02604485, "epoch": 0.2877412372993447, "flos": 13735509079680.0, "grad_norm": 1.684116765326, "language_loss": 0.86414862, "learning_rate": 3.342847334224018e-06, "loss": 0.8863883, "num_input_tokens_seen": 51225875, "step": 2393, "time_per_iteration": 2.771475076675415 }, { "auxiliary_loss_clip": 0.01091914, "auxiliary_loss_mlp": 0.01002404, "balance_loss_clip": 1.01578736, "balance_loss_mlp": 1.0006752, "epoch": 0.28786148018998375, "flos": 58079695104000.0, "grad_norm": 0.9416674640687213, "language_loss": 0.62354815, "learning_rate": 3.342269956967329e-06, "loss": 0.64449131, "num_input_tokens_seen": 51287780, "step": 2394, "time_per_iteration": 3.369027614593506 }, { "auxiliary_loss_clip": 0.01202488, "auxiliary_loss_mlp": 0.01042206, "balance_loss_clip": 1.05733013, "balance_loss_mlp": 1.03183532, "epoch": 0.28798172308062286, "flos": 23435052140160.0, "grad_norm": 2.5774812572255437, "language_loss": 0.71718884, "learning_rate": 3.341692376089355e-06, "loss": 0.73963571, "num_input_tokens_seen": 51303335, "step": 2395, "time_per_iteration": 2.758943557739258 }, { "auxiliary_loss_clip": 0.01195529, "auxiliary_loss_mlp": 0.01030934, "balance_loss_clip": 1.05640054, "balance_loss_mlp": 1.02223778, "epoch": 0.288101965971262, "flos": 25110033200640.0, "grad_norm": 5.662248855546502, "language_loss": 0.84055334, "learning_rate": 3.3411145916777146e-06, "loss": 0.86281794, "num_input_tokens_seen": 51317495, "step": 2396, "time_per_iteration": 2.7823808193206787 }, { "auxiliary_loss_clip": 0.01195501, "auxiliary_loss_mlp": 0.01036777, "balance_loss_clip": 1.05992675, "balance_loss_mlp": 1.02728176, "epoch": 0.28822220886190103, "flos": 16252559654400.0, "grad_norm": 2.2936965529864493, "language_loss": 0.9081279, "learning_rate": 3.3405366038200566e-06, "loss": 0.93045068, "num_input_tokens_seen": 51336430, "step": 2397, "time_per_iteration": 2.759030342102051 }, { "auxiliary_loss_clip": 0.01201817, "auxiliary_loss_mlp": 0.01034341, "balance_loss_clip": 1.06136811, "balance_loss_mlp": 1.02408862, "epoch": 0.28834245175254014, "flos": 24535642815360.0, "grad_norm": 3.546965091247789, "language_loss": 0.85020804, "learning_rate": 3.3399584126040617e-06, "loss": 0.87256968, "num_input_tokens_seen": 51355930, "step": 2398, "time_per_iteration": 2.780245780944824 }, { "auxiliary_loss_clip": 0.01199855, "auxiliary_loss_mlp": 0.01062489, "balance_loss_clip": 1.05597103, "balance_loss_mlp": 1.02976751, "epoch": 0.2884626946431792, "flos": 24571445696640.0, "grad_norm": 1.8394453928961978, "language_loss": 0.90958667, "learning_rate": 3.339380018117441e-06, "loss": 0.93221009, "num_input_tokens_seen": 51376765, "step": 2399, "time_per_iteration": 2.831808567047119 }, { "auxiliary_loss_clip": 0.01198635, "auxiliary_loss_mlp": 0.01034261, "balance_loss_clip": 1.05915928, "balance_loss_mlp": 1.02549899, "epoch": 0.2885829375338183, "flos": 16544657053440.0, "grad_norm": 3.5305252799378457, "language_loss": 0.77969897, "learning_rate": 3.3388014204479366e-06, "loss": 0.80202788, "num_input_tokens_seen": 51394570, "step": 2400, "time_per_iteration": 2.758114814758301 }, { "auxiliary_loss_clip": 0.01204634, "auxiliary_loss_mlp": 0.0103887, "balance_loss_clip": 1.05681133, "balance_loss_mlp": 1.028934, "epoch": 0.2887031804244574, "flos": 24061226958720.0, "grad_norm": 1.896342665950489, "language_loss": 0.91399825, "learning_rate": 3.338222619683321e-06, "loss": 0.9364332, "num_input_tokens_seen": 51414535, "step": 2401, "time_per_iteration": 2.852858304977417 }, { "auxiliary_loss_clip": 0.01197542, "auxiliary_loss_mlp": 0.01036624, "balance_loss_clip": 1.0564909, "balance_loss_mlp": 1.02700996, "epoch": 0.2888234233150965, "flos": 23330696152320.0, "grad_norm": 3.788534617621484, "language_loss": 0.7382502, "learning_rate": 3.337643615911398e-06, "loss": 0.76059186, "num_input_tokens_seen": 51434160, "step": 2402, "time_per_iteration": 2.7541306018829346 }, { "auxiliary_loss_clip": 0.0120009, "auxiliary_loss_mlp": 0.010303, "balance_loss_clip": 1.05589676, "balance_loss_mlp": 1.02098393, "epoch": 0.2889436662057356, "flos": 22272767856000.0, "grad_norm": 1.9913475436365569, "language_loss": 0.78567922, "learning_rate": 3.3370644092200026e-06, "loss": 0.80798304, "num_input_tokens_seen": 51451435, "step": 2403, "time_per_iteration": 2.7698590755462646 }, { "auxiliary_loss_clip": 0.01181563, "auxiliary_loss_mlp": 0.0102853, "balance_loss_clip": 1.05598855, "balance_loss_mlp": 1.01963735, "epoch": 0.2890639090963747, "flos": 21616931381760.0, "grad_norm": 2.173390452581672, "language_loss": 0.78366661, "learning_rate": 3.3364849996969985e-06, "loss": 0.80576754, "num_input_tokens_seen": 51471455, "step": 2404, "time_per_iteration": 2.7806379795074463 }, { "auxiliary_loss_clip": 0.01198441, "auxiliary_loss_mlp": 0.01032988, "balance_loss_clip": 1.05784512, "balance_loss_mlp": 1.02361834, "epoch": 0.28918415198701375, "flos": 28585540333440.0, "grad_norm": 2.097514677285045, "language_loss": 0.85440958, "learning_rate": 3.335905387430283e-06, "loss": 0.87672383, "num_input_tokens_seen": 51492890, "step": 2405, "time_per_iteration": 2.8974809646606445 }, { "auxiliary_loss_clip": 0.01197962, "auxiliary_loss_mlp": 0.01035559, "balance_loss_clip": 1.05447304, "balance_loss_mlp": 1.02576554, "epoch": 0.28930439487765286, "flos": 21944688007680.0, "grad_norm": 1.7829234698850172, "language_loss": 0.82795578, "learning_rate": 3.335325572507782e-06, "loss": 0.85029101, "num_input_tokens_seen": 51513390, "step": 2406, "time_per_iteration": 2.867621660232544 }, { "auxiliary_loss_clip": 0.01204673, "auxiliary_loss_mlp": 0.01064486, "balance_loss_clip": 1.06052411, "balance_loss_mlp": 1.02782214, "epoch": 0.28942463776829197, "flos": 19281911955840.0, "grad_norm": 1.5742140134220421, "language_loss": 0.73919594, "learning_rate": 3.3347455550174537e-06, "loss": 0.76188755, "num_input_tokens_seen": 51532730, "step": 2407, "time_per_iteration": 2.8099820613861084 }, { "auxiliary_loss_clip": 0.01191993, "auxiliary_loss_mlp": 0.01033706, "balance_loss_clip": 1.05815268, "balance_loss_mlp": 1.02331662, "epoch": 0.289544880658931, "flos": 14645700737280.0, "grad_norm": 1.8975309753139944, "language_loss": 0.68289065, "learning_rate": 3.3341653350472864e-06, "loss": 0.70514756, "num_input_tokens_seen": 51549560, "step": 2408, "time_per_iteration": 2.9292662143707275 }, { "auxiliary_loss_clip": 0.01209932, "auxiliary_loss_mlp": 0.0103239, "balance_loss_clip": 1.0587132, "balance_loss_mlp": 1.02171445, "epoch": 0.28966512354957014, "flos": 28621881918720.0, "grad_norm": 3.2271509729987686, "language_loss": 0.69233215, "learning_rate": 3.333584912685298e-06, "loss": 0.7147553, "num_input_tokens_seen": 51568180, "step": 2409, "time_per_iteration": 2.818631887435913 }, { "auxiliary_loss_clip": 0.01084756, "auxiliary_loss_mlp": 0.01004886, "balance_loss_clip": 1.01527524, "balance_loss_mlp": 1.00318134, "epoch": 0.28978536644020925, "flos": 64711784511360.0, "grad_norm": 0.8829673568697551, "language_loss": 0.55613488, "learning_rate": 3.3330042880195385e-06, "loss": 0.57703131, "num_input_tokens_seen": 51622530, "step": 2410, "time_per_iteration": 4.1833062171936035 }, { "auxiliary_loss_clip": 0.01194312, "auxiliary_loss_mlp": 0.01030325, "balance_loss_clip": 1.05524373, "balance_loss_mlp": 1.02085972, "epoch": 0.2899056093308483, "flos": 18624638937600.0, "grad_norm": 1.7440177915571715, "language_loss": 0.78378272, "learning_rate": 3.3324234611380888e-06, "loss": 0.80602908, "num_input_tokens_seen": 51641260, "step": 2411, "time_per_iteration": 2.772758722305298 }, { "auxiliary_loss_clip": 0.01188887, "auxiliary_loss_mlp": 0.01031407, "balance_loss_clip": 1.05930471, "balance_loss_mlp": 1.02180433, "epoch": 0.2900258522214874, "flos": 22893735202560.0, "grad_norm": 1.7340033639734036, "language_loss": 0.82110918, "learning_rate": 3.3318424321290596e-06, "loss": 0.84331208, "num_input_tokens_seen": 51660975, "step": 2412, "time_per_iteration": 2.769015073776245 }, { "auxiliary_loss_clip": 0.01087253, "auxiliary_loss_mlp": 0.01007706, "balance_loss_clip": 1.01591372, "balance_loss_mlp": 1.00598991, "epoch": 0.2901460951121265, "flos": 71106036013440.0, "grad_norm": 0.8330539962019791, "language_loss": 0.59975433, "learning_rate": 3.3312612010805917e-06, "loss": 0.62070394, "num_input_tokens_seen": 51720550, "step": 2413, "time_per_iteration": 4.3687357902526855 }, { "auxiliary_loss_clip": 0.01188591, "auxiliary_loss_mlp": 0.01030794, "balance_loss_clip": 1.05818892, "balance_loss_mlp": 1.02005351, "epoch": 0.2902663380027656, "flos": 32160986081280.0, "grad_norm": 1.9875929226125417, "language_loss": 0.70228672, "learning_rate": 3.330679768080858e-06, "loss": 0.72448057, "num_input_tokens_seen": 51744435, "step": 2414, "time_per_iteration": 3.8838469982147217 }, { "auxiliary_loss_clip": 0.01197378, "auxiliary_loss_mlp": 0.01031592, "balance_loss_clip": 1.05843163, "balance_loss_mlp": 1.02249062, "epoch": 0.2903865808934047, "flos": 29351658539520.0, "grad_norm": 2.475519274878724, "language_loss": 0.83840609, "learning_rate": 3.3300981332180627e-06, "loss": 0.86069578, "num_input_tokens_seen": 51763640, "step": 2415, "time_per_iteration": 3.8087146282196045 }, { "auxiliary_loss_clip": 0.01195608, "auxiliary_loss_mlp": 0.01037818, "balance_loss_clip": 1.05645454, "balance_loss_mlp": 1.02870464, "epoch": 0.29050682378404374, "flos": 17089026647040.0, "grad_norm": 1.9974034591072811, "language_loss": 0.80183733, "learning_rate": 3.3295162965804373e-06, "loss": 0.82417154, "num_input_tokens_seen": 51782135, "step": 2416, "time_per_iteration": 2.7335174083709717 }, { "auxiliary_loss_clip": 0.01186609, "auxiliary_loss_mlp": 0.01035162, "balance_loss_clip": 1.05533576, "balance_loss_mlp": 1.02564931, "epoch": 0.29062706667468285, "flos": 17858233422720.0, "grad_norm": 4.586067602330447, "language_loss": 0.78757071, "learning_rate": 3.328934258256247e-06, "loss": 0.80978847, "num_input_tokens_seen": 51800200, "step": 2417, "time_per_iteration": 2.8060293197631836 }, { "auxiliary_loss_clip": 0.01194932, "auxiliary_loss_mlp": 0.0103575, "balance_loss_clip": 1.0531615, "balance_loss_mlp": 1.02625561, "epoch": 0.29074730956532197, "flos": 24279815174400.0, "grad_norm": 1.917769679001178, "language_loss": 0.67118758, "learning_rate": 3.3283520183337856e-06, "loss": 0.69349438, "num_input_tokens_seen": 51819905, "step": 2418, "time_per_iteration": 2.833040952682495 }, { "auxiliary_loss_clip": 0.01192371, "auxiliary_loss_mlp": 0.01035117, "balance_loss_clip": 1.05588567, "balance_loss_mlp": 1.02607477, "epoch": 0.290867552455961, "flos": 22340961826560.0, "grad_norm": 1.6483495728831206, "language_loss": 0.68968207, "learning_rate": 3.3277695769013797e-06, "loss": 0.71195692, "num_input_tokens_seen": 51839350, "step": 2419, "time_per_iteration": 2.8832476139068604 }, { "auxiliary_loss_clip": 0.01200043, "auxiliary_loss_mlp": 0.01030208, "balance_loss_clip": 1.05741024, "balance_loss_mlp": 1.02067745, "epoch": 0.29098779534660013, "flos": 23186155824000.0, "grad_norm": 2.3042360594779194, "language_loss": 0.77374804, "learning_rate": 3.327186934047385e-06, "loss": 0.79605055, "num_input_tokens_seen": 51858045, "step": 2420, "time_per_iteration": 2.7861807346343994 }, { "auxiliary_loss_clip": 0.0118678, "auxiliary_loss_mlp": 0.01031556, "balance_loss_clip": 1.05384922, "balance_loss_mlp": 1.02288926, "epoch": 0.29110803823723924, "flos": 15304194817920.0, "grad_norm": 1.9465384869847022, "language_loss": 0.66126478, "learning_rate": 3.3266040898601877e-06, "loss": 0.68344814, "num_input_tokens_seen": 51875880, "step": 2421, "time_per_iteration": 2.8411076068878174 }, { "auxiliary_loss_clip": 0.01186934, "auxiliary_loss_mlp": 0.01030982, "balance_loss_clip": 1.05956244, "balance_loss_mlp": 1.02217865, "epoch": 0.2912282811278783, "flos": 22595352923520.0, "grad_norm": 1.783402880495616, "language_loss": 0.77478117, "learning_rate": 3.3260210444282045e-06, "loss": 0.79696035, "num_input_tokens_seen": 51893835, "step": 2422, "time_per_iteration": 2.783139228820801 }, { "auxiliary_loss_clip": 0.01187421, "auxiliary_loss_mlp": 0.0103579, "balance_loss_clip": 1.05164433, "balance_loss_mlp": 1.02631867, "epoch": 0.2913485240185174, "flos": 24497900599680.0, "grad_norm": 2.2501578030209712, "language_loss": 0.73432785, "learning_rate": 3.325437797839883e-06, "loss": 0.75655997, "num_input_tokens_seen": 51912205, "step": 2423, "time_per_iteration": 2.744779586791992 }, { "auxiliary_loss_clip": 0.01200289, "auxiliary_loss_mlp": 0.01027653, "balance_loss_clip": 1.05385554, "balance_loss_mlp": 1.01790142, "epoch": 0.2914687669091565, "flos": 17931024334080.0, "grad_norm": 2.8189780541503113, "language_loss": 0.74908423, "learning_rate": 3.3248543501837015e-06, "loss": 0.77136368, "num_input_tokens_seen": 51929410, "step": 2424, "time_per_iteration": 2.7833588123321533 }, { "auxiliary_loss_clip": 0.01191085, "auxiliary_loss_mlp": 0.01032269, "balance_loss_clip": 1.05656552, "balance_loss_mlp": 1.023525, "epoch": 0.2915890097997956, "flos": 22529313768960.0, "grad_norm": 1.8634918751603726, "language_loss": 0.77218354, "learning_rate": 3.3242707015481684e-06, "loss": 0.79441708, "num_input_tokens_seen": 51949345, "step": 2425, "time_per_iteration": 2.834021806716919 }, { "auxiliary_loss_clip": 0.01195724, "auxiliary_loss_mlp": 0.01037526, "balance_loss_clip": 1.05556715, "balance_loss_mlp": 1.02834702, "epoch": 0.2917092526904347, "flos": 13845216193920.0, "grad_norm": 1.8752417546255018, "language_loss": 0.80541956, "learning_rate": 3.323686852021823e-06, "loss": 0.82775199, "num_input_tokens_seen": 51966855, "step": 2426, "time_per_iteration": 2.723341464996338 }, { "auxiliary_loss_clip": 0.01197636, "auxiliary_loss_mlp": 0.01029511, "balance_loss_clip": 1.05578685, "balance_loss_mlp": 1.02021909, "epoch": 0.2918294955810738, "flos": 22674859678080.0, "grad_norm": 1.8750435348335621, "language_loss": 0.79643536, "learning_rate": 3.323102801693235e-06, "loss": 0.81870681, "num_input_tokens_seen": 51985620, "step": 2427, "time_per_iteration": 2.7652149200439453 }, { "auxiliary_loss_clip": 0.01193127, "auxiliary_loss_mlp": 0.01031556, "balance_loss_clip": 1.05630767, "balance_loss_mlp": 1.02229905, "epoch": 0.29194973847171285, "flos": 23438284364160.0, "grad_norm": 2.214353116295236, "language_loss": 0.80678099, "learning_rate": 3.322518550651003e-06, "loss": 0.82902783, "num_input_tokens_seen": 52004930, "step": 2428, "time_per_iteration": 2.766371011734009 }, { "auxiliary_loss_clip": 0.01202842, "auxiliary_loss_mlp": 0.0102999, "balance_loss_clip": 1.05689979, "balance_loss_mlp": 1.02017331, "epoch": 0.29206998136235196, "flos": 21909064694400.0, "grad_norm": 1.703951880956063, "language_loss": 0.81045145, "learning_rate": 3.3219340989837586e-06, "loss": 0.83277977, "num_input_tokens_seen": 52024920, "step": 2429, "time_per_iteration": 2.7693958282470703 }, { "auxiliary_loss_clip": 0.01195303, "auxiliary_loss_mlp": 0.01028804, "balance_loss_clip": 1.05708742, "balance_loss_mlp": 1.01986885, "epoch": 0.292190224252991, "flos": 23215925220480.0, "grad_norm": 1.7658338071238002, "language_loss": 0.7993142, "learning_rate": 3.3213494467801625e-06, "loss": 0.82155526, "num_input_tokens_seen": 52044095, "step": 2430, "time_per_iteration": 2.745189905166626 }, { "auxiliary_loss_clip": 0.01184465, "auxiliary_loss_mlp": 0.01028375, "balance_loss_clip": 1.05610418, "balance_loss_mlp": 1.01889789, "epoch": 0.2923104671436301, "flos": 20740818752640.0, "grad_norm": 2.172342590449981, "language_loss": 0.71255231, "learning_rate": 3.3207645941289063e-06, "loss": 0.73468077, "num_input_tokens_seen": 52062440, "step": 2431, "time_per_iteration": 2.7809410095214844 }, { "auxiliary_loss_clip": 0.01195908, "auxiliary_loss_mlp": 0.01053075, "balance_loss_clip": 1.05553865, "balance_loss_mlp": 1.02107859, "epoch": 0.29243071003426924, "flos": 35809114999680.0, "grad_norm": 1.7618343615010257, "language_loss": 0.80413258, "learning_rate": 3.320179541118711e-06, "loss": 0.82662249, "num_input_tokens_seen": 52084940, "step": 2432, "time_per_iteration": 2.8482301235198975 }, { "auxiliary_loss_clip": 0.01093059, "auxiliary_loss_mlp": 0.01015143, "balance_loss_clip": 1.0168941, "balance_loss_mlp": 1.01330698, "epoch": 0.2925509529249083, "flos": 58081598524800.0, "grad_norm": 1.0053793382021294, "language_loss": 0.60338479, "learning_rate": 3.3195942878383293e-06, "loss": 0.62446684, "num_input_tokens_seen": 52141040, "step": 2433, "time_per_iteration": 3.2967798709869385 }, { "auxiliary_loss_clip": 0.01197466, "auxiliary_loss_mlp": 0.0103266, "balance_loss_clip": 1.05533195, "balance_loss_mlp": 1.02293885, "epoch": 0.2926711958155474, "flos": 21397122103680.0, "grad_norm": 12.633039192651479, "language_loss": 0.78033161, "learning_rate": 3.319008834376543e-06, "loss": 0.80263293, "num_input_tokens_seen": 52160730, "step": 2434, "time_per_iteration": 2.8146345615386963 }, { "auxiliary_loss_clip": 0.01195822, "auxiliary_loss_mlp": 0.01027467, "balance_loss_clip": 1.05276966, "balance_loss_mlp": 1.01888371, "epoch": 0.2927914387061865, "flos": 23185796688000.0, "grad_norm": 2.358684223645502, "language_loss": 0.88769317, "learning_rate": 3.3184231808221654e-06, "loss": 0.90992606, "num_input_tokens_seen": 52175055, "step": 2435, "time_per_iteration": 2.8418593406677246 }, { "auxiliary_loss_clip": 0.01193678, "auxiliary_loss_mlp": 0.01037192, "balance_loss_clip": 1.05735672, "balance_loss_mlp": 1.02741051, "epoch": 0.29291168159682557, "flos": 22455553190400.0, "grad_norm": 2.1133938651123905, "language_loss": 0.63162166, "learning_rate": 3.3178373272640394e-06, "loss": 0.65393031, "num_input_tokens_seen": 52194150, "step": 2436, "time_per_iteration": 3.762317419052124 }, { "auxiliary_loss_clip": 0.01199541, "auxiliary_loss_mlp": 0.0103438, "balance_loss_clip": 1.05533278, "balance_loss_mlp": 1.02486086, "epoch": 0.2930319244874647, "flos": 21170632896000.0, "grad_norm": 2.198951105404419, "language_loss": 0.85489249, "learning_rate": 3.3172512737910387e-06, "loss": 0.87723166, "num_input_tokens_seen": 52211660, "step": 2437, "time_per_iteration": 2.771275043487549 }, { "auxiliary_loss_clip": 0.01200459, "auxiliary_loss_mlp": 0.01033059, "balance_loss_clip": 1.05774379, "balance_loss_mlp": 1.02455389, "epoch": 0.2931521673781038, "flos": 31357843931520.0, "grad_norm": 2.155950891737508, "language_loss": 0.88106835, "learning_rate": 3.3166650204920674e-06, "loss": 0.90340352, "num_input_tokens_seen": 52232830, "step": 2438, "time_per_iteration": 2.8417632579803467 }, { "auxiliary_loss_clip": 0.01197776, "auxiliary_loss_mlp": 0.01039234, "balance_loss_clip": 1.05835223, "balance_loss_mlp": 1.02994156, "epoch": 0.29327241026874284, "flos": 24200990778240.0, "grad_norm": 1.5840412516307667, "language_loss": 0.81513762, "learning_rate": 3.316078567456059e-06, "loss": 0.83750767, "num_input_tokens_seen": 52250670, "step": 2439, "time_per_iteration": 3.7127554416656494 }, { "auxiliary_loss_clip": 0.0118994, "auxiliary_loss_mlp": 0.0103011, "balance_loss_clip": 1.05600929, "balance_loss_mlp": 1.02100253, "epoch": 0.29339265315938196, "flos": 24242611662720.0, "grad_norm": 1.5577360321673221, "language_loss": 0.75588238, "learning_rate": 3.3154919147719786e-06, "loss": 0.77808285, "num_input_tokens_seen": 52271685, "step": 2440, "time_per_iteration": 4.117595672607422 }, { "auxiliary_loss_clip": 0.01195446, "auxiliary_loss_mlp": 0.01030652, "balance_loss_clip": 1.05416059, "balance_loss_mlp": 1.02150893, "epoch": 0.29351289605002107, "flos": 16946641134720.0, "grad_norm": 2.010130406985086, "language_loss": 0.86771369, "learning_rate": 3.31490506252882e-06, "loss": 0.88997465, "num_input_tokens_seen": 52291065, "step": 2441, "time_per_iteration": 3.8586134910583496 }, { "auxiliary_loss_clip": 0.01187119, "auxiliary_loss_mlp": 0.01029347, "balance_loss_clip": 1.05608916, "balance_loss_mlp": 1.02044213, "epoch": 0.2936331389406601, "flos": 19829082810240.0, "grad_norm": 1.8736656790131985, "language_loss": 0.83947176, "learning_rate": 3.31431801081561e-06, "loss": 0.8616364, "num_input_tokens_seen": 52310000, "step": 2442, "time_per_iteration": 2.75418758392334 }, { "auxiliary_loss_clip": 0.01091507, "auxiliary_loss_mlp": 0.01003243, "balance_loss_clip": 1.02182186, "balance_loss_mlp": 1.00141943, "epoch": 0.29375338183129923, "flos": 71416844398080.0, "grad_norm": 0.893425527895405, "language_loss": 0.67892545, "learning_rate": 3.313730759721402e-06, "loss": 0.69987291, "num_input_tokens_seen": 52372930, "step": 2443, "time_per_iteration": 3.3383829593658447 }, { "auxiliary_loss_clip": 0.01191393, "auxiliary_loss_mlp": 0.01034964, "balance_loss_clip": 1.05676103, "balance_loss_mlp": 1.02526653, "epoch": 0.29387362472193834, "flos": 22054502862720.0, "grad_norm": 2.3052141814396654, "language_loss": 0.86267626, "learning_rate": 3.313143309335282e-06, "loss": 0.88493979, "num_input_tokens_seen": 52391420, "step": 2444, "time_per_iteration": 2.744215488433838 }, { "auxiliary_loss_clip": 0.01188511, "auxiliary_loss_mlp": 0.01031381, "balance_loss_clip": 1.05645537, "balance_loss_mlp": 1.02229106, "epoch": 0.2939938676125774, "flos": 22966418373120.0, "grad_norm": 1.801593562657932, "language_loss": 0.85086453, "learning_rate": 3.3125556597463665e-06, "loss": 0.87306345, "num_input_tokens_seen": 52410725, "step": 2445, "time_per_iteration": 2.898440361022949 }, { "auxiliary_loss_clip": 0.01195957, "auxiliary_loss_mlp": 0.01031472, "balance_loss_clip": 1.05823183, "balance_loss_mlp": 1.02226281, "epoch": 0.2941141105032165, "flos": 31358705857920.0, "grad_norm": 2.2448514130751627, "language_loss": 0.66042513, "learning_rate": 3.311967811043801e-06, "loss": 0.68269938, "num_input_tokens_seen": 52432645, "step": 2446, "time_per_iteration": 2.841738224029541 }, { "auxiliary_loss_clip": 0.01197123, "auxiliary_loss_mlp": 0.01032443, "balance_loss_clip": 1.05850887, "balance_loss_mlp": 1.02351975, "epoch": 0.29423435339385556, "flos": 23222138273280.0, "grad_norm": 2.070184317719521, "language_loss": 0.82095087, "learning_rate": 3.3113797633167617e-06, "loss": 0.84324652, "num_input_tokens_seen": 52450940, "step": 2447, "time_per_iteration": 2.6963672637939453 }, { "auxiliary_loss_clip": 0.01200135, "auxiliary_loss_mlp": 0.01033419, "balance_loss_clip": 1.05581498, "balance_loss_mlp": 1.02305973, "epoch": 0.2943545962844947, "flos": 26864054138880.0, "grad_norm": 2.5194533681771305, "language_loss": 0.69271946, "learning_rate": 3.310791516654455e-06, "loss": 0.71505499, "num_input_tokens_seen": 52468000, "step": 2448, "time_per_iteration": 2.67852520942688 }, { "auxiliary_loss_clip": 0.01198163, "auxiliary_loss_mlp": 0.01034573, "balance_loss_clip": 1.058079, "balance_loss_mlp": 1.02491069, "epoch": 0.2944748391751338, "flos": 20231677422720.0, "grad_norm": 2.1600019209472983, "language_loss": 0.79114097, "learning_rate": 3.3102030711461177e-06, "loss": 0.81346834, "num_input_tokens_seen": 52487575, "step": 2449, "time_per_iteration": 2.7778215408325195 }, { "auxiliary_loss_clip": 0.011914, "auxiliary_loss_mlp": 0.01038022, "balance_loss_clip": 1.05596268, "balance_loss_mlp": 1.02859831, "epoch": 0.29459508206577284, "flos": 15960965045760.0, "grad_norm": 1.9306844862613242, "language_loss": 0.68121946, "learning_rate": 3.3096144268810156e-06, "loss": 0.70351362, "num_input_tokens_seen": 52506335, "step": 2450, "time_per_iteration": 2.6655075550079346 }, { "auxiliary_loss_clip": 0.01192086, "auxiliary_loss_mlp": 0.01034994, "balance_loss_clip": 1.05882752, "balance_loss_mlp": 1.02581513, "epoch": 0.29471532495641195, "flos": 20412882558720.0, "grad_norm": 1.9851185922100971, "language_loss": 0.73336971, "learning_rate": 3.3090255839484462e-06, "loss": 0.75564051, "num_input_tokens_seen": 52524330, "step": 2451, "time_per_iteration": 2.712653160095215 }, { "auxiliary_loss_clip": 0.01196332, "auxiliary_loss_mlp": 0.01032896, "balance_loss_clip": 1.05563521, "balance_loss_mlp": 1.02343059, "epoch": 0.29483556784705106, "flos": 20376576887040.0, "grad_norm": 2.107858091621538, "language_loss": 0.85559595, "learning_rate": 3.3084365424377366e-06, "loss": 0.8778882, "num_input_tokens_seen": 52543095, "step": 2452, "time_per_iteration": 2.7189252376556396 }, { "auxiliary_loss_clip": 0.0109, "auxiliary_loss_mlp": 0.01017049, "balance_loss_clip": 1.02238345, "balance_loss_mlp": 1.01517761, "epoch": 0.2949558107376901, "flos": 68555660595840.0, "grad_norm": 0.7295341776714459, "language_loss": 0.55973852, "learning_rate": 3.307847302438245e-06, "loss": 0.580809, "num_input_tokens_seen": 52597075, "step": 2453, "time_per_iteration": 3.238333225250244 }, { "auxiliary_loss_clip": 0.01176781, "auxiliary_loss_mlp": 0.01036977, "balance_loss_clip": 1.05634069, "balance_loss_mlp": 1.02754772, "epoch": 0.2950760536283292, "flos": 16107085572480.0, "grad_norm": 2.7007416170493688, "language_loss": 0.77468485, "learning_rate": 3.3072578640393562e-06, "loss": 0.79682243, "num_input_tokens_seen": 52614410, "step": 2454, "time_per_iteration": 2.8119521141052246 }, { "auxiliary_loss_clip": 0.01196341, "auxiliary_loss_mlp": 0.01027819, "balance_loss_clip": 1.05773377, "balance_loss_mlp": 1.01844287, "epoch": 0.29519629651896834, "flos": 20483626394880.0, "grad_norm": 2.1860380250626514, "language_loss": 0.79554629, "learning_rate": 3.3066682273304886e-06, "loss": 0.81778789, "num_input_tokens_seen": 52632055, "step": 2455, "time_per_iteration": 2.938156843185425 }, { "auxiliary_loss_clip": 0.01203796, "auxiliary_loss_mlp": 0.01060774, "balance_loss_clip": 1.05862832, "balance_loss_mlp": 1.02499771, "epoch": 0.2953165394096074, "flos": 18916484941440.0, "grad_norm": 5.05804245196484, "language_loss": 0.78651845, "learning_rate": 3.3060783924010904e-06, "loss": 0.80916417, "num_input_tokens_seen": 52649980, "step": 2456, "time_per_iteration": 2.7774550914764404 }, { "auxiliary_loss_clip": 0.01189871, "auxiliary_loss_mlp": 0.01032455, "balance_loss_clip": 1.05515456, "balance_loss_mlp": 1.02382421, "epoch": 0.2954367823002465, "flos": 20624467622400.0, "grad_norm": 2.0948788590041505, "language_loss": 0.8486405, "learning_rate": 3.3054883593406387e-06, "loss": 0.8708638, "num_input_tokens_seen": 52664730, "step": 2457, "time_per_iteration": 2.789485454559326 }, { "auxiliary_loss_clip": 0.01197702, "auxiliary_loss_mlp": 0.01030831, "balance_loss_clip": 1.05651414, "balance_loss_mlp": 1.02103209, "epoch": 0.2955570251908856, "flos": 31175525473920.0, "grad_norm": 1.9487927241876937, "language_loss": 0.65056133, "learning_rate": 3.3048981282386404e-06, "loss": 0.67284667, "num_input_tokens_seen": 52686040, "step": 2458, "time_per_iteration": 2.8480663299560547 }, { "auxiliary_loss_clip": 0.01182112, "auxiliary_loss_mlp": 0.01030684, "balance_loss_clip": 1.0579561, "balance_loss_mlp": 1.02080178, "epoch": 0.29567726808152467, "flos": 21650328051840.0, "grad_norm": 2.362401517909385, "language_loss": 0.82753086, "learning_rate": 3.304307699184634e-06, "loss": 0.84965885, "num_input_tokens_seen": 52704630, "step": 2459, "time_per_iteration": 2.829465389251709 }, { "auxiliary_loss_clip": 0.01195468, "auxiliary_loss_mlp": 0.01029472, "balance_loss_clip": 1.05859613, "balance_loss_mlp": 1.02039421, "epoch": 0.2957975109721638, "flos": 24243868638720.0, "grad_norm": 1.8821647206052363, "language_loss": 0.78906626, "learning_rate": 3.3037170722681866e-06, "loss": 0.81131566, "num_input_tokens_seen": 52725465, "step": 2460, "time_per_iteration": 2.7490217685699463 }, { "auxiliary_loss_clip": 0.01182996, "auxiliary_loss_mlp": 0.01028279, "balance_loss_clip": 1.05645251, "balance_loss_mlp": 1.01893282, "epoch": 0.29591775386280283, "flos": 13479717352320.0, "grad_norm": 1.8388478401103163, "language_loss": 0.67924333, "learning_rate": 3.3031262475788956e-06, "loss": 0.70135605, "num_input_tokens_seen": 52742405, "step": 2461, "time_per_iteration": 2.7901899814605713 }, { "auxiliary_loss_clip": 0.01188918, "auxiliary_loss_mlp": 0.01038131, "balance_loss_clip": 1.05553532, "balance_loss_mlp": 1.02889824, "epoch": 0.29603799675344195, "flos": 17749783284480.0, "grad_norm": 1.7082541633199682, "language_loss": 0.72985089, "learning_rate": 3.3025352252063897e-06, "loss": 0.75212145, "num_input_tokens_seen": 52761100, "step": 2462, "time_per_iteration": 3.9406189918518066 }, { "auxiliary_loss_clip": 0.01195586, "auxiliary_loss_mlp": 0.01029881, "balance_loss_clip": 1.05732012, "balance_loss_mlp": 1.02080941, "epoch": 0.29615823964408106, "flos": 22783920347520.0, "grad_norm": 1.7093662422189408, "language_loss": 0.75050163, "learning_rate": 3.3019440052403252e-06, "loss": 0.77275634, "num_input_tokens_seen": 52780965, "step": 2463, "time_per_iteration": 2.683638095855713 }, { "auxiliary_loss_clip": 0.01196501, "auxiliary_loss_mlp": 0.0103184, "balance_loss_clip": 1.05801749, "balance_loss_mlp": 1.02207637, "epoch": 0.2962784825347201, "flos": 23514199758720.0, "grad_norm": 3.3590278253731394, "language_loss": 0.7049253, "learning_rate": 3.30135258777039e-06, "loss": 0.72720861, "num_input_tokens_seen": 52800335, "step": 2464, "time_per_iteration": 2.7250185012817383 }, { "auxiliary_loss_clip": 0.01202408, "auxiliary_loss_mlp": 0.01054545, "balance_loss_clip": 1.05545259, "balance_loss_mlp": 1.01917791, "epoch": 0.2963987254253592, "flos": 16362769559040.0, "grad_norm": 2.0851865104334792, "language_loss": 0.70762676, "learning_rate": 3.3007609728863024e-06, "loss": 0.73019624, "num_input_tokens_seen": 52818425, "step": 2465, "time_per_iteration": 3.654949188232422 }, { "auxiliary_loss_clip": 0.01173041, "auxiliary_loss_mlp": 0.01033786, "balance_loss_clip": 1.05597472, "balance_loss_mlp": 1.0245471, "epoch": 0.29651896831599833, "flos": 33472263980160.0, "grad_norm": 2.159132119698248, "language_loss": 0.72761869, "learning_rate": 3.300169160677809e-06, "loss": 0.74968696, "num_input_tokens_seen": 52842340, "step": 2466, "time_per_iteration": 2.9427311420440674 }, { "auxiliary_loss_clip": 0.01197163, "auxiliary_loss_mlp": 0.010312, "balance_loss_clip": 1.05781937, "balance_loss_mlp": 1.0215441, "epoch": 0.2966392112066374, "flos": 23805363404160.0, "grad_norm": 2.329092513539347, "language_loss": 0.77445191, "learning_rate": 3.2995771512346878e-06, "loss": 0.79673553, "num_input_tokens_seen": 52860690, "step": 2467, "time_per_iteration": 3.714120626449585 }, { "auxiliary_loss_clip": 0.01205401, "auxiliary_loss_mlp": 0.0105604, "balance_loss_clip": 1.05879772, "balance_loss_mlp": 1.01960397, "epoch": 0.2967594540972765, "flos": 19938466702080.0, "grad_norm": 2.180080870555425, "language_loss": 0.73336524, "learning_rate": 3.298984944646746e-06, "loss": 0.7559796, "num_input_tokens_seen": 52879370, "step": 2468, "time_per_iteration": 2.756208896636963 }, { "auxiliary_loss_clip": 0.01200697, "auxiliary_loss_mlp": 0.01057218, "balance_loss_clip": 1.05834579, "balance_loss_mlp": 1.02303994, "epoch": 0.2968796969879156, "flos": 23732823888000.0, "grad_norm": 1.8937575833557654, "language_loss": 0.81652796, "learning_rate": 3.298392541003822e-06, "loss": 0.83910716, "num_input_tokens_seen": 52898775, "step": 2469, "time_per_iteration": 2.739769458770752 }, { "auxiliary_loss_clip": 0.01195727, "auxiliary_loss_mlp": 0.01028166, "balance_loss_clip": 1.05790591, "balance_loss_mlp": 1.01839101, "epoch": 0.29699993987855466, "flos": 22893699288960.0, "grad_norm": 1.5812066633083495, "language_loss": 0.89538652, "learning_rate": 3.2977999403957806e-06, "loss": 0.91762543, "num_input_tokens_seen": 52917535, "step": 2470, "time_per_iteration": 2.7270150184631348 }, { "auxiliary_loss_clip": 0.01203232, "auxiliary_loss_mlp": 0.0103556, "balance_loss_clip": 1.05949092, "balance_loss_mlp": 1.02604699, "epoch": 0.2971201827691938, "flos": 33832555349760.0, "grad_norm": 2.581821368800556, "language_loss": 0.67324811, "learning_rate": 3.2972071429125207e-06, "loss": 0.69563603, "num_input_tokens_seen": 52938755, "step": 2471, "time_per_iteration": 2.76650333404541 }, { "auxiliary_loss_clip": 0.01186981, "auxiliary_loss_mlp": 0.01028651, "balance_loss_clip": 1.05623269, "balance_loss_mlp": 1.01961493, "epoch": 0.2972404256598329, "flos": 22054359208320.0, "grad_norm": 1.9451805986101354, "language_loss": 0.88341236, "learning_rate": 3.2966141486439682e-06, "loss": 0.90556872, "num_input_tokens_seen": 52957945, "step": 2472, "time_per_iteration": 2.7714900970458984 }, { "auxiliary_loss_clip": 0.01187498, "auxiliary_loss_mlp": 0.01028347, "balance_loss_clip": 1.05669737, "balance_loss_mlp": 1.01825571, "epoch": 0.29736066855047194, "flos": 31978595796480.0, "grad_norm": 5.3611743567984025, "language_loss": 0.64255857, "learning_rate": 3.29602095768008e-06, "loss": 0.66471702, "num_input_tokens_seen": 52978460, "step": 2473, "time_per_iteration": 2.9613037109375 }, { "auxiliary_loss_clip": 0.01186655, "auxiliary_loss_mlp": 0.0102492, "balance_loss_clip": 1.05628645, "balance_loss_mlp": 1.01610494, "epoch": 0.29748091144111105, "flos": 33510401245440.0, "grad_norm": 7.607202652579819, "language_loss": 0.63664889, "learning_rate": 3.2954275701108437e-06, "loss": 0.6587646, "num_input_tokens_seen": 52999640, "step": 2474, "time_per_iteration": 2.8538355827331543 }, { "auxiliary_loss_clip": 0.01181082, "auxiliary_loss_mlp": 0.01026957, "balance_loss_clip": 1.05483937, "balance_loss_mlp": 1.01761127, "epoch": 0.29760115433175016, "flos": 41283373409280.0, "grad_norm": 1.9103921489000075, "language_loss": 0.68488109, "learning_rate": 3.294833986026275e-06, "loss": 0.70696145, "num_input_tokens_seen": 53022880, "step": 2475, "time_per_iteration": 2.971804618835449 }, { "auxiliary_loss_clip": 0.0118561, "auxiliary_loss_mlp": 0.01034243, "balance_loss_clip": 1.05765975, "balance_loss_mlp": 1.02520728, "epoch": 0.2977213972223892, "flos": 24493339572480.0, "grad_norm": 2.0654008435620232, "language_loss": 0.8517828, "learning_rate": 3.29424020551642e-06, "loss": 0.87398136, "num_input_tokens_seen": 53041515, "step": 2476, "time_per_iteration": 2.7461097240448 }, { "auxiliary_loss_clip": 0.01203635, "auxiliary_loss_mlp": 0.01036268, "balance_loss_clip": 1.05695248, "balance_loss_mlp": 1.02585566, "epoch": 0.2978416401130283, "flos": 21285116519040.0, "grad_norm": 1.9972564544265525, "language_loss": 0.72100377, "learning_rate": 3.2936462286713546e-06, "loss": 0.74340284, "num_input_tokens_seen": 53059865, "step": 2477, "time_per_iteration": 2.757887125015259 }, { "auxiliary_loss_clip": 0.01198378, "auxiliary_loss_mlp": 0.01033115, "balance_loss_clip": 1.05608833, "balance_loss_mlp": 1.02345324, "epoch": 0.2979618830036674, "flos": 25772154554880.0, "grad_norm": 2.3915728905944014, "language_loss": 0.77548397, "learning_rate": 3.2930520555811846e-06, "loss": 0.79779893, "num_input_tokens_seen": 53079490, "step": 2478, "time_per_iteration": 2.793642044067383 }, { "auxiliary_loss_clip": 0.01157342, "auxiliary_loss_mlp": 0.01061007, "balance_loss_clip": 1.05476356, "balance_loss_mlp": 1.02691293, "epoch": 0.2980821258943065, "flos": 23476996247040.0, "grad_norm": 2.027510432553691, "language_loss": 0.80278009, "learning_rate": 3.292457686336046e-06, "loss": 0.82496363, "num_input_tokens_seen": 53098810, "step": 2479, "time_per_iteration": 2.9544386863708496 }, { "auxiliary_loss_clip": 0.0109817, "auxiliary_loss_mlp": 0.01001861, "balance_loss_clip": 1.02215385, "balance_loss_mlp": 0.99984628, "epoch": 0.2982023687849456, "flos": 69752314195200.0, "grad_norm": 0.8488568714100809, "language_loss": 0.61221528, "learning_rate": 3.291863121026105e-06, "loss": 0.63321555, "num_input_tokens_seen": 53162590, "step": 2480, "time_per_iteration": 3.3865959644317627 }, { "auxiliary_loss_clip": 0.01197069, "auxiliary_loss_mlp": 0.01033436, "balance_loss_clip": 1.05721915, "balance_loss_mlp": 1.02449512, "epoch": 0.29832261167558466, "flos": 29825930741760.0, "grad_norm": 2.0066652214680176, "language_loss": 0.77151608, "learning_rate": 3.2912683597415547e-06, "loss": 0.7938211, "num_input_tokens_seen": 53186675, "step": 2481, "time_per_iteration": 2.8229808807373047 }, { "auxiliary_loss_clip": 0.01191881, "auxiliary_loss_mlp": 0.01029414, "balance_loss_clip": 1.05595052, "balance_loss_mlp": 1.02011526, "epoch": 0.29844285456622377, "flos": 33910158683520.0, "grad_norm": 2.3799045942784463, "language_loss": 0.7822504, "learning_rate": 3.2906734025726213e-06, "loss": 0.80446339, "num_input_tokens_seen": 53205940, "step": 2482, "time_per_iteration": 2.856456756591797 }, { "auxiliary_loss_clip": 0.0120451, "auxiliary_loss_mlp": 0.01032318, "balance_loss_clip": 1.06063259, "balance_loss_mlp": 1.02346063, "epoch": 0.2985630974568629, "flos": 23876933253120.0, "grad_norm": 3.136166717027694, "language_loss": 0.880229, "learning_rate": 3.290078249609559e-06, "loss": 0.90259731, "num_input_tokens_seen": 53225360, "step": 2483, "time_per_iteration": 2.786628007888794 }, { "auxiliary_loss_clip": 0.01196714, "auxiliary_loss_mlp": 0.0103064, "balance_loss_clip": 1.05894673, "balance_loss_mlp": 1.02118075, "epoch": 0.29868334034750194, "flos": 21799106184960.0, "grad_norm": 2.116715554146535, "language_loss": 0.88302624, "learning_rate": 3.2894829009426514e-06, "loss": 0.90529978, "num_input_tokens_seen": 53243195, "step": 2484, "time_per_iteration": 2.6856298446655273 }, { "auxiliary_loss_clip": 0.01193876, "auxiliary_loss_mlp": 0.01033063, "balance_loss_clip": 1.05706573, "balance_loss_mlp": 1.02390766, "epoch": 0.29880358323814105, "flos": 25666649331840.0, "grad_norm": 2.1358877285253555, "language_loss": 0.77103722, "learning_rate": 3.288887356662213e-06, "loss": 0.79330659, "num_input_tokens_seen": 53264530, "step": 2485, "time_per_iteration": 2.7226827144622803 }, { "auxiliary_loss_clip": 0.01091289, "auxiliary_loss_mlp": 0.01001338, "balance_loss_clip": 1.02014327, "balance_loss_mlp": 0.99933571, "epoch": 0.29892382612878016, "flos": 71005846003200.0, "grad_norm": 0.7673753077980462, "language_loss": 0.59720534, "learning_rate": 3.288291616858588e-06, "loss": 0.61813158, "num_input_tokens_seen": 53319920, "step": 2486, "time_per_iteration": 3.1660666465759277 }, { "auxiliary_loss_clip": 0.01179669, "auxiliary_loss_mlp": 0.01033793, "balance_loss_clip": 1.0569973, "balance_loss_mlp": 1.02455997, "epoch": 0.2990440690194192, "flos": 25481134563840.0, "grad_norm": 1.8076526999047593, "language_loss": 0.76722515, "learning_rate": 3.287695681622149e-06, "loss": 0.78935981, "num_input_tokens_seen": 53339270, "step": 2487, "time_per_iteration": 2.856586217880249 }, { "auxiliary_loss_clip": 0.01199124, "auxiliary_loss_mlp": 0.01035584, "balance_loss_clip": 1.05663681, "balance_loss_mlp": 1.02696538, "epoch": 0.2991643119100583, "flos": 23732357011200.0, "grad_norm": 1.780139050929316, "language_loss": 0.80644155, "learning_rate": 3.2870995510432982e-06, "loss": 0.82878858, "num_input_tokens_seen": 53357750, "step": 2488, "time_per_iteration": 2.7141315937042236 }, { "auxiliary_loss_clip": 0.01192078, "auxiliary_loss_mlp": 0.01032646, "balance_loss_clip": 1.0567795, "balance_loss_mlp": 1.02433658, "epoch": 0.29928455480069743, "flos": 27417545786880.0, "grad_norm": 1.7342062895073194, "language_loss": 0.77197886, "learning_rate": 3.2865032252124697e-06, "loss": 0.79422617, "num_input_tokens_seen": 53378265, "step": 2489, "time_per_iteration": 3.836641788482666 }, { "auxiliary_loss_clip": 0.01192825, "auxiliary_loss_mlp": 0.01030146, "balance_loss_clip": 1.05583763, "balance_loss_mlp": 1.02155113, "epoch": 0.2994047976913365, "flos": 33692935184640.0, "grad_norm": 1.4667659457607667, "language_loss": 0.77795935, "learning_rate": 3.2859067042201243e-06, "loss": 0.80018902, "num_input_tokens_seen": 53400305, "step": 2490, "time_per_iteration": 2.9281177520751953 }, { "auxiliary_loss_clip": 0.01164697, "auxiliary_loss_mlp": 0.01033263, "balance_loss_clip": 1.05759478, "balance_loss_mlp": 1.02476978, "epoch": 0.2995250405819756, "flos": 16763963541120.0, "grad_norm": 2.590460434446562, "language_loss": 0.78085303, "learning_rate": 3.2853099881567544e-06, "loss": 0.8028326, "num_input_tokens_seen": 53418705, "step": 2491, "time_per_iteration": 3.816394090652466 }, { "auxiliary_loss_clip": 0.01196144, "auxiliary_loss_mlp": 0.01034314, "balance_loss_clip": 1.0554291, "balance_loss_mlp": 1.02555776, "epoch": 0.29964528347261465, "flos": 22963976248320.0, "grad_norm": 2.0700947800618987, "language_loss": 0.79161727, "learning_rate": 3.284713077112881e-06, "loss": 0.81392181, "num_input_tokens_seen": 53438135, "step": 2492, "time_per_iteration": 2.7333343029022217 }, { "auxiliary_loss_clip": 0.01197322, "auxiliary_loss_mlp": 0.01037828, "balance_loss_clip": 1.06019878, "balance_loss_mlp": 1.02767682, "epoch": 0.29976552636325376, "flos": 16938021870720.0, "grad_norm": 2.549408651218425, "language_loss": 0.87021786, "learning_rate": 3.284115971179056e-06, "loss": 0.89256936, "num_input_tokens_seen": 53452165, "step": 2493, "time_per_iteration": 4.757082939147949 }, { "auxiliary_loss_clip": 0.01184783, "auxiliary_loss_mlp": 0.01027462, "balance_loss_clip": 1.05767345, "balance_loss_mlp": 1.01943946, "epoch": 0.2998857692538929, "flos": 17056455989760.0, "grad_norm": 4.329348367753728, "language_loss": 0.78419876, "learning_rate": 3.283518670445859e-06, "loss": 0.80632114, "num_input_tokens_seen": 53470075, "step": 2494, "time_per_iteration": 2.740349531173706 }, { "auxiliary_loss_clip": 0.01080989, "auxiliary_loss_mlp": 0.01022428, "balance_loss_clip": 1.01634288, "balance_loss_mlp": 1.00029647, "epoch": 0.30000601214453193, "flos": 68831528025600.0, "grad_norm": 0.6876826711456944, "language_loss": 0.54302657, "learning_rate": 3.2829211750038995e-06, "loss": 0.56406075, "num_input_tokens_seen": 53538705, "step": 2495, "time_per_iteration": 3.384500026702881 }, { "auxiliary_loss_clip": 0.01189259, "auxiliary_loss_mlp": 0.01028389, "balance_loss_clip": 1.05879021, "balance_loss_mlp": 1.01959729, "epoch": 0.30012625503517104, "flos": 17603267708160.0, "grad_norm": 2.2058380218831077, "language_loss": 0.89088589, "learning_rate": 3.2823234849438183e-06, "loss": 0.91306239, "num_input_tokens_seen": 53556740, "step": 2496, "time_per_iteration": 2.782501220703125 }, { "auxiliary_loss_clip": 0.01193295, "auxiliary_loss_mlp": 0.0103066, "balance_loss_clip": 1.05448461, "balance_loss_mlp": 1.02224958, "epoch": 0.30024649792581015, "flos": 21252581775360.0, "grad_norm": 2.590195346435813, "language_loss": 0.75855094, "learning_rate": 3.2817256003562836e-06, "loss": 0.78079051, "num_input_tokens_seen": 53577115, "step": 2497, "time_per_iteration": 2.8158631324768066 }, { "auxiliary_loss_clip": 0.01186311, "auxiliary_loss_mlp": 0.01028338, "balance_loss_clip": 1.0581162, "balance_loss_mlp": 1.01886129, "epoch": 0.3003667408164492, "flos": 23003262748800.0, "grad_norm": 1.8461490596677057, "language_loss": 0.66323936, "learning_rate": 3.281127521331995e-06, "loss": 0.68538582, "num_input_tokens_seen": 53598295, "step": 2498, "time_per_iteration": 2.9295263290405273 }, { "auxiliary_loss_clip": 0.01089706, "auxiliary_loss_mlp": 0.01008077, "balance_loss_clip": 1.01535058, "balance_loss_mlp": 1.00626457, "epoch": 0.3004869837070883, "flos": 64232340750720.0, "grad_norm": 0.8927259131532407, "language_loss": 0.6062746, "learning_rate": 3.2805292479616798e-06, "loss": 0.6272524, "num_input_tokens_seen": 53657160, "step": 2499, "time_per_iteration": 3.270220994949341 }, { "auxiliary_loss_clip": 0.01196176, "auxiliary_loss_mlp": 0.01034478, "balance_loss_clip": 1.05959535, "balance_loss_mlp": 1.02509594, "epoch": 0.30060722659772743, "flos": 26248653400320.0, "grad_norm": 4.995446428285232, "language_loss": 0.92202795, "learning_rate": 3.2799307803360955e-06, "loss": 0.94433451, "num_input_tokens_seen": 53673090, "step": 2500, "time_per_iteration": 2.8218371868133545 }, { "auxiliary_loss_clip": 0.01196311, "auxiliary_loss_mlp": 0.0102692, "balance_loss_clip": 1.05472052, "balance_loss_mlp": 1.01847386, "epoch": 0.3007274694883665, "flos": 24970879912320.0, "grad_norm": 1.6174830302433671, "language_loss": 0.81613052, "learning_rate": 3.27933211854603e-06, "loss": 0.83836281, "num_input_tokens_seen": 53692145, "step": 2501, "time_per_iteration": 2.8019604682922363 }, { "auxiliary_loss_clip": 0.01190108, "auxiliary_loss_mlp": 0.01035989, "balance_loss_clip": 1.05424237, "balance_loss_mlp": 1.02692866, "epoch": 0.3008477123790056, "flos": 17055845458560.0, "grad_norm": 1.655837354890188, "language_loss": 0.87020862, "learning_rate": 3.278733262682299e-06, "loss": 0.89246964, "num_input_tokens_seen": 53710000, "step": 2502, "time_per_iteration": 2.7736165523529053 }, { "auxiliary_loss_clip": 0.01197234, "auxiliary_loss_mlp": 0.01030527, "balance_loss_clip": 1.05459011, "balance_loss_mlp": 1.02084684, "epoch": 0.3009679552696447, "flos": 21506398254720.0, "grad_norm": 2.456073681252683, "language_loss": 0.82410163, "learning_rate": 3.2781342128357484e-06, "loss": 0.84637922, "num_input_tokens_seen": 53729355, "step": 2503, "time_per_iteration": 2.70434832572937 }, { "auxiliary_loss_clip": 0.01190802, "auxiliary_loss_mlp": 0.01025398, "balance_loss_clip": 1.05780661, "balance_loss_mlp": 1.01702332, "epoch": 0.30108819816028376, "flos": 21134004001920.0, "grad_norm": 2.6686589521220885, "language_loss": 0.80312175, "learning_rate": 3.2775349690972547e-06, "loss": 0.82528377, "num_input_tokens_seen": 53743505, "step": 2504, "time_per_iteration": 2.751049757003784 }, { "auxiliary_loss_clip": 0.01087353, "auxiliary_loss_mlp": 0.01001457, "balance_loss_clip": 1.01872468, "balance_loss_mlp": 0.99962157, "epoch": 0.30120844105092287, "flos": 71126434938240.0, "grad_norm": 0.7528169618698503, "language_loss": 0.51810908, "learning_rate": 3.276935531557722e-06, "loss": 0.53899717, "num_input_tokens_seen": 53808725, "step": 2505, "time_per_iteration": 3.409721851348877 }, { "auxiliary_loss_clip": 0.0119289, "auxiliary_loss_mlp": 0.01026235, "balance_loss_clip": 1.05927658, "balance_loss_mlp": 1.01681149, "epoch": 0.301328683941562, "flos": 20264571302400.0, "grad_norm": 2.3147166982706535, "language_loss": 0.80000246, "learning_rate": 3.2763359003080837e-06, "loss": 0.82219374, "num_input_tokens_seen": 53825680, "step": 2506, "time_per_iteration": 2.816732883453369 }, { "auxiliary_loss_clip": 0.01085573, "auxiliary_loss_mlp": 0.01007178, "balance_loss_clip": 1.0135572, "balance_loss_mlp": 1.0049845, "epoch": 0.30144892683220104, "flos": 70648212240000.0, "grad_norm": 0.8085046034005023, "language_loss": 0.62439114, "learning_rate": 3.2757360754393047e-06, "loss": 0.64531863, "num_input_tokens_seen": 53889750, "step": 2507, "time_per_iteration": 3.3776841163635254 }, { "auxiliary_loss_clip": 0.01195773, "auxiliary_loss_mlp": 0.01034706, "balance_loss_clip": 1.05603886, "balance_loss_mlp": 1.02589083, "epoch": 0.30156916972284015, "flos": 22820549241600.0, "grad_norm": 2.568140245546686, "language_loss": 0.64081144, "learning_rate": 3.2751360570423767e-06, "loss": 0.66311622, "num_input_tokens_seen": 53908135, "step": 2508, "time_per_iteration": 2.711700677871704 }, { "auxiliary_loss_clip": 0.01190133, "auxiliary_loss_mlp": 0.01030794, "balance_loss_clip": 1.05621147, "balance_loss_mlp": 1.02203846, "epoch": 0.3016894126134792, "flos": 29899188529920.0, "grad_norm": 2.267347178962572, "language_loss": 0.75643295, "learning_rate": 3.2745358452083236e-06, "loss": 0.77864224, "num_input_tokens_seen": 53931035, "step": 2509, "time_per_iteration": 2.7632460594177246 }, { "auxiliary_loss_clip": 0.01195588, "auxiliary_loss_mlp": 0.01028893, "balance_loss_clip": 1.05552042, "balance_loss_mlp": 1.02070284, "epoch": 0.3018096555041183, "flos": 21546331200000.0, "grad_norm": 1.4845095989935733, "language_loss": 0.82375145, "learning_rate": 3.2739354400281955e-06, "loss": 0.8459962, "num_input_tokens_seen": 53952255, "step": 2510, "time_per_iteration": 2.778061628341675 }, { "auxiliary_loss_clip": 0.01086321, "auxiliary_loss_mlp": 0.01022672, "balance_loss_clip": 1.01326895, "balance_loss_mlp": 1.00176334, "epoch": 0.3019298983947574, "flos": 59136294597120.0, "grad_norm": 0.8641727485416237, "language_loss": 0.63744128, "learning_rate": 3.2733348415930744e-06, "loss": 0.65853119, "num_input_tokens_seen": 54014125, "step": 2511, "time_per_iteration": 3.3599016666412354 }, { "auxiliary_loss_clip": 0.01179915, "auxiliary_loss_mlp": 0.01027893, "balance_loss_clip": 1.05339241, "balance_loss_mlp": 1.01925087, "epoch": 0.3020501412853965, "flos": 34423070941440.0, "grad_norm": 1.9584342536865662, "language_loss": 0.80500782, "learning_rate": 3.27273404999407e-06, "loss": 0.82708585, "num_input_tokens_seen": 54036345, "step": 2512, "time_per_iteration": 2.8080227375030518 }, { "auxiliary_loss_clip": 0.01087736, "auxiliary_loss_mlp": 0.01002081, "balance_loss_clip": 1.01415765, "balance_loss_mlp": 1.00025678, "epoch": 0.3021703841760356, "flos": 71008288128000.0, "grad_norm": 0.8019839001953012, "language_loss": 0.60414445, "learning_rate": 3.272133065322322e-06, "loss": 0.62504262, "num_input_tokens_seen": 54094615, "step": 2513, "time_per_iteration": 3.25403094291687 }, { "auxiliary_loss_clip": 0.01194889, "auxiliary_loss_mlp": 0.0103369, "balance_loss_clip": 1.05318356, "balance_loss_mlp": 1.02495754, "epoch": 0.3022906270666747, "flos": 21510528318720.0, "grad_norm": 1.5568282452287039, "language_loss": 0.79783309, "learning_rate": 3.271531887669e-06, "loss": 0.8201189, "num_input_tokens_seen": 54114675, "step": 2514, "time_per_iteration": 3.66056227684021 }, { "auxiliary_loss_clip": 0.01189299, "auxiliary_loss_mlp": 0.01031058, "balance_loss_clip": 1.05416286, "balance_loss_mlp": 1.02214122, "epoch": 0.30241086995731375, "flos": 31132001168640.0, "grad_norm": 2.1449679288536077, "language_loss": 0.63714588, "learning_rate": 3.2709305171253015e-06, "loss": 0.6593495, "num_input_tokens_seen": 54134795, "step": 2515, "time_per_iteration": 2.8148369789123535 }, { "auxiliary_loss_clip": 0.01194204, "auxiliary_loss_mlp": 0.01032869, "balance_loss_clip": 1.05564451, "balance_loss_mlp": 1.02465522, "epoch": 0.30253111284795287, "flos": 23511542152320.0, "grad_norm": 2.105845246680977, "language_loss": 0.77490115, "learning_rate": 3.2703289537824536e-06, "loss": 0.79717183, "num_input_tokens_seen": 54154595, "step": 2516, "time_per_iteration": 2.754852056503296 }, { "auxiliary_loss_clip": 0.01189872, "auxiliary_loss_mlp": 0.01036233, "balance_loss_clip": 1.05738068, "balance_loss_mlp": 1.02773333, "epoch": 0.302651355738592, "flos": 18725367651840.0, "grad_norm": 2.472529730626087, "language_loss": 0.78215808, "learning_rate": 3.269727197731714e-06, "loss": 0.80441916, "num_input_tokens_seen": 54167360, "step": 2517, "time_per_iteration": 3.82426118850708 }, { "auxiliary_loss_clip": 0.01176777, "auxiliary_loss_mlp": 0.01030632, "balance_loss_clip": 1.05352998, "balance_loss_mlp": 1.02186465, "epoch": 0.30277159862923103, "flos": 22418888382720.0, "grad_norm": 1.586110446826664, "language_loss": 0.77805126, "learning_rate": 3.269125249064367e-06, "loss": 0.80012536, "num_input_tokens_seen": 54187055, "step": 2518, "time_per_iteration": 2.8568596839904785 }, { "auxiliary_loss_clip": 0.01200994, "auxiliary_loss_mlp": 0.01033846, "balance_loss_clip": 1.05511069, "balance_loss_mlp": 1.02438104, "epoch": 0.30289184151987014, "flos": 22273126992000.0, "grad_norm": 1.6219934930089999, "language_loss": 0.83187139, "learning_rate": 3.2685231078717297e-06, "loss": 0.85421979, "num_input_tokens_seen": 54207245, "step": 2519, "time_per_iteration": 4.7889721393585205 }, { "auxiliary_loss_clip": 0.01180529, "auxiliary_loss_mlp": 0.01059429, "balance_loss_clip": 1.05656719, "balance_loss_mlp": 1.02309978, "epoch": 0.30301208441050925, "flos": 25225594231680.0, "grad_norm": 2.01262537207444, "language_loss": 0.75288773, "learning_rate": 3.267920774245145e-06, "loss": 0.77528739, "num_input_tokens_seen": 54226650, "step": 2520, "time_per_iteration": 2.833792209625244 }, { "auxiliary_loss_clip": 0.01198296, "auxiliary_loss_mlp": 0.0103032, "balance_loss_clip": 1.05890405, "balance_loss_mlp": 1.020473, "epoch": 0.3031323273011483, "flos": 23039245198080.0, "grad_norm": 1.7455458853637207, "language_loss": 0.84854436, "learning_rate": 3.2673182482759876e-06, "loss": 0.87083048, "num_input_tokens_seen": 54245765, "step": 2521, "time_per_iteration": 2.8674347400665283 }, { "auxiliary_loss_clip": 0.0119426, "auxiliary_loss_mlp": 0.01028804, "balance_loss_clip": 1.0557611, "balance_loss_mlp": 1.01995301, "epoch": 0.3032525701917874, "flos": 18876695650560.0, "grad_norm": 2.2877592663864936, "language_loss": 0.66246057, "learning_rate": 3.266715530055659e-06, "loss": 0.68469125, "num_input_tokens_seen": 54263915, "step": 2522, "time_per_iteration": 2.825976848602295 }, { "auxiliary_loss_clip": 0.01187733, "auxiliary_loss_mlp": 0.01025616, "balance_loss_clip": 1.05465865, "balance_loss_mlp": 1.01584649, "epoch": 0.30337281308242653, "flos": 17782641250560.0, "grad_norm": 1.6395892599374213, "language_loss": 0.80480421, "learning_rate": 3.2661126196755927e-06, "loss": 0.82693768, "num_input_tokens_seen": 54283025, "step": 2523, "time_per_iteration": 2.7406153678894043 }, { "auxiliary_loss_clip": 0.01086177, "auxiliary_loss_mlp": 0.01006157, "balance_loss_clip": 1.01198256, "balance_loss_mlp": 1.00442874, "epoch": 0.3034930559730656, "flos": 57824298426240.0, "grad_norm": 0.7775661006449133, "language_loss": 0.55935383, "learning_rate": 3.265509517227248e-06, "loss": 0.5802772, "num_input_tokens_seen": 54339840, "step": 2524, "time_per_iteration": 3.2352168560028076 }, { "auxiliary_loss_clip": 0.01191577, "auxiliary_loss_mlp": 0.01030082, "balance_loss_clip": 1.05347526, "balance_loss_mlp": 1.02121258, "epoch": 0.3036132988637047, "flos": 14755587419520.0, "grad_norm": 1.7974134826650332, "language_loss": 0.80810595, "learning_rate": 3.264906222802115e-06, "loss": 0.83032256, "num_input_tokens_seen": 54357690, "step": 2525, "time_per_iteration": 2.7285428047180176 }, { "auxiliary_loss_clip": 0.01200533, "auxiliary_loss_mlp": 0.01031248, "balance_loss_clip": 1.05484426, "balance_loss_mlp": 1.02228904, "epoch": 0.30373354175434375, "flos": 21033203460480.0, "grad_norm": 2.3167369346859257, "language_loss": 0.78298938, "learning_rate": 3.264302736491715e-06, "loss": 0.80530727, "num_input_tokens_seen": 54377810, "step": 2526, "time_per_iteration": 2.7695930004119873 }, { "auxiliary_loss_clip": 0.0119369, "auxiliary_loss_mlp": 0.01033784, "balance_loss_clip": 1.05804956, "balance_loss_mlp": 1.02486706, "epoch": 0.30385378464498286, "flos": 21143233797120.0, "grad_norm": 1.869325903634438, "language_loss": 0.87263918, "learning_rate": 3.263699058387594e-06, "loss": 0.89491391, "num_input_tokens_seen": 54395245, "step": 2527, "time_per_iteration": 2.666353464126587 }, { "auxiliary_loss_clip": 0.01188626, "auxiliary_loss_mlp": 0.01034445, "balance_loss_clip": 1.05659604, "balance_loss_mlp": 1.0251708, "epoch": 0.30397402753562197, "flos": 20629244131200.0, "grad_norm": 2.5631030050659542, "language_loss": 0.90046334, "learning_rate": 3.2630951885813315e-06, "loss": 0.92269403, "num_input_tokens_seen": 54412640, "step": 2528, "time_per_iteration": 2.7276225090026855 }, { "auxiliary_loss_clip": 0.01194046, "auxiliary_loss_mlp": 0.0103287, "balance_loss_clip": 1.05541503, "balance_loss_mlp": 1.02420902, "epoch": 0.304094270426261, "flos": 15085678429440.0, "grad_norm": 2.0781048548005248, "language_loss": 0.78429234, "learning_rate": 3.262491127164533e-06, "loss": 0.80656147, "num_input_tokens_seen": 54431455, "step": 2529, "time_per_iteration": 2.887725830078125 }, { "auxiliary_loss_clip": 0.01199785, "auxiliary_loss_mlp": 0.01063933, "balance_loss_clip": 1.05689907, "balance_loss_mlp": 1.02559841, "epoch": 0.30421451331690014, "flos": 13845216193920.0, "grad_norm": 2.238316990860686, "language_loss": 0.80256259, "learning_rate": 3.2618868742288337e-06, "loss": 0.82519978, "num_input_tokens_seen": 54448380, "step": 2530, "time_per_iteration": 2.649062395095825 }, { "auxiliary_loss_clip": 0.01194868, "auxiliary_loss_mlp": 0.01030951, "balance_loss_clip": 1.05499184, "balance_loss_mlp": 1.02230859, "epoch": 0.30433475620753925, "flos": 17384212615680.0, "grad_norm": 1.9276067151424234, "language_loss": 0.72670197, "learning_rate": 3.261282429865899e-06, "loss": 0.74896014, "num_input_tokens_seen": 54466385, "step": 2531, "time_per_iteration": 2.6071510314941406 }, { "auxiliary_loss_clip": 0.01198776, "auxiliary_loss_mlp": 0.01066948, "balance_loss_clip": 1.05857134, "balance_loss_mlp": 1.03047872, "epoch": 0.3044549990981783, "flos": 18916951818240.0, "grad_norm": 1.8831196134412922, "language_loss": 0.72500789, "learning_rate": 3.2606777941674225e-06, "loss": 0.74766517, "num_input_tokens_seen": 54485040, "step": 2532, "time_per_iteration": 2.802147150039673 }, { "auxiliary_loss_clip": 0.01175845, "auxiliary_loss_mlp": 0.01031052, "balance_loss_clip": 1.05364919, "balance_loss_mlp": 1.02076995, "epoch": 0.3045752419888174, "flos": 21068431724160.0, "grad_norm": 2.799362017358233, "language_loss": 0.8441776, "learning_rate": 3.2600729672251276e-06, "loss": 0.86624658, "num_input_tokens_seen": 54502755, "step": 2533, "time_per_iteration": 2.766549825668335 }, { "auxiliary_loss_clip": 0.01199171, "auxiliary_loss_mlp": 0.01057556, "balance_loss_clip": 1.05638719, "balance_loss_mlp": 1.02215576, "epoch": 0.3046954848794565, "flos": 29096405516160.0, "grad_norm": 1.918881235147937, "language_loss": 0.65160066, "learning_rate": 3.259467949130765e-06, "loss": 0.67416793, "num_input_tokens_seen": 54524165, "step": 2534, "time_per_iteration": 2.7764110565185547 }, { "auxiliary_loss_clip": 0.01198233, "auxiliary_loss_mlp": 0.01033775, "balance_loss_clip": 1.05943978, "balance_loss_mlp": 1.02442324, "epoch": 0.3048157277700956, "flos": 20295346279680.0, "grad_norm": 2.1763913490211104, "language_loss": 0.83095217, "learning_rate": 3.2588627399761164e-06, "loss": 0.85327232, "num_input_tokens_seen": 54540160, "step": 2535, "time_per_iteration": 2.746696949005127 }, { "auxiliary_loss_clip": 0.01195553, "auxiliary_loss_mlp": 0.01031886, "balance_loss_clip": 1.05834866, "balance_loss_mlp": 1.02264762, "epoch": 0.3049359706607347, "flos": 22739929165440.0, "grad_norm": 2.0326321353100285, "language_loss": 0.70770514, "learning_rate": 3.2582573398529903e-06, "loss": 0.72997963, "num_input_tokens_seen": 54557515, "step": 2536, "time_per_iteration": 2.811412811279297 }, { "auxiliary_loss_clip": 0.01196403, "auxiliary_loss_mlp": 0.01030234, "balance_loss_clip": 1.06146145, "balance_loss_mlp": 1.02097154, "epoch": 0.3050562135513738, "flos": 18434634969600.0, "grad_norm": 2.794677880937359, "language_loss": 0.73896754, "learning_rate": 3.2576517488532265e-06, "loss": 0.76123393, "num_input_tokens_seen": 54573865, "step": 2537, "time_per_iteration": 2.7622504234313965 }, { "auxiliary_loss_clip": 0.01196741, "auxiliary_loss_mlp": 0.0103409, "balance_loss_clip": 1.05694914, "balance_loss_mlp": 1.02478004, "epoch": 0.30517645644201286, "flos": 20370327920640.0, "grad_norm": 1.802422247978569, "language_loss": 0.87529027, "learning_rate": 3.257045967068692e-06, "loss": 0.89759851, "num_input_tokens_seen": 54593120, "step": 2538, "time_per_iteration": 2.782932758331299 }, { "auxiliary_loss_clip": 0.0120367, "auxiliary_loss_mlp": 0.01034295, "balance_loss_clip": 1.0587275, "balance_loss_mlp": 1.02493095, "epoch": 0.30529669933265197, "flos": 21945118970880.0, "grad_norm": 1.6280446009145029, "language_loss": 0.8205837, "learning_rate": 3.2564399945912848e-06, "loss": 0.84296334, "num_input_tokens_seen": 54612910, "step": 2539, "time_per_iteration": 2.794241428375244 }, { "auxiliary_loss_clip": 0.01193548, "auxiliary_loss_mlp": 0.0103229, "balance_loss_clip": 1.05910408, "balance_loss_mlp": 1.02421951, "epoch": 0.305416942223291, "flos": 21835411856640.0, "grad_norm": 2.4571657237988305, "language_loss": 0.82597232, "learning_rate": 3.2558338315129287e-06, "loss": 0.84823072, "num_input_tokens_seen": 54631055, "step": 2540, "time_per_iteration": 2.8408167362213135 }, { "auxiliary_loss_clip": 0.0119332, "auxiliary_loss_mlp": 0.01031874, "balance_loss_clip": 1.0588547, "balance_loss_mlp": 1.0228914, "epoch": 0.30553718511393013, "flos": 33911810709120.0, "grad_norm": 2.2695468516735517, "language_loss": 0.76087964, "learning_rate": 3.2552274779255785e-06, "loss": 0.7831316, "num_input_tokens_seen": 54651985, "step": 2541, "time_per_iteration": 3.814737319946289 }, { "auxiliary_loss_clip": 0.01197502, "auxiliary_loss_mlp": 0.01032709, "balance_loss_clip": 1.05747032, "balance_loss_mlp": 1.02329111, "epoch": 0.30565742800456924, "flos": 22268530051200.0, "grad_norm": 2.100798818089926, "language_loss": 0.77182436, "learning_rate": 3.2546209339212184e-06, "loss": 0.79412645, "num_input_tokens_seen": 54671005, "step": 2542, "time_per_iteration": 2.90272855758667 }, { "auxiliary_loss_clip": 0.0119556, "auxiliary_loss_mlp": 0.01030032, "balance_loss_clip": 1.05661321, "balance_loss_mlp": 1.02139509, "epoch": 0.3057776708952083, "flos": 22565044823040.0, "grad_norm": 1.6329768695131706, "language_loss": 0.77496457, "learning_rate": 3.25401419959186e-06, "loss": 0.79722047, "num_input_tokens_seen": 54691615, "step": 2543, "time_per_iteration": 3.7786765098571777 }, { "auxiliary_loss_clip": 0.01206835, "auxiliary_loss_mlp": 0.01036768, "balance_loss_clip": 1.06289136, "balance_loss_mlp": 1.02705276, "epoch": 0.3058979137858474, "flos": 21799213925760.0, "grad_norm": 2.0186278257473367, "language_loss": 0.7614882, "learning_rate": 3.253407275029545e-06, "loss": 0.78392416, "num_input_tokens_seen": 54710520, "step": 2544, "time_per_iteration": 2.8457324504852295 }, { "auxiliary_loss_clip": 0.01193201, "auxiliary_loss_mlp": 0.01030526, "balance_loss_clip": 1.05977011, "balance_loss_mlp": 1.02074456, "epoch": 0.3060181566764865, "flos": 26979435601920.0, "grad_norm": 1.8588108838392061, "language_loss": 0.79911608, "learning_rate": 3.2528001603263425e-06, "loss": 0.82135332, "num_input_tokens_seen": 54732590, "step": 2545, "time_per_iteration": 3.9771578311920166 }, { "auxiliary_loss_clip": 0.01196142, "auxiliary_loss_mlp": 0.01032003, "balance_loss_clip": 1.05945671, "balance_loss_mlp": 1.02217436, "epoch": 0.3061383995671256, "flos": 19865101173120.0, "grad_norm": 2.068505347282067, "language_loss": 0.81824255, "learning_rate": 3.2521928555743514e-06, "loss": 0.84052408, "num_input_tokens_seen": 54749935, "step": 2546, "time_per_iteration": 2.865889072418213 }, { "auxiliary_loss_clip": 0.01189327, "auxiliary_loss_mlp": 0.01067066, "balance_loss_clip": 1.05732501, "balance_loss_mlp": 1.03036654, "epoch": 0.3062586424577647, "flos": 22127509255680.0, "grad_norm": 1.9522396186962694, "language_loss": 0.67518628, "learning_rate": 3.2515853608657e-06, "loss": 0.69775021, "num_input_tokens_seen": 54767935, "step": 2547, "time_per_iteration": 2.815378427505493 }, { "auxiliary_loss_clip": 0.01192941, "auxiliary_loss_mlp": 0.01028011, "balance_loss_clip": 1.05717182, "balance_loss_mlp": 1.01889765, "epoch": 0.3063788853484038, "flos": 20845497962880.0, "grad_norm": 2.679413901147367, "language_loss": 0.74632955, "learning_rate": 3.250977676292545e-06, "loss": 0.76853907, "num_input_tokens_seen": 54786175, "step": 2548, "time_per_iteration": 2.833333730697632 }, { "auxiliary_loss_clip": 0.01197249, "auxiliary_loss_mlp": 0.01030521, "balance_loss_clip": 1.05853534, "balance_loss_mlp": 1.02128184, "epoch": 0.30649912823904285, "flos": 16209717707520.0, "grad_norm": 2.1843175558352943, "language_loss": 0.79163253, "learning_rate": 3.2503698019470712e-06, "loss": 0.81391025, "num_input_tokens_seen": 54801945, "step": 2549, "time_per_iteration": 2.7441725730895996 }, { "auxiliary_loss_clip": 0.01196355, "auxiliary_loss_mlp": 0.01032026, "balance_loss_clip": 1.05704963, "balance_loss_mlp": 1.02238846, "epoch": 0.30661937112968196, "flos": 18617815353600.0, "grad_norm": 2.26438062016576, "language_loss": 0.7825588, "learning_rate": 3.249761737921492e-06, "loss": 0.80484259, "num_input_tokens_seen": 54818475, "step": 2550, "time_per_iteration": 2.7310125827789307 }, { "auxiliary_loss_clip": 0.01191419, "auxiliary_loss_mlp": 0.01028013, "balance_loss_clip": 1.05854702, "balance_loss_mlp": 1.0191021, "epoch": 0.30673961402032107, "flos": 31390809638400.0, "grad_norm": 1.9620564895841124, "language_loss": 0.74291825, "learning_rate": 3.249153484308051e-06, "loss": 0.76511258, "num_input_tokens_seen": 54837090, "step": 2551, "time_per_iteration": 2.8792550563812256 }, { "auxiliary_loss_clip": 0.01177135, "auxiliary_loss_mlp": 0.01028583, "balance_loss_clip": 1.05816913, "balance_loss_mlp": 1.01919532, "epoch": 0.3068598569109601, "flos": 20229809915520.0, "grad_norm": 1.9969013202231762, "language_loss": 0.77379698, "learning_rate": 3.2485450411990194e-06, "loss": 0.79585415, "num_input_tokens_seen": 54856445, "step": 2552, "time_per_iteration": 2.8641250133514404 }, { "auxiliary_loss_clip": 0.01199908, "auxiliary_loss_mlp": 0.01033427, "balance_loss_clip": 1.05520797, "balance_loss_mlp": 1.02349734, "epoch": 0.30698009980159924, "flos": 29601991399680.0, "grad_norm": 1.6725961165237244, "language_loss": 0.825773, "learning_rate": 3.2479364086866983e-06, "loss": 0.84810638, "num_input_tokens_seen": 54876700, "step": 2553, "time_per_iteration": 2.8416104316711426 }, { "auxiliary_loss_clip": 0.01193512, "auxiliary_loss_mlp": 0.01063332, "balance_loss_clip": 1.05831456, "balance_loss_mlp": 1.02720904, "epoch": 0.30710034269223835, "flos": 23842423261440.0, "grad_norm": 1.7201313411102925, "language_loss": 0.81253731, "learning_rate": 3.247327586863416e-06, "loss": 0.83510578, "num_input_tokens_seen": 54897580, "step": 2554, "time_per_iteration": 2.801096200942993 }, { "auxiliary_loss_clip": 0.01194581, "auxiliary_loss_mlp": 0.0103355, "balance_loss_clip": 1.05590177, "balance_loss_mlp": 1.02345335, "epoch": 0.3072205855828774, "flos": 25884986152320.0, "grad_norm": 2.219633906089811, "language_loss": 0.76951683, "learning_rate": 3.2467185758215304e-06, "loss": 0.79179817, "num_input_tokens_seen": 54917320, "step": 2555, "time_per_iteration": 2.8187830448150635 }, { "auxiliary_loss_clip": 0.01190743, "auxiliary_loss_mlp": 0.01057469, "balance_loss_clip": 1.05662513, "balance_loss_mlp": 1.02210164, "epoch": 0.3073408284735165, "flos": 22236390357120.0, "grad_norm": 2.495127497387699, "language_loss": 0.85587001, "learning_rate": 3.246109375653428e-06, "loss": 0.87835217, "num_input_tokens_seen": 54934085, "step": 2556, "time_per_iteration": 2.718132495880127 }, { "auxiliary_loss_clip": 0.01199557, "auxiliary_loss_mlp": 0.01031424, "balance_loss_clip": 1.05669141, "balance_loss_mlp": 1.02235234, "epoch": 0.30746107136415557, "flos": 19500284689920.0, "grad_norm": 1.9055887824387667, "language_loss": 0.78127134, "learning_rate": 3.2454999864515243e-06, "loss": 0.80358112, "num_input_tokens_seen": 54953460, "step": 2557, "time_per_iteration": 2.7685320377349854 }, { "auxiliary_loss_clip": 0.01189274, "auxiliary_loss_mlp": 0.0105824, "balance_loss_clip": 1.05602312, "balance_loss_mlp": 1.02091694, "epoch": 0.3075813142547947, "flos": 21724806902400.0, "grad_norm": 1.999811130742696, "language_loss": 0.69517678, "learning_rate": 3.244890408308263e-06, "loss": 0.71765196, "num_input_tokens_seen": 54974165, "step": 2558, "time_per_iteration": 2.759572982788086 }, { "auxiliary_loss_clip": 0.01187634, "auxiliary_loss_mlp": 0.01034796, "balance_loss_clip": 1.05664515, "balance_loss_mlp": 1.02577806, "epoch": 0.3077015571454338, "flos": 24097963593600.0, "grad_norm": 2.757636192129193, "language_loss": 0.60979426, "learning_rate": 3.2442806413161165e-06, "loss": 0.63201857, "num_input_tokens_seen": 54993810, "step": 2559, "time_per_iteration": 2.8152191638946533 }, { "auxiliary_loss_clip": 0.01194018, "auxiliary_loss_mlp": 0.01033222, "balance_loss_clip": 1.0593549, "balance_loss_mlp": 1.02379274, "epoch": 0.30782180003607285, "flos": 18405476104320.0, "grad_norm": 2.0318328890152193, "language_loss": 0.7581836, "learning_rate": 3.243670685567586e-06, "loss": 0.78045595, "num_input_tokens_seen": 55011210, "step": 2560, "time_per_iteration": 2.7168543338775635 }, { "auxiliary_loss_clip": 0.01193535, "auxiliary_loss_mlp": 0.01053826, "balance_loss_clip": 1.05714178, "balance_loss_mlp": 1.01865566, "epoch": 0.30794204292671196, "flos": 23878549365120.0, "grad_norm": 2.1812860414839865, "language_loss": 0.8034085, "learning_rate": 3.2430605411552012e-06, "loss": 0.82588208, "num_input_tokens_seen": 55031325, "step": 2561, "time_per_iteration": 2.822920322418213 }, { "auxiliary_loss_clip": 0.01094894, "auxiliary_loss_mlp": 0.01002951, "balance_loss_clip": 1.0193795, "balance_loss_mlp": 1.00109088, "epoch": 0.30806228581735107, "flos": 67927800816000.0, "grad_norm": 0.8914078010849693, "language_loss": 0.70543671, "learning_rate": 3.2424502081715205e-06, "loss": 0.72641516, "num_input_tokens_seen": 55094440, "step": 2562, "time_per_iteration": 3.321331262588501 }, { "auxiliary_loss_clip": 0.01196682, "auxiliary_loss_mlp": 0.01033464, "balance_loss_clip": 1.05851233, "balance_loss_mlp": 1.02409983, "epoch": 0.3081825287079901, "flos": 23843213360640.0, "grad_norm": 1.609509126041659, "language_loss": 0.77798271, "learning_rate": 3.241839686709132e-06, "loss": 0.80028421, "num_input_tokens_seen": 55115375, "step": 2563, "time_per_iteration": 2.8469884395599365 }, { "auxiliary_loss_clip": 0.01198262, "auxiliary_loss_mlp": 0.01030318, "balance_loss_clip": 1.05790007, "balance_loss_mlp": 1.0207808, "epoch": 0.30830277159862923, "flos": 16209969102720.0, "grad_norm": 3.719619534537086, "language_loss": 0.81798732, "learning_rate": 3.2412289768606495e-06, "loss": 0.84027314, "num_input_tokens_seen": 55131945, "step": 2564, "time_per_iteration": 2.8113598823547363 }, { "auxiliary_loss_clip": 0.01203791, "auxiliary_loss_mlp": 0.01037539, "balance_loss_clip": 1.05992079, "balance_loss_mlp": 1.02784765, "epoch": 0.30842301448926834, "flos": 29349503723520.0, "grad_norm": 1.7670424656822041, "language_loss": 0.82538462, "learning_rate": 3.240618078718718e-06, "loss": 0.84779787, "num_input_tokens_seen": 55153405, "step": 2565, "time_per_iteration": 2.7144806385040283 }, { "auxiliary_loss_clip": 0.01189301, "auxiliary_loss_mlp": 0.01033665, "balance_loss_clip": 1.0568949, "balance_loss_mlp": 1.02341318, "epoch": 0.3085432573799074, "flos": 21945190798080.0, "grad_norm": 2.7311594610996837, "language_loss": 0.74183893, "learning_rate": 3.240006992376011e-06, "loss": 0.76406854, "num_input_tokens_seen": 55173030, "step": 2566, "time_per_iteration": 2.8216631412506104 }, { "auxiliary_loss_clip": 0.0119768, "auxiliary_loss_mlp": 0.01032288, "balance_loss_clip": 1.05779767, "balance_loss_mlp": 1.02380645, "epoch": 0.3086635002705465, "flos": 22054718344320.0, "grad_norm": 2.3369916889964273, "language_loss": 0.76449513, "learning_rate": 3.2393957179252284e-06, "loss": 0.78679478, "num_input_tokens_seen": 55189565, "step": 2567, "time_per_iteration": 3.7718148231506348 }, { "auxiliary_loss_clip": 0.01205484, "auxiliary_loss_mlp": 0.01032968, "balance_loss_clip": 1.06027555, "balance_loss_mlp": 1.02304339, "epoch": 0.3087837431611856, "flos": 32665925520000.0, "grad_norm": 2.2337255341666222, "language_loss": 0.80545741, "learning_rate": 3.2387842554591016e-06, "loss": 0.82784194, "num_input_tokens_seen": 55210380, "step": 2568, "time_per_iteration": 2.667625665664673 }, { "auxiliary_loss_clip": 0.01203946, "auxiliary_loss_mlp": 0.01032449, "balance_loss_clip": 1.05946863, "balance_loss_mlp": 1.02347231, "epoch": 0.3089039860518247, "flos": 17599245384960.0, "grad_norm": 2.5646761961585645, "language_loss": 0.8808918, "learning_rate": 3.238172605070388e-06, "loss": 0.90325576, "num_input_tokens_seen": 55225795, "step": 2569, "time_per_iteration": 3.4393436908721924 }, { "auxiliary_loss_clip": 0.0119836, "auxiliary_loss_mlp": 0.01058928, "balance_loss_clip": 1.05838406, "balance_loss_mlp": 1.02275372, "epoch": 0.3090242289424638, "flos": 14383839611520.0, "grad_norm": 2.442115310155503, "language_loss": 0.78248465, "learning_rate": 3.2375607668518745e-06, "loss": 0.80505759, "num_input_tokens_seen": 55238830, "step": 2570, "time_per_iteration": 3.923109292984009 }, { "auxiliary_loss_clip": 0.01187296, "auxiliary_loss_mlp": 0.01029351, "balance_loss_clip": 1.05806112, "balance_loss_mlp": 1.01993382, "epoch": 0.30914447183310284, "flos": 16068625084800.0, "grad_norm": 2.633618875282009, "language_loss": 0.89648664, "learning_rate": 3.236948740896377e-06, "loss": 0.91865313, "num_input_tokens_seen": 55253630, "step": 2571, "time_per_iteration": 3.4432015419006348 }, { "auxiliary_loss_clip": 0.01199012, "auxiliary_loss_mlp": 0.01031537, "balance_loss_clip": 1.05746257, "balance_loss_mlp": 1.02230406, "epoch": 0.30926471472374195, "flos": 32230221546240.0, "grad_norm": 1.4732279907013552, "language_loss": 0.84154916, "learning_rate": 3.2363365272967384e-06, "loss": 0.86385465, "num_input_tokens_seen": 55276200, "step": 2572, "time_per_iteration": 2.701899528503418 }, { "auxiliary_loss_clip": 0.01199158, "auxiliary_loss_mlp": 0.01031036, "balance_loss_clip": 1.06147575, "balance_loss_mlp": 1.0212307, "epoch": 0.30938495761438106, "flos": 20370722970240.0, "grad_norm": 1.8601965628929662, "language_loss": 0.81611466, "learning_rate": 3.235724126145832e-06, "loss": 0.83841658, "num_input_tokens_seen": 55292235, "step": 2573, "time_per_iteration": 2.5326647758483887 }, { "auxiliary_loss_clip": 0.01190099, "auxiliary_loss_mlp": 0.01030437, "balance_loss_clip": 1.05832684, "balance_loss_mlp": 1.02116203, "epoch": 0.3095052005050201, "flos": 24061155131520.0, "grad_norm": 1.4699870920298979, "language_loss": 0.77524185, "learning_rate": 3.235111537536558e-06, "loss": 0.7974472, "num_input_tokens_seen": 55313050, "step": 2574, "time_per_iteration": 2.6879191398620605 }, { "auxiliary_loss_clip": 0.01198652, "auxiliary_loss_mlp": 0.01028437, "balance_loss_clip": 1.05774307, "balance_loss_mlp": 1.01924634, "epoch": 0.30962544339565923, "flos": 23401547729280.0, "grad_norm": 1.885474866010979, "language_loss": 0.82925677, "learning_rate": 3.2344987615618456e-06, "loss": 0.85152763, "num_input_tokens_seen": 55332885, "step": 2575, "time_per_iteration": 2.8336071968078613 }, { "auxiliary_loss_clip": 0.01190855, "auxiliary_loss_mlp": 0.01035092, "balance_loss_clip": 1.05952406, "balance_loss_mlp": 1.02604985, "epoch": 0.30974568628629834, "flos": 33799984692480.0, "grad_norm": 1.554601059695725, "language_loss": 0.7866019, "learning_rate": 3.2338857983146533e-06, "loss": 0.80886126, "num_input_tokens_seen": 55354385, "step": 2576, "time_per_iteration": 3.124422073364258 }, { "auxiliary_loss_clip": 0.01189556, "auxiliary_loss_mlp": 0.01027526, "balance_loss_clip": 1.06119931, "balance_loss_mlp": 1.01839447, "epoch": 0.3098659291769374, "flos": 20229594433920.0, "grad_norm": 1.9175268567661934, "language_loss": 0.76166892, "learning_rate": 3.233272647887966e-06, "loss": 0.7838397, "num_input_tokens_seen": 55373275, "step": 2577, "time_per_iteration": 2.9745540618896484 }, { "auxiliary_loss_clip": 0.01202281, "auxiliary_loss_mlp": 0.01035724, "balance_loss_clip": 1.05810785, "balance_loss_mlp": 1.02661026, "epoch": 0.3099861720675765, "flos": 24748556682240.0, "grad_norm": 1.5901973636160558, "language_loss": 0.89817321, "learning_rate": 3.2326593103747985e-06, "loss": 0.92055321, "num_input_tokens_seen": 55392290, "step": 2578, "time_per_iteration": 3.021958351135254 }, { "auxiliary_loss_clip": 0.01198084, "auxiliary_loss_mlp": 0.0102993, "balance_loss_clip": 1.06012559, "balance_loss_mlp": 1.02078092, "epoch": 0.3101064149582156, "flos": 11765485704960.0, "grad_norm": 2.4296088487819736, "language_loss": 0.84920734, "learning_rate": 3.2320457858681936e-06, "loss": 0.8714875, "num_input_tokens_seen": 55410680, "step": 2579, "time_per_iteration": 2.893911600112915 }, { "auxiliary_loss_clip": 0.01194209, "auxiliary_loss_mlp": 0.01036961, "balance_loss_clip": 1.05875683, "balance_loss_mlp": 1.02782309, "epoch": 0.31022665784885467, "flos": 23033247626880.0, "grad_norm": 2.9874764778209744, "language_loss": 0.85354894, "learning_rate": 3.2314320744612228e-06, "loss": 0.87586063, "num_input_tokens_seen": 55425980, "step": 2580, "time_per_iteration": 2.8201143741607666 }, { "auxiliary_loss_clip": 0.01193649, "auxiliary_loss_mlp": 0.01031278, "balance_loss_clip": 1.05669677, "balance_loss_mlp": 1.02259898, "epoch": 0.3103469007394938, "flos": 16289188548480.0, "grad_norm": 2.0377590489581054, "language_loss": 0.76427197, "learning_rate": 3.2308181762469854e-06, "loss": 0.78652126, "num_input_tokens_seen": 55443925, "step": 2581, "time_per_iteration": 2.7020153999328613 }, { "auxiliary_loss_clip": 0.01206369, "auxiliary_loss_mlp": 0.01033104, "balance_loss_clip": 1.05947673, "balance_loss_mlp": 1.02364528, "epoch": 0.3104671436301329, "flos": 30515271626880.0, "grad_norm": 2.4090751585393426, "language_loss": 0.78485078, "learning_rate": 3.230204091318609e-06, "loss": 0.80724549, "num_input_tokens_seen": 55464465, "step": 2582, "time_per_iteration": 2.724358081817627 }, { "auxiliary_loss_clip": 0.01197838, "auxiliary_loss_mlp": 0.01057214, "balance_loss_clip": 1.05544019, "balance_loss_mlp": 1.02216291, "epoch": 0.31058738652077195, "flos": 20047240062720.0, "grad_norm": 1.8074467882891052, "language_loss": 0.84834552, "learning_rate": 3.2295898197692503e-06, "loss": 0.87089598, "num_input_tokens_seen": 55483425, "step": 2583, "time_per_iteration": 2.682339668273926 }, { "auxiliary_loss_clip": 0.01201169, "auxiliary_loss_mlp": 0.0102953, "balance_loss_clip": 1.05873036, "balance_loss_mlp": 1.02079225, "epoch": 0.31070762941141106, "flos": 28074639237120.0, "grad_norm": 2.1184360206122914, "language_loss": 0.79233044, "learning_rate": 3.228975361692094e-06, "loss": 0.81463742, "num_input_tokens_seen": 55504445, "step": 2584, "time_per_iteration": 2.7984421253204346 }, { "auxiliary_loss_clip": 0.01202542, "auxiliary_loss_mlp": 0.01056965, "balance_loss_clip": 1.05735767, "balance_loss_mlp": 1.02101004, "epoch": 0.31082787230205017, "flos": 20521907314560.0, "grad_norm": 2.1002530956244962, "language_loss": 0.79773092, "learning_rate": 3.228360717180352e-06, "loss": 0.82032597, "num_input_tokens_seen": 55521970, "step": 2585, "time_per_iteration": 2.821897268295288 }, { "auxiliary_loss_clip": 0.01094341, "auxiliary_loss_mlp": 0.01021975, "balance_loss_clip": 1.02036452, "balance_loss_mlp": 0.99864751, "epoch": 0.3109481151926892, "flos": 62445928723200.0, "grad_norm": 0.84789346582047, "language_loss": 0.59397972, "learning_rate": 3.227745886327266e-06, "loss": 0.61514294, "num_input_tokens_seen": 55580665, "step": 2586, "time_per_iteration": 3.404919147491455 }, { "auxiliary_loss_clip": 0.01093823, "auxiliary_loss_mlp": 0.01001668, "balance_loss_clip": 1.01980114, "balance_loss_mlp": 0.99964124, "epoch": 0.31106835808332833, "flos": 44746744723200.0, "grad_norm": 0.8053952756686056, "language_loss": 0.55830455, "learning_rate": 3.227130869226105e-06, "loss": 0.5792594, "num_input_tokens_seen": 55637825, "step": 2587, "time_per_iteration": 3.340362787246704 }, { "auxiliary_loss_clip": 0.01197605, "auxiliary_loss_mlp": 0.01032062, "balance_loss_clip": 1.05678856, "balance_loss_mlp": 1.02229249, "epoch": 0.3111886009739674, "flos": 23403056100480.0, "grad_norm": 2.544906504552733, "language_loss": 0.82550859, "learning_rate": 3.226515665970167e-06, "loss": 0.84780532, "num_input_tokens_seen": 55655365, "step": 2588, "time_per_iteration": 2.8101511001586914 }, { "auxiliary_loss_clip": 0.01192017, "auxiliary_loss_mlp": 0.01032028, "balance_loss_clip": 1.05451512, "balance_loss_mlp": 1.02213979, "epoch": 0.3113088438646065, "flos": 17530728192000.0, "grad_norm": 2.5082213058602973, "language_loss": 0.8628698, "learning_rate": 3.225900276652777e-06, "loss": 0.88511026, "num_input_tokens_seen": 55672140, "step": 2589, "time_per_iteration": 2.793214797973633 }, { "auxiliary_loss_clip": 0.01199185, "auxiliary_loss_mlp": 0.01030698, "balance_loss_clip": 1.05685985, "balance_loss_mlp": 1.02167976, "epoch": 0.3114290867552456, "flos": 28365802882560.0, "grad_norm": 2.3869026770346786, "language_loss": 0.75360203, "learning_rate": 3.2252847013672906e-06, "loss": 0.77590084, "num_input_tokens_seen": 55694800, "step": 2590, "time_per_iteration": 2.8388590812683105 }, { "auxiliary_loss_clip": 0.01182537, "auxiliary_loss_mlp": 0.01028619, "balance_loss_clip": 1.05548787, "balance_loss_mlp": 1.01975012, "epoch": 0.31154932964588467, "flos": 27379157126400.0, "grad_norm": 1.876628159860185, "language_loss": 0.75740123, "learning_rate": 3.224668940207089e-06, "loss": 0.77951276, "num_input_tokens_seen": 55713785, "step": 2591, "time_per_iteration": 2.908064842224121 }, { "auxiliary_loss_clip": 0.01181247, "auxiliary_loss_mlp": 0.01033281, "balance_loss_clip": 1.05612135, "balance_loss_mlp": 1.02397656, "epoch": 0.3116695725365238, "flos": 26541864120960.0, "grad_norm": 1.9041286097509844, "language_loss": 0.86975503, "learning_rate": 3.2240529932655828e-06, "loss": 0.8919003, "num_input_tokens_seen": 55733050, "step": 2592, "time_per_iteration": 2.8526008129119873 }, { "auxiliary_loss_clip": 0.01191068, "auxiliary_loss_mlp": 0.01035938, "balance_loss_clip": 1.05776739, "balance_loss_mlp": 1.02649081, "epoch": 0.3117898154271629, "flos": 21177600134400.0, "grad_norm": 2.676456467357024, "language_loss": 0.88125217, "learning_rate": 3.223436860636211e-06, "loss": 0.90352225, "num_input_tokens_seen": 55748685, "step": 2593, "time_per_iteration": 3.6974170207977295 }, { "auxiliary_loss_clip": 0.01198496, "auxiliary_loss_mlp": 0.01034104, "balance_loss_clip": 1.05584204, "balance_loss_mlp": 1.02525806, "epoch": 0.31191005831780194, "flos": 27272430840960.0, "grad_norm": 1.9297537403184775, "language_loss": 0.74261594, "learning_rate": 3.2228205424124403e-06, "loss": 0.76494193, "num_input_tokens_seen": 55771840, "step": 2594, "time_per_iteration": 2.760154962539673 }, { "auxiliary_loss_clip": 0.01183065, "auxiliary_loss_mlp": 0.01035053, "balance_loss_clip": 1.0565747, "balance_loss_mlp": 1.02576041, "epoch": 0.31203030120844105, "flos": 12963501043200.0, "grad_norm": 2.590783166416829, "language_loss": 0.75051892, "learning_rate": 3.222204038687765e-06, "loss": 0.77270007, "num_input_tokens_seen": 55784975, "step": 2595, "time_per_iteration": 3.6386303901672363 }, { "auxiliary_loss_clip": 0.01194561, "auxiliary_loss_mlp": 0.01041661, "balance_loss_clip": 1.05782044, "balance_loss_mlp": 1.03292346, "epoch": 0.31215054409908016, "flos": 27562014288000.0, "grad_norm": 1.646425616156323, "language_loss": 0.87930429, "learning_rate": 3.221587349555709e-06, "loss": 0.90166652, "num_input_tokens_seen": 55805235, "step": 2596, "time_per_iteration": 4.0684754848480225 }, { "auxiliary_loss_clip": 0.01197105, "auxiliary_loss_mlp": 0.01027585, "balance_loss_clip": 1.05817008, "balance_loss_mlp": 1.01888287, "epoch": 0.3122707869897192, "flos": 21506326427520.0, "grad_norm": 1.7417434589653096, "language_loss": 0.69738108, "learning_rate": 3.2209704751098236e-06, "loss": 0.71962798, "num_input_tokens_seen": 55824265, "step": 2597, "time_per_iteration": 3.859828472137451 }, { "auxiliary_loss_clip": 0.01195399, "auxiliary_loss_mlp": 0.0102766, "balance_loss_clip": 1.05748725, "balance_loss_mlp": 1.01831412, "epoch": 0.31239102988035833, "flos": 15187017674880.0, "grad_norm": 1.9550058266118884, "language_loss": 0.8294363, "learning_rate": 3.2203534154436875e-06, "loss": 0.85166693, "num_input_tokens_seen": 55838620, "step": 2598, "time_per_iteration": 2.79160213470459 }, { "auxiliary_loss_clip": 0.01181329, "auxiliary_loss_mlp": 0.01032358, "balance_loss_clip": 1.05710745, "balance_loss_mlp": 1.02304745, "epoch": 0.31251127277099744, "flos": 22053712763520.0, "grad_norm": 2.3582637676181264, "language_loss": 0.75739133, "learning_rate": 3.2197361706509084e-06, "loss": 0.77952826, "num_input_tokens_seen": 55859375, "step": 2599, "time_per_iteration": 2.8347582817077637 }, { "auxiliary_loss_clip": 0.01204011, "auxiliary_loss_mlp": 0.01029748, "balance_loss_clip": 1.05862784, "balance_loss_mlp": 1.02026534, "epoch": 0.3126315156616365, "flos": 15193984913280.0, "grad_norm": 2.948882884778414, "language_loss": 0.83151269, "learning_rate": 3.2191187408251228e-06, "loss": 0.85385036, "num_input_tokens_seen": 55876535, "step": 2600, "time_per_iteration": 2.750575065612793 }, { "auxiliary_loss_clip": 0.01199733, "auxiliary_loss_mlp": 0.01039333, "balance_loss_clip": 1.05493593, "balance_loss_mlp": 1.02983224, "epoch": 0.3127517585522756, "flos": 18145338831360.0, "grad_norm": 2.422812369495687, "language_loss": 0.78814107, "learning_rate": 3.218501126059993e-06, "loss": 0.81053168, "num_input_tokens_seen": 55891930, "step": 2601, "time_per_iteration": 2.644723892211914 }, { "auxiliary_loss_clip": 0.01196091, "auxiliary_loss_mlp": 0.01030628, "balance_loss_clip": 1.05494773, "balance_loss_mlp": 1.02156174, "epoch": 0.31287200144291466, "flos": 21908633731200.0, "grad_norm": 1.7809146070421096, "language_loss": 0.81421208, "learning_rate": 3.2178833264492116e-06, "loss": 0.83647919, "num_input_tokens_seen": 55910635, "step": 2602, "time_per_iteration": 2.9061543941497803 }, { "auxiliary_loss_clip": 0.01205916, "auxiliary_loss_mlp": 0.01037422, "balance_loss_clip": 1.06021869, "balance_loss_mlp": 1.02765918, "epoch": 0.31299224433355377, "flos": 29896997800320.0, "grad_norm": 1.6687720766706502, "language_loss": 0.75985295, "learning_rate": 3.217265342086498e-06, "loss": 0.78228635, "num_input_tokens_seen": 55931125, "step": 2603, "time_per_iteration": 2.839503765106201 }, { "auxiliary_loss_clip": 0.01192473, "auxiliary_loss_mlp": 0.01058893, "balance_loss_clip": 1.05797434, "balance_loss_mlp": 1.02162409, "epoch": 0.3131124872241929, "flos": 11655886331520.0, "grad_norm": 3.1606812340980195, "language_loss": 0.73090994, "learning_rate": 3.216647173065599e-06, "loss": 0.75342357, "num_input_tokens_seen": 55946590, "step": 2604, "time_per_iteration": 2.820810079574585 }, { "auxiliary_loss_clip": 0.01195499, "auxiliary_loss_mlp": 0.01032367, "balance_loss_clip": 1.06267595, "balance_loss_mlp": 1.02314615, "epoch": 0.31323273011483194, "flos": 49848785470080.0, "grad_norm": 1.6961443603047672, "language_loss": 0.73845679, "learning_rate": 3.216028819480292e-06, "loss": 0.76073539, "num_input_tokens_seen": 55967930, "step": 2605, "time_per_iteration": 3.0833792686462402 }, { "auxiliary_loss_clip": 0.01184427, "auxiliary_loss_mlp": 0.01033473, "balance_loss_clip": 1.05918705, "balance_loss_mlp": 1.02437186, "epoch": 0.31335297300547105, "flos": 22601278667520.0, "grad_norm": 2.004539598119937, "language_loss": 0.75646114, "learning_rate": 3.2154102814243793e-06, "loss": 0.77864015, "num_input_tokens_seen": 55987070, "step": 2606, "time_per_iteration": 2.8063738346099854 }, { "auxiliary_loss_clip": 0.01197179, "auxiliary_loss_mlp": 0.0103369, "balance_loss_clip": 1.06045878, "balance_loss_mlp": 1.02452326, "epoch": 0.31347321589611016, "flos": 34710858708480.0, "grad_norm": 2.499184312835526, "language_loss": 0.66835797, "learning_rate": 3.2147915589916937e-06, "loss": 0.69066668, "num_input_tokens_seen": 56008630, "step": 2607, "time_per_iteration": 3.0355303287506104 }, { "auxiliary_loss_clip": 0.01193122, "auxiliary_loss_mlp": 0.01033328, "balance_loss_clip": 1.06024718, "balance_loss_mlp": 1.02363026, "epoch": 0.3135934587867492, "flos": 19755789108480.0, "grad_norm": 2.0577104012602163, "language_loss": 0.82656133, "learning_rate": 3.2141726522760938e-06, "loss": 0.84882587, "num_input_tokens_seen": 56026690, "step": 2608, "time_per_iteration": 2.7778117656707764 }, { "auxiliary_loss_clip": 0.01088498, "auxiliary_loss_mlp": 0.01008933, "balance_loss_clip": 1.01801777, "balance_loss_mlp": 1.00688291, "epoch": 0.3137137016773883, "flos": 65815535583360.0, "grad_norm": 0.7043483912393634, "language_loss": 0.52631378, "learning_rate": 3.213553561371469e-06, "loss": 0.54728806, "num_input_tokens_seen": 56090425, "step": 2609, "time_per_iteration": 3.4289348125457764 }, { "auxiliary_loss_clip": 0.01185808, "auxiliary_loss_mlp": 0.01031651, "balance_loss_clip": 1.05889344, "balance_loss_mlp": 1.02279949, "epoch": 0.31383394456802743, "flos": 16252739222400.0, "grad_norm": 2.2943777304221755, "language_loss": 0.95787126, "learning_rate": 3.212934286371733e-06, "loss": 0.98004586, "num_input_tokens_seen": 56107135, "step": 2610, "time_per_iteration": 2.8212504386901855 }, { "auxiliary_loss_clip": 0.01198435, "auxiliary_loss_mlp": 0.01036539, "balance_loss_clip": 1.06114113, "balance_loss_mlp": 1.02753854, "epoch": 0.3139541874586665, "flos": 38795517613440.0, "grad_norm": 3.3842869120622403, "language_loss": 0.83612108, "learning_rate": 3.2123148273708304e-06, "loss": 0.85847086, "num_input_tokens_seen": 56127325, "step": 2611, "time_per_iteration": 2.999971389770508 }, { "auxiliary_loss_clip": 0.01200081, "auxiliary_loss_mlp": 0.01031102, "balance_loss_clip": 1.05956078, "balance_loss_mlp": 1.02216172, "epoch": 0.3140744303493056, "flos": 25046328430080.0, "grad_norm": 2.79453351333914, "language_loss": 0.76768845, "learning_rate": 3.211695184462733e-06, "loss": 0.79000032, "num_input_tokens_seen": 56148500, "step": 2612, "time_per_iteration": 2.91485857963562 }, { "auxiliary_loss_clip": 0.01089772, "auxiliary_loss_mlp": 0.01007942, "balance_loss_clip": 1.0188067, "balance_loss_mlp": 1.00579631, "epoch": 0.3141946732399447, "flos": 72504254782080.0, "grad_norm": 0.8825304324350595, "language_loss": 0.60472274, "learning_rate": 3.2110753577414383e-06, "loss": 0.62569988, "num_input_tokens_seen": 56210080, "step": 2613, "time_per_iteration": 3.4460041522979736 }, { "auxiliary_loss_clip": 0.01196709, "auxiliary_loss_mlp": 0.01037571, "balance_loss_clip": 1.05745113, "balance_loss_mlp": 1.02821863, "epoch": 0.31431491613058377, "flos": 19239788280960.0, "grad_norm": 2.414957228922851, "language_loss": 0.78821516, "learning_rate": 3.2104553473009757e-06, "loss": 0.81055796, "num_input_tokens_seen": 56228200, "step": 2614, "time_per_iteration": 2.9715757369995117 }, { "auxiliary_loss_clip": 0.01182705, "auxiliary_loss_mlp": 0.01032032, "balance_loss_clip": 1.05613732, "balance_loss_mlp": 1.02260804, "epoch": 0.3144351590212229, "flos": 36210596290560.0, "grad_norm": 2.9760555049690334, "language_loss": 0.67672718, "learning_rate": 3.209835153235399e-06, "loss": 0.69887459, "num_input_tokens_seen": 56249755, "step": 2615, "time_per_iteration": 2.9940035343170166 }, { "auxiliary_loss_clip": 0.01185605, "auxiliary_loss_mlp": 0.01030657, "balance_loss_clip": 1.06159544, "balance_loss_mlp": 1.02233052, "epoch": 0.314555401911862, "flos": 18551740285440.0, "grad_norm": 2.124402830395289, "language_loss": 0.67481256, "learning_rate": 3.2092147756387916e-06, "loss": 0.69697517, "num_input_tokens_seen": 56270080, "step": 2616, "time_per_iteration": 2.7971644401550293 }, { "auxiliary_loss_clip": 0.01194193, "auxiliary_loss_mlp": 0.01031686, "balance_loss_clip": 1.06275201, "balance_loss_mlp": 1.02225661, "epoch": 0.31467564480250104, "flos": 16362877299840.0, "grad_norm": 2.1873074062674642, "language_loss": 0.83581352, "learning_rate": 3.208594214605264e-06, "loss": 0.85807228, "num_input_tokens_seen": 56288625, "step": 2617, "time_per_iteration": 2.86737322807312 }, { "auxiliary_loss_clip": 0.01185259, "auxiliary_loss_mlp": 0.01026902, "balance_loss_clip": 1.05871773, "balance_loss_mlp": 1.01832461, "epoch": 0.31479588769314015, "flos": 21652375127040.0, "grad_norm": 2.191873516578777, "language_loss": 0.77066541, "learning_rate": 3.2079734702289553e-06, "loss": 0.79278696, "num_input_tokens_seen": 56307520, "step": 2618, "time_per_iteration": 2.808680772781372 }, { "auxiliary_loss_clip": 0.01086538, "auxiliary_loss_mlp": 0.01030017, "balance_loss_clip": 1.01667261, "balance_loss_mlp": 1.00389981, "epoch": 0.3149161305837792, "flos": 66051072040320.0, "grad_norm": 0.8085898141167369, "language_loss": 0.6036669, "learning_rate": 3.207352542604031e-06, "loss": 0.62483245, "num_input_tokens_seen": 56369855, "step": 2619, "time_per_iteration": 4.405975341796875 }, { "auxiliary_loss_clip": 0.01184822, "auxiliary_loss_mlp": 0.01030019, "balance_loss_clip": 1.05935097, "balance_loss_mlp": 1.02167487, "epoch": 0.3150363734744183, "flos": 28987201192320.0, "grad_norm": 1.5970274132152396, "language_loss": 0.78230906, "learning_rate": 3.2067314318246864e-06, "loss": 0.80445743, "num_input_tokens_seen": 56390570, "step": 2620, "time_per_iteration": 2.9027812480926514 }, { "auxiliary_loss_clip": 0.01191957, "auxiliary_loss_mlp": 0.0103485, "balance_loss_clip": 1.05942035, "balance_loss_mlp": 1.02514017, "epoch": 0.31515661636505743, "flos": 27636600879360.0, "grad_norm": 1.784363311470096, "language_loss": 0.77737808, "learning_rate": 3.206110137985143e-06, "loss": 0.79964614, "num_input_tokens_seen": 56410775, "step": 2621, "time_per_iteration": 4.30575156211853 }, { "auxiliary_loss_clip": 0.01179862, "auxiliary_loss_mlp": 0.01042093, "balance_loss_clip": 1.05647469, "balance_loss_mlp": 1.03265798, "epoch": 0.3152768592556965, "flos": 24605632465920.0, "grad_norm": 2.2871508126819613, "language_loss": 0.92532581, "learning_rate": 3.2054886611796505e-06, "loss": 0.94754541, "num_input_tokens_seen": 56429770, "step": 2622, "time_per_iteration": 4.111226558685303 }, { "auxiliary_loss_clip": 0.0108715, "auxiliary_loss_mlp": 0.01003822, "balance_loss_clip": 1.01434672, "balance_loss_mlp": 1.00214088, "epoch": 0.3153971021463356, "flos": 68476908026880.0, "grad_norm": 0.8916539349217176, "language_loss": 0.6357435, "learning_rate": 3.204867001502487e-06, "loss": 0.65665317, "num_input_tokens_seen": 56488425, "step": 2623, "time_per_iteration": 4.313206911087036 }, { "auxiliary_loss_clip": 0.01201835, "auxiliary_loss_mlp": 0.01029108, "balance_loss_clip": 1.05928552, "balance_loss_mlp": 1.02088833, "epoch": 0.3155173450369747, "flos": 25593714766080.0, "grad_norm": 1.919432859887555, "language_loss": 0.80490959, "learning_rate": 3.2042451590479567e-06, "loss": 0.82721901, "num_input_tokens_seen": 56508940, "step": 2624, "time_per_iteration": 2.757880926132202 }, { "auxiliary_loss_clip": 0.01195202, "auxiliary_loss_mlp": 0.01029204, "balance_loss_clip": 1.05566359, "balance_loss_mlp": 1.02050149, "epoch": 0.31563758792761376, "flos": 24309333175680.0, "grad_norm": 1.9244220192844, "language_loss": 0.86991674, "learning_rate": 3.203623133910394e-06, "loss": 0.89216077, "num_input_tokens_seen": 56527245, "step": 2625, "time_per_iteration": 2.6984872817993164 }, { "auxiliary_loss_clip": 0.0118355, "auxiliary_loss_mlp": 0.01033933, "balance_loss_clip": 1.05507326, "balance_loss_mlp": 1.02507567, "epoch": 0.31575783081825287, "flos": 31903865550720.0, "grad_norm": 2.471797258046172, "language_loss": 0.77276945, "learning_rate": 3.203000926184158e-06, "loss": 0.79494417, "num_input_tokens_seen": 56546170, "step": 2626, "time_per_iteration": 2.9090583324432373 }, { "auxiliary_loss_clip": 0.01197355, "auxiliary_loss_mlp": 0.0102618, "balance_loss_clip": 1.0571146, "balance_loss_mlp": 1.01822901, "epoch": 0.315878073708892, "flos": 30810960385920.0, "grad_norm": 1.7632600012077513, "language_loss": 0.77548677, "learning_rate": 3.202378535963639e-06, "loss": 0.7977221, "num_input_tokens_seen": 56567085, "step": 2627, "time_per_iteration": 2.803398370742798 }, { "auxiliary_loss_clip": 0.01186628, "auxiliary_loss_mlp": 0.01056838, "balance_loss_clip": 1.05762815, "balance_loss_mlp": 1.01932073, "epoch": 0.31599831659953104, "flos": 22200264253440.0, "grad_norm": 2.2417786221140696, "language_loss": 0.83994114, "learning_rate": 3.2017559633432516e-06, "loss": 0.86237586, "num_input_tokens_seen": 56586715, "step": 2628, "time_per_iteration": 2.8750531673431396 }, { "auxiliary_loss_clip": 0.01201967, "auxiliary_loss_mlp": 0.01032929, "balance_loss_clip": 1.05849028, "balance_loss_mlp": 1.02333236, "epoch": 0.31611855949017015, "flos": 25593463370880.0, "grad_norm": 1.8774318524612539, "language_loss": 0.6624403, "learning_rate": 3.2011332084174398e-06, "loss": 0.6847893, "num_input_tokens_seen": 56607585, "step": 2629, "time_per_iteration": 2.959768056869507 }, { "auxiliary_loss_clip": 0.01193524, "auxiliary_loss_mlp": 0.01032821, "balance_loss_clip": 1.05645072, "balance_loss_mlp": 1.02383542, "epoch": 0.31623880238080926, "flos": 20594087694720.0, "grad_norm": 1.642074713207896, "language_loss": 0.89228415, "learning_rate": 3.2005102712806756e-06, "loss": 0.91454756, "num_input_tokens_seen": 56626415, "step": 2630, "time_per_iteration": 2.8221938610076904 }, { "auxiliary_loss_clip": 0.01202588, "auxiliary_loss_mlp": 0.01030658, "balance_loss_clip": 1.05885768, "balance_loss_mlp": 1.02168131, "epoch": 0.3163590452714483, "flos": 12784917600000.0, "grad_norm": 2.163480481413607, "language_loss": 0.73188508, "learning_rate": 3.1998871520274575e-06, "loss": 0.75421751, "num_input_tokens_seen": 56641750, "step": 2631, "time_per_iteration": 2.843618631362915 }, { "auxiliary_loss_clip": 0.01195647, "auxiliary_loss_mlp": 0.01032162, "balance_loss_clip": 1.05711138, "balance_loss_mlp": 1.02385926, "epoch": 0.3164792881620874, "flos": 23041292273280.0, "grad_norm": 2.074697933764765, "language_loss": 0.84738493, "learning_rate": 3.199263850752312e-06, "loss": 0.86966306, "num_input_tokens_seen": 56662585, "step": 2632, "time_per_iteration": 2.899770975112915 }, { "auxiliary_loss_clip": 0.01196288, "auxiliary_loss_mlp": 0.01034667, "balance_loss_clip": 1.05644584, "balance_loss_mlp": 1.02536321, "epoch": 0.31659953105272653, "flos": 18296271780480.0, "grad_norm": 2.250213616921107, "language_loss": 0.85496622, "learning_rate": 3.198640367549795e-06, "loss": 0.87727576, "num_input_tokens_seen": 56681480, "step": 2633, "time_per_iteration": 2.8270835876464844 }, { "auxiliary_loss_clip": 0.01196293, "auxiliary_loss_mlp": 0.01056549, "balance_loss_clip": 1.05692983, "balance_loss_mlp": 1.01948071, "epoch": 0.3167197739433656, "flos": 25703421880320.0, "grad_norm": 1.8059849745021892, "language_loss": 0.85954642, "learning_rate": 3.198016702514487e-06, "loss": 0.88207495, "num_input_tokens_seen": 56701760, "step": 2634, "time_per_iteration": 2.9036595821380615 }, { "auxiliary_loss_clip": 0.01196452, "auxiliary_loss_mlp": 0.01028812, "balance_loss_clip": 1.05644476, "balance_loss_mlp": 1.0204078, "epoch": 0.3168400168340047, "flos": 23546016230400.0, "grad_norm": 1.6655897822706052, "language_loss": 0.84441221, "learning_rate": 3.1973928557409972e-06, "loss": 0.86666483, "num_input_tokens_seen": 56719800, "step": 2635, "time_per_iteration": 2.887681722640991 }, { "auxiliary_loss_clip": 0.01194288, "auxiliary_loss_mlp": 0.01030906, "balance_loss_clip": 1.0555892, "balance_loss_mlp": 1.02259135, "epoch": 0.31696025972464376, "flos": 28366449327360.0, "grad_norm": 2.5871632459397946, "language_loss": 0.71149814, "learning_rate": 3.1967688273239636e-06, "loss": 0.7337501, "num_input_tokens_seen": 56739605, "step": 2636, "time_per_iteration": 2.9045417308807373 }, { "auxiliary_loss_clip": 0.01182438, "auxiliary_loss_mlp": 0.0103292, "balance_loss_clip": 1.05347252, "balance_loss_mlp": 1.02476001, "epoch": 0.31708050261528287, "flos": 16399111144320.0, "grad_norm": 2.181416673432395, "language_loss": 0.82253397, "learning_rate": 3.1961446173580503e-06, "loss": 0.84468758, "num_input_tokens_seen": 56756545, "step": 2637, "time_per_iteration": 2.876211404800415 }, { "auxiliary_loss_clip": 0.01187385, "auxiliary_loss_mlp": 0.01032739, "balance_loss_clip": 1.05736971, "balance_loss_mlp": 1.02472854, "epoch": 0.317200745505922, "flos": 26212347728640.0, "grad_norm": 1.7500576862417216, "language_loss": 0.76962411, "learning_rate": 3.1955202259379502e-06, "loss": 0.79182541, "num_input_tokens_seen": 56778275, "step": 2638, "time_per_iteration": 2.9407286643981934 }, { "auxiliary_loss_clip": 0.01194172, "auxiliary_loss_mlp": 0.01032613, "balance_loss_clip": 1.05695081, "balance_loss_mlp": 1.02458382, "epoch": 0.31732098839656103, "flos": 31350876693120.0, "grad_norm": 1.8158914523553609, "language_loss": 0.82838905, "learning_rate": 3.194895653158381e-06, "loss": 0.85065687, "num_input_tokens_seen": 56797215, "step": 2639, "time_per_iteration": 2.8554394245147705 }, { "auxiliary_loss_clip": 0.01087087, "auxiliary_loss_mlp": 0.01008589, "balance_loss_clip": 1.01531708, "balance_loss_mlp": 1.00688446, "epoch": 0.31744123128720014, "flos": 58989024835200.0, "grad_norm": 1.0996778598821522, "language_loss": 0.55539304, "learning_rate": 3.194270899114093e-06, "loss": 0.57634979, "num_input_tokens_seen": 56863010, "step": 2640, "time_per_iteration": 3.404146432876587 }, { "auxiliary_loss_clip": 0.01206084, "auxiliary_loss_mlp": 0.01034103, "balance_loss_clip": 1.06101155, "balance_loss_mlp": 1.0249716, "epoch": 0.31756147417783925, "flos": 17417573372160.0, "grad_norm": 4.849228975695159, "language_loss": 0.82237446, "learning_rate": 3.193645963899858e-06, "loss": 0.84477627, "num_input_tokens_seen": 56880625, "step": 2641, "time_per_iteration": 2.7760396003723145 }, { "auxiliary_loss_clip": 0.01188184, "auxiliary_loss_mlp": 0.01025438, "balance_loss_clip": 1.05791354, "balance_loss_mlp": 1.01697409, "epoch": 0.3176817170684783, "flos": 25481673267840.0, "grad_norm": 4.35064375216456, "language_loss": 0.83746445, "learning_rate": 3.193020847610479e-06, "loss": 0.85960072, "num_input_tokens_seen": 56900945, "step": 2642, "time_per_iteration": 2.765465021133423 }, { "auxiliary_loss_clip": 0.0118681, "auxiliary_loss_mlp": 0.01033507, "balance_loss_clip": 1.05913019, "balance_loss_mlp": 1.02479911, "epoch": 0.3178019599591174, "flos": 24972603765120.0, "grad_norm": 2.171266306851101, "language_loss": 0.70848727, "learning_rate": 3.192395550340787e-06, "loss": 0.73069048, "num_input_tokens_seen": 56918895, "step": 2643, "time_per_iteration": 2.837782382965088 }, { "auxiliary_loss_clip": 0.01192706, "auxiliary_loss_mlp": 0.01030772, "balance_loss_clip": 1.05701017, "balance_loss_mlp": 1.02246261, "epoch": 0.31792220284975653, "flos": 12422220019200.0, "grad_norm": 2.091375172485726, "language_loss": 0.76737857, "learning_rate": 3.191770072185638e-06, "loss": 0.78961337, "num_input_tokens_seen": 56935890, "step": 2644, "time_per_iteration": 2.7148563861846924 }, { "auxiliary_loss_clip": 0.01190186, "auxiliary_loss_mlp": 0.01025497, "balance_loss_clip": 1.05573297, "balance_loss_mlp": 1.0168308, "epoch": 0.3180424457403956, "flos": 15485759089920.0, "grad_norm": 3.0560517288711107, "language_loss": 0.72720778, "learning_rate": 3.191144413239916e-06, "loss": 0.74936461, "num_input_tokens_seen": 56952460, "step": 2645, "time_per_iteration": 3.671509027481079 }, { "auxiliary_loss_clip": 0.01194149, "auxiliary_loss_mlp": 0.0102795, "balance_loss_clip": 1.05920863, "balance_loss_mlp": 1.01863384, "epoch": 0.3181626886310347, "flos": 26174964648960.0, "grad_norm": 2.9958170684463625, "language_loss": 0.88449752, "learning_rate": 3.190518573598534e-06, "loss": 0.90671849, "num_input_tokens_seen": 56969065, "step": 2646, "time_per_iteration": 2.9442758560180664 }, { "auxiliary_loss_clip": 0.01197392, "auxiliary_loss_mlp": 0.01035244, "balance_loss_clip": 1.05950534, "balance_loss_mlp": 1.02620745, "epoch": 0.3182829315216738, "flos": 25483109811840.0, "grad_norm": 1.5753748794910456, "language_loss": 0.77622443, "learning_rate": 3.1898925533564308e-06, "loss": 0.79855084, "num_input_tokens_seen": 56990535, "step": 2647, "time_per_iteration": 3.8715083599090576 }, { "auxiliary_loss_clip": 0.01177937, "auxiliary_loss_mlp": 0.01031088, "balance_loss_clip": 1.05666375, "balance_loss_mlp": 1.02196205, "epoch": 0.31840317441231286, "flos": 18113701927680.0, "grad_norm": 2.247696025228797, "language_loss": 0.63955724, "learning_rate": 3.1892663526085733e-06, "loss": 0.6616475, "num_input_tokens_seen": 57008910, "step": 2648, "time_per_iteration": 2.8968286514282227 }, { "auxiliary_loss_clip": 0.01087997, "auxiliary_loss_mlp": 0.01002582, "balance_loss_clip": 1.01633143, "balance_loss_mlp": 1.00099623, "epoch": 0.31852341730295197, "flos": 64741948957440.0, "grad_norm": 0.7520061645600508, "language_loss": 0.5692451, "learning_rate": 3.188639971449956e-06, "loss": 0.59015083, "num_input_tokens_seen": 57074960, "step": 2649, "time_per_iteration": 4.50249719619751 }, { "auxiliary_loss_clip": 0.01199689, "auxiliary_loss_mlp": 0.01031104, "balance_loss_clip": 1.05725276, "balance_loss_mlp": 1.02224112, "epoch": 0.318643660193591, "flos": 20668135582080.0, "grad_norm": 1.8389794840093678, "language_loss": 0.72832161, "learning_rate": 3.1880134099756e-06, "loss": 0.75062954, "num_input_tokens_seen": 57094595, "step": 2650, "time_per_iteration": 3.764662504196167 }, { "auxiliary_loss_clip": 0.01191705, "auxiliary_loss_mlp": 0.010325, "balance_loss_clip": 1.05613577, "balance_loss_mlp": 1.02406621, "epoch": 0.31876390308423014, "flos": 26943345411840.0, "grad_norm": 2.0559551725271166, "language_loss": 0.69307798, "learning_rate": 3.1873866682805535e-06, "loss": 0.71531999, "num_input_tokens_seen": 57115290, "step": 2651, "time_per_iteration": 2.8993144035339355 }, { "auxiliary_loss_clip": 0.0119364, "auxiliary_loss_mlp": 0.01029917, "balance_loss_clip": 1.05578256, "balance_loss_mlp": 1.02113152, "epoch": 0.31888414597486925, "flos": 18041916597120.0, "grad_norm": 4.418893043495418, "language_loss": 0.88541806, "learning_rate": 3.186759746459894e-06, "loss": 0.90765363, "num_input_tokens_seen": 57134400, "step": 2652, "time_per_iteration": 2.708582639694214 }, { "auxiliary_loss_clip": 0.01189508, "auxiliary_loss_mlp": 0.01028473, "balance_loss_clip": 1.05705523, "balance_loss_mlp": 1.02003288, "epoch": 0.3190043888655083, "flos": 25149319701120.0, "grad_norm": 1.8538469969624471, "language_loss": 0.79197299, "learning_rate": 3.1861326446087246e-06, "loss": 0.81415284, "num_input_tokens_seen": 57153140, "step": 2653, "time_per_iteration": 2.702558994293213 }, { "auxiliary_loss_clip": 0.01194582, "auxiliary_loss_mlp": 0.01028163, "balance_loss_clip": 1.05514646, "balance_loss_mlp": 1.01991081, "epoch": 0.3191246317561474, "flos": 22053892331520.0, "grad_norm": 2.3159204357120626, "language_loss": 0.71873569, "learning_rate": 3.1855053628221763e-06, "loss": 0.7409631, "num_input_tokens_seen": 57172395, "step": 2654, "time_per_iteration": 2.7378756999969482 }, { "auxiliary_loss_clip": 0.01182862, "auxiliary_loss_mlp": 0.01032117, "balance_loss_clip": 1.05718899, "balance_loss_mlp": 1.02245462, "epoch": 0.3192448746467865, "flos": 14901815687040.0, "grad_norm": 2.4922518218755063, "language_loss": 0.89812887, "learning_rate": 3.184877901195407e-06, "loss": 0.92027861, "num_input_tokens_seen": 57189090, "step": 2655, "time_per_iteration": 2.687068223953247 }, { "auxiliary_loss_clip": 0.01098907, "auxiliary_loss_mlp": 0.01015055, "balance_loss_clip": 1.03242683, "balance_loss_mlp": 1.01345754, "epoch": 0.3193651175374256, "flos": 67234832657280.0, "grad_norm": 0.7938150090741031, "language_loss": 0.62811136, "learning_rate": 3.184250259823602e-06, "loss": 0.64925098, "num_input_tokens_seen": 57251620, "step": 2656, "time_per_iteration": 3.4343454837799072 }, { "auxiliary_loss_clip": 0.01189054, "auxiliary_loss_mlp": 0.01033755, "balance_loss_clip": 1.05596721, "balance_loss_mlp": 1.02371192, "epoch": 0.3194853604280647, "flos": 12233077977600.0, "grad_norm": 2.2909096336154118, "language_loss": 0.81902599, "learning_rate": 3.183622438801974e-06, "loss": 0.84125412, "num_input_tokens_seen": 57266910, "step": 2657, "time_per_iteration": 2.843163251876831 }, { "auxiliary_loss_clip": 0.01200616, "auxiliary_loss_mlp": 0.01030887, "balance_loss_clip": 1.0603379, "balance_loss_mlp": 1.02219629, "epoch": 0.3196056033187038, "flos": 14939917038720.0, "grad_norm": 1.809919608785458, "language_loss": 0.7512309, "learning_rate": 3.1829944382257637e-06, "loss": 0.77354592, "num_input_tokens_seen": 57285040, "step": 2658, "time_per_iteration": 2.7255859375 }, { "auxiliary_loss_clip": 0.01190839, "auxiliary_loss_mlp": 0.01028056, "balance_loss_clip": 1.05674303, "balance_loss_mlp": 1.01913941, "epoch": 0.31972584620934286, "flos": 23768878164480.0, "grad_norm": 2.1059072835790777, "language_loss": 0.81611151, "learning_rate": 3.1823662581902373e-06, "loss": 0.83830041, "num_input_tokens_seen": 57302725, "step": 2659, "time_per_iteration": 2.885552406311035 }, { "auxiliary_loss_clip": 0.01179659, "auxiliary_loss_mlp": 0.0103387, "balance_loss_clip": 1.05656385, "balance_loss_mlp": 1.02515543, "epoch": 0.31984608909998197, "flos": 21251540280960.0, "grad_norm": 2.166776581998663, "language_loss": 0.73704648, "learning_rate": 3.1817378987906896e-06, "loss": 0.75918186, "num_input_tokens_seen": 57322230, "step": 2660, "time_per_iteration": 2.780170440673828 }, { "auxiliary_loss_clip": 0.01181867, "auxiliary_loss_mlp": 0.01029328, "balance_loss_clip": 1.05833554, "balance_loss_mlp": 1.02061379, "epoch": 0.3199663319906211, "flos": 18296235866880.0, "grad_norm": 2.9743629873641204, "language_loss": 0.80140889, "learning_rate": 3.181109360122442e-06, "loss": 0.82352084, "num_input_tokens_seen": 57339820, "step": 2661, "time_per_iteration": 2.9026167392730713 }, { "auxiliary_loss_clip": 0.01185529, "auxiliary_loss_mlp": 0.01033218, "balance_loss_clip": 1.05742288, "balance_loss_mlp": 1.0238719, "epoch": 0.32008657488126013, "flos": 18733627779840.0, "grad_norm": 3.1149279782203374, "language_loss": 0.78592885, "learning_rate": 3.1804806422808445e-06, "loss": 0.80811632, "num_input_tokens_seen": 57356955, "step": 2662, "time_per_iteration": 2.8442225456237793 }, { "auxiliary_loss_clip": 0.01186382, "auxiliary_loss_mlp": 0.01030816, "balance_loss_clip": 1.05970645, "balance_loss_mlp": 1.02155328, "epoch": 0.32020681777189924, "flos": 20595344670720.0, "grad_norm": 1.7750996675631467, "language_loss": 0.7333535, "learning_rate": 3.1798517453612714e-06, "loss": 0.75552547, "num_input_tokens_seen": 57376760, "step": 2663, "time_per_iteration": 2.843442440032959 }, { "auxiliary_loss_clip": 0.01190265, "auxiliary_loss_mlp": 0.01033785, "balance_loss_clip": 1.05794358, "balance_loss_mlp": 1.02560687, "epoch": 0.32032706066253835, "flos": 35261692750080.0, "grad_norm": 1.925482848832825, "language_loss": 0.75835168, "learning_rate": 3.1792226694591265e-06, "loss": 0.78059208, "num_input_tokens_seen": 57398145, "step": 2664, "time_per_iteration": 2.932906150817871 }, { "auxiliary_loss_clip": 0.01185733, "auxiliary_loss_mlp": 0.01029788, "balance_loss_clip": 1.05744815, "balance_loss_mlp": 1.02149677, "epoch": 0.3204473035531774, "flos": 15304230731520.0, "grad_norm": 2.157175772023635, "language_loss": 0.80604529, "learning_rate": 3.178593414669841e-06, "loss": 0.82820058, "num_input_tokens_seen": 57416730, "step": 2665, "time_per_iteration": 2.9260385036468506 }, { "auxiliary_loss_clip": 0.01196926, "auxiliary_loss_mlp": 0.0103674, "balance_loss_clip": 1.05790067, "balance_loss_mlp": 1.02703667, "epoch": 0.3205675464438165, "flos": 24462564595200.0, "grad_norm": 4.406367696575101, "language_loss": 0.70767033, "learning_rate": 3.1779639810888707e-06, "loss": 0.73000705, "num_input_tokens_seen": 57436325, "step": 2666, "time_per_iteration": 2.837160348892212 }, { "auxiliary_loss_clip": 0.01192045, "auxiliary_loss_mlp": 0.01026651, "balance_loss_clip": 1.05701852, "balance_loss_mlp": 1.01794922, "epoch": 0.3206877893344556, "flos": 22456235548800.0, "grad_norm": 2.158986103643617, "language_loss": 0.76577765, "learning_rate": 3.1773343688117013e-06, "loss": 0.78796458, "num_input_tokens_seen": 57457235, "step": 2667, "time_per_iteration": 2.771047592163086 }, { "auxiliary_loss_clip": 0.0119564, "auxiliary_loss_mlp": 0.01059599, "balance_loss_clip": 1.05903375, "balance_loss_mlp": 1.02160287, "epoch": 0.3208080322250947, "flos": 20412236113920.0, "grad_norm": 2.38676491865224, "language_loss": 0.83878022, "learning_rate": 3.1767045779338445e-06, "loss": 0.86133265, "num_input_tokens_seen": 57474895, "step": 2668, "time_per_iteration": 2.8640193939208984 }, { "auxiliary_loss_clip": 0.01193481, "auxiliary_loss_mlp": 0.01028511, "balance_loss_clip": 1.05624926, "balance_loss_mlp": 1.01983237, "epoch": 0.3209282751157338, "flos": 21762118154880.0, "grad_norm": 3.009127989645683, "language_loss": 0.91297567, "learning_rate": 3.176074608550839e-06, "loss": 0.93519557, "num_input_tokens_seen": 57490715, "step": 2669, "time_per_iteration": 2.754361152648926 }, { "auxiliary_loss_clip": 0.01185606, "auxiliary_loss_mlp": 0.01034899, "balance_loss_clip": 1.06033456, "balance_loss_mlp": 1.02649426, "epoch": 0.32104851800637285, "flos": 22055041566720.0, "grad_norm": 2.577266244981675, "language_loss": 0.82459712, "learning_rate": 3.17544446075825e-06, "loss": 0.84680218, "num_input_tokens_seen": 57509880, "step": 2670, "time_per_iteration": 2.8754942417144775 }, { "auxiliary_loss_clip": 0.01194532, "auxiliary_loss_mlp": 0.01028192, "balance_loss_clip": 1.05443311, "balance_loss_mlp": 1.01948333, "epoch": 0.32116876089701196, "flos": 37012301896320.0, "grad_norm": 1.5294957070305288, "language_loss": 0.70760655, "learning_rate": 3.174814134651671e-06, "loss": 0.72983378, "num_input_tokens_seen": 57532430, "step": 2671, "time_per_iteration": 3.768653392791748 }, { "auxiliary_loss_clip": 0.01193315, "auxiliary_loss_mlp": 0.01028195, "balance_loss_clip": 1.05556154, "balance_loss_mlp": 1.02037835, "epoch": 0.3212890037876511, "flos": 21979233912960.0, "grad_norm": 3.7510959932168157, "language_loss": 0.80212259, "learning_rate": 3.1741836303267215e-06, "loss": 0.82433772, "num_input_tokens_seen": 57551965, "step": 2672, "time_per_iteration": 2.6747682094573975 }, { "auxiliary_loss_clip": 0.01194584, "auxiliary_loss_mlp": 0.0103406, "balance_loss_clip": 1.05595732, "balance_loss_mlp": 1.02543497, "epoch": 0.32140924667829013, "flos": 10342345875840.0, "grad_norm": 1.9168634403843035, "language_loss": 0.75184584, "learning_rate": 3.1735529478790496e-06, "loss": 0.77413237, "num_input_tokens_seen": 57569955, "step": 2673, "time_per_iteration": 3.7220444679260254 }, { "auxiliary_loss_clip": 0.0119538, "auxiliary_loss_mlp": 0.01036075, "balance_loss_clip": 1.05729401, "balance_loss_mlp": 1.02677107, "epoch": 0.32152948956892924, "flos": 50798910072960.0, "grad_norm": 2.1920902905634922, "language_loss": 0.7963953, "learning_rate": 3.172922087404328e-06, "loss": 0.81870985, "num_input_tokens_seen": 57592215, "step": 2674, "time_per_iteration": 2.9976818561553955 }, { "auxiliary_loss_clip": 0.01090116, "auxiliary_loss_mlp": 0.01003799, "balance_loss_clip": 1.01868653, "balance_loss_mlp": 1.00211799, "epoch": 0.32164973245956835, "flos": 63863250549120.0, "grad_norm": 0.7679265630459085, "language_loss": 0.55253643, "learning_rate": 3.1722910489982586e-06, "loss": 0.57347554, "num_input_tokens_seen": 57652575, "step": 2675, "time_per_iteration": 4.539612770080566 }, { "auxiliary_loss_clip": 0.01189807, "auxiliary_loss_mlp": 0.01029977, "balance_loss_clip": 1.0582273, "balance_loss_mlp": 1.0206368, "epoch": 0.3217699753502074, "flos": 23513948363520.0, "grad_norm": 1.511987957717962, "language_loss": 0.79993701, "learning_rate": 3.1716598327565694e-06, "loss": 0.82213485, "num_input_tokens_seen": 57672215, "step": 2676, "time_per_iteration": 3.646527051925659 }, { "auxiliary_loss_clip": 0.01193032, "auxiliary_loss_mlp": 0.01027852, "balance_loss_clip": 1.05475283, "balance_loss_mlp": 1.01949501, "epoch": 0.3218902182408465, "flos": 19062533640960.0, "grad_norm": 2.2682004932054967, "language_loss": 0.84145552, "learning_rate": 3.171028438775015e-06, "loss": 0.86366433, "num_input_tokens_seen": 57691410, "step": 2677, "time_per_iteration": 2.794811964035034 }, { "auxiliary_loss_clip": 0.01193953, "auxiliary_loss_mlp": 0.01025727, "balance_loss_clip": 1.05517101, "balance_loss_mlp": 1.01745963, "epoch": 0.3220104611314856, "flos": 20375571306240.0, "grad_norm": 1.8061964955537104, "language_loss": 0.84059215, "learning_rate": 3.170396867149377e-06, "loss": 0.86278892, "num_input_tokens_seen": 57709415, "step": 2678, "time_per_iteration": 2.675610065460205 }, { "auxiliary_loss_clip": 0.01177226, "auxiliary_loss_mlp": 0.01033409, "balance_loss_clip": 1.0592531, "balance_loss_mlp": 1.02439678, "epoch": 0.3221307040221247, "flos": 20117014231680.0, "grad_norm": 1.9830440542792467, "language_loss": 0.86798775, "learning_rate": 3.1697651179754653e-06, "loss": 0.8900941, "num_input_tokens_seen": 57728075, "step": 2679, "time_per_iteration": 2.8678500652313232 }, { "auxiliary_loss_clip": 0.01187833, "auxiliary_loss_mlp": 0.01032512, "balance_loss_clip": 1.05922091, "balance_loss_mlp": 1.02453995, "epoch": 0.3222509469127638, "flos": 23987789602560.0, "grad_norm": 1.6452832112349756, "language_loss": 0.72771704, "learning_rate": 3.1691331913491153e-06, "loss": 0.74992049, "num_input_tokens_seen": 57750645, "step": 2680, "time_per_iteration": 2.9088051319122314 }, { "auxiliary_loss_clip": 0.01199299, "auxiliary_loss_mlp": 0.01037229, "balance_loss_clip": 1.05775774, "balance_loss_mlp": 1.02816927, "epoch": 0.32237118980340285, "flos": 17675735397120.0, "grad_norm": 2.2221220874630063, "language_loss": 0.84585345, "learning_rate": 3.1685010873661898e-06, "loss": 0.86821878, "num_input_tokens_seen": 57769820, "step": 2681, "time_per_iteration": 2.7514662742614746 }, { "auxiliary_loss_clip": 0.01191368, "auxiliary_loss_mlp": 0.01026474, "balance_loss_clip": 1.05781579, "balance_loss_mlp": 1.01700234, "epoch": 0.32249143269404196, "flos": 23147982645120.0, "grad_norm": 2.1409648972562065, "language_loss": 0.79547405, "learning_rate": 3.167868806122578e-06, "loss": 0.81765246, "num_input_tokens_seen": 57788870, "step": 2682, "time_per_iteration": 2.810431957244873 }, { "auxiliary_loss_clip": 0.01192587, "auxiliary_loss_mlp": 0.01031727, "balance_loss_clip": 1.05732918, "balance_loss_mlp": 1.02250648, "epoch": 0.32261167558468107, "flos": 24422308427520.0, "grad_norm": 1.7849762204298152, "language_loss": 0.65707535, "learning_rate": 3.1672363477141968e-06, "loss": 0.67931855, "num_input_tokens_seen": 57808165, "step": 2683, "time_per_iteration": 2.7280263900756836 }, { "auxiliary_loss_clip": 0.01195104, "auxiliary_loss_mlp": 0.01033248, "balance_loss_clip": 1.05599916, "balance_loss_mlp": 1.02430129, "epoch": 0.3227319184753201, "flos": 30367175852160.0, "grad_norm": 2.1814713078732275, "language_loss": 0.84925294, "learning_rate": 3.1666037122369903e-06, "loss": 0.87153649, "num_input_tokens_seen": 57828825, "step": 2684, "time_per_iteration": 2.851789712905884 }, { "auxiliary_loss_clip": 0.01192692, "auxiliary_loss_mlp": 0.01030508, "balance_loss_clip": 1.05664539, "balance_loss_mlp": 1.02169275, "epoch": 0.32285216136595923, "flos": 16946174257920.0, "grad_norm": 2.0474976702396313, "language_loss": 0.86539167, "learning_rate": 3.165970899786928e-06, "loss": 0.88762367, "num_input_tokens_seen": 57846740, "step": 2685, "time_per_iteration": 2.718984365463257 }, { "auxiliary_loss_clip": 0.01186656, "auxiliary_loss_mlp": 0.01031797, "balance_loss_clip": 1.05391514, "balance_loss_mlp": 1.02295196, "epoch": 0.32297240425659834, "flos": 21981532383360.0, "grad_norm": 1.8188107997904324, "language_loss": 0.75224382, "learning_rate": 3.1653379104600067e-06, "loss": 0.77442831, "num_input_tokens_seen": 57866885, "step": 2686, "time_per_iteration": 2.861259937286377 }, { "auxiliary_loss_clip": 0.01196323, "auxiliary_loss_mlp": 0.01037458, "balance_loss_clip": 1.05872941, "balance_loss_mlp": 1.02879786, "epoch": 0.3230926471472374, "flos": 22748045639040.0, "grad_norm": 1.5577883250633693, "language_loss": 0.69173843, "learning_rate": 3.164704744352251e-06, "loss": 0.71407628, "num_input_tokens_seen": 57887690, "step": 2687, "time_per_iteration": 2.7560641765594482 }, { "auxiliary_loss_clip": 0.01190271, "auxiliary_loss_mlp": 0.01026778, "balance_loss_clip": 1.05443561, "balance_loss_mlp": 1.01774836, "epoch": 0.3232128900378765, "flos": 16942977947520.0, "grad_norm": 1.8128943072226724, "language_loss": 0.80553794, "learning_rate": 3.164071401559713e-06, "loss": 0.82770848, "num_input_tokens_seen": 57905090, "step": 2688, "time_per_iteration": 2.794619083404541 }, { "auxiliary_loss_clip": 0.01192971, "auxiliary_loss_mlp": 0.01037172, "balance_loss_clip": 1.05716491, "balance_loss_mlp": 1.02844, "epoch": 0.3233331329285156, "flos": 24023736138240.0, "grad_norm": 2.110257940291789, "language_loss": 0.71158159, "learning_rate": 3.1634378821784674e-06, "loss": 0.73388302, "num_input_tokens_seen": 57925305, "step": 2689, "time_per_iteration": 2.799807548522949 }, { "auxiliary_loss_clip": 0.01185823, "auxiliary_loss_mlp": 0.01035762, "balance_loss_clip": 1.05773735, "balance_loss_mlp": 1.02662516, "epoch": 0.3234533758191547, "flos": 18113845582080.0, "grad_norm": 2.6735009595469488, "language_loss": 0.74175692, "learning_rate": 3.1628041863046208e-06, "loss": 0.76397282, "num_input_tokens_seen": 57942720, "step": 2690, "time_per_iteration": 2.707960844039917 }, { "auxiliary_loss_clip": 0.01200166, "auxiliary_loss_mlp": 0.01034746, "balance_loss_clip": 1.05603719, "balance_loss_mlp": 1.02463746, "epoch": 0.3235736187097938, "flos": 16946138344320.0, "grad_norm": 2.1717321186136362, "language_loss": 0.91327846, "learning_rate": 3.162170314034304e-06, "loss": 0.93562758, "num_input_tokens_seen": 57960135, "step": 2691, "time_per_iteration": 2.7368123531341553 }, { "auxiliary_loss_clip": 0.01197554, "auxiliary_loss_mlp": 0.01036272, "balance_loss_clip": 1.05604553, "balance_loss_mlp": 1.02680087, "epoch": 0.3236938616004329, "flos": 22127150119680.0, "grad_norm": 1.5743494241329927, "language_loss": 0.80737066, "learning_rate": 3.1615362654636738e-06, "loss": 0.82970893, "num_input_tokens_seen": 57980875, "step": 2692, "time_per_iteration": 2.7244112491607666 }, { "auxiliary_loss_clip": 0.01180648, "auxiliary_loss_mlp": 0.01035472, "balance_loss_clip": 1.05823755, "balance_loss_mlp": 1.02759182, "epoch": 0.32381410449107195, "flos": 17164618819200.0, "grad_norm": 1.8552171402179638, "language_loss": 0.8742696, "learning_rate": 3.1609020406889163e-06, "loss": 0.89643079, "num_input_tokens_seen": 57998310, "step": 2693, "time_per_iteration": 2.821662664413452 }, { "auxiliary_loss_clip": 0.0119664, "auxiliary_loss_mlp": 0.01037201, "balance_loss_clip": 1.05963993, "balance_loss_mlp": 1.02820683, "epoch": 0.32393434738171106, "flos": 16578125550720.0, "grad_norm": 1.6679519205710949, "language_loss": 0.84902036, "learning_rate": 3.1602676398062416e-06, "loss": 0.87135875, "num_input_tokens_seen": 58017220, "step": 2694, "time_per_iteration": 2.7377429008483887 }, { "auxiliary_loss_clip": 0.01191886, "auxiliary_loss_mlp": 0.01031495, "balance_loss_clip": 1.05539012, "balance_loss_mlp": 1.02262568, "epoch": 0.3240545902723502, "flos": 25483612602240.0, "grad_norm": 7.314631658951018, "language_loss": 0.61423385, "learning_rate": 3.1596330629118886e-06, "loss": 0.6364677, "num_input_tokens_seen": 58037190, "step": 2695, "time_per_iteration": 2.813002824783325 }, { "auxiliary_loss_clip": 0.01177588, "auxiliary_loss_mlp": 0.01035012, "balance_loss_clip": 1.05760098, "balance_loss_mlp": 1.02613044, "epoch": 0.32417483316298923, "flos": 35845851634560.0, "grad_norm": 3.299088901913642, "language_loss": 0.73130083, "learning_rate": 3.1589983101021223e-06, "loss": 0.75342679, "num_input_tokens_seen": 58055820, "step": 2696, "time_per_iteration": 2.9873275756835938 }, { "auxiliary_loss_clip": 0.01189605, "auxiliary_loss_mlp": 0.01031296, "balance_loss_clip": 1.05460858, "balance_loss_mlp": 1.02290964, "epoch": 0.32429507605362834, "flos": 30080501406720.0, "grad_norm": 2.1967278478697625, "language_loss": 0.84813643, "learning_rate": 3.1583633814732337e-06, "loss": 0.87034547, "num_input_tokens_seen": 58075340, "step": 2697, "time_per_iteration": 3.733417510986328 }, { "auxiliary_loss_clip": 0.01195868, "auxiliary_loss_mlp": 0.01029731, "balance_loss_clip": 1.05503607, "balance_loss_mlp": 1.02108836, "epoch": 0.3244153189442674, "flos": 18223265387520.0, "grad_norm": 2.8807672269956033, "language_loss": 0.71666622, "learning_rate": 3.157728277121541e-06, "loss": 0.73892224, "num_input_tokens_seen": 58093515, "step": 2698, "time_per_iteration": 2.7706308364868164 }, { "auxiliary_loss_clip": 0.01196028, "auxiliary_loss_mlp": 0.01026795, "balance_loss_clip": 1.05442095, "balance_loss_mlp": 1.01791406, "epoch": 0.3245355618349065, "flos": 17710317216000.0, "grad_norm": 2.4510557277067906, "language_loss": 0.78660834, "learning_rate": 3.1570929971433897e-06, "loss": 0.80883652, "num_input_tokens_seen": 58109300, "step": 2699, "time_per_iteration": 2.6750290393829346 }, { "auxiliary_loss_clip": 0.01195401, "auxiliary_loss_mlp": 0.01034495, "balance_loss_clip": 1.05960107, "balance_loss_mlp": 1.02510142, "epoch": 0.3246558047255456, "flos": 23440798316160.0, "grad_norm": 2.014589613394052, "language_loss": 0.83680892, "learning_rate": 3.1564575416351504e-06, "loss": 0.85910785, "num_input_tokens_seen": 58128000, "step": 2700, "time_per_iteration": 3.772429943084717 }, { "auxiliary_loss_clip": 0.01198825, "auxiliary_loss_mlp": 0.01034408, "balance_loss_clip": 1.05719757, "balance_loss_mlp": 1.02504444, "epoch": 0.32477604761618467, "flos": 21760861178880.0, "grad_norm": 1.8703497827480093, "language_loss": 0.74477828, "learning_rate": 3.155821910693221e-06, "loss": 0.76711059, "num_input_tokens_seen": 58147415, "step": 2701, "time_per_iteration": 3.6489346027374268 }, { "auxiliary_loss_clip": 0.01188408, "auxiliary_loss_mlp": 0.01029977, "balance_loss_clip": 1.05551338, "balance_loss_mlp": 1.02096498, "epoch": 0.3248962905068238, "flos": 19828328624640.0, "grad_norm": 2.7771769667869015, "language_loss": 0.85800445, "learning_rate": 3.1551861044140275e-06, "loss": 0.88018823, "num_input_tokens_seen": 58167050, "step": 2702, "time_per_iteration": 3.7988908290863037 }, { "auxiliary_loss_clip": 0.01175138, "auxiliary_loss_mlp": 0.01030949, "balance_loss_clip": 1.0561341, "balance_loss_mlp": 1.02160907, "epoch": 0.3250165333974629, "flos": 23948215793280.0, "grad_norm": 2.4473055346407753, "language_loss": 0.77414137, "learning_rate": 3.15455012289402e-06, "loss": 0.79620224, "num_input_tokens_seen": 58186695, "step": 2703, "time_per_iteration": 2.839635133743286 }, { "auxiliary_loss_clip": 0.01194695, "auxiliary_loss_mlp": 0.01031646, "balance_loss_clip": 1.05899239, "balance_loss_mlp": 1.02280664, "epoch": 0.32513677628810195, "flos": 23989333887360.0, "grad_norm": 1.9665301446276064, "language_loss": 0.84324634, "learning_rate": 3.153913966229677e-06, "loss": 0.86550975, "num_input_tokens_seen": 58205815, "step": 2704, "time_per_iteration": 2.7416703701019287 }, { "auxiliary_loss_clip": 0.01089453, "auxiliary_loss_mlp": 0.01004098, "balance_loss_clip": 1.01848102, "balance_loss_mlp": 1.00263786, "epoch": 0.32525701917874106, "flos": 70655790009600.0, "grad_norm": 0.6402101825389757, "language_loss": 0.50253117, "learning_rate": 3.1532776345175027e-06, "loss": 0.52346671, "num_input_tokens_seen": 58270960, "step": 2705, "time_per_iteration": 3.254603862762451 }, { "auxiliary_loss_clip": 0.01192647, "auxiliary_loss_mlp": 0.01037456, "balance_loss_clip": 1.05519629, "balance_loss_mlp": 1.02867007, "epoch": 0.32537726206938017, "flos": 19682639061120.0, "grad_norm": 1.9637708241269745, "language_loss": 0.78829336, "learning_rate": 3.1526411278540285e-06, "loss": 0.81059438, "num_input_tokens_seen": 58289390, "step": 2706, "time_per_iteration": 2.7603979110717773 }, { "auxiliary_loss_clip": 0.01196981, "auxiliary_loss_mlp": 0.01032467, "balance_loss_clip": 1.0569427, "balance_loss_mlp": 1.02376509, "epoch": 0.3254975049600192, "flos": 28760999293440.0, "grad_norm": 2.1229848877805613, "language_loss": 0.81097901, "learning_rate": 3.1520044463358116e-06, "loss": 0.83327353, "num_input_tokens_seen": 58306120, "step": 2707, "time_per_iteration": 2.793320894241333 }, { "auxiliary_loss_clip": 0.01186749, "auxiliary_loss_mlp": 0.0103338, "balance_loss_clip": 1.05403614, "balance_loss_mlp": 1.02460003, "epoch": 0.32561774785065833, "flos": 18877378008960.0, "grad_norm": 1.6693929667544025, "language_loss": 0.8032077, "learning_rate": 3.151367590059436e-06, "loss": 0.82540894, "num_input_tokens_seen": 58324545, "step": 2708, "time_per_iteration": 2.7278215885162354 }, { "auxiliary_loss_clip": 0.01194234, "auxiliary_loss_mlp": 0.01053411, "balance_loss_clip": 1.05485463, "balance_loss_mlp": 1.01580536, "epoch": 0.32573799074129745, "flos": 23112107936640.0, "grad_norm": 3.3900760026272376, "language_loss": 0.86862659, "learning_rate": 3.1507305591215117e-06, "loss": 0.89110309, "num_input_tokens_seen": 58342455, "step": 2709, "time_per_iteration": 2.729757308959961 }, { "auxiliary_loss_clip": 0.01085994, "auxiliary_loss_mlp": 0.01004648, "balance_loss_clip": 1.01541328, "balance_loss_mlp": 1.003003, "epoch": 0.3258582336319365, "flos": 71237650423680.0, "grad_norm": 0.6708130212019862, "language_loss": 0.55737925, "learning_rate": 3.150093353618677e-06, "loss": 0.57828569, "num_input_tokens_seen": 58407185, "step": 2710, "time_per_iteration": 3.344364643096924 }, { "auxiliary_loss_clip": 0.01196388, "auxiliary_loss_mlp": 0.01037207, "balance_loss_clip": 1.05516529, "balance_loss_mlp": 1.02801037, "epoch": 0.3259784765225756, "flos": 22456020067200.0, "grad_norm": 3.356575915877832, "language_loss": 0.87853855, "learning_rate": 3.149455973647596e-06, "loss": 0.9008745, "num_input_tokens_seen": 58425245, "step": 2711, "time_per_iteration": 2.901442050933838 }, { "auxiliary_loss_clip": 0.01177384, "auxiliary_loss_mlp": 0.01033479, "balance_loss_clip": 1.05232024, "balance_loss_mlp": 1.02432406, "epoch": 0.32609871941321467, "flos": 20484811543680.0, "grad_norm": 2.0242556934407183, "language_loss": 0.7675395, "learning_rate": 3.1488184193049563e-06, "loss": 0.78964812, "num_input_tokens_seen": 58444780, "step": 2712, "time_per_iteration": 2.9856340885162354 }, { "auxiliary_loss_clip": 0.01191227, "auxiliary_loss_mlp": 0.01029774, "balance_loss_clip": 1.0539093, "balance_loss_mlp": 1.02177501, "epoch": 0.3262189623038538, "flos": 22416805393920.0, "grad_norm": 1.88101489741556, "language_loss": 0.72220469, "learning_rate": 3.1481806906874767e-06, "loss": 0.74441475, "num_input_tokens_seen": 58466090, "step": 2713, "time_per_iteration": 2.8209316730499268 }, { "auxiliary_loss_clip": 0.01192609, "auxiliary_loss_mlp": 0.01027801, "balance_loss_clip": 1.0549556, "balance_loss_mlp": 1.01967072, "epoch": 0.3263392051944929, "flos": 20923496346240.0, "grad_norm": 3.5791638245119892, "language_loss": 0.8794145, "learning_rate": 3.147542787891899e-06, "loss": 0.90161866, "num_input_tokens_seen": 58485435, "step": 2714, "time_per_iteration": 2.7927908897399902 }, { "auxiliary_loss_clip": 0.01185818, "auxiliary_loss_mlp": 0.01032517, "balance_loss_clip": 1.05430436, "balance_loss_mlp": 1.02349889, "epoch": 0.32645944808513194, "flos": 24025172682240.0, "grad_norm": 1.8366338203294554, "language_loss": 0.75456977, "learning_rate": 3.1469047110149926e-06, "loss": 0.77675319, "num_input_tokens_seen": 58504175, "step": 2715, "time_per_iteration": 2.812424898147583 }, { "auxiliary_loss_clip": 0.01176585, "auxiliary_loss_mlp": 0.01033296, "balance_loss_clip": 1.05576158, "balance_loss_mlp": 1.02420664, "epoch": 0.32657969097577105, "flos": 21032413361280.0, "grad_norm": 2.1910494071236326, "language_loss": 0.85604495, "learning_rate": 3.146266460153554e-06, "loss": 0.87814379, "num_input_tokens_seen": 58523885, "step": 2716, "time_per_iteration": 2.9180784225463867 }, { "auxiliary_loss_clip": 0.01188034, "auxiliary_loss_mlp": 0.01061481, "balance_loss_clip": 1.0569222, "balance_loss_mlp": 1.02423513, "epoch": 0.32669993386641016, "flos": 22710267509760.0, "grad_norm": 1.7136890347538034, "language_loss": 0.80205846, "learning_rate": 3.145628035404404e-06, "loss": 0.82455361, "num_input_tokens_seen": 58543085, "step": 2717, "time_per_iteration": 2.8052473068237305 }, { "auxiliary_loss_clip": 0.01084455, "auxiliary_loss_mlp": 0.01007601, "balance_loss_clip": 1.014624, "balance_loss_mlp": 1.0060513, "epoch": 0.3268201767570492, "flos": 72105718406400.0, "grad_norm": 0.886294964164846, "language_loss": 0.57468367, "learning_rate": 3.1449894368643922e-06, "loss": 0.59560418, "num_input_tokens_seen": 58605400, "step": 2718, "time_per_iteration": 3.447197437286377 }, { "auxiliary_loss_clip": 0.01183339, "auxiliary_loss_mlp": 0.01024835, "balance_loss_clip": 1.05696535, "balance_loss_mlp": 1.01677668, "epoch": 0.32694041964768833, "flos": 24535175938560.0, "grad_norm": 1.8595847723652814, "language_loss": 0.71475232, "learning_rate": 3.1443506646303934e-06, "loss": 0.73683405, "num_input_tokens_seen": 58626700, "step": 2719, "time_per_iteration": 2.9255995750427246 }, { "auxiliary_loss_clip": 0.01194273, "auxiliary_loss_mlp": 0.01023015, "balance_loss_clip": 1.05464602, "balance_loss_mlp": 1.01445007, "epoch": 0.32706066253832744, "flos": 33183003755520.0, "grad_norm": 2.0287388485289988, "language_loss": 0.66740751, "learning_rate": 3.1437117187993086e-06, "loss": 0.68958044, "num_input_tokens_seen": 58649020, "step": 2720, "time_per_iteration": 3.0016913414001465 }, { "auxiliary_loss_clip": 0.01181514, "auxiliary_loss_mlp": 0.01032951, "balance_loss_clip": 1.0551616, "balance_loss_mlp": 1.02420092, "epoch": 0.3271809054289665, "flos": 24061622008320.0, "grad_norm": 1.7835379104100226, "language_loss": 0.79963392, "learning_rate": 3.143072599468065e-06, "loss": 0.8217786, "num_input_tokens_seen": 58668845, "step": 2721, "time_per_iteration": 3.0112671852111816 }, { "auxiliary_loss_clip": 0.01189584, "auxiliary_loss_mlp": 0.01038487, "balance_loss_clip": 1.05781853, "balance_loss_mlp": 1.02997565, "epoch": 0.3273011483196056, "flos": 38253769712640.0, "grad_norm": 1.6621032046268667, "language_loss": 0.7551589, "learning_rate": 3.1424333067336174e-06, "loss": 0.77743959, "num_input_tokens_seen": 58691610, "step": 2722, "time_per_iteration": 2.9054970741271973 }, { "auxiliary_loss_clip": 0.01199222, "auxiliary_loss_mlp": 0.01034077, "balance_loss_clip": 1.05701661, "balance_loss_mlp": 1.02493966, "epoch": 0.3274213912102447, "flos": 29054389582080.0, "grad_norm": 1.9545935001123695, "language_loss": 0.7804727, "learning_rate": 3.141793840692945e-06, "loss": 0.80280566, "num_input_tokens_seen": 58712360, "step": 2723, "time_per_iteration": 3.7835047245025635 }, { "auxiliary_loss_clip": 0.01178995, "auxiliary_loss_mlp": 0.01032658, "balance_loss_clip": 1.05455184, "balance_loss_mlp": 1.02350259, "epoch": 0.32754163410088377, "flos": 29133249891840.0, "grad_norm": 1.9536061340052968, "language_loss": 0.61493307, "learning_rate": 3.1411542014430553e-06, "loss": 0.63704962, "num_input_tokens_seen": 58733440, "step": 2724, "time_per_iteration": 2.843393325805664 }, { "auxiliary_loss_clip": 0.01184253, "auxiliary_loss_mlp": 0.0102766, "balance_loss_clip": 1.05355525, "balance_loss_mlp": 1.01941633, "epoch": 0.3276618769915229, "flos": 20631075724800.0, "grad_norm": 1.639226070988493, "language_loss": 0.81973583, "learning_rate": 3.1405143890809804e-06, "loss": 0.84185493, "num_input_tokens_seen": 58752735, "step": 2725, "time_per_iteration": 3.8786962032318115 }, { "auxiliary_loss_clip": 0.01184577, "auxiliary_loss_mlp": 0.01031186, "balance_loss_clip": 1.05586684, "balance_loss_mlp": 1.02274013, "epoch": 0.327782119882162, "flos": 18657425076480.0, "grad_norm": 21.43993432495675, "language_loss": 0.69723648, "learning_rate": 3.1398744037037796e-06, "loss": 0.71939409, "num_input_tokens_seen": 58772070, "step": 2726, "time_per_iteration": 2.832611322402954 }, { "auxiliary_loss_clip": 0.01187757, "auxiliary_loss_mlp": 0.01028601, "balance_loss_clip": 1.05599463, "balance_loss_mlp": 1.02050722, "epoch": 0.32790236277280105, "flos": 21795802133760.0, "grad_norm": 4.182041867759858, "language_loss": 0.84191179, "learning_rate": 3.139234245408538e-06, "loss": 0.86407536, "num_input_tokens_seen": 58790950, "step": 2727, "time_per_iteration": 3.771103620529175 }, { "auxiliary_loss_clip": 0.0118674, "auxiliary_loss_mlp": 0.01053527, "balance_loss_clip": 1.05652881, "balance_loss_mlp": 1.01853752, "epoch": 0.32802260566344016, "flos": 23331414424320.0, "grad_norm": 1.4879071585528056, "language_loss": 0.76080495, "learning_rate": 3.1385939142923666e-06, "loss": 0.7832076, "num_input_tokens_seen": 58813340, "step": 2728, "time_per_iteration": 3.758840799331665 }, { "auxiliary_loss_clip": 0.01192722, "auxiliary_loss_mlp": 0.01031511, "balance_loss_clip": 1.05701303, "balance_loss_mlp": 1.02226639, "epoch": 0.3281428485540792, "flos": 24206988349440.0, "grad_norm": 2.482421039422424, "language_loss": 0.77857661, "learning_rate": 3.137953410452405e-06, "loss": 0.80081892, "num_input_tokens_seen": 58833610, "step": 2729, "time_per_iteration": 2.8375930786132812 }, { "auxiliary_loss_clip": 0.01185657, "auxiliary_loss_mlp": 0.0103139, "balance_loss_clip": 1.05298316, "balance_loss_mlp": 1.02265799, "epoch": 0.3282630914447183, "flos": 34128962380800.0, "grad_norm": 1.7162751447119813, "language_loss": 0.74466479, "learning_rate": 3.1373127339858146e-06, "loss": 0.76683521, "num_input_tokens_seen": 58856210, "step": 2730, "time_per_iteration": 2.9104247093200684 }, { "auxiliary_loss_clip": 0.01184078, "auxiliary_loss_mlp": 0.01027096, "balance_loss_clip": 1.05664098, "balance_loss_mlp": 1.01907945, "epoch": 0.32838333433535744, "flos": 27600726170880.0, "grad_norm": 2.5109594292531763, "language_loss": 0.74470121, "learning_rate": 3.136671884989787e-06, "loss": 0.76681292, "num_input_tokens_seen": 58876120, "step": 2731, "time_per_iteration": 2.824465274810791 }, { "auxiliary_loss_clip": 0.01181717, "auxiliary_loss_mlp": 0.01030247, "balance_loss_clip": 1.0575223, "balance_loss_mlp": 1.02111006, "epoch": 0.3285035772259965, "flos": 12349500935040.0, "grad_norm": 5.109214417503897, "language_loss": 0.87189376, "learning_rate": 3.1360308635615383e-06, "loss": 0.8940134, "num_input_tokens_seen": 58894660, "step": 2732, "time_per_iteration": 2.7321739196777344 }, { "auxiliary_loss_clip": 0.01194508, "auxiliary_loss_mlp": 0.01029863, "balance_loss_clip": 1.05681109, "balance_loss_mlp": 1.02063644, "epoch": 0.3286238201166356, "flos": 24316084932480.0, "grad_norm": 1.9689162526809991, "language_loss": 0.78792489, "learning_rate": 3.135389669798311e-06, "loss": 0.81016862, "num_input_tokens_seen": 58912720, "step": 2733, "time_per_iteration": 2.8451976776123047 }, { "auxiliary_loss_clip": 0.01193731, "auxiliary_loss_mlp": 0.01055458, "balance_loss_clip": 1.0573715, "balance_loss_mlp": 1.01909852, "epoch": 0.3287440630072747, "flos": 21392812471680.0, "grad_norm": 2.3084921941748795, "language_loss": 0.80095536, "learning_rate": 3.134748303797373e-06, "loss": 0.82344723, "num_input_tokens_seen": 58930090, "step": 2734, "time_per_iteration": 2.8640804290771484 }, { "auxiliary_loss_clip": 0.01184516, "auxiliary_loss_mlp": 0.01032694, "balance_loss_clip": 1.0580411, "balance_loss_mlp": 1.02316284, "epoch": 0.32886430589791377, "flos": 23732536579200.0, "grad_norm": 1.9067931069448427, "language_loss": 0.81131959, "learning_rate": 3.1341067656560203e-06, "loss": 0.83349174, "num_input_tokens_seen": 58947935, "step": 2735, "time_per_iteration": 2.8678159713745117 }, { "auxiliary_loss_clip": 0.01197251, "auxiliary_loss_mlp": 0.01030312, "balance_loss_clip": 1.05576205, "balance_loss_mlp": 1.02187181, "epoch": 0.3289845487885529, "flos": 22418708814720.0, "grad_norm": 2.126005369216627, "language_loss": 0.86438465, "learning_rate": 3.133465055471572e-06, "loss": 0.88666028, "num_input_tokens_seen": 58967720, "step": 2736, "time_per_iteration": 2.8316879272460938 }, { "auxiliary_loss_clip": 0.01180585, "auxiliary_loss_mlp": 0.01036923, "balance_loss_clip": 1.05665851, "balance_loss_mlp": 1.02848935, "epoch": 0.329104791679192, "flos": 19682603147520.0, "grad_norm": 2.6781050167796785, "language_loss": 0.66657698, "learning_rate": 3.1328231733413767e-06, "loss": 0.68875206, "num_input_tokens_seen": 58984360, "step": 2737, "time_per_iteration": 2.8601462841033936 }, { "auxiliary_loss_clip": 0.01191858, "auxiliary_loss_mlp": 0.01032311, "balance_loss_clip": 1.05854392, "balance_loss_mlp": 1.02309608, "epoch": 0.32922503456983104, "flos": 15997234803840.0, "grad_norm": 2.0740590846482667, "language_loss": 0.90853739, "learning_rate": 3.1321811193628067e-06, "loss": 0.9307791, "num_input_tokens_seen": 59002505, "step": 2738, "time_per_iteration": 2.7726683616638184 }, { "auxiliary_loss_clip": 0.01191943, "auxiliary_loss_mlp": 0.01055328, "balance_loss_clip": 1.05706286, "balance_loss_mlp": 1.01967895, "epoch": 0.32934527746047015, "flos": 26834069260800.0, "grad_norm": 2.6721557520176744, "language_loss": 0.70230258, "learning_rate": 3.131538893633261e-06, "loss": 0.72477531, "num_input_tokens_seen": 59022065, "step": 2739, "time_per_iteration": 2.8851823806762695 }, { "auxiliary_loss_clip": 0.01198399, "auxiliary_loss_mlp": 0.01034081, "balance_loss_clip": 1.05723071, "balance_loss_mlp": 1.02572417, "epoch": 0.32946552035110926, "flos": 23403774372480.0, "grad_norm": 2.4469636496173446, "language_loss": 0.77898675, "learning_rate": 3.130896496250165e-06, "loss": 0.80131155, "num_input_tokens_seen": 59041890, "step": 2740, "time_per_iteration": 2.767496109008789 }, { "auxiliary_loss_clip": 0.01196271, "auxiliary_loss_mlp": 0.01032124, "balance_loss_clip": 1.05480027, "balance_loss_mlp": 1.02288508, "epoch": 0.3295857632417483, "flos": 14172470029440.0, "grad_norm": 1.9166491416656073, "language_loss": 0.86248255, "learning_rate": 3.1302539273109693e-06, "loss": 0.88476658, "num_input_tokens_seen": 59058715, "step": 2741, "time_per_iteration": 2.7417571544647217 }, { "auxiliary_loss_clip": 0.01188259, "auxiliary_loss_mlp": 0.01031363, "balance_loss_clip": 1.05984819, "balance_loss_mlp": 1.02286386, "epoch": 0.32970600613238743, "flos": 22196708807040.0, "grad_norm": 1.961255181868139, "language_loss": 0.80564785, "learning_rate": 3.1296111869131513e-06, "loss": 0.82784408, "num_input_tokens_seen": 59076140, "step": 2742, "time_per_iteration": 2.7382421493530273 }, { "auxiliary_loss_clip": 0.01194187, "auxiliary_loss_mlp": 0.01031646, "balance_loss_clip": 1.05427504, "balance_loss_mlp": 1.02370107, "epoch": 0.32982624902302654, "flos": 22053784590720.0, "grad_norm": 1.82363377943795, "language_loss": 0.85624772, "learning_rate": 3.1289682751542153e-06, "loss": 0.87850606, "num_input_tokens_seen": 59095700, "step": 2743, "time_per_iteration": 2.7803380489349365 }, { "auxiliary_loss_clip": 0.01192782, "auxiliary_loss_mlp": 0.01031141, "balance_loss_clip": 1.05839038, "balance_loss_mlp": 1.0226655, "epoch": 0.3299464919136656, "flos": 18661626967680.0, "grad_norm": 2.016529488495452, "language_loss": 0.71237612, "learning_rate": 3.1283251921316883e-06, "loss": 0.73461533, "num_input_tokens_seen": 59113445, "step": 2744, "time_per_iteration": 2.820833683013916 }, { "auxiliary_loss_clip": 0.01179201, "auxiliary_loss_mlp": 0.01031562, "balance_loss_clip": 1.05682635, "balance_loss_mlp": 1.02312827, "epoch": 0.3300667348043047, "flos": 13407357404160.0, "grad_norm": 2.3132216640652183, "language_loss": 0.8091886, "learning_rate": 3.1276819379431277e-06, "loss": 0.83129627, "num_input_tokens_seen": 59131535, "step": 2745, "time_per_iteration": 2.996628522872925 }, { "auxiliary_loss_clip": 0.01197596, "auxiliary_loss_mlp": 0.01056954, "balance_loss_clip": 1.05634344, "balance_loss_mlp": 1.02072167, "epoch": 0.33018697769494376, "flos": 15742556398080.0, "grad_norm": 2.0762922627063376, "language_loss": 0.75287032, "learning_rate": 3.1270385126861134e-06, "loss": 0.77541578, "num_input_tokens_seen": 59149520, "step": 2746, "time_per_iteration": 2.7050724029541016 }, { "auxiliary_loss_clip": 0.01198389, "auxiliary_loss_mlp": 0.01033652, "balance_loss_clip": 1.05617034, "balance_loss_mlp": 1.02508688, "epoch": 0.3303072205855829, "flos": 18258601392000.0, "grad_norm": 2.9767790756748345, "language_loss": 0.81966043, "learning_rate": 3.1263949164582533e-06, "loss": 0.84198081, "num_input_tokens_seen": 59169170, "step": 2747, "time_per_iteration": 2.7866311073303223 }, { "auxiliary_loss_clip": 0.01196442, "auxiliary_loss_mlp": 0.01026388, "balance_loss_clip": 1.05438721, "balance_loss_mlp": 1.0177927, "epoch": 0.330427463476222, "flos": 17749424148480.0, "grad_norm": 2.131627313788395, "language_loss": 0.78351748, "learning_rate": 3.1257511493571797e-06, "loss": 0.80574584, "num_input_tokens_seen": 59187675, "step": 2748, "time_per_iteration": 2.789977550506592 }, { "auxiliary_loss_clip": 0.01188015, "auxiliary_loss_mlp": 0.01028016, "balance_loss_clip": 1.05718482, "balance_loss_mlp": 1.01925468, "epoch": 0.33054770636686104, "flos": 27162580072320.0, "grad_norm": 1.9416613535502825, "language_loss": 0.78189397, "learning_rate": 3.125107211480552e-06, "loss": 0.80405432, "num_input_tokens_seen": 59207610, "step": 2749, "time_per_iteration": 3.707984209060669 }, { "auxiliary_loss_clip": 0.01180844, "auxiliary_loss_mlp": 0.01027666, "balance_loss_clip": 1.05868173, "balance_loss_mlp": 1.01911247, "epoch": 0.33066794925750015, "flos": 20117193799680.0, "grad_norm": 1.6103824192228333, "language_loss": 0.79573143, "learning_rate": 3.124463102926054e-06, "loss": 0.8178165, "num_input_tokens_seen": 59226945, "step": 2750, "time_per_iteration": 2.820369005203247 }, { "auxiliary_loss_clip": 0.01086489, "auxiliary_loss_mlp": 0.0100674, "balance_loss_clip": 1.02005684, "balance_loss_mlp": 1.00514281, "epoch": 0.33078819214813926, "flos": 70642609718400.0, "grad_norm": 0.9558740406102982, "language_loss": 0.61532491, "learning_rate": 3.1238188237913984e-06, "loss": 0.63625717, "num_input_tokens_seen": 59291485, "step": 2751, "time_per_iteration": 4.310765027999878 }, { "auxiliary_loss_clip": 0.01203546, "auxiliary_loss_mlp": 0.01034928, "balance_loss_clip": 1.05882382, "balance_loss_mlp": 1.02557564, "epoch": 0.3309084350387783, "flos": 21141940907520.0, "grad_norm": 2.192212595236077, "language_loss": 0.76381564, "learning_rate": 3.1231743741743202e-06, "loss": 0.7862004, "num_input_tokens_seen": 59310990, "step": 2752, "time_per_iteration": 2.7212727069854736 }, { "auxiliary_loss_clip": 0.01189853, "auxiliary_loss_mlp": 0.01029268, "balance_loss_clip": 1.05549765, "balance_loss_mlp": 1.02064288, "epoch": 0.3310286779294174, "flos": 14209350318720.0, "grad_norm": 2.5929468518805314, "language_loss": 0.83475065, "learning_rate": 3.122529754172582e-06, "loss": 0.85694188, "num_input_tokens_seen": 59327875, "step": 2753, "time_per_iteration": 3.7554025650024414 }, { "auxiliary_loss_clip": 0.01190099, "auxiliary_loss_mlp": 0.01026294, "balance_loss_clip": 1.05499697, "balance_loss_mlp": 1.01771688, "epoch": 0.33114892082005654, "flos": 20778130005120.0, "grad_norm": 1.9995943171462895, "language_loss": 0.7197718, "learning_rate": 3.1218849638839736e-06, "loss": 0.74193573, "num_input_tokens_seen": 59347135, "step": 2754, "time_per_iteration": 2.867311716079712 }, { "auxiliary_loss_clip": 0.01183595, "auxiliary_loss_mlp": 0.01035341, "balance_loss_clip": 1.05974269, "balance_loss_mlp": 1.02559614, "epoch": 0.3312691637106956, "flos": 17090750499840.0, "grad_norm": 1.853425733622671, "language_loss": 0.78173292, "learning_rate": 3.121240003406307e-06, "loss": 0.8039223, "num_input_tokens_seen": 59365985, "step": 2755, "time_per_iteration": 3.7990779876708984 }, { "auxiliary_loss_clip": 0.01191192, "auxiliary_loss_mlp": 0.01029565, "balance_loss_clip": 1.05811167, "balance_loss_mlp": 1.02051091, "epoch": 0.3313894066013347, "flos": 29456230008960.0, "grad_norm": 2.1689672010264074, "language_loss": 0.72466844, "learning_rate": 3.120594872837425e-06, "loss": 0.746876, "num_input_tokens_seen": 59384655, "step": 2756, "time_per_iteration": 2.853126287460327 }, { "auxiliary_loss_clip": 0.01084495, "auxiliary_loss_mlp": 0.0103249, "balance_loss_clip": 1.01585329, "balance_loss_mlp": 0.99919665, "epoch": 0.3315096494919738, "flos": 61419242280960.0, "grad_norm": 0.824559022506589, "language_loss": 0.62421203, "learning_rate": 3.1199495722751906e-06, "loss": 0.64538187, "num_input_tokens_seen": 59444185, "step": 2757, "time_per_iteration": 3.344203233718872 }, { "auxiliary_loss_clip": 0.01184361, "auxiliary_loss_mlp": 0.01031089, "balance_loss_clip": 1.0555706, "balance_loss_mlp": 1.02304244, "epoch": 0.33162989238261287, "flos": 21653057485440.0, "grad_norm": 1.7516694336502865, "language_loss": 0.83973473, "learning_rate": 3.1193041018174972e-06, "loss": 0.86188924, "num_input_tokens_seen": 59464900, "step": 2758, "time_per_iteration": 2.773409128189087 }, { "auxiliary_loss_clip": 0.01196685, "auxiliary_loss_mlp": 0.0103166, "balance_loss_clip": 1.05808496, "balance_loss_mlp": 1.02330983, "epoch": 0.331750135273252, "flos": 22674787850880.0, "grad_norm": 1.9806043358793233, "language_loss": 0.94691718, "learning_rate": 3.118658461562261e-06, "loss": 0.96920061, "num_input_tokens_seen": 59481000, "step": 2759, "time_per_iteration": 2.7800512313842773 }, { "auxiliary_loss_clip": 0.01191103, "auxiliary_loss_mlp": 0.01029388, "balance_loss_clip": 1.0582788, "balance_loss_mlp": 1.02077508, "epoch": 0.33187037816389103, "flos": 22746896403840.0, "grad_norm": 1.3775910877079742, "language_loss": 0.84900051, "learning_rate": 3.118012651607426e-06, "loss": 0.87120545, "num_input_tokens_seen": 59502605, "step": 2760, "time_per_iteration": 2.9071900844573975 }, { "auxiliary_loss_clip": 0.01195693, "auxiliary_loss_mlp": 0.01033585, "balance_loss_clip": 1.05559409, "balance_loss_mlp": 1.02517474, "epoch": 0.33199062105453014, "flos": 19203769918080.0, "grad_norm": 2.578871407379786, "language_loss": 0.83275735, "learning_rate": 3.1173666720509603e-06, "loss": 0.85505021, "num_input_tokens_seen": 59519540, "step": 2761, "time_per_iteration": 2.6976864337921143 }, { "auxiliary_loss_clip": 0.01196294, "auxiliary_loss_mlp": 0.01028582, "balance_loss_clip": 1.0579989, "balance_loss_mlp": 1.02030897, "epoch": 0.33211086394516925, "flos": 31577006764800.0, "grad_norm": 2.2658626086287357, "language_loss": 0.68320084, "learning_rate": 3.116720522990859e-06, "loss": 0.70544958, "num_input_tokens_seen": 59540415, "step": 2762, "time_per_iteration": 2.816801071166992 }, { "auxiliary_loss_clip": 0.01176886, "auxiliary_loss_mlp": 0.01030882, "balance_loss_clip": 1.05459571, "balance_loss_mlp": 1.02226317, "epoch": 0.3322311068358083, "flos": 17932496791680.0, "grad_norm": 2.4807127401541993, "language_loss": 0.62095118, "learning_rate": 3.116074204525142e-06, "loss": 0.64302886, "num_input_tokens_seen": 59558590, "step": 2763, "time_per_iteration": 2.7259304523468018 }, { "auxiliary_loss_clip": 0.01186745, "auxiliary_loss_mlp": 0.01028078, "balance_loss_clip": 1.05756819, "balance_loss_mlp": 1.01964402, "epoch": 0.3323513497264474, "flos": 32269831269120.0, "grad_norm": 1.641798625843403, "language_loss": 0.83847356, "learning_rate": 3.1154277167518553e-06, "loss": 0.86062169, "num_input_tokens_seen": 59580205, "step": 2764, "time_per_iteration": 2.8805062770843506 }, { "auxiliary_loss_clip": 0.01079673, "auxiliary_loss_mlp": 0.01007732, "balance_loss_clip": 1.01465917, "balance_loss_mlp": 1.00597918, "epoch": 0.33247159261708653, "flos": 52668674588160.0, "grad_norm": 0.7804392862868376, "language_loss": 0.59469366, "learning_rate": 3.114781059769072e-06, "loss": 0.61556768, "num_input_tokens_seen": 59631530, "step": 2765, "time_per_iteration": 3.2199766635894775 }, { "auxiliary_loss_clip": 0.01191898, "auxiliary_loss_mlp": 0.01032612, "balance_loss_clip": 1.05801201, "balance_loss_mlp": 1.02378428, "epoch": 0.3325918355077256, "flos": 27125232906240.0, "grad_norm": 2.270783814770196, "language_loss": 0.67869854, "learning_rate": 3.1141342336748874e-06, "loss": 0.70094365, "num_input_tokens_seen": 59651090, "step": 2766, "time_per_iteration": 2.842355489730835 }, { "auxiliary_loss_clip": 0.01186942, "auxiliary_loss_mlp": 0.01029174, "balance_loss_clip": 1.05401754, "balance_loss_mlp": 1.02057874, "epoch": 0.3327120783983647, "flos": 23664414435840.0, "grad_norm": 1.6748468348930237, "language_loss": 0.81978226, "learning_rate": 3.1134872385674253e-06, "loss": 0.84194344, "num_input_tokens_seen": 59675245, "step": 2767, "time_per_iteration": 2.8116374015808105 }, { "auxiliary_loss_clip": 0.01193276, "auxiliary_loss_mlp": 0.01026731, "balance_loss_clip": 1.05448186, "balance_loss_mlp": 1.01835608, "epoch": 0.3328323212890038, "flos": 19171378828800.0, "grad_norm": 1.825724114239498, "language_loss": 0.85245192, "learning_rate": 3.1128400745448353e-06, "loss": 0.87465203, "num_input_tokens_seen": 59694625, "step": 2768, "time_per_iteration": 2.7591187953948975 }, { "auxiliary_loss_clip": 0.01193793, "auxiliary_loss_mlp": 0.01029761, "balance_loss_clip": 1.05648899, "balance_loss_mlp": 1.02117801, "epoch": 0.33295256417964286, "flos": 37706347463040.0, "grad_norm": 2.7026997372185204, "language_loss": 0.63008052, "learning_rate": 3.11219274170529e-06, "loss": 0.65231609, "num_input_tokens_seen": 59716435, "step": 2769, "time_per_iteration": 2.917185068130493 }, { "auxiliary_loss_clip": 0.01180273, "auxiliary_loss_mlp": 0.01032051, "balance_loss_clip": 1.05373251, "balance_loss_mlp": 1.02345026, "epoch": 0.333072807070282, "flos": 26505989412480.0, "grad_norm": 1.7633079157822318, "language_loss": 0.81363708, "learning_rate": 3.1115452401469903e-06, "loss": 0.83576035, "num_input_tokens_seen": 59736835, "step": 2770, "time_per_iteration": 2.8716087341308594 }, { "auxiliary_loss_clip": 0.01176229, "auxiliary_loss_mlp": 0.01033763, "balance_loss_clip": 1.05602014, "balance_loss_mlp": 1.02585912, "epoch": 0.3331930499609211, "flos": 21430913823360.0, "grad_norm": 2.0180105888043154, "language_loss": 0.86414266, "learning_rate": 3.1108975699681613e-06, "loss": 0.88624263, "num_input_tokens_seen": 59754230, "step": 2771, "time_per_iteration": 2.8010056018829346 }, { "auxiliary_loss_clip": 0.01182423, "auxiliary_loss_mlp": 0.01036333, "balance_loss_clip": 1.05475461, "balance_loss_mlp": 1.02813113, "epoch": 0.33331329285156014, "flos": 20659947281280.0, "grad_norm": 1.7067845559057528, "language_loss": 0.71763903, "learning_rate": 3.1102497312670542e-06, "loss": 0.73982662, "num_input_tokens_seen": 59772235, "step": 2772, "time_per_iteration": 2.9396286010742188 }, { "auxiliary_loss_clip": 0.01178872, "auxiliary_loss_mlp": 0.01029368, "balance_loss_clip": 1.05574465, "balance_loss_mlp": 1.02058792, "epoch": 0.33343353574219925, "flos": 28001596930560.0, "grad_norm": 1.8849194883737495, "language_loss": 0.80578089, "learning_rate": 3.109601724141946e-06, "loss": 0.82786328, "num_input_tokens_seen": 59791230, "step": 2773, "time_per_iteration": 2.856459856033325 }, { "auxiliary_loss_clip": 0.01186172, "auxiliary_loss_mlp": 0.01026486, "balance_loss_clip": 1.05625069, "balance_loss_mlp": 1.01751542, "epoch": 0.33355377863283836, "flos": 23764963582080.0, "grad_norm": 2.098009243480592, "language_loss": 0.68341762, "learning_rate": 3.108953548691138e-06, "loss": 0.70554423, "num_input_tokens_seen": 59811315, "step": 2774, "time_per_iteration": 2.7285051345825195 }, { "auxiliary_loss_clip": 0.01194971, "auxiliary_loss_mlp": 0.01025152, "balance_loss_clip": 1.05547047, "balance_loss_mlp": 1.01690865, "epoch": 0.3336740215234774, "flos": 37779677078400.0, "grad_norm": 3.6388774788548606, "language_loss": 0.72716647, "learning_rate": 3.108305205012959e-06, "loss": 0.74936765, "num_input_tokens_seen": 59832010, "step": 2775, "time_per_iteration": 3.813525915145874 }, { "auxiliary_loss_clip": 0.011812, "auxiliary_loss_mlp": 0.01030955, "balance_loss_clip": 1.05131173, "balance_loss_mlp": 1.02246702, "epoch": 0.3337942644141165, "flos": 25519056347520.0, "grad_norm": 2.1462096630701044, "language_loss": 0.87788886, "learning_rate": 3.107656693205761e-06, "loss": 0.90001041, "num_input_tokens_seen": 59851450, "step": 2776, "time_per_iteration": 2.7830393314361572 }, { "auxiliary_loss_clip": 0.01201468, "auxiliary_loss_mlp": 0.01032905, "balance_loss_clip": 1.05716741, "balance_loss_mlp": 1.02386332, "epoch": 0.3339145073047556, "flos": 25989844930560.0, "grad_norm": 2.3689532321059974, "language_loss": 0.70719278, "learning_rate": 3.107008013367924e-06, "loss": 0.72953653, "num_input_tokens_seen": 59870245, "step": 2777, "time_per_iteration": 2.73701810836792 }, { "auxiliary_loss_clip": 0.01182089, "auxiliary_loss_mlp": 0.01026998, "balance_loss_clip": 1.05411696, "balance_loss_mlp": 1.01893902, "epoch": 0.3340347501953947, "flos": 19062569554560.0, "grad_norm": 2.1165924926762574, "language_loss": 0.86403203, "learning_rate": 3.1063591655978507e-06, "loss": 0.88612294, "num_input_tokens_seen": 59886195, "step": 2778, "time_per_iteration": 3.7195076942443848 }, { "auxiliary_loss_clip": 0.01174574, "auxiliary_loss_mlp": 0.01031849, "balance_loss_clip": 1.05561221, "balance_loss_mlp": 1.02343869, "epoch": 0.3341549930860338, "flos": 18109715518080.0, "grad_norm": 1.8314249535317009, "language_loss": 0.79594982, "learning_rate": 3.105710149993972e-06, "loss": 0.81801403, "num_input_tokens_seen": 59905525, "step": 2779, "time_per_iteration": 3.8118653297424316 }, { "auxiliary_loss_clip": 0.01195629, "auxiliary_loss_mlp": 0.01029892, "balance_loss_clip": 1.05441475, "balance_loss_mlp": 1.02150011, "epoch": 0.33427523597667286, "flos": 22674967418880.0, "grad_norm": 1.9662715862062699, "language_loss": 0.85696208, "learning_rate": 3.1050609666547427e-06, "loss": 0.87921733, "num_input_tokens_seen": 59925085, "step": 2780, "time_per_iteration": 3.6670522689819336 }, { "auxiliary_loss_clip": 0.01189063, "auxiliary_loss_mlp": 0.01033382, "balance_loss_clip": 1.05505896, "balance_loss_mlp": 1.02458429, "epoch": 0.33439547886731197, "flos": 22638338524800.0, "grad_norm": 1.8736541758902423, "language_loss": 0.77412879, "learning_rate": 3.104411615678644e-06, "loss": 0.79635328, "num_input_tokens_seen": 59943935, "step": 2781, "time_per_iteration": 2.8003201484680176 }, { "auxiliary_loss_clip": 0.01182929, "auxiliary_loss_mlp": 0.01026507, "balance_loss_clip": 1.05582643, "balance_loss_mlp": 1.01803672, "epoch": 0.3345157217579511, "flos": 24096383395200.0, "grad_norm": 2.4133815142375576, "language_loss": 0.73717797, "learning_rate": 3.1037620971641803e-06, "loss": 0.75927234, "num_input_tokens_seen": 59963725, "step": 2782, "time_per_iteration": 2.783073902130127 }, { "auxiliary_loss_clip": 0.01195556, "auxiliary_loss_mlp": 0.01033509, "balance_loss_clip": 1.05502927, "balance_loss_mlp": 1.0245688, "epoch": 0.33463596464859013, "flos": 18989491334400.0, "grad_norm": 2.6350341662203367, "language_loss": 0.64903867, "learning_rate": 3.1031124112098844e-06, "loss": 0.67132938, "num_input_tokens_seen": 59981935, "step": 2783, "time_per_iteration": 2.6635966300964355 }, { "auxiliary_loss_clip": 0.0118987, "auxiliary_loss_mlp": 0.01028286, "balance_loss_clip": 1.05716372, "balance_loss_mlp": 1.01995325, "epoch": 0.33475620753922924, "flos": 20375607219840.0, "grad_norm": 2.207344241232778, "language_loss": 0.72388262, "learning_rate": 3.1024625579143127e-06, "loss": 0.74606419, "num_input_tokens_seen": 59999455, "step": 2784, "time_per_iteration": 2.7813665866851807 }, { "auxiliary_loss_clip": 0.01190417, "auxiliary_loss_mlp": 0.01035669, "balance_loss_clip": 1.05278218, "balance_loss_mlp": 1.02697301, "epoch": 0.33487645042986836, "flos": 18182578256640.0, "grad_norm": 1.8037618023822835, "language_loss": 0.72883075, "learning_rate": 3.101812537376048e-06, "loss": 0.7510916, "num_input_tokens_seen": 60018475, "step": 2785, "time_per_iteration": 2.7070608139038086 }, { "auxiliary_loss_clip": 0.01178875, "auxiliary_loss_mlp": 0.01059851, "balance_loss_clip": 1.05431759, "balance_loss_mlp": 1.02257204, "epoch": 0.3349966933205074, "flos": 25848824135040.0, "grad_norm": 2.2589541482568984, "language_loss": 0.84537983, "learning_rate": 3.1011623496936973e-06, "loss": 0.8677671, "num_input_tokens_seen": 60036770, "step": 2786, "time_per_iteration": 2.7961835861206055 }, { "auxiliary_loss_clip": 0.01192739, "auxiliary_loss_mlp": 0.01034322, "balance_loss_clip": 1.05485082, "balance_loss_mlp": 1.02597141, "epoch": 0.3351169362111465, "flos": 28111447699200.0, "grad_norm": 1.839130026392024, "language_loss": 0.69852763, "learning_rate": 3.100511994965893e-06, "loss": 0.72079825, "num_input_tokens_seen": 60056725, "step": 2787, "time_per_iteration": 2.815098285675049 }, { "auxiliary_loss_clip": 0.01185945, "auxiliary_loss_mlp": 0.01026264, "balance_loss_clip": 1.05448389, "balance_loss_mlp": 1.01738846, "epoch": 0.33523717910178563, "flos": 22673315393280.0, "grad_norm": 1.7822125992562137, "language_loss": 0.84532666, "learning_rate": 3.0998614732912947e-06, "loss": 0.86744881, "num_input_tokens_seen": 60076100, "step": 2788, "time_per_iteration": 2.873349666595459 }, { "auxiliary_loss_clip": 0.01187577, "auxiliary_loss_mlp": 0.01030735, "balance_loss_clip": 1.05657637, "balance_loss_mlp": 1.02218163, "epoch": 0.3353574219924247, "flos": 15669801400320.0, "grad_norm": 2.0601017014178375, "language_loss": 0.67835283, "learning_rate": 3.0992107847685855e-06, "loss": 0.70053595, "num_input_tokens_seen": 60093815, "step": 2789, "time_per_iteration": 2.7751007080078125 }, { "auxiliary_loss_clip": 0.01186642, "auxiliary_loss_mlp": 0.01027896, "balance_loss_clip": 1.057639, "balance_loss_mlp": 1.01913404, "epoch": 0.3354776648830638, "flos": 24790644443520.0, "grad_norm": 1.6762992505757244, "language_loss": 0.79336262, "learning_rate": 3.0985599294964736e-06, "loss": 0.81550807, "num_input_tokens_seen": 60113370, "step": 2790, "time_per_iteration": 2.758526086807251 }, { "auxiliary_loss_clip": 0.01197513, "auxiliary_loss_mlp": 0.01031899, "balance_loss_clip": 1.05659652, "balance_loss_mlp": 1.02329206, "epoch": 0.33559790777370285, "flos": 28694852398080.0, "grad_norm": 1.9493963442610978, "language_loss": 0.70000637, "learning_rate": 3.097908907573695e-06, "loss": 0.72230047, "num_input_tokens_seen": 60131350, "step": 2791, "time_per_iteration": 2.7539734840393066 }, { "auxiliary_loss_clip": 0.0117496, "auxiliary_loss_mlp": 0.01032825, "balance_loss_clip": 1.05537224, "balance_loss_mlp": 1.02499914, "epoch": 0.33571815066434196, "flos": 22235779825920.0, "grad_norm": 2.351334608868113, "language_loss": 0.89490807, "learning_rate": 3.0972577190990067e-06, "loss": 0.91698587, "num_input_tokens_seen": 60149830, "step": 2792, "time_per_iteration": 2.789234161376953 }, { "auxiliary_loss_clip": 0.0119229, "auxiliary_loss_mlp": 0.01027697, "balance_loss_clip": 1.05870318, "balance_loss_mlp": 1.02006781, "epoch": 0.3358383935549811, "flos": 23842279607040.0, "grad_norm": 1.760739283962779, "language_loss": 0.79885709, "learning_rate": 3.096606364171196e-06, "loss": 0.82105696, "num_input_tokens_seen": 60169620, "step": 2793, "time_per_iteration": 2.7892048358917236 }, { "auxiliary_loss_clip": 0.01175955, "auxiliary_loss_mlp": 0.01030659, "balance_loss_clip": 1.05872202, "balance_loss_mlp": 1.02226651, "epoch": 0.33595863644562013, "flos": 22267308988800.0, "grad_norm": 2.8140659864991338, "language_loss": 0.85168988, "learning_rate": 3.0959548428890703e-06, "loss": 0.87375605, "num_input_tokens_seen": 60188490, "step": 2794, "time_per_iteration": 2.754861831665039 }, { "auxiliary_loss_clip": 0.01187426, "auxiliary_loss_mlp": 0.01026538, "balance_loss_clip": 1.0562315, "balance_loss_mlp": 1.01795471, "epoch": 0.33607887933625924, "flos": 20119779578880.0, "grad_norm": 1.8173635067750944, "language_loss": 0.83885247, "learning_rate": 3.095303155351468e-06, "loss": 0.86099207, "num_input_tokens_seen": 60208695, "step": 2795, "time_per_iteration": 2.7511420249938965 }, { "auxiliary_loss_clip": 0.01174757, "auxiliary_loss_mlp": 0.0103158, "balance_loss_clip": 1.05689549, "balance_loss_mlp": 1.02280664, "epoch": 0.33619912222689835, "flos": 19318109886720.0, "grad_norm": 2.413331961611296, "language_loss": 0.78785479, "learning_rate": 3.0946513016572464e-06, "loss": 0.80991817, "num_input_tokens_seen": 60227600, "step": 2796, "time_per_iteration": 2.7189903259277344 }, { "auxiliary_loss_clip": 0.01193539, "auxiliary_loss_mlp": 0.01030802, "balance_loss_clip": 1.05568814, "balance_loss_mlp": 1.02140868, "epoch": 0.3363193651175374, "flos": 16800664262400.0, "grad_norm": 2.1271505789089975, "language_loss": 0.76557946, "learning_rate": 3.0939992819052938e-06, "loss": 0.78782284, "num_input_tokens_seen": 60245110, "step": 2797, "time_per_iteration": 2.762991428375244 }, { "auxiliary_loss_clip": 0.01187124, "auxiliary_loss_mlp": 0.01029449, "balance_loss_clip": 1.05555749, "balance_loss_mlp": 1.02104521, "epoch": 0.3364396080081765, "flos": 23550289948800.0, "grad_norm": 1.9105346825328176, "language_loss": 0.8125425, "learning_rate": 3.0933470961945193e-06, "loss": 0.83470821, "num_input_tokens_seen": 60263405, "step": 2798, "time_per_iteration": 2.8100013732910156 }, { "auxiliary_loss_clip": 0.01186861, "auxiliary_loss_mlp": 0.01029048, "balance_loss_clip": 1.0572716, "balance_loss_mlp": 1.02131128, "epoch": 0.3365598508988156, "flos": 28037902602240.0, "grad_norm": 1.9016646770127876, "language_loss": 0.68412751, "learning_rate": 3.0926947446238597e-06, "loss": 0.70628655, "num_input_tokens_seen": 60282975, "step": 2799, "time_per_iteration": 2.8757550716400146 }, { "auxiliary_loss_clip": 0.01197866, "auxiliary_loss_mlp": 0.01026809, "balance_loss_clip": 1.0552907, "balance_loss_mlp": 1.01801097, "epoch": 0.3366800937894547, "flos": 16982767238400.0, "grad_norm": 2.5656782837032113, "language_loss": 0.82892931, "learning_rate": 3.092042227292276e-06, "loss": 0.85117608, "num_input_tokens_seen": 60299810, "step": 2800, "time_per_iteration": 2.7433583736419678 }, { "auxiliary_loss_clip": 0.01191395, "auxiliary_loss_mlp": 0.01029111, "balance_loss_clip": 1.05546641, "balance_loss_mlp": 1.02169633, "epoch": 0.3368003366800938, "flos": 23915321913600.0, "grad_norm": 1.6580250322080843, "language_loss": 0.8830207, "learning_rate": 3.0913895442987557e-06, "loss": 0.90522575, "num_input_tokens_seen": 60320775, "step": 2801, "time_per_iteration": 3.710991859436035 }, { "auxiliary_loss_clip": 0.01181325, "auxiliary_loss_mlp": 0.01062031, "balance_loss_clip": 1.05501878, "balance_loss_mlp": 1.02401328, "epoch": 0.3369205795707329, "flos": 24791219061120.0, "grad_norm": 1.755912216742805, "language_loss": 0.85615605, "learning_rate": 3.090736695742308e-06, "loss": 0.87858963, "num_input_tokens_seen": 60341905, "step": 2802, "time_per_iteration": 2.8794734477996826 }, { "auxiliary_loss_clip": 0.01174407, "auxiliary_loss_mlp": 0.01030151, "balance_loss_clip": 1.05448949, "balance_loss_mlp": 1.02189612, "epoch": 0.33704082246137196, "flos": 17931096161280.0, "grad_norm": 2.567071289806709, "language_loss": 0.522838, "learning_rate": 3.0900836817219713e-06, "loss": 0.54488361, "num_input_tokens_seen": 60358335, "step": 2803, "time_per_iteration": 2.7791028022766113 }, { "auxiliary_loss_clip": 0.0119368, "auxiliary_loss_mlp": 0.01030264, "balance_loss_clip": 1.05496192, "balance_loss_mlp": 1.02259898, "epoch": 0.33716106535201107, "flos": 21286517149440.0, "grad_norm": 2.1420395600443363, "language_loss": 0.83733636, "learning_rate": 3.089430502336807e-06, "loss": 0.85957575, "num_input_tokens_seen": 60378305, "step": 2804, "time_per_iteration": 3.7130379676818848 }, { "auxiliary_loss_clip": 0.01195752, "auxiliary_loss_mlp": 0.01031363, "balance_loss_clip": 1.05664659, "balance_loss_mlp": 1.02224398, "epoch": 0.3372813082426502, "flos": 18402962152320.0, "grad_norm": 2.6699994615290508, "language_loss": 0.8997457, "learning_rate": 3.088777157685902e-06, "loss": 0.92201698, "num_input_tokens_seen": 60393895, "step": 2805, "time_per_iteration": 3.6392664909362793 }, { "auxiliary_loss_clip": 0.01181532, "auxiliary_loss_mlp": 0.01027644, "balance_loss_clip": 1.05396199, "balance_loss_mlp": 1.01942444, "epoch": 0.33740155113328923, "flos": 17201391367680.0, "grad_norm": 1.9457009759989334, "language_loss": 0.85736656, "learning_rate": 3.088123647868367e-06, "loss": 0.87945831, "num_input_tokens_seen": 60410445, "step": 2806, "time_per_iteration": 2.8699498176574707 }, { "auxiliary_loss_clip": 0.01191492, "auxiliary_loss_mlp": 0.01026201, "balance_loss_clip": 1.05403125, "balance_loss_mlp": 1.01830339, "epoch": 0.33752179402392835, "flos": 29058950609280.0, "grad_norm": 2.8858223170483384, "language_loss": 0.8139962, "learning_rate": 3.0874699729833405e-06, "loss": 0.83617324, "num_input_tokens_seen": 60431815, "step": 2807, "time_per_iteration": 3.8041610717773438 }, { "auxiliary_loss_clip": 0.01183368, "auxiliary_loss_mlp": 0.01029309, "balance_loss_clip": 1.05430293, "balance_loss_mlp": 1.02114284, "epoch": 0.3376420369145674, "flos": 25080730680960.0, "grad_norm": 1.640588321282194, "language_loss": 0.79780245, "learning_rate": 3.086816133129983e-06, "loss": 0.81992924, "num_input_tokens_seen": 60452075, "step": 2808, "time_per_iteration": 2.7824878692626953 }, { "auxiliary_loss_clip": 0.01196403, "auxiliary_loss_mlp": 0.01034113, "balance_loss_clip": 1.05872869, "balance_loss_mlp": 1.02606022, "epoch": 0.3377622798052065, "flos": 27490624007040.0, "grad_norm": 2.4478492171364645, "language_loss": 0.76209009, "learning_rate": 3.0861621284074826e-06, "loss": 0.78439522, "num_input_tokens_seen": 60472600, "step": 2809, "time_per_iteration": 2.9414279460906982 }, { "auxiliary_loss_clip": 0.01193026, "auxiliary_loss_mlp": 0.01030136, "balance_loss_clip": 1.05704653, "balance_loss_mlp": 1.02233362, "epoch": 0.3378825226958456, "flos": 21975211589760.0, "grad_norm": 1.6145914582068093, "language_loss": 0.7313509, "learning_rate": 3.085507958915051e-06, "loss": 0.75358248, "num_input_tokens_seen": 60491030, "step": 2810, "time_per_iteration": 2.756582021713257 }, { "auxiliary_loss_clip": 0.0118722, "auxiliary_loss_mlp": 0.01032455, "balance_loss_clip": 1.05759168, "balance_loss_mlp": 1.02453899, "epoch": 0.3380027655864847, "flos": 42523189200000.0, "grad_norm": 4.126822841273205, "language_loss": 0.71221989, "learning_rate": 3.084853624751925e-06, "loss": 0.7344166, "num_input_tokens_seen": 60512615, "step": 2811, "time_per_iteration": 3.011326551437378 }, { "auxiliary_loss_clip": 0.01183803, "auxiliary_loss_mlp": 0.01030201, "balance_loss_clip": 1.0555948, "balance_loss_mlp": 1.02195811, "epoch": 0.3381230084771238, "flos": 26725080418560.0, "grad_norm": 1.663820563918392, "language_loss": 0.85225689, "learning_rate": 3.0841991260173668e-06, "loss": 0.87439692, "num_input_tokens_seen": 60532520, "step": 2812, "time_per_iteration": 2.9005794525146484 }, { "auxiliary_loss_clip": 0.01193428, "auxiliary_loss_mlp": 0.01028972, "balance_loss_clip": 1.0548687, "balance_loss_mlp": 1.0203948, "epoch": 0.3382432513677629, "flos": 22710375250560.0, "grad_norm": 1.9668713597784442, "language_loss": 0.80211878, "learning_rate": 3.0835444628106634e-06, "loss": 0.82434285, "num_input_tokens_seen": 60551500, "step": 2813, "time_per_iteration": 2.7800605297088623 }, { "auxiliary_loss_clip": 0.01191092, "auxiliary_loss_mlp": 0.01058947, "balance_loss_clip": 1.05330062, "balance_loss_mlp": 1.02345204, "epoch": 0.33836349425840195, "flos": 22122409524480.0, "grad_norm": 1.9992400348595318, "language_loss": 0.82603121, "learning_rate": 3.082889635231126e-06, "loss": 0.8485316, "num_input_tokens_seen": 60570160, "step": 2814, "time_per_iteration": 2.6927099227905273 }, { "auxiliary_loss_clip": 0.01189581, "auxiliary_loss_mlp": 0.01033582, "balance_loss_clip": 1.0560441, "balance_loss_mlp": 1.0249455, "epoch": 0.33848373714904106, "flos": 27308090067840.0, "grad_norm": 3.0713252726066673, "language_loss": 0.76916027, "learning_rate": 3.0822346433780925e-06, "loss": 0.79139191, "num_input_tokens_seen": 60590885, "step": 2815, "time_per_iteration": 2.8004343509674072 }, { "auxiliary_loss_clip": 0.01190825, "auxiliary_loss_mlp": 0.01030634, "balance_loss_clip": 1.05416262, "balance_loss_mlp": 1.02202749, "epoch": 0.3386039800396802, "flos": 25848716394240.0, "grad_norm": 1.90475523935677, "language_loss": 0.87304354, "learning_rate": 3.0815794873509237e-06, "loss": 0.89525807, "num_input_tokens_seen": 60609170, "step": 2816, "time_per_iteration": 2.8060030937194824 }, { "auxiliary_loss_clip": 0.01192912, "auxiliary_loss_mlp": 0.01031686, "balance_loss_clip": 1.05436802, "balance_loss_mlp": 1.02334762, "epoch": 0.33872422293031923, "flos": 18880646146560.0, "grad_norm": 2.042617028766347, "language_loss": 0.72798431, "learning_rate": 3.0809241672490066e-06, "loss": 0.75023031, "num_input_tokens_seen": 60627340, "step": 2817, "time_per_iteration": 2.8247435092926025 }, { "auxiliary_loss_clip": 0.0118565, "auxiliary_loss_mlp": 0.01029182, "balance_loss_clip": 1.05427575, "balance_loss_mlp": 1.02135038, "epoch": 0.33884446582095834, "flos": 23146977064320.0, "grad_norm": 1.831698960078937, "language_loss": 0.85051501, "learning_rate": 3.080268683171753e-06, "loss": 0.87266332, "num_input_tokens_seen": 60647630, "step": 2818, "time_per_iteration": 2.8217718601226807 }, { "auxiliary_loss_clip": 0.01191171, "auxiliary_loss_mlp": 0.01025963, "balance_loss_clip": 1.05523479, "balance_loss_mlp": 1.01794875, "epoch": 0.33896470871159745, "flos": 15997342544640.0, "grad_norm": 4.044412307121417, "language_loss": 0.89324927, "learning_rate": 3.0796130352185985e-06, "loss": 0.91542065, "num_input_tokens_seen": 60664485, "step": 2819, "time_per_iteration": 2.723555564880371 }, { "auxiliary_loss_clip": 0.01177989, "auxiliary_loss_mlp": 0.01061912, "balance_loss_clip": 1.05480695, "balance_loss_mlp": 1.02566814, "epoch": 0.3390849516022365, "flos": 34495754112000.0, "grad_norm": 2.073957747300572, "language_loss": 0.66316134, "learning_rate": 3.0789572234890057e-06, "loss": 0.68556035, "num_input_tokens_seen": 60686125, "step": 2820, "time_per_iteration": 2.9546473026275635 }, { "auxiliary_loss_clip": 0.01185321, "auxiliary_loss_mlp": 0.01032452, "balance_loss_clip": 1.05499458, "balance_loss_mlp": 1.02330256, "epoch": 0.3392051944928756, "flos": 16180307447040.0, "grad_norm": 1.7666558723967394, "language_loss": 0.7743777, "learning_rate": 3.0783012480824596e-06, "loss": 0.79655546, "num_input_tokens_seen": 60705270, "step": 2821, "time_per_iteration": 2.778196096420288 }, { "auxiliary_loss_clip": 0.01192717, "auxiliary_loss_mlp": 0.01030449, "balance_loss_clip": 1.05372071, "balance_loss_mlp": 1.02128816, "epoch": 0.33932543738351467, "flos": 17086656349440.0, "grad_norm": 2.1153483057686984, "language_loss": 0.74425864, "learning_rate": 3.077645109098471e-06, "loss": 0.76649034, "num_input_tokens_seen": 60721540, "step": 2822, "time_per_iteration": 2.742384433746338 }, { "auxiliary_loss_clip": 0.01173638, "auxiliary_loss_mlp": 0.01030103, "balance_loss_clip": 1.05590653, "balance_loss_mlp": 1.02162766, "epoch": 0.3394456802741538, "flos": 22126970551680.0, "grad_norm": 1.7594992062526797, "language_loss": 0.72281706, "learning_rate": 3.076988806636577e-06, "loss": 0.74485445, "num_input_tokens_seen": 60739300, "step": 2823, "time_per_iteration": 2.805408477783203 }, { "auxiliary_loss_clip": 0.01192277, "auxiliary_loss_mlp": 0.01057131, "balance_loss_clip": 1.05827343, "balance_loss_mlp": 1.02065063, "epoch": 0.3395659231647929, "flos": 25226887121280.0, "grad_norm": 1.92510778677778, "language_loss": 0.88545215, "learning_rate": 3.0763323407963377e-06, "loss": 0.90794617, "num_input_tokens_seen": 60758910, "step": 2824, "time_per_iteration": 2.763911724090576 }, { "auxiliary_loss_clip": 0.01192933, "auxiliary_loss_mlp": 0.01034009, "balance_loss_clip": 1.05808139, "balance_loss_mlp": 1.02537847, "epoch": 0.33968616605543195, "flos": 29096477343360.0, "grad_norm": 1.8733953344470309, "language_loss": 0.79742575, "learning_rate": 3.075675711677337e-06, "loss": 0.81969512, "num_input_tokens_seen": 60779005, "step": 2825, "time_per_iteration": 2.7764859199523926 }, { "auxiliary_loss_clip": 0.01183968, "auxiliary_loss_mlp": 0.01034081, "balance_loss_clip": 1.05727804, "balance_loss_mlp": 1.02612996, "epoch": 0.33980640894607106, "flos": 21433966479360.0, "grad_norm": 1.9441308362518164, "language_loss": 0.7865513, "learning_rate": 3.0750189193791865e-06, "loss": 0.80873179, "num_input_tokens_seen": 60798590, "step": 2826, "time_per_iteration": 2.8305630683898926 }, { "auxiliary_loss_clip": 0.01188577, "auxiliary_loss_mlp": 0.01028338, "balance_loss_clip": 1.05592573, "balance_loss_mlp": 1.01986241, "epoch": 0.33992665183671017, "flos": 32490035596800.0, "grad_norm": 3.7009528755825083, "language_loss": 0.70207798, "learning_rate": 3.0743619640015203e-06, "loss": 0.72424716, "num_input_tokens_seen": 60818840, "step": 2827, "time_per_iteration": 3.836693048477173 }, { "auxiliary_loss_clip": 0.01194193, "auxiliary_loss_mlp": 0.010297, "balance_loss_clip": 1.05517852, "balance_loss_mlp": 1.02074158, "epoch": 0.3400468947273492, "flos": 17055414495360.0, "grad_norm": 2.908878211764775, "language_loss": 0.9261632, "learning_rate": 3.073704845643999e-06, "loss": 0.94840217, "num_input_tokens_seen": 60835965, "step": 2828, "time_per_iteration": 2.6881630420684814 }, { "auxiliary_loss_clip": 0.01194486, "auxiliary_loss_mlp": 0.01023944, "balance_loss_clip": 1.05524814, "balance_loss_mlp": 1.01404381, "epoch": 0.34016713761798834, "flos": 16872988296960.0, "grad_norm": 3.8315772539839625, "language_loss": 0.77965474, "learning_rate": 3.0730475644063063e-06, "loss": 0.80183899, "num_input_tokens_seen": 60851065, "step": 2829, "time_per_iteration": 2.760183811187744 }, { "auxiliary_loss_clip": 0.01180645, "auxiliary_loss_mlp": 0.01059303, "balance_loss_clip": 1.05352473, "balance_loss_mlp": 1.02197671, "epoch": 0.34028738050862745, "flos": 21907161273600.0, "grad_norm": 1.7302695777663268, "language_loss": 0.64969528, "learning_rate": 3.072390120388151e-06, "loss": 0.67209482, "num_input_tokens_seen": 60869390, "step": 2830, "time_per_iteration": 3.7987327575683594 }, { "auxiliary_loss_clip": 0.01191378, "auxiliary_loss_mlp": 0.01033545, "balance_loss_clip": 1.05505395, "balance_loss_mlp": 1.02426481, "epoch": 0.3404076233992665, "flos": 22746034477440.0, "grad_norm": 2.3732084962430084, "language_loss": 0.71058172, "learning_rate": 3.071732513689267e-06, "loss": 0.73283094, "num_input_tokens_seen": 60887925, "step": 2831, "time_per_iteration": 3.696254253387451 }, { "auxiliary_loss_clip": 0.01191247, "auxiliary_loss_mlp": 0.01031225, "balance_loss_clip": 1.05702186, "balance_loss_mlp": 1.02220678, "epoch": 0.3405278662899056, "flos": 17052361839360.0, "grad_norm": 2.207030138990605, "language_loss": 0.67946094, "learning_rate": 3.0710747444094134e-06, "loss": 0.70168567, "num_input_tokens_seen": 60905955, "step": 2832, "time_per_iteration": 2.8259763717651367 }, { "auxiliary_loss_clip": 0.01190645, "auxiliary_loss_mlp": 0.01032958, "balance_loss_clip": 1.05632579, "balance_loss_mlp": 1.02429771, "epoch": 0.3406481091805447, "flos": 42813131783040.0, "grad_norm": 1.879387255727415, "language_loss": 0.65029758, "learning_rate": 3.070416812648372e-06, "loss": 0.67253363, "num_input_tokens_seen": 60929405, "step": 2833, "time_per_iteration": 3.9639947414398193 }, { "auxiliary_loss_clip": 0.01184789, "auxiliary_loss_mlp": 0.01028935, "balance_loss_clip": 1.05522013, "balance_loss_mlp": 1.01997614, "epoch": 0.3407683520711838, "flos": 26761457917440.0, "grad_norm": 3.480497957554778, "language_loss": 0.64691722, "learning_rate": 3.069758718505951e-06, "loss": 0.66905451, "num_input_tokens_seen": 60951145, "step": 2834, "time_per_iteration": 2.7776694297790527 }, { "auxiliary_loss_clip": 0.01196029, "auxiliary_loss_mlp": 0.01032926, "balance_loss_clip": 1.05829537, "balance_loss_mlp": 1.02442646, "epoch": 0.3408885949618229, "flos": 28767643309440.0, "grad_norm": 1.800616487444273, "language_loss": 0.79961705, "learning_rate": 3.0691004620819836e-06, "loss": 0.82190657, "num_input_tokens_seen": 60971275, "step": 2835, "time_per_iteration": 2.76962947845459 }, { "auxiliary_loss_clip": 0.010851, "auxiliary_loss_mlp": 0.01006588, "balance_loss_clip": 1.02042091, "balance_loss_mlp": 1.00522888, "epoch": 0.341008837852462, "flos": 63576252881280.0, "grad_norm": 0.7890816831684464, "language_loss": 0.60149813, "learning_rate": 3.0684420434763254e-06, "loss": 0.62241495, "num_input_tokens_seen": 61037460, "step": 2836, "time_per_iteration": 3.389636278152466 }, { "auxiliary_loss_clip": 0.01177269, "auxiliary_loss_mlp": 0.01032391, "balance_loss_clip": 1.05737329, "balance_loss_mlp": 1.02416861, "epoch": 0.34112908074310105, "flos": 20812173120000.0, "grad_norm": 1.8547892457530077, "language_loss": 0.76881075, "learning_rate": 3.06778346278886e-06, "loss": 0.79090732, "num_input_tokens_seen": 61056295, "step": 2837, "time_per_iteration": 2.7597947120666504 }, { "auxiliary_loss_clip": 0.01196787, "auxiliary_loss_mlp": 0.01026028, "balance_loss_clip": 1.05792952, "balance_loss_mlp": 1.01737309, "epoch": 0.34124932363374016, "flos": 24976446520320.0, "grad_norm": 2.0931509199902, "language_loss": 0.79119134, "learning_rate": 3.0671247201194906e-06, "loss": 0.81341952, "num_input_tokens_seen": 61078430, "step": 2838, "time_per_iteration": 2.746685028076172 }, { "auxiliary_loss_clip": 0.01187236, "auxiliary_loss_mlp": 0.01033032, "balance_loss_clip": 1.05652666, "balance_loss_mlp": 1.02363253, "epoch": 0.3413695665243792, "flos": 28402970480640.0, "grad_norm": 3.772010742314739, "language_loss": 0.75627255, "learning_rate": 3.066465815568151e-06, "loss": 0.77847528, "num_input_tokens_seen": 61099260, "step": 2839, "time_per_iteration": 2.795510768890381 }, { "auxiliary_loss_clip": 0.01193708, "auxiliary_loss_mlp": 0.01031299, "balance_loss_clip": 1.05560088, "balance_loss_mlp": 1.02272761, "epoch": 0.34148980941501833, "flos": 25302012416640.0, "grad_norm": 2.0789647262447954, "language_loss": 0.68878818, "learning_rate": 3.0658067492347947e-06, "loss": 0.71103823, "num_input_tokens_seen": 61121900, "step": 2840, "time_per_iteration": 2.835843801498413 }, { "auxiliary_loss_clip": 0.01169776, "auxiliary_loss_mlp": 0.01033873, "balance_loss_clip": 1.05732846, "balance_loss_mlp": 1.02518272, "epoch": 0.34161005230565744, "flos": 17530081747200.0, "grad_norm": 2.5703104295114776, "language_loss": 0.66770375, "learning_rate": 3.065147521219402e-06, "loss": 0.68974018, "num_input_tokens_seen": 61141155, "step": 2841, "time_per_iteration": 2.976546049118042 }, { "auxiliary_loss_clip": 0.01180707, "auxiliary_loss_mlp": 0.01031719, "balance_loss_clip": 1.05803752, "balance_loss_mlp": 1.02349377, "epoch": 0.3417302951962965, "flos": 43650101566080.0, "grad_norm": 1.428638762399339, "language_loss": 0.74422157, "learning_rate": 3.064488131621977e-06, "loss": 0.7663458, "num_input_tokens_seen": 61164480, "step": 2842, "time_per_iteration": 2.9742462635040283 }, { "auxiliary_loss_clip": 0.01185279, "auxiliary_loss_mlp": 0.01033972, "balance_loss_clip": 1.05521703, "balance_loss_mlp": 1.02507901, "epoch": 0.3418505380869356, "flos": 30882207012480.0, "grad_norm": 1.796881336703635, "language_loss": 0.73981744, "learning_rate": 3.063828580542549e-06, "loss": 0.76200998, "num_input_tokens_seen": 61185675, "step": 2843, "time_per_iteration": 2.886683225631714 }, { "auxiliary_loss_clip": 0.01188473, "auxiliary_loss_mlp": 0.01031002, "balance_loss_clip": 1.05677557, "balance_loss_mlp": 1.02290773, "epoch": 0.3419707809775747, "flos": 19463871277440.0, "grad_norm": 1.925358116309221, "language_loss": 0.73523426, "learning_rate": 3.0631688680811706e-06, "loss": 0.75742888, "num_input_tokens_seen": 61205300, "step": 2844, "time_per_iteration": 2.8534417152404785 }, { "auxiliary_loss_clip": 0.0119243, "auxiliary_loss_mlp": 0.0103137, "balance_loss_clip": 1.0543716, "balance_loss_mlp": 1.02297127, "epoch": 0.3420910238682138, "flos": 28727818104960.0, "grad_norm": 2.0197440081633564, "language_loss": 0.75582767, "learning_rate": 3.062508994337921e-06, "loss": 0.77806568, "num_input_tokens_seen": 61224905, "step": 2845, "time_per_iteration": 2.823319673538208 }, { "auxiliary_loss_clip": 0.0119123, "auxiliary_loss_mlp": 0.01031565, "balance_loss_clip": 1.05600381, "balance_loss_mlp": 1.02280939, "epoch": 0.3422112667588529, "flos": 21397265758080.0, "grad_norm": 1.9841235710981766, "language_loss": 0.7920242, "learning_rate": 3.0618489594129013e-06, "loss": 0.8142522, "num_input_tokens_seen": 61243045, "step": 2846, "time_per_iteration": 2.7707979679107666 }, { "auxiliary_loss_clip": 0.01190722, "auxiliary_loss_mlp": 0.01034869, "balance_loss_clip": 1.05951214, "balance_loss_mlp": 1.02560592, "epoch": 0.342331509649492, "flos": 13881450038400.0, "grad_norm": 1.888317566708277, "language_loss": 0.70777565, "learning_rate": 3.061188763406239e-06, "loss": 0.73003161, "num_input_tokens_seen": 61259190, "step": 2847, "time_per_iteration": 2.8624627590179443 }, { "auxiliary_loss_clip": 0.0118523, "auxiliary_loss_mlp": 0.01034061, "balance_loss_clip": 1.05730367, "balance_loss_mlp": 1.0254153, "epoch": 0.34245175254013105, "flos": 28621450955520.0, "grad_norm": 2.7259952503189226, "language_loss": 0.82119322, "learning_rate": 3.060528406418085e-06, "loss": 0.84338605, "num_input_tokens_seen": 61279040, "step": 2848, "time_per_iteration": 2.944096565246582 }, { "auxiliary_loss_clip": 0.01181955, "auxiliary_loss_mlp": 0.01029891, "balance_loss_clip": 1.0559231, "balance_loss_mlp": 1.02169526, "epoch": 0.34257199543077016, "flos": 34127058960000.0, "grad_norm": 1.7311957568956728, "language_loss": 0.61714828, "learning_rate": 3.0598678885486145e-06, "loss": 0.63926673, "num_input_tokens_seen": 61301580, "step": 2849, "time_per_iteration": 2.9040451049804688 }, { "auxiliary_loss_clip": 0.01185447, "auxiliary_loss_mlp": 0.01058284, "balance_loss_clip": 1.05435717, "balance_loss_mlp": 1.0197922, "epoch": 0.34269223832140927, "flos": 19974018188160.0, "grad_norm": 1.6683804821850845, "language_loss": 0.74495065, "learning_rate": 3.0592072098980282e-06, "loss": 0.76738799, "num_input_tokens_seen": 61321240, "step": 2850, "time_per_iteration": 2.793212890625 }, { "auxiliary_loss_clip": 0.01182966, "auxiliary_loss_mlp": 0.01033218, "balance_loss_clip": 1.05634344, "balance_loss_mlp": 1.02474833, "epoch": 0.3428124812120483, "flos": 27235658292480.0, "grad_norm": 2.7495968243704363, "language_loss": 0.72595799, "learning_rate": 3.0585463705665514e-06, "loss": 0.74811983, "num_input_tokens_seen": 61341615, "step": 2851, "time_per_iteration": 2.7817742824554443 }, { "auxiliary_loss_clip": 0.01183786, "auxiliary_loss_mlp": 0.01031731, "balance_loss_clip": 1.05595064, "balance_loss_mlp": 1.02264762, "epoch": 0.34293272410268744, "flos": 24570871079040.0, "grad_norm": 2.395321514968215, "language_loss": 0.70780778, "learning_rate": 3.0578853706544304e-06, "loss": 0.729963, "num_input_tokens_seen": 61359005, "step": 2852, "time_per_iteration": 2.970202922821045 }, { "auxiliary_loss_clip": 0.01189215, "auxiliary_loss_mlp": 0.01057998, "balance_loss_clip": 1.05951738, "balance_loss_mlp": 1.01875949, "epoch": 0.34305296699332655, "flos": 21506865131520.0, "grad_norm": 2.1110847070803547, "language_loss": 0.64993596, "learning_rate": 3.0572242102619404e-06, "loss": 0.6724081, "num_input_tokens_seen": 61376160, "step": 2853, "time_per_iteration": 3.7303378582000732 }, { "auxiliary_loss_clip": 0.01185618, "auxiliary_loss_mlp": 0.01028618, "balance_loss_clip": 1.05644131, "balance_loss_mlp": 1.02005243, "epoch": 0.3431732098839656, "flos": 24056665931520.0, "grad_norm": 2.2712235448724183, "language_loss": 0.807145, "learning_rate": 3.0565628894893784e-06, "loss": 0.82928729, "num_input_tokens_seen": 61396795, "step": 2854, "time_per_iteration": 2.837157726287842 }, { "auxiliary_loss_clip": 0.01182737, "auxiliary_loss_mlp": 0.01025862, "balance_loss_clip": 1.05526984, "balance_loss_mlp": 1.01736879, "epoch": 0.3432934527746047, "flos": 16800879744000.0, "grad_norm": 1.8267763142346722, "language_loss": 0.74449909, "learning_rate": 3.0559014084370655e-06, "loss": 0.76658511, "num_input_tokens_seen": 61415320, "step": 2855, "time_per_iteration": 2.833875894546509 }, { "auxiliary_loss_clip": 0.01194731, "auxiliary_loss_mlp": 0.01028981, "balance_loss_clip": 1.05769014, "balance_loss_mlp": 1.0199275, "epoch": 0.34341369566524377, "flos": 23439720908160.0, "grad_norm": 1.8269421577754668, "language_loss": 0.78661203, "learning_rate": 3.055239767205349e-06, "loss": 0.80884922, "num_input_tokens_seen": 61437070, "step": 2856, "time_per_iteration": 3.8478941917419434 }, { "auxiliary_loss_clip": 0.01190081, "auxiliary_loss_mlp": 0.01031578, "balance_loss_clip": 1.06041586, "balance_loss_mlp": 1.02348399, "epoch": 0.3435339385558829, "flos": 17267466435840.0, "grad_norm": 1.787055217263602, "language_loss": 0.7835651, "learning_rate": 3.054577965894599e-06, "loss": 0.80578166, "num_input_tokens_seen": 61453215, "step": 2857, "time_per_iteration": 3.8035314083099365 }, { "auxiliary_loss_clip": 0.01194944, "auxiliary_loss_mlp": 0.01027713, "balance_loss_clip": 1.06002462, "balance_loss_mlp": 1.01936793, "epoch": 0.343654181446522, "flos": 22199366413440.0, "grad_norm": 1.5672641265248586, "language_loss": 0.70283914, "learning_rate": 3.0539160046052094e-06, "loss": 0.72506571, "num_input_tokens_seen": 61472915, "step": 2858, "time_per_iteration": 2.899287462234497 }, { "auxiliary_loss_clip": 0.01185489, "auxiliary_loss_mlp": 0.01032857, "balance_loss_clip": 1.05941057, "balance_loss_mlp": 1.02329087, "epoch": 0.34377442433716104, "flos": 19901801894400.0, "grad_norm": 2.473867308258705, "language_loss": 0.7065767, "learning_rate": 3.0532538834376003e-06, "loss": 0.72876018, "num_input_tokens_seen": 61492475, "step": 2859, "time_per_iteration": 3.60373854637146 }, { "auxiliary_loss_clip": 0.01196803, "auxiliary_loss_mlp": 0.01023533, "balance_loss_clip": 1.05787754, "balance_loss_mlp": 1.01521778, "epoch": 0.34389466722780015, "flos": 22197678474240.0, "grad_norm": 2.6166331434428787, "language_loss": 0.78009963, "learning_rate": 3.0525916024922143e-06, "loss": 0.80230296, "num_input_tokens_seen": 61511660, "step": 2860, "time_per_iteration": 2.7367687225341797 }, { "auxiliary_loss_clip": 0.01187143, "auxiliary_loss_mlp": 0.01029858, "balance_loss_clip": 1.05671144, "balance_loss_mlp": 1.02118587, "epoch": 0.34401491011843927, "flos": 18624567110400.0, "grad_norm": 3.1883627334061915, "language_loss": 0.83634263, "learning_rate": 3.0519291618695193e-06, "loss": 0.85851264, "num_input_tokens_seen": 61529060, "step": 2861, "time_per_iteration": 2.819915771484375 }, { "auxiliary_loss_clip": 0.01175871, "auxiliary_loss_mlp": 0.01032686, "balance_loss_clip": 1.05525112, "balance_loss_mlp": 1.02342916, "epoch": 0.3441351530090783, "flos": 17858197509120.0, "grad_norm": 1.9931492245530233, "language_loss": 0.75598782, "learning_rate": 3.0512665616700065e-06, "loss": 0.77807337, "num_input_tokens_seen": 61548125, "step": 2862, "time_per_iteration": 2.875340700149536 }, { "auxiliary_loss_clip": 0.01172218, "auxiliary_loss_mlp": 0.01031441, "balance_loss_clip": 1.05432391, "balance_loss_mlp": 1.02309656, "epoch": 0.34425539589971743, "flos": 23112754381440.0, "grad_norm": 3.1548239457354894, "language_loss": 0.89158833, "learning_rate": 3.0506038019941933e-06, "loss": 0.91362494, "num_input_tokens_seen": 61568135, "step": 2863, "time_per_iteration": 2.89613676071167 }, { "auxiliary_loss_clip": 0.01184835, "auxiliary_loss_mlp": 0.01033757, "balance_loss_clip": 1.05794573, "balance_loss_mlp": 1.02478039, "epoch": 0.34437563879035654, "flos": 21907699977600.0, "grad_norm": 3.1260782407362844, "language_loss": 0.67423183, "learning_rate": 3.049940882942617e-06, "loss": 0.69641775, "num_input_tokens_seen": 61586920, "step": 2864, "time_per_iteration": 2.9408538341522217 }, { "auxiliary_loss_clip": 0.01192368, "auxiliary_loss_mlp": 0.01031972, "balance_loss_clip": 1.0547545, "balance_loss_mlp": 1.02244759, "epoch": 0.3444958816809956, "flos": 23076915586560.0, "grad_norm": 2.775646078345754, "language_loss": 0.80129111, "learning_rate": 3.0492778046158448e-06, "loss": 0.82353449, "num_input_tokens_seen": 61608340, "step": 2865, "time_per_iteration": 2.818481206893921 }, { "auxiliary_loss_clip": 0.01186601, "auxiliary_loss_mlp": 0.01041027, "balance_loss_clip": 1.05689716, "balance_loss_mlp": 1.0322355, "epoch": 0.3446161245716347, "flos": 21908633731200.0, "grad_norm": 7.5112811104572215, "language_loss": 0.76828283, "learning_rate": 3.0486145671144633e-06, "loss": 0.79055917, "num_input_tokens_seen": 61628130, "step": 2866, "time_per_iteration": 2.736919641494751 }, { "auxiliary_loss_clip": 0.01163997, "auxiliary_loss_mlp": 0.0102601, "balance_loss_clip": 1.05688119, "balance_loss_mlp": 1.01723027, "epoch": 0.3447363674622738, "flos": 25112834461440.0, "grad_norm": 2.2299411255988915, "language_loss": 0.76754868, "learning_rate": 3.047951170539086e-06, "loss": 0.7894488, "num_input_tokens_seen": 61647755, "step": 2867, "time_per_iteration": 2.88020396232605 }, { "auxiliary_loss_clip": 0.01177361, "auxiliary_loss_mlp": 0.01030161, "balance_loss_clip": 1.05631661, "balance_loss_mlp": 1.02288592, "epoch": 0.3448566103529129, "flos": 11984684451840.0, "grad_norm": 1.9146075009550583, "language_loss": 0.84136397, "learning_rate": 3.047287614990349e-06, "loss": 0.8634392, "num_input_tokens_seen": 61665675, "step": 2868, "time_per_iteration": 2.7803828716278076 }, { "auxiliary_loss_clip": 0.01184415, "auxiliary_loss_mlp": 0.01031643, "balance_loss_clip": 1.06008029, "balance_loss_mlp": 1.02258265, "epoch": 0.344976853243552, "flos": 40187882465280.0, "grad_norm": 2.206290095117522, "language_loss": 0.61963212, "learning_rate": 3.046623900568914e-06, "loss": 0.64179265, "num_input_tokens_seen": 61688240, "step": 2869, "time_per_iteration": 2.9775185585021973 }, { "auxiliary_loss_clip": 0.0118295, "auxiliary_loss_mlp": 0.01032435, "balance_loss_clip": 1.05546296, "balance_loss_mlp": 1.02386951, "epoch": 0.34509709613419104, "flos": 28723652127360.0, "grad_norm": 2.4711659011262617, "language_loss": 0.70167077, "learning_rate": 3.045960027375465e-06, "loss": 0.72382462, "num_input_tokens_seen": 61706075, "step": 2870, "time_per_iteration": 2.9290108680725098 }, { "auxiliary_loss_clip": 0.01194548, "auxiliary_loss_mlp": 0.01035077, "balance_loss_clip": 1.05620003, "balance_loss_mlp": 1.02573729, "epoch": 0.34521733902483015, "flos": 29967597982080.0, "grad_norm": 4.270147178687621, "language_loss": 0.8251462, "learning_rate": 3.045295995510711e-06, "loss": 0.84744245, "num_input_tokens_seen": 61723045, "step": 2871, "time_per_iteration": 2.827791452407837 }, { "auxiliary_loss_clip": 0.01181924, "auxiliary_loss_mlp": 0.01027726, "balance_loss_clip": 1.0563252, "balance_loss_mlp": 1.01973915, "epoch": 0.34533758191546926, "flos": 27923059843200.0, "grad_norm": 1.8732248054825023, "language_loss": 0.7369343, "learning_rate": 3.0446318050753865e-06, "loss": 0.75903082, "num_input_tokens_seen": 61743525, "step": 2872, "time_per_iteration": 2.878361463546753 }, { "auxiliary_loss_clip": 0.01184581, "auxiliary_loss_mlp": 0.01026491, "balance_loss_clip": 1.05690038, "balance_loss_mlp": 1.01847398, "epoch": 0.3454578248061083, "flos": 27125879351040.0, "grad_norm": 2.1021313546853495, "language_loss": 0.77700788, "learning_rate": 3.0439674561702474e-06, "loss": 0.79911864, "num_input_tokens_seen": 61763025, "step": 2873, "time_per_iteration": 2.9176158905029297 }, { "auxiliary_loss_clip": 0.01185452, "auxiliary_loss_mlp": 0.01022316, "balance_loss_clip": 1.05459118, "balance_loss_mlp": 1.01428115, "epoch": 0.3455780676967474, "flos": 19024899166080.0, "grad_norm": 3.311222185038102, "language_loss": 0.88436502, "learning_rate": 3.043302948896076e-06, "loss": 0.90644276, "num_input_tokens_seen": 61781630, "step": 2874, "time_per_iteration": 2.8434581756591797 }, { "auxiliary_loss_clip": 0.01172571, "auxiliary_loss_mlp": 0.01030648, "balance_loss_clip": 1.05854726, "balance_loss_mlp": 1.02204156, "epoch": 0.34569831058738654, "flos": 34496005507200.0, "grad_norm": 2.6805785789689383, "language_loss": 0.6070689, "learning_rate": 3.0426382833536756e-06, "loss": 0.62910104, "num_input_tokens_seen": 61804985, "step": 2875, "time_per_iteration": 3.04890513420105 }, { "auxiliary_loss_clip": 0.01178826, "auxiliary_loss_mlp": 0.01027937, "balance_loss_clip": 1.05526841, "balance_loss_mlp": 1.01989663, "epoch": 0.3458185534780256, "flos": 31138681098240.0, "grad_norm": 4.526686233586603, "language_loss": 0.778813, "learning_rate": 3.041973459643877e-06, "loss": 0.80088061, "num_input_tokens_seen": 61824440, "step": 2876, "time_per_iteration": 3.0671300888061523 }, { "auxiliary_loss_clip": 0.01175563, "auxiliary_loss_mlp": 0.01029864, "balance_loss_clip": 1.05500579, "balance_loss_mlp": 1.02144778, "epoch": 0.3459387963686647, "flos": 32452508862720.0, "grad_norm": 2.1705865333467718, "language_loss": 0.67196321, "learning_rate": 3.0413084778675334e-06, "loss": 0.69401747, "num_input_tokens_seen": 61845690, "step": 2877, "time_per_iteration": 2.9203882217407227 }, { "auxiliary_loss_clip": 0.01180489, "auxiliary_loss_mlp": 0.01058449, "balance_loss_clip": 1.05499327, "balance_loss_mlp": 1.02112079, "epoch": 0.3460590392593038, "flos": 24675658030080.0, "grad_norm": 2.00375639140621, "language_loss": 0.83665419, "learning_rate": 3.0406433381255214e-06, "loss": 0.8590436, "num_input_tokens_seen": 61863725, "step": 2878, "time_per_iteration": 2.828660011291504 }, { "auxiliary_loss_clip": 0.01189244, "auxiliary_loss_mlp": 0.01025762, "balance_loss_clip": 1.05883288, "balance_loss_mlp": 1.01760256, "epoch": 0.34617928214994287, "flos": 18807316531200.0, "grad_norm": 2.664446400000468, "language_loss": 0.82144636, "learning_rate": 3.0399780405187425e-06, "loss": 0.8435964, "num_input_tokens_seen": 61882720, "step": 2879, "time_per_iteration": 3.758342981338501 }, { "auxiliary_loss_clip": 0.01186853, "auxiliary_loss_mlp": 0.01033447, "balance_loss_clip": 1.05699086, "balance_loss_mlp": 1.02553177, "epoch": 0.346299525040582, "flos": 24857653265280.0, "grad_norm": 3.112364937962535, "language_loss": 0.78677702, "learning_rate": 3.0393125851481216e-06, "loss": 0.80897999, "num_input_tokens_seen": 61902595, "step": 2880, "time_per_iteration": 2.950930595397949 }, { "auxiliary_loss_clip": 0.0117706, "auxiliary_loss_mlp": 0.01026044, "balance_loss_clip": 1.05557573, "balance_loss_mlp": 1.01830149, "epoch": 0.3464197679312211, "flos": 16434914025600.0, "grad_norm": 2.370123201687182, "language_loss": 0.86453378, "learning_rate": 3.038646972114608e-06, "loss": 0.88656485, "num_input_tokens_seen": 61918920, "step": 2881, "time_per_iteration": 2.707028388977051 }, { "auxiliary_loss_clip": 0.01179248, "auxiliary_loss_mlp": 0.01029893, "balance_loss_clip": 1.05777788, "balance_loss_mlp": 1.02175093, "epoch": 0.34654001082186014, "flos": 22382474970240.0, "grad_norm": 1.7419591934010688, "language_loss": 0.67266703, "learning_rate": 3.037981201519174e-06, "loss": 0.69475842, "num_input_tokens_seen": 61939520, "step": 2882, "time_per_iteration": 3.77909779548645 }, { "auxiliary_loss_clip": 0.01188476, "auxiliary_loss_mlp": 0.01029096, "balance_loss_clip": 1.05813932, "balance_loss_mlp": 1.02118325, "epoch": 0.34666025371249926, "flos": 19573901614080.0, "grad_norm": 2.7693645501488624, "language_loss": 0.7156195, "learning_rate": 3.0373152734628175e-06, "loss": 0.73779523, "num_input_tokens_seen": 61957800, "step": 2883, "time_per_iteration": 3.6749112606048584 }, { "auxiliary_loss_clip": 0.01183324, "auxiliary_loss_mlp": 0.01028664, "balance_loss_clip": 1.05532742, "balance_loss_mlp": 1.02045071, "epoch": 0.34678049660313837, "flos": 15267637751040.0, "grad_norm": 1.8375011327810427, "language_loss": 0.75930047, "learning_rate": 3.0366491880465584e-06, "loss": 0.78142035, "num_input_tokens_seen": 61975820, "step": 2884, "time_per_iteration": 3.7184226512908936 }, { "auxiliary_loss_clip": 0.01197113, "auxiliary_loss_mlp": 0.01034595, "balance_loss_clip": 1.05969512, "balance_loss_mlp": 1.02601242, "epoch": 0.3469007394937774, "flos": 21181550630400.0, "grad_norm": 1.5641547223574184, "language_loss": 0.82214969, "learning_rate": 3.035982945371443e-06, "loss": 0.84446681, "num_input_tokens_seen": 61997515, "step": 2885, "time_per_iteration": 2.764190196990967 }, { "auxiliary_loss_clip": 0.01193628, "auxiliary_loss_mlp": 0.01028496, "balance_loss_clip": 1.05682528, "balance_loss_mlp": 1.01992512, "epoch": 0.34702098238441653, "flos": 22375471818240.0, "grad_norm": 2.594987095113389, "language_loss": 0.84910303, "learning_rate": 3.035316545538537e-06, "loss": 0.8713243, "num_input_tokens_seen": 62016310, "step": 2886, "time_per_iteration": 2.770376682281494 }, { "auxiliary_loss_clip": 0.01181326, "auxiliary_loss_mlp": 0.01028836, "balance_loss_clip": 1.05664539, "balance_loss_mlp": 1.01995492, "epoch": 0.3471412252750556, "flos": 22929430343040.0, "grad_norm": 2.1180890253611304, "language_loss": 0.7932294, "learning_rate": 3.034649988648935e-06, "loss": 0.81533104, "num_input_tokens_seen": 62036075, "step": 2887, "time_per_iteration": 2.826171398162842 }, { "auxiliary_loss_clip": 0.01187578, "auxiliary_loss_mlp": 0.01026675, "balance_loss_clip": 1.05511189, "balance_loss_mlp": 1.01839018, "epoch": 0.3472614681656947, "flos": 21324259365120.0, "grad_norm": 1.837733667393182, "language_loss": 0.8070516, "learning_rate": 3.033983274803752e-06, "loss": 0.82919419, "num_input_tokens_seen": 62055865, "step": 2888, "time_per_iteration": 2.774869203567505 }, { "auxiliary_loss_clip": 0.011841, "auxiliary_loss_mlp": 0.01029097, "balance_loss_clip": 1.05722404, "balance_loss_mlp": 1.0208416, "epoch": 0.3473817110563338, "flos": 23475739271040.0, "grad_norm": 4.206438199228913, "language_loss": 0.72239786, "learning_rate": 3.0333164041041283e-06, "loss": 0.74452984, "num_input_tokens_seen": 62072180, "step": 2889, "time_per_iteration": 2.7078967094421387 }, { "auxiliary_loss_clip": 0.01182451, "auxiliary_loss_mlp": 0.01031712, "balance_loss_clip": 1.06049418, "balance_loss_mlp": 1.02355194, "epoch": 0.34750195394697286, "flos": 22346025644160.0, "grad_norm": 1.7622043287567395, "language_loss": 0.72048128, "learning_rate": 3.032649376651228e-06, "loss": 0.74262297, "num_input_tokens_seen": 62091600, "step": 2890, "time_per_iteration": 2.934258460998535 }, { "auxiliary_loss_clip": 0.0118419, "auxiliary_loss_mlp": 0.010316, "balance_loss_clip": 1.05749071, "balance_loss_mlp": 1.02304053, "epoch": 0.347622196837612, "flos": 29095004885760.0, "grad_norm": 1.848592416219004, "language_loss": 0.75971222, "learning_rate": 3.031982192546238e-06, "loss": 0.78187013, "num_input_tokens_seen": 62114695, "step": 2891, "time_per_iteration": 2.8290700912475586 }, { "auxiliary_loss_clip": 0.01192547, "auxiliary_loss_mlp": 0.01033535, "balance_loss_clip": 1.05702496, "balance_loss_mlp": 1.02490401, "epoch": 0.3477424397282511, "flos": 22455732758400.0, "grad_norm": 2.0584916957445016, "language_loss": 0.94448876, "learning_rate": 3.0313148518903696e-06, "loss": 0.96674961, "num_input_tokens_seen": 62134520, "step": 2892, "time_per_iteration": 2.7651193141937256 }, { "auxiliary_loss_clip": 0.01187309, "auxiliary_loss_mlp": 0.01029893, "balance_loss_clip": 1.05720472, "balance_loss_mlp": 1.02051759, "epoch": 0.34786268261889014, "flos": 15778790242560.0, "grad_norm": 2.5632401154635955, "language_loss": 0.8090632, "learning_rate": 3.030647354784859e-06, "loss": 0.83123529, "num_input_tokens_seen": 62151560, "step": 2893, "time_per_iteration": 2.7563281059265137 }, { "auxiliary_loss_clip": 0.0117788, "auxiliary_loss_mlp": 0.01032121, "balance_loss_clip": 1.05344772, "balance_loss_mlp": 1.02439642, "epoch": 0.34798292550952925, "flos": 20777627214720.0, "grad_norm": 1.8201240566191657, "language_loss": 0.77094525, "learning_rate": 3.029979701330964e-06, "loss": 0.79304528, "num_input_tokens_seen": 62170985, "step": 2894, "time_per_iteration": 2.8647561073303223 }, { "auxiliary_loss_clip": 0.01190701, "auxiliary_loss_mlp": 0.01028732, "balance_loss_clip": 1.05764496, "balance_loss_mlp": 1.02067375, "epoch": 0.34810316840016836, "flos": 19937820257280.0, "grad_norm": 2.649123622411446, "language_loss": 0.79699326, "learning_rate": 3.029311891629966e-06, "loss": 0.81918764, "num_input_tokens_seen": 62189440, "step": 2895, "time_per_iteration": 2.775768280029297 }, { "auxiliary_loss_clip": 0.01182612, "auxiliary_loss_mlp": 0.01031568, "balance_loss_clip": 1.05434704, "balance_loss_mlp": 1.02368867, "epoch": 0.3482234112908074, "flos": 23623296341760.0, "grad_norm": 1.8425856818600548, "language_loss": 0.74132264, "learning_rate": 3.0286439257831744e-06, "loss": 0.76346445, "num_input_tokens_seen": 62208910, "step": 2896, "time_per_iteration": 2.7182395458221436 }, { "auxiliary_loss_clip": 0.01196968, "auxiliary_loss_mlp": 0.01029448, "balance_loss_clip": 1.05706906, "balance_loss_mlp": 1.01942885, "epoch": 0.3483436541814465, "flos": 23986712194560.0, "grad_norm": 2.0620690900958856, "language_loss": 0.71570897, "learning_rate": 3.0279758038919156e-06, "loss": 0.73797309, "num_input_tokens_seen": 62227135, "step": 2897, "time_per_iteration": 2.724087715148926 }, { "auxiliary_loss_clip": 0.01190068, "auxiliary_loss_mlp": 0.01034246, "balance_loss_clip": 1.05584598, "balance_loss_mlp": 1.02531683, "epoch": 0.34846389707208564, "flos": 22638338524800.0, "grad_norm": 2.5903332682803444, "language_loss": 0.78352076, "learning_rate": 3.0273075260575455e-06, "loss": 0.80576384, "num_input_tokens_seen": 62246035, "step": 2898, "time_per_iteration": 2.7117488384246826 }, { "auxiliary_loss_clip": 0.01190074, "auxiliary_loss_mlp": 0.01032581, "balance_loss_clip": 1.0583818, "balance_loss_mlp": 1.02396202, "epoch": 0.3485841399627247, "flos": 21792857218560.0, "grad_norm": 1.8630066331922839, "language_loss": 0.81115723, "learning_rate": 3.0266390923814396e-06, "loss": 0.8333838, "num_input_tokens_seen": 62264095, "step": 2899, "time_per_iteration": 2.867079734802246 }, { "auxiliary_loss_clip": 0.01187865, "auxiliary_loss_mlp": 0.01032197, "balance_loss_clip": 1.05745733, "balance_loss_mlp": 1.02362597, "epoch": 0.3487043828533638, "flos": 17019036996480.0, "grad_norm": 5.47519812609978, "language_loss": 0.82413471, "learning_rate": 3.0259705029650008e-06, "loss": 0.84633535, "num_input_tokens_seen": 62282025, "step": 2900, "time_per_iteration": 2.8166232109069824 }, { "auxiliary_loss_clip": 0.01193093, "auxiliary_loss_mlp": 0.01029204, "balance_loss_clip": 1.05624735, "balance_loss_mlp": 1.02083588, "epoch": 0.34882462574400286, "flos": 22601135013120.0, "grad_norm": 2.3475866691867653, "language_loss": 0.72765881, "learning_rate": 3.025301757909652e-06, "loss": 0.7498818, "num_input_tokens_seen": 62302220, "step": 2901, "time_per_iteration": 2.8028059005737305 }, { "auxiliary_loss_clip": 0.0118567, "auxiliary_loss_mlp": 0.01059856, "balance_loss_clip": 1.05630946, "balance_loss_mlp": 1.0226208, "epoch": 0.34894486863464197, "flos": 29861518141440.0, "grad_norm": 1.5567021396872809, "language_loss": 0.80631971, "learning_rate": 3.024632857316842e-06, "loss": 0.82877499, "num_input_tokens_seen": 62323535, "step": 2902, "time_per_iteration": 2.8460066318511963 }, { "auxiliary_loss_clip": 0.01189597, "auxiliary_loss_mlp": 0.01028113, "balance_loss_clip": 1.05622768, "balance_loss_mlp": 1.01929784, "epoch": 0.3490651115252811, "flos": 22122265870080.0, "grad_norm": 1.862125346577793, "language_loss": 0.77546871, "learning_rate": 3.0239638012880412e-06, "loss": 0.79764581, "num_input_tokens_seen": 62343430, "step": 2903, "time_per_iteration": 2.7296504974365234 }, { "auxiliary_loss_clip": 0.01178163, "auxiliary_loss_mlp": 0.01033184, "balance_loss_clip": 1.05821121, "balance_loss_mlp": 1.0241065, "epoch": 0.34918535441592014, "flos": 12676682943360.0, "grad_norm": 3.8811083949949925, "language_loss": 0.81998432, "learning_rate": 3.0232945899247466e-06, "loss": 0.84209788, "num_input_tokens_seen": 62360365, "step": 2904, "time_per_iteration": 2.8007748126983643 }, { "auxiliary_loss_clip": 0.01192191, "auxiliary_loss_mlp": 0.01031741, "balance_loss_clip": 1.0568006, "balance_loss_mlp": 1.02288342, "epoch": 0.34930559730655925, "flos": 23185617120000.0, "grad_norm": 1.958638676624776, "language_loss": 0.77450454, "learning_rate": 3.022625223328476e-06, "loss": 0.79674381, "num_input_tokens_seen": 62382105, "step": 2905, "time_per_iteration": 3.7319087982177734 }, { "auxiliary_loss_clip": 0.01195451, "auxiliary_loss_mlp": 0.01026658, "balance_loss_clip": 1.0563097, "balance_loss_mlp": 1.0178839, "epoch": 0.34942584019719836, "flos": 22855023319680.0, "grad_norm": 1.6919952257709265, "language_loss": 0.69030499, "learning_rate": 3.0219557016007723e-06, "loss": 0.71252608, "num_input_tokens_seen": 62402235, "step": 2906, "time_per_iteration": 2.7938485145568848 }, { "auxiliary_loss_clip": 0.01184058, "auxiliary_loss_mlp": 0.01030354, "balance_loss_clip": 1.0552628, "balance_loss_mlp": 1.02148461, "epoch": 0.3495460830878374, "flos": 24426043441920.0, "grad_norm": 2.0114349203559505, "language_loss": 0.69668186, "learning_rate": 3.021286024843202e-06, "loss": 0.718826, "num_input_tokens_seen": 62420430, "step": 2907, "time_per_iteration": 2.673257827758789 }, { "auxiliary_loss_clip": 0.01097125, "auxiliary_loss_mlp": 0.01003379, "balance_loss_clip": 1.02472925, "balance_loss_mlp": 1.00179327, "epoch": 0.3496663259784765, "flos": 70008749389440.0, "grad_norm": 1.079874211815976, "language_loss": 0.64751697, "learning_rate": 3.0206161931573526e-06, "loss": 0.668522, "num_input_tokens_seen": 62472980, "step": 2908, "time_per_iteration": 4.154672145843506 }, { "auxiliary_loss_clip": 0.01183197, "auxiliary_loss_mlp": 0.01030205, "balance_loss_clip": 1.05334663, "balance_loss_mlp": 1.02195609, "epoch": 0.34978656886911563, "flos": 28692805322880.0, "grad_norm": 1.6742891489252503, "language_loss": 0.92986119, "learning_rate": 3.0199462066448388e-06, "loss": 0.95199525, "num_input_tokens_seen": 62495175, "step": 2909, "time_per_iteration": 3.7547569274902344 }, { "auxiliary_loss_clip": 0.01193511, "auxiliary_loss_mlp": 0.01026564, "balance_loss_clip": 1.05913293, "balance_loss_mlp": 1.01842797, "epoch": 0.3499068117597547, "flos": 21142156389120.0, "grad_norm": 1.7410689634559753, "language_loss": 0.69129145, "learning_rate": 3.019276065407296e-06, "loss": 0.71349216, "num_input_tokens_seen": 62514295, "step": 2910, "time_per_iteration": 2.8440194129943848 }, { "auxiliary_loss_clip": 0.01188512, "auxiliary_loss_mlp": 0.01027596, "balance_loss_clip": 1.05984879, "balance_loss_mlp": 1.01862526, "epoch": 0.3500270546503938, "flos": 22782699285120.0, "grad_norm": 1.703656033384134, "language_loss": 0.80561405, "learning_rate": 3.018605769546385e-06, "loss": 0.82777512, "num_input_tokens_seen": 62534850, "step": 2911, "time_per_iteration": 3.772299289703369 }, { "auxiliary_loss_clip": 0.01191786, "auxiliary_loss_mlp": 0.01030669, "balance_loss_clip": 1.05771291, "balance_loss_mlp": 1.02137661, "epoch": 0.3501472975410329, "flos": 22894058424960.0, "grad_norm": 2.6423352195776184, "language_loss": 0.79542035, "learning_rate": 3.017935319163788e-06, "loss": 0.81764489, "num_input_tokens_seen": 62553810, "step": 2912, "time_per_iteration": 2.7946057319641113 }, { "auxiliary_loss_clip": 0.01192333, "auxiliary_loss_mlp": 0.01026466, "balance_loss_clip": 1.05706882, "balance_loss_mlp": 1.01684034, "epoch": 0.35026754043167196, "flos": 25446588658560.0, "grad_norm": 2.0058766922435454, "language_loss": 0.70626581, "learning_rate": 3.017264714361213e-06, "loss": 0.72845387, "num_input_tokens_seen": 62573460, "step": 2913, "time_per_iteration": 2.8790836334228516 }, { "auxiliary_loss_clip": 0.01187074, "auxiliary_loss_mlp": 0.01060632, "balance_loss_clip": 1.05644619, "balance_loss_mlp": 1.02217102, "epoch": 0.3503877833223111, "flos": 19573757959680.0, "grad_norm": 3.1743727171521288, "language_loss": 0.82153201, "learning_rate": 3.016593955240389e-06, "loss": 0.84400904, "num_input_tokens_seen": 62592150, "step": 2914, "time_per_iteration": 2.895113229751587 }, { "auxiliary_loss_clip": 0.01091421, "auxiliary_loss_mlp": 0.01004317, "balance_loss_clip": 1.02362776, "balance_loss_mlp": 1.00275552, "epoch": 0.3505080262129502, "flos": 65072075880960.0, "grad_norm": 0.8202500359130713, "language_loss": 0.63633108, "learning_rate": 3.015923041903071e-06, "loss": 0.65728843, "num_input_tokens_seen": 62658275, "step": 2915, "time_per_iteration": 3.4496262073516846 }, { "auxiliary_loss_clip": 0.01187411, "auxiliary_loss_mlp": 0.01032737, "balance_loss_clip": 1.05663073, "balance_loss_mlp": 1.02429128, "epoch": 0.35062826910358924, "flos": 29314562768640.0, "grad_norm": 2.0326627565389868, "language_loss": 0.83560061, "learning_rate": 3.0152519744510347e-06, "loss": 0.85780203, "num_input_tokens_seen": 62678075, "step": 2916, "time_per_iteration": 3.0750136375427246 }, { "auxiliary_loss_clip": 0.01183083, "auxiliary_loss_mlp": 0.01032363, "balance_loss_clip": 1.05259204, "balance_loss_mlp": 1.02407205, "epoch": 0.35074851199422835, "flos": 23987717775360.0, "grad_norm": 2.2542530260868245, "language_loss": 0.83212137, "learning_rate": 3.014580752986081e-06, "loss": 0.85427576, "num_input_tokens_seen": 62696950, "step": 2917, "time_per_iteration": 2.971047878265381 }, { "auxiliary_loss_clip": 0.01180734, "auxiliary_loss_mlp": 0.01031197, "balance_loss_clip": 1.05599785, "balance_loss_mlp": 1.02301908, "epoch": 0.3508687548848674, "flos": 15224436668160.0, "grad_norm": 2.265897975070167, "language_loss": 0.78350294, "learning_rate": 3.0139093776100345e-06, "loss": 0.80562222, "num_input_tokens_seen": 62713540, "step": 2918, "time_per_iteration": 2.9001784324645996 }, { "auxiliary_loss_clip": 0.01191294, "auxiliary_loss_mlp": 0.01028769, "balance_loss_clip": 1.0549283, "balance_loss_mlp": 1.01992941, "epoch": 0.3509889977755065, "flos": 21361750185600.0, "grad_norm": 2.2826802086828324, "language_loss": 0.75651932, "learning_rate": 3.013237848424741e-06, "loss": 0.7787199, "num_input_tokens_seen": 62732925, "step": 2919, "time_per_iteration": 2.8504221439361572 }, { "auxiliary_loss_clip": 0.01188873, "auxiliary_loss_mlp": 0.01028311, "balance_loss_clip": 1.05756927, "balance_loss_mlp": 1.01987672, "epoch": 0.35110924066614563, "flos": 19135360465920.0, "grad_norm": 2.166668950713362, "language_loss": 0.74881285, "learning_rate": 3.012566165532072e-06, "loss": 0.77098465, "num_input_tokens_seen": 62751715, "step": 2920, "time_per_iteration": 2.9573276042938232 }, { "auxiliary_loss_clip": 0.01182435, "auxiliary_loss_mlp": 0.01031223, "balance_loss_clip": 1.05592048, "balance_loss_mlp": 1.0225451, "epoch": 0.3512294835567847, "flos": 21980885938560.0, "grad_norm": 2.2297890104783757, "language_loss": 0.76702464, "learning_rate": 3.0118943290339207e-06, "loss": 0.78916121, "num_input_tokens_seen": 62771925, "step": 2921, "time_per_iteration": 2.8655710220336914 }, { "auxiliary_loss_clip": 0.0117643, "auxiliary_loss_mlp": 0.0102989, "balance_loss_clip": 1.05515099, "balance_loss_mlp": 1.02062154, "epoch": 0.3513497264474238, "flos": 17817294896640.0, "grad_norm": 2.3854487168788774, "language_loss": 0.68013978, "learning_rate": 3.011222339032204e-06, "loss": 0.70220304, "num_input_tokens_seen": 62790075, "step": 2922, "time_per_iteration": 2.8516855239868164 }, { "auxiliary_loss_clip": 0.01193611, "auxiliary_loss_mlp": 0.01035161, "balance_loss_clip": 1.05791962, "balance_loss_mlp": 1.02707839, "epoch": 0.3514699693380629, "flos": 26943417239040.0, "grad_norm": 1.7682308169232086, "language_loss": 0.6906358, "learning_rate": 3.0105501956288626e-06, "loss": 0.71292353, "num_input_tokens_seen": 62810545, "step": 2923, "time_per_iteration": 2.7509334087371826 }, { "auxiliary_loss_clip": 0.01196703, "auxiliary_loss_mlp": 0.01034289, "balance_loss_clip": 1.05778325, "balance_loss_mlp": 1.02465081, "epoch": 0.35159021222870196, "flos": 15267565923840.0, "grad_norm": 2.801825137306914, "language_loss": 0.72873324, "learning_rate": 3.0098778989258602e-06, "loss": 0.7510432, "num_input_tokens_seen": 62829155, "step": 2924, "time_per_iteration": 2.6890511512756348 }, { "auxiliary_loss_clip": 0.01179445, "auxiliary_loss_mlp": 0.01029245, "balance_loss_clip": 1.05746627, "balance_loss_mlp": 1.02042937, "epoch": 0.35171045511934107, "flos": 13984154000640.0, "grad_norm": 1.8851005825148428, "language_loss": 0.88162637, "learning_rate": 3.009205449025183e-06, "loss": 0.90371329, "num_input_tokens_seen": 62845350, "step": 2925, "time_per_iteration": 2.8350603580474854 }, { "auxiliary_loss_clip": 0.01179531, "auxiliary_loss_mlp": 0.01028436, "balance_loss_clip": 1.05514824, "balance_loss_mlp": 1.01920283, "epoch": 0.3518306980099802, "flos": 14283434119680.0, "grad_norm": 2.541984834419373, "language_loss": 0.6312483, "learning_rate": 3.008532846028842e-06, "loss": 0.653328, "num_input_tokens_seen": 62862110, "step": 2926, "time_per_iteration": 2.7905232906341553 }, { "auxiliary_loss_clip": 0.01195506, "auxiliary_loss_mlp": 0.01033324, "balance_loss_clip": 1.05756176, "balance_loss_mlp": 1.02390683, "epoch": 0.35195094090061924, "flos": 27052872958080.0, "grad_norm": 2.1570125677127465, "language_loss": 0.72252721, "learning_rate": 3.0078600900388694e-06, "loss": 0.74481553, "num_input_tokens_seen": 62882415, "step": 2927, "time_per_iteration": 2.944004774093628 }, { "auxiliary_loss_clip": 0.01174535, "auxiliary_loss_mlp": 0.01032699, "balance_loss_clip": 1.05337524, "balance_loss_mlp": 1.0241816, "epoch": 0.35207118379125835, "flos": 25629266252160.0, "grad_norm": 2.266895947889856, "language_loss": 0.73999727, "learning_rate": 3.007187181157323e-06, "loss": 0.76206958, "num_input_tokens_seen": 62902425, "step": 2928, "time_per_iteration": 2.855018138885498 }, { "auxiliary_loss_clip": 0.01167298, "auxiliary_loss_mlp": 0.01028109, "balance_loss_clip": 1.05517423, "balance_loss_mlp": 1.01867366, "epoch": 0.35219142668189746, "flos": 18004713085440.0, "grad_norm": 2.6491247664105324, "language_loss": 0.67668295, "learning_rate": 3.006514119486282e-06, "loss": 0.69863713, "num_input_tokens_seen": 62919255, "step": 2929, "time_per_iteration": 2.861575126647949 }, { "auxiliary_loss_clip": 0.01176062, "auxiliary_loss_mlp": 0.01029228, "balance_loss_clip": 1.05376554, "balance_loss_mlp": 1.02074623, "epoch": 0.3523116695725365, "flos": 14028109269120.0, "grad_norm": 1.9590427914035033, "language_loss": 0.69675893, "learning_rate": 3.005840905127849e-06, "loss": 0.71881187, "num_input_tokens_seen": 62936160, "step": 2930, "time_per_iteration": 2.775933027267456 }, { "auxiliary_loss_clip": 0.01192695, "auxiliary_loss_mlp": 0.01025703, "balance_loss_clip": 1.05620146, "balance_loss_mlp": 1.01720309, "epoch": 0.3524319124631756, "flos": 21433966479360.0, "grad_norm": 2.8217469978241194, "language_loss": 0.86853206, "learning_rate": 3.0051675381841516e-06, "loss": 0.89071596, "num_input_tokens_seen": 62953470, "step": 2931, "time_per_iteration": 3.631450891494751 }, { "auxiliary_loss_clip": 0.0117332, "auxiliary_loss_mlp": 0.01059586, "balance_loss_clip": 1.05570054, "balance_loss_mlp": 1.01925647, "epoch": 0.3525521553538147, "flos": 26322773114880.0, "grad_norm": 1.6742033869836501, "language_loss": 0.7676394, "learning_rate": 3.0044940187573363e-06, "loss": 0.78996849, "num_input_tokens_seen": 62974480, "step": 2932, "time_per_iteration": 2.9310362339019775 }, { "auxiliary_loss_clip": 0.0119166, "auxiliary_loss_mlp": 0.01030414, "balance_loss_clip": 1.05525351, "balance_loss_mlp": 1.02212334, "epoch": 0.3526723982444538, "flos": 21543314457600.0, "grad_norm": 6.655771573083925, "language_loss": 0.65286362, "learning_rate": 3.003820346949578e-06, "loss": 0.67508435, "num_input_tokens_seen": 62992560, "step": 2933, "time_per_iteration": 2.7965919971466064 }, { "auxiliary_loss_clip": 0.01194442, "auxiliary_loss_mlp": 0.01031664, "balance_loss_clip": 1.05591404, "balance_loss_mlp": 1.02297354, "epoch": 0.3527926411350929, "flos": 23733649900800.0, "grad_norm": 2.2009081934907395, "language_loss": 0.79444206, "learning_rate": 3.003146522863071e-06, "loss": 0.81670314, "num_input_tokens_seen": 63013445, "step": 2934, "time_per_iteration": 3.803037643432617 }, { "auxiliary_loss_clip": 0.01183352, "auxiliary_loss_mlp": 0.01027289, "balance_loss_clip": 1.05543041, "balance_loss_mlp": 1.01869416, "epoch": 0.35291288402573195, "flos": 30445461544320.0, "grad_norm": 2.9280167189265787, "language_loss": 0.86153245, "learning_rate": 3.0024725466000345e-06, "loss": 0.88363886, "num_input_tokens_seen": 63033400, "step": 2935, "time_per_iteration": 2.89351224899292 }, { "auxiliary_loss_clip": 0.0119034, "auxiliary_loss_mlp": 0.01025803, "balance_loss_clip": 1.05919099, "balance_loss_mlp": 1.01752353, "epoch": 0.35303312691637107, "flos": 23112179763840.0, "grad_norm": 1.945241795031516, "language_loss": 0.788468, "learning_rate": 3.0017984182627087e-06, "loss": 0.81062949, "num_input_tokens_seen": 63052725, "step": 2936, "time_per_iteration": 4.676474332809448 }, { "auxiliary_loss_clip": 0.01183424, "auxiliary_loss_mlp": 0.01067636, "balance_loss_clip": 1.05570722, "balance_loss_mlp": 1.02570117, "epoch": 0.3531533698070102, "flos": 21835699165440.0, "grad_norm": 2.042294189036763, "language_loss": 0.82137465, "learning_rate": 3.00112413795336e-06, "loss": 0.8438853, "num_input_tokens_seen": 63072560, "step": 2937, "time_per_iteration": 2.83181095123291 }, { "auxiliary_loss_clip": 0.01187505, "auxiliary_loss_mlp": 0.01026053, "balance_loss_clip": 1.05595732, "balance_loss_mlp": 1.01750576, "epoch": 0.35327361269764923, "flos": 15778969810560.0, "grad_norm": 2.337301935922499, "language_loss": 0.8017298, "learning_rate": 3.000449705774275e-06, "loss": 0.82386541, "num_input_tokens_seen": 63090800, "step": 2938, "time_per_iteration": 2.753382921218872 }, { "auxiliary_loss_clip": 0.01192693, "auxiliary_loss_mlp": 0.01027489, "balance_loss_clip": 1.05851293, "balance_loss_mlp": 1.01814294, "epoch": 0.35339385558828834, "flos": 22090413484800.0, "grad_norm": 2.1785742075043224, "language_loss": 0.71646905, "learning_rate": 2.9997751218277654e-06, "loss": 0.73867089, "num_input_tokens_seen": 63108955, "step": 2939, "time_per_iteration": 2.6888458728790283 }, { "auxiliary_loss_clip": 0.01194036, "auxiliary_loss_mlp": 0.01030468, "balance_loss_clip": 1.05704665, "balance_loss_mlp": 1.02180195, "epoch": 0.35351409847892745, "flos": 24165008328960.0, "grad_norm": 3.867091375987045, "language_loss": 0.77732152, "learning_rate": 2.999100386216166e-06, "loss": 0.79956657, "num_input_tokens_seen": 63127895, "step": 2940, "time_per_iteration": 2.8148257732391357 }, { "auxiliary_loss_clip": 0.011881, "auxiliary_loss_mlp": 0.01026322, "balance_loss_clip": 1.05688095, "balance_loss_mlp": 1.01750612, "epoch": 0.3536343413695665, "flos": 27052298340480.0, "grad_norm": 1.895440090920096, "language_loss": 0.74394274, "learning_rate": 2.998425499041831e-06, "loss": 0.76608694, "num_input_tokens_seen": 63148410, "step": 2941, "time_per_iteration": 2.8255105018615723 }, { "auxiliary_loss_clip": 0.01089068, "auxiliary_loss_mlp": 0.01001027, "balance_loss_clip": 1.0208385, "balance_loss_mlp": 0.99934578, "epoch": 0.3537545842602056, "flos": 65991066370560.0, "grad_norm": 1.2806588943778277, "language_loss": 0.64601564, "learning_rate": 2.997750460407142e-06, "loss": 0.66691655, "num_input_tokens_seen": 63209765, "step": 2942, "time_per_iteration": 3.3756229877471924 }, { "auxiliary_loss_clip": 0.01190042, "auxiliary_loss_mlp": 0.01029636, "balance_loss_clip": 1.05681765, "balance_loss_mlp": 1.0202359, "epoch": 0.35387482715084473, "flos": 18436897526400.0, "grad_norm": 3.4895285031046233, "language_loss": 0.69941086, "learning_rate": 2.997075270414501e-06, "loss": 0.72160757, "num_input_tokens_seen": 63226980, "step": 2943, "time_per_iteration": 2.742591381072998 }, { "auxiliary_loss_clip": 0.01090066, "auxiliary_loss_mlp": 0.0100353, "balance_loss_clip": 1.02203059, "balance_loss_mlp": 1.0019207, "epoch": 0.3539950700414838, "flos": 65588579498880.0, "grad_norm": 0.6980422957786135, "language_loss": 0.5773195, "learning_rate": 2.9963999291663347e-06, "loss": 0.59825546, "num_input_tokens_seen": 63292760, "step": 2944, "time_per_iteration": 3.330785036087036 }, { "auxiliary_loss_clip": 0.01182196, "auxiliary_loss_mlp": 0.01028147, "balance_loss_clip": 1.0586524, "balance_loss_mlp": 1.02003443, "epoch": 0.3541153129321229, "flos": 20521655919360.0, "grad_norm": 2.497674635829076, "language_loss": 0.74146378, "learning_rate": 2.9957244367650915e-06, "loss": 0.76356721, "num_input_tokens_seen": 63309005, "step": 2945, "time_per_iteration": 2.85205078125 }, { "auxiliary_loss_clip": 0.01177114, "auxiliary_loss_mlp": 0.01037505, "balance_loss_clip": 1.05896342, "balance_loss_mlp": 1.0284034, "epoch": 0.354235555822762, "flos": 19573578391680.0, "grad_norm": 2.602794456041517, "language_loss": 0.83719063, "learning_rate": 2.9950487933132425e-06, "loss": 0.85933685, "num_input_tokens_seen": 63326420, "step": 2946, "time_per_iteration": 2.7986109256744385 }, { "auxiliary_loss_clip": 0.01193648, "auxiliary_loss_mlp": 0.01028671, "balance_loss_clip": 1.05699325, "balance_loss_mlp": 1.02000475, "epoch": 0.35435579871340106, "flos": 20777268078720.0, "grad_norm": 2.547726550986094, "language_loss": 0.71619248, "learning_rate": 2.994372998913283e-06, "loss": 0.73841578, "num_input_tokens_seen": 63344925, "step": 2947, "time_per_iteration": 2.90444016456604 }, { "auxiliary_loss_clip": 0.01189695, "auxiliary_loss_mlp": 0.01034769, "balance_loss_clip": 1.0591172, "balance_loss_mlp": 1.02591133, "epoch": 0.35447604160404017, "flos": 23951807153280.0, "grad_norm": 4.341259710723119, "language_loss": 0.62141103, "learning_rate": 2.99369705366773e-06, "loss": 0.64365566, "num_input_tokens_seen": 63365170, "step": 2948, "time_per_iteration": 2.7977724075317383 }, { "auxiliary_loss_clip": 0.01182861, "auxiliary_loss_mlp": 0.01029759, "balance_loss_clip": 1.05609417, "balance_loss_mlp": 1.02129495, "epoch": 0.3545962844946792, "flos": 23435662671360.0, "grad_norm": 11.656571675192637, "language_loss": 0.82413614, "learning_rate": 2.9930209576791244e-06, "loss": 0.84626234, "num_input_tokens_seen": 63383645, "step": 2949, "time_per_iteration": 2.8910775184631348 }, { "auxiliary_loss_clip": 0.01186106, "auxiliary_loss_mlp": 0.01028425, "balance_loss_clip": 1.05583644, "balance_loss_mlp": 1.02003825, "epoch": 0.35471652738531834, "flos": 22085134185600.0, "grad_norm": 1.8313392116786817, "language_loss": 0.63734996, "learning_rate": 2.9923447110500285e-06, "loss": 0.65949523, "num_input_tokens_seen": 63402390, "step": 2950, "time_per_iteration": 2.7601966857910156 }, { "auxiliary_loss_clip": 0.01182737, "auxiliary_loss_mlp": 0.01027548, "balance_loss_clip": 1.05798995, "balance_loss_mlp": 1.01891112, "epoch": 0.35483677027595745, "flos": 27341881787520.0, "grad_norm": 1.5325448818974876, "language_loss": 0.75416398, "learning_rate": 2.9916683138830295e-06, "loss": 0.77626681, "num_input_tokens_seen": 63423055, "step": 2951, "time_per_iteration": 2.7736215591430664 }, { "auxiliary_loss_clip": 0.0118172, "auxiliary_loss_mlp": 0.01031426, "balance_loss_clip": 1.05573571, "balance_loss_mlp": 1.02296233, "epoch": 0.3549570131665965, "flos": 13516166678400.0, "grad_norm": 2.4581407026630906, "language_loss": 0.80862868, "learning_rate": 2.9909917662807353e-06, "loss": 0.83076018, "num_input_tokens_seen": 63440855, "step": 2952, "time_per_iteration": 2.8757588863372803 }, { "auxiliary_loss_clip": 0.01186641, "auxiliary_loss_mlp": 0.01035731, "balance_loss_clip": 1.05512178, "balance_loss_mlp": 1.02682662, "epoch": 0.3550772560572356, "flos": 20887549810560.0, "grad_norm": 2.135791504450234, "language_loss": 0.68992281, "learning_rate": 2.9903150683457783e-06, "loss": 0.71214652, "num_input_tokens_seen": 63459400, "step": 2953, "time_per_iteration": 2.6751890182495117 }, { "auxiliary_loss_clip": 0.01184602, "auxiliary_loss_mlp": 0.01033404, "balance_loss_clip": 1.05264676, "balance_loss_mlp": 1.02474976, "epoch": 0.3551974989478747, "flos": 20194042947840.0, "grad_norm": 1.900245793890513, "language_loss": 0.64784473, "learning_rate": 2.9896382201808126e-06, "loss": 0.67002487, "num_input_tokens_seen": 63476800, "step": 2954, "time_per_iteration": 2.7217466831207275 }, { "auxiliary_loss_clip": 0.01193134, "auxiliary_loss_mlp": 0.0102523, "balance_loss_clip": 1.05600607, "balance_loss_mlp": 1.01702809, "epoch": 0.3553177418385138, "flos": 19828831415040.0, "grad_norm": 3.388006486998776, "language_loss": 0.81160533, "learning_rate": 2.988961221888516e-06, "loss": 0.83378899, "num_input_tokens_seen": 63493475, "step": 2955, "time_per_iteration": 2.6451494693756104 }, { "auxiliary_loss_clip": 0.01173307, "auxiliary_loss_mlp": 0.01030576, "balance_loss_clip": 1.05486047, "balance_loss_mlp": 1.02107477, "epoch": 0.3554379847291529, "flos": 14829132516480.0, "grad_norm": 2.5270449672467303, "language_loss": 0.79112267, "learning_rate": 2.988284073571589e-06, "loss": 0.81316155, "num_input_tokens_seen": 63509560, "step": 2956, "time_per_iteration": 2.7355051040649414 }, { "auxiliary_loss_clip": 0.01188245, "auxiliary_loss_mlp": 0.01054291, "balance_loss_clip": 1.05437207, "balance_loss_mlp": 1.01620734, "epoch": 0.355558227619792, "flos": 20485350247680.0, "grad_norm": 2.9928447110241674, "language_loss": 0.73189902, "learning_rate": 2.9876067753327528e-06, "loss": 0.75432444, "num_input_tokens_seen": 63527290, "step": 2957, "time_per_iteration": 3.6686854362487793 }, { "auxiliary_loss_clip": 0.01191241, "auxiliary_loss_mlp": 0.01028461, "balance_loss_clip": 1.05432868, "balance_loss_mlp": 1.02037287, "epoch": 0.35567847051043106, "flos": 37663613256960.0, "grad_norm": 2.040475500265387, "language_loss": 0.80858147, "learning_rate": 2.986929327274754e-06, "loss": 0.83077854, "num_input_tokens_seen": 63547870, "step": 2958, "time_per_iteration": 2.966919183731079 }, { "auxiliary_loss_clip": 0.01185484, "auxiliary_loss_mlp": 0.0102663, "balance_loss_clip": 1.05535817, "balance_loss_mlp": 1.01826167, "epoch": 0.35579871340107017, "flos": 26943058103040.0, "grad_norm": 1.817019982000179, "language_loss": 0.78871018, "learning_rate": 2.9862517295003617e-06, "loss": 0.81083137, "num_input_tokens_seen": 63568285, "step": 2959, "time_per_iteration": 3.846235752105713 }, { "auxiliary_loss_clip": 0.01181724, "auxiliary_loss_mlp": 0.01028537, "balance_loss_clip": 1.05532575, "balance_loss_mlp": 1.02013266, "epoch": 0.3559189562917093, "flos": 28293335193600.0, "grad_norm": 1.8034793595410537, "language_loss": 0.72632676, "learning_rate": 2.9855739821123654e-06, "loss": 0.74842942, "num_input_tokens_seen": 63589865, "step": 2960, "time_per_iteration": 2.8407013416290283 }, { "auxiliary_loss_clip": 0.0118458, "auxiliary_loss_mlp": 0.01032348, "balance_loss_clip": 1.05501842, "balance_loss_mlp": 1.02441502, "epoch": 0.35603919918234833, "flos": 25664063552640.0, "grad_norm": 4.065896369115062, "language_loss": 0.81849277, "learning_rate": 2.98489608521358e-06, "loss": 0.84066212, "num_input_tokens_seen": 63609805, "step": 2961, "time_per_iteration": 2.8115077018737793 }, { "auxiliary_loss_clip": 0.01191553, "auxiliary_loss_mlp": 0.01058981, "balance_loss_clip": 1.05519509, "balance_loss_mlp": 1.02008009, "epoch": 0.35615944207298744, "flos": 23000856537600.0, "grad_norm": 2.312563257538813, "language_loss": 0.79539478, "learning_rate": 2.9842180389068425e-06, "loss": 0.81790006, "num_input_tokens_seen": 63627115, "step": 2962, "time_per_iteration": 3.6740450859069824 }, { "auxiliary_loss_clip": 0.0108183, "auxiliary_loss_mlp": 0.00999884, "balance_loss_clip": 1.02082443, "balance_loss_mlp": 0.99819148, "epoch": 0.35627968496362655, "flos": 68251283723520.0, "grad_norm": 0.7592094742435004, "language_loss": 0.59207749, "learning_rate": 2.98353984329501e-06, "loss": 0.61289465, "num_input_tokens_seen": 63691460, "step": 2963, "time_per_iteration": 3.440871238708496 }, { "auxiliary_loss_clip": 0.01183322, "auxiliary_loss_mlp": 0.01030394, "balance_loss_clip": 1.05572569, "balance_loss_mlp": 1.02103043, "epoch": 0.3563999278542656, "flos": 22641714403200.0, "grad_norm": 1.6726304218968948, "language_loss": 0.70545852, "learning_rate": 2.982861498480965e-06, "loss": 0.72759569, "num_input_tokens_seen": 63713840, "step": 2964, "time_per_iteration": 2.7752997875213623 }, { "auxiliary_loss_clip": 0.01179221, "auxiliary_loss_mlp": 0.01034034, "balance_loss_clip": 1.05627179, "balance_loss_mlp": 1.02565384, "epoch": 0.3565201707449047, "flos": 25952533678080.0, "grad_norm": 1.5814073043606824, "language_loss": 0.82669443, "learning_rate": 2.9821830045676122e-06, "loss": 0.84882694, "num_input_tokens_seen": 63733540, "step": 2965, "time_per_iteration": 2.8824386596679688 }, { "auxiliary_loss_clip": 0.01192442, "auxiliary_loss_mlp": 0.01027473, "balance_loss_clip": 1.05592203, "balance_loss_mlp": 1.01871741, "epoch": 0.3566404136355438, "flos": 28475725478400.0, "grad_norm": 1.9674352725315294, "language_loss": 0.72763306, "learning_rate": 2.9815043616578793e-06, "loss": 0.74983221, "num_input_tokens_seen": 63754335, "step": 2966, "time_per_iteration": 2.737494707107544 }, { "auxiliary_loss_clip": 0.01182897, "auxiliary_loss_mlp": 0.01026732, "balance_loss_clip": 1.05769897, "balance_loss_mlp": 1.01881111, "epoch": 0.3567606565261829, "flos": 38363117690880.0, "grad_norm": 3.5868064766587247, "language_loss": 0.76684582, "learning_rate": 2.9808255698547145e-06, "loss": 0.7889421, "num_input_tokens_seen": 63777135, "step": 2967, "time_per_iteration": 3.02976393699646 }, { "auxiliary_loss_clip": 0.01186406, "auxiliary_loss_mlp": 0.010356, "balance_loss_clip": 1.05580521, "balance_loss_mlp": 1.02757716, "epoch": 0.356880899416822, "flos": 21981029592960.0, "grad_norm": 2.31509553156362, "language_loss": 0.79878026, "learning_rate": 2.9801466292610913e-06, "loss": 0.82100034, "num_input_tokens_seen": 63797020, "step": 2968, "time_per_iteration": 2.729907989501953 }, { "auxiliary_loss_clip": 0.01184358, "auxiliary_loss_mlp": 0.0102895, "balance_loss_clip": 1.0539912, "balance_loss_mlp": 1.02089107, "epoch": 0.35700114230746105, "flos": 18989132198400.0, "grad_norm": 2.040051303618455, "language_loss": 0.80743462, "learning_rate": 2.979467539980003e-06, "loss": 0.82956773, "num_input_tokens_seen": 63813810, "step": 2969, "time_per_iteration": 2.865469455718994 }, { "auxiliary_loss_clip": 0.0118791, "auxiliary_loss_mlp": 0.01029036, "balance_loss_clip": 1.05451632, "balance_loss_mlp": 1.02044654, "epoch": 0.35712138519810016, "flos": 19756112330880.0, "grad_norm": 1.901237900665599, "language_loss": 0.77046663, "learning_rate": 2.978788302114468e-06, "loss": 0.79263604, "num_input_tokens_seen": 63830925, "step": 2970, "time_per_iteration": 2.6701462268829346 }, { "auxiliary_loss_clip": 0.01184869, "auxiliary_loss_mlp": 0.01026843, "balance_loss_clip": 1.05417097, "balance_loss_mlp": 1.01831377, "epoch": 0.35724162808873927, "flos": 35183012008320.0, "grad_norm": 2.525460506362163, "language_loss": 0.83253896, "learning_rate": 2.9781089157675255e-06, "loss": 0.8546561, "num_input_tokens_seen": 63849385, "step": 2971, "time_per_iteration": 2.864063024520874 }, { "auxiliary_loss_clip": 0.01182653, "auxiliary_loss_mlp": 0.01026817, "balance_loss_clip": 1.0545063, "balance_loss_mlp": 1.01843643, "epoch": 0.3573618709793783, "flos": 25556726736000.0, "grad_norm": 1.693022881879681, "language_loss": 0.88272262, "learning_rate": 2.977429381042238e-06, "loss": 0.9048174, "num_input_tokens_seen": 63870060, "step": 2972, "time_per_iteration": 2.7531745433807373 }, { "auxiliary_loss_clip": 0.01186089, "auxiliary_loss_mlp": 0.01034865, "balance_loss_clip": 1.0573045, "balance_loss_mlp": 1.02696109, "epoch": 0.35748211387001744, "flos": 29132352051840.0, "grad_norm": 2.1448367005221343, "language_loss": 0.88979179, "learning_rate": 2.9767496980416913e-06, "loss": 0.91200125, "num_input_tokens_seen": 63889355, "step": 2973, "time_per_iteration": 2.7667031288146973 }, { "auxiliary_loss_clip": 0.01180066, "auxiliary_loss_mlp": 0.01028992, "balance_loss_clip": 1.05427265, "balance_loss_mlp": 1.01995599, "epoch": 0.35760235676065655, "flos": 13954169122560.0, "grad_norm": 2.3798291386207744, "language_loss": 0.81251347, "learning_rate": 2.9760698668689914e-06, "loss": 0.83460408, "num_input_tokens_seen": 63905580, "step": 2974, "time_per_iteration": 2.700798988342285 }, { "auxiliary_loss_clip": 0.01185578, "auxiliary_loss_mlp": 0.01027099, "balance_loss_clip": 1.05338383, "balance_loss_mlp": 1.01858783, "epoch": 0.3577225996512956, "flos": 44018688977280.0, "grad_norm": 2.0556677364553613, "language_loss": 0.71107477, "learning_rate": 2.975389887627269e-06, "loss": 0.73320162, "num_input_tokens_seen": 63928180, "step": 2975, "time_per_iteration": 2.8633575439453125 }, { "auxiliary_loss_clip": 0.01183948, "auxiliary_loss_mlp": 0.01029123, "balance_loss_clip": 1.05368304, "balance_loss_mlp": 1.02093291, "epoch": 0.3578428425419347, "flos": 17055199013760.0, "grad_norm": 2.3909200167125104, "language_loss": 0.90398341, "learning_rate": 2.9747097604196764e-06, "loss": 0.92611414, "num_input_tokens_seen": 63944825, "step": 2976, "time_per_iteration": 2.82780385017395 }, { "auxiliary_loss_clip": 0.01084364, "auxiliary_loss_mlp": 0.01012796, "balance_loss_clip": 1.02217627, "balance_loss_mlp": 1.01109159, "epoch": 0.3579630854325738, "flos": 71676550707840.0, "grad_norm": 0.6755140021897906, "language_loss": 0.56620038, "learning_rate": 2.9740294853493875e-06, "loss": 0.58717191, "num_input_tokens_seen": 64016385, "step": 2977, "time_per_iteration": 3.7295522689819336 }, { "auxiliary_loss_clip": 0.0118607, "auxiliary_loss_mlp": 0.01030314, "balance_loss_clip": 1.0571686, "balance_loss_mlp": 1.02217805, "epoch": 0.3580833283232129, "flos": 25046651652480.0, "grad_norm": 2.4756387622622045, "language_loss": 0.67317569, "learning_rate": 2.9733490625196008e-06, "loss": 0.69533956, "num_input_tokens_seen": 64036245, "step": 2978, "time_per_iteration": 3.3231661319732666 }, { "auxiliary_loss_clip": 0.01171926, "auxiliary_loss_mlp": 0.01028201, "balance_loss_clip": 1.05462384, "balance_loss_mlp": 1.02070308, "epoch": 0.358203571213852, "flos": 13953127628160.0, "grad_norm": 3.0808693205978908, "language_loss": 0.75703841, "learning_rate": 2.9726684920335353e-06, "loss": 0.77903968, "num_input_tokens_seen": 64054110, "step": 2979, "time_per_iteration": 2.8256657123565674 }, { "auxiliary_loss_clip": 0.01190469, "auxiliary_loss_mlp": 0.01064272, "balance_loss_clip": 1.05314326, "balance_loss_mlp": 1.02423334, "epoch": 0.35832381410449105, "flos": 20302457172480.0, "grad_norm": 2.0852141130102364, "language_loss": 0.81985021, "learning_rate": 2.971987773994432e-06, "loss": 0.84239763, "num_input_tokens_seen": 64070295, "step": 2980, "time_per_iteration": 2.683228015899658 }, { "auxiliary_loss_clip": 0.01178039, "auxiliary_loss_mlp": 0.01028965, "balance_loss_clip": 1.0525291, "balance_loss_mlp": 1.02034605, "epoch": 0.35844405699513016, "flos": 16983234115200.0, "grad_norm": 2.461591165682796, "language_loss": 0.83060753, "learning_rate": 2.9713069085055566e-06, "loss": 0.85267758, "num_input_tokens_seen": 64088605, "step": 2981, "time_per_iteration": 2.8745832443237305 }, { "auxiliary_loss_clip": 0.0118207, "auxiliary_loss_mlp": 0.01031289, "balance_loss_clip": 1.05640745, "balance_loss_mlp": 1.02303326, "epoch": 0.35856429988576927, "flos": 23216858974080.0, "grad_norm": 1.942401142753432, "language_loss": 0.78790385, "learning_rate": 2.9706258956701958e-06, "loss": 0.81003737, "num_input_tokens_seen": 64108595, "step": 2982, "time_per_iteration": 3.0113372802734375 }, { "auxiliary_loss_clip": 0.0118764, "auxiliary_loss_mlp": 0.01029185, "balance_loss_clip": 1.05408382, "balance_loss_mlp": 1.02011919, "epoch": 0.3586845427764083, "flos": 23034576430080.0, "grad_norm": 2.606408889772106, "language_loss": 0.77592933, "learning_rate": 2.9699447355916575e-06, "loss": 0.79809755, "num_input_tokens_seen": 64127405, "step": 2983, "time_per_iteration": 3.679799795150757 }, { "auxiliary_loss_clip": 0.0118729, "auxiliary_loss_mlp": 0.01052788, "balance_loss_clip": 1.05250978, "balance_loss_mlp": 1.01465678, "epoch": 0.35880478566704743, "flos": 20010682995840.0, "grad_norm": 1.9562628026442634, "language_loss": 0.73725742, "learning_rate": 2.969263428373275e-06, "loss": 0.75965822, "num_input_tokens_seen": 64145755, "step": 2984, "time_per_iteration": 2.6978185176849365 }, { "auxiliary_loss_clip": 0.01184211, "auxiliary_loss_mlp": 0.01024987, "balance_loss_clip": 1.05365849, "balance_loss_mlp": 1.01636219, "epoch": 0.35892502855768654, "flos": 13699095667200.0, "grad_norm": 2.0355501559840437, "language_loss": 0.79398507, "learning_rate": 2.9685819741184007e-06, "loss": 0.81607699, "num_input_tokens_seen": 64164195, "step": 2985, "time_per_iteration": 3.746119260787964 }, { "auxiliary_loss_clip": 0.01178301, "auxiliary_loss_mlp": 0.01028351, "balance_loss_clip": 1.05556726, "balance_loss_mlp": 1.01997697, "epoch": 0.3590452714483256, "flos": 18114096977280.0, "grad_norm": 2.43544329307665, "language_loss": 0.68614686, "learning_rate": 2.967900372930411e-06, "loss": 0.70821345, "num_input_tokens_seen": 64182705, "step": 2986, "time_per_iteration": 2.7580065727233887 }, { "auxiliary_loss_clip": 0.01179051, "auxiliary_loss_mlp": 0.0102759, "balance_loss_clip": 1.05450273, "balance_loss_mlp": 1.01905429, "epoch": 0.3591655143389647, "flos": 17749352321280.0, "grad_norm": 2.3229658626670737, "language_loss": 0.79492915, "learning_rate": 2.9672186249127046e-06, "loss": 0.8169955, "num_input_tokens_seen": 64202170, "step": 2987, "time_per_iteration": 2.754694700241089 }, { "auxiliary_loss_clip": 0.0118129, "auxiliary_loss_mlp": 0.01028308, "balance_loss_clip": 1.05474532, "balance_loss_mlp": 1.02039886, "epoch": 0.3592857572296038, "flos": 25224409082880.0, "grad_norm": 2.917854532527696, "language_loss": 0.78983474, "learning_rate": 2.9665367301687014e-06, "loss": 0.81193072, "num_input_tokens_seen": 64220415, "step": 2988, "time_per_iteration": 3.619091510772705 }, { "auxiliary_loss_clip": 0.01179492, "auxiliary_loss_mlp": 0.01026459, "balance_loss_clip": 1.05537271, "balance_loss_mlp": 1.01806712, "epoch": 0.3594060001202429, "flos": 29384408764800.0, "grad_norm": 1.7945725533675256, "language_loss": 0.7666409, "learning_rate": 2.965854688801845e-06, "loss": 0.78870046, "num_input_tokens_seen": 64242475, "step": 2989, "time_per_iteration": 3.7238526344299316 }, { "auxiliary_loss_clip": 0.01182813, "auxiliary_loss_mlp": 0.01027593, "balance_loss_clip": 1.05127859, "balance_loss_mlp": 1.01843762, "epoch": 0.359526243010882, "flos": 17052900543360.0, "grad_norm": 2.151551951200747, "language_loss": 0.76774096, "learning_rate": 2.9651725009156005e-06, "loss": 0.78984499, "num_input_tokens_seen": 64260220, "step": 2990, "time_per_iteration": 2.791435956954956 }, { "auxiliary_loss_clip": 0.01175365, "auxiliary_loss_mlp": 0.01028185, "balance_loss_clip": 1.05314076, "balance_loss_mlp": 1.01973927, "epoch": 0.3596464859015211, "flos": 22965089569920.0, "grad_norm": 1.914782307497524, "language_loss": 0.74088359, "learning_rate": 2.964490166613454e-06, "loss": 0.76291913, "num_input_tokens_seen": 64280145, "step": 2991, "time_per_iteration": 2.8214869499206543 }, { "auxiliary_loss_clip": 0.01091998, "auxiliary_loss_mlp": 0.0100181, "balance_loss_clip": 1.02242875, "balance_loss_mlp": 1.00016487, "epoch": 0.35976672879216015, "flos": 54739462590720.0, "grad_norm": 0.7577800920443035, "language_loss": 0.57704479, "learning_rate": 2.963807685998917e-06, "loss": 0.59798288, "num_input_tokens_seen": 64336010, "step": 2992, "time_per_iteration": 3.1487765312194824 }, { "auxiliary_loss_clip": 0.01179311, "auxiliary_loss_mlp": 0.01028456, "balance_loss_clip": 1.0532546, "balance_loss_mlp": 1.02002144, "epoch": 0.35988697168279926, "flos": 43139020901760.0, "grad_norm": 1.5988438186491623, "language_loss": 0.78064293, "learning_rate": 2.9631250591755196e-06, "loss": 0.80272067, "num_input_tokens_seen": 64358725, "step": 2993, "time_per_iteration": 3.0018842220306396 }, { "auxiliary_loss_clip": 0.01180936, "auxiliary_loss_mlp": 0.01032775, "balance_loss_clip": 1.05596352, "balance_loss_mlp": 1.02332199, "epoch": 0.36000721457343837, "flos": 35845600239360.0, "grad_norm": 1.9056992874975893, "language_loss": 0.57529694, "learning_rate": 2.962442286246817e-06, "loss": 0.59743404, "num_input_tokens_seen": 64381555, "step": 2994, "time_per_iteration": 2.9614927768707275 }, { "auxiliary_loss_clip": 0.01184726, "auxiliary_loss_mlp": 0.01030562, "balance_loss_clip": 1.05396581, "balance_loss_mlp": 1.0220449, "epoch": 0.3601274574640774, "flos": 18291100222080.0, "grad_norm": 1.7132923038756591, "language_loss": 0.69576454, "learning_rate": 2.9617593673163853e-06, "loss": 0.71791744, "num_input_tokens_seen": 64400375, "step": 2995, "time_per_iteration": 2.811955213546753 }, { "auxiliary_loss_clip": 0.01187531, "auxiliary_loss_mlp": 0.01025157, "balance_loss_clip": 1.05359936, "balance_loss_mlp": 1.01684797, "epoch": 0.36024770035471654, "flos": 13333955961600.0, "grad_norm": 2.307486666259981, "language_loss": 0.77022052, "learning_rate": 2.9610763024878216e-06, "loss": 0.79234737, "num_input_tokens_seen": 64415880, "step": 2996, "time_per_iteration": 2.763500690460205 }, { "auxiliary_loss_clip": 0.01180134, "auxiliary_loss_mlp": 0.01027273, "balance_loss_clip": 1.05524457, "balance_loss_mlp": 1.01909518, "epoch": 0.3603679432453556, "flos": 20267013427200.0, "grad_norm": 1.7851227813129493, "language_loss": 0.91642654, "learning_rate": 2.960393091864747e-06, "loss": 0.93850064, "num_input_tokens_seen": 64434260, "step": 2997, "time_per_iteration": 2.867985725402832 }, { "auxiliary_loss_clip": 0.01184343, "auxiliary_loss_mlp": 0.01025694, "balance_loss_clip": 1.05498219, "balance_loss_mlp": 1.01715899, "epoch": 0.3604881861359947, "flos": 22451135817600.0, "grad_norm": 4.564343056444048, "language_loss": 0.74679506, "learning_rate": 2.959709735550804e-06, "loss": 0.76889545, "num_input_tokens_seen": 64453855, "step": 2998, "time_per_iteration": 2.7572169303894043 }, { "auxiliary_loss_clip": 0.01181275, "auxiliary_loss_mlp": 0.01029461, "balance_loss_clip": 1.05475843, "balance_loss_mlp": 1.02127743, "epoch": 0.3606084290266338, "flos": 22054251467520.0, "grad_norm": 2.106745198435454, "language_loss": 0.75363022, "learning_rate": 2.9590262336496575e-06, "loss": 0.77573752, "num_input_tokens_seen": 64473585, "step": 2999, "time_per_iteration": 2.8161888122558594 }, { "auxiliary_loss_clip": 0.01179268, "auxiliary_loss_mlp": 0.01026927, "balance_loss_clip": 1.05855942, "balance_loss_mlp": 1.01841593, "epoch": 0.36072867191727287, "flos": 15632921111040.0, "grad_norm": 2.192408564424037, "language_loss": 0.85407382, "learning_rate": 2.9583425862649936e-06, "loss": 0.87613583, "num_input_tokens_seen": 64491720, "step": 3000, "time_per_iteration": 2.767000913619995 }, { "auxiliary_loss_clip": 0.01196657, "auxiliary_loss_mlp": 0.01029127, "balance_loss_clip": 1.05849886, "balance_loss_mlp": 1.02081871, "epoch": 0.360848914807912, "flos": 19677000625920.0, "grad_norm": 2.7238297694034057, "language_loss": 0.74492919, "learning_rate": 2.9576587935005215e-06, "loss": 0.767187, "num_input_tokens_seen": 64509800, "step": 3001, "time_per_iteration": 2.8341825008392334 }, { "auxiliary_loss_clip": 0.01190658, "auxiliary_loss_mlp": 0.01027519, "balance_loss_clip": 1.05528355, "balance_loss_mlp": 1.01907253, "epoch": 0.3609691576985511, "flos": 18877808972160.0, "grad_norm": 3.4906045945627984, "language_loss": 0.7187314, "learning_rate": 2.9569748554599713e-06, "loss": 0.74091321, "num_input_tokens_seen": 64525410, "step": 3002, "time_per_iteration": 2.7248339653015137 }, { "auxiliary_loss_clip": 0.01185228, "auxiliary_loss_mlp": 0.0102693, "balance_loss_clip": 1.05677152, "balance_loss_mlp": 1.01869214, "epoch": 0.36108940058919015, "flos": 42224088648960.0, "grad_norm": 2.656260881704635, "language_loss": 0.73248065, "learning_rate": 2.956290772247097e-06, "loss": 0.75460225, "num_input_tokens_seen": 64544085, "step": 3003, "time_per_iteration": 2.925309896469116 }, { "auxiliary_loss_clip": 0.01173191, "auxiliary_loss_mlp": 0.01029256, "balance_loss_clip": 1.05823696, "balance_loss_mlp": 1.02116191, "epoch": 0.36120964347982926, "flos": 23185150243200.0, "grad_norm": 13.512796539361386, "language_loss": 0.73300368, "learning_rate": 2.9556065439656724e-06, "loss": 0.75502819, "num_input_tokens_seen": 64563135, "step": 3004, "time_per_iteration": 2.8254950046539307 }, { "auxiliary_loss_clip": 0.01170821, "auxiliary_loss_mlp": 0.01022086, "balance_loss_clip": 1.05784702, "balance_loss_mlp": 1.01356864, "epoch": 0.36132988637046837, "flos": 18113055482880.0, "grad_norm": 2.3544000487192203, "language_loss": 0.81667328, "learning_rate": 2.9549221707194952e-06, "loss": 0.83860242, "num_input_tokens_seen": 64581985, "step": 3005, "time_per_iteration": 2.7590715885162354 }, { "auxiliary_loss_clip": 0.01187596, "auxiliary_loss_mlp": 0.01030794, "balance_loss_clip": 1.05438566, "balance_loss_mlp": 1.02221096, "epoch": 0.3614501292611074, "flos": 27813101333760.0, "grad_norm": 2.9328346682514117, "language_loss": 0.73519886, "learning_rate": 2.954237652612384e-06, "loss": 0.75738275, "num_input_tokens_seen": 64601035, "step": 3006, "time_per_iteration": 2.82627010345459 }, { "auxiliary_loss_clip": 0.01180089, "auxiliary_loss_mlp": 0.01029588, "balance_loss_clip": 1.0545696, "balance_loss_mlp": 1.02174354, "epoch": 0.36157037215174653, "flos": 22634926732800.0, "grad_norm": 4.420015063690864, "language_loss": 0.84654069, "learning_rate": 2.9535529897481796e-06, "loss": 0.86863744, "num_input_tokens_seen": 64618580, "step": 3007, "time_per_iteration": 2.749589681625366 }, { "auxiliary_loss_clip": 0.01189648, "auxiliary_loss_mlp": 0.01030242, "balance_loss_clip": 1.05462646, "balance_loss_mlp": 1.02174234, "epoch": 0.36169061504238564, "flos": 12600839376000.0, "grad_norm": 2.3147461075479794, "language_loss": 0.76837051, "learning_rate": 2.9528681822307446e-06, "loss": 0.79056942, "num_input_tokens_seen": 64635430, "step": 3008, "time_per_iteration": 2.7655365467071533 }, { "auxiliary_loss_clip": 0.01182849, "auxiliary_loss_mlp": 0.01058228, "balance_loss_clip": 1.05607963, "balance_loss_mlp": 1.0194571, "epoch": 0.3618108579330247, "flos": 26684644682880.0, "grad_norm": 1.982764364617406, "language_loss": 0.82224882, "learning_rate": 2.952183230163964e-06, "loss": 0.84465963, "num_input_tokens_seen": 64655005, "step": 3009, "time_per_iteration": 3.7352614402770996 }, { "auxiliary_loss_clip": 0.01172401, "auxiliary_loss_mlp": 0.01021389, "balance_loss_clip": 1.0517596, "balance_loss_mlp": 1.01342034, "epoch": 0.3619311008236638, "flos": 22817029708800.0, "grad_norm": 1.9468083028189598, "language_loss": 0.73002565, "learning_rate": 2.9514981336517448e-06, "loss": 0.75196356, "num_input_tokens_seen": 64674775, "step": 3010, "time_per_iteration": 2.7901880741119385 }, { "auxiliary_loss_clip": 0.01185299, "auxiliary_loss_mlp": 0.01029747, "balance_loss_clip": 1.05526483, "balance_loss_mlp": 1.02152133, "epoch": 0.36205134371430286, "flos": 25919603884800.0, "grad_norm": 1.9333374511600367, "language_loss": 0.81322879, "learning_rate": 2.950812892798015e-06, "loss": 0.83537924, "num_input_tokens_seen": 64695670, "step": 3011, "time_per_iteration": 3.7615749835968018 }, { "auxiliary_loss_clip": 0.01171492, "auxiliary_loss_mlp": 0.01056822, "balance_loss_clip": 1.05640161, "balance_loss_mlp": 1.01931977, "epoch": 0.362171586604942, "flos": 26139592730880.0, "grad_norm": 2.043303367356616, "language_loss": 0.86897171, "learning_rate": 2.9501275077067256e-06, "loss": 0.89125484, "num_input_tokens_seen": 64716290, "step": 3012, "time_per_iteration": 2.8467912673950195 }, { "auxiliary_loss_clip": 0.01160972, "auxiliary_loss_mlp": 0.01034543, "balance_loss_clip": 1.0560559, "balance_loss_mlp": 1.02644587, "epoch": 0.3622918294955811, "flos": 28074208273920.0, "grad_norm": 1.4390483126221627, "language_loss": 0.8856765, "learning_rate": 2.949441978481848e-06, "loss": 0.90763164, "num_input_tokens_seen": 64737190, "step": 3013, "time_per_iteration": 2.9178481101989746 }, { "auxiliary_loss_clip": 0.01186192, "auxiliary_loss_mlp": 0.01035227, "balance_loss_clip": 1.05671668, "balance_loss_mlp": 1.02694166, "epoch": 0.36241207238622014, "flos": 19828005402240.0, "grad_norm": 2.5103065566402027, "language_loss": 0.80395526, "learning_rate": 2.9487563052273778e-06, "loss": 0.82616937, "num_input_tokens_seen": 64753950, "step": 3014, "time_per_iteration": 3.711183786392212 }, { "auxiliary_loss_clip": 0.01178012, "auxiliary_loss_mlp": 0.01027854, "balance_loss_clip": 1.05338073, "balance_loss_mlp": 1.01998007, "epoch": 0.36253231527685925, "flos": 21397158017280.0, "grad_norm": 2.0597467877840936, "language_loss": 0.85558319, "learning_rate": 2.94807048804733e-06, "loss": 0.8776418, "num_input_tokens_seen": 64773570, "step": 3015, "time_per_iteration": 3.739612340927124 }, { "auxiliary_loss_clip": 0.01185935, "auxiliary_loss_mlp": 0.01031772, "balance_loss_clip": 1.05587196, "balance_loss_mlp": 1.02374315, "epoch": 0.36265255816749836, "flos": 18362885552640.0, "grad_norm": 4.819093809705116, "language_loss": 0.90595865, "learning_rate": 2.9473845270457434e-06, "loss": 0.92813575, "num_input_tokens_seen": 64790385, "step": 3016, "time_per_iteration": 2.7775473594665527 }, { "auxiliary_loss_clip": 0.0117699, "auxiliary_loss_mlp": 0.0102541, "balance_loss_clip": 1.05478406, "balance_loss_mlp": 1.01777172, "epoch": 0.3627728010581374, "flos": 18660046769280.0, "grad_norm": 9.885581376662417, "language_loss": 0.69982916, "learning_rate": 2.946698422326677e-06, "loss": 0.72185314, "num_input_tokens_seen": 64807845, "step": 3017, "time_per_iteration": 2.821096181869507 }, { "auxiliary_loss_clip": 0.01176187, "auxiliary_loss_mlp": 0.01028467, "balance_loss_clip": 1.05452466, "balance_loss_mlp": 1.02004504, "epoch": 0.36289304394877653, "flos": 27524272072320.0, "grad_norm": 2.967939549201528, "language_loss": 0.79787302, "learning_rate": 2.946012173994213e-06, "loss": 0.81991959, "num_input_tokens_seen": 64827630, "step": 3018, "time_per_iteration": 2.8422939777374268 }, { "auxiliary_loss_clip": 0.01179612, "auxiliary_loss_mlp": 0.01026891, "balance_loss_clip": 1.05460334, "balance_loss_mlp": 1.01902318, "epoch": 0.36301328683941564, "flos": 34533244932480.0, "grad_norm": 1.8985945205483148, "language_loss": 0.67787325, "learning_rate": 2.945325782152454e-06, "loss": 0.69993836, "num_input_tokens_seen": 64850665, "step": 3019, "time_per_iteration": 2.8790595531463623 }, { "auxiliary_loss_clip": 0.01184318, "auxiliary_loss_mlp": 0.01029615, "balance_loss_clip": 1.05458486, "balance_loss_mlp": 1.02184236, "epoch": 0.3631335297300547, "flos": 19025976574080.0, "grad_norm": 2.82919679730411, "language_loss": 0.78744757, "learning_rate": 2.9446392469055257e-06, "loss": 0.80958694, "num_input_tokens_seen": 64868700, "step": 3020, "time_per_iteration": 2.767446279525757 }, { "auxiliary_loss_clip": 0.01174296, "auxiliary_loss_mlp": 0.0103096, "balance_loss_clip": 1.05732632, "balance_loss_mlp": 1.02333117, "epoch": 0.3632537726206938, "flos": 19536769929600.0, "grad_norm": 1.9846346105281003, "language_loss": 0.79776359, "learning_rate": 2.9439525683575745e-06, "loss": 0.81981617, "num_input_tokens_seen": 64887620, "step": 3021, "time_per_iteration": 2.872082471847534 }, { "auxiliary_loss_clip": 0.0119242, "auxiliary_loss_mlp": 0.01034436, "balance_loss_clip": 1.05650675, "balance_loss_mlp": 1.02568638, "epoch": 0.3633740155113329, "flos": 21068611292160.0, "grad_norm": 2.038214183189607, "language_loss": 0.74737978, "learning_rate": 2.9432657466127694e-06, "loss": 0.76964831, "num_input_tokens_seen": 64907190, "step": 3022, "time_per_iteration": 2.8751702308654785 }, { "auxiliary_loss_clip": 0.01176368, "auxiliary_loss_mlp": 0.0103296, "balance_loss_clip": 1.05905557, "balance_loss_mlp": 1.0252229, "epoch": 0.36349425840197197, "flos": 20298722158080.0, "grad_norm": 1.7460652799004683, "language_loss": 0.76634431, "learning_rate": 2.9425787817753007e-06, "loss": 0.7884376, "num_input_tokens_seen": 64925850, "step": 3023, "time_per_iteration": 2.927739381790161 }, { "auxiliary_loss_clip": 0.01177428, "auxiliary_loss_mlp": 0.01027003, "balance_loss_clip": 1.05298471, "balance_loss_mlp": 1.01918292, "epoch": 0.3636145012926111, "flos": 29716762331520.0, "grad_norm": 1.638055903583727, "language_loss": 0.716699, "learning_rate": 2.94189167394938e-06, "loss": 0.73874331, "num_input_tokens_seen": 64948285, "step": 3024, "time_per_iteration": 2.862611770629883 }, { "auxiliary_loss_clip": 0.01190515, "auxiliary_loss_mlp": 0.01033726, "balance_loss_clip": 1.05737925, "balance_loss_mlp": 1.02564323, "epoch": 0.3637347441832502, "flos": 21431847576960.0, "grad_norm": 1.8200537720204997, "language_loss": 0.80966032, "learning_rate": 2.941204423239241e-06, "loss": 0.83190268, "num_input_tokens_seen": 64967160, "step": 3025, "time_per_iteration": 2.694814443588257 }, { "auxiliary_loss_clip": 0.01184444, "auxiliary_loss_mlp": 0.01025946, "balance_loss_clip": 1.05669463, "balance_loss_mlp": 1.01788187, "epoch": 0.36385498707388925, "flos": 29533941083520.0, "grad_norm": 2.0730198860008318, "language_loss": 0.76245016, "learning_rate": 2.9405170297491395e-06, "loss": 0.78455412, "num_input_tokens_seen": 64987155, "step": 3026, "time_per_iteration": 2.854109048843384 }, { "auxiliary_loss_clip": 0.01163519, "auxiliary_loss_mlp": 0.0105738, "balance_loss_clip": 1.05623245, "balance_loss_mlp": 1.02023149, "epoch": 0.36397522996452836, "flos": 22236569925120.0, "grad_norm": 5.869891302150219, "language_loss": 0.80538034, "learning_rate": 2.939829493583353e-06, "loss": 0.82758939, "num_input_tokens_seen": 65003800, "step": 3027, "time_per_iteration": 2.8128790855407715 }, { "auxiliary_loss_clip": 0.01173115, "auxiliary_loss_mlp": 0.01025621, "balance_loss_clip": 1.05311584, "balance_loss_mlp": 1.01699674, "epoch": 0.3640954728551674, "flos": 21506505995520.0, "grad_norm": 2.5734768104808365, "language_loss": 0.82893473, "learning_rate": 2.939141814846179e-06, "loss": 0.85092211, "num_input_tokens_seen": 65021215, "step": 3028, "time_per_iteration": 2.799014091491699 }, { "auxiliary_loss_clip": 0.01180305, "auxiliary_loss_mlp": 0.01023906, "balance_loss_clip": 1.05281484, "balance_loss_mlp": 1.01539493, "epoch": 0.3642157157458065, "flos": 17712867081600.0, "grad_norm": 1.9081989101145835, "language_loss": 0.82396662, "learning_rate": 2.938453993641938e-06, "loss": 0.84600878, "num_input_tokens_seen": 65039590, "step": 3029, "time_per_iteration": 2.707292079925537 }, { "auxiliary_loss_clip": 0.01180354, "auxiliary_loss_mlp": 0.01029502, "balance_loss_clip": 1.05738616, "balance_loss_mlp": 1.02158618, "epoch": 0.36433595863644563, "flos": 17639537466240.0, "grad_norm": 2.1053938000209524, "language_loss": 0.70221114, "learning_rate": 2.937766030074973e-06, "loss": 0.72430968, "num_input_tokens_seen": 65056845, "step": 3030, "time_per_iteration": 2.855219602584839 }, { "auxiliary_loss_clip": 0.01180093, "auxiliary_loss_mlp": 0.01030665, "balance_loss_clip": 1.05348241, "balance_loss_mlp": 1.02224851, "epoch": 0.3644562015270847, "flos": 26833279161600.0, "grad_norm": 1.8564691621284848, "language_loss": 0.82573754, "learning_rate": 2.937077924249646e-06, "loss": 0.84784508, "num_input_tokens_seen": 65079435, "step": 3031, "time_per_iteration": 2.8682968616485596 }, { "auxiliary_loss_clip": 0.01186886, "auxiliary_loss_mlp": 0.01029965, "balance_loss_clip": 1.05389023, "balance_loss_mlp": 1.02194273, "epoch": 0.3645764444177238, "flos": 14282715847680.0, "grad_norm": 1.9908266717873238, "language_loss": 0.75289583, "learning_rate": 2.9363896762703443e-06, "loss": 0.77506435, "num_input_tokens_seen": 65096500, "step": 3032, "time_per_iteration": 2.8369150161743164 }, { "auxiliary_loss_clip": 0.01188048, "auxiliary_loss_mlp": 0.0102306, "balance_loss_clip": 1.0542711, "balance_loss_mlp": 1.01516879, "epoch": 0.3646966873083629, "flos": 20667489137280.0, "grad_norm": 1.6927161503454198, "language_loss": 0.84392905, "learning_rate": 2.9357012862414725e-06, "loss": 0.86604023, "num_input_tokens_seen": 65115860, "step": 3033, "time_per_iteration": 2.6888339519500732 }, { "auxiliary_loss_clip": 0.01183994, "auxiliary_loss_mlp": 0.01030825, "balance_loss_clip": 1.05451155, "balance_loss_mlp": 1.02267694, "epoch": 0.36481693019900197, "flos": 27782613665280.0, "grad_norm": 2.0929830410205525, "language_loss": 0.71619993, "learning_rate": 2.9350127542674593e-06, "loss": 0.73834807, "num_input_tokens_seen": 65138070, "step": 3034, "time_per_iteration": 2.9196298122406006 }, { "auxiliary_loss_clip": 0.011867, "auxiliary_loss_mlp": 0.0102822, "balance_loss_clip": 1.05582952, "balance_loss_mlp": 1.01976204, "epoch": 0.3649371730896411, "flos": 19712588025600.0, "grad_norm": 2.2324549513910252, "language_loss": 0.76379979, "learning_rate": 2.934324080452755e-06, "loss": 0.78594905, "num_input_tokens_seen": 65155860, "step": 3035, "time_per_iteration": 3.812631130218506 }, { "auxiliary_loss_clip": 0.01173514, "auxiliary_loss_mlp": 0.01056407, "balance_loss_clip": 1.0559082, "balance_loss_mlp": 1.01832771, "epoch": 0.3650574159802802, "flos": 24750496016640.0, "grad_norm": 1.8332912739107796, "language_loss": 0.78066128, "learning_rate": 2.9336352649018307e-06, "loss": 0.80296052, "num_input_tokens_seen": 65175930, "step": 3036, "time_per_iteration": 2.879991292953491 }, { "auxiliary_loss_clip": 0.011854, "auxiliary_loss_mlp": 0.01031988, "balance_loss_clip": 1.05764902, "balance_loss_mlp": 1.02447546, "epoch": 0.36517765887091924, "flos": 32853487363200.0, "grad_norm": 2.3007222923315958, "language_loss": 0.70207155, "learning_rate": 2.9329463077191783e-06, "loss": 0.72424543, "num_input_tokens_seen": 65199305, "step": 3037, "time_per_iteration": 3.848787546157837 }, { "auxiliary_loss_clip": 0.01176313, "auxiliary_loss_mlp": 0.01032557, "balance_loss_clip": 1.05574942, "balance_loss_mlp": 1.02457285, "epoch": 0.36529790176155835, "flos": 20120318282880.0, "grad_norm": 2.098332679078844, "language_loss": 0.64481342, "learning_rate": 2.9322572090093135e-06, "loss": 0.66690207, "num_input_tokens_seen": 65218010, "step": 3038, "time_per_iteration": 2.800175189971924 }, { "auxiliary_loss_clip": 0.01175475, "auxiliary_loss_mlp": 0.01028075, "balance_loss_clip": 1.05414152, "balance_loss_mlp": 1.01987302, "epoch": 0.36541814465219746, "flos": 17639573379840.0, "grad_norm": 3.679341884662678, "language_loss": 0.76912898, "learning_rate": 2.9315679688767713e-06, "loss": 0.79116452, "num_input_tokens_seen": 65236020, "step": 3039, "time_per_iteration": 2.7873165607452393 }, { "auxiliary_loss_clip": 0.01180226, "auxiliary_loss_mlp": 0.01030631, "balance_loss_clip": 1.05596662, "balance_loss_mlp": 1.02253389, "epoch": 0.3655383875428365, "flos": 22674356887680.0, "grad_norm": 1.6248734574654602, "language_loss": 0.66619843, "learning_rate": 2.9308785874261085e-06, "loss": 0.68830699, "num_input_tokens_seen": 65256210, "step": 3040, "time_per_iteration": 3.7142608165740967 }, { "auxiliary_loss_clip": 0.01191426, "auxiliary_loss_mlp": 0.01030473, "balance_loss_clip": 1.05708802, "balance_loss_mlp": 1.02263534, "epoch": 0.36565863043347563, "flos": 21981173247360.0, "grad_norm": 2.365035257360044, "language_loss": 0.818519, "learning_rate": 2.9301890647619045e-06, "loss": 0.84073806, "num_input_tokens_seen": 65275505, "step": 3041, "time_per_iteration": 3.681276321411133 }, { "auxiliary_loss_clip": 0.01190235, "auxiliary_loss_mlp": 0.01027062, "balance_loss_clip": 1.05845308, "balance_loss_mlp": 1.01853895, "epoch": 0.36577887332411474, "flos": 24827632473600.0, "grad_norm": 5.061525385642923, "language_loss": 0.80207962, "learning_rate": 2.929499400988759e-06, "loss": 0.82425261, "num_input_tokens_seen": 65296665, "step": 3042, "time_per_iteration": 2.699171543121338 }, { "auxiliary_loss_clip": 0.01187408, "auxiliary_loss_mlp": 0.01025415, "balance_loss_clip": 1.05755877, "balance_loss_mlp": 1.01661742, "epoch": 0.3658991162147538, "flos": 28293191539200.0, "grad_norm": 2.7573066666988675, "language_loss": 0.65127987, "learning_rate": 2.9288095962112927e-06, "loss": 0.67340809, "num_input_tokens_seen": 65317370, "step": 3043, "time_per_iteration": 2.721714735031128 }, { "auxiliary_loss_clip": 0.0118804, "auxiliary_loss_mlp": 0.01031404, "balance_loss_clip": 1.05610216, "balance_loss_mlp": 1.02266026, "epoch": 0.3660193591053929, "flos": 17785550252160.0, "grad_norm": 4.571387646817725, "language_loss": 0.8537699, "learning_rate": 2.9281196505341503e-06, "loss": 0.8759644, "num_input_tokens_seen": 65334540, "step": 3044, "time_per_iteration": 2.6328842639923096 }, { "auxiliary_loss_clip": 0.01170717, "auxiliary_loss_mlp": 0.01051579, "balance_loss_clip": 1.05447483, "balance_loss_mlp": 1.01576364, "epoch": 0.36613960199603196, "flos": 10342776839040.0, "grad_norm": 2.0721947823140416, "language_loss": 0.78275877, "learning_rate": 2.9274295640619946e-06, "loss": 0.80498177, "num_input_tokens_seen": 65351670, "step": 3045, "time_per_iteration": 2.809788942337036 }, { "auxiliary_loss_clip": 0.01179977, "auxiliary_loss_mlp": 0.01024888, "balance_loss_clip": 1.05396318, "balance_loss_mlp": 1.01713371, "epoch": 0.36625984488667107, "flos": 19755609540480.0, "grad_norm": 1.7568435871755672, "language_loss": 0.78484243, "learning_rate": 2.9267393368995103e-06, "loss": 0.80689108, "num_input_tokens_seen": 65370900, "step": 3046, "time_per_iteration": 2.777278184890747 }, { "auxiliary_loss_clip": 0.01191348, "auxiliary_loss_mlp": 0.01026739, "balance_loss_clip": 1.05693281, "balance_loss_mlp": 1.0188241, "epoch": 0.3663800877773102, "flos": 17674262939520.0, "grad_norm": 10.455617841664722, "language_loss": 0.74346691, "learning_rate": 2.926048969151407e-06, "loss": 0.76564777, "num_input_tokens_seen": 65388185, "step": 3047, "time_per_iteration": 2.688753604888916 }, { "auxiliary_loss_clip": 0.01169705, "auxiliary_loss_mlp": 0.01030831, "balance_loss_clip": 1.05576062, "balance_loss_mlp": 1.02269816, "epoch": 0.36650033066794924, "flos": 20303606407680.0, "grad_norm": 3.0211909370885826, "language_loss": 0.68021339, "learning_rate": 2.92535846092241e-06, "loss": 0.70221877, "num_input_tokens_seen": 65407200, "step": 3048, "time_per_iteration": 2.841978073120117 }, { "auxiliary_loss_clip": 0.01185744, "auxiliary_loss_mlp": 0.01029107, "balance_loss_clip": 1.05736351, "balance_loss_mlp": 1.02110195, "epoch": 0.36662057355858835, "flos": 24716237420160.0, "grad_norm": 1.6039788509797965, "language_loss": 0.82382774, "learning_rate": 2.9246678123172704e-06, "loss": 0.84597623, "num_input_tokens_seen": 65427290, "step": 3049, "time_per_iteration": 2.7319247722625732 }, { "auxiliary_loss_clip": 0.01193777, "auxiliary_loss_mlp": 0.01030118, "balance_loss_clip": 1.05776668, "balance_loss_mlp": 1.02120137, "epoch": 0.36674081644922746, "flos": 12385267902720.0, "grad_norm": 2.698159629739359, "language_loss": 0.73954165, "learning_rate": 2.9239770234407596e-06, "loss": 0.76178062, "num_input_tokens_seen": 65445595, "step": 3050, "time_per_iteration": 2.775144338607788 }, { "auxiliary_loss_clip": 0.01185936, "auxiliary_loss_mlp": 0.01022649, "balance_loss_clip": 1.05396104, "balance_loss_mlp": 1.01392865, "epoch": 0.3668610593398665, "flos": 21105922544640.0, "grad_norm": 3.62502973909265, "language_loss": 0.68116313, "learning_rate": 2.9232860943976686e-06, "loss": 0.70324898, "num_input_tokens_seen": 65466330, "step": 3051, "time_per_iteration": 2.7399935722351074 }, { "auxiliary_loss_clip": 0.01182304, "auxiliary_loss_mlp": 0.01028826, "balance_loss_clip": 1.05537987, "balance_loss_mlp": 1.02110445, "epoch": 0.3669813022305056, "flos": 26758082039040.0, "grad_norm": 1.7946406851725871, "language_loss": 0.84229922, "learning_rate": 2.9225950252928115e-06, "loss": 0.86441052, "num_input_tokens_seen": 65487180, "step": 3052, "time_per_iteration": 2.858372688293457 }, { "auxiliary_loss_clip": 0.01186464, "auxiliary_loss_mlp": 0.01027048, "balance_loss_clip": 1.05753803, "balance_loss_mlp": 1.01888835, "epoch": 0.36710154512114473, "flos": 19099521671040.0, "grad_norm": 7.070951522609389, "language_loss": 0.81969166, "learning_rate": 2.9219038162310217e-06, "loss": 0.84182674, "num_input_tokens_seen": 65505380, "step": 3053, "time_per_iteration": 2.741300582885742 }, { "auxiliary_loss_clip": 0.0117207, "auxiliary_loss_mlp": 0.01060143, "balance_loss_clip": 1.05566716, "balance_loss_mlp": 1.02213144, "epoch": 0.3672217880117838, "flos": 20812029465600.0, "grad_norm": 2.6544387066637056, "language_loss": 0.8291167, "learning_rate": 2.921212467317157e-06, "loss": 0.85143876, "num_input_tokens_seen": 65524825, "step": 3054, "time_per_iteration": 2.8041298389434814 }, { "auxiliary_loss_clip": 0.01173509, "auxiliary_loss_mlp": 0.01028576, "balance_loss_clip": 1.05585384, "balance_loss_mlp": 1.01976657, "epoch": 0.3673420309024229, "flos": 13590394133760.0, "grad_norm": 3.0330806535963206, "language_loss": 0.80022663, "learning_rate": 2.920520978656093e-06, "loss": 0.82224751, "num_input_tokens_seen": 65541790, "step": 3055, "time_per_iteration": 2.7292816638946533 }, { "auxiliary_loss_clip": 0.01186659, "auxiliary_loss_mlp": 0.01057703, "balance_loss_clip": 1.05453515, "balance_loss_mlp": 1.02037978, "epoch": 0.367462273793062, "flos": 28986877969920.0, "grad_norm": 2.2981110475375113, "language_loss": 0.76871073, "learning_rate": 2.919829350352729e-06, "loss": 0.79115433, "num_input_tokens_seen": 65563395, "step": 3056, "time_per_iteration": 2.85233736038208 }, { "auxiliary_loss_clip": 0.01091646, "auxiliary_loss_mlp": 0.01005284, "balance_loss_clip": 1.02156663, "balance_loss_mlp": 1.00366271, "epoch": 0.36758251668370107, "flos": 62643148346880.0, "grad_norm": 0.7883075774870829, "language_loss": 0.59986031, "learning_rate": 2.919137582511983e-06, "loss": 0.6208297, "num_input_tokens_seen": 65619835, "step": 3057, "time_per_iteration": 3.191061496734619 }, { "auxiliary_loss_clip": 0.01182343, "auxiliary_loss_mlp": 0.01028119, "balance_loss_clip": 1.05624187, "balance_loss_mlp": 1.01915443, "epoch": 0.3677027595743402, "flos": 12713886455040.0, "grad_norm": 2.953831405942611, "language_loss": 0.63949126, "learning_rate": 2.918445675238797e-06, "loss": 0.66159588, "num_input_tokens_seen": 65636760, "step": 3058, "time_per_iteration": 2.7084872722625732 }, { "auxiliary_loss_clip": 0.01188915, "auxiliary_loss_mlp": 0.01030984, "balance_loss_clip": 1.05422747, "balance_loss_mlp": 1.02297282, "epoch": 0.36782300246497923, "flos": 25046579825280.0, "grad_norm": 1.8486772838051329, "language_loss": 0.69586295, "learning_rate": 2.917753628638132e-06, "loss": 0.71806198, "num_input_tokens_seen": 65657065, "step": 3059, "time_per_iteration": 2.730377435684204 }, { "auxiliary_loss_clip": 0.0118269, "auxiliary_loss_mlp": 0.01026252, "balance_loss_clip": 1.0561707, "balance_loss_mlp": 1.01788402, "epoch": 0.36794324535561834, "flos": 17419512706560.0, "grad_norm": 3.4495056701641222, "language_loss": 0.7030462, "learning_rate": 2.9170614428149716e-06, "loss": 0.72513556, "num_input_tokens_seen": 65675400, "step": 3060, "time_per_iteration": 2.6779820919036865 }, { "auxiliary_loss_clip": 0.01181436, "auxiliary_loss_mlp": 0.01027459, "balance_loss_clip": 1.05948877, "balance_loss_mlp": 1.01870322, "epoch": 0.36806348824625745, "flos": 24089128848000.0, "grad_norm": 2.2548173386557457, "language_loss": 0.86595893, "learning_rate": 2.9163691178743195e-06, "loss": 0.88804787, "num_input_tokens_seen": 65694050, "step": 3061, "time_per_iteration": 2.8060152530670166 }, { "auxiliary_loss_clip": 0.01183297, "auxiliary_loss_mlp": 0.01031425, "balance_loss_clip": 1.05527568, "balance_loss_mlp": 1.02317584, "epoch": 0.3681837311368965, "flos": 20521871400960.0, "grad_norm": 2.0177811032944155, "language_loss": 0.77315223, "learning_rate": 2.9156766539212006e-06, "loss": 0.79529941, "num_input_tokens_seen": 65711695, "step": 3062, "time_per_iteration": 3.6870689392089844 }, { "auxiliary_loss_clip": 0.01189669, "auxiliary_loss_mlp": 0.01025854, "balance_loss_clip": 1.05424404, "balance_loss_mlp": 1.01797724, "epoch": 0.3683039740275356, "flos": 21466644877440.0, "grad_norm": 2.597857400896473, "language_loss": 0.72085249, "learning_rate": 2.9149840510606614e-06, "loss": 0.74300772, "num_input_tokens_seen": 65730350, "step": 3063, "time_per_iteration": 3.7595436573028564 }, { "auxiliary_loss_clip": 0.01085166, "auxiliary_loss_mlp": 0.01039547, "balance_loss_clip": 1.01867437, "balance_loss_mlp": 0.99827427, "epoch": 0.36842421691817473, "flos": 70380999987840.0, "grad_norm": 1.0339217611084577, "language_loss": 0.64164937, "learning_rate": 2.914291309397769e-06, "loss": 0.66289651, "num_input_tokens_seen": 65787820, "step": 3064, "time_per_iteration": 3.395108699798584 }, { "auxiliary_loss_clip": 0.01162833, "auxiliary_loss_mlp": 0.01024471, "balance_loss_clip": 1.05424583, "balance_loss_mlp": 1.01623344, "epoch": 0.3685444598088138, "flos": 23331378510720.0, "grad_norm": 2.5748432890920343, "language_loss": 0.78605276, "learning_rate": 2.9135984290376117e-06, "loss": 0.80792582, "num_input_tokens_seen": 65806685, "step": 3065, "time_per_iteration": 2.953824281692505 }, { "auxiliary_loss_clip": 0.01171304, "auxiliary_loss_mlp": 0.01026832, "balance_loss_clip": 1.05550122, "balance_loss_mlp": 1.01863003, "epoch": 0.3686647026994529, "flos": 23070271570560.0, "grad_norm": 2.059128422802248, "language_loss": 0.82507712, "learning_rate": 2.9129054100853e-06, "loss": 0.84705848, "num_input_tokens_seen": 65825525, "step": 3066, "time_per_iteration": 3.8477015495300293 }, { "auxiliary_loss_clip": 0.01181844, "auxiliary_loss_mlp": 0.01032234, "balance_loss_clip": 1.05330586, "balance_loss_mlp": 1.02282858, "epoch": 0.368784945590092, "flos": 25119909440640.0, "grad_norm": 1.8512910810395615, "language_loss": 0.75992799, "learning_rate": 2.912212252645963e-06, "loss": 0.78206873, "num_input_tokens_seen": 65848110, "step": 3067, "time_per_iteration": 3.8419277667999268 }, { "auxiliary_loss_clip": 0.01193309, "auxiliary_loss_mlp": 0.01032828, "balance_loss_clip": 1.05615199, "balance_loss_mlp": 1.02369642, "epoch": 0.36890518848073106, "flos": 18442284566400.0, "grad_norm": 2.48672945699691, "language_loss": 0.76054668, "learning_rate": 2.9115189568247523e-06, "loss": 0.78280807, "num_input_tokens_seen": 65865670, "step": 3068, "time_per_iteration": 2.8817832469940186 }, { "auxiliary_loss_clip": 0.01165575, "auxiliary_loss_mlp": 0.01025028, "balance_loss_clip": 1.05638492, "balance_loss_mlp": 1.01652229, "epoch": 0.36902543137137017, "flos": 16362446336640.0, "grad_norm": 2.2443363794103903, "language_loss": 0.91907185, "learning_rate": 2.910825522726841e-06, "loss": 0.94097787, "num_input_tokens_seen": 65883195, "step": 3069, "time_per_iteration": 2.7475805282592773 }, { "auxiliary_loss_clip": 0.01170743, "auxiliary_loss_mlp": 0.0103519, "balance_loss_clip": 1.05340374, "balance_loss_mlp": 1.02661288, "epoch": 0.3691456742620093, "flos": 12275596702080.0, "grad_norm": 1.9545545054941562, "language_loss": 0.7736541, "learning_rate": 2.9101319504574215e-06, "loss": 0.79571342, "num_input_tokens_seen": 65899635, "step": 3070, "time_per_iteration": 2.752270221710205 }, { "auxiliary_loss_clip": 0.0117781, "auxiliary_loss_mlp": 0.01030506, "balance_loss_clip": 1.05236304, "balance_loss_mlp": 1.02154768, "epoch": 0.36926591715264834, "flos": 17786412178560.0, "grad_norm": 1.839343343346735, "language_loss": 0.76193547, "learning_rate": 2.909438240121709e-06, "loss": 0.78401864, "num_input_tokens_seen": 65919910, "step": 3071, "time_per_iteration": 2.731572389602661 }, { "auxiliary_loss_clip": 0.01177647, "auxiliary_loss_mlp": 0.01026159, "balance_loss_clip": 1.05494714, "balance_loss_mlp": 1.01859558, "epoch": 0.36938616004328745, "flos": 28948309741440.0, "grad_norm": 1.8712240752201694, "language_loss": 0.70154822, "learning_rate": 2.908744391824939e-06, "loss": 0.72358632, "num_input_tokens_seen": 65940930, "step": 3072, "time_per_iteration": 2.7678143978118896 }, { "auxiliary_loss_clip": 0.01171792, "auxiliary_loss_mlp": 0.01027952, "balance_loss_clip": 1.05163765, "balance_loss_mlp": 1.0192616, "epoch": 0.36950640293392656, "flos": 29205394358400.0, "grad_norm": 2.160054935350172, "language_loss": 0.78866184, "learning_rate": 2.908050405672367e-06, "loss": 0.81065929, "num_input_tokens_seen": 65960475, "step": 3073, "time_per_iteration": 2.8586268424987793 }, { "auxiliary_loss_clip": 0.01189889, "auxiliary_loss_mlp": 0.01028575, "balance_loss_clip": 1.05353546, "balance_loss_mlp": 1.02025414, "epoch": 0.3696266458245656, "flos": 24827776128000.0, "grad_norm": 1.87263990114704, "language_loss": 0.7953999, "learning_rate": 2.9073562817692703e-06, "loss": 0.81758451, "num_input_tokens_seen": 65979160, "step": 3074, "time_per_iteration": 2.7050154209136963 }, { "auxiliary_loss_clip": 0.01089916, "auxiliary_loss_mlp": 0.01012855, "balance_loss_clip": 1.01906776, "balance_loss_mlp": 1.01115072, "epoch": 0.3697468887152047, "flos": 59887257264000.0, "grad_norm": 0.7184715967048465, "language_loss": 0.56511426, "learning_rate": 2.9066620202209468e-06, "loss": 0.586142, "num_input_tokens_seen": 66041650, "step": 3075, "time_per_iteration": 3.2704854011535645 }, { "auxiliary_loss_clip": 0.01172556, "auxiliary_loss_mlp": 0.01026788, "balance_loss_clip": 1.05521631, "balance_loss_mlp": 1.01852632, "epoch": 0.3698671316058438, "flos": 26137581569280.0, "grad_norm": 2.0132671301416414, "language_loss": 0.77690369, "learning_rate": 2.905967621132716e-06, "loss": 0.79889715, "num_input_tokens_seen": 66059260, "step": 3076, "time_per_iteration": 2.749112606048584 }, { "auxiliary_loss_clip": 0.01186441, "auxiliary_loss_mlp": 0.01025006, "balance_loss_clip": 1.05487275, "balance_loss_mlp": 1.01623774, "epoch": 0.3699873744964829, "flos": 24607464059520.0, "grad_norm": 2.069349444498004, "language_loss": 0.75524539, "learning_rate": 2.9052730846099172e-06, "loss": 0.77735984, "num_input_tokens_seen": 66080605, "step": 3077, "time_per_iteration": 2.8739891052246094 }, { "auxiliary_loss_clip": 0.01084322, "auxiliary_loss_mlp": 0.01006932, "balance_loss_clip": 1.01746845, "balance_loss_mlp": 1.00512052, "epoch": 0.370107617387122, "flos": 64885340050560.0, "grad_norm": 0.8490335482630261, "language_loss": 0.60889769, "learning_rate": 2.9045784107579123e-06, "loss": 0.62981021, "num_input_tokens_seen": 66140710, "step": 3078, "time_per_iteration": 3.330082416534424 }, { "auxiliary_loss_clip": 0.01187359, "auxiliary_loss_mlp": 0.01030557, "balance_loss_clip": 1.05373371, "balance_loss_mlp": 1.02227831, "epoch": 0.37022786027776106, "flos": 15961683317760.0, "grad_norm": 1.7819992286698654, "language_loss": 0.66898912, "learning_rate": 2.9038835996820807e-06, "loss": 0.69116831, "num_input_tokens_seen": 66158320, "step": 3079, "time_per_iteration": 2.6972382068634033 }, { "auxiliary_loss_clip": 0.01186533, "auxiliary_loss_mlp": 0.01027032, "balance_loss_clip": 1.05583119, "balance_loss_mlp": 1.01914048, "epoch": 0.37034810316840017, "flos": 18546927863040.0, "grad_norm": 1.952612599721463, "language_loss": 0.79695249, "learning_rate": 2.903188651487826e-06, "loss": 0.81908816, "num_input_tokens_seen": 66176875, "step": 3080, "time_per_iteration": 2.781810760498047 }, { "auxiliary_loss_clip": 0.01187742, "auxiliary_loss_mlp": 0.01032297, "balance_loss_clip": 1.05528462, "balance_loss_mlp": 1.02407146, "epoch": 0.3704683460590393, "flos": 17821927751040.0, "grad_norm": 2.302648520731816, "language_loss": 0.86502343, "learning_rate": 2.902493566280571e-06, "loss": 0.88722384, "num_input_tokens_seen": 66194980, "step": 3081, "time_per_iteration": 2.6579809188842773 }, { "auxiliary_loss_clip": 0.01180825, "auxiliary_loss_mlp": 0.01034567, "balance_loss_clip": 1.05377865, "balance_loss_mlp": 1.025841, "epoch": 0.37058858894967833, "flos": 14134081368960.0, "grad_norm": 2.155857785415119, "language_loss": 0.81219947, "learning_rate": 2.9017983441657595e-06, "loss": 0.83435333, "num_input_tokens_seen": 66212310, "step": 3082, "time_per_iteration": 2.788877010345459 }, { "auxiliary_loss_clip": 0.01178274, "auxiliary_loss_mlp": 0.01033126, "balance_loss_clip": 1.05309772, "balance_loss_mlp": 1.02401233, "epoch": 0.37070883184031744, "flos": 13954492344960.0, "grad_norm": 5.787919220997837, "language_loss": 0.75360441, "learning_rate": 2.9011029852488564e-06, "loss": 0.77571845, "num_input_tokens_seen": 66229545, "step": 3083, "time_per_iteration": 2.9004273414611816 }, { "auxiliary_loss_clip": 0.01086525, "auxiliary_loss_mlp": 0.01003101, "balance_loss_clip": 1.01748657, "balance_loss_mlp": 1.00137258, "epoch": 0.37082907473095655, "flos": 52315419306240.0, "grad_norm": 1.059124119873227, "language_loss": 0.62483227, "learning_rate": 2.9004074896353465e-06, "loss": 0.64572859, "num_input_tokens_seen": 66283545, "step": 3084, "time_per_iteration": 3.151355504989624 }, { "auxiliary_loss_clip": 0.01187337, "auxiliary_loss_mlp": 0.0102741, "balance_loss_clip": 1.05667806, "balance_loss_mlp": 1.01961374, "epoch": 0.3709493176215956, "flos": 15998096730240.0, "grad_norm": 1.8256511916517375, "language_loss": 0.81434917, "learning_rate": 2.8997118574307362e-06, "loss": 0.83649665, "num_input_tokens_seen": 66300500, "step": 3085, "time_per_iteration": 2.6607470512390137 }, { "auxiliary_loss_clip": 0.01186963, "auxiliary_loss_mlp": 0.01031904, "balance_loss_clip": 1.05677867, "balance_loss_mlp": 1.02299941, "epoch": 0.3710695605122347, "flos": 20959837931520.0, "grad_norm": 1.8477920502531728, "language_loss": 0.73856711, "learning_rate": 2.899016088740553e-06, "loss": 0.76075578, "num_input_tokens_seen": 66318610, "step": 3086, "time_per_iteration": 2.687448501586914 }, { "auxiliary_loss_clip": 0.0117202, "auxiliary_loss_mlp": 0.01028908, "balance_loss_clip": 1.05270767, "balance_loss_mlp": 1.02056909, "epoch": 0.37118980340287383, "flos": 14355578586240.0, "grad_norm": 1.9347867957026048, "language_loss": 0.78909528, "learning_rate": 2.898320183670344e-06, "loss": 0.81110454, "num_input_tokens_seen": 66336025, "step": 3087, "time_per_iteration": 3.6947615146636963 }, { "auxiliary_loss_clip": 0.0117347, "auxiliary_loss_mlp": 0.01033582, "balance_loss_clip": 1.05497861, "balance_loss_mlp": 1.02526152, "epoch": 0.3713100462935129, "flos": 25885381201920.0, "grad_norm": 1.7406310836914052, "language_loss": 0.88591325, "learning_rate": 2.8976241423256767e-06, "loss": 0.90798378, "num_input_tokens_seen": 66356120, "step": 3088, "time_per_iteration": 2.8786158561706543 }, { "auxiliary_loss_clip": 0.01179384, "auxiliary_loss_mlp": 0.01029101, "balance_loss_clip": 1.05629683, "balance_loss_mlp": 1.02051806, "epoch": 0.371430289184152, "flos": 30518934814080.0, "grad_norm": 2.143752594733215, "language_loss": 0.68075395, "learning_rate": 2.896927964812142e-06, "loss": 0.70283878, "num_input_tokens_seen": 66376685, "step": 3089, "time_per_iteration": 3.8058207035064697 }, { "auxiliary_loss_clip": 0.01178841, "auxiliary_loss_mlp": 0.01032747, "balance_loss_clip": 1.0550015, "balance_loss_mlp": 1.02453947, "epoch": 0.37155053207479105, "flos": 15742233175680.0, "grad_norm": 2.7375357699815694, "language_loss": 0.74732929, "learning_rate": 2.8962316512353465e-06, "loss": 0.76944518, "num_input_tokens_seen": 66394230, "step": 3090, "time_per_iteration": 2.751903772354126 }, { "auxiliary_loss_clip": 0.0117088, "auxiliary_loss_mlp": 0.0103215, "balance_loss_clip": 1.05510354, "balance_loss_mlp": 1.0240798, "epoch": 0.37167077496543016, "flos": 23404061681280.0, "grad_norm": 1.63631482195849, "language_loss": 0.7482301, "learning_rate": 2.8955352017009233e-06, "loss": 0.77026045, "num_input_tokens_seen": 66413475, "step": 3091, "time_per_iteration": 3.881122350692749 }, { "auxiliary_loss_clip": 0.01181178, "auxiliary_loss_mlp": 0.01037492, "balance_loss_clip": 1.05661714, "balance_loss_mlp": 1.02885556, "epoch": 0.3717910178560693, "flos": 22088653718400.0, "grad_norm": 1.9459412011436934, "language_loss": 0.77390325, "learning_rate": 2.8948386163145212e-06, "loss": 0.79608995, "num_input_tokens_seen": 66432685, "step": 3092, "time_per_iteration": 2.7693660259246826 }, { "auxiliary_loss_clip": 0.01188744, "auxiliary_loss_mlp": 0.01027033, "balance_loss_clip": 1.05413103, "balance_loss_mlp": 1.01905203, "epoch": 0.3719112607467083, "flos": 26939969533440.0, "grad_norm": 2.0621842227336225, "language_loss": 0.79600203, "learning_rate": 2.8941418951818135e-06, "loss": 0.81815982, "num_input_tokens_seen": 66452245, "step": 3093, "time_per_iteration": 3.7100534439086914 }, { "auxiliary_loss_clip": 0.01179589, "auxiliary_loss_mlp": 0.0103163, "balance_loss_clip": 1.05421543, "balance_loss_mlp": 1.0236907, "epoch": 0.37203150363734744, "flos": 12166500119040.0, "grad_norm": 2.7245802262156977, "language_loss": 0.71242195, "learning_rate": 2.8934450384084903e-06, "loss": 0.73453414, "num_input_tokens_seen": 66469760, "step": 3094, "time_per_iteration": 2.787480115890503 }, { "auxiliary_loss_clip": 0.01174714, "auxiliary_loss_mlp": 0.01031556, "balance_loss_clip": 1.05322933, "balance_loss_mlp": 1.0227108, "epoch": 0.37215174652798655, "flos": 23697595624320.0, "grad_norm": 1.9244833216495014, "language_loss": 0.69481748, "learning_rate": 2.8927480461002653e-06, "loss": 0.7168802, "num_input_tokens_seen": 66489730, "step": 3095, "time_per_iteration": 2.8470728397369385 }, { "auxiliary_loss_clip": 0.01185749, "auxiliary_loss_mlp": 0.01031387, "balance_loss_clip": 1.05623126, "balance_loss_mlp": 1.0222261, "epoch": 0.3722719894186256, "flos": 17887751424000.0, "grad_norm": 2.38203971863925, "language_loss": 0.85910559, "learning_rate": 2.892050918362872e-06, "loss": 0.88127697, "num_input_tokens_seen": 66504785, "step": 3096, "time_per_iteration": 2.8076725006103516 }, { "auxiliary_loss_clip": 0.01077522, "auxiliary_loss_mlp": 0.01003803, "balance_loss_clip": 1.0181632, "balance_loss_mlp": 1.0018481, "epoch": 0.3723922323092647, "flos": 62419891363200.0, "grad_norm": 0.8745123954408499, "language_loss": 0.55879462, "learning_rate": 2.8913536553020626e-06, "loss": 0.57960784, "num_input_tokens_seen": 66558840, "step": 3097, "time_per_iteration": 3.7058351039886475 }, { "auxiliary_loss_clip": 0.01168206, "auxiliary_loss_mlp": 0.01031204, "balance_loss_clip": 1.05408871, "balance_loss_mlp": 1.02232862, "epoch": 0.3725124751999038, "flos": 23039747988480.0, "grad_norm": 1.8186781735384863, "language_loss": 0.84671259, "learning_rate": 2.8906562570236137e-06, "loss": 0.8687067, "num_input_tokens_seen": 66576750, "step": 3098, "time_per_iteration": 3.086869716644287 }, { "auxiliary_loss_clip": 0.01168098, "auxiliary_loss_mlp": 0.01027324, "balance_loss_clip": 1.0515728, "balance_loss_mlp": 1.01961684, "epoch": 0.3726327180905429, "flos": 20920551431040.0, "grad_norm": 1.5429986928614563, "language_loss": 0.76725078, "learning_rate": 2.889958723633318e-06, "loss": 0.78920496, "num_input_tokens_seen": 66595690, "step": 3099, "time_per_iteration": 2.843045234680176 }, { "auxiliary_loss_clip": 0.011788, "auxiliary_loss_mlp": 0.0102677, "balance_loss_clip": 1.0543232, "balance_loss_mlp": 1.01859832, "epoch": 0.372752960981182, "flos": 30592156688640.0, "grad_norm": 1.6987060783935348, "language_loss": 0.73806262, "learning_rate": 2.889261055236992e-06, "loss": 0.76011837, "num_input_tokens_seen": 66617905, "step": 3100, "time_per_iteration": 2.88970685005188 }, { "auxiliary_loss_clip": 0.01176479, "auxiliary_loss_mlp": 0.01031194, "balance_loss_clip": 1.05364108, "balance_loss_mlp": 1.02279019, "epoch": 0.3728732038718211, "flos": 25116749043840.0, "grad_norm": 3.088997025524146, "language_loss": 0.82567036, "learning_rate": 2.8885632519404704e-06, "loss": 0.84774715, "num_input_tokens_seen": 66638175, "step": 3101, "time_per_iteration": 2.829911231994629 }, { "auxiliary_loss_clip": 0.01182307, "auxiliary_loss_mlp": 0.01029026, "balance_loss_clip": 1.05635786, "balance_loss_mlp": 1.020818, "epoch": 0.37299344676246016, "flos": 25302048330240.0, "grad_norm": 2.012689952276899, "language_loss": 0.75744808, "learning_rate": 2.8878653138496107e-06, "loss": 0.77956134, "num_input_tokens_seen": 66658670, "step": 3102, "time_per_iteration": 2.7956430912017822 }, { "auxiliary_loss_clip": 0.01170468, "auxiliary_loss_mlp": 0.01030766, "balance_loss_clip": 1.05425596, "balance_loss_mlp": 1.02264798, "epoch": 0.37311368965309927, "flos": 23842531002240.0, "grad_norm": 4.116300095172341, "language_loss": 0.76269257, "learning_rate": 2.8871672410702878e-06, "loss": 0.78470492, "num_input_tokens_seen": 66676030, "step": 3103, "time_per_iteration": 2.7633280754089355 }, { "auxiliary_loss_clip": 0.01186794, "auxiliary_loss_mlp": 0.01030512, "balance_loss_clip": 1.05586433, "balance_loss_mlp": 1.02150536, "epoch": 0.3732339325437384, "flos": 25811943845760.0, "grad_norm": 2.5046158788839485, "language_loss": 0.82027984, "learning_rate": 2.8864690337084008e-06, "loss": 0.84245288, "num_input_tokens_seen": 66695305, "step": 3104, "time_per_iteration": 2.7208774089813232 }, { "auxiliary_loss_clip": 0.01182801, "auxiliary_loss_mlp": 0.01028671, "balance_loss_clip": 1.05553412, "balance_loss_mlp": 1.0201236, "epoch": 0.37335417543437743, "flos": 26208433146240.0, "grad_norm": 1.7797693670267467, "language_loss": 0.78161401, "learning_rate": 2.885770691869866e-06, "loss": 0.8037287, "num_input_tokens_seen": 66716185, "step": 3105, "time_per_iteration": 2.713175058364868 }, { "auxiliary_loss_clip": 0.01181713, "auxiliary_loss_mlp": 0.01028074, "balance_loss_clip": 1.05447352, "balance_loss_mlp": 1.02055788, "epoch": 0.37347441832501654, "flos": 24023879792640.0, "grad_norm": 2.4648552041300325, "language_loss": 0.74867857, "learning_rate": 2.8850722156606207e-06, "loss": 0.77077639, "num_input_tokens_seen": 66734575, "step": 3106, "time_per_iteration": 2.6821982860565186 }, { "auxiliary_loss_clip": 0.01180705, "auxiliary_loss_mlp": 0.01023948, "balance_loss_clip": 1.05458021, "balance_loss_mlp": 1.01632464, "epoch": 0.3735946612156556, "flos": 19714922409600.0, "grad_norm": 1.6332267150097364, "language_loss": 0.66837263, "learning_rate": 2.8843736051866252e-06, "loss": 0.69041914, "num_input_tokens_seen": 66753500, "step": 3107, "time_per_iteration": 2.704240322113037 }, { "auxiliary_loss_clip": 0.01176068, "auxiliary_loss_mlp": 0.01063821, "balance_loss_clip": 1.05803514, "balance_loss_mlp": 1.02543342, "epoch": 0.3737149041062947, "flos": 23039604334080.0, "grad_norm": 1.887163919878685, "language_loss": 0.69078708, "learning_rate": 2.8836748605538557e-06, "loss": 0.71318591, "num_input_tokens_seen": 66775140, "step": 3108, "time_per_iteration": 2.7800357341766357 }, { "auxiliary_loss_clip": 0.01187088, "auxiliary_loss_mlp": 0.01029879, "balance_loss_clip": 1.0547986, "balance_loss_mlp": 1.02130747, "epoch": 0.3738351469969338, "flos": 34678108483200.0, "grad_norm": 2.1331813263104813, "language_loss": 0.63204271, "learning_rate": 2.882975981868313e-06, "loss": 0.65421236, "num_input_tokens_seen": 66795525, "step": 3109, "time_per_iteration": 2.9055192470550537 }, { "auxiliary_loss_clip": 0.01184718, "auxiliary_loss_mlp": 0.01037771, "balance_loss_clip": 1.0559485, "balance_loss_mlp": 1.02814519, "epoch": 0.3739553898875729, "flos": 43507967448960.0, "grad_norm": 2.102371417691833, "language_loss": 0.68903148, "learning_rate": 2.882276969236016e-06, "loss": 0.71125638, "num_input_tokens_seen": 66816885, "step": 3110, "time_per_iteration": 2.887239694595337 }, { "auxiliary_loss_clip": 0.01182048, "auxiliary_loss_mlp": 0.01028296, "balance_loss_clip": 1.05728245, "balance_loss_mlp": 1.01992702, "epoch": 0.374075632778212, "flos": 12856487448960.0, "grad_norm": 2.4128629478129593, "language_loss": 0.76754516, "learning_rate": 2.881577822763005e-06, "loss": 0.78964865, "num_input_tokens_seen": 66834835, "step": 3111, "time_per_iteration": 2.713750123977661 }, { "auxiliary_loss_clip": 0.01185235, "auxiliary_loss_mlp": 0.01024594, "balance_loss_clip": 1.05493915, "balance_loss_mlp": 1.01639855, "epoch": 0.3741958756688511, "flos": 26024031699840.0, "grad_norm": 3.1840815941629987, "language_loss": 0.87530589, "learning_rate": 2.880878542555338e-06, "loss": 0.89740419, "num_input_tokens_seen": 66852600, "step": 3112, "time_per_iteration": 2.8094544410705566 }, { "auxiliary_loss_clip": 0.01194201, "auxiliary_loss_mlp": 0.01036668, "balance_loss_clip": 1.05743074, "balance_loss_mlp": 1.02770972, "epoch": 0.37431611855949015, "flos": 21433894652160.0, "grad_norm": 2.3053210482610096, "language_loss": 0.7996614, "learning_rate": 2.8801791287190976e-06, "loss": 0.82197011, "num_input_tokens_seen": 66870595, "step": 3113, "time_per_iteration": 3.572577953338623 }, { "auxiliary_loss_clip": 0.01189452, "auxiliary_loss_mlp": 0.01033951, "balance_loss_clip": 1.05654263, "balance_loss_mlp": 1.02511191, "epoch": 0.37443636145012926, "flos": 24207096090240.0, "grad_norm": 5.551129795879554, "language_loss": 0.86036009, "learning_rate": 2.8794795813603817e-06, "loss": 0.88259411, "num_input_tokens_seen": 66886060, "step": 3114, "time_per_iteration": 2.794229507446289 }, { "auxiliary_loss_clip": 0.01191414, "auxiliary_loss_mlp": 0.01027126, "balance_loss_clip": 1.05583477, "balance_loss_mlp": 1.01878762, "epoch": 0.3745566043407684, "flos": 15378601841280.0, "grad_norm": 1.8655449690790886, "language_loss": 0.8187592, "learning_rate": 2.878779900585314e-06, "loss": 0.84094465, "num_input_tokens_seen": 66903900, "step": 3115, "time_per_iteration": 3.8846497535705566 }, { "auxiliary_loss_clip": 0.0119131, "auxiliary_loss_mlp": 0.01032053, "balance_loss_clip": 1.05901718, "balance_loss_mlp": 1.02397084, "epoch": 0.37467684723140743, "flos": 24608218245120.0, "grad_norm": 1.694901420793577, "language_loss": 0.75348341, "learning_rate": 2.8780800865000336e-06, "loss": 0.77571702, "num_input_tokens_seen": 66925210, "step": 3116, "time_per_iteration": 2.7392516136169434 }, { "auxiliary_loss_clip": 0.01082529, "auxiliary_loss_mlp": 0.01001799, "balance_loss_clip": 1.01547801, "balance_loss_mlp": 1.00015378, "epoch": 0.37479709012204654, "flos": 64377491610240.0, "grad_norm": 0.9700642314450132, "language_loss": 0.59179413, "learning_rate": 2.877380139210702e-06, "loss": 0.6126374, "num_input_tokens_seen": 66983880, "step": 3117, "time_per_iteration": 3.2545604705810547 }, { "auxiliary_loss_clip": 0.01184662, "auxiliary_loss_mlp": 0.01035564, "balance_loss_clip": 1.05905068, "balance_loss_mlp": 1.02690363, "epoch": 0.37491733301268565, "flos": 23803962773760.0, "grad_norm": 1.7307448028219348, "language_loss": 0.76557165, "learning_rate": 2.876680058823501e-06, "loss": 0.78777385, "num_input_tokens_seen": 67004280, "step": 3118, "time_per_iteration": 3.803523302078247 }, { "auxiliary_loss_clip": 0.01175475, "auxiliary_loss_mlp": 0.01033059, "balance_loss_clip": 1.05458117, "balance_loss_mlp": 1.02438045, "epoch": 0.3750375759033247, "flos": 32160950167680.0, "grad_norm": 2.9382874430343566, "language_loss": 0.66010725, "learning_rate": 2.8759798454446314e-06, "loss": 0.68219256, "num_input_tokens_seen": 67027445, "step": 3119, "time_per_iteration": 4.012906312942505 }, { "auxiliary_loss_clip": 0.0118962, "auxiliary_loss_mlp": 0.01029271, "balance_loss_clip": 1.05583405, "balance_loss_mlp": 1.02116513, "epoch": 0.3751578187939638, "flos": 23367791923200.0, "grad_norm": 1.9162024731565057, "language_loss": 0.81609285, "learning_rate": 2.8752794991803173e-06, "loss": 0.83828175, "num_input_tokens_seen": 67045130, "step": 3120, "time_per_iteration": 2.756507635116577 }, { "auxiliary_loss_clip": 0.01182141, "auxiliary_loss_mlp": 0.01029495, "balance_loss_clip": 1.05918097, "balance_loss_mlp": 1.02147269, "epoch": 0.37527806168460287, "flos": 14605731878400.0, "grad_norm": 2.1881479437935596, "language_loss": 0.75356942, "learning_rate": 2.8745790201367976e-06, "loss": 0.77568579, "num_input_tokens_seen": 67060885, "step": 3121, "time_per_iteration": 2.7676613330841064 }, { "auxiliary_loss_clip": 0.01190728, "auxiliary_loss_mlp": 0.01030642, "balance_loss_clip": 1.05445433, "balance_loss_mlp": 1.02233303, "epoch": 0.375398304575242, "flos": 26390823431040.0, "grad_norm": 2.3785203111863704, "language_loss": 0.84428048, "learning_rate": 2.8738784084203373e-06, "loss": 0.86649418, "num_input_tokens_seen": 67080960, "step": 3122, "time_per_iteration": 2.7517592906951904 }, { "auxiliary_loss_clip": 0.01179618, "auxiliary_loss_mlp": 0.01027082, "balance_loss_clip": 1.05561924, "balance_loss_mlp": 1.01939917, "epoch": 0.3755185474658811, "flos": 22236605838720.0, "grad_norm": 1.6388133643746565, "language_loss": 0.78840995, "learning_rate": 2.873177664137216e-06, "loss": 0.81047702, "num_input_tokens_seen": 67101890, "step": 3123, "time_per_iteration": 2.8369650840759277 }, { "auxiliary_loss_clip": 0.01177405, "auxiliary_loss_mlp": 0.01026387, "balance_loss_clip": 1.05662715, "balance_loss_mlp": 1.01809621, "epoch": 0.37563879035652015, "flos": 30812935633920.0, "grad_norm": 1.9643515369931703, "language_loss": 0.69387144, "learning_rate": 2.8724767873937384e-06, "loss": 0.71590936, "num_input_tokens_seen": 67126010, "step": 3124, "time_per_iteration": 2.8546199798583984 }, { "auxiliary_loss_clip": 0.01183957, "auxiliary_loss_mlp": 0.01032892, "balance_loss_clip": 1.05554652, "balance_loss_mlp": 1.02474415, "epoch": 0.37575903324715926, "flos": 20773533064320.0, "grad_norm": 2.1527728870998, "language_loss": 0.8699435, "learning_rate": 2.871775778296225e-06, "loss": 0.89211202, "num_input_tokens_seen": 67143100, "step": 3125, "time_per_iteration": 2.7415544986724854 }, { "auxiliary_loss_clip": 0.01185994, "auxiliary_loss_mlp": 0.01027622, "balance_loss_clip": 1.05610669, "balance_loss_mlp": 1.01835322, "epoch": 0.37587927613779837, "flos": 18697681244160.0, "grad_norm": 2.153962836132435, "language_loss": 0.78394365, "learning_rate": 2.8710746369510196e-06, "loss": 0.80607975, "num_input_tokens_seen": 67161085, "step": 3126, "time_per_iteration": 2.8273766040802 }, { "auxiliary_loss_clip": 0.01182128, "auxiliary_loss_mlp": 0.01026905, "balance_loss_clip": 1.05886269, "balance_loss_mlp": 1.01825595, "epoch": 0.3759995190284374, "flos": 13624796384640.0, "grad_norm": 2.522985445552142, "language_loss": 0.83310962, "learning_rate": 2.8703733634644846e-06, "loss": 0.85519993, "num_input_tokens_seen": 67175840, "step": 3127, "time_per_iteration": 2.846569061279297 }, { "auxiliary_loss_clip": 0.01186953, "auxiliary_loss_mlp": 0.01027719, "balance_loss_clip": 1.05536222, "balance_loss_mlp": 1.02001858, "epoch": 0.37611976191907653, "flos": 20484847457280.0, "grad_norm": 2.035438254384417, "language_loss": 0.79245281, "learning_rate": 2.869671957943002e-06, "loss": 0.81459963, "num_input_tokens_seen": 67194995, "step": 3128, "time_per_iteration": 2.780644178390503 }, { "auxiliary_loss_clip": 0.0117482, "auxiliary_loss_mlp": 0.01030852, "balance_loss_clip": 1.05572152, "balance_loss_mlp": 1.02347887, "epoch": 0.37624000480971564, "flos": 21141797253120.0, "grad_norm": 1.7485046946433247, "language_loss": 0.74319077, "learning_rate": 2.8689704204929747e-06, "loss": 0.76524752, "num_input_tokens_seen": 67214175, "step": 3129, "time_per_iteration": 2.71986985206604 }, { "auxiliary_loss_clip": 0.01188879, "auxiliary_loss_mlp": 0.01032708, "balance_loss_clip": 1.05456924, "balance_loss_mlp": 1.02391088, "epoch": 0.3763602477003547, "flos": 22564470205440.0, "grad_norm": 1.8396522361102898, "language_loss": 0.81065726, "learning_rate": 2.8682687512208253e-06, "loss": 0.83287311, "num_input_tokens_seen": 67233185, "step": 3130, "time_per_iteration": 2.8121323585510254 }, { "auxiliary_loss_clip": 0.01192778, "auxiliary_loss_mlp": 0.01027018, "balance_loss_clip": 1.05640721, "balance_loss_mlp": 1.01866746, "epoch": 0.3764804905909938, "flos": 27526857851520.0, "grad_norm": 2.0613464295288675, "language_loss": 0.80371749, "learning_rate": 2.8675669502329972e-06, "loss": 0.82591546, "num_input_tokens_seen": 67254715, "step": 3131, "time_per_iteration": 2.7679786682128906 }, { "auxiliary_loss_clip": 0.01185715, "auxiliary_loss_mlp": 0.01060212, "balance_loss_clip": 1.05623209, "balance_loss_mlp": 1.02179241, "epoch": 0.3766007334816329, "flos": 22528092706560.0, "grad_norm": 2.336528783168299, "language_loss": 0.85657227, "learning_rate": 2.866865017635952e-06, "loss": 0.87903148, "num_input_tokens_seen": 67272535, "step": 3132, "time_per_iteration": 2.7622249126434326 }, { "auxiliary_loss_clip": 0.01176307, "auxiliary_loss_mlp": 0.01028059, "balance_loss_clip": 1.05679464, "balance_loss_mlp": 1.01949358, "epoch": 0.376720976372272, "flos": 25957166532480.0, "grad_norm": 1.9159448265241479, "language_loss": 0.79615211, "learning_rate": 2.866162953536174e-06, "loss": 0.81819576, "num_input_tokens_seen": 67293505, "step": 3133, "time_per_iteration": 2.8137319087982178 }, { "auxiliary_loss_clip": 0.01186275, "auxiliary_loss_mlp": 0.01059272, "balance_loss_clip": 1.05802488, "balance_loss_mlp": 1.02179837, "epoch": 0.3768412192629111, "flos": 18041162411520.0, "grad_norm": 1.63618597220875, "language_loss": 0.75375795, "learning_rate": 2.8654607580401634e-06, "loss": 0.77621341, "num_input_tokens_seen": 67313240, "step": 3134, "time_per_iteration": 2.7740557193756104 }, { "auxiliary_loss_clip": 0.01081336, "auxiliary_loss_mlp": 0.0100134, "balance_loss_clip": 1.01670599, "balance_loss_mlp": 0.99982566, "epoch": 0.3769614621535502, "flos": 62989472304000.0, "grad_norm": 0.8791073842189675, "language_loss": 0.65162086, "learning_rate": 2.8647584312544446e-06, "loss": 0.67244762, "num_input_tokens_seen": 67378445, "step": 3135, "time_per_iteration": 3.3558475971221924 }, { "auxiliary_loss_clip": 0.01177046, "auxiliary_loss_mlp": 0.01056386, "balance_loss_clip": 1.0546968, "balance_loss_mlp": 1.01915073, "epoch": 0.37708170504418925, "flos": 23661685002240.0, "grad_norm": 1.386802097088067, "language_loss": 0.84989858, "learning_rate": 2.864055973285559e-06, "loss": 0.87223285, "num_input_tokens_seen": 67400445, "step": 3136, "time_per_iteration": 2.8290274143218994 }, { "auxiliary_loss_clip": 0.01173497, "auxiliary_loss_mlp": 0.01030825, "balance_loss_clip": 1.05532777, "balance_loss_mlp": 1.02323699, "epoch": 0.37720194793482836, "flos": 24423170353920.0, "grad_norm": 1.7444005588200946, "language_loss": 0.86376131, "learning_rate": 2.8633533842400698e-06, "loss": 0.88580453, "num_input_tokens_seen": 67420645, "step": 3137, "time_per_iteration": 2.861424446105957 }, { "auxiliary_loss_clip": 0.01188531, "auxiliary_loss_mlp": 0.01062242, "balance_loss_clip": 1.05877566, "balance_loss_mlp": 1.02440727, "epoch": 0.3773221908254674, "flos": 20996502739200.0, "grad_norm": 1.9248120831527107, "language_loss": 0.7739588, "learning_rate": 2.862650664224558e-06, "loss": 0.79646659, "num_input_tokens_seen": 67439495, "step": 3138, "time_per_iteration": 2.7350716590881348 }, { "auxiliary_loss_clip": 0.01183397, "auxiliary_loss_mlp": 0.01027237, "balance_loss_clip": 1.05875897, "balance_loss_mlp": 1.01963198, "epoch": 0.37744243371610653, "flos": 37631724958080.0, "grad_norm": 1.729332897425738, "language_loss": 0.69436264, "learning_rate": 2.861947813345627e-06, "loss": 0.71646905, "num_input_tokens_seen": 67462195, "step": 3139, "time_per_iteration": 3.9786770343780518 }, { "auxiliary_loss_clip": 0.01191193, "auxiliary_loss_mlp": 0.01061448, "balance_loss_clip": 1.05687404, "balance_loss_mlp": 1.02318597, "epoch": 0.37756267660674564, "flos": 26140526484480.0, "grad_norm": 1.7706199267765599, "language_loss": 0.72308338, "learning_rate": 2.8612448317098974e-06, "loss": 0.74560982, "num_input_tokens_seen": 67482530, "step": 3140, "time_per_iteration": 3.947864532470703 }, { "auxiliary_loss_clip": 0.01182048, "auxiliary_loss_mlp": 0.01062156, "balance_loss_clip": 1.05547643, "balance_loss_mlp": 1.02252412, "epoch": 0.3776829194973847, "flos": 19427888828160.0, "grad_norm": 2.0819945734261505, "language_loss": 0.83326626, "learning_rate": 2.8605417194240114e-06, "loss": 0.8557083, "num_input_tokens_seen": 67500890, "step": 3141, "time_per_iteration": 2.844592332839966 }, { "auxiliary_loss_clip": 0.01178575, "auxiliary_loss_mlp": 0.01029789, "balance_loss_clip": 1.05325842, "balance_loss_mlp": 1.02197456, "epoch": 0.3778031623880238, "flos": 17382309194880.0, "grad_norm": 1.8986604806986593, "language_loss": 0.79006159, "learning_rate": 2.8598384765946315e-06, "loss": 0.81214523, "num_input_tokens_seen": 67519545, "step": 3142, "time_per_iteration": 2.778125762939453 }, { "auxiliary_loss_clip": 0.01187991, "auxiliary_loss_mlp": 0.01025499, "balance_loss_clip": 1.05304074, "balance_loss_mlp": 1.01745284, "epoch": 0.3779234052786629, "flos": 27125843437440.0, "grad_norm": 1.8312355100418263, "language_loss": 0.71454936, "learning_rate": 2.8591351033284377e-06, "loss": 0.73668426, "num_input_tokens_seen": 67539275, "step": 3143, "time_per_iteration": 2.911194086074829 }, { "auxiliary_loss_clip": 0.01188523, "auxiliary_loss_mlp": 0.01033575, "balance_loss_clip": 1.05476463, "balance_loss_mlp": 1.02543259, "epoch": 0.37804364816930197, "flos": 19682639061120.0, "grad_norm": 2.146109317828753, "language_loss": 0.84002757, "learning_rate": 2.8584315997321325e-06, "loss": 0.86224854, "num_input_tokens_seen": 67558280, "step": 3144, "time_per_iteration": 3.671538829803467 }, { "auxiliary_loss_clip": 0.01186157, "auxiliary_loss_mlp": 0.01033144, "balance_loss_clip": 1.05211377, "balance_loss_mlp": 1.02477598, "epoch": 0.3781638910599411, "flos": 22702905221760.0, "grad_norm": 4.969511508457959, "language_loss": 0.78752387, "learning_rate": 2.8577279659124356e-06, "loss": 0.80971682, "num_input_tokens_seen": 67575955, "step": 3145, "time_per_iteration": 3.6867570877075195 }, { "auxiliary_loss_clip": 0.0118132, "auxiliary_loss_mlp": 0.01025658, "balance_loss_clip": 1.05474448, "balance_loss_mlp": 1.01769543, "epoch": 0.3782841339505802, "flos": 14647604158080.0, "grad_norm": 1.8958725514906731, "language_loss": 0.83243859, "learning_rate": 2.857024201976089e-06, "loss": 0.8545084, "num_input_tokens_seen": 67593515, "step": 3146, "time_per_iteration": 2.715744972229004 }, { "auxiliary_loss_clip": 0.01179832, "auxiliary_loss_mlp": 0.01030255, "balance_loss_clip": 1.05479372, "balance_loss_mlp": 1.02171421, "epoch": 0.37840437684121925, "flos": 32818223185920.0, "grad_norm": 1.9998739457782564, "language_loss": 0.73268199, "learning_rate": 2.8563203080298516e-06, "loss": 0.75478292, "num_input_tokens_seen": 67614290, "step": 3147, "time_per_iteration": 2.882709264755249 }, { "auxiliary_loss_clip": 0.01180655, "auxiliary_loss_mlp": 0.01055366, "balance_loss_clip": 1.05574322, "balance_loss_mlp": 1.01692104, "epoch": 0.37852461973185836, "flos": 18369206346240.0, "grad_norm": 2.550069886258807, "language_loss": 0.888973, "learning_rate": 2.855616284180505e-06, "loss": 0.9113332, "num_input_tokens_seen": 67631340, "step": 3148, "time_per_iteration": 2.791741132736206 }, { "auxiliary_loss_clip": 0.01084958, "auxiliary_loss_mlp": 0.01002121, "balance_loss_clip": 1.01659477, "balance_loss_mlp": 1.0006609, "epoch": 0.37864486262249747, "flos": 59500680117120.0, "grad_norm": 1.5696059374317037, "language_loss": 0.66093737, "learning_rate": 2.8549121305348477e-06, "loss": 0.68180817, "num_input_tokens_seen": 67691125, "step": 3149, "time_per_iteration": 3.2116405963897705 }, { "auxiliary_loss_clip": 0.01187893, "auxiliary_loss_mlp": 0.01027941, "balance_loss_clip": 1.05700672, "balance_loss_mlp": 1.02055883, "epoch": 0.3787651055131365, "flos": 23363015414400.0, "grad_norm": 2.5646585605453978, "language_loss": 0.83799565, "learning_rate": 2.8542078471997006e-06, "loss": 0.86015403, "num_input_tokens_seen": 67708740, "step": 3150, "time_per_iteration": 2.7716400623321533 }, { "auxiliary_loss_clip": 0.01185874, "auxiliary_loss_mlp": 0.01027334, "balance_loss_clip": 1.05504274, "balance_loss_mlp": 1.01921546, "epoch": 0.37888534840377563, "flos": 24601394661120.0, "grad_norm": 1.6369244555113145, "language_loss": 0.75860137, "learning_rate": 2.8535034342819013e-06, "loss": 0.78073347, "num_input_tokens_seen": 67726150, "step": 3151, "time_per_iteration": 2.8586323261260986 }, { "auxiliary_loss_clip": 0.01188849, "auxiliary_loss_mlp": 0.01027056, "balance_loss_clip": 1.05668902, "balance_loss_mlp": 1.01926541, "epoch": 0.37900559129441475, "flos": 23986891762560.0, "grad_norm": 1.5334775210970262, "language_loss": 0.72470307, "learning_rate": 2.85279889188831e-06, "loss": 0.74686205, "num_input_tokens_seen": 67746525, "step": 3152, "time_per_iteration": 2.777475595474243 }, { "auxiliary_loss_clip": 0.01182894, "auxiliary_loss_mlp": 0.01028868, "balance_loss_clip": 1.05611145, "balance_loss_mlp": 1.0203979, "epoch": 0.3791258341850538, "flos": 24644667571200.0, "grad_norm": 2.0872261537956924, "language_loss": 0.81072307, "learning_rate": 2.852094220125805e-06, "loss": 0.83284068, "num_input_tokens_seen": 67766035, "step": 3153, "time_per_iteration": 2.840949296951294 }, { "auxiliary_loss_clip": 0.01187012, "auxiliary_loss_mlp": 0.0102918, "balance_loss_clip": 1.05714536, "balance_loss_mlp": 1.02076399, "epoch": 0.3792460770756929, "flos": 17420841509760.0, "grad_norm": 3.7034404141588024, "language_loss": 0.71321023, "learning_rate": 2.8513894191012846e-06, "loss": 0.73537207, "num_input_tokens_seen": 67785015, "step": 3154, "time_per_iteration": 2.807121515274048 }, { "auxiliary_loss_clip": 0.01189727, "auxiliary_loss_mlp": 0.01030437, "balance_loss_clip": 1.05536282, "balance_loss_mlp": 1.02209258, "epoch": 0.37936631996633197, "flos": 24206557386240.0, "grad_norm": 1.663524823279728, "language_loss": 0.79063261, "learning_rate": 2.8506844889216664e-06, "loss": 0.81283426, "num_input_tokens_seen": 67804400, "step": 3155, "time_per_iteration": 2.7955453395843506 }, { "auxiliary_loss_clip": 0.01080865, "auxiliary_loss_mlp": 0.01003859, "balance_loss_clip": 1.01897204, "balance_loss_mlp": 1.00227308, "epoch": 0.3794865628569711, "flos": 70297114752000.0, "grad_norm": 0.8579798080026879, "language_loss": 0.62788594, "learning_rate": 2.849979429693887e-06, "loss": 0.64873314, "num_input_tokens_seen": 67865385, "step": 3156, "time_per_iteration": 3.2977800369262695 }, { "auxiliary_loss_clip": 0.01190204, "auxiliary_loss_mlp": 0.01024274, "balance_loss_clip": 1.05669641, "balance_loss_mlp": 1.01564956, "epoch": 0.3796068057476102, "flos": 15779364860160.0, "grad_norm": 1.8767494958112967, "language_loss": 0.74379742, "learning_rate": 2.8492742415249042e-06, "loss": 0.76594222, "num_input_tokens_seen": 67883030, "step": 3157, "time_per_iteration": 2.740473747253418 }, { "auxiliary_loss_clip": 0.01187668, "auxiliary_loss_mlp": 0.01027485, "balance_loss_clip": 1.05484438, "balance_loss_mlp": 1.01922345, "epoch": 0.37972704863824924, "flos": 25191694771200.0, "grad_norm": 1.7568933489615264, "language_loss": 0.764961, "learning_rate": 2.848568924521694e-06, "loss": 0.78711247, "num_input_tokens_seen": 67903810, "step": 3158, "time_per_iteration": 2.7122459411621094 }, { "auxiliary_loss_clip": 0.0118204, "auxiliary_loss_mlp": 0.01029101, "balance_loss_clip": 1.05524182, "balance_loss_mlp": 1.0207082, "epoch": 0.37984729152888835, "flos": 26210372480640.0, "grad_norm": 2.028504905685484, "language_loss": 0.73721188, "learning_rate": 2.8478634787912526e-06, "loss": 0.7593233, "num_input_tokens_seen": 67921865, "step": 3159, "time_per_iteration": 2.7334342002868652 }, { "auxiliary_loss_clip": 0.01185911, "auxiliary_loss_mlp": 0.01030582, "balance_loss_clip": 1.05567241, "balance_loss_mlp": 1.02211785, "epoch": 0.37996753441952746, "flos": 25629302165760.0, "grad_norm": 2.422488172423084, "language_loss": 0.76660252, "learning_rate": 2.847157904440596e-06, "loss": 0.78876746, "num_input_tokens_seen": 67941595, "step": 3160, "time_per_iteration": 2.746058225631714 }, { "auxiliary_loss_clip": 0.01184365, "auxiliary_loss_mlp": 0.01028797, "balance_loss_clip": 1.05546653, "balance_loss_mlp": 1.02055967, "epoch": 0.3800877773101665, "flos": 20118414862080.0, "grad_norm": 1.6347813571853922, "language_loss": 0.74010515, "learning_rate": 2.846452201576759e-06, "loss": 0.76223671, "num_input_tokens_seen": 67960970, "step": 3161, "time_per_iteration": 2.820570468902588 }, { "auxiliary_loss_clip": 0.01081517, "auxiliary_loss_mlp": 0.01003292, "balance_loss_clip": 1.01313472, "balance_loss_mlp": 1.0017662, "epoch": 0.38020802020080563, "flos": 63053608037760.0, "grad_norm": 0.8631680659079469, "language_loss": 0.62817168, "learning_rate": 2.845746370306795e-06, "loss": 0.64901984, "num_input_tokens_seen": 68026160, "step": 3162, "time_per_iteration": 3.5095102787017822 }, { "auxiliary_loss_clip": 0.0118534, "auxiliary_loss_mlp": 0.01033547, "balance_loss_clip": 1.05439687, "balance_loss_mlp": 1.02566743, "epoch": 0.38032826309144474, "flos": 21288420570240.0, "grad_norm": 2.1376559286053185, "language_loss": 0.78439724, "learning_rate": 2.84504041073778e-06, "loss": 0.80658615, "num_input_tokens_seen": 68044575, "step": 3163, "time_per_iteration": 2.8319952487945557 }, { "auxiliary_loss_clip": 0.01178038, "auxiliary_loss_mlp": 0.01036304, "balance_loss_clip": 1.05663681, "balance_loss_mlp": 1.02725554, "epoch": 0.3804485059820838, "flos": 18954119416320.0, "grad_norm": 1.765315235279054, "language_loss": 0.79628801, "learning_rate": 2.844334322976806e-06, "loss": 0.81843144, "num_input_tokens_seen": 68064790, "step": 3164, "time_per_iteration": 2.871211290359497 }, { "auxiliary_loss_clip": 0.01181193, "auxiliary_loss_mlp": 0.01029165, "balance_loss_clip": 1.05626357, "balance_loss_mlp": 1.02090955, "epoch": 0.3805687488727229, "flos": 21833759831040.0, "grad_norm": 1.827727371610606, "language_loss": 0.83570325, "learning_rate": 2.8436281071309866e-06, "loss": 0.8578068, "num_input_tokens_seen": 68083330, "step": 3165, "time_per_iteration": 3.7116763591766357 }, { "auxiliary_loss_clip": 0.01082041, "auxiliary_loss_mlp": 0.01002524, "balance_loss_clip": 1.01401925, "balance_loss_mlp": 1.00092685, "epoch": 0.380688991763362, "flos": 58546209968640.0, "grad_norm": 0.7258854256560782, "language_loss": 0.53036457, "learning_rate": 2.842921763307455e-06, "loss": 0.55121022, "num_input_tokens_seen": 68146140, "step": 3166, "time_per_iteration": 4.31610369682312 }, { "auxiliary_loss_clip": 0.01179316, "auxiliary_loss_mlp": 0.010274, "balance_loss_clip": 1.05634844, "balance_loss_mlp": 1.01918602, "epoch": 0.38080923465400107, "flos": 23799509487360.0, "grad_norm": 1.7893220639827287, "language_loss": 0.82434714, "learning_rate": 2.842215291613361e-06, "loss": 0.84641433, "num_input_tokens_seen": 68164520, "step": 3167, "time_per_iteration": 2.962144374847412 }, { "auxiliary_loss_clip": 0.01074098, "auxiliary_loss_mlp": 0.01001364, "balance_loss_clip": 1.0204978, "balance_loss_mlp": 0.99974233, "epoch": 0.3809294775446402, "flos": 54969866380800.0, "grad_norm": 0.8474030332267419, "language_loss": 0.59241843, "learning_rate": 2.8415086921558774e-06, "loss": 0.61317301, "num_input_tokens_seen": 68227945, "step": 3168, "time_per_iteration": 4.300323486328125 }, { "auxiliary_loss_clip": 0.01171039, "auxiliary_loss_mlp": 0.01026775, "balance_loss_clip": 1.05270743, "balance_loss_mlp": 1.01877618, "epoch": 0.38104972043527924, "flos": 24643697904000.0, "grad_norm": 1.7158916425704949, "language_loss": 0.78512651, "learning_rate": 2.840801965042194e-06, "loss": 0.80710471, "num_input_tokens_seen": 68247405, "step": 3169, "time_per_iteration": 3.2856736183166504 }, { "auxiliary_loss_clip": 0.01175944, "auxiliary_loss_mlp": 0.01028719, "balance_loss_clip": 1.05490923, "balance_loss_mlp": 1.01954544, "epoch": 0.38116996332591835, "flos": 22856783086080.0, "grad_norm": 1.977644970945682, "language_loss": 0.83872104, "learning_rate": 2.840095110379521e-06, "loss": 0.86076766, "num_input_tokens_seen": 68266925, "step": 3170, "time_per_iteration": 3.7324392795562744 }, { "auxiliary_loss_clip": 0.01073417, "auxiliary_loss_mlp": 0.01007205, "balance_loss_clip": 1.01373231, "balance_loss_mlp": 1.0055604, "epoch": 0.38129020621655746, "flos": 60836160804480.0, "grad_norm": 0.7238019628995968, "language_loss": 0.53868467, "learning_rate": 2.8393881282750884e-06, "loss": 0.55949086, "num_input_tokens_seen": 68329755, "step": 3171, "time_per_iteration": 4.377333879470825 }, { "auxiliary_loss_clip": 0.01181503, "auxiliary_loss_mlp": 0.01030344, "balance_loss_clip": 1.05449772, "balance_loss_mlp": 1.02183211, "epoch": 0.3814104491071965, "flos": 21648101408640.0, "grad_norm": 2.3285284016918646, "language_loss": 0.78664863, "learning_rate": 2.838681018836144e-06, "loss": 0.80876708, "num_input_tokens_seen": 68347075, "step": 3172, "time_per_iteration": 2.9315521717071533 }, { "auxiliary_loss_clip": 0.01180513, "auxiliary_loss_mlp": 0.01056735, "balance_loss_clip": 1.05371916, "balance_loss_mlp": 1.01641846, "epoch": 0.3815306919978356, "flos": 19099090707840.0, "grad_norm": 3.252477878482664, "language_loss": 0.78317052, "learning_rate": 2.837973782169955e-06, "loss": 0.80554301, "num_input_tokens_seen": 68365450, "step": 3173, "time_per_iteration": 2.7560932636260986 }, { "auxiliary_loss_clip": 0.01079556, "auxiliary_loss_mlp": 0.01004407, "balance_loss_clip": 1.01225543, "balance_loss_mlp": 1.00282145, "epoch": 0.38165093488847474, "flos": 67067918156160.0, "grad_norm": 0.8066499453061108, "language_loss": 0.59131509, "learning_rate": 2.8372664183838096e-06, "loss": 0.61215466, "num_input_tokens_seen": 68428470, "step": 3174, "time_per_iteration": 3.2885642051696777 }, { "auxiliary_loss_clip": 0.01186545, "auxiliary_loss_mlp": 0.01026812, "balance_loss_clip": 1.05307293, "balance_loss_mlp": 1.01851499, "epoch": 0.3817711777791138, "flos": 22341105480960.0, "grad_norm": 2.7445919910357284, "language_loss": 0.68538272, "learning_rate": 2.836558927585015e-06, "loss": 0.70751631, "num_input_tokens_seen": 68445440, "step": 3175, "time_per_iteration": 2.6599271297454834 }, { "auxiliary_loss_clip": 0.01186097, "auxiliary_loss_mlp": 0.01033679, "balance_loss_clip": 1.05364776, "balance_loss_mlp": 1.02551293, "epoch": 0.3818914206697529, "flos": 22820621068800.0, "grad_norm": 1.8478094945922856, "language_loss": 0.82742077, "learning_rate": 2.8358513098808957e-06, "loss": 0.84961849, "num_input_tokens_seen": 68465755, "step": 3176, "time_per_iteration": 2.6339492797851562 }, { "auxiliary_loss_clip": 0.01169515, "auxiliary_loss_mlp": 0.0103108, "balance_loss_clip": 1.05669022, "balance_loss_mlp": 1.02179909, "epoch": 0.382011663560392, "flos": 24386074583040.0, "grad_norm": 1.8569801391612488, "language_loss": 0.76436943, "learning_rate": 2.835143565378798e-06, "loss": 0.78637534, "num_input_tokens_seen": 68486220, "step": 3177, "time_per_iteration": 2.70967435836792 }, { "auxiliary_loss_clip": 0.01168352, "auxiliary_loss_mlp": 0.01027736, "balance_loss_clip": 1.05519009, "balance_loss_mlp": 1.01902759, "epoch": 0.38213190645103107, "flos": 21981568296960.0, "grad_norm": 2.216049124637512, "language_loss": 0.7814641, "learning_rate": 2.8344356941860847e-06, "loss": 0.80342495, "num_input_tokens_seen": 68505850, "step": 3178, "time_per_iteration": 2.674999952316284 }, { "auxiliary_loss_clip": 0.01177009, "auxiliary_loss_mlp": 0.01037339, "balance_loss_clip": 1.05760646, "balance_loss_mlp": 1.02897608, "epoch": 0.3822521493416702, "flos": 35516945773440.0, "grad_norm": 2.2200520011822276, "language_loss": 0.66089827, "learning_rate": 2.8337276964101403e-06, "loss": 0.68304181, "num_input_tokens_seen": 68526290, "step": 3179, "time_per_iteration": 2.747504472732544 }, { "auxiliary_loss_clip": 0.01187057, "auxiliary_loss_mlp": 0.0103029, "balance_loss_clip": 1.05426168, "balance_loss_mlp": 1.02165389, "epoch": 0.3823723922323093, "flos": 21069904181760.0, "grad_norm": 1.8995399330850764, "language_loss": 0.76272732, "learning_rate": 2.833019572158367e-06, "loss": 0.78490078, "num_input_tokens_seen": 68544725, "step": 3180, "time_per_iteration": 2.6490843296051025 }, { "auxiliary_loss_clip": 0.01181695, "auxiliary_loss_mlp": 0.01028627, "balance_loss_clip": 1.05548406, "balance_loss_mlp": 1.02055097, "epoch": 0.38249263512294834, "flos": 19789149864960.0, "grad_norm": 1.7726862921000037, "language_loss": 0.80483389, "learning_rate": 2.8323113215381872e-06, "loss": 0.82693708, "num_input_tokens_seen": 68563070, "step": 3181, "time_per_iteration": 2.9061543941497803 }, { "auxiliary_loss_clip": 0.01182714, "auxiliary_loss_mlp": 0.0103019, "balance_loss_clip": 1.05774164, "balance_loss_mlp": 1.02169609, "epoch": 0.38261287801358745, "flos": 21433930565760.0, "grad_norm": 2.443624297732005, "language_loss": 0.76057839, "learning_rate": 2.831602944657042e-06, "loss": 0.78270745, "num_input_tokens_seen": 68581150, "step": 3182, "time_per_iteration": 2.9588143825531006 }, { "auxiliary_loss_clip": 0.01188393, "auxiliary_loss_mlp": 0.01030862, "balance_loss_clip": 1.05408192, "balance_loss_mlp": 1.02272046, "epoch": 0.38273312090422656, "flos": 21981568296960.0, "grad_norm": 2.616776054733385, "language_loss": 0.74800932, "learning_rate": 2.830894441622391e-06, "loss": 0.77020192, "num_input_tokens_seen": 68597800, "step": 3183, "time_per_iteration": 2.8906898498535156 }, { "auxiliary_loss_clip": 0.01180524, "auxiliary_loss_mlp": 0.01060634, "balance_loss_clip": 1.05497098, "balance_loss_mlp": 1.01820707, "epoch": 0.3828533637948656, "flos": 24790895838720.0, "grad_norm": 28.941077957984554, "language_loss": 0.7988615, "learning_rate": 2.8301858125417134e-06, "loss": 0.82127309, "num_input_tokens_seen": 68617640, "step": 3184, "time_per_iteration": 3.112088918685913 }, { "auxiliary_loss_clip": 0.01182637, "auxiliary_loss_mlp": 0.01026736, "balance_loss_clip": 1.0557462, "balance_loss_mlp": 1.01886773, "epoch": 0.38297360668550473, "flos": 22455445449600.0, "grad_norm": 1.8740494649026622, "language_loss": 0.74030429, "learning_rate": 2.8294770575225082e-06, "loss": 0.76239806, "num_input_tokens_seen": 68637770, "step": 3185, "time_per_iteration": 2.8905704021453857 }, { "auxiliary_loss_clip": 0.01188719, "auxiliary_loss_mlp": 0.0103723, "balance_loss_clip": 1.05864668, "balance_loss_mlp": 1.02860498, "epoch": 0.3830938495761438, "flos": 24896903852160.0, "grad_norm": 1.6652240662132207, "language_loss": 0.83776492, "learning_rate": 2.828768176672293e-06, "loss": 0.86002439, "num_input_tokens_seen": 68656885, "step": 3186, "time_per_iteration": 2.9314045906066895 }, { "auxiliary_loss_clip": 0.01178882, "auxiliary_loss_mlp": 0.01022446, "balance_loss_clip": 1.05503941, "balance_loss_mlp": 1.01353478, "epoch": 0.3832140924667829, "flos": 33036236784000.0, "grad_norm": 2.5295987389022576, "language_loss": 0.71287638, "learning_rate": 2.8280591700986044e-06, "loss": 0.73488963, "num_input_tokens_seen": 68678750, "step": 3187, "time_per_iteration": 2.901881694793701 }, { "auxiliary_loss_clip": 0.01186447, "auxiliary_loss_mlp": 0.01028408, "balance_loss_clip": 1.05367851, "balance_loss_mlp": 1.01980722, "epoch": 0.383334335357422, "flos": 31903721896320.0, "grad_norm": 1.9270440808347338, "language_loss": 0.75168169, "learning_rate": 2.827350037908999e-06, "loss": 0.77383024, "num_input_tokens_seen": 68698190, "step": 3188, "time_per_iteration": 2.8482415676116943 }, { "auxiliary_loss_clip": 0.01186881, "auxiliary_loss_mlp": 0.0103272, "balance_loss_clip": 1.0549382, "balance_loss_mlp": 1.02386296, "epoch": 0.38345457824806106, "flos": 19791915212160.0, "grad_norm": 2.1227589517305576, "language_loss": 0.79100311, "learning_rate": 2.8266407802110496e-06, "loss": 0.8131991, "num_input_tokens_seen": 68716445, "step": 3189, "time_per_iteration": 2.8281779289245605 }, { "auxiliary_loss_clip": 0.01177909, "auxiliary_loss_mlp": 0.01023259, "balance_loss_clip": 1.05722165, "balance_loss_mlp": 1.01520085, "epoch": 0.3835748211387002, "flos": 22419391173120.0, "grad_norm": 1.875054813142282, "language_loss": 0.76095974, "learning_rate": 2.8259313971123515e-06, "loss": 0.78297138, "num_input_tokens_seen": 68737565, "step": 3190, "time_per_iteration": 2.9931018352508545 }, { "auxiliary_loss_clip": 0.01184043, "auxiliary_loss_mlp": 0.01030068, "balance_loss_clip": 1.05708563, "balance_loss_mlp": 1.02245009, "epoch": 0.3836950640293393, "flos": 25118436983040.0, "grad_norm": 1.5237139138551834, "language_loss": 0.78308421, "learning_rate": 2.8252218887205166e-06, "loss": 0.80522531, "num_input_tokens_seen": 68758255, "step": 3191, "time_per_iteration": 2.8127620220184326 }, { "auxiliary_loss_clip": 0.01177016, "auxiliary_loss_mlp": 0.01035348, "balance_loss_clip": 1.05645728, "balance_loss_mlp": 1.02659225, "epoch": 0.38381530691997834, "flos": 21799213925760.0, "grad_norm": 1.8180983174811485, "language_loss": 0.80827463, "learning_rate": 2.824512255143178e-06, "loss": 0.83039832, "num_input_tokens_seen": 68777490, "step": 3192, "time_per_iteration": 3.8212757110595703 }, { "auxiliary_loss_clip": 0.01184583, "auxiliary_loss_mlp": 0.01030667, "balance_loss_clip": 1.05597925, "balance_loss_mlp": 1.02239406, "epoch": 0.38393554981061745, "flos": 21252689516160.0, "grad_norm": 1.8698666972366722, "language_loss": 0.79302466, "learning_rate": 2.8238024964879855e-06, "loss": 0.81517714, "num_input_tokens_seen": 68798385, "step": 3193, "time_per_iteration": 3.8728761672973633 }, { "auxiliary_loss_clip": 0.01195536, "auxiliary_loss_mlp": 0.01026938, "balance_loss_clip": 1.0573318, "balance_loss_mlp": 1.01796138, "epoch": 0.38405579270125656, "flos": 17019360218880.0, "grad_norm": 2.7081297588734152, "language_loss": 0.76836616, "learning_rate": 2.8230926128626095e-06, "loss": 0.79059088, "num_input_tokens_seen": 68816880, "step": 3194, "time_per_iteration": 2.9025685787200928 }, { "auxiliary_loss_clip": 0.01177741, "auxiliary_loss_mlp": 0.01027855, "balance_loss_clip": 1.05311179, "balance_loss_mlp": 1.01924205, "epoch": 0.3841760355918956, "flos": 21835375943040.0, "grad_norm": 2.02615315865156, "language_loss": 0.79011285, "learning_rate": 2.822382604374738e-06, "loss": 0.81216884, "num_input_tokens_seen": 68835805, "step": 3195, "time_per_iteration": 2.708721876144409 }, { "auxiliary_loss_clip": 0.01181315, "auxiliary_loss_mlp": 0.01033259, "balance_loss_clip": 1.05635691, "balance_loss_mlp": 1.02478886, "epoch": 0.3842962784825347, "flos": 25915114684800.0, "grad_norm": 2.330672112898488, "language_loss": 0.65478647, "learning_rate": 2.8216724711320793e-06, "loss": 0.67693216, "num_input_tokens_seen": 68854930, "step": 3196, "time_per_iteration": 3.8205413818359375 }, { "auxiliary_loss_clip": 0.01185224, "auxiliary_loss_mlp": 0.01070578, "balance_loss_clip": 1.05314851, "balance_loss_mlp": 1.02688682, "epoch": 0.38441652137317384, "flos": 25337492075520.0, "grad_norm": 1.7273453746058505, "language_loss": 0.79969579, "learning_rate": 2.820962213242361e-06, "loss": 0.82225382, "num_input_tokens_seen": 68874260, "step": 3197, "time_per_iteration": 3.947632312774658 }, { "auxiliary_loss_clip": 0.01180319, "auxiliary_loss_mlp": 0.01034257, "balance_loss_clip": 1.05403376, "balance_loss_mlp": 1.02549529, "epoch": 0.3845367642638129, "flos": 18113486446080.0, "grad_norm": 2.217237650088281, "language_loss": 0.84550112, "learning_rate": 2.8202518308133264e-06, "loss": 0.86764693, "num_input_tokens_seen": 68891535, "step": 3198, "time_per_iteration": 2.6927409172058105 }, { "auxiliary_loss_clip": 0.01193015, "auxiliary_loss_mlp": 0.0103336, "balance_loss_clip": 1.05609, "balance_loss_mlp": 1.02471745, "epoch": 0.384657007154452, "flos": 25228395492480.0, "grad_norm": 3.0024764225891705, "language_loss": 0.7330308, "learning_rate": 2.8195413239527426e-06, "loss": 0.7552945, "num_input_tokens_seen": 68911275, "step": 3199, "time_per_iteration": 2.758570432662964 }, { "auxiliary_loss_clip": 0.0118398, "auxiliary_loss_mlp": 0.01030024, "balance_loss_clip": 1.05592608, "balance_loss_mlp": 1.02144074, "epoch": 0.38477725004509106, "flos": 19865855358720.0, "grad_norm": 1.87366498207033, "language_loss": 0.80905247, "learning_rate": 2.8188306927683906e-06, "loss": 0.83119255, "num_input_tokens_seen": 68930745, "step": 3200, "time_per_iteration": 2.7981808185577393 }, { "auxiliary_loss_clip": 0.01182999, "auxiliary_loss_mlp": 0.01024509, "balance_loss_clip": 1.05476642, "balance_loss_mlp": 1.01669455, "epoch": 0.38489749293573017, "flos": 18259391491200.0, "grad_norm": 2.293084172430518, "language_loss": 0.74951547, "learning_rate": 2.818119937368074e-06, "loss": 0.77159053, "num_input_tokens_seen": 68949380, "step": 3201, "time_per_iteration": 2.7495548725128174 }, { "auxiliary_loss_clip": 0.01193955, "auxiliary_loss_mlp": 0.01031847, "balance_loss_clip": 1.05663776, "balance_loss_mlp": 1.02278066, "epoch": 0.3850177358263693, "flos": 24389163152640.0, "grad_norm": 3.1025674423226297, "language_loss": 0.66123372, "learning_rate": 2.817409057859613e-06, "loss": 0.68349171, "num_input_tokens_seen": 68968370, "step": 3202, "time_per_iteration": 2.9497854709625244 }, { "auxiliary_loss_clip": 0.01175031, "auxiliary_loss_mlp": 0.01028702, "balance_loss_clip": 1.05827999, "balance_loss_mlp": 1.02002931, "epoch": 0.38513797871700833, "flos": 17671533505920.0, "grad_norm": 1.9068986361587428, "language_loss": 0.79282343, "learning_rate": 2.8166980543508482e-06, "loss": 0.8148607, "num_input_tokens_seen": 68984260, "step": 3203, "time_per_iteration": 2.7628889083862305 }, { "auxiliary_loss_clip": 0.01191223, "auxiliary_loss_mlp": 0.01028228, "balance_loss_clip": 1.05611968, "balance_loss_mlp": 1.01961517, "epoch": 0.38525822160764744, "flos": 25739583897600.0, "grad_norm": 2.176201101496508, "language_loss": 0.797342, "learning_rate": 2.815986926949638e-06, "loss": 0.81953651, "num_input_tokens_seen": 69002760, "step": 3204, "time_per_iteration": 2.8513801097869873 }, { "auxiliary_loss_clip": 0.01184732, "auxiliary_loss_mlp": 0.01030634, "balance_loss_clip": 1.05614972, "balance_loss_mlp": 1.02245057, "epoch": 0.38537846449828655, "flos": 20193647898240.0, "grad_norm": 1.7697774921210367, "language_loss": 0.80300498, "learning_rate": 2.8152756757638597e-06, "loss": 0.82515866, "num_input_tokens_seen": 69021260, "step": 3205, "time_per_iteration": 2.753753662109375 }, { "auxiliary_loss_clip": 0.01186042, "auxiliary_loss_mlp": 0.01029588, "balance_loss_clip": 1.05722117, "balance_loss_mlp": 1.02145231, "epoch": 0.3854987073889256, "flos": 23039352938880.0, "grad_norm": 2.8476319386223508, "language_loss": 0.84424663, "learning_rate": 2.8145643009014093e-06, "loss": 0.86640298, "num_input_tokens_seen": 69039755, "step": 3206, "time_per_iteration": 2.778000593185425 }, { "auxiliary_loss_clip": 0.01188032, "auxiliary_loss_mlp": 0.01024345, "balance_loss_clip": 1.05628157, "balance_loss_mlp": 1.01620924, "epoch": 0.3856189502795647, "flos": 20190631155840.0, "grad_norm": 1.800766018475718, "language_loss": 0.79488289, "learning_rate": 2.813852802470202e-06, "loss": 0.81700671, "num_input_tokens_seen": 69057650, "step": 3207, "time_per_iteration": 2.7862050533294678 }, { "auxiliary_loss_clip": 0.01180624, "auxiliary_loss_mlp": 0.01027709, "balance_loss_clip": 1.05603123, "balance_loss_mlp": 1.01934671, "epoch": 0.38573919317020383, "flos": 25702631781120.0, "grad_norm": 1.937908469343013, "language_loss": 0.72592771, "learning_rate": 2.8131411805781717e-06, "loss": 0.74801111, "num_input_tokens_seen": 69077775, "step": 3208, "time_per_iteration": 2.710726261138916 }, { "auxiliary_loss_clip": 0.01184078, "auxiliary_loss_mlp": 0.01032098, "balance_loss_clip": 1.05953979, "balance_loss_mlp": 1.02348471, "epoch": 0.3858594360608429, "flos": 29821405628160.0, "grad_norm": 2.3329831517125115, "language_loss": 0.640692, "learning_rate": 2.8124294353332707e-06, "loss": 0.66285384, "num_input_tokens_seen": 69096450, "step": 3209, "time_per_iteration": 2.926076650619507 }, { "auxiliary_loss_clip": 0.0118858, "auxiliary_loss_mlp": 0.01031992, "balance_loss_clip": 1.05813003, "balance_loss_mlp": 1.02377844, "epoch": 0.385979678951482, "flos": 24790428961920.0, "grad_norm": 2.008642116036483, "language_loss": 0.7708661, "learning_rate": 2.8117175668434713e-06, "loss": 0.79307187, "num_input_tokens_seen": 69116110, "step": 3210, "time_per_iteration": 2.848428726196289 }, { "auxiliary_loss_clip": 0.01193048, "auxiliary_loss_mlp": 0.01025805, "balance_loss_clip": 1.05697584, "balance_loss_mlp": 1.01733494, "epoch": 0.3860999218421211, "flos": 21287881866240.0, "grad_norm": 2.593152606275876, "language_loss": 0.70613563, "learning_rate": 2.811005575216762e-06, "loss": 0.72832417, "num_input_tokens_seen": 69134825, "step": 3211, "time_per_iteration": 2.8395144939422607 }, { "auxiliary_loss_clip": 0.01173948, "auxiliary_loss_mlp": 0.01032386, "balance_loss_clip": 1.05470991, "balance_loss_mlp": 1.02392769, "epoch": 0.38622016473276016, "flos": 24536720223360.0, "grad_norm": 2.05957067038151, "language_loss": 0.7891283, "learning_rate": 2.8102934605611513e-06, "loss": 0.81119162, "num_input_tokens_seen": 69156460, "step": 3212, "time_per_iteration": 2.9448349475860596 }, { "auxiliary_loss_clip": 0.01190746, "auxiliary_loss_mlp": 0.01031516, "balance_loss_clip": 1.05953836, "balance_loss_mlp": 1.02283764, "epoch": 0.3863404076233993, "flos": 20558212986240.0, "grad_norm": 2.0551830887847653, "language_loss": 0.67378044, "learning_rate": 2.8095812229846665e-06, "loss": 0.69600296, "num_input_tokens_seen": 69176420, "step": 3213, "time_per_iteration": 2.9369513988494873 }, { "auxiliary_loss_clip": 0.01187825, "auxiliary_loss_mlp": 0.01025817, "balance_loss_clip": 1.05788743, "balance_loss_mlp": 1.01725817, "epoch": 0.3864606505140384, "flos": 22346277039360.0, "grad_norm": 2.7542750667144653, "language_loss": 0.69175017, "learning_rate": 2.808868862595355e-06, "loss": 0.71388662, "num_input_tokens_seen": 69196665, "step": 3214, "time_per_iteration": 2.842087745666504 }, { "auxiliary_loss_clip": 0.01188493, "auxiliary_loss_mlp": 0.01034167, "balance_loss_clip": 1.05647922, "balance_loss_mlp": 1.02574158, "epoch": 0.38658089340467744, "flos": 25703601448320.0, "grad_norm": 1.681167425955008, "language_loss": 0.79458904, "learning_rate": 2.8081563795012795e-06, "loss": 0.81681561, "num_input_tokens_seen": 69216290, "step": 3215, "time_per_iteration": 2.814577341079712 }, { "auxiliary_loss_clip": 0.01190858, "auxiliary_loss_mlp": 0.01025166, "balance_loss_clip": 1.05583262, "balance_loss_mlp": 1.01689887, "epoch": 0.38670113629531655, "flos": 33802534558080.0, "grad_norm": 1.8997979198861423, "language_loss": 0.73929381, "learning_rate": 2.807443773810524e-06, "loss": 0.76145405, "num_input_tokens_seen": 69237550, "step": 3216, "time_per_iteration": 2.8633973598480225 }, { "auxiliary_loss_clip": 0.01181542, "auxiliary_loss_mlp": 0.01028714, "balance_loss_clip": 1.05846667, "balance_loss_mlp": 1.02045834, "epoch": 0.3868213791859556, "flos": 23331522165120.0, "grad_norm": 1.8370442369841582, "language_loss": 0.89443552, "learning_rate": 2.80673104563119e-06, "loss": 0.91653812, "num_input_tokens_seen": 69258175, "step": 3217, "time_per_iteration": 3.7361063957214355 }, { "auxiliary_loss_clip": 0.0118533, "auxiliary_loss_mlp": 0.01026882, "balance_loss_clip": 1.05668283, "balance_loss_mlp": 1.01891875, "epoch": 0.3869416220765947, "flos": 18441530380800.0, "grad_norm": 1.7717570727214274, "language_loss": 0.78930056, "learning_rate": 2.8060181950713976e-06, "loss": 0.81142265, "num_input_tokens_seen": 69274965, "step": 3218, "time_per_iteration": 3.794506072998047 }, { "auxiliary_loss_clip": 0.01177627, "auxiliary_loss_mlp": 0.01031598, "balance_loss_clip": 1.05573797, "balance_loss_mlp": 1.02244258, "epoch": 0.3870618649672338, "flos": 15632992938240.0, "grad_norm": 2.596609264593969, "language_loss": 0.8096742, "learning_rate": 2.805305222239286e-06, "loss": 0.83176637, "num_input_tokens_seen": 69292220, "step": 3219, "time_per_iteration": 2.8170859813690186 }, { "auxiliary_loss_clip": 0.01180847, "auxiliary_loss_mlp": 0.01032509, "balance_loss_clip": 1.05604219, "balance_loss_mlp": 1.02365208, "epoch": 0.3871821078578729, "flos": 23513804709120.0, "grad_norm": 2.055685000271281, "language_loss": 0.73852754, "learning_rate": 2.8045921272430118e-06, "loss": 0.76066113, "num_input_tokens_seen": 69311900, "step": 3220, "time_per_iteration": 2.7917709350585938 }, { "auxiliary_loss_clip": 0.01192907, "auxiliary_loss_mlp": 0.01029059, "balance_loss_clip": 1.05599284, "balance_loss_mlp": 1.02080965, "epoch": 0.387302350748512, "flos": 17778259791360.0, "grad_norm": 2.232846465638462, "language_loss": 0.76531833, "learning_rate": 2.803878910190753e-06, "loss": 0.78753793, "num_input_tokens_seen": 69328820, "step": 3221, "time_per_iteration": 3.7177236080169678 }, { "auxiliary_loss_clip": 0.01190943, "auxiliary_loss_mlp": 0.01031647, "balance_loss_clip": 1.0565865, "balance_loss_mlp": 1.02328444, "epoch": 0.3874225936391511, "flos": 11503409097600.0, "grad_norm": 2.604676229207303, "language_loss": 0.82305574, "learning_rate": 2.8031655711907017e-06, "loss": 0.84528172, "num_input_tokens_seen": 69342525, "step": 3222, "time_per_iteration": 3.8438503742218018 }, { "auxiliary_loss_clip": 0.01191398, "auxiliary_loss_mlp": 0.01030095, "balance_loss_clip": 1.05984521, "balance_loss_mlp": 1.02196479, "epoch": 0.38754283652979016, "flos": 21945154884480.0, "grad_norm": 2.8960568891776433, "language_loss": 0.80715317, "learning_rate": 2.8024521103510723e-06, "loss": 0.82936811, "num_input_tokens_seen": 69359295, "step": 3223, "time_per_iteration": 2.8488879203796387 }, { "auxiliary_loss_clip": 0.01186631, "auxiliary_loss_mlp": 0.01021095, "balance_loss_clip": 1.05521262, "balance_loss_mlp": 1.01302421, "epoch": 0.38766307942042927, "flos": 21175984022400.0, "grad_norm": 1.8321769303220103, "language_loss": 0.75616801, "learning_rate": 2.8017385277800952e-06, "loss": 0.77824533, "num_input_tokens_seen": 69377650, "step": 3224, "time_per_iteration": 2.8250818252563477 }, { "auxiliary_loss_clip": 0.01190359, "auxiliary_loss_mlp": 0.01033642, "balance_loss_clip": 1.06018746, "balance_loss_mlp": 1.0250771, "epoch": 0.3877833223110684, "flos": 27417294391680.0, "grad_norm": 4.109594607337064, "language_loss": 0.74962246, "learning_rate": 2.8010248235860213e-06, "loss": 0.77186251, "num_input_tokens_seen": 69397765, "step": 3225, "time_per_iteration": 2.8561851978302 }, { "auxiliary_loss_clip": 0.0108623, "auxiliary_loss_mlp": 0.01053221, "balance_loss_clip": 1.01737022, "balance_loss_mlp": 1.00712669, "epoch": 0.38790356520170743, "flos": 64500019879680.0, "grad_norm": 0.8340817031472402, "language_loss": 0.6275903, "learning_rate": 2.8003109978771192e-06, "loss": 0.64898479, "num_input_tokens_seen": 69458930, "step": 3226, "time_per_iteration": 3.5430150032043457 }, { "auxiliary_loss_clip": 0.01174559, "auxiliary_loss_mlp": 0.01030535, "balance_loss_clip": 1.0542047, "balance_loss_mlp": 1.02225614, "epoch": 0.38802380809234654, "flos": 22345415112960.0, "grad_norm": 16.004959750155884, "language_loss": 0.79259145, "learning_rate": 2.799597050761674e-06, "loss": 0.81464243, "num_input_tokens_seen": 69475135, "step": 3227, "time_per_iteration": 2.9296741485595703 }, { "auxiliary_loss_clip": 0.01193216, "auxiliary_loss_mlp": 0.01024937, "balance_loss_clip": 1.05766809, "balance_loss_mlp": 1.01721191, "epoch": 0.38814405098298566, "flos": 25261361199360.0, "grad_norm": 2.105010111767984, "language_loss": 0.79434371, "learning_rate": 2.7988829823479924e-06, "loss": 0.81652522, "num_input_tokens_seen": 69493525, "step": 3228, "time_per_iteration": 2.815220832824707 }, { "auxiliary_loss_clip": 0.01180811, "auxiliary_loss_mlp": 0.01028738, "balance_loss_clip": 1.05711913, "balance_loss_mlp": 1.02017307, "epoch": 0.3882642938736247, "flos": 18841180078080.0, "grad_norm": 1.702540353251745, "language_loss": 0.63823783, "learning_rate": 2.7981687927443976e-06, "loss": 0.66033328, "num_input_tokens_seen": 69510325, "step": 3229, "time_per_iteration": 2.8369247913360596 }, { "auxiliary_loss_clip": 0.01186185, "auxiliary_loss_mlp": 0.01030749, "balance_loss_clip": 1.05578697, "balance_loss_mlp": 1.0225836, "epoch": 0.3883845367642638, "flos": 21652806090240.0, "grad_norm": 1.7964448417172305, "language_loss": 0.85728157, "learning_rate": 2.797454482059231e-06, "loss": 0.87945092, "num_input_tokens_seen": 69530480, "step": 3230, "time_per_iteration": 2.888551950454712 }, { "auxiliary_loss_clip": 0.01191187, "auxiliary_loss_mlp": 0.01030336, "balance_loss_clip": 1.05686188, "balance_loss_mlp": 1.02223575, "epoch": 0.3885047796549029, "flos": 20557530627840.0, "grad_norm": 2.0821356393505157, "language_loss": 0.84406185, "learning_rate": 2.7967400504008537e-06, "loss": 0.8662771, "num_input_tokens_seen": 69549780, "step": 3231, "time_per_iteration": 2.795400619506836 }, { "auxiliary_loss_clip": 0.01081601, "auxiliary_loss_mlp": 0.01008678, "balance_loss_clip": 1.01968956, "balance_loss_mlp": 1.00716436, "epoch": 0.388625022545542, "flos": 64325491695360.0, "grad_norm": 0.7941187366882527, "language_loss": 0.57450968, "learning_rate": 2.7960254978776456e-06, "loss": 0.59541249, "num_input_tokens_seen": 69611870, "step": 3232, "time_per_iteration": 3.3519458770751953 }, { "auxiliary_loss_clip": 0.01192065, "auxiliary_loss_mlp": 0.01028878, "balance_loss_clip": 1.05770636, "balance_loss_mlp": 1.02075386, "epoch": 0.3887452654361811, "flos": 18113881495680.0, "grad_norm": 2.2215886216008194, "language_loss": 0.81807673, "learning_rate": 2.7953108245980006e-06, "loss": 0.84028614, "num_input_tokens_seen": 69630385, "step": 3233, "time_per_iteration": 2.7491109371185303 }, { "auxiliary_loss_clip": 0.01178014, "auxiliary_loss_mlp": 0.01031887, "balance_loss_clip": 1.05623126, "balance_loss_mlp": 1.02409649, "epoch": 0.38886550832682015, "flos": 24975261371520.0, "grad_norm": 1.8980778294418867, "language_loss": 0.73958635, "learning_rate": 2.7945960306703365e-06, "loss": 0.76168537, "num_input_tokens_seen": 69653370, "step": 3234, "time_per_iteration": 2.817626476287842 }, { "auxiliary_loss_clip": 0.01189341, "auxiliary_loss_mlp": 0.01029728, "balance_loss_clip": 1.05584395, "balance_loss_mlp": 1.02079964, "epoch": 0.38898575121745926, "flos": 27199496275200.0, "grad_norm": 1.7453276259924115, "language_loss": 0.6549893, "learning_rate": 2.7938811162030865e-06, "loss": 0.67718005, "num_input_tokens_seen": 69673635, "step": 3235, "time_per_iteration": 2.875782012939453 }, { "auxiliary_loss_clip": 0.01186849, "auxiliary_loss_mlp": 0.01024957, "balance_loss_clip": 1.05686855, "balance_loss_mlp": 1.01765501, "epoch": 0.3891059941080984, "flos": 28763728727040.0, "grad_norm": 1.9051096598430732, "language_loss": 0.82286239, "learning_rate": 2.793166081304702e-06, "loss": 0.84498048, "num_input_tokens_seen": 69694130, "step": 3236, "time_per_iteration": 2.890371084213257 }, { "auxiliary_loss_clip": 0.01185112, "auxiliary_loss_mlp": 0.01029608, "balance_loss_clip": 1.05457354, "balance_loss_mlp": 1.0215373, "epoch": 0.38922623699873743, "flos": 22893447893760.0, "grad_norm": 2.1615421195187126, "language_loss": 0.82313848, "learning_rate": 2.7924509260836543e-06, "loss": 0.84528565, "num_input_tokens_seen": 69713255, "step": 3237, "time_per_iteration": 2.891578435897827 }, { "auxiliary_loss_clip": 0.0117807, "auxiliary_loss_mlp": 0.01034982, "balance_loss_clip": 1.0555768, "balance_loss_mlp": 1.02708399, "epoch": 0.38934647988937654, "flos": 19792418002560.0, "grad_norm": 1.4023815330605665, "language_loss": 0.68194288, "learning_rate": 2.791735650648431e-06, "loss": 0.70407337, "num_input_tokens_seen": 69732375, "step": 3238, "time_per_iteration": 2.794034957885742 }, { "auxiliary_loss_clip": 0.01177295, "auxiliary_loss_mlp": 0.01025548, "balance_loss_clip": 1.05303025, "balance_loss_mlp": 1.01735854, "epoch": 0.38946672278001565, "flos": 19202081978880.0, "grad_norm": 2.1133792140047047, "language_loss": 0.74242961, "learning_rate": 2.791020255107538e-06, "loss": 0.76445812, "num_input_tokens_seen": 69749745, "step": 3239, "time_per_iteration": 2.7879698276519775 }, { "auxiliary_loss_clip": 0.01180156, "auxiliary_loss_mlp": 0.01029031, "balance_loss_clip": 1.05678916, "balance_loss_mlp": 1.02090073, "epoch": 0.3895869656706547, "flos": 24936477661440.0, "grad_norm": 1.592929671362458, "language_loss": 0.80881929, "learning_rate": 2.7903047395695023e-06, "loss": 0.83091116, "num_input_tokens_seen": 69769645, "step": 3240, "time_per_iteration": 2.8597052097320557 }, { "auxiliary_loss_clip": 0.01184233, "auxiliary_loss_mlp": 0.01061572, "balance_loss_clip": 1.05620694, "balance_loss_mlp": 1.02190769, "epoch": 0.3897072085612938, "flos": 24133622820480.0, "grad_norm": 2.139066720730482, "language_loss": 0.89971167, "learning_rate": 2.789589104142865e-06, "loss": 0.92216969, "num_input_tokens_seen": 69787270, "step": 3241, "time_per_iteration": 2.8049678802490234 }, { "auxiliary_loss_clip": 0.0117857, "auxiliary_loss_mlp": 0.01032313, "balance_loss_clip": 1.05276942, "balance_loss_mlp": 1.02452826, "epoch": 0.3898274514519329, "flos": 17166342672000.0, "grad_norm": 20.528917523336496, "language_loss": 0.7652415, "learning_rate": 2.7888733489361895e-06, "loss": 0.7873503, "num_input_tokens_seen": 69805685, "step": 3242, "time_per_iteration": 2.8668696880340576 }, { "auxiliary_loss_clip": 0.01087983, "auxiliary_loss_mlp": 0.01005975, "balance_loss_clip": 1.02014303, "balance_loss_mlp": 1.00459802, "epoch": 0.389947694342572, "flos": 66074807952000.0, "grad_norm": 0.731582507091518, "language_loss": 0.58748996, "learning_rate": 2.788157474058054e-06, "loss": 0.60842955, "num_input_tokens_seen": 69867960, "step": 3243, "time_per_iteration": 4.216807842254639 }, { "auxiliary_loss_clip": 0.01184868, "auxiliary_loss_mlp": 0.01028924, "balance_loss_clip": 1.05327773, "balance_loss_mlp": 1.02078831, "epoch": 0.3900679372332111, "flos": 25740912700800.0, "grad_norm": 1.5631629738066393, "language_loss": 0.70041203, "learning_rate": 2.7874414796170555e-06, "loss": 0.72254992, "num_input_tokens_seen": 69889450, "step": 3244, "time_per_iteration": 2.906883955001831 }, { "auxiliary_loss_clip": 0.01183662, "auxiliary_loss_mlp": 0.01037212, "balance_loss_clip": 1.05593562, "balance_loss_mlp": 1.02795553, "epoch": 0.3901881801238502, "flos": 11801611808640.0, "grad_norm": 2.3730798270711713, "language_loss": 0.84059125, "learning_rate": 2.7867253657218113e-06, "loss": 0.8628, "num_input_tokens_seen": 69903340, "step": 3245, "time_per_iteration": 3.960406541824341 }, { "auxiliary_loss_clip": 0.01183768, "auxiliary_loss_mlp": 0.01056307, "balance_loss_clip": 1.05697143, "balance_loss_mlp": 1.01906741, "epoch": 0.39030842301448926, "flos": 27308951994240.0, "grad_norm": 1.8231862728959207, "language_loss": 0.73031902, "learning_rate": 2.7860091324809544e-06, "loss": 0.7527197, "num_input_tokens_seen": 69924400, "step": 3246, "time_per_iteration": 2.775116443634033 }, { "auxiliary_loss_clip": 0.01181873, "auxiliary_loss_mlp": 0.01033833, "balance_loss_clip": 1.05535245, "balance_loss_mlp": 1.02589321, "epoch": 0.39042866590512837, "flos": 27163334257920.0, "grad_norm": 1.624474017862182, "language_loss": 0.80918217, "learning_rate": 2.7852927800031377e-06, "loss": 0.83133924, "num_input_tokens_seen": 69944565, "step": 3247, "time_per_iteration": 3.668849468231201 }, { "auxiliary_loss_clip": 0.01185874, "auxiliary_loss_mlp": 0.0102563, "balance_loss_clip": 1.05728769, "balance_loss_mlp": 1.0176909, "epoch": 0.3905489087957674, "flos": 29716115886720.0, "grad_norm": 1.8022361027030833, "language_loss": 0.827613, "learning_rate": 2.7845763083970298e-06, "loss": 0.84972799, "num_input_tokens_seen": 69964965, "step": 3248, "time_per_iteration": 3.7771944999694824 }, { "auxiliary_loss_clip": 0.01181486, "auxiliary_loss_mlp": 0.01027776, "balance_loss_clip": 1.05645502, "balance_loss_mlp": 1.01889491, "epoch": 0.39066915168640653, "flos": 24498618871680.0, "grad_norm": 1.8561279580177772, "language_loss": 0.81983238, "learning_rate": 2.7838597177713205e-06, "loss": 0.84192502, "num_input_tokens_seen": 69986055, "step": 3249, "time_per_iteration": 2.7733724117279053 }, { "auxiliary_loss_clip": 0.0116265, "auxiliary_loss_mlp": 0.01023548, "balance_loss_clip": 1.056229, "balance_loss_mlp": 1.0157578, "epoch": 0.39078939457704565, "flos": 20558572122240.0, "grad_norm": 2.1567341490468506, "language_loss": 0.73611581, "learning_rate": 2.7831430082347143e-06, "loss": 0.75797778, "num_input_tokens_seen": 70005260, "step": 3250, "time_per_iteration": 2.7835094928741455 }, { "auxiliary_loss_clip": 0.01187547, "auxiliary_loss_mlp": 0.01059906, "balance_loss_clip": 1.05718851, "balance_loss_mlp": 1.02104378, "epoch": 0.3909096374676847, "flos": 22783417557120.0, "grad_norm": 2.2515289700014813, "language_loss": 0.82202846, "learning_rate": 2.7824261798959373e-06, "loss": 0.84450305, "num_input_tokens_seen": 70023440, "step": 3251, "time_per_iteration": 2.7685484886169434 }, { "auxiliary_loss_clip": 0.01184933, "auxiliary_loss_mlp": 0.01032562, "balance_loss_clip": 1.05429149, "balance_loss_mlp": 1.02460432, "epoch": 0.3910298803583238, "flos": 23003119094400.0, "grad_norm": 1.8712583546866606, "language_loss": 0.7942462, "learning_rate": 2.78170923286373e-06, "loss": 0.81642109, "num_input_tokens_seen": 70043040, "step": 3252, "time_per_iteration": 2.777754306793213 }, { "auxiliary_loss_clip": 0.01161798, "auxiliary_loss_mlp": 0.01031349, "balance_loss_clip": 1.05634367, "balance_loss_mlp": 1.0225035, "epoch": 0.3911501232489629, "flos": 24316264500480.0, "grad_norm": 2.8485716524234834, "language_loss": 0.84184515, "learning_rate": 2.780992167246854e-06, "loss": 0.86377662, "num_input_tokens_seen": 70060565, "step": 3253, "time_per_iteration": 2.8642964363098145 }, { "auxiliary_loss_clip": 0.01088717, "auxiliary_loss_mlp": 0.01001869, "balance_loss_clip": 1.01983523, "balance_loss_mlp": 1.00039721, "epoch": 0.391270366139602, "flos": 60869054684160.0, "grad_norm": 0.9949237549226352, "language_loss": 0.72199512, "learning_rate": 2.7802749831540883e-06, "loss": 0.74290097, "num_input_tokens_seen": 70119465, "step": 3254, "time_per_iteration": 3.347283363342285 }, { "auxiliary_loss_clip": 0.01180182, "auxiliary_loss_mlp": 0.01026901, "balance_loss_clip": 1.055691, "balance_loss_mlp": 1.01989758, "epoch": 0.3913906090302411, "flos": 21543494025600.0, "grad_norm": 2.2261830027095995, "language_loss": 0.82277918, "learning_rate": 2.7795576806942268e-06, "loss": 0.84484994, "num_input_tokens_seen": 70138270, "step": 3255, "time_per_iteration": 2.832268714904785 }, { "auxiliary_loss_clip": 0.01081179, "auxiliary_loss_mlp": 0.01016429, "balance_loss_clip": 1.02037549, "balance_loss_mlp": 1.01474845, "epoch": 0.3915108519208802, "flos": 49839953702400.0, "grad_norm": 0.7564429266850232, "language_loss": 0.54927945, "learning_rate": 2.778840259976085e-06, "loss": 0.57025558, "num_input_tokens_seen": 70193500, "step": 3256, "time_per_iteration": 3.3435516357421875 }, { "auxiliary_loss_clip": 0.01186697, "auxiliary_loss_mlp": 0.01027675, "balance_loss_clip": 1.05566347, "balance_loss_mlp": 1.01944327, "epoch": 0.39163109481151925, "flos": 16506447960960.0, "grad_norm": 2.244475186267117, "language_loss": 0.76936978, "learning_rate": 2.778122721108495e-06, "loss": 0.79151344, "num_input_tokens_seen": 70211730, "step": 3257, "time_per_iteration": 2.766171455383301 }, { "auxiliary_loss_clip": 0.01184627, "auxiliary_loss_mlp": 0.01029751, "balance_loss_clip": 1.05703604, "balance_loss_mlp": 1.02137613, "epoch": 0.39175133770215836, "flos": 26067484177920.0, "grad_norm": 2.0523131752439046, "language_loss": 0.88529992, "learning_rate": 2.7774050642003076e-06, "loss": 0.90744376, "num_input_tokens_seen": 70232540, "step": 3258, "time_per_iteration": 2.84831166267395 }, { "auxiliary_loss_clip": 0.01192428, "auxiliary_loss_mlp": 0.01032565, "balance_loss_clip": 1.05695546, "balance_loss_mlp": 1.02371407, "epoch": 0.3918715805927975, "flos": 21872076664320.0, "grad_norm": 4.036849834049557, "language_loss": 0.93334413, "learning_rate": 2.7766872893603896e-06, "loss": 0.95559406, "num_input_tokens_seen": 70252515, "step": 3259, "time_per_iteration": 2.752519369125366 }, { "auxiliary_loss_clip": 0.01186996, "auxiliary_loss_mlp": 0.01024432, "balance_loss_clip": 1.05590987, "balance_loss_mlp": 1.0174526, "epoch": 0.39199182348343653, "flos": 20376181837440.0, "grad_norm": 1.6565774141252447, "language_loss": 0.7324959, "learning_rate": 2.7759693966976275e-06, "loss": 0.75461018, "num_input_tokens_seen": 70271020, "step": 3260, "time_per_iteration": 2.7490854263305664 }, { "auxiliary_loss_clip": 0.01179182, "auxiliary_loss_mlp": 0.01027884, "balance_loss_clip": 1.05633044, "balance_loss_mlp": 1.01964641, "epoch": 0.39211206637407564, "flos": 21683545153920.0, "grad_norm": 2.5161537832134053, "language_loss": 0.85446262, "learning_rate": 2.7752513863209242e-06, "loss": 0.87653333, "num_input_tokens_seen": 70289600, "step": 3261, "time_per_iteration": 2.821998119354248 }, { "auxiliary_loss_clip": 0.01176236, "auxiliary_loss_mlp": 0.01064873, "balance_loss_clip": 1.05509353, "balance_loss_mlp": 1.02554536, "epoch": 0.39223230926471475, "flos": 21066276908160.0, "grad_norm": 1.708213045876506, "language_loss": 0.84320664, "learning_rate": 2.774533258339203e-06, "loss": 0.86561775, "num_input_tokens_seen": 70307060, "step": 3262, "time_per_iteration": 2.953439712524414 }, { "auxiliary_loss_clip": 0.01177036, "auxiliary_loss_mlp": 0.01033921, "balance_loss_clip": 1.05452728, "balance_loss_mlp": 1.02516484, "epoch": 0.3923525521553538, "flos": 17603016312960.0, "grad_norm": 2.4227133021881313, "language_loss": 0.79511392, "learning_rate": 2.7738150128614014e-06, "loss": 0.81722349, "num_input_tokens_seen": 70324465, "step": 3263, "time_per_iteration": 2.6968820095062256 }, { "auxiliary_loss_clip": 0.01168976, "auxiliary_loss_mlp": 0.01035117, "balance_loss_clip": 1.05448651, "balance_loss_mlp": 1.02630115, "epoch": 0.3924727950459929, "flos": 20558284813440.0, "grad_norm": 5.704212541625771, "language_loss": 0.89846396, "learning_rate": 2.7730966499964777e-06, "loss": 0.92050493, "num_input_tokens_seen": 70341415, "step": 3264, "time_per_iteration": 2.790893316268921 }, { "auxiliary_loss_clip": 0.01189946, "auxiliary_loss_mlp": 0.01032633, "balance_loss_clip": 1.05507672, "balance_loss_mlp": 1.02359688, "epoch": 0.39259303793663197, "flos": 16216110328320.0, "grad_norm": 4.6183812451355735, "language_loss": 0.80676484, "learning_rate": 2.772378169853408e-06, "loss": 0.82899058, "num_input_tokens_seen": 70358985, "step": 3265, "time_per_iteration": 2.746913194656372 }, { "auxiliary_loss_clip": 0.01177678, "auxiliary_loss_mlp": 0.0103529, "balance_loss_clip": 1.05589068, "balance_loss_mlp": 1.02726102, "epoch": 0.3927132808272711, "flos": 16797001075200.0, "grad_norm": 2.253052461349756, "language_loss": 0.74403679, "learning_rate": 2.771659572541183e-06, "loss": 0.76616645, "num_input_tokens_seen": 70376915, "step": 3266, "time_per_iteration": 2.7981152534484863 }, { "auxiliary_loss_clip": 0.01187983, "auxiliary_loss_mlp": 0.0102617, "balance_loss_clip": 1.05708742, "balance_loss_mlp": 1.01897001, "epoch": 0.3928335237179102, "flos": 20267228908800.0, "grad_norm": 2.0574867928332137, "language_loss": 0.86998492, "learning_rate": 2.7709408581688143e-06, "loss": 0.8921265, "num_input_tokens_seen": 70396900, "step": 3267, "time_per_iteration": 2.786226511001587 }, { "auxiliary_loss_clip": 0.01184591, "auxiliary_loss_mlp": 0.01033098, "balance_loss_clip": 1.05716372, "balance_loss_mlp": 1.02510476, "epoch": 0.39295376660854925, "flos": 24973250209920.0, "grad_norm": 1.9275599309538538, "language_loss": 0.87832868, "learning_rate": 2.7702220268453307e-06, "loss": 0.90050554, "num_input_tokens_seen": 70417260, "step": 3268, "time_per_iteration": 2.8880486488342285 }, { "auxiliary_loss_clip": 0.01182724, "auxiliary_loss_mlp": 0.01028467, "balance_loss_clip": 1.0540731, "balance_loss_mlp": 1.02012789, "epoch": 0.39307400949918836, "flos": 18697788984960.0, "grad_norm": 2.4178011544966997, "language_loss": 0.84625506, "learning_rate": 2.7695030786797785e-06, "loss": 0.86836702, "num_input_tokens_seen": 70433155, "step": 3269, "time_per_iteration": 3.680187463760376 }, { "auxiliary_loss_clip": 0.01173877, "auxiliary_loss_mlp": 0.01035262, "balance_loss_clip": 1.05561805, "balance_loss_mlp": 1.02662563, "epoch": 0.39319425238982747, "flos": 22415476590720.0, "grad_norm": 2.6354878119213945, "language_loss": 0.7482363, "learning_rate": 2.7687840137812206e-06, "loss": 0.77032763, "num_input_tokens_seen": 70451240, "step": 3270, "time_per_iteration": 2.8046483993530273 }, { "auxiliary_loss_clip": 0.01080698, "auxiliary_loss_mlp": 0.01023818, "balance_loss_clip": 1.01771092, "balance_loss_mlp": 1.02223253, "epoch": 0.3933144952804665, "flos": 66192954762240.0, "grad_norm": 0.7983251639002813, "language_loss": 0.6203621, "learning_rate": 2.7680648322587395e-06, "loss": 0.64140725, "num_input_tokens_seen": 70516115, "step": 3271, "time_per_iteration": 4.3572821617126465 }, { "auxiliary_loss_clip": 0.01187426, "auxiliary_loss_mlp": 0.01029712, "balance_loss_clip": 1.0551362, "balance_loss_mlp": 1.02123046, "epoch": 0.39343473817110564, "flos": 15487159720320.0, "grad_norm": 1.9075317427833183, "language_loss": 0.8089937, "learning_rate": 2.7673455342214334e-06, "loss": 0.83116508, "num_input_tokens_seen": 70533105, "step": 3272, "time_per_iteration": 2.821870803833008 }, { "auxiliary_loss_clip": 0.01185119, "auxiliary_loss_mlp": 0.01029252, "balance_loss_clip": 1.05514538, "balance_loss_mlp": 1.02157533, "epoch": 0.39355498106174475, "flos": 21324905809920.0, "grad_norm": 2.018015162129214, "language_loss": 0.75739002, "learning_rate": 2.7666261197784198e-06, "loss": 0.77953368, "num_input_tokens_seen": 70551920, "step": 3273, "time_per_iteration": 3.8522305488586426 }, { "auxiliary_loss_clip": 0.01178121, "auxiliary_loss_mlp": 0.01027078, "balance_loss_clip": 1.05605769, "balance_loss_mlp": 1.0194366, "epoch": 0.3936752239523838, "flos": 13296357400320.0, "grad_norm": 2.121130924440967, "language_loss": 0.76628399, "learning_rate": 2.7659065890388336e-06, "loss": 0.78833592, "num_input_tokens_seen": 70567920, "step": 3274, "time_per_iteration": 2.80045747756958 }, { "auxiliary_loss_clip": 0.01184102, "auxiliary_loss_mlp": 0.01033752, "balance_loss_clip": 1.05459309, "balance_loss_mlp": 1.02541947, "epoch": 0.3937954668430229, "flos": 16800161472000.0, "grad_norm": 1.991678143603365, "language_loss": 0.84852445, "learning_rate": 2.7651869421118266e-06, "loss": 0.87070298, "num_input_tokens_seen": 70584530, "step": 3275, "time_per_iteration": 3.8709259033203125 }, { "auxiliary_loss_clip": 0.01189792, "auxiliary_loss_mlp": 0.01029632, "balance_loss_clip": 1.05851781, "balance_loss_mlp": 1.02115607, "epoch": 0.393915709733662, "flos": 21064229832960.0, "grad_norm": 1.6701205861905455, "language_loss": 0.83089733, "learning_rate": 2.76446717910657e-06, "loss": 0.85309154, "num_input_tokens_seen": 70605235, "step": 3276, "time_per_iteration": 2.918304443359375 }, { "auxiliary_loss_clip": 0.01180621, "auxiliary_loss_mlp": 0.01027339, "balance_loss_clip": 1.05425215, "balance_loss_mlp": 1.01985312, "epoch": 0.3940359526243011, "flos": 17165265264000.0, "grad_norm": 2.7979544196088306, "language_loss": 0.767093, "learning_rate": 2.763747300132249e-06, "loss": 0.78917265, "num_input_tokens_seen": 70622675, "step": 3277, "time_per_iteration": 2.9313793182373047 }, { "auxiliary_loss_clip": 0.01187703, "auxiliary_loss_mlp": 0.01028956, "balance_loss_clip": 1.05519998, "balance_loss_mlp": 1.02083838, "epoch": 0.3941561955149402, "flos": 20995856294400.0, "grad_norm": 1.7145315604471898, "language_loss": 0.86617708, "learning_rate": 2.7630273052980704e-06, "loss": 0.88834369, "num_input_tokens_seen": 70643265, "step": 3278, "time_per_iteration": 2.7949435710906982 }, { "auxiliary_loss_clip": 0.01173785, "auxiliary_loss_mlp": 0.01029539, "balance_loss_clip": 1.05633008, "balance_loss_mlp": 1.02106357, "epoch": 0.39427643840557924, "flos": 18843406721280.0, "grad_norm": 2.1698741329132663, "language_loss": 0.67232573, "learning_rate": 2.7623071947132554e-06, "loss": 0.69435906, "num_input_tokens_seen": 70660295, "step": 3279, "time_per_iteration": 2.7857940196990967 }, { "auxiliary_loss_clip": 0.01190012, "auxiliary_loss_mlp": 0.01026612, "balance_loss_clip": 1.05619264, "balance_loss_mlp": 1.01816607, "epoch": 0.39439668129621835, "flos": 23258659426560.0, "grad_norm": 2.249952031856281, "language_loss": 0.78769064, "learning_rate": 2.7615869684870458e-06, "loss": 0.80985689, "num_input_tokens_seen": 70679605, "step": 3280, "time_per_iteration": 2.8353450298309326 }, { "auxiliary_loss_clip": 0.01182183, "auxiliary_loss_mlp": 0.01026752, "balance_loss_clip": 1.05550671, "balance_loss_mlp": 1.01893795, "epoch": 0.39451692418685746, "flos": 26652289507200.0, "grad_norm": 1.7343353209920431, "language_loss": 0.85011303, "learning_rate": 2.7608666267286986e-06, "loss": 0.8722024, "num_input_tokens_seen": 70699835, "step": 3281, "time_per_iteration": 2.804378032684326 }, { "auxiliary_loss_clip": 0.01168646, "auxiliary_loss_mlp": 0.01034525, "balance_loss_clip": 1.05260873, "balance_loss_mlp": 1.02562594, "epoch": 0.3946371670774965, "flos": 18258709132800.0, "grad_norm": 2.3478476344860524, "language_loss": 0.86954528, "learning_rate": 2.760146169547489e-06, "loss": 0.89157701, "num_input_tokens_seen": 70716600, "step": 3282, "time_per_iteration": 2.8422954082489014 }, { "auxiliary_loss_clip": 0.01187346, "auxiliary_loss_mlp": 0.01033627, "balance_loss_clip": 1.0585835, "balance_loss_mlp": 1.02543128, "epoch": 0.39475740996813563, "flos": 24206126423040.0, "grad_norm": 1.556624580653669, "language_loss": 0.7658565, "learning_rate": 2.75942559705271e-06, "loss": 0.78806615, "num_input_tokens_seen": 70736335, "step": 3283, "time_per_iteration": 2.72694993019104 }, { "auxiliary_loss_clip": 0.01185331, "auxiliary_loss_mlp": 0.01030321, "balance_loss_clip": 1.05599284, "balance_loss_mlp": 1.02214372, "epoch": 0.39487765285877474, "flos": 19317858491520.0, "grad_norm": 1.9534301073985985, "language_loss": 0.88735151, "learning_rate": 2.7587049093536713e-06, "loss": 0.90950799, "num_input_tokens_seen": 70752665, "step": 3284, "time_per_iteration": 2.7413268089294434 }, { "auxiliary_loss_clip": 0.01187898, "auxiliary_loss_mlp": 0.01028026, "balance_loss_clip": 1.05507851, "balance_loss_mlp": 1.01989615, "epoch": 0.3949978957494138, "flos": 17311744926720.0, "grad_norm": 3.43005564247958, "language_loss": 0.80979186, "learning_rate": 2.757984106559701e-06, "loss": 0.83195114, "num_input_tokens_seen": 70771650, "step": 3285, "time_per_iteration": 2.7291061878204346 }, { "auxiliary_loss_clip": 0.01175997, "auxiliary_loss_mlp": 0.01031881, "balance_loss_clip": 1.05576122, "balance_loss_mlp": 1.0232029, "epoch": 0.3951181386400529, "flos": 36317861280000.0, "grad_norm": 2.3628752552720282, "language_loss": 0.71133459, "learning_rate": 2.7572631887801446e-06, "loss": 0.7334134, "num_input_tokens_seen": 70793275, "step": 3286, "time_per_iteration": 2.866502523422241 }, { "auxiliary_loss_clip": 0.01184374, "auxiliary_loss_mlp": 0.01034507, "balance_loss_clip": 1.05507076, "balance_loss_mlp": 1.02630556, "epoch": 0.395238381530692, "flos": 23110348170240.0, "grad_norm": 1.6664008305930873, "language_loss": 0.76461267, "learning_rate": 2.7565421561243654e-06, "loss": 0.78680146, "num_input_tokens_seen": 70811440, "step": 3287, "time_per_iteration": 2.7686147689819336 }, { "auxiliary_loss_clip": 0.01172586, "auxiliary_loss_mlp": 0.01029515, "balance_loss_clip": 1.05562055, "balance_loss_mlp": 1.0213964, "epoch": 0.3953586244213311, "flos": 24347614095360.0, "grad_norm": 2.246678053396392, "language_loss": 0.81944793, "learning_rate": 2.7558210087017413e-06, "loss": 0.84146893, "num_input_tokens_seen": 70831375, "step": 3288, "time_per_iteration": 2.7693727016448975 }, { "auxiliary_loss_clip": 0.01171981, "auxiliary_loss_mlp": 0.0103084, "balance_loss_clip": 1.05544221, "balance_loss_mlp": 1.02222669, "epoch": 0.3954788673119702, "flos": 23440080044160.0, "grad_norm": 2.178237827966341, "language_loss": 0.73823929, "learning_rate": 2.7550997466216724e-06, "loss": 0.7602675, "num_input_tokens_seen": 70849170, "step": 3289, "time_per_iteration": 2.834315538406372 }, { "auxiliary_loss_clip": 0.01177689, "auxiliary_loss_mlp": 0.01034399, "balance_loss_clip": 1.0561235, "balance_loss_mlp": 1.02576208, "epoch": 0.3955991102026093, "flos": 17494063384320.0, "grad_norm": 2.1192374632775604, "language_loss": 0.81170148, "learning_rate": 2.7543783699935714e-06, "loss": 0.83382231, "num_input_tokens_seen": 70867200, "step": 3290, "time_per_iteration": 2.7268261909484863 }, { "auxiliary_loss_clip": 0.01185704, "auxiliary_loss_mlp": 0.01032119, "balance_loss_clip": 1.05952549, "balance_loss_mlp": 1.02445698, "epoch": 0.39571935309324835, "flos": 18221326053120.0, "grad_norm": 2.544315177669122, "language_loss": 0.86543745, "learning_rate": 2.753656878926872e-06, "loss": 0.88761568, "num_input_tokens_seen": 70883080, "step": 3291, "time_per_iteration": 2.7823078632354736 }, { "auxiliary_loss_clip": 0.01171802, "auxiliary_loss_mlp": 0.01028723, "balance_loss_clip": 1.05400074, "balance_loss_mlp": 1.01963902, "epoch": 0.39583959598388746, "flos": 17748813617280.0, "grad_norm": 2.2308621702843348, "language_loss": 0.73892814, "learning_rate": 2.752935273531023e-06, "loss": 0.76093334, "num_input_tokens_seen": 70901230, "step": 3292, "time_per_iteration": 2.7375762462615967 }, { "auxiliary_loss_clip": 0.01189446, "auxiliary_loss_mlp": 0.01033208, "balance_loss_clip": 1.05838037, "balance_loss_mlp": 1.02344489, "epoch": 0.39595983887452657, "flos": 19352368483200.0, "grad_norm": 1.9340951250134741, "language_loss": 0.78483927, "learning_rate": 2.752213553915492e-06, "loss": 0.80706578, "num_input_tokens_seen": 70919585, "step": 3293, "time_per_iteration": 2.6446735858917236 }, { "auxiliary_loss_clip": 0.01082284, "auxiliary_loss_mlp": 0.01005538, "balance_loss_clip": 1.02011347, "balance_loss_mlp": 1.00397611, "epoch": 0.3960800817651656, "flos": 60682282940160.0, "grad_norm": 0.822675434309886, "language_loss": 0.66044343, "learning_rate": 2.751491720189762e-06, "loss": 0.68132162, "num_input_tokens_seen": 70977695, "step": 3294, "time_per_iteration": 3.243425130844116 }, { "auxiliary_loss_clip": 0.0118116, "auxiliary_loss_mlp": 0.01060038, "balance_loss_clip": 1.05627561, "balance_loss_mlp": 1.02141225, "epoch": 0.39620032465580474, "flos": 16836718538880.0, "grad_norm": 2.212495736768351, "language_loss": 0.91458076, "learning_rate": 2.7507697724633364e-06, "loss": 0.93699276, "num_input_tokens_seen": 70994455, "step": 3295, "time_per_iteration": 3.6390974521636963 }, { "auxiliary_loss_clip": 0.01088764, "auxiliary_loss_mlp": 0.0101189, "balance_loss_clip": 1.02746141, "balance_loss_mlp": 1.01042366, "epoch": 0.3963205675464438, "flos": 69071445941760.0, "grad_norm": 0.7757645549512151, "language_loss": 0.54634452, "learning_rate": 2.7500477108457327e-06, "loss": 0.56735104, "num_input_tokens_seen": 71046465, "step": 3296, "time_per_iteration": 3.1954143047332764 }, { "auxiliary_loss_clip": 0.01183517, "auxiliary_loss_mlp": 0.01031441, "balance_loss_clip": 1.05520332, "balance_loss_mlp": 1.02235746, "epoch": 0.3964408104370829, "flos": 25667439431040.0, "grad_norm": 2.0348715537547286, "language_loss": 0.81094265, "learning_rate": 2.7493255354464877e-06, "loss": 0.83309221, "num_input_tokens_seen": 71064275, "step": 3297, "time_per_iteration": 3.7356786727905273 }, { "auxiliary_loss_clip": 0.01150522, "auxiliary_loss_mlp": 0.01028787, "balance_loss_clip": 1.05548644, "balance_loss_mlp": 1.01971495, "epoch": 0.396561053327722, "flos": 24277480790400.0, "grad_norm": 1.7651888640793667, "language_loss": 0.75899041, "learning_rate": 2.748603246375156e-06, "loss": 0.78078347, "num_input_tokens_seen": 71082290, "step": 3298, "time_per_iteration": 3.0930330753326416 }, { "auxiliary_loss_clip": 0.01190419, "auxiliary_loss_mlp": 0.01024484, "balance_loss_clip": 1.05818641, "balance_loss_mlp": 1.01627648, "epoch": 0.39668129621836107, "flos": 20522302364160.0, "grad_norm": 3.081099549937445, "language_loss": 0.69743365, "learning_rate": 2.7478808437413055e-06, "loss": 0.71958268, "num_input_tokens_seen": 71101700, "step": 3299, "time_per_iteration": 3.8727993965148926 }, { "auxiliary_loss_clip": 0.01174843, "auxiliary_loss_mlp": 0.01029129, "balance_loss_clip": 1.05653977, "balance_loss_mlp": 1.02080202, "epoch": 0.3968015391090002, "flos": 27052585649280.0, "grad_norm": 5.033102953469433, "language_loss": 0.66096449, "learning_rate": 2.7471583276545263e-06, "loss": 0.68300426, "num_input_tokens_seen": 71122360, "step": 3300, "time_per_iteration": 3.925626516342163 }, { "auxiliary_loss_clip": 0.01183087, "auxiliary_loss_mlp": 0.01029161, "balance_loss_clip": 1.05472744, "balance_loss_mlp": 1.0207274, "epoch": 0.3969217819996393, "flos": 12531819392640.0, "grad_norm": 2.1196723802715938, "language_loss": 0.71156377, "learning_rate": 2.7464356982244224e-06, "loss": 0.73368621, "num_input_tokens_seen": 71140360, "step": 3301, "time_per_iteration": 2.776516914367676 }, { "auxiliary_loss_clip": 0.01089178, "auxiliary_loss_mlp": 0.0100863, "balance_loss_clip": 1.02691555, "balance_loss_mlp": 1.00708032, "epoch": 0.39704202489027834, "flos": 66241399230720.0, "grad_norm": 0.7726117157839542, "language_loss": 0.61810338, "learning_rate": 2.745712955560617e-06, "loss": 0.63908148, "num_input_tokens_seen": 71196565, "step": 3302, "time_per_iteration": 3.2999930381774902 }, { "auxiliary_loss_clip": 0.01171997, "auxiliary_loss_mlp": 0.01029547, "balance_loss_clip": 1.05751061, "balance_loss_mlp": 1.02098227, "epoch": 0.39716226778091746, "flos": 16982982720000.0, "grad_norm": 2.4605208729479977, "language_loss": 0.76850957, "learning_rate": 2.7449900997727496e-06, "loss": 0.79052496, "num_input_tokens_seen": 71214675, "step": 3303, "time_per_iteration": 2.9158055782318115 }, { "auxiliary_loss_clip": 0.01176262, "auxiliary_loss_mlp": 0.01029608, "balance_loss_clip": 1.05602181, "balance_loss_mlp": 1.02058983, "epoch": 0.39728251067155657, "flos": 23477139901440.0, "grad_norm": 1.7831552235194659, "language_loss": 0.84247112, "learning_rate": 2.744267130970476e-06, "loss": 0.86452985, "num_input_tokens_seen": 71234400, "step": 3304, "time_per_iteration": 2.8447158336639404 }, { "auxiliary_loss_clip": 0.0117974, "auxiliary_loss_mlp": 0.01034079, "balance_loss_clip": 1.05842948, "balance_loss_mlp": 1.02535295, "epoch": 0.3974027535621956, "flos": 20704441253760.0, "grad_norm": 2.030252391417821, "language_loss": 0.76922679, "learning_rate": 2.7435440492634697e-06, "loss": 0.79136491, "num_input_tokens_seen": 71253725, "step": 3305, "time_per_iteration": 2.803697109222412 }, { "auxiliary_loss_clip": 0.01185738, "auxiliary_loss_mlp": 0.01031806, "balance_loss_clip": 1.05654645, "balance_loss_mlp": 1.0219357, "epoch": 0.39752299645283473, "flos": 21543278544000.0, "grad_norm": 2.2567328615307143, "language_loss": 0.67528665, "learning_rate": 2.7428208547614228e-06, "loss": 0.69746208, "num_input_tokens_seen": 71273220, "step": 3306, "time_per_iteration": 2.8928351402282715 }, { "auxiliary_loss_clip": 0.01186091, "auxiliary_loss_mlp": 0.0103099, "balance_loss_clip": 1.05606282, "balance_loss_mlp": 1.0221386, "epoch": 0.39764323934347384, "flos": 19208295031680.0, "grad_norm": 2.003848162681728, "language_loss": 0.77407062, "learning_rate": 2.742097547574043e-06, "loss": 0.7962414, "num_input_tokens_seen": 71291445, "step": 3307, "time_per_iteration": 3.014770984649658 }, { "auxiliary_loss_clip": 0.01189273, "auxiliary_loss_mlp": 0.01054946, "balance_loss_clip": 1.05724823, "balance_loss_mlp": 1.01576459, "epoch": 0.3977634822341129, "flos": 20850202644480.0, "grad_norm": 16.680115127696062, "language_loss": 0.77853847, "learning_rate": 2.7413741278110544e-06, "loss": 0.80098069, "num_input_tokens_seen": 71310135, "step": 3308, "time_per_iteration": 2.8380825519561768 }, { "auxiliary_loss_clip": 0.01186405, "auxiliary_loss_mlp": 0.010323, "balance_loss_clip": 1.05748677, "balance_loss_mlp": 1.02399123, "epoch": 0.397883725124752, "flos": 39786042038400.0, "grad_norm": 2.1607885619110925, "language_loss": 0.68822199, "learning_rate": 2.7406505955822016e-06, "loss": 0.71040905, "num_input_tokens_seen": 71331160, "step": 3309, "time_per_iteration": 2.9377307891845703 }, { "auxiliary_loss_clip": 0.01181919, "auxiliary_loss_mlp": 0.01033256, "balance_loss_clip": 1.05523288, "balance_loss_mlp": 1.02442825, "epoch": 0.39800396801539106, "flos": 17379507934080.0, "grad_norm": 5.531534933523638, "language_loss": 0.66060007, "learning_rate": 2.7399269509972415e-06, "loss": 0.68275177, "num_input_tokens_seen": 71345315, "step": 3310, "time_per_iteration": 2.825702428817749 }, { "auxiliary_loss_clip": 0.01180306, "auxiliary_loss_mlp": 0.01032185, "balance_loss_clip": 1.05658352, "balance_loss_mlp": 1.023, "epoch": 0.3981242109060302, "flos": 19202764337280.0, "grad_norm": 4.084418477678058, "language_loss": 0.85402238, "learning_rate": 2.7392031941659514e-06, "loss": 0.87614739, "num_input_tokens_seen": 71363160, "step": 3311, "time_per_iteration": 2.7399978637695312 }, { "auxiliary_loss_clip": 0.0118505, "auxiliary_loss_mlp": 0.01027512, "balance_loss_clip": 1.06021166, "balance_loss_mlp": 1.01914334, "epoch": 0.3982444537966693, "flos": 24565124903040.0, "grad_norm": 2.1417426755296023, "language_loss": 0.86463082, "learning_rate": 2.7384793251981244e-06, "loss": 0.88675642, "num_input_tokens_seen": 71382145, "step": 3312, "time_per_iteration": 2.915886163711548 }, { "auxiliary_loss_clip": 0.0119059, "auxiliary_loss_mlp": 0.01031351, "balance_loss_clip": 1.05701852, "balance_loss_mlp": 1.02291656, "epoch": 0.39836469668730834, "flos": 26213856099840.0, "grad_norm": 4.508271279842097, "language_loss": 0.81162441, "learning_rate": 2.737755344203571e-06, "loss": 0.83384383, "num_input_tokens_seen": 71402095, "step": 3313, "time_per_iteration": 2.8290483951568604 }, { "auxiliary_loss_clip": 0.01186744, "auxiliary_loss_mlp": 0.01034822, "balance_loss_clip": 1.0565455, "balance_loss_mlp": 1.02632809, "epoch": 0.39848493957794745, "flos": 27636134002560.0, "grad_norm": 1.6351670010597603, "language_loss": 0.79899192, "learning_rate": 2.7370312512921186e-06, "loss": 0.82120764, "num_input_tokens_seen": 71423875, "step": 3314, "time_per_iteration": 2.8078153133392334 }, { "auxiliary_loss_clip": 0.01188606, "auxiliary_loss_mlp": 0.01033441, "balance_loss_clip": 1.05589938, "balance_loss_mlp": 1.02394032, "epoch": 0.39860518246858656, "flos": 12239326944000.0, "grad_norm": 4.927452920718998, "language_loss": 0.76850617, "learning_rate": 2.736307046573611e-06, "loss": 0.79072666, "num_input_tokens_seen": 71439745, "step": 3315, "time_per_iteration": 2.762326717376709 }, { "auxiliary_loss_clip": 0.01186605, "auxiliary_loss_mlp": 0.01027521, "balance_loss_clip": 1.05490804, "balance_loss_mlp": 1.01911688, "epoch": 0.3987254253592256, "flos": 22379135005440.0, "grad_norm": 2.0635892485372187, "language_loss": 0.81842029, "learning_rate": 2.73558273015791e-06, "loss": 0.84056157, "num_input_tokens_seen": 71459575, "step": 3316, "time_per_iteration": 2.675956964492798 }, { "auxiliary_loss_clip": 0.01191981, "auxiliary_loss_mlp": 0.010322, "balance_loss_clip": 1.05777788, "balance_loss_mlp": 1.0235033, "epoch": 0.3988456682498647, "flos": 23514020190720.0, "grad_norm": 2.2618847563431483, "language_loss": 0.70517409, "learning_rate": 2.734858302154894e-06, "loss": 0.72741586, "num_input_tokens_seen": 71481075, "step": 3317, "time_per_iteration": 2.740563154220581 }, { "auxiliary_loss_clip": 0.01176409, "auxiliary_loss_mlp": 0.01028127, "balance_loss_clip": 1.05491328, "balance_loss_mlp": 1.01928759, "epoch": 0.39896591114050384, "flos": 19208761908480.0, "grad_norm": 2.3063335845543964, "language_loss": 0.76616061, "learning_rate": 2.734133762674457e-06, "loss": 0.78820592, "num_input_tokens_seen": 71500665, "step": 3318, "time_per_iteration": 2.673466205596924 }, { "auxiliary_loss_clip": 0.01182267, "auxiliary_loss_mlp": 0.01028915, "balance_loss_clip": 1.05590355, "balance_loss_mlp": 1.01925266, "epoch": 0.3990861540311429, "flos": 28401031146240.0, "grad_norm": 1.9854900391715533, "language_loss": 0.70875359, "learning_rate": 2.7334091118265124e-06, "loss": 0.73086542, "num_input_tokens_seen": 71522560, "step": 3319, "time_per_iteration": 2.8299624919891357 }, { "auxiliary_loss_clip": 0.01089022, "auxiliary_loss_mlp": 0.01014614, "balance_loss_clip": 1.02133465, "balance_loss_mlp": 1.01298094, "epoch": 0.399206396921782, "flos": 61758563086080.0, "grad_norm": 0.6774767142183501, "language_loss": 0.57832515, "learning_rate": 2.732684349720989e-06, "loss": 0.59936154, "num_input_tokens_seen": 71590520, "step": 3320, "time_per_iteration": 3.3581383228302 }, { "auxiliary_loss_clip": 0.01182606, "auxiliary_loss_mlp": 0.01035035, "balance_loss_clip": 1.05651879, "balance_loss_mlp": 1.02621353, "epoch": 0.3993266398124211, "flos": 28074567409920.0, "grad_norm": 2.0158837565234884, "language_loss": 0.75766671, "learning_rate": 2.7319594764678318e-06, "loss": 0.77984309, "num_input_tokens_seen": 71612620, "step": 3321, "time_per_iteration": 2.895676374435425 }, { "auxiliary_loss_clip": 0.01179655, "auxiliary_loss_mlp": 0.01026936, "balance_loss_clip": 1.05580807, "balance_loss_mlp": 1.01801324, "epoch": 0.39944688270306017, "flos": 23225083188480.0, "grad_norm": 1.969085778770759, "language_loss": 0.83202028, "learning_rate": 2.7312344921770044e-06, "loss": 0.85408622, "num_input_tokens_seen": 71634320, "step": 3322, "time_per_iteration": 3.6908936500549316 }, { "auxiliary_loss_clip": 0.01184128, "auxiliary_loss_mlp": 0.01030196, "balance_loss_clip": 1.05511475, "balance_loss_mlp": 1.02202404, "epoch": 0.3995671255936993, "flos": 19390433921280.0, "grad_norm": 1.9581844361834768, "language_loss": 0.78402197, "learning_rate": 2.7305093969584857e-06, "loss": 0.80616522, "num_input_tokens_seen": 71653145, "step": 3323, "time_per_iteration": 3.864696502685547 }, { "auxiliary_loss_clip": 0.01182657, "auxiliary_loss_mlp": 0.0103123, "balance_loss_clip": 1.05691278, "balance_loss_mlp": 1.02279568, "epoch": 0.3996873684843384, "flos": 23842638743040.0, "grad_norm": 1.884459649315743, "language_loss": 0.79938012, "learning_rate": 2.729784190922272e-06, "loss": 0.82151896, "num_input_tokens_seen": 71674580, "step": 3324, "time_per_iteration": 2.6906838417053223 }, { "auxiliary_loss_clip": 0.0108617, "auxiliary_loss_mlp": 0.01005169, "balance_loss_clip": 1.0211308, "balance_loss_mlp": 1.00367296, "epoch": 0.39980761137497745, "flos": 66576877280640.0, "grad_norm": 0.959181094037773, "language_loss": 0.57191157, "learning_rate": 2.729058874178378e-06, "loss": 0.59282494, "num_input_tokens_seen": 71745260, "step": 3325, "time_per_iteration": 4.327344179153442 }, { "auxiliary_loss_clip": 0.01187159, "auxiliary_loss_mlp": 0.01029522, "balance_loss_clip": 1.05822515, "balance_loss_mlp": 1.02070022, "epoch": 0.39992785426561656, "flos": 28549162834560.0, "grad_norm": 2.212781729383679, "language_loss": 0.69105142, "learning_rate": 2.7283334468368315e-06, "loss": 0.71321827, "num_input_tokens_seen": 71766540, "step": 3326, "time_per_iteration": 3.8367321491241455 }, { "auxiliary_loss_clip": 0.01167471, "auxiliary_loss_mlp": 0.01026745, "balance_loss_clip": 1.05713332, "balance_loss_mlp": 1.01786423, "epoch": 0.4000480971562556, "flos": 15049408671360.0, "grad_norm": 1.8328914450979565, "language_loss": 0.73075128, "learning_rate": 2.72760790900768e-06, "loss": 0.75269341, "num_input_tokens_seen": 71783125, "step": 3327, "time_per_iteration": 2.8789753913879395 }, { "auxiliary_loss_clip": 0.01193972, "auxiliary_loss_mlp": 0.01032735, "balance_loss_clip": 1.0591234, "balance_loss_mlp": 1.02398491, "epoch": 0.4001683400468947, "flos": 23915609222400.0, "grad_norm": 1.7904741633853927, "language_loss": 0.78079939, "learning_rate": 2.7268822608009875e-06, "loss": 0.80306649, "num_input_tokens_seen": 71802500, "step": 3328, "time_per_iteration": 2.7536871433258057 }, { "auxiliary_loss_clip": 0.01186026, "auxiliary_loss_mlp": 0.01029912, "balance_loss_clip": 1.05761123, "balance_loss_mlp": 1.0208106, "epoch": 0.40028858293753383, "flos": 24352677912960.0, "grad_norm": 2.216271609279033, "language_loss": 0.78251952, "learning_rate": 2.726156502326834e-06, "loss": 0.80467892, "num_input_tokens_seen": 71823800, "step": 3329, "time_per_iteration": 276.74068427085876 }, { "auxiliary_loss_clip": 0.01086332, "auxiliary_loss_mlp": 0.01029579, "balance_loss_clip": 1.02392721, "balance_loss_mlp": 1.0282315, "epoch": 0.4004088258281729, "flos": 66787025800320.0, "grad_norm": 0.7065477185821994, "language_loss": 0.60228658, "learning_rate": 2.725430633695316e-06, "loss": 0.62344569, "num_input_tokens_seen": 71886880, "step": 3330, "time_per_iteration": 3.363321542739868 }, { "auxiliary_loss_clip": 0.01085904, "auxiliary_loss_mlp": 0.01002886, "balance_loss_clip": 1.01908302, "balance_loss_mlp": 1.00142574, "epoch": 0.400529068718812, "flos": 58598386473600.0, "grad_norm": 0.8823638966905554, "language_loss": 0.57959443, "learning_rate": 2.7247046550165485e-06, "loss": 0.60048229, "num_input_tokens_seen": 71939005, "step": 3331, "time_per_iteration": 3.0758354663848877 }, { "auxiliary_loss_clip": 0.01191266, "auxiliary_loss_mlp": 0.01033, "balance_loss_clip": 1.05744696, "balance_loss_mlp": 1.02369022, "epoch": 0.4006493116094511, "flos": 25377460934400.0, "grad_norm": 1.4791673390959441, "language_loss": 0.7591418, "learning_rate": 2.7239785664006606e-06, "loss": 0.78138447, "num_input_tokens_seen": 71962545, "step": 3332, "time_per_iteration": 2.7276246547698975 }, { "auxiliary_loss_clip": 0.01086459, "auxiliary_loss_mlp": 0.01002043, "balance_loss_clip": 1.01967525, "balance_loss_mlp": 1.00055885, "epoch": 0.40076955450009016, "flos": 60280729822080.0, "grad_norm": 0.7722334639816213, "language_loss": 0.61802912, "learning_rate": 2.7232523679578002e-06, "loss": 0.63891423, "num_input_tokens_seen": 72025625, "step": 3333, "time_per_iteration": 3.290334463119507 }, { "auxiliary_loss_clip": 0.01184599, "auxiliary_loss_mlp": 0.01028815, "balance_loss_clip": 1.05687714, "balance_loss_mlp": 1.02046406, "epoch": 0.4008897973907293, "flos": 16617268396800.0, "grad_norm": 2.2370833491252906, "language_loss": 0.79390758, "learning_rate": 2.7225260597981295e-06, "loss": 0.81604171, "num_input_tokens_seen": 72043330, "step": 3334, "time_per_iteration": 2.6783533096313477 }, { "auxiliary_loss_clip": 0.01173628, "auxiliary_loss_mlp": 0.01063977, "balance_loss_clip": 1.05405903, "balance_loss_mlp": 1.02365053, "epoch": 0.4010100402813684, "flos": 15377344865280.0, "grad_norm": 3.4410931103591516, "language_loss": 0.78560722, "learning_rate": 2.721799642031831e-06, "loss": 0.80798334, "num_input_tokens_seen": 72059500, "step": 3335, "time_per_iteration": 2.853154182434082 }, { "auxiliary_loss_clip": 0.01191058, "auxiliary_loss_mlp": 0.01028742, "balance_loss_clip": 1.0571878, "balance_loss_mlp": 1.02000427, "epoch": 0.40113028317200744, "flos": 13298835438720.0, "grad_norm": 2.246468090450341, "language_loss": 0.77815509, "learning_rate": 2.721073114769101e-06, "loss": 0.80035305, "num_input_tokens_seen": 72077175, "step": 3336, "time_per_iteration": 2.8962960243225098 }, { "auxiliary_loss_clip": 0.01174736, "auxiliary_loss_mlp": 0.01031564, "balance_loss_clip": 1.05503654, "balance_loss_mlp": 1.02246809, "epoch": 0.40125052606264655, "flos": 20668027841280.0, "grad_norm": 1.9740536176424026, "language_loss": 0.75273848, "learning_rate": 2.7203464781201523e-06, "loss": 0.77480143, "num_input_tokens_seen": 72096490, "step": 3337, "time_per_iteration": 2.7904467582702637 }, { "auxiliary_loss_clip": 0.01195139, "auxiliary_loss_mlp": 0.01035971, "balance_loss_clip": 1.06020629, "balance_loss_mlp": 1.02709556, "epoch": 0.40137076895328566, "flos": 24607679541120.0, "grad_norm": 2.6454195543578964, "language_loss": 0.77794445, "learning_rate": 2.719619732195215e-06, "loss": 0.80025554, "num_input_tokens_seen": 72118130, "step": 3338, "time_per_iteration": 2.745453357696533 }, { "auxiliary_loss_clip": 0.01179992, "auxiliary_loss_mlp": 0.01027764, "balance_loss_clip": 1.0556891, "balance_loss_mlp": 1.01856077, "epoch": 0.4014910118439247, "flos": 24206593299840.0, "grad_norm": 1.4200634616532268, "language_loss": 0.72583199, "learning_rate": 2.7188928771045377e-06, "loss": 0.74790961, "num_input_tokens_seen": 72139450, "step": 3339, "time_per_iteration": 2.9703376293182373 }, { "auxiliary_loss_clip": 0.01174894, "auxiliary_loss_mlp": 0.01028974, "balance_loss_clip": 1.05477655, "balance_loss_mlp": 1.02052212, "epoch": 0.4016112547345638, "flos": 26725080418560.0, "grad_norm": 5.437686005421575, "language_loss": 0.79872566, "learning_rate": 2.7181659129583815e-06, "loss": 0.82076436, "num_input_tokens_seen": 72159040, "step": 3340, "time_per_iteration": 2.9061362743377686 }, { "auxiliary_loss_clip": 0.01178312, "auxiliary_loss_mlp": 0.01032654, "balance_loss_clip": 1.05468535, "balance_loss_mlp": 1.02382684, "epoch": 0.4017314976252029, "flos": 21288025520640.0, "grad_norm": 2.1364539165840837, "language_loss": 0.76138985, "learning_rate": 2.7174388398670276e-06, "loss": 0.78349948, "num_input_tokens_seen": 72178220, "step": 3341, "time_per_iteration": 2.8960072994232178 }, { "auxiliary_loss_clip": 0.01189028, "auxiliary_loss_mlp": 0.01034708, "balance_loss_clip": 1.05375266, "balance_loss_mlp": 1.02626753, "epoch": 0.401851740515842, "flos": 25484690010240.0, "grad_norm": 5.229929957661063, "language_loss": 0.92011064, "learning_rate": 2.716711657940773e-06, "loss": 0.94234794, "num_input_tokens_seen": 72199230, "step": 3342, "time_per_iteration": 2.880715847015381 }, { "auxiliary_loss_clip": 0.01080706, "auxiliary_loss_mlp": 0.01001992, "balance_loss_clip": 1.0173012, "balance_loss_mlp": 1.00068116, "epoch": 0.4019719834064811, "flos": 55395334978560.0, "grad_norm": 0.8190714569172922, "language_loss": 0.56477106, "learning_rate": 2.7159843672899284e-06, "loss": 0.58559799, "num_input_tokens_seen": 72263430, "step": 3343, "time_per_iteration": 3.4539294242858887 }, { "auxiliary_loss_clip": 0.01188532, "auxiliary_loss_mlp": 0.01028421, "balance_loss_clip": 1.05870533, "balance_loss_mlp": 1.01985002, "epoch": 0.40209222629712016, "flos": 18180100218240.0, "grad_norm": 3.5436333492378234, "language_loss": 0.81954217, "learning_rate": 2.715256968024825e-06, "loss": 0.8417117, "num_input_tokens_seen": 72280505, "step": 3344, "time_per_iteration": 2.7711446285247803 }, { "auxiliary_loss_clip": 0.01187974, "auxiliary_loss_mlp": 0.0102943, "balance_loss_clip": 1.05479634, "balance_loss_mlp": 1.02020359, "epoch": 0.40221246918775927, "flos": 25961009287680.0, "grad_norm": 1.6395004820457417, "language_loss": 0.82560861, "learning_rate": 2.7145294602558083e-06, "loss": 0.84778267, "num_input_tokens_seen": 72301215, "step": 3345, "time_per_iteration": 2.817847967147827 }, { "auxiliary_loss_clip": 0.01186834, "auxiliary_loss_mlp": 0.01030239, "balance_loss_clip": 1.05711687, "balance_loss_mlp": 1.02108979, "epoch": 0.4023327120783984, "flos": 33838912056960.0, "grad_norm": 1.8219955366346454, "language_loss": 0.71015388, "learning_rate": 2.713801844093241e-06, "loss": 0.7323246, "num_input_tokens_seen": 72322365, "step": 3346, "time_per_iteration": 2.8940863609313965 }, { "auxiliary_loss_clip": 0.01188412, "auxiliary_loss_mlp": 0.01028148, "balance_loss_clip": 1.0575316, "balance_loss_mlp": 1.01943398, "epoch": 0.40245295496903744, "flos": 26900252069760.0, "grad_norm": 2.122060719572822, "language_loss": 0.88543373, "learning_rate": 2.7130741196475014e-06, "loss": 0.90759933, "num_input_tokens_seen": 72340495, "step": 3347, "time_per_iteration": 3.8583457469940186 }, { "auxiliary_loss_clip": 0.01187048, "auxiliary_loss_mlp": 0.01037645, "balance_loss_clip": 1.05895281, "balance_loss_mlp": 1.02848995, "epoch": 0.40257319785967655, "flos": 36902738436480.0, "grad_norm": 3.314609320781103, "language_loss": 0.79096293, "learning_rate": 2.7123462870289848e-06, "loss": 0.81320989, "num_input_tokens_seen": 72360545, "step": 3348, "time_per_iteration": 2.903235673904419 }, { "auxiliary_loss_clip": 0.01185224, "auxiliary_loss_mlp": 0.01036922, "balance_loss_clip": 1.05578232, "balance_loss_mlp": 1.02831542, "epoch": 0.40269344075031566, "flos": 24353180703360.0, "grad_norm": 1.7571488620703168, "language_loss": 0.81244308, "learning_rate": 2.711618346348102e-06, "loss": 0.83466458, "num_input_tokens_seen": 72381070, "step": 3349, "time_per_iteration": 4.171966075897217 }, { "auxiliary_loss_clip": 0.01182292, "auxiliary_loss_mlp": 0.01026496, "balance_loss_clip": 1.06030607, "balance_loss_mlp": 1.01826739, "epoch": 0.4028136836409547, "flos": 14389657614720.0, "grad_norm": 1.729066645145478, "language_loss": 0.63712728, "learning_rate": 2.7108902977152825e-06, "loss": 0.65921521, "num_input_tokens_seen": 72398970, "step": 3350, "time_per_iteration": 2.948171615600586 }, { "auxiliary_loss_clip": 0.01187199, "auxiliary_loss_mlp": 0.010331, "balance_loss_clip": 1.05810189, "balance_loss_mlp": 1.02414155, "epoch": 0.4029339265315938, "flos": 26136037284480.0, "grad_norm": 2.2495609490605224, "language_loss": 0.75354022, "learning_rate": 2.7101621412409704e-06, "loss": 0.77574319, "num_input_tokens_seen": 72418455, "step": 3351, "time_per_iteration": 3.810971736907959 }, { "auxiliary_loss_clip": 0.01193293, "auxiliary_loss_mlp": 0.01034542, "balance_loss_clip": 1.0576812, "balance_loss_mlp": 1.02540445, "epoch": 0.40305416942223293, "flos": 23256325042560.0, "grad_norm": 2.209013744966228, "language_loss": 0.86015475, "learning_rate": 2.7094338770356256e-06, "loss": 0.88243306, "num_input_tokens_seen": 72437540, "step": 3352, "time_per_iteration": 3.857717990875244 }, { "auxiliary_loss_clip": 0.01180888, "auxiliary_loss_mlp": 0.01024584, "balance_loss_clip": 1.05705857, "balance_loss_mlp": 1.01601887, "epoch": 0.403174412312872, "flos": 27089645506560.0, "grad_norm": 2.0015283831238886, "language_loss": 0.6428948, "learning_rate": 2.708705505209726e-06, "loss": 0.66494954, "num_input_tokens_seen": 72458315, "step": 3353, "time_per_iteration": 2.9089560508728027 }, { "auxiliary_loss_clip": 0.0117313, "auxiliary_loss_mlp": 0.01025621, "balance_loss_clip": 1.05804396, "balance_loss_mlp": 1.01617932, "epoch": 0.4032946552035111, "flos": 21756336065280.0, "grad_norm": 2.3145521815912, "language_loss": 0.91666645, "learning_rate": 2.7079770258737646e-06, "loss": 0.93865395, "num_input_tokens_seen": 72476225, "step": 3354, "time_per_iteration": 2.8682594299316406 }, { "auxiliary_loss_clip": 0.01180797, "auxiliary_loss_mlp": 0.01031468, "balance_loss_clip": 1.05809116, "balance_loss_mlp": 1.02224159, "epoch": 0.4034148980941502, "flos": 17343956448000.0, "grad_norm": 2.4082540131597296, "language_loss": 0.75084448, "learning_rate": 2.707248439138251e-06, "loss": 0.77296716, "num_input_tokens_seen": 72492460, "step": 3355, "time_per_iteration": 2.8386831283569336 }, { "auxiliary_loss_clip": 0.01180043, "auxiliary_loss_mlp": 0.01028025, "balance_loss_clip": 1.05844569, "balance_loss_mlp": 1.01957881, "epoch": 0.40353514098478926, "flos": 22017838055040.0, "grad_norm": 7.5499650275754115, "language_loss": 0.65770721, "learning_rate": 2.7065197451137114e-06, "loss": 0.67978793, "num_input_tokens_seen": 72513840, "step": 3356, "time_per_iteration": 2.925363779067993 }, { "auxiliary_loss_clip": 0.01180141, "auxiliary_loss_mlp": 0.01032257, "balance_loss_clip": 1.05498505, "balance_loss_mlp": 1.0235368, "epoch": 0.4036553838754284, "flos": 14246446089600.0, "grad_norm": 2.3823758940516466, "language_loss": 0.67418611, "learning_rate": 2.7057909439106894e-06, "loss": 0.69631004, "num_input_tokens_seen": 72531695, "step": 3357, "time_per_iteration": 2.824228525161743 }, { "auxiliary_loss_clip": 0.0117949, "auxiliary_loss_mlp": 0.01059733, "balance_loss_clip": 1.05636275, "balance_loss_mlp": 1.02130353, "epoch": 0.40377562676606743, "flos": 24790644443520.0, "grad_norm": 2.247794369564543, "language_loss": 0.78332233, "learning_rate": 2.7050620356397417e-06, "loss": 0.80571455, "num_input_tokens_seen": 72550645, "step": 3358, "time_per_iteration": 2.842843770980835 }, { "auxiliary_loss_clip": 0.01188766, "auxiliary_loss_mlp": 0.01027449, "balance_loss_clip": 1.05866098, "balance_loss_mlp": 1.01958156, "epoch": 0.40389586965670654, "flos": 24061226958720.0, "grad_norm": 1.8844698471558046, "language_loss": 0.723019, "learning_rate": 2.7043330204114437e-06, "loss": 0.74518114, "num_input_tokens_seen": 72569355, "step": 3359, "time_per_iteration": 2.787666082382202 }, { "auxiliary_loss_clip": 0.01185526, "auxiliary_loss_mlp": 0.01026028, "balance_loss_clip": 1.05473578, "balance_loss_mlp": 1.01811242, "epoch": 0.40401611254734565, "flos": 16399613934720.0, "grad_norm": 2.2459231663111683, "language_loss": 0.85393953, "learning_rate": 2.7036038983363862e-06, "loss": 0.87605512, "num_input_tokens_seen": 72585960, "step": 3360, "time_per_iteration": 2.804270029067993 }, { "auxiliary_loss_clip": 0.01179826, "auxiliary_loss_mlp": 0.01031083, "balance_loss_clip": 1.05492532, "balance_loss_mlp": 1.02263129, "epoch": 0.4041363554379847, "flos": 23988220565760.0, "grad_norm": 1.7420603869899873, "language_loss": 0.84290802, "learning_rate": 2.702874669525177e-06, "loss": 0.86501718, "num_input_tokens_seen": 72604440, "step": 3361, "time_per_iteration": 2.8254306316375732 }, { "auxiliary_loss_clip": 0.01179739, "auxiliary_loss_mlp": 0.01026125, "balance_loss_clip": 1.0573473, "balance_loss_mlp": 1.01771426, "epoch": 0.4042565983286238, "flos": 28401964899840.0, "grad_norm": 1.8934372388635423, "language_loss": 0.69318843, "learning_rate": 2.7021453340884394e-06, "loss": 0.71524709, "num_input_tokens_seen": 72622165, "step": 3362, "time_per_iteration": 2.838430881500244 }, { "auxiliary_loss_clip": 0.01176225, "auxiliary_loss_mlp": 0.01062678, "balance_loss_clip": 1.05858922, "balance_loss_mlp": 1.02426374, "epoch": 0.40437684121926293, "flos": 17710963660800.0, "grad_norm": 2.4652419851174905, "language_loss": 0.72721952, "learning_rate": 2.7014158921368125e-06, "loss": 0.74960852, "num_input_tokens_seen": 72640490, "step": 3363, "time_per_iteration": 2.7834672927856445 }, { "auxiliary_loss_clip": 0.01188129, "auxiliary_loss_mlp": 0.01033018, "balance_loss_clip": 1.05492806, "balance_loss_mlp": 1.02418435, "epoch": 0.404497084109902, "flos": 24018959629440.0, "grad_norm": 2.0463431623624806, "language_loss": 0.85386664, "learning_rate": 2.700686343780953e-06, "loss": 0.87607813, "num_input_tokens_seen": 72660360, "step": 3364, "time_per_iteration": 2.7435760498046875 }, { "auxiliary_loss_clip": 0.01183689, "auxiliary_loss_mlp": 0.01027259, "balance_loss_clip": 1.05555558, "balance_loss_mlp": 1.0189147, "epoch": 0.4046173270005411, "flos": 22929861306240.0, "grad_norm": 1.8889922010490416, "language_loss": 0.87978232, "learning_rate": 2.699956689131532e-06, "loss": 0.90189183, "num_input_tokens_seen": 72680345, "step": 3365, "time_per_iteration": 2.954390525817871 }, { "auxiliary_loss_clip": 0.0118166, "auxiliary_loss_mlp": 0.01025701, "balance_loss_clip": 1.05357468, "balance_loss_mlp": 1.01741004, "epoch": 0.4047375698911802, "flos": 20668135582080.0, "grad_norm": 2.1265795277534694, "language_loss": 0.84898847, "learning_rate": 2.699226928299238e-06, "loss": 0.87106204, "num_input_tokens_seen": 72698365, "step": 3366, "time_per_iteration": 2.9087605476379395 }, { "auxiliary_loss_clip": 0.01185221, "auxiliary_loss_mlp": 0.0102292, "balance_loss_clip": 1.05471778, "balance_loss_mlp": 1.01507616, "epoch": 0.40485781278181926, "flos": 28912865996160.0, "grad_norm": 2.29853888766719, "language_loss": 0.78945649, "learning_rate": 2.698497061394774e-06, "loss": 0.81153786, "num_input_tokens_seen": 72716850, "step": 3367, "time_per_iteration": 2.8049912452697754 }, { "auxiliary_loss_clip": 0.01186556, "auxiliary_loss_mlp": 0.01062383, "balance_loss_clip": 1.05916095, "balance_loss_mlp": 1.02287662, "epoch": 0.40497805567245837, "flos": 23148377694720.0, "grad_norm": 1.7808140581777967, "language_loss": 0.8049342, "learning_rate": 2.6977670885288627e-06, "loss": 0.82742363, "num_input_tokens_seen": 72738250, "step": 3368, "time_per_iteration": 2.9188709259033203 }, { "auxiliary_loss_clip": 0.01172247, "auxiliary_loss_mlp": 0.01031555, "balance_loss_clip": 1.05478704, "balance_loss_mlp": 1.0222863, "epoch": 0.4050982985630975, "flos": 16289404030080.0, "grad_norm": 2.220764378121858, "language_loss": 0.75233144, "learning_rate": 2.6970370098122378e-06, "loss": 0.77436942, "num_input_tokens_seen": 72755235, "step": 3369, "time_per_iteration": 2.728395700454712 }, { "auxiliary_loss_clip": 0.01188028, "auxiliary_loss_mlp": 0.01027838, "balance_loss_clip": 1.05494273, "balance_loss_mlp": 1.01923132, "epoch": 0.40521854145373654, "flos": 34459484353920.0, "grad_norm": 1.7593066540095095, "language_loss": 0.86701095, "learning_rate": 2.6963068253556535e-06, "loss": 0.88916963, "num_input_tokens_seen": 72776620, "step": 3370, "time_per_iteration": 2.8651349544525146 }, { "auxiliary_loss_clip": 0.01192736, "auxiliary_loss_mlp": 0.01025724, "balance_loss_clip": 1.05562544, "balance_loss_mlp": 1.01659882, "epoch": 0.40533878434437565, "flos": 25331099454720.0, "grad_norm": 2.935316287444793, "language_loss": 0.85284972, "learning_rate": 2.6955765352698763e-06, "loss": 0.87503433, "num_input_tokens_seen": 72796765, "step": 3371, "time_per_iteration": 2.768651247024536 }, { "auxiliary_loss_clip": 0.01189815, "auxiliary_loss_mlp": 0.01028804, "balance_loss_clip": 1.05417705, "balance_loss_mlp": 1.02005363, "epoch": 0.40545902723501476, "flos": 15012061505280.0, "grad_norm": 2.057714043624562, "language_loss": 0.7336117, "learning_rate": 2.6948461396656923e-06, "loss": 0.75579792, "num_input_tokens_seen": 72814175, "step": 3372, "time_per_iteration": 2.7180917263031006 }, { "auxiliary_loss_clip": 0.01191378, "auxiliary_loss_mlp": 0.01031199, "balance_loss_clip": 1.05623126, "balance_loss_mlp": 1.02250302, "epoch": 0.4055792701256538, "flos": 25521103422720.0, "grad_norm": 3.108282491439343, "language_loss": 0.74453604, "learning_rate": 2.6941156386539013e-06, "loss": 0.76676178, "num_input_tokens_seen": 72834125, "step": 3373, "time_per_iteration": 3.7082033157348633 }, { "auxiliary_loss_clip": 0.01179982, "auxiliary_loss_mlp": 0.01029472, "balance_loss_clip": 1.05931187, "balance_loss_mlp": 1.02103829, "epoch": 0.4056995130162929, "flos": 19574583972480.0, "grad_norm": 2.1860015063667655, "language_loss": 0.81205928, "learning_rate": 2.6933850323453203e-06, "loss": 0.83415377, "num_input_tokens_seen": 72852570, "step": 3374, "time_per_iteration": 2.7655222415924072 }, { "auxiliary_loss_clip": 0.01188365, "auxiliary_loss_mlp": 0.01027198, "balance_loss_clip": 1.05714035, "balance_loss_mlp": 1.01872849, "epoch": 0.405819755906932, "flos": 15413794191360.0, "grad_norm": 1.8212689603981105, "language_loss": 0.74673307, "learning_rate": 2.6926543208507806e-06, "loss": 0.76888871, "num_input_tokens_seen": 72871250, "step": 3375, "time_per_iteration": 4.140122175216675 }, { "auxiliary_loss_clip": 0.0118543, "auxiliary_loss_mlp": 0.01029277, "balance_loss_clip": 1.0549041, "balance_loss_mlp": 1.0205096, "epoch": 0.4059399987975711, "flos": 21433930565760.0, "grad_norm": 2.2469147741919713, "language_loss": 0.8067745, "learning_rate": 2.6919235042811316e-06, "loss": 0.82892156, "num_input_tokens_seen": 72890035, "step": 3376, "time_per_iteration": 2.75065541267395 }, { "auxiliary_loss_clip": 0.0117932, "auxiliary_loss_mlp": 0.01031687, "balance_loss_clip": 1.0577755, "balance_loss_mlp": 1.0223887, "epoch": 0.4060602416882102, "flos": 25556942217600.0, "grad_norm": 2.000813160476783, "language_loss": 0.76331294, "learning_rate": 2.691192582747237e-06, "loss": 0.78542304, "num_input_tokens_seen": 72909665, "step": 3377, "time_per_iteration": 3.6168863773345947 }, { "auxiliary_loss_clip": 0.01190243, "auxiliary_loss_mlp": 0.01030936, "balance_loss_clip": 1.05696225, "balance_loss_mlp": 1.02257967, "epoch": 0.40618048457884925, "flos": 23766759262080.0, "grad_norm": 2.179841573009648, "language_loss": 0.74325335, "learning_rate": 2.6904615563599765e-06, "loss": 0.76546514, "num_input_tokens_seen": 72929465, "step": 3378, "time_per_iteration": 3.7558913230895996 }, { "auxiliary_loss_clip": 0.01176248, "auxiliary_loss_mlp": 0.01027729, "balance_loss_clip": 1.05758476, "balance_loss_mlp": 1.01892495, "epoch": 0.40630072746948837, "flos": 17639681120640.0, "grad_norm": 1.856863449090536, "language_loss": 0.83278668, "learning_rate": 2.6897304252302477e-06, "loss": 0.85482645, "num_input_tokens_seen": 72946785, "step": 3379, "time_per_iteration": 2.733426332473755 }, { "auxiliary_loss_clip": 0.01082191, "auxiliary_loss_mlp": 0.01008907, "balance_loss_clip": 1.02162099, "balance_loss_mlp": 1.00754762, "epoch": 0.4064209703601275, "flos": 60836053063680.0, "grad_norm": 0.7877042595974182, "language_loss": 0.54756498, "learning_rate": 2.688999189468962e-06, "loss": 0.56847596, "num_input_tokens_seen": 73003215, "step": 3380, "time_per_iteration": 3.1729342937469482 }, { "auxiliary_loss_clip": 0.0118089, "auxiliary_loss_mlp": 0.01024527, "balance_loss_clip": 1.05418229, "balance_loss_mlp": 1.01599181, "epoch": 0.40654121325076653, "flos": 24024346669440.0, "grad_norm": 2.250351017464715, "language_loss": 0.76646757, "learning_rate": 2.6882678491870464e-06, "loss": 0.78852177, "num_input_tokens_seen": 73023650, "step": 3381, "time_per_iteration": 2.790923833847046 }, { "auxiliary_loss_clip": 0.01190586, "auxiliary_loss_mlp": 0.01029987, "balance_loss_clip": 1.05742502, "balance_loss_mlp": 1.02162468, "epoch": 0.40666145614140564, "flos": 27344252085120.0, "grad_norm": 1.9928791439502316, "language_loss": 0.71713459, "learning_rate": 2.6875364044954453e-06, "loss": 0.73934031, "num_input_tokens_seen": 73043880, "step": 3382, "time_per_iteration": 2.833536386489868 }, { "auxiliary_loss_clip": 0.01182633, "auxiliary_loss_mlp": 0.01024708, "balance_loss_clip": 1.05554962, "balance_loss_mlp": 1.01642323, "epoch": 0.40678169903204475, "flos": 26176724415360.0, "grad_norm": 1.4956268802684427, "language_loss": 0.82160664, "learning_rate": 2.6868048555051185e-06, "loss": 0.84368002, "num_input_tokens_seen": 73065410, "step": 3383, "time_per_iteration": 2.905465602874756 }, { "auxiliary_loss_clip": 0.01189623, "auxiliary_loss_mlp": 0.01034328, "balance_loss_clip": 1.05591011, "balance_loss_mlp": 1.02508879, "epoch": 0.4069019419226838, "flos": 28622420622720.0, "grad_norm": 7.146104585317563, "language_loss": 0.85933506, "learning_rate": 2.686073202327041e-06, "loss": 0.88157463, "num_input_tokens_seen": 73084410, "step": 3384, "time_per_iteration": 2.805241346359253 }, { "auxiliary_loss_clip": 0.01174226, "auxiliary_loss_mlp": 0.01034694, "balance_loss_clip": 1.05490828, "balance_loss_mlp": 1.02617097, "epoch": 0.4070221848133229, "flos": 25229006023680.0, "grad_norm": 2.089742543769577, "language_loss": 0.73356694, "learning_rate": 2.6853414450722043e-06, "loss": 0.75565618, "num_input_tokens_seen": 73104075, "step": 3385, "time_per_iteration": 2.9014182090759277 }, { "auxiliary_loss_clip": 0.01182933, "auxiliary_loss_mlp": 0.01029019, "balance_loss_clip": 1.05482566, "balance_loss_mlp": 1.02118146, "epoch": 0.40714242770396203, "flos": 18405224709120.0, "grad_norm": 2.0524828697188098, "language_loss": 0.85343874, "learning_rate": 2.684609583851616e-06, "loss": 0.87555832, "num_input_tokens_seen": 73122250, "step": 3386, "time_per_iteration": 2.800015449523926 }, { "auxiliary_loss_clip": 0.01177634, "auxiliary_loss_mlp": 0.01025895, "balance_loss_clip": 1.05637217, "balance_loss_mlp": 1.01730013, "epoch": 0.4072626705946011, "flos": 30228920403840.0, "grad_norm": 1.6666287399024085, "language_loss": 0.80850601, "learning_rate": 2.683877618776297e-06, "loss": 0.83054125, "num_input_tokens_seen": 73144505, "step": 3387, "time_per_iteration": 3.045844793319702 }, { "auxiliary_loss_clip": 0.01183102, "auxiliary_loss_mlp": 0.01032114, "balance_loss_clip": 1.05847871, "balance_loss_mlp": 1.02319098, "epoch": 0.4073829134852402, "flos": 21834549930240.0, "grad_norm": 2.395817025128697, "language_loss": 0.7447564, "learning_rate": 2.6831455499572876e-06, "loss": 0.76690853, "num_input_tokens_seen": 73162440, "step": 3388, "time_per_iteration": 2.850229024887085 }, { "auxiliary_loss_clip": 0.01188272, "auxiliary_loss_mlp": 0.01032232, "balance_loss_clip": 1.05563629, "balance_loss_mlp": 1.023875, "epoch": 0.40750315637587925, "flos": 25260211964160.0, "grad_norm": 2.629332488629671, "language_loss": 0.77631509, "learning_rate": 2.682413377505641e-06, "loss": 0.79852009, "num_input_tokens_seen": 73181245, "step": 3389, "time_per_iteration": 2.8204588890075684 }, { "auxiliary_loss_clip": 0.01182733, "auxiliary_loss_mlp": 0.01027456, "balance_loss_clip": 1.05215263, "balance_loss_mlp": 1.01882529, "epoch": 0.40762339926651836, "flos": 19712767593600.0, "grad_norm": 1.98554126250693, "language_loss": 0.76715791, "learning_rate": 2.6816811015324284e-06, "loss": 0.78925979, "num_input_tokens_seen": 73199295, "step": 3390, "time_per_iteration": 2.8240256309509277 }, { "auxiliary_loss_clip": 0.01091206, "auxiliary_loss_mlp": 0.01001984, "balance_loss_clip": 1.02565813, "balance_loss_mlp": 1.00065446, "epoch": 0.40774364215715747, "flos": 71449307314560.0, "grad_norm": 0.725628510786498, "language_loss": 0.56654423, "learning_rate": 2.6809487221487343e-06, "loss": 0.58747613, "num_input_tokens_seen": 73258780, "step": 3391, "time_per_iteration": 3.1937153339385986 }, { "auxiliary_loss_clip": 0.01180419, "auxiliary_loss_mlp": 0.01029156, "balance_loss_clip": 1.05588102, "balance_loss_mlp": 1.02022123, "epoch": 0.4078638850477965, "flos": 15084134144640.0, "grad_norm": 2.6570576217462096, "language_loss": 0.81802571, "learning_rate": 2.6802162394656605e-06, "loss": 0.84012151, "num_input_tokens_seen": 73275490, "step": 3392, "time_per_iteration": 2.7676563262939453 }, { "auxiliary_loss_clip": 0.01177583, "auxiliary_loss_mlp": 0.01028348, "balance_loss_clip": 1.05273235, "balance_loss_mlp": 1.02018833, "epoch": 0.40798412793843564, "flos": 23842890138240.0, "grad_norm": 2.1559430213217636, "language_loss": 0.71657926, "learning_rate": 2.679483653594324e-06, "loss": 0.73863858, "num_input_tokens_seen": 73297260, "step": 3393, "time_per_iteration": 2.86175799369812 }, { "auxiliary_loss_clip": 0.01187161, "auxiliary_loss_mlp": 0.01024864, "balance_loss_clip": 1.05554724, "balance_loss_mlp": 1.01601243, "epoch": 0.40810437082907475, "flos": 21065774117760.0, "grad_norm": 2.2853545835265914, "language_loss": 0.767272, "learning_rate": 2.678750964645857e-06, "loss": 0.78939223, "num_input_tokens_seen": 73316340, "step": 3394, "time_per_iteration": 2.7692902088165283 }, { "auxiliary_loss_clip": 0.01184913, "auxiliary_loss_mlp": 0.0103096, "balance_loss_clip": 1.05653739, "balance_loss_mlp": 1.02288938, "epoch": 0.4082246137197138, "flos": 11321377948800.0, "grad_norm": 3.3843896041670507, "language_loss": 0.83197582, "learning_rate": 2.6780181727314094e-06, "loss": 0.8541345, "num_input_tokens_seen": 73331245, "step": 3395, "time_per_iteration": 2.7745132446289062 }, { "auxiliary_loss_clip": 0.01182501, "auxiliary_loss_mlp": 0.0106477, "balance_loss_clip": 1.05687141, "balance_loss_mlp": 1.0255549, "epoch": 0.4083448566103529, "flos": 19062569554560.0, "grad_norm": 1.7450988899129056, "language_loss": 0.7791723, "learning_rate": 2.6772852779621435e-06, "loss": 0.80164504, "num_input_tokens_seen": 73349105, "step": 3396, "time_per_iteration": 2.8961308002471924 }, { "auxiliary_loss_clip": 0.01177421, "auxiliary_loss_mlp": 0.01059356, "balance_loss_clip": 1.05528879, "balance_loss_mlp": 1.02037299, "epoch": 0.408465099500992, "flos": 23550254035200.0, "grad_norm": 2.909114745761866, "language_loss": 0.87022197, "learning_rate": 2.676552280449239e-06, "loss": 0.89258981, "num_input_tokens_seen": 73368990, "step": 3397, "time_per_iteration": 2.796595573425293 }, { "auxiliary_loss_clip": 0.01177475, "auxiliary_loss_mlp": 0.01028839, "balance_loss_clip": 1.05446815, "balance_loss_mlp": 1.02006555, "epoch": 0.4085853423916311, "flos": 12750012558720.0, "grad_norm": 2.413023675801469, "language_loss": 0.76185393, "learning_rate": 2.6758191803038917e-06, "loss": 0.78391707, "num_input_tokens_seen": 73387485, "step": 3398, "time_per_iteration": 2.7989513874053955 }, { "auxiliary_loss_clip": 0.0116761, "auxiliary_loss_mlp": 0.01032962, "balance_loss_clip": 1.05995989, "balance_loss_mlp": 1.02462292, "epoch": 0.4087055852822702, "flos": 24353072962560.0, "grad_norm": 2.4866551021992978, "language_loss": 0.82789493, "learning_rate": 2.6750859776373125e-06, "loss": 0.8499006, "num_input_tokens_seen": 73406940, "step": 3399, "time_per_iteration": 3.8448169231414795 }, { "auxiliary_loss_clip": 0.01089251, "auxiliary_loss_mlp": 0.01002122, "balance_loss_clip": 1.02710891, "balance_loss_mlp": 1.00063789, "epoch": 0.4088258281729093, "flos": 66387950720640.0, "grad_norm": 0.860993405174207, "language_loss": 0.60339057, "learning_rate": 2.674352672560727e-06, "loss": 0.62430429, "num_input_tokens_seen": 73468385, "step": 3400, "time_per_iteration": 3.722829580307007 }, { "auxiliary_loss_clip": 0.01177431, "auxiliary_loss_mlp": 0.01030274, "balance_loss_clip": 1.05371892, "balance_loss_mlp": 1.02111912, "epoch": 0.40894607106354836, "flos": 20449260057600.0, "grad_norm": 4.554659516395867, "language_loss": 0.76970899, "learning_rate": 2.673619265185377e-06, "loss": 0.79178607, "num_input_tokens_seen": 73488225, "step": 3401, "time_per_iteration": 3.9235098361968994 }, { "auxiliary_loss_clip": 0.0118829, "auxiliary_loss_mlp": 0.01032747, "balance_loss_clip": 1.05658054, "balance_loss_mlp": 1.02335882, "epoch": 0.40906631395418747, "flos": 27053627143680.0, "grad_norm": 1.7467207994610854, "language_loss": 0.77935457, "learning_rate": 2.672885755622521e-06, "loss": 0.80156493, "num_input_tokens_seen": 73510640, "step": 3402, "time_per_iteration": 2.767110586166382 }, { "auxiliary_loss_clip": 0.01171294, "auxiliary_loss_mlp": 0.01029409, "balance_loss_clip": 1.05355585, "balance_loss_mlp": 1.02172637, "epoch": 0.4091865568448266, "flos": 25484151306240.0, "grad_norm": 3.242698230577831, "language_loss": 0.70675337, "learning_rate": 2.67215214398343e-06, "loss": 0.72876042, "num_input_tokens_seen": 73530655, "step": 3403, "time_per_iteration": 2.846930980682373 }, { "auxiliary_loss_clip": 0.01179595, "auxiliary_loss_mlp": 0.01028187, "balance_loss_clip": 1.05577087, "balance_loss_mlp": 1.02000892, "epoch": 0.40930679973546563, "flos": 28657864368000.0, "grad_norm": 4.002030742761246, "language_loss": 0.7808187, "learning_rate": 2.671418430379393e-06, "loss": 0.80289656, "num_input_tokens_seen": 73549340, "step": 3404, "time_per_iteration": 4.919038772583008 }, { "auxiliary_loss_clip": 0.01185844, "auxiliary_loss_mlp": 0.01026829, "balance_loss_clip": 1.05296111, "balance_loss_mlp": 1.01846671, "epoch": 0.40942704262610474, "flos": 20886292834560.0, "grad_norm": 1.9063568349940998, "language_loss": 0.83001828, "learning_rate": 2.670684614921715e-06, "loss": 0.85214502, "num_input_tokens_seen": 73568315, "step": 3405, "time_per_iteration": 2.652527093887329 }, { "auxiliary_loss_clip": 0.01186313, "auxiliary_loss_mlp": 0.0102609, "balance_loss_clip": 1.05727494, "balance_loss_mlp": 1.01791799, "epoch": 0.4095472855167438, "flos": 21618080616960.0, "grad_norm": 2.176730552791572, "language_loss": 0.69511664, "learning_rate": 2.6699506977217128e-06, "loss": 0.71724069, "num_input_tokens_seen": 73588490, "step": 3406, "time_per_iteration": 2.7790322303771973 }, { "auxiliary_loss_clip": 0.01183231, "auxiliary_loss_mlp": 0.01022998, "balance_loss_clip": 1.0579232, "balance_loss_mlp": 1.01519024, "epoch": 0.4096675284073829, "flos": 27926112499200.0, "grad_norm": 2.3213292935891197, "language_loss": 0.69837451, "learning_rate": 2.6692166788907233e-06, "loss": 0.72043681, "num_input_tokens_seen": 73608685, "step": 3407, "time_per_iteration": 2.8020317554473877 }, { "auxiliary_loss_clip": 0.01181946, "auxiliary_loss_mlp": 0.01031588, "balance_loss_clip": 1.05378294, "balance_loss_mlp": 1.0231595, "epoch": 0.409787771298022, "flos": 19206607092480.0, "grad_norm": 2.4433731669138745, "language_loss": 0.7725358, "learning_rate": 2.6684825585400957e-06, "loss": 0.79467118, "num_input_tokens_seen": 73627630, "step": 3408, "time_per_iteration": 2.7933642864227295 }, { "auxiliary_loss_clip": 0.01080257, "auxiliary_loss_mlp": 0.01000792, "balance_loss_clip": 1.02262604, "balance_loss_mlp": 0.99949282, "epoch": 0.4099080141886611, "flos": 59269234832640.0, "grad_norm": 0.8166071640126911, "language_loss": 0.65149987, "learning_rate": 2.6677483367811947e-06, "loss": 0.67231035, "num_input_tokens_seen": 73687670, "step": 3409, "time_per_iteration": 3.402855634689331 }, { "auxiliary_loss_clip": 0.01187483, "auxiliary_loss_mlp": 0.01026302, "balance_loss_clip": 1.0548408, "balance_loss_mlp": 1.01773083, "epoch": 0.4100282570793002, "flos": 21906443001600.0, "grad_norm": 1.9315222941466152, "language_loss": 0.75492644, "learning_rate": 2.6670140137254028e-06, "loss": 0.77706426, "num_input_tokens_seen": 73707145, "step": 3410, "time_per_iteration": 2.710672616958618 }, { "auxiliary_loss_clip": 0.01172701, "auxiliary_loss_mlp": 0.01029824, "balance_loss_clip": 1.05631447, "balance_loss_mlp": 1.02239728, "epoch": 0.4101484999699393, "flos": 18551596631040.0, "grad_norm": 3.7206712880955806, "language_loss": 0.89945054, "learning_rate": 2.666279589484115e-06, "loss": 0.92147583, "num_input_tokens_seen": 73725045, "step": 3411, "time_per_iteration": 2.8367440700531006 }, { "auxiliary_loss_clip": 0.01178481, "auxiliary_loss_mlp": 0.01024372, "balance_loss_clip": 1.05857706, "balance_loss_mlp": 1.01670122, "epoch": 0.41026874286057835, "flos": 19094529680640.0, "grad_norm": 1.8566610371089691, "language_loss": 0.81375074, "learning_rate": 2.6655450641687435e-06, "loss": 0.83577931, "num_input_tokens_seen": 73742610, "step": 3412, "time_per_iteration": 2.7594411373138428 }, { "auxiliary_loss_clip": 0.01187508, "auxiliary_loss_mlp": 0.01030166, "balance_loss_clip": 1.05607486, "balance_loss_mlp": 1.02152359, "epoch": 0.41038898575121746, "flos": 31209568588800.0, "grad_norm": 4.234550275912483, "language_loss": 0.69389319, "learning_rate": 2.664810437890715e-06, "loss": 0.71606988, "num_input_tokens_seen": 73764280, "step": 3413, "time_per_iteration": 2.768934488296509 }, { "auxiliary_loss_clip": 0.01162133, "auxiliary_loss_mlp": 0.01029815, "balance_loss_clip": 1.05795467, "balance_loss_mlp": 1.02194166, "epoch": 0.41050922864185657, "flos": 14355865895040.0, "grad_norm": 2.3934444160406674, "language_loss": 0.79452288, "learning_rate": 2.6640757107614714e-06, "loss": 0.81644237, "num_input_tokens_seen": 73782375, "step": 3414, "time_per_iteration": 2.8919193744659424 }, { "auxiliary_loss_clip": 0.01171143, "auxiliary_loss_mlp": 0.01029068, "balance_loss_clip": 1.05491018, "balance_loss_mlp": 1.02067566, "epoch": 0.4106294715324956, "flos": 30956290813440.0, "grad_norm": 2.2831278382844555, "language_loss": 0.69281423, "learning_rate": 2.6633408828924697e-06, "loss": 0.71481633, "num_input_tokens_seen": 73801240, "step": 3415, "time_per_iteration": 2.7878832817077637 }, { "auxiliary_loss_clip": 0.01186548, "auxiliary_loss_mlp": 0.0102628, "balance_loss_clip": 1.05765462, "balance_loss_mlp": 1.01774454, "epoch": 0.41074971442313474, "flos": 24457321209600.0, "grad_norm": 1.6174060928687928, "language_loss": 0.70327348, "learning_rate": 2.662605954395185e-06, "loss": 0.72540176, "num_input_tokens_seen": 73821200, "step": 3416, "time_per_iteration": 2.8188767433166504 }, { "auxiliary_loss_clip": 0.01187107, "auxiliary_loss_mlp": 0.01026944, "balance_loss_clip": 1.05539107, "balance_loss_mlp": 1.018677, "epoch": 0.41086995731377385, "flos": 21542991235200.0, "grad_norm": 1.7676183996482417, "language_loss": 0.83505464, "learning_rate": 2.6618709253811027e-06, "loss": 0.85719514, "num_input_tokens_seen": 73840655, "step": 3417, "time_per_iteration": 2.8034517765045166 }, { "auxiliary_loss_clip": 0.01182956, "auxiliary_loss_mlp": 0.01026896, "balance_loss_clip": 1.0555439, "balance_loss_mlp": 1.01943374, "epoch": 0.4109902002044129, "flos": 20702753314560.0, "grad_norm": 1.999423478680163, "language_loss": 0.87821764, "learning_rate": 2.6611357959617277e-06, "loss": 0.90031612, "num_input_tokens_seen": 73860275, "step": 3418, "time_per_iteration": 2.807915687561035 }, { "auxiliary_loss_clip": 0.01171604, "auxiliary_loss_mlp": 0.01032558, "balance_loss_clip": 1.05576062, "balance_loss_mlp": 1.02384353, "epoch": 0.411110443095052, "flos": 18179992477440.0, "grad_norm": 1.89557064773602, "language_loss": 0.90947688, "learning_rate": 2.660400566248578e-06, "loss": 0.93151855, "num_input_tokens_seen": 73878400, "step": 3419, "time_per_iteration": 2.727273941040039 }, { "auxiliary_loss_clip": 0.01177791, "auxiliary_loss_mlp": 0.01032691, "balance_loss_clip": 1.05678868, "balance_loss_mlp": 1.02349949, "epoch": 0.41123068598569107, "flos": 14575244209920.0, "grad_norm": 3.4012055005336643, "language_loss": 0.67409456, "learning_rate": 2.6596652363531876e-06, "loss": 0.6961993, "num_input_tokens_seen": 73894275, "step": 3420, "time_per_iteration": 2.7720704078674316 }, { "auxiliary_loss_clip": 0.01184865, "auxiliary_loss_mlp": 0.01027886, "balance_loss_clip": 1.05487752, "balance_loss_mlp": 1.01982188, "epoch": 0.4113509288763302, "flos": 21177995184000.0, "grad_norm": 2.2044213392934493, "language_loss": 0.78198647, "learning_rate": 2.6589298063871055e-06, "loss": 0.80411398, "num_input_tokens_seen": 73914450, "step": 3421, "time_per_iteration": 2.6923916339874268 }, { "auxiliary_loss_clip": 0.01187422, "auxiliary_loss_mlp": 0.01027568, "balance_loss_clip": 1.05605137, "balance_loss_mlp": 1.01961708, "epoch": 0.4114711717669693, "flos": 18442212739200.0, "grad_norm": 1.847989864296736, "language_loss": 0.69845343, "learning_rate": 2.658194276461895e-06, "loss": 0.72060335, "num_input_tokens_seen": 73932375, "step": 3422, "time_per_iteration": 2.728506326675415 }, { "auxiliary_loss_clip": 0.0118427, "auxiliary_loss_mlp": 0.01034628, "balance_loss_clip": 1.05537009, "balance_loss_mlp": 1.02557993, "epoch": 0.41159141465760835, "flos": 27233395735680.0, "grad_norm": 2.3711357022312467, "language_loss": 0.66982377, "learning_rate": 2.6574586466891368e-06, "loss": 0.69201273, "num_input_tokens_seen": 73952850, "step": 3423, "time_per_iteration": 2.8002493381500244 }, { "auxiliary_loss_clip": 0.0117956, "auxiliary_loss_mlp": 0.01058904, "balance_loss_clip": 1.05303299, "balance_loss_mlp": 1.02015543, "epoch": 0.41171165754824746, "flos": 20006876154240.0, "grad_norm": 2.3626310523853444, "language_loss": 0.65019214, "learning_rate": 2.6567229171804247e-06, "loss": 0.67257673, "num_input_tokens_seen": 73970735, "step": 3424, "time_per_iteration": 2.7739810943603516 }, { "auxiliary_loss_clip": 0.0118234, "auxiliary_loss_mlp": 0.01026975, "balance_loss_clip": 1.05193782, "balance_loss_mlp": 1.01839828, "epoch": 0.41183190043888657, "flos": 18004318035840.0, "grad_norm": 2.5058114016575623, "language_loss": 0.87728691, "learning_rate": 2.655987088047368e-06, "loss": 0.89938003, "num_input_tokens_seen": 73989080, "step": 3425, "time_per_iteration": 3.713876485824585 }, { "auxiliary_loss_clip": 0.01176525, "auxiliary_loss_mlp": 0.01029764, "balance_loss_clip": 1.0539279, "balance_loss_mlp": 1.02069855, "epoch": 0.4119521433295256, "flos": 27163370171520.0, "grad_norm": 2.0188904871027926, "language_loss": 0.78762519, "learning_rate": 2.6552511594015912e-06, "loss": 0.80968809, "num_input_tokens_seen": 74009470, "step": 3426, "time_per_iteration": 2.7576608657836914 }, { "auxiliary_loss_clip": 0.01180107, "auxiliary_loss_mlp": 0.01031672, "balance_loss_clip": 1.05261302, "balance_loss_mlp": 1.02260017, "epoch": 0.41207238622016473, "flos": 15122020014720.0, "grad_norm": 2.9969259384758615, "language_loss": 0.85423154, "learning_rate": 2.654515131354735e-06, "loss": 0.87634933, "num_input_tokens_seen": 74027735, "step": 3427, "time_per_iteration": 3.728480815887451 }, { "auxiliary_loss_clip": 0.01178449, "auxiliary_loss_mlp": 0.0103121, "balance_loss_clip": 1.05536163, "balance_loss_mlp": 1.02319276, "epoch": 0.41219262911080384, "flos": 27052872958080.0, "grad_norm": 2.3129868682053925, "language_loss": 0.84588861, "learning_rate": 2.653779004018453e-06, "loss": 0.86798519, "num_input_tokens_seen": 74048300, "step": 3428, "time_per_iteration": 2.7849740982055664 }, { "auxiliary_loss_clip": 0.01173548, "auxiliary_loss_mlp": 0.01033745, "balance_loss_clip": 1.05438697, "balance_loss_mlp": 1.02513218, "epoch": 0.4123128720014429, "flos": 24686360282880.0, "grad_norm": 2.4895655922712643, "language_loss": 0.82392579, "learning_rate": 2.653042777504417e-06, "loss": 0.84599876, "num_input_tokens_seen": 74070890, "step": 3429, "time_per_iteration": 3.6749696731567383 }, { "auxiliary_loss_clip": 0.01185599, "auxiliary_loss_mlp": 0.01029816, "balance_loss_clip": 1.05428493, "balance_loss_mlp": 1.02151275, "epoch": 0.412433114892082, "flos": 26244774731520.0, "grad_norm": 2.719278524388515, "language_loss": 0.80347186, "learning_rate": 2.6523064519243105e-06, "loss": 0.82562602, "num_input_tokens_seen": 74090460, "step": 3430, "time_per_iteration": 3.7858874797821045 }, { "auxiliary_loss_clip": 0.01184504, "auxiliary_loss_mlp": 0.01031882, "balance_loss_clip": 1.05581057, "balance_loss_mlp": 1.02340615, "epoch": 0.4125533577827211, "flos": 21361031913600.0, "grad_norm": 2.712493669646607, "language_loss": 0.79068184, "learning_rate": 2.6515700273898333e-06, "loss": 0.81284571, "num_input_tokens_seen": 74108335, "step": 3431, "time_per_iteration": 2.742191791534424 }, { "auxiliary_loss_clip": 0.01168771, "auxiliary_loss_mlp": 0.01026647, "balance_loss_clip": 1.05270457, "balance_loss_mlp": 1.01885104, "epoch": 0.4126736006733602, "flos": 26067556005120.0, "grad_norm": 2.069325901298946, "language_loss": 0.69161308, "learning_rate": 2.6508335040127018e-06, "loss": 0.71356726, "num_input_tokens_seen": 74128030, "step": 3432, "time_per_iteration": 2.7698678970336914 }, { "auxiliary_loss_clip": 0.01187816, "auxiliary_loss_mlp": 0.01033281, "balance_loss_clip": 1.05622387, "balance_loss_mlp": 1.02474606, "epoch": 0.4127938435639993, "flos": 25666146541440.0, "grad_norm": 1.686369627405253, "language_loss": 0.76918995, "learning_rate": 2.6500968819046446e-06, "loss": 0.79140091, "num_input_tokens_seen": 74148330, "step": 3433, "time_per_iteration": 2.7817671298980713 }, { "auxiliary_loss_clip": 0.01166172, "auxiliary_loss_mlp": 0.01026644, "balance_loss_clip": 1.05386031, "balance_loss_mlp": 1.01773334, "epoch": 0.4129140864546384, "flos": 17995914253440.0, "grad_norm": 4.603526238171524, "language_loss": 0.59227979, "learning_rate": 2.649360161177408e-06, "loss": 0.61420798, "num_input_tokens_seen": 74163390, "step": 3434, "time_per_iteration": 2.7204580307006836 }, { "auxiliary_loss_clip": 0.01190307, "auxiliary_loss_mlp": 0.0102872, "balance_loss_clip": 1.05487263, "balance_loss_mlp": 1.01980364, "epoch": 0.41303432934527745, "flos": 23732895715200.0, "grad_norm": 3.4237210312017896, "language_loss": 0.73477167, "learning_rate": 2.6486233419427504e-06, "loss": 0.75696194, "num_input_tokens_seen": 74183205, "step": 3435, "time_per_iteration": 2.812865734100342 }, { "auxiliary_loss_clip": 0.0117275, "auxiliary_loss_mlp": 0.01029966, "balance_loss_clip": 1.05639684, "balance_loss_mlp": 1.02183044, "epoch": 0.41315457223591656, "flos": 19755286318080.0, "grad_norm": 2.4942705057205807, "language_loss": 0.7453469, "learning_rate": 2.6478864243124484e-06, "loss": 0.76737404, "num_input_tokens_seen": 74202870, "step": 3436, "time_per_iteration": 2.786393880844116 }, { "auxiliary_loss_clip": 0.01185316, "auxiliary_loss_mlp": 0.01029256, "balance_loss_clip": 1.05423903, "balance_loss_mlp": 1.02118516, "epoch": 0.4132748151265556, "flos": 20923316778240.0, "grad_norm": 2.4080281167821593, "language_loss": 0.85105884, "learning_rate": 2.6471494083982903e-06, "loss": 0.87320465, "num_input_tokens_seen": 74222255, "step": 3437, "time_per_iteration": 2.723327398300171 }, { "auxiliary_loss_clip": 0.0118381, "auxiliary_loss_mlp": 0.01027616, "balance_loss_clip": 1.05591846, "balance_loss_mlp": 1.01970077, "epoch": 0.4133950580171947, "flos": 32232520016640.0, "grad_norm": 1.803832728345597, "language_loss": 0.74745244, "learning_rate": 2.6464122943120818e-06, "loss": 0.76956666, "num_input_tokens_seen": 74242480, "step": 3438, "time_per_iteration": 2.84263277053833 }, { "auxiliary_loss_clip": 0.01175614, "auxiliary_loss_mlp": 0.01028487, "balance_loss_clip": 1.05646205, "balance_loss_mlp": 1.02032733, "epoch": 0.41351530090783384, "flos": 23292487059840.0, "grad_norm": 2.694618958951112, "language_loss": 0.82471347, "learning_rate": 2.645675082165642e-06, "loss": 0.84675443, "num_input_tokens_seen": 74258690, "step": 3439, "time_per_iteration": 2.8252880573272705 }, { "auxiliary_loss_clip": 0.01182051, "auxiliary_loss_mlp": 0.01028481, "balance_loss_clip": 1.05604696, "balance_loss_mlp": 1.01955819, "epoch": 0.4136355437984729, "flos": 25593571111680.0, "grad_norm": 8.517669478144038, "language_loss": 0.75441563, "learning_rate": 2.644937772070806e-06, "loss": 0.77652097, "num_input_tokens_seen": 74277135, "step": 3440, "time_per_iteration": 2.783578634262085 }, { "auxiliary_loss_clip": 0.01190817, "auxiliary_loss_mlp": 0.0102803, "balance_loss_clip": 1.05785632, "balance_loss_mlp": 1.01976848, "epoch": 0.413755786689112, "flos": 19828615933440.0, "grad_norm": 3.075089882144361, "language_loss": 0.83362281, "learning_rate": 2.6442003641394225e-06, "loss": 0.85581142, "num_input_tokens_seen": 74294730, "step": 3441, "time_per_iteration": 2.7979934215545654 }, { "auxiliary_loss_clip": 0.01178598, "auxiliary_loss_mlp": 0.01027953, "balance_loss_clip": 1.05344021, "balance_loss_mlp": 1.01925659, "epoch": 0.4138760295797511, "flos": 26870446759680.0, "grad_norm": 1.5477230609788883, "language_loss": 0.83721095, "learning_rate": 2.643462858483356e-06, "loss": 0.85927641, "num_input_tokens_seen": 74315015, "step": 3442, "time_per_iteration": 2.8330628871917725 }, { "auxiliary_loss_clip": 0.01168479, "auxiliary_loss_mlp": 0.01026303, "balance_loss_clip": 1.05325174, "balance_loss_mlp": 1.0178268, "epoch": 0.41399627247039017, "flos": 16399254798720.0, "grad_norm": 2.2497215788931455, "language_loss": 0.72429794, "learning_rate": 2.6427252552144856e-06, "loss": 0.74624574, "num_input_tokens_seen": 74333665, "step": 3443, "time_per_iteration": 2.7861061096191406 }, { "auxiliary_loss_clip": 0.01188323, "auxiliary_loss_mlp": 0.0103199, "balance_loss_clip": 1.05503273, "balance_loss_mlp": 1.02287066, "epoch": 0.4141165153610293, "flos": 22930220442240.0, "grad_norm": 2.0316195863310376, "language_loss": 0.74953353, "learning_rate": 2.6419875544447044e-06, "loss": 0.77173662, "num_input_tokens_seen": 74355065, "step": 3444, "time_per_iteration": 2.7016890048980713 }, { "auxiliary_loss_clip": 0.01189926, "auxiliary_loss_mlp": 0.01032559, "balance_loss_clip": 1.05611014, "balance_loss_mlp": 1.02393448, "epoch": 0.4142367582516684, "flos": 25192556697600.0, "grad_norm": 1.7505285965730664, "language_loss": 0.71446341, "learning_rate": 2.6412497562859218e-06, "loss": 0.73668826, "num_input_tokens_seen": 74376345, "step": 3445, "time_per_iteration": 2.7046518325805664 }, { "auxiliary_loss_clip": 0.01190297, "auxiliary_loss_mlp": 0.01030302, "balance_loss_clip": 1.05623114, "balance_loss_mlp": 1.02126622, "epoch": 0.41435700114230745, "flos": 21690476478720.0, "grad_norm": 3.437327317441141, "language_loss": 0.76503813, "learning_rate": 2.6405118608500617e-06, "loss": 0.78724408, "num_input_tokens_seen": 74395170, "step": 3446, "time_per_iteration": 2.722651958465576 }, { "auxiliary_loss_clip": 0.01170652, "auxiliary_loss_mlp": 0.01027373, "balance_loss_clip": 1.05669212, "balance_loss_mlp": 1.0194757, "epoch": 0.41447724403294656, "flos": 25995160143360.0, "grad_norm": 3.1648196094311167, "language_loss": 0.81388068, "learning_rate": 2.6397738682490613e-06, "loss": 0.83586097, "num_input_tokens_seen": 74416070, "step": 3447, "time_per_iteration": 2.8285768032073975 }, { "auxiliary_loss_clip": 0.01187434, "auxiliary_loss_mlp": 0.01025758, "balance_loss_clip": 1.05471897, "balance_loss_mlp": 1.01754451, "epoch": 0.41459748692358567, "flos": 18259678800000.0, "grad_norm": 1.765502343020739, "language_loss": 0.7506845, "learning_rate": 2.6390357785948734e-06, "loss": 0.77281642, "num_input_tokens_seen": 74433185, "step": 3448, "time_per_iteration": 2.6812326908111572 }, { "auxiliary_loss_clip": 0.01186586, "auxiliary_loss_mlp": 0.01030485, "balance_loss_clip": 1.05738401, "balance_loss_mlp": 1.02120459, "epoch": 0.4147177298142247, "flos": 24168456034560.0, "grad_norm": 1.6837018213173869, "language_loss": 0.80664474, "learning_rate": 2.6382975919994667e-06, "loss": 0.82881546, "num_input_tokens_seen": 74453760, "step": 3449, "time_per_iteration": 3.0039596557617188 }, { "auxiliary_loss_clip": 0.01184335, "auxiliary_loss_mlp": 0.01029361, "balance_loss_clip": 1.0562197, "balance_loss_mlp": 1.02145743, "epoch": 0.41483797270486383, "flos": 20084659056000.0, "grad_norm": 1.9607767351510554, "language_loss": 0.73154044, "learning_rate": 2.637559308574822e-06, "loss": 0.75367743, "num_input_tokens_seen": 74473505, "step": 3450, "time_per_iteration": 2.8622512817382812 }, { "auxiliary_loss_clip": 0.01187505, "auxiliary_loss_mlp": 0.01035451, "balance_loss_clip": 1.05539751, "balance_loss_mlp": 1.02739835, "epoch": 0.4149582155955029, "flos": 30081040110720.0, "grad_norm": 4.789282013373478, "language_loss": 0.71261251, "learning_rate": 2.6368209284329376e-06, "loss": 0.734842, "num_input_tokens_seen": 74494135, "step": 3451, "time_per_iteration": 3.724757671356201 }, { "auxiliary_loss_clip": 0.01183586, "auxiliary_loss_mlp": 0.01033917, "balance_loss_clip": 1.05399561, "balance_loss_mlp": 1.02592957, "epoch": 0.415078458486142, "flos": 16764394504320.0, "grad_norm": 1.953550604038139, "language_loss": 0.75373828, "learning_rate": 2.636082451685825e-06, "loss": 0.77591336, "num_input_tokens_seen": 74512335, "step": 3452, "time_per_iteration": 2.808526039123535 }, { "auxiliary_loss_clip": 0.0118462, "auxiliary_loss_mlp": 0.01029421, "balance_loss_clip": 1.05656314, "balance_loss_mlp": 1.0207963, "epoch": 0.4151987013767811, "flos": 26033692458240.0, "grad_norm": 1.8102849902504918, "language_loss": 0.86322761, "learning_rate": 2.6353438784455094e-06, "loss": 0.88536799, "num_input_tokens_seen": 74535620, "step": 3453, "time_per_iteration": 3.716857671737671 }, { "auxiliary_loss_clip": 0.01181712, "auxiliary_loss_mlp": 0.0103551, "balance_loss_clip": 1.05806899, "balance_loss_mlp": 1.02609265, "epoch": 0.41531894426742016, "flos": 24608002763520.0, "grad_norm": 2.864179819676123, "language_loss": 0.71293086, "learning_rate": 2.6346052088240326e-06, "loss": 0.73510301, "num_input_tokens_seen": 74555140, "step": 3454, "time_per_iteration": 2.7502973079681396 }, { "auxiliary_loss_clip": 0.01184106, "auxiliary_loss_mlp": 0.01031964, "balance_loss_clip": 1.05598736, "balance_loss_mlp": 1.02317238, "epoch": 0.4154391871580593, "flos": 14975791747200.0, "grad_norm": 3.930719671062104, "language_loss": 0.77365935, "learning_rate": 2.63386644293345e-06, "loss": 0.79582, "num_input_tokens_seen": 74571485, "step": 3455, "time_per_iteration": 3.7501955032348633 }, { "auxiliary_loss_clip": 0.01176333, "auxiliary_loss_mlp": 0.01027677, "balance_loss_clip": 1.05522978, "balance_loss_mlp": 1.01962423, "epoch": 0.4155594300486984, "flos": 14647173194880.0, "grad_norm": 10.91605578085025, "language_loss": 0.83094341, "learning_rate": 2.633127580885833e-06, "loss": 0.85298347, "num_input_tokens_seen": 74585985, "step": 3456, "time_per_iteration": 3.724397897720337 }, { "auxiliary_loss_clip": 0.0118814, "auxiliary_loss_mlp": 0.01030516, "balance_loss_clip": 1.05727184, "balance_loss_mlp": 1.02204669, "epoch": 0.41567967293933744, "flos": 29497276275840.0, "grad_norm": 2.910462638354902, "language_loss": 0.65235752, "learning_rate": 2.632388622793265e-06, "loss": 0.67454404, "num_input_tokens_seen": 74605140, "step": 3457, "time_per_iteration": 2.7716965675354004 }, { "auxiliary_loss_clip": 0.01187602, "auxiliary_loss_mlp": 0.01027875, "balance_loss_clip": 1.05940342, "balance_loss_mlp": 1.01937509, "epoch": 0.41579991582997655, "flos": 19238387650560.0, "grad_norm": 1.859183200419629, "language_loss": 0.67639244, "learning_rate": 2.6316495687678457e-06, "loss": 0.69854712, "num_input_tokens_seen": 74623790, "step": 3458, "time_per_iteration": 2.8383188247680664 }, { "auxiliary_loss_clip": 0.01169129, "auxiliary_loss_mlp": 0.01026958, "balance_loss_clip": 1.05441976, "balance_loss_mlp": 1.01813638, "epoch": 0.41592015872061566, "flos": 24462061804800.0, "grad_norm": 2.730406997404671, "language_loss": 0.76108783, "learning_rate": 2.6309104189216887e-06, "loss": 0.78304875, "num_input_tokens_seen": 74641355, "step": 3459, "time_per_iteration": 2.820702314376831 }, { "auxiliary_loss_clip": 0.01164385, "auxiliary_loss_mlp": 0.01063451, "balance_loss_clip": 1.05278039, "balance_loss_mlp": 1.020877, "epoch": 0.4160404016112547, "flos": 20775651966720.0, "grad_norm": 2.8887903153407364, "language_loss": 0.74866021, "learning_rate": 2.630171173366923e-06, "loss": 0.77093858, "num_input_tokens_seen": 74657155, "step": 3460, "time_per_iteration": 2.8018558025360107 }, { "auxiliary_loss_clip": 0.01177246, "auxiliary_loss_mlp": 0.01031374, "balance_loss_clip": 1.05758977, "balance_loss_mlp": 1.02335107, "epoch": 0.41616064450189383, "flos": 13916462820480.0, "grad_norm": 2.4537290437971415, "language_loss": 0.73743927, "learning_rate": 2.629431832215691e-06, "loss": 0.75952548, "num_input_tokens_seen": 74671960, "step": 3461, "time_per_iteration": 2.7831978797912598 }, { "auxiliary_loss_clip": 0.01177006, "auxiliary_loss_mlp": 0.01028385, "balance_loss_clip": 1.05328643, "balance_loss_mlp": 1.02006447, "epoch": 0.41628088739253294, "flos": 20010826650240.0, "grad_norm": 1.837163211692744, "language_loss": 0.87130034, "learning_rate": 2.628692395580151e-06, "loss": 0.89335424, "num_input_tokens_seen": 74692050, "step": 3462, "time_per_iteration": 2.9075512886047363 }, { "auxiliary_loss_clip": 0.011613, "auxiliary_loss_mlp": 0.01039086, "balance_loss_clip": 1.05888367, "balance_loss_mlp": 1.03062797, "epoch": 0.416401130283172, "flos": 29168801377920.0, "grad_norm": 3.733970967173933, "language_loss": 0.7922076, "learning_rate": 2.6279528635724747e-06, "loss": 0.81421137, "num_input_tokens_seen": 74712205, "step": 3463, "time_per_iteration": 2.9343981742858887 }, { "auxiliary_loss_clip": 0.01186064, "auxiliary_loss_mlp": 0.01028199, "balance_loss_clip": 1.05631995, "balance_loss_mlp": 1.0189904, "epoch": 0.4165213731738111, "flos": 16246813478400.0, "grad_norm": 3.2412320201551608, "language_loss": 0.78458846, "learning_rate": 2.627213236304848e-06, "loss": 0.8067311, "num_input_tokens_seen": 74729005, "step": 3464, "time_per_iteration": 2.7979888916015625 }, { "auxiliary_loss_clip": 0.01188905, "auxiliary_loss_mlp": 0.01030503, "balance_loss_clip": 1.05649102, "balance_loss_mlp": 1.0220269, "epoch": 0.4166416160644502, "flos": 33765438787200.0, "grad_norm": 2.191353319807859, "language_loss": 0.70786572, "learning_rate": 2.626473513889472e-06, "loss": 0.73005986, "num_input_tokens_seen": 74751385, "step": 3465, "time_per_iteration": 2.9024758338928223 }, { "auxiliary_loss_clip": 0.01179301, "auxiliary_loss_mlp": 0.01025579, "balance_loss_clip": 1.05679667, "balance_loss_mlp": 1.01729417, "epoch": 0.41676185895508927, "flos": 20917498775040.0, "grad_norm": 2.0118006109118407, "language_loss": 0.82468832, "learning_rate": 2.625733696438562e-06, "loss": 0.84673715, "num_input_tokens_seen": 74768890, "step": 3466, "time_per_iteration": 2.8050994873046875 }, { "auxiliary_loss_clip": 0.01180213, "auxiliary_loss_mlp": 0.01029181, "balance_loss_clip": 1.05604911, "balance_loss_mlp": 1.02052009, "epoch": 0.4168821018457284, "flos": 18406122549120.0, "grad_norm": 3.4639786853529637, "language_loss": 0.7523455, "learning_rate": 2.6249937840643476e-06, "loss": 0.77443939, "num_input_tokens_seen": 74787195, "step": 3467, "time_per_iteration": 2.793912410736084 }, { "auxiliary_loss_clip": 0.0118879, "auxiliary_loss_mlp": 0.01061232, "balance_loss_clip": 1.05724788, "balance_loss_mlp": 1.01966071, "epoch": 0.41700234473636744, "flos": 18698399516160.0, "grad_norm": 2.5870594805162606, "language_loss": 0.667979, "learning_rate": 2.6242537768790733e-06, "loss": 0.69047928, "num_input_tokens_seen": 74806350, "step": 3468, "time_per_iteration": 2.7483954429626465 }, { "auxiliary_loss_clip": 0.01186242, "auxiliary_loss_mlp": 0.01030138, "balance_loss_clip": 1.05804217, "balance_loss_mlp": 1.02164984, "epoch": 0.41712258762700655, "flos": 31033283616000.0, "grad_norm": 1.8648067106834914, "language_loss": 0.68951893, "learning_rate": 2.6235136749949975e-06, "loss": 0.71168268, "num_input_tokens_seen": 74829800, "step": 3469, "time_per_iteration": 2.836926221847534 }, { "auxiliary_loss_clip": 0.01187752, "auxiliary_loss_mlp": 0.01029232, "balance_loss_clip": 1.05627131, "balance_loss_mlp": 1.02119136, "epoch": 0.41724283051764566, "flos": 35914763877120.0, "grad_norm": 3.247307490084391, "language_loss": 0.61808896, "learning_rate": 2.6227734785243924e-06, "loss": 0.64025879, "num_input_tokens_seen": 74849760, "step": 3470, "time_per_iteration": 2.776878595352173 }, { "auxiliary_loss_clip": 0.01165471, "auxiliary_loss_mlp": 0.01027295, "balance_loss_clip": 1.0539186, "balance_loss_mlp": 1.01930165, "epoch": 0.4173630734082847, "flos": 25333649320320.0, "grad_norm": 1.8650632449101356, "language_loss": 0.79158187, "learning_rate": 2.6220331875795466e-06, "loss": 0.81350946, "num_input_tokens_seen": 74869110, "step": 3471, "time_per_iteration": 2.912493944168091 }, { "auxiliary_loss_clip": 0.01184635, "auxiliary_loss_mlp": 0.01031777, "balance_loss_clip": 1.05935144, "balance_loss_mlp": 1.02358103, "epoch": 0.4174833162989238, "flos": 26685398868480.0, "grad_norm": 1.6250259423553146, "language_loss": 0.75039661, "learning_rate": 2.62129280227276e-06, "loss": 0.77256078, "num_input_tokens_seen": 74889110, "step": 3472, "time_per_iteration": 2.7860207557678223 }, { "auxiliary_loss_clip": 0.01189356, "auxiliary_loss_mlp": 0.01032528, "balance_loss_clip": 1.05622244, "balance_loss_mlp": 1.02436173, "epoch": 0.41760355918956293, "flos": 74739584010240.0, "grad_norm": 2.1737015958869543, "language_loss": 0.68348974, "learning_rate": 2.62055232271635e-06, "loss": 0.70570862, "num_input_tokens_seen": 74916260, "step": 3473, "time_per_iteration": 3.1273629665374756 }, { "auxiliary_loss_clip": 0.01173789, "auxiliary_loss_mlp": 0.01026264, "balance_loss_clip": 1.05894613, "balance_loss_mlp": 1.01811004, "epoch": 0.417723802080202, "flos": 14317513148160.0, "grad_norm": 2.3151902299233886, "language_loss": 0.880485, "learning_rate": 2.619811749022646e-06, "loss": 0.90248549, "num_input_tokens_seen": 74931570, "step": 3474, "time_per_iteration": 2.7614307403564453 }, { "auxiliary_loss_clip": 0.0118918, "auxiliary_loss_mlp": 0.01026252, "balance_loss_clip": 1.05815065, "balance_loss_mlp": 1.01746678, "epoch": 0.4178440449708411, "flos": 14643797316480.0, "grad_norm": 2.444152727273057, "language_loss": 0.71453029, "learning_rate": 2.6190710813039917e-06, "loss": 0.73668456, "num_input_tokens_seen": 74944695, "step": 3475, "time_per_iteration": 2.7096664905548096 }, { "auxiliary_loss_clip": 0.0117516, "auxiliary_loss_mlp": 0.010639, "balance_loss_clip": 1.05677092, "balance_loss_mlp": 1.02441335, "epoch": 0.4179642878614802, "flos": 21507296094720.0, "grad_norm": 2.9797356734450644, "language_loss": 0.83479208, "learning_rate": 2.618330319672747e-06, "loss": 0.85718262, "num_input_tokens_seen": 74964115, "step": 3476, "time_per_iteration": 2.8047757148742676 }, { "auxiliary_loss_clip": 0.01189348, "auxiliary_loss_mlp": 0.01029771, "balance_loss_clip": 1.05610514, "balance_loss_mlp": 1.02159023, "epoch": 0.41808453075211927, "flos": 18441997257600.0, "grad_norm": 2.05125969798044, "language_loss": 0.91905576, "learning_rate": 2.617589464241284e-06, "loss": 0.94124699, "num_input_tokens_seen": 74978515, "step": 3477, "time_per_iteration": 3.6014320850372314 }, { "auxiliary_loss_clip": 0.01181668, "auxiliary_loss_mlp": 0.01036231, "balance_loss_clip": 1.05632043, "balance_loss_mlp": 1.02844691, "epoch": 0.4182047736427584, "flos": 20301020628480.0, "grad_norm": 1.9536412163842596, "language_loss": 0.74182153, "learning_rate": 2.6168485151219914e-06, "loss": 0.76400048, "num_input_tokens_seen": 74998135, "step": 3478, "time_per_iteration": 3.6445751190185547 }, { "auxiliary_loss_clip": 0.01187197, "auxiliary_loss_mlp": 0.01025298, "balance_loss_clip": 1.05805409, "balance_loss_mlp": 1.01675701, "epoch": 0.4183250165333975, "flos": 18876623823360.0, "grad_norm": 3.175743426037883, "language_loss": 0.71599579, "learning_rate": 2.616107472427269e-06, "loss": 0.73812073, "num_input_tokens_seen": 75012830, "step": 3479, "time_per_iteration": 2.6868064403533936 }, { "auxiliary_loss_clip": 0.01188641, "auxiliary_loss_mlp": 0.01030897, "balance_loss_clip": 1.05449939, "balance_loss_mlp": 1.02271366, "epoch": 0.41844525942403654, "flos": 17740050698880.0, "grad_norm": 3.4861218216396157, "language_loss": 0.76220989, "learning_rate": 2.615366336269533e-06, "loss": 0.78440523, "num_input_tokens_seen": 75026495, "step": 3480, "time_per_iteration": 2.69561505317688 }, { "auxiliary_loss_clip": 0.01192245, "auxiliary_loss_mlp": 0.01029676, "balance_loss_clip": 1.0567013, "balance_loss_mlp": 1.02090263, "epoch": 0.41856550231467565, "flos": 18361377181440.0, "grad_norm": 2.5137397580675267, "language_loss": 0.80076134, "learning_rate": 2.6146251067612126e-06, "loss": 0.82298052, "num_input_tokens_seen": 75041970, "step": 3481, "time_per_iteration": 3.617565393447876 }, { "auxiliary_loss_clip": 0.01183752, "auxiliary_loss_mlp": 0.01028234, "balance_loss_clip": 1.05765855, "balance_loss_mlp": 1.02025855, "epoch": 0.41868574520531476, "flos": 22781801445120.0, "grad_norm": 8.292021372215167, "language_loss": 0.82522309, "learning_rate": 2.6138837840147525e-06, "loss": 0.84734291, "num_input_tokens_seen": 75061005, "step": 3482, "time_per_iteration": 3.749541759490967 }, { "auxiliary_loss_clip": 0.01176634, "auxiliary_loss_mlp": 0.01031782, "balance_loss_clip": 1.0552119, "balance_loss_mlp": 1.02352667, "epoch": 0.4188059880959538, "flos": 13699167494400.0, "grad_norm": 3.403595618126411, "language_loss": 0.76232362, "learning_rate": 2.6131423681426103e-06, "loss": 0.78440773, "num_input_tokens_seen": 75076920, "step": 3483, "time_per_iteration": 2.773324966430664 }, { "auxiliary_loss_clip": 0.01187927, "auxiliary_loss_mlp": 0.0103318, "balance_loss_clip": 1.05666375, "balance_loss_mlp": 1.02531219, "epoch": 0.41892623098659293, "flos": 37818281220480.0, "grad_norm": 2.138160269135069, "language_loss": 0.72734708, "learning_rate": 2.6124008592572587e-06, "loss": 0.74955821, "num_input_tokens_seen": 75100905, "step": 3484, "time_per_iteration": 2.9144861698150635 }, { "auxiliary_loss_clip": 0.01190617, "auxiliary_loss_mlp": 0.01034657, "balance_loss_clip": 1.05548525, "balance_loss_mlp": 1.02602589, "epoch": 0.419046473877232, "flos": 23258874908160.0, "grad_norm": 2.2285203977478636, "language_loss": 0.81727582, "learning_rate": 2.6116592574711835e-06, "loss": 0.83952856, "num_input_tokens_seen": 75119205, "step": 3485, "time_per_iteration": 2.6942784786224365 }, { "auxiliary_loss_clip": 0.01192986, "auxiliary_loss_mlp": 0.01032726, "balance_loss_clip": 1.05814183, "balance_loss_mlp": 1.02384448, "epoch": 0.4191667167678711, "flos": 20741034234240.0, "grad_norm": 1.9384696506968528, "language_loss": 0.84206676, "learning_rate": 2.6109175628968853e-06, "loss": 0.86432385, "num_input_tokens_seen": 75138970, "step": 3486, "time_per_iteration": 2.7656798362731934 }, { "auxiliary_loss_clip": 0.01176739, "auxiliary_loss_mlp": 0.0102996, "balance_loss_clip": 1.05574775, "balance_loss_mlp": 1.02203822, "epoch": 0.4192869596585102, "flos": 23586416052480.0, "grad_norm": 3.506405879486776, "language_loss": 0.82900882, "learning_rate": 2.610175775646878e-06, "loss": 0.85107583, "num_input_tokens_seen": 75157550, "step": 3487, "time_per_iteration": 2.6468048095703125 }, { "auxiliary_loss_clip": 0.01179091, "auxiliary_loss_mlp": 0.01026699, "balance_loss_clip": 1.05474257, "balance_loss_mlp": 1.01765084, "epoch": 0.41940720254914926, "flos": 25081269384960.0, "grad_norm": 2.300726559606633, "language_loss": 0.73257536, "learning_rate": 2.6094338958336907e-06, "loss": 0.75463331, "num_input_tokens_seen": 75176220, "step": 3488, "time_per_iteration": 2.7952475547790527 }, { "auxiliary_loss_clip": 0.01179154, "auxiliary_loss_mlp": 0.01025881, "balance_loss_clip": 1.05547333, "balance_loss_mlp": 1.01832283, "epoch": 0.41952744543978837, "flos": 15554132628480.0, "grad_norm": 6.246146264847659, "language_loss": 0.82244599, "learning_rate": 2.608691923569867e-06, "loss": 0.84449637, "num_input_tokens_seen": 75193095, "step": 3489, "time_per_iteration": 2.7455527782440186 }, { "auxiliary_loss_clip": 0.01187299, "auxiliary_loss_mlp": 0.01029382, "balance_loss_clip": 1.05796301, "balance_loss_mlp": 1.02131486, "epoch": 0.4196476883304275, "flos": 24644775312000.0, "grad_norm": 1.7487944073687793, "language_loss": 0.76013726, "learning_rate": 2.6079498589679616e-06, "loss": 0.78230405, "num_input_tokens_seen": 75214185, "step": 3490, "time_per_iteration": 2.7269861698150635 }, { "auxiliary_loss_clip": 0.0117107, "auxiliary_loss_mlp": 0.01033565, "balance_loss_clip": 1.05685186, "balance_loss_mlp": 1.02420712, "epoch": 0.41976793122106654, "flos": 24531333183360.0, "grad_norm": 2.3163921746158276, "language_loss": 0.75919688, "learning_rate": 2.6072077021405465e-06, "loss": 0.78124321, "num_input_tokens_seen": 75233020, "step": 3491, "time_per_iteration": 2.788184404373169 }, { "auxiliary_loss_clip": 0.0118953, "auxiliary_loss_mlp": 0.01032775, "balance_loss_clip": 1.05860281, "balance_loss_mlp": 1.02415586, "epoch": 0.41988817411170565, "flos": 21175301664000.0, "grad_norm": 1.869281243815, "language_loss": 0.69301009, "learning_rate": 2.6064654532002054e-06, "loss": 0.71523315, "num_input_tokens_seen": 75252030, "step": 3492, "time_per_iteration": 2.702058792114258 }, { "auxiliary_loss_clip": 0.01186592, "auxiliary_loss_mlp": 0.01032029, "balance_loss_clip": 1.05577528, "balance_loss_mlp": 1.02331424, "epoch": 0.42000841700234476, "flos": 31649402626560.0, "grad_norm": 1.481248645492709, "language_loss": 0.7589339, "learning_rate": 2.6057231122595375e-06, "loss": 0.78112018, "num_input_tokens_seen": 75273340, "step": 3493, "time_per_iteration": 2.7936787605285645 }, { "auxiliary_loss_clip": 0.01181588, "auxiliary_loss_mlp": 0.01028676, "balance_loss_clip": 1.05459523, "balance_loss_mlp": 1.0198189, "epoch": 0.4201286598929838, "flos": 21281525159040.0, "grad_norm": 1.7841979256706655, "language_loss": 0.73035342, "learning_rate": 2.604980679431154e-06, "loss": 0.75245607, "num_input_tokens_seen": 75291580, "step": 3494, "time_per_iteration": 2.7731690406799316 }, { "auxiliary_loss_clip": 0.01185851, "auxiliary_loss_mlp": 0.01030621, "balance_loss_clip": 1.05357122, "balance_loss_mlp": 1.02201962, "epoch": 0.4202489027836229, "flos": 18546532813440.0, "grad_norm": 2.225762259150314, "language_loss": 0.74634868, "learning_rate": 2.604238154827684e-06, "loss": 0.76851338, "num_input_tokens_seen": 75308205, "step": 3495, "time_per_iteration": 2.850797653198242 }, { "auxiliary_loss_clip": 0.01190141, "auxiliary_loss_mlp": 0.01025894, "balance_loss_clip": 1.05831194, "balance_loss_mlp": 1.01815188, "epoch": 0.42036914567426203, "flos": 19317643009920.0, "grad_norm": 2.0519693897837903, "language_loss": 0.72183424, "learning_rate": 2.6034955385617656e-06, "loss": 0.74399459, "num_input_tokens_seen": 75326535, "step": 3496, "time_per_iteration": 2.708448648452759 }, { "auxiliary_loss_clip": 0.01089146, "auxiliary_loss_mlp": 0.01005953, "balance_loss_clip": 1.02490687, "balance_loss_mlp": 1.00457573, "epoch": 0.4204893885649011, "flos": 67842942935040.0, "grad_norm": 0.7248315399703988, "language_loss": 0.61610526, "learning_rate": 2.6027528307460544e-06, "loss": 0.63705623, "num_input_tokens_seen": 75390540, "step": 3497, "time_per_iteration": 3.449867010116577 }, { "auxiliary_loss_clip": 0.01188142, "auxiliary_loss_mlp": 0.01026748, "balance_loss_clip": 1.05559838, "balance_loss_mlp": 1.0188148, "epoch": 0.4206096314555402, "flos": 21908777385600.0, "grad_norm": 1.9329222343935581, "language_loss": 0.86419564, "learning_rate": 2.602010031493217e-06, "loss": 0.88634455, "num_input_tokens_seen": 75408770, "step": 3498, "time_per_iteration": 2.8972365856170654 }, { "auxiliary_loss_clip": 0.0117309, "auxiliary_loss_mlp": 0.01031339, "balance_loss_clip": 1.05413091, "balance_loss_mlp": 1.02309537, "epoch": 0.42072987434617926, "flos": 29278185269760.0, "grad_norm": 3.396781638154006, "language_loss": 0.86431313, "learning_rate": 2.6012671409159367e-06, "loss": 0.88635743, "num_input_tokens_seen": 75430105, "step": 3499, "time_per_iteration": 2.963932991027832 }, { "auxiliary_loss_clip": 0.01177212, "auxiliary_loss_mlp": 0.01027919, "balance_loss_clip": 1.05645847, "balance_loss_mlp": 1.01934767, "epoch": 0.42085011723681837, "flos": 27600726170880.0, "grad_norm": 1.7956132842934007, "language_loss": 0.81701422, "learning_rate": 2.6005241591269097e-06, "loss": 0.83906549, "num_input_tokens_seen": 75449475, "step": 3500, "time_per_iteration": 2.9454522132873535 }, { "auxiliary_loss_clip": 0.01173676, "auxiliary_loss_mlp": 0.01026191, "balance_loss_clip": 1.05658889, "balance_loss_mlp": 1.01800156, "epoch": 0.4209703601274575, "flos": 27818632028160.0, "grad_norm": 1.9166833481684558, "language_loss": 0.80120116, "learning_rate": 2.5997810862388454e-06, "loss": 0.82319987, "num_input_tokens_seen": 75469315, "step": 3501, "time_per_iteration": 2.9153945446014404 }, { "auxiliary_loss_clip": 0.0118106, "auxiliary_loss_mlp": 0.01033752, "balance_loss_clip": 1.0535779, "balance_loss_mlp": 1.02554417, "epoch": 0.42109060301809653, "flos": 27525529048320.0, "grad_norm": 2.853150020584012, "language_loss": 0.7563262, "learning_rate": 2.599037922364467e-06, "loss": 0.77847421, "num_input_tokens_seen": 75488215, "step": 3502, "time_per_iteration": 2.8915786743164062 }, { "auxiliary_loss_clip": 0.0116626, "auxiliary_loss_mlp": 0.01027533, "balance_loss_clip": 1.05094957, "balance_loss_mlp": 1.01912248, "epoch": 0.42121084590873564, "flos": 29314275459840.0, "grad_norm": 2.845762777107707, "language_loss": 0.75292504, "learning_rate": 2.5982946676165112e-06, "loss": 0.774863, "num_input_tokens_seen": 75507985, "step": 3503, "time_per_iteration": 3.8837337493896484 }, { "auxiliary_loss_clip": 0.01086405, "auxiliary_loss_mlp": 0.01007725, "balance_loss_clip": 1.02832091, "balance_loss_mlp": 1.00643718, "epoch": 0.42133108879937475, "flos": 67398835178880.0, "grad_norm": 0.7433345192241447, "language_loss": 0.57557917, "learning_rate": 2.5975513221077313e-06, "loss": 0.59652048, "num_input_tokens_seen": 75571955, "step": 3504, "time_per_iteration": 4.297323942184448 }, { "auxiliary_loss_clip": 0.01172478, "auxiliary_loss_mlp": 0.01028185, "balance_loss_clip": 1.05412877, "balance_loss_mlp": 1.02072239, "epoch": 0.4214513316900138, "flos": 23106038538240.0, "grad_norm": 3.3580005329497866, "language_loss": 0.88333547, "learning_rate": 2.5968078859508897e-06, "loss": 0.9053421, "num_input_tokens_seen": 75589155, "step": 3505, "time_per_iteration": 2.823530673980713 }, { "auxiliary_loss_clip": 0.01181995, "auxiliary_loss_mlp": 0.01035677, "balance_loss_clip": 1.05273175, "balance_loss_mlp": 1.02709424, "epoch": 0.4215715745806529, "flos": 15336190857600.0, "grad_norm": 2.3390064737413128, "language_loss": 0.79890776, "learning_rate": 2.5960643592587673e-06, "loss": 0.8210845, "num_input_tokens_seen": 75606565, "step": 3506, "time_per_iteration": 2.804375410079956 }, { "auxiliary_loss_clip": 0.01175576, "auxiliary_loss_mlp": 0.01022174, "balance_loss_clip": 1.05336118, "balance_loss_mlp": 1.0146873, "epoch": 0.42169181747129203, "flos": 22127257860480.0, "grad_norm": 1.8686607752995745, "language_loss": 0.81286794, "learning_rate": 2.5953207421441553e-06, "loss": 0.83484542, "num_input_tokens_seen": 75625165, "step": 3507, "time_per_iteration": 3.762974500656128 }, { "auxiliary_loss_clip": 0.0117965, "auxiliary_loss_mlp": 0.01025008, "balance_loss_clip": 1.05610132, "balance_loss_mlp": 1.01720607, "epoch": 0.4218120603619311, "flos": 22630724841600.0, "grad_norm": 2.4917181776633193, "language_loss": 0.75623751, "learning_rate": 2.5945770347198603e-06, "loss": 0.77828407, "num_input_tokens_seen": 75643320, "step": 3508, "time_per_iteration": 2.8054490089416504 }, { "auxiliary_loss_clip": 0.01175901, "auxiliary_loss_mlp": 0.01032067, "balance_loss_clip": 1.05268717, "balance_loss_mlp": 1.02423501, "epoch": 0.4219323032525702, "flos": 19682818629120.0, "grad_norm": 8.309240830183251, "language_loss": 0.82021177, "learning_rate": 2.593833237098701e-06, "loss": 0.84229147, "num_input_tokens_seen": 75660920, "step": 3509, "time_per_iteration": 3.751594305038452 }, { "auxiliary_loss_clip": 0.011813, "auxiliary_loss_mlp": 0.01032544, "balance_loss_clip": 1.05225635, "balance_loss_mlp": 1.02384734, "epoch": 0.4220525461432093, "flos": 30190747224960.0, "grad_norm": 2.164387365883852, "language_loss": 0.62742722, "learning_rate": 2.593089349393512e-06, "loss": 0.64956558, "num_input_tokens_seen": 75681410, "step": 3510, "time_per_iteration": 2.7274341583251953 }, { "auxiliary_loss_clip": 0.01179432, "auxiliary_loss_mlp": 0.01029717, "balance_loss_clip": 1.0532428, "balance_loss_mlp": 1.02186704, "epoch": 0.42217278903384836, "flos": 24315941278080.0, "grad_norm": 3.4688594439043356, "language_loss": 0.83511567, "learning_rate": 2.592345371717141e-06, "loss": 0.85720718, "num_input_tokens_seen": 75700940, "step": 3511, "time_per_iteration": 2.7647786140441895 }, { "auxiliary_loss_clip": 0.01185175, "auxiliary_loss_mlp": 0.01031859, "balance_loss_clip": 1.05923152, "balance_loss_mlp": 1.02369928, "epoch": 0.42229303192448747, "flos": 17092474352640.0, "grad_norm": 2.8712857920712227, "language_loss": 0.72025812, "learning_rate": 2.591601304182448e-06, "loss": 0.74242848, "num_input_tokens_seen": 75718910, "step": 3512, "time_per_iteration": 2.6611690521240234 }, { "auxiliary_loss_clip": 0.01182447, "auxiliary_loss_mlp": 0.01024751, "balance_loss_clip": 1.05715442, "balance_loss_mlp": 1.01679337, "epoch": 0.4224132748151266, "flos": 22784530878720.0, "grad_norm": 1.778645868073209, "language_loss": 0.79445541, "learning_rate": 2.5908571469023067e-06, "loss": 0.81652737, "num_input_tokens_seen": 75738395, "step": 3513, "time_per_iteration": 2.758849620819092 }, { "auxiliary_loss_clip": 0.01185421, "auxiliary_loss_mlp": 0.01026812, "balance_loss_clip": 1.05437171, "balance_loss_mlp": 1.018646, "epoch": 0.42253351770576564, "flos": 17819090576640.0, "grad_norm": 3.620401020042909, "language_loss": 0.75399423, "learning_rate": 2.5901128999896067e-06, "loss": 0.77611661, "num_input_tokens_seen": 75753825, "step": 3514, "time_per_iteration": 2.6924521923065186 }, { "auxiliary_loss_clip": 0.01181523, "auxiliary_loss_mlp": 0.01025089, "balance_loss_clip": 1.05535388, "balance_loss_mlp": 1.01657152, "epoch": 0.42265376059640475, "flos": 28512390286080.0, "grad_norm": 1.7163165592597034, "language_loss": 0.6842891, "learning_rate": 2.5893685635572487e-06, "loss": 0.70635521, "num_input_tokens_seen": 75774675, "step": 3515, "time_per_iteration": 2.7635931968688965 }, { "auxiliary_loss_clip": 0.01177329, "auxiliary_loss_mlp": 0.01031111, "balance_loss_clip": 1.05395877, "balance_loss_mlp": 1.02280211, "epoch": 0.4227740034870438, "flos": 16253349753600.0, "grad_norm": 2.251884051845517, "language_loss": 0.69330406, "learning_rate": 2.5886241377181483e-06, "loss": 0.71538842, "num_input_tokens_seen": 75793545, "step": 3516, "time_per_iteration": 2.8669888973236084 }, { "auxiliary_loss_clip": 0.01186461, "auxiliary_loss_mlp": 0.01031886, "balance_loss_clip": 1.05584145, "balance_loss_mlp": 1.02288556, "epoch": 0.4228942463776829, "flos": 25295691623040.0, "grad_norm": 2.044514619535214, "language_loss": 0.8094101, "learning_rate": 2.587879622585234e-06, "loss": 0.83159357, "num_input_tokens_seen": 75812145, "step": 3517, "time_per_iteration": 2.807061195373535 }, { "auxiliary_loss_clip": 0.01185215, "auxiliary_loss_mlp": 0.01025722, "balance_loss_clip": 1.05840111, "balance_loss_mlp": 1.017681, "epoch": 0.423014489268322, "flos": 26395779507840.0, "grad_norm": 3.188151343591825, "language_loss": 0.75768352, "learning_rate": 2.5871350182714486e-06, "loss": 0.7797929, "num_input_tokens_seen": 75833025, "step": 3518, "time_per_iteration": 2.781193971633911 }, { "auxiliary_loss_clip": 0.01183977, "auxiliary_loss_mlp": 0.01030678, "balance_loss_clip": 1.05442286, "balance_loss_mlp": 1.02248251, "epoch": 0.4231347321589611, "flos": 17274002711040.0, "grad_norm": 2.0479915053873414, "language_loss": 0.80566525, "learning_rate": 2.586390324889748e-06, "loss": 0.82781184, "num_input_tokens_seen": 75848925, "step": 3519, "time_per_iteration": 2.695591688156128 }, { "auxiliary_loss_clip": 0.01180795, "auxiliary_loss_mlp": 0.01025979, "balance_loss_clip": 1.05503976, "balance_loss_mlp": 1.01849866, "epoch": 0.4232549750496002, "flos": 22999635475200.0, "grad_norm": 2.1355162483288, "language_loss": 0.67395341, "learning_rate": 2.5856455425531003e-06, "loss": 0.69602114, "num_input_tokens_seen": 75870400, "step": 3520, "time_per_iteration": 2.845116138458252 }, { "auxiliary_loss_clip": 0.01180586, "auxiliary_loss_mlp": 0.01022463, "balance_loss_clip": 1.05450773, "balance_loss_mlp": 1.01422584, "epoch": 0.4233752179402393, "flos": 21248343970560.0, "grad_norm": 1.9369552075939147, "language_loss": 0.80388987, "learning_rate": 2.5849006713744902e-06, "loss": 0.82592034, "num_input_tokens_seen": 75889195, "step": 3521, "time_per_iteration": 2.6921679973602295 }, { "auxiliary_loss_clip": 0.01180566, "auxiliary_loss_mlp": 0.01030693, "balance_loss_clip": 1.05831635, "balance_loss_mlp": 1.02283692, "epoch": 0.42349546083087836, "flos": 20704297599360.0, "grad_norm": 2.889554124718933, "language_loss": 0.73101878, "learning_rate": 2.5841557114669135e-06, "loss": 0.75313133, "num_input_tokens_seen": 75906055, "step": 3522, "time_per_iteration": 2.7388834953308105 }, { "auxiliary_loss_clip": 0.01189371, "auxiliary_loss_mlp": 0.01029318, "balance_loss_clip": 1.05513859, "balance_loss_mlp": 1.02031136, "epoch": 0.42361570372151747, "flos": 18585065128320.0, "grad_norm": 2.49666553536476, "language_loss": 0.67722392, "learning_rate": 2.58341066294338e-06, "loss": 0.6994108, "num_input_tokens_seen": 75922720, "step": 3523, "time_per_iteration": 2.713212251663208 }, { "auxiliary_loss_clip": 0.01177225, "auxiliary_loss_mlp": 0.01060142, "balance_loss_clip": 1.05464315, "balance_loss_mlp": 1.02012444, "epoch": 0.4237359466121566, "flos": 20959478795520.0, "grad_norm": 1.9385007473249833, "language_loss": 0.85197788, "learning_rate": 2.5826655259169124e-06, "loss": 0.87435156, "num_input_tokens_seen": 75941375, "step": 3524, "time_per_iteration": 2.853691816329956 }, { "auxiliary_loss_clip": 0.01186011, "auxiliary_loss_mlp": 0.01029177, "balance_loss_clip": 1.05574298, "balance_loss_mlp": 1.02130961, "epoch": 0.42385618950279563, "flos": 18038181582720.0, "grad_norm": 1.9971189407139964, "language_loss": 0.90658164, "learning_rate": 2.5819203005005475e-06, "loss": 0.92873347, "num_input_tokens_seen": 75958710, "step": 3525, "time_per_iteration": 2.820129871368408 }, { "auxiliary_loss_clip": 0.01173781, "auxiliary_loss_mlp": 0.01027294, "balance_loss_clip": 1.05414915, "balance_loss_mlp": 1.01897919, "epoch": 0.42397643239343474, "flos": 23769129559680.0, "grad_norm": 1.6005630049439108, "language_loss": 0.78483045, "learning_rate": 2.581174986807336e-06, "loss": 0.80684114, "num_input_tokens_seen": 75978945, "step": 3526, "time_per_iteration": 2.7674789428710938 }, { "auxiliary_loss_clip": 0.01175628, "auxiliary_loss_mlp": 0.01057393, "balance_loss_clip": 1.05440187, "balance_loss_mlp": 1.01780927, "epoch": 0.42409667528407385, "flos": 16545088016640.0, "grad_norm": 2.2564344216484318, "language_loss": 0.91518831, "learning_rate": 2.580429584950341e-06, "loss": 0.93751854, "num_input_tokens_seen": 75994695, "step": 3527, "time_per_iteration": 2.734548568725586 }, { "auxiliary_loss_clip": 0.01177504, "auxiliary_loss_mlp": 0.01033224, "balance_loss_clip": 1.05241144, "balance_loss_mlp": 1.02485514, "epoch": 0.4242169181747129, "flos": 16034186920320.0, "grad_norm": 2.651405913230284, "language_loss": 0.66509002, "learning_rate": 2.5796840950426397e-06, "loss": 0.68719733, "num_input_tokens_seen": 76011780, "step": 3528, "time_per_iteration": 2.732957124710083 }, { "auxiliary_loss_clip": 0.01177273, "auxiliary_loss_mlp": 0.01025887, "balance_loss_clip": 1.05647182, "balance_loss_mlp": 1.01758456, "epoch": 0.424337161065352, "flos": 20084012611200.0, "grad_norm": 3.412927961754089, "language_loss": 0.65878648, "learning_rate": 2.578938517197322e-06, "loss": 0.68081808, "num_input_tokens_seen": 76029875, "step": 3529, "time_per_iteration": 3.898240566253662 }, { "auxiliary_loss_clip": 0.01170327, "auxiliary_loss_mlp": 0.01025894, "balance_loss_clip": 1.05356932, "balance_loss_mlp": 1.01840174, "epoch": 0.4244574039559911, "flos": 23878369797120.0, "grad_norm": 3.7704376211190844, "language_loss": 0.61989772, "learning_rate": 2.5781928515274916e-06, "loss": 0.64185989, "num_input_tokens_seen": 76048595, "step": 3530, "time_per_iteration": 2.8269431591033936 }, { "auxiliary_loss_clip": 0.01186442, "auxiliary_loss_mlp": 0.010262, "balance_loss_clip": 1.05734491, "balance_loss_mlp": 1.01829064, "epoch": 0.4245776468466302, "flos": 17565920542080.0, "grad_norm": 2.020014144473268, "language_loss": 0.67780757, "learning_rate": 2.577447098146265e-06, "loss": 0.69993401, "num_input_tokens_seen": 76065770, "step": 3531, "time_per_iteration": 3.7329139709472656 }, { "auxiliary_loss_clip": 0.01181076, "auxiliary_loss_mlp": 0.01025013, "balance_loss_clip": 1.05742359, "balance_loss_mlp": 1.01768196, "epoch": 0.4246978897372693, "flos": 27776256958080.0, "grad_norm": 1.784905702432424, "language_loss": 0.79202133, "learning_rate": 2.5767012571667724e-06, "loss": 0.81408226, "num_input_tokens_seen": 76085250, "step": 3532, "time_per_iteration": 2.8336598873138428 }, { "auxiliary_loss_clip": 0.01186335, "auxiliary_loss_mlp": 0.01026778, "balance_loss_clip": 1.05582106, "balance_loss_mlp": 1.01842713, "epoch": 0.42481813262790835, "flos": 15596615439360.0, "grad_norm": 1.9322111869425378, "language_loss": 0.68850195, "learning_rate": 2.5759553287021587e-06, "loss": 0.71063304, "num_input_tokens_seen": 76103580, "step": 3533, "time_per_iteration": 3.698195219039917 }, { "auxiliary_loss_clip": 0.01176463, "auxiliary_loss_mlp": 0.01027809, "balance_loss_clip": 1.05472946, "balance_loss_mlp": 1.02019811, "epoch": 0.42493837551854746, "flos": 23951088881280.0, "grad_norm": 3.267036230527462, "language_loss": 0.77138186, "learning_rate": 2.5752093128655786e-06, "loss": 0.79342455, "num_input_tokens_seen": 76121825, "step": 3534, "time_per_iteration": 3.6408255100250244 }, { "auxiliary_loss_clip": 0.01176246, "auxiliary_loss_mlp": 0.01028509, "balance_loss_clip": 1.0566653, "balance_loss_mlp": 1.02010489, "epoch": 0.4250586184091866, "flos": 20813466009600.0, "grad_norm": 2.6625643577460614, "language_loss": 0.74307668, "learning_rate": 2.574463209770204e-06, "loss": 0.7651242, "num_input_tokens_seen": 76141140, "step": 3535, "time_per_iteration": 2.7002346515655518 }, { "auxiliary_loss_clip": 0.01173934, "auxiliary_loss_mlp": 0.01026006, "balance_loss_clip": 1.05293059, "balance_loss_mlp": 1.01797104, "epoch": 0.42517886129982563, "flos": 30371018607360.0, "grad_norm": 1.9979654256974133, "language_loss": 0.79372507, "learning_rate": 2.5737170195292165e-06, "loss": 0.81572443, "num_input_tokens_seen": 76164475, "step": 3536, "time_per_iteration": 2.8665807247161865 }, { "auxiliary_loss_clip": 0.01176806, "auxiliary_loss_mlp": 0.01023975, "balance_loss_clip": 1.05460608, "balance_loss_mlp": 1.01617837, "epoch": 0.42529910419046474, "flos": 20080636732800.0, "grad_norm": 2.532550468507904, "language_loss": 0.77882385, "learning_rate": 2.572970742255814e-06, "loss": 0.80083162, "num_input_tokens_seen": 76182965, "step": 3537, "time_per_iteration": 2.755016565322876 }, { "auxiliary_loss_clip": 0.01181046, "auxiliary_loss_mlp": 0.01025479, "balance_loss_clip": 1.05506682, "balance_loss_mlp": 1.01815379, "epoch": 0.42541934708110385, "flos": 22632448694400.0, "grad_norm": 1.943482393655313, "language_loss": 0.81347716, "learning_rate": 2.5722243780632046e-06, "loss": 0.83554244, "num_input_tokens_seen": 76201230, "step": 3538, "time_per_iteration": 2.7791056632995605 }, { "auxiliary_loss_clip": 0.01090523, "auxiliary_loss_mlp": 0.0100485, "balance_loss_clip": 1.02685237, "balance_loss_mlp": 1.0033536, "epoch": 0.4255395899717429, "flos": 66200676186240.0, "grad_norm": 0.8426721026099101, "language_loss": 0.60450304, "learning_rate": 2.5714779270646125e-06, "loss": 0.62545681, "num_input_tokens_seen": 76262000, "step": 3539, "time_per_iteration": 3.2807793617248535 }, { "auxiliary_loss_clip": 0.01181081, "auxiliary_loss_mlp": 0.01055379, "balance_loss_clip": 1.05486465, "balance_loss_mlp": 1.01822805, "epoch": 0.425659832862382, "flos": 17931814433280.0, "grad_norm": 2.610275998485695, "language_loss": 0.77983689, "learning_rate": 2.5707313893732735e-06, "loss": 0.80220151, "num_input_tokens_seen": 76280540, "step": 3540, "time_per_iteration": 2.815478801727295 }, { "auxiliary_loss_clip": 0.01160889, "auxiliary_loss_mlp": 0.01025154, "balance_loss_clip": 1.05301726, "balance_loss_mlp": 1.01661873, "epoch": 0.4257800757530211, "flos": 24022550989440.0, "grad_norm": 3.1964930414874466, "language_loss": 0.77268147, "learning_rate": 2.5699847651024364e-06, "loss": 0.79454195, "num_input_tokens_seen": 76301180, "step": 3541, "time_per_iteration": 2.939120054244995 }, { "auxiliary_loss_clip": 0.01179036, "auxiliary_loss_mlp": 0.01032011, "balance_loss_clip": 1.05624545, "balance_loss_mlp": 1.0241549, "epoch": 0.4259003186436602, "flos": 23696015425920.0, "grad_norm": 2.3957048045994944, "language_loss": 0.76958215, "learning_rate": 2.5692380543653627e-06, "loss": 0.79169267, "num_input_tokens_seen": 76319335, "step": 3542, "time_per_iteration": 2.7581584453582764 }, { "auxiliary_loss_clip": 0.01188779, "auxiliary_loss_mlp": 0.01061661, "balance_loss_clip": 1.05781293, "balance_loss_mlp": 1.02443314, "epoch": 0.4260205615342993, "flos": 15259772672640.0, "grad_norm": 2.109019567940928, "language_loss": 0.70083034, "learning_rate": 2.5684912572753293e-06, "loss": 0.72333473, "num_input_tokens_seen": 76335010, "step": 3543, "time_per_iteration": 2.7934341430664062 }, { "auxiliary_loss_clip": 0.01181239, "auxiliary_loss_mlp": 0.01027609, "balance_loss_clip": 1.05427837, "balance_loss_mlp": 1.0203377, "epoch": 0.4261408044249384, "flos": 30665306736000.0, "grad_norm": 3.2203874052523376, "language_loss": 0.84117854, "learning_rate": 2.5677443739456245e-06, "loss": 0.86326706, "num_input_tokens_seen": 76356670, "step": 3544, "time_per_iteration": 2.752614736557007 }, { "auxiliary_loss_clip": 0.01176389, "auxiliary_loss_mlp": 0.01022327, "balance_loss_clip": 1.05280042, "balance_loss_mlp": 1.01469111, "epoch": 0.42626104731557746, "flos": 23257905240960.0, "grad_norm": 2.5485028721499345, "language_loss": 0.79756463, "learning_rate": 2.5669974044895495e-06, "loss": 0.81955177, "num_input_tokens_seen": 76373065, "step": 3545, "time_per_iteration": 2.734043598175049 }, { "auxiliary_loss_clip": 0.01182676, "auxiliary_loss_mlp": 0.010288, "balance_loss_clip": 1.05491757, "balance_loss_mlp": 1.0215162, "epoch": 0.42638129020621657, "flos": 25884770670720.0, "grad_norm": 2.5938060127990017, "language_loss": 0.79669052, "learning_rate": 2.5662503490204187e-06, "loss": 0.81880534, "num_input_tokens_seen": 76393230, "step": 3546, "time_per_iteration": 2.7293529510498047 }, { "auxiliary_loss_clip": 0.01176192, "auxiliary_loss_mlp": 0.0103303, "balance_loss_clip": 1.05371201, "balance_loss_mlp": 1.02506065, "epoch": 0.4265015330968556, "flos": 26502362138880.0, "grad_norm": 5.054697905624802, "language_loss": 0.76419985, "learning_rate": 2.5655032076515603e-06, "loss": 0.7862922, "num_input_tokens_seen": 76412555, "step": 3547, "time_per_iteration": 2.7049827575683594 }, { "auxiliary_loss_clip": 0.01177421, "auxiliary_loss_mlp": 0.01029962, "balance_loss_clip": 1.05348718, "balance_loss_mlp": 1.02241337, "epoch": 0.42662177598749473, "flos": 24389522288640.0, "grad_norm": 2.6514785990472514, "language_loss": 0.82214975, "learning_rate": 2.5647559804963155e-06, "loss": 0.84422356, "num_input_tokens_seen": 76432485, "step": 3548, "time_per_iteration": 2.6306726932525635 }, { "auxiliary_loss_clip": 0.0118027, "auxiliary_loss_mlp": 0.01027059, "balance_loss_clip": 1.05540514, "balance_loss_mlp": 1.01975477, "epoch": 0.42674201887813384, "flos": 23148629089920.0, "grad_norm": 2.4317819517914754, "language_loss": 0.78459609, "learning_rate": 2.5640086676680364e-06, "loss": 0.80666935, "num_input_tokens_seen": 76453980, "step": 3549, "time_per_iteration": 2.809194326400757 }, { "auxiliary_loss_clip": 0.01180446, "auxiliary_loss_mlp": 0.01037065, "balance_loss_clip": 1.05414546, "balance_loss_mlp": 1.02870846, "epoch": 0.4268622617687729, "flos": 21689614552320.0, "grad_norm": 2.4368148116838118, "language_loss": 0.81088829, "learning_rate": 2.5632612692800923e-06, "loss": 0.83306342, "num_input_tokens_seen": 76473045, "step": 3550, "time_per_iteration": 2.6416587829589844 }, { "auxiliary_loss_clip": 0.01179397, "auxiliary_loss_mlp": 0.0103153, "balance_loss_clip": 1.05639172, "balance_loss_mlp": 1.0233109, "epoch": 0.426982504659412, "flos": 23440151871360.0, "grad_norm": 2.2673475707448825, "language_loss": 0.75663269, "learning_rate": 2.5625137854458603e-06, "loss": 0.77874196, "num_input_tokens_seen": 76492060, "step": 3551, "time_per_iteration": 2.719646692276001 }, { "auxiliary_loss_clip": 0.01176243, "auxiliary_loss_mlp": 0.01025152, "balance_loss_clip": 1.05131257, "balance_loss_mlp": 1.01757598, "epoch": 0.4271027475500511, "flos": 18916556768640.0, "grad_norm": 1.8486527260998202, "language_loss": 0.79946649, "learning_rate": 2.561766216278735e-06, "loss": 0.82148039, "num_input_tokens_seen": 76509655, "step": 3552, "time_per_iteration": 2.6813371181488037 }, { "auxiliary_loss_clip": 0.01167921, "auxiliary_loss_mlp": 0.01024051, "balance_loss_clip": 1.052791, "balance_loss_mlp": 1.01671338, "epoch": 0.4272229904406902, "flos": 26870554500480.0, "grad_norm": 2.134646215138576, "language_loss": 0.8135106, "learning_rate": 2.561018561892121e-06, "loss": 0.83543026, "num_input_tokens_seen": 76528795, "step": 3553, "time_per_iteration": 2.786371946334839 }, { "auxiliary_loss_clip": 0.01176805, "auxiliary_loss_mlp": 0.01024303, "balance_loss_clip": 1.05308533, "balance_loss_mlp": 1.01638162, "epoch": 0.4273432333313293, "flos": 23951376190080.0, "grad_norm": 2.0236660249708525, "language_loss": 0.76637727, "learning_rate": 2.5602708223994363e-06, "loss": 0.78838837, "num_input_tokens_seen": 76550660, "step": 3554, "time_per_iteration": 2.808579206466675 }, { "auxiliary_loss_clip": 0.01179793, "auxiliary_loss_mlp": 0.01027245, "balance_loss_clip": 1.05488634, "balance_loss_mlp": 1.01970518, "epoch": 0.4274634762219684, "flos": 29570354496000.0, "grad_norm": 2.2554060802068694, "language_loss": 0.67597127, "learning_rate": 2.559522997914115e-06, "loss": 0.69804162, "num_input_tokens_seen": 76570240, "step": 3555, "time_per_iteration": 2.8099677562713623 }, { "auxiliary_loss_clip": 0.01184998, "auxiliary_loss_mlp": 0.01026086, "balance_loss_clip": 1.05692101, "balance_loss_mlp": 1.018713, "epoch": 0.42758371911260745, "flos": 21434146047360.0, "grad_norm": 2.255812585690212, "language_loss": 0.85233212, "learning_rate": 2.558775088549599e-06, "loss": 0.87444293, "num_input_tokens_seen": 76589820, "step": 3556, "time_per_iteration": 3.558990240097046 }, { "auxiliary_loss_clip": 0.01186163, "auxiliary_loss_mlp": 0.01027096, "balance_loss_clip": 1.0539974, "balance_loss_mlp": 1.01900768, "epoch": 0.42770396200324656, "flos": 14752822072320.0, "grad_norm": 2.7037920155729815, "language_loss": 0.66716206, "learning_rate": 2.5580270944193467e-06, "loss": 0.6892947, "num_input_tokens_seen": 76606640, "step": 3557, "time_per_iteration": 3.630056381225586 }, { "auxiliary_loss_clip": 0.01093443, "auxiliary_loss_mlp": 0.01003185, "balance_loss_clip": 1.02792525, "balance_loss_mlp": 1.00167656, "epoch": 0.4278242048938857, "flos": 70654712601600.0, "grad_norm": 0.830209577784689, "language_loss": 0.55481911, "learning_rate": 2.557279015636827e-06, "loss": 0.57578534, "num_input_tokens_seen": 76667050, "step": 3558, "time_per_iteration": 3.221250057220459 }, { "auxiliary_loss_clip": 0.01085222, "auxiliary_loss_mlp": 0.01003011, "balance_loss_clip": 1.02548099, "balance_loss_mlp": 1.00162268, "epoch": 0.42794444778452473, "flos": 69366165033600.0, "grad_norm": 0.7621311213010337, "language_loss": 0.61198771, "learning_rate": 2.5565308523155245e-06, "loss": 0.63287008, "num_input_tokens_seen": 76726650, "step": 3559, "time_per_iteration": 4.130234479904175 }, { "auxiliary_loss_clip": 0.01166953, "auxiliary_loss_mlp": 0.01025821, "balance_loss_clip": 1.05632424, "balance_loss_mlp": 1.01780391, "epoch": 0.42806469067516384, "flos": 18215328481920.0, "grad_norm": 3.586272457280791, "language_loss": 0.81697577, "learning_rate": 2.5557826045689336e-06, "loss": 0.83890349, "num_input_tokens_seen": 76742890, "step": 3560, "time_per_iteration": 3.736922264099121 }, { "auxiliary_loss_clip": 0.01084781, "auxiliary_loss_mlp": 0.01002161, "balance_loss_clip": 1.0272032, "balance_loss_mlp": 1.00088549, "epoch": 0.4281849335658029, "flos": 54535814432640.0, "grad_norm": 0.8397851489466232, "language_loss": 0.58803296, "learning_rate": 2.5550342725105643e-06, "loss": 0.60890234, "num_input_tokens_seen": 76801055, "step": 3561, "time_per_iteration": 3.2356972694396973 }, { "auxiliary_loss_clip": 0.01185265, "auxiliary_loss_mlp": 0.01026912, "balance_loss_clip": 1.05888176, "balance_loss_mlp": 1.01866245, "epoch": 0.428305176456442, "flos": 17274828723840.0, "grad_norm": 1.8140124825635406, "language_loss": 0.80498266, "learning_rate": 2.554285856253937e-06, "loss": 0.82710439, "num_input_tokens_seen": 76819890, "step": 3562, "time_per_iteration": 2.673706293106079 }, { "auxiliary_loss_clip": 0.01175381, "auxiliary_loss_mlp": 0.01029213, "balance_loss_clip": 1.05606055, "balance_loss_mlp": 1.022084, "epoch": 0.4284254193470811, "flos": 26359509749760.0, "grad_norm": 1.80123738333031, "language_loss": 0.77755743, "learning_rate": 2.5535373559125855e-06, "loss": 0.79960334, "num_input_tokens_seen": 76840255, "step": 3563, "time_per_iteration": 2.7949724197387695 }, { "auxiliary_loss_clip": 0.01170515, "auxiliary_loss_mlp": 0.01032328, "balance_loss_clip": 1.05617392, "balance_loss_mlp": 1.02421546, "epoch": 0.42854566223772017, "flos": 29714248379520.0, "grad_norm": 2.1617338046789545, "language_loss": 0.81781137, "learning_rate": 2.552788771600057e-06, "loss": 0.83983982, "num_input_tokens_seen": 76860565, "step": 3564, "time_per_iteration": 2.8635599613189697 }, { "auxiliary_loss_clip": 0.01181313, "auxiliary_loss_mlp": 0.01024843, "balance_loss_clip": 1.05748618, "balance_loss_mlp": 1.01682639, "epoch": 0.4286659051283593, "flos": 22018161277440.0, "grad_norm": 2.0445637675729316, "language_loss": 0.81944609, "learning_rate": 2.5520401034299118e-06, "loss": 0.84150755, "num_input_tokens_seen": 76878325, "step": 3565, "time_per_iteration": 2.8505759239196777 }, { "auxiliary_loss_clip": 0.01184688, "auxiliary_loss_mlp": 0.01029694, "balance_loss_clip": 1.05478132, "balance_loss_mlp": 1.02126539, "epoch": 0.4287861480189984, "flos": 13334422838400.0, "grad_norm": 2.339734481051752, "language_loss": 0.87896091, "learning_rate": 2.551291351515722e-06, "loss": 0.90110469, "num_input_tokens_seen": 76895340, "step": 3566, "time_per_iteration": 2.682499885559082 }, { "auxiliary_loss_clip": 0.01172299, "auxiliary_loss_mlp": 0.01056418, "balance_loss_clip": 1.05239093, "balance_loss_mlp": 1.0194813, "epoch": 0.42890639090963745, "flos": 26651535321600.0, "grad_norm": 1.9098812729588284, "language_loss": 0.85881799, "learning_rate": 2.5505425159710726e-06, "loss": 0.88110518, "num_input_tokens_seen": 76915150, "step": 3567, "time_per_iteration": 2.8649539947509766 }, { "auxiliary_loss_clip": 0.01187529, "auxiliary_loss_mlp": 0.01053192, "balance_loss_clip": 1.05633962, "balance_loss_mlp": 1.0163641, "epoch": 0.42902663380027656, "flos": 24055768091520.0, "grad_norm": 2.0938531389025528, "language_loss": 0.83118176, "learning_rate": 2.549793596909561e-06, "loss": 0.853589, "num_input_tokens_seen": 76933770, "step": 3568, "time_per_iteration": 2.8418235778808594 }, { "auxiliary_loss_clip": 0.0117264, "auxiliary_loss_mlp": 0.01028097, "balance_loss_clip": 1.05230498, "balance_loss_mlp": 1.0204109, "epoch": 0.42914687669091567, "flos": 15632561975040.0, "grad_norm": 4.317484473352192, "language_loss": 0.65815258, "learning_rate": 2.5490445944447976e-06, "loss": 0.68015993, "num_input_tokens_seen": 76952265, "step": 3569, "time_per_iteration": 2.8027942180633545 }, { "auxiliary_loss_clip": 0.01180629, "auxiliary_loss_mlp": 0.01031274, "balance_loss_clip": 1.05490136, "balance_loss_mlp": 1.02294087, "epoch": 0.4292671195815547, "flos": 31467802440960.0, "grad_norm": 3.118090521779586, "language_loss": 0.65906119, "learning_rate": 2.548295508690406e-06, "loss": 0.68118024, "num_input_tokens_seen": 76973560, "step": 3570, "time_per_iteration": 2.8725268840789795 }, { "auxiliary_loss_clip": 0.01183304, "auxiliary_loss_mlp": 0.01035547, "balance_loss_clip": 1.05233467, "balance_loss_mlp": 1.02711308, "epoch": 0.42938736247219383, "flos": 30257756046720.0, "grad_norm": 1.8404513502672533, "language_loss": 0.7613588, "learning_rate": 2.5475463397600217e-06, "loss": 0.78354728, "num_input_tokens_seen": 76993640, "step": 3571, "time_per_iteration": 2.868726968765259 }, { "auxiliary_loss_clip": 0.01188898, "auxiliary_loss_mlp": 0.01039335, "balance_loss_clip": 1.05607772, "balance_loss_mlp": 1.03109717, "epoch": 0.42950760536283294, "flos": 29349683291520.0, "grad_norm": 2.8815075568287556, "language_loss": 0.78169048, "learning_rate": 2.546797087767293e-06, "loss": 0.80397284, "num_input_tokens_seen": 77013765, "step": 3572, "time_per_iteration": 2.8862850666046143 }, { "auxiliary_loss_clip": 0.01170084, "auxiliary_loss_mlp": 0.01025338, "balance_loss_clip": 1.05535328, "balance_loss_mlp": 1.01739836, "epoch": 0.429627848253472, "flos": 26869943969280.0, "grad_norm": 2.7098788595619743, "language_loss": 0.87135518, "learning_rate": 2.546047752825881e-06, "loss": 0.89330941, "num_input_tokens_seen": 77034370, "step": 3573, "time_per_iteration": 2.9504997730255127 }, { "auxiliary_loss_clip": 0.01175427, "auxiliary_loss_mlp": 0.01026179, "balance_loss_clip": 1.05322742, "balance_loss_mlp": 1.01812625, "epoch": 0.4297480911441111, "flos": 13881270470400.0, "grad_norm": 2.8498192956491537, "language_loss": 0.9331674, "learning_rate": 2.5452983350494595e-06, "loss": 0.95518351, "num_input_tokens_seen": 77049925, "step": 3574, "time_per_iteration": 2.89965558052063 }, { "auxiliary_loss_clip": 0.01183292, "auxiliary_loss_mlp": 0.01057729, "balance_loss_clip": 1.05489409, "balance_loss_mlp": 1.01769042, "epoch": 0.4298683340347502, "flos": 20741141975040.0, "grad_norm": 2.4811988705907715, "language_loss": 0.65117419, "learning_rate": 2.544548834551713e-06, "loss": 0.6735844, "num_input_tokens_seen": 77068930, "step": 3575, "time_per_iteration": 2.8039727210998535 }, { "auxiliary_loss_clip": 0.01174931, "auxiliary_loss_mlp": 0.01051365, "balance_loss_clip": 1.05534482, "balance_loss_mlp": 1.01578379, "epoch": 0.4299885769253893, "flos": 20882126856960.0, "grad_norm": 3.271969184139374, "language_loss": 0.94296074, "learning_rate": 2.5437992514463424e-06, "loss": 0.96522379, "num_input_tokens_seen": 77082255, "step": 3576, "time_per_iteration": 2.868760585784912 }, { "auxiliary_loss_clip": 0.01181128, "auxiliary_loss_mlp": 0.01031925, "balance_loss_clip": 1.05504537, "balance_loss_mlp": 1.02335358, "epoch": 0.4301088198160284, "flos": 25484618183040.0, "grad_norm": 1.7695021246132145, "language_loss": 0.8825196, "learning_rate": 2.5430495858470565e-06, "loss": 0.90465009, "num_input_tokens_seen": 77101725, "step": 3577, "time_per_iteration": 2.8285932540893555 }, { "auxiliary_loss_clip": 0.01181565, "auxiliary_loss_mlp": 0.01024974, "balance_loss_clip": 1.05560017, "balance_loss_mlp": 1.016868, "epoch": 0.43022906270666744, "flos": 18259427404800.0, "grad_norm": 2.4724482856290804, "language_loss": 0.76984215, "learning_rate": 2.54229983786758e-06, "loss": 0.79190755, "num_input_tokens_seen": 77119670, "step": 3578, "time_per_iteration": 2.807699203491211 }, { "auxiliary_loss_clip": 0.01181193, "auxiliary_loss_mlp": 0.01031961, "balance_loss_clip": 1.0550611, "balance_loss_mlp": 1.02399194, "epoch": 0.43034930559730655, "flos": 23399536567680.0, "grad_norm": 2.787135342494135, "language_loss": 0.85090673, "learning_rate": 2.541550007621651e-06, "loss": 0.87303829, "num_input_tokens_seen": 77138160, "step": 3579, "time_per_iteration": 2.9125404357910156 }, { "auxiliary_loss_clip": 0.0117858, "auxiliary_loss_mlp": 0.01028994, "balance_loss_clip": 1.05398107, "balance_loss_mlp": 1.02077472, "epoch": 0.43046954848794566, "flos": 28184382264960.0, "grad_norm": 2.5352065836116933, "language_loss": 0.80017519, "learning_rate": 2.5408000952230156e-06, "loss": 0.82225084, "num_input_tokens_seen": 77156950, "step": 3580, "time_per_iteration": 2.7983412742614746 }, { "auxiliary_loss_clip": 0.01185094, "auxiliary_loss_mlp": 0.01027518, "balance_loss_clip": 1.05356479, "balance_loss_mlp": 1.01947176, "epoch": 0.4305897913785847, "flos": 28580476515840.0, "grad_norm": 2.152853675447143, "language_loss": 0.90788639, "learning_rate": 2.5400501007854357e-06, "loss": 0.93001246, "num_input_tokens_seen": 77176395, "step": 3581, "time_per_iteration": 3.727186679840088 }, { "auxiliary_loss_clip": 0.01173634, "auxiliary_loss_mlp": 0.0102379, "balance_loss_clip": 1.05471611, "balance_loss_mlp": 1.01595187, "epoch": 0.43071003426922383, "flos": 20448721353600.0, "grad_norm": 1.7655208269989024, "language_loss": 0.75335974, "learning_rate": 2.539300024422685e-06, "loss": 0.775334, "num_input_tokens_seen": 77194340, "step": 3582, "time_per_iteration": 2.759444236755371 }, { "auxiliary_loss_clip": 0.01081301, "auxiliary_loss_mlp": 0.01009086, "balance_loss_clip": 1.02005458, "balance_loss_mlp": 1.00763178, "epoch": 0.43083027715986294, "flos": 51997969883520.0, "grad_norm": 0.7856723091790955, "language_loss": 0.60978484, "learning_rate": 2.538549866248549e-06, "loss": 0.63068873, "num_input_tokens_seen": 77249320, "step": 3583, "time_per_iteration": 4.01883339881897 }, { "auxiliary_loss_clip": 0.01182238, "auxiliary_loss_mlp": 0.01025065, "balance_loss_clip": 1.05419135, "balance_loss_mlp": 1.01686931, "epoch": 0.430950520050502, "flos": 16690885320960.0, "grad_norm": 2.0124645817067073, "language_loss": 0.81352109, "learning_rate": 2.5377996263768274e-06, "loss": 0.83559406, "num_input_tokens_seen": 77267400, "step": 3584, "time_per_iteration": 2.7372524738311768 }, { "auxiliary_loss_clip": 0.01179086, "auxiliary_loss_mlp": 0.0103198, "balance_loss_clip": 1.05385661, "balance_loss_mlp": 1.02380276, "epoch": 0.4310707629411411, "flos": 24608433726720.0, "grad_norm": 1.7135642416246983, "language_loss": 0.68159872, "learning_rate": 2.5370493049213293e-06, "loss": 0.70370936, "num_input_tokens_seen": 77287045, "step": 3585, "time_per_iteration": 3.615584135055542 }, { "auxiliary_loss_clip": 0.01157958, "auxiliary_loss_mlp": 0.01035031, "balance_loss_clip": 1.05368698, "balance_loss_mlp": 1.02713931, "epoch": 0.4311910058317802, "flos": 26432983019520.0, "grad_norm": 2.1584027457174653, "language_loss": 0.80147684, "learning_rate": 2.536298901995878e-06, "loss": 0.82340676, "num_input_tokens_seen": 77306255, "step": 3586, "time_per_iteration": 2.851494550704956 }, { "auxiliary_loss_clip": 0.0118045, "auxiliary_loss_mlp": 0.01023363, "balance_loss_clip": 1.05469847, "balance_loss_mlp": 1.01536405, "epoch": 0.43131124872241927, "flos": 25155891889920.0, "grad_norm": 1.7291808284267927, "language_loss": 0.80308115, "learning_rate": 2.535548417714311e-06, "loss": 0.82511932, "num_input_tokens_seen": 77325555, "step": 3587, "time_per_iteration": 3.7642674446105957 }, { "auxiliary_loss_clip": 0.01185388, "auxiliary_loss_mlp": 0.01027051, "balance_loss_clip": 1.05360556, "balance_loss_mlp": 1.0184257, "epoch": 0.4314314916130584, "flos": 21614812479360.0, "grad_norm": 1.5886693299585808, "language_loss": 0.87297165, "learning_rate": 2.534797852190474e-06, "loss": 0.895096, "num_input_tokens_seen": 77345735, "step": 3588, "time_per_iteration": 2.8106725215911865 }, { "auxiliary_loss_clip": 0.01180691, "auxiliary_loss_mlp": 0.01028189, "balance_loss_clip": 1.05415022, "balance_loss_mlp": 1.02017784, "epoch": 0.4315517345036975, "flos": 19275016544640.0, "grad_norm": 2.0523236335115156, "language_loss": 0.82034218, "learning_rate": 2.5340472055382283e-06, "loss": 0.84243095, "num_input_tokens_seen": 77361765, "step": 3589, "time_per_iteration": 2.7484078407287598 }, { "auxiliary_loss_clip": 0.01175469, "auxiliary_loss_mlp": 0.01026861, "balance_loss_clip": 1.05380738, "balance_loss_mlp": 1.01920807, "epoch": 0.43167197739433655, "flos": 24273853516800.0, "grad_norm": 2.868629338308043, "language_loss": 0.80895257, "learning_rate": 2.5332964778714468e-06, "loss": 0.83097589, "num_input_tokens_seen": 77378950, "step": 3590, "time_per_iteration": 2.7858266830444336 }, { "auxiliary_loss_clip": 0.01173226, "auxiliary_loss_mlp": 0.01031787, "balance_loss_clip": 1.05479431, "balance_loss_mlp": 1.02421165, "epoch": 0.43179222028497566, "flos": 16867816738560.0, "grad_norm": 5.635873864854633, "language_loss": 0.66182393, "learning_rate": 2.5325456693040123e-06, "loss": 0.68387401, "num_input_tokens_seen": 77396145, "step": 3591, "time_per_iteration": 2.784722089767456 }, { "auxiliary_loss_clip": 0.01186749, "auxiliary_loss_mlp": 0.01029802, "balance_loss_clip": 1.05300617, "balance_loss_mlp": 1.02085495, "epoch": 0.43191246317561477, "flos": 17639214243840.0, "grad_norm": 4.0503519722016375, "language_loss": 0.74987805, "learning_rate": 2.531794779949824e-06, "loss": 0.77204359, "num_input_tokens_seen": 77414045, "step": 3592, "time_per_iteration": 2.6976470947265625 }, { "auxiliary_loss_clip": 0.01164568, "auxiliary_loss_mlp": 0.0102917, "balance_loss_clip": 1.05201972, "balance_loss_mlp": 1.02151036, "epoch": 0.4320327060662538, "flos": 23878800760320.0, "grad_norm": 1.9918972334934533, "language_loss": 0.87971228, "learning_rate": 2.5310438099227903e-06, "loss": 0.90164965, "num_input_tokens_seen": 77431310, "step": 3593, "time_per_iteration": 2.721060276031494 }, { "auxiliary_loss_clip": 0.01085474, "auxiliary_loss_mlp": 0.01004524, "balance_loss_clip": 1.02049899, "balance_loss_mlp": 1.00317693, "epoch": 0.43215294895689293, "flos": 66394917959040.0, "grad_norm": 0.8134197784350028, "language_loss": 0.53331631, "learning_rate": 2.530292759336833e-06, "loss": 0.55421627, "num_input_tokens_seen": 77492045, "step": 3594, "time_per_iteration": 3.389080047607422 }, { "auxiliary_loss_clip": 0.01173899, "auxiliary_loss_mlp": 0.01026986, "balance_loss_clip": 1.05327189, "balance_loss_mlp": 1.01834893, "epoch": 0.432273191847532, "flos": 20594267262720.0, "grad_norm": 2.1969310763203094, "language_loss": 0.6950618, "learning_rate": 2.5295416283058855e-06, "loss": 0.71707058, "num_input_tokens_seen": 77510910, "step": 3595, "time_per_iteration": 2.65315842628479 }, { "auxiliary_loss_clip": 0.01173527, "auxiliary_loss_mlp": 0.01054687, "balance_loss_clip": 1.05338323, "balance_loss_mlp": 1.01731849, "epoch": 0.4323934347381711, "flos": 19282127437440.0, "grad_norm": 1.6150391725854971, "language_loss": 0.66206801, "learning_rate": 2.5287904169438943e-06, "loss": 0.68435013, "num_input_tokens_seen": 77530115, "step": 3596, "time_per_iteration": 2.7109432220458984 }, { "auxiliary_loss_clip": 0.01176889, "auxiliary_loss_mlp": 0.01032146, "balance_loss_clip": 1.05695021, "balance_loss_mlp": 1.0235275, "epoch": 0.4325136776288102, "flos": 21726315273600.0, "grad_norm": 3.695091888567236, "language_loss": 0.63987237, "learning_rate": 2.528039125364817e-06, "loss": 0.66196269, "num_input_tokens_seen": 77548920, "step": 3597, "time_per_iteration": 2.9139456748962402 }, { "auxiliary_loss_clip": 0.01175197, "auxiliary_loss_mlp": 0.0103055, "balance_loss_clip": 1.05221319, "balance_loss_mlp": 1.02262223, "epoch": 0.43263392051944927, "flos": 22340746344960.0, "grad_norm": 2.2835419289536727, "language_loss": 0.76319635, "learning_rate": 2.5272877536826246e-06, "loss": 0.78525382, "num_input_tokens_seen": 77567715, "step": 3598, "time_per_iteration": 2.7417545318603516 }, { "auxiliary_loss_clip": 0.01173588, "auxiliary_loss_mlp": 0.01030624, "balance_loss_clip": 1.05075133, "balance_loss_mlp": 1.02242851, "epoch": 0.4327541634100884, "flos": 29168406328320.0, "grad_norm": 2.2526665666646166, "language_loss": 0.70434523, "learning_rate": 2.5265363020112986e-06, "loss": 0.72638732, "num_input_tokens_seen": 77588035, "step": 3599, "time_per_iteration": 2.8755533695220947 }, { "auxiliary_loss_clip": 0.0117915, "auxiliary_loss_mlp": 0.01033518, "balance_loss_clip": 1.05399585, "balance_loss_mlp": 1.0243094, "epoch": 0.4328744063007275, "flos": 26067448264320.0, "grad_norm": 2.08040121262674, "language_loss": 0.83922637, "learning_rate": 2.5257847704648344e-06, "loss": 0.86135304, "num_input_tokens_seen": 77609265, "step": 3600, "time_per_iteration": 2.6826541423797607 }, { "auxiliary_loss_clip": 0.01184418, "auxiliary_loss_mlp": 0.01023386, "balance_loss_clip": 1.05443549, "balance_loss_mlp": 1.01573908, "epoch": 0.43299464919136654, "flos": 16581357774720.0, "grad_norm": 1.8720044775990003, "language_loss": 0.75400352, "learning_rate": 2.525033159157239e-06, "loss": 0.77608156, "num_input_tokens_seen": 77625580, "step": 3601, "time_per_iteration": 2.68205189704895 }, { "auxiliary_loss_clip": 0.01183534, "auxiliary_loss_mlp": 0.01031451, "balance_loss_clip": 1.0559026, "balance_loss_mlp": 1.02248597, "epoch": 0.43311489208200565, "flos": 16107265140480.0, "grad_norm": 1.875751699045195, "language_loss": 0.77159089, "learning_rate": 2.52428146820253e-06, "loss": 0.79374069, "num_input_tokens_seen": 77643835, "step": 3602, "time_per_iteration": 2.7030789852142334 }, { "auxiliary_loss_clip": 0.01176552, "auxiliary_loss_mlp": 0.01036191, "balance_loss_clip": 1.05528021, "balance_loss_mlp": 1.02830541, "epoch": 0.43323513497264476, "flos": 22930220442240.0, "grad_norm": 1.978339376304102, "language_loss": 0.82032287, "learning_rate": 2.52352969771474e-06, "loss": 0.84245032, "num_input_tokens_seen": 77663060, "step": 3603, "time_per_iteration": 2.844158887863159 }, { "auxiliary_loss_clip": 0.01182047, "auxiliary_loss_mlp": 0.01028352, "balance_loss_clip": 1.05516577, "balance_loss_mlp": 1.02105021, "epoch": 0.4333553778632838, "flos": 25299031587840.0, "grad_norm": 2.0456212993301164, "language_loss": 0.88536417, "learning_rate": 2.5227778478079106e-06, "loss": 0.9074682, "num_input_tokens_seen": 77682470, "step": 3604, "time_per_iteration": 2.834348678588867 }, { "auxiliary_loss_clip": 0.01177899, "auxiliary_loss_mlp": 0.0102754, "balance_loss_clip": 1.05345738, "balance_loss_mlp": 1.01933861, "epoch": 0.43347562075392293, "flos": 19387165783680.0, "grad_norm": 1.5701548804800958, "language_loss": 0.77002484, "learning_rate": 2.522025918596098e-06, "loss": 0.79207921, "num_input_tokens_seen": 77700770, "step": 3605, "time_per_iteration": 2.7842230796813965 }, { "auxiliary_loss_clip": 0.01179605, "auxiliary_loss_mlp": 0.01025117, "balance_loss_clip": 1.05230367, "balance_loss_mlp": 1.01792824, "epoch": 0.43359586364456204, "flos": 26325969425280.0, "grad_norm": 1.6987328282154033, "language_loss": 0.65496576, "learning_rate": 2.521273910193368e-06, "loss": 0.67701298, "num_input_tokens_seen": 77723950, "step": 3606, "time_per_iteration": 2.7991995811462402 }, { "auxiliary_loss_clip": 0.01189434, "auxiliary_loss_mlp": 0.01028739, "balance_loss_clip": 1.056041, "balance_loss_mlp": 1.02062094, "epoch": 0.4337161065352011, "flos": 15989261984640.0, "grad_norm": 2.3084314117761795, "language_loss": 0.86949515, "learning_rate": 2.5205218227138006e-06, "loss": 0.89167696, "num_input_tokens_seen": 77736905, "step": 3607, "time_per_iteration": 2.616363525390625 }, { "auxiliary_loss_clip": 0.01182766, "auxiliary_loss_mlp": 0.01026652, "balance_loss_clip": 1.05174041, "balance_loss_mlp": 1.01894486, "epoch": 0.4338363494258402, "flos": 20224710184320.0, "grad_norm": 2.257515834223676, "language_loss": 0.78984296, "learning_rate": 2.519769656271486e-06, "loss": 0.81193721, "num_input_tokens_seen": 77754325, "step": 3608, "time_per_iteration": 3.770101547241211 }, { "auxiliary_loss_clip": 0.0116305, "auxiliary_loss_mlp": 0.01027453, "balance_loss_clip": 1.05315614, "balance_loss_mlp": 1.01973414, "epoch": 0.43395659231647926, "flos": 20083904870400.0, "grad_norm": 2.095074586916477, "language_loss": 0.67674136, "learning_rate": 2.5190174109805285e-06, "loss": 0.69864631, "num_input_tokens_seen": 77774150, "step": 3609, "time_per_iteration": 3.698371410369873 }, { "auxiliary_loss_clip": 0.01168655, "auxiliary_loss_mlp": 0.01030045, "balance_loss_clip": 1.05129719, "balance_loss_mlp": 1.02205205, "epoch": 0.43407683520711837, "flos": 19901801894400.0, "grad_norm": 7.194747962950033, "language_loss": 0.63856399, "learning_rate": 2.518265086955042e-06, "loss": 0.66055101, "num_input_tokens_seen": 77791870, "step": 3610, "time_per_iteration": 2.6537535190582275 }, { "auxiliary_loss_clip": 0.01182032, "auxiliary_loss_mlp": 0.01026657, "balance_loss_clip": 1.05113435, "balance_loss_mlp": 1.018718, "epoch": 0.4341970780977575, "flos": 23108732058240.0, "grad_norm": 4.551384677647918, "language_loss": 0.83262777, "learning_rate": 2.5175126843091534e-06, "loss": 0.85471463, "num_input_tokens_seen": 77811240, "step": 3611, "time_per_iteration": 3.6233408451080322 }, { "auxiliary_loss_clip": 0.01177446, "auxiliary_loss_mlp": 0.0103196, "balance_loss_clip": 1.05223656, "balance_loss_mlp": 1.02389574, "epoch": 0.43431732098839654, "flos": 37408288406400.0, "grad_norm": 2.2046162008332435, "language_loss": 0.75585145, "learning_rate": 2.5167602031570034e-06, "loss": 0.77794552, "num_input_tokens_seen": 77831425, "step": 3612, "time_per_iteration": 2.8099875450134277 }, { "auxiliary_loss_clip": 0.01185889, "auxiliary_loss_mlp": 0.01029932, "balance_loss_clip": 1.0546031, "balance_loss_mlp": 1.0214591, "epoch": 0.43443756387903565, "flos": 31868206323840.0, "grad_norm": 1.6820983383088366, "language_loss": 0.73549128, "learning_rate": 2.51600764361274e-06, "loss": 0.75764942, "num_input_tokens_seen": 77852950, "step": 3613, "time_per_iteration": 3.8816092014312744 }, { "auxiliary_loss_clip": 0.01183744, "auxiliary_loss_mlp": 0.01030635, "balance_loss_clip": 1.05286944, "balance_loss_mlp": 1.02277911, "epoch": 0.43455780676967476, "flos": 23477139901440.0, "grad_norm": 2.2385327556796617, "language_loss": 0.78776634, "learning_rate": 2.5152550057905283e-06, "loss": 0.80991018, "num_input_tokens_seen": 77872840, "step": 3614, "time_per_iteration": 2.706997871398926 }, { "auxiliary_loss_clip": 0.01184729, "auxiliary_loss_mlp": 0.01064557, "balance_loss_clip": 1.05560446, "balance_loss_mlp": 1.02689147, "epoch": 0.4346780496603138, "flos": 24207060176640.0, "grad_norm": 2.31505945482274, "language_loss": 0.77202296, "learning_rate": 2.5145022898045415e-06, "loss": 0.79451585, "num_input_tokens_seen": 77892025, "step": 3615, "time_per_iteration": 2.6352696418762207 }, { "auxiliary_loss_clip": 0.01183052, "auxiliary_loss_mlp": 0.01030178, "balance_loss_clip": 1.05497289, "balance_loss_mlp": 1.0213623, "epoch": 0.4347982925509529, "flos": 17092366611840.0, "grad_norm": 2.1403761377311685, "language_loss": 0.89687645, "learning_rate": 2.5137494957689664e-06, "loss": 0.91900873, "num_input_tokens_seen": 77907635, "step": 3616, "time_per_iteration": 2.6949660778045654 }, { "auxiliary_loss_clip": 0.0108459, "auxiliary_loss_mlp": 0.01009297, "balance_loss_clip": 1.02182317, "balance_loss_mlp": 1.00791407, "epoch": 0.43491853544159204, "flos": 60945544696320.0, "grad_norm": 0.7657535768169875, "language_loss": 0.57335055, "learning_rate": 2.5129966237980016e-06, "loss": 0.59428942, "num_input_tokens_seen": 77970630, "step": 3617, "time_per_iteration": 3.328266143798828 }, { "auxiliary_loss_clip": 0.01177112, "auxiliary_loss_mlp": 0.0102704, "balance_loss_clip": 1.0527823, "balance_loss_mlp": 1.01942205, "epoch": 0.4350387783322311, "flos": 21944652094080.0, "grad_norm": 2.0778147122721538, "language_loss": 0.78458703, "learning_rate": 2.512243674005857e-06, "loss": 0.80662847, "num_input_tokens_seen": 77989995, "step": 3618, "time_per_iteration": 2.843963146209717 }, { "auxiliary_loss_clip": 0.01171941, "auxiliary_loss_mlp": 0.01029044, "balance_loss_clip": 1.05752826, "balance_loss_mlp": 1.0206337, "epoch": 0.4351590212228702, "flos": 25082705928960.0, "grad_norm": 2.1870118060197576, "language_loss": 0.85893762, "learning_rate": 2.5114906465067537e-06, "loss": 0.88094747, "num_input_tokens_seen": 78010980, "step": 3619, "time_per_iteration": 2.867716073989868 }, { "auxiliary_loss_clip": 0.01181932, "auxiliary_loss_mlp": 0.01024493, "balance_loss_clip": 1.05194056, "balance_loss_mlp": 1.01663697, "epoch": 0.4352792641135093, "flos": 21506541909120.0, "grad_norm": 2.1076315524338547, "language_loss": 0.75371122, "learning_rate": 2.5107375414149264e-06, "loss": 0.77577543, "num_input_tokens_seen": 78030225, "step": 3620, "time_per_iteration": 2.7458107471466064 }, { "auxiliary_loss_clip": 0.01165473, "auxiliary_loss_mlp": 0.01021541, "balance_loss_clip": 1.05174363, "balance_loss_mlp": 1.01341057, "epoch": 0.43539950700414837, "flos": 16253457494400.0, "grad_norm": 2.6223910201443936, "language_loss": 0.71768773, "learning_rate": 2.5099843588446197e-06, "loss": 0.73955786, "num_input_tokens_seen": 78048545, "step": 3621, "time_per_iteration": 2.834620714187622 }, { "auxiliary_loss_clip": 0.01176873, "auxiliary_loss_mlp": 0.01026603, "balance_loss_clip": 1.05464554, "balance_loss_mlp": 1.01911724, "epoch": 0.4355197498947875, "flos": 16691819074560.0, "grad_norm": 1.6792809902973935, "language_loss": 0.61721009, "learning_rate": 2.509231098910091e-06, "loss": 0.63924485, "num_input_tokens_seen": 78068415, "step": 3622, "time_per_iteration": 2.7550041675567627 }, { "auxiliary_loss_clip": 0.01172803, "auxiliary_loss_mlp": 0.01029008, "balance_loss_clip": 1.05478466, "balance_loss_mlp": 1.02088404, "epoch": 0.4356399927854266, "flos": 16362733645440.0, "grad_norm": 2.2473446812371174, "language_loss": 0.74835634, "learning_rate": 2.508477761725611e-06, "loss": 0.77037442, "num_input_tokens_seen": 78086690, "step": 3623, "time_per_iteration": 2.759783983230591 }, { "auxiliary_loss_clip": 0.01183349, "auxiliary_loss_mlp": 0.01029969, "balance_loss_clip": 1.0534575, "balance_loss_mlp": 1.02167201, "epoch": 0.43576023567606564, "flos": 17202037812480.0, "grad_norm": 1.8836203012354427, "language_loss": 0.80682105, "learning_rate": 2.507724347405458e-06, "loss": 0.82895422, "num_input_tokens_seen": 78104640, "step": 3624, "time_per_iteration": 2.6424901485443115 }, { "auxiliary_loss_clip": 0.01165418, "auxiliary_loss_mlp": 0.01031256, "balance_loss_clip": 1.05140412, "balance_loss_mlp": 1.02348673, "epoch": 0.43588047856670475, "flos": 15917656222080.0, "grad_norm": 2.016746016489333, "language_loss": 0.82144684, "learning_rate": 2.5069708560639243e-06, "loss": 0.84341359, "num_input_tokens_seen": 78122550, "step": 3625, "time_per_iteration": 2.7041873931884766 }, { "auxiliary_loss_clip": 0.01177358, "auxiliary_loss_mlp": 0.01030451, "balance_loss_clip": 1.05478811, "balance_loss_mlp": 1.02187991, "epoch": 0.4360007214573438, "flos": 23659566099840.0, "grad_norm": 3.807153998492385, "language_loss": 0.61368757, "learning_rate": 2.5062172878153158e-06, "loss": 0.63576567, "num_input_tokens_seen": 78141825, "step": 3626, "time_per_iteration": 2.7006032466888428 }, { "auxiliary_loss_clip": 0.0117399, "auxiliary_loss_mlp": 0.01028547, "balance_loss_clip": 1.05211449, "balance_loss_mlp": 1.02105474, "epoch": 0.4361209643479829, "flos": 21978767036160.0, "grad_norm": 2.1140461342792642, "language_loss": 0.87507164, "learning_rate": 2.505463642773947e-06, "loss": 0.89709705, "num_input_tokens_seen": 78161790, "step": 3627, "time_per_iteration": 2.7317070960998535 }, { "auxiliary_loss_clip": 0.0117237, "auxiliary_loss_mlp": 0.01052856, "balance_loss_clip": 1.05184841, "balance_loss_mlp": 1.01600099, "epoch": 0.43624120723862203, "flos": 17420159151360.0, "grad_norm": 2.37301760895529, "language_loss": 0.74831581, "learning_rate": 2.504709921054146e-06, "loss": 0.77056801, "num_input_tokens_seen": 78178605, "step": 3628, "time_per_iteration": 2.703310251235962 }, { "auxiliary_loss_clip": 0.01172489, "auxiliary_loss_mlp": 0.01025887, "balance_loss_clip": 1.05391681, "balance_loss_mlp": 1.01748252, "epoch": 0.4363614501292611, "flos": 17895293280000.0, "grad_norm": 2.488491134563911, "language_loss": 0.84238434, "learning_rate": 2.50395612277025e-06, "loss": 0.86436808, "num_input_tokens_seen": 78194460, "step": 3629, "time_per_iteration": 2.677881956100464 }, { "auxiliary_loss_clip": 0.01178953, "auxiliary_loss_mlp": 0.01023854, "balance_loss_clip": 1.05125093, "balance_loss_mlp": 1.01580775, "epoch": 0.4364816930199002, "flos": 20302888135680.0, "grad_norm": 2.6597529499222996, "language_loss": 0.72684389, "learning_rate": 2.503202248036612e-06, "loss": 0.74887192, "num_input_tokens_seen": 78213315, "step": 3630, "time_per_iteration": 2.7162864208221436 }, { "auxiliary_loss_clip": 0.01181973, "auxiliary_loss_mlp": 0.01022651, "balance_loss_clip": 1.05265427, "balance_loss_mlp": 1.0142765, "epoch": 0.4366019359105393, "flos": 24061334699520.0, "grad_norm": 3.061444782788585, "language_loss": 0.73330176, "learning_rate": 2.5024482969675927e-06, "loss": 0.75534797, "num_input_tokens_seen": 78233270, "step": 3631, "time_per_iteration": 2.645726442337036 }, { "auxiliary_loss_clip": 0.01173376, "auxiliary_loss_mlp": 0.01019856, "balance_loss_clip": 1.05208933, "balance_loss_mlp": 1.01249743, "epoch": 0.43672217880117836, "flos": 21754109422080.0, "grad_norm": 3.0826151663735124, "language_loss": 0.84257847, "learning_rate": 2.501694269677566e-06, "loss": 0.86451077, "num_input_tokens_seen": 78251040, "step": 3632, "time_per_iteration": 2.707653045654297 }, { "auxiliary_loss_clip": 0.01181349, "auxiliary_loss_mlp": 0.01023907, "balance_loss_clip": 1.05120957, "balance_loss_mlp": 1.01605129, "epoch": 0.4368424216918175, "flos": 18035200753920.0, "grad_norm": 2.203549394591567, "language_loss": 0.80777681, "learning_rate": 2.500940166280918e-06, "loss": 0.82982934, "num_input_tokens_seen": 78269470, "step": 3633, "time_per_iteration": 3.6064889430999756 }, { "auxiliary_loss_clip": 0.01177377, "auxiliary_loss_mlp": 0.01025716, "balance_loss_clip": 1.05283141, "balance_loss_mlp": 1.01849246, "epoch": 0.4369626645824566, "flos": 25447127362560.0, "grad_norm": 1.9240614780225553, "language_loss": 0.7947312, "learning_rate": 2.500185986892045e-06, "loss": 0.81676215, "num_input_tokens_seen": 78288955, "step": 3634, "time_per_iteration": 2.7341976165771484 }, { "auxiliary_loss_clip": 0.01176609, "auxiliary_loss_mlp": 0.01029504, "balance_loss_clip": 1.05285132, "balance_loss_mlp": 1.02121866, "epoch": 0.43708290747309564, "flos": 25302694775040.0, "grad_norm": 2.2689544052834933, "language_loss": 0.77058476, "learning_rate": 2.499431731625355e-06, "loss": 0.79264593, "num_input_tokens_seen": 78307980, "step": 3635, "time_per_iteration": 3.6192970275878906 }, { "auxiliary_loss_clip": 0.0118353, "auxiliary_loss_mlp": 0.01033098, "balance_loss_clip": 1.05299783, "balance_loss_mlp": 1.02482474, "epoch": 0.43720315036373475, "flos": 31575103344000.0, "grad_norm": 2.9344162611422773, "language_loss": 0.79576111, "learning_rate": 2.4986774005952686e-06, "loss": 0.81792736, "num_input_tokens_seen": 78330355, "step": 3636, "time_per_iteration": 2.7591700553894043 }, { "auxiliary_loss_clip": 0.01174686, "auxiliary_loss_mlp": 0.0102401, "balance_loss_clip": 1.05223417, "balance_loss_mlp": 1.01638627, "epoch": 0.43732339325437386, "flos": 23112000195840.0, "grad_norm": 3.3004896746397567, "language_loss": 0.84597075, "learning_rate": 2.4979229939162166e-06, "loss": 0.86795771, "num_input_tokens_seen": 78349135, "step": 3637, "time_per_iteration": 3.6027979850769043 }, { "auxiliary_loss_clip": 0.01177248, "auxiliary_loss_mlp": 0.01024409, "balance_loss_clip": 1.05424583, "balance_loss_mlp": 1.01661825, "epoch": 0.4374436361450129, "flos": 27746272080000.0, "grad_norm": 1.7437437138247511, "language_loss": 0.80357754, "learning_rate": 2.4971685117026433e-06, "loss": 0.82559419, "num_input_tokens_seen": 78368900, "step": 3638, "time_per_iteration": 2.7449135780334473 }, { "auxiliary_loss_clip": 0.01181413, "auxiliary_loss_mlp": 0.01023277, "balance_loss_clip": 1.0532012, "balance_loss_mlp": 1.01588285, "epoch": 0.437563879035652, "flos": 24172370616960.0, "grad_norm": 4.356191688869664, "language_loss": 0.76718187, "learning_rate": 2.4964139540690018e-06, "loss": 0.7892288, "num_input_tokens_seen": 78392235, "step": 3639, "time_per_iteration": 2.771782636642456 }, { "auxiliary_loss_clip": 0.01175286, "auxiliary_loss_mlp": 0.01027048, "balance_loss_clip": 1.05258703, "balance_loss_mlp": 1.01831555, "epoch": 0.4376841219262911, "flos": 23477211728640.0, "grad_norm": 2.062292181068267, "language_loss": 0.73195314, "learning_rate": 2.495659321129758e-06, "loss": 0.75397652, "num_input_tokens_seen": 78409980, "step": 3640, "time_per_iteration": 3.701288938522339 }, { "auxiliary_loss_clip": 0.0117814, "auxiliary_loss_mlp": 0.01031242, "balance_loss_clip": 1.05376577, "balance_loss_mlp": 1.02316523, "epoch": 0.4378043648169302, "flos": 25447809720960.0, "grad_norm": 1.8825542274975529, "language_loss": 0.75329167, "learning_rate": 2.494904612999389e-06, "loss": 0.7753855, "num_input_tokens_seen": 78428690, "step": 3641, "time_per_iteration": 2.704268455505371 }, { "auxiliary_loss_clip": 0.01085815, "auxiliary_loss_mlp": 0.01002692, "balance_loss_clip": 1.02320313, "balance_loss_mlp": 1.00125563, "epoch": 0.4379246077075693, "flos": 53914056986880.0, "grad_norm": 0.743640615478664, "language_loss": 0.56492144, "learning_rate": 2.4941498297923843e-06, "loss": 0.58580655, "num_input_tokens_seen": 78489260, "step": 3642, "time_per_iteration": 3.3086206912994385 }, { "auxiliary_loss_clip": 0.011801, "auxiliary_loss_mlp": 0.01022998, "balance_loss_clip": 1.05547822, "balance_loss_mlp": 1.01514769, "epoch": 0.43804485059820836, "flos": 20588305605120.0, "grad_norm": 2.896448628617234, "language_loss": 0.69880307, "learning_rate": 2.4933949716232424e-06, "loss": 0.72083402, "num_input_tokens_seen": 78506785, "step": 3643, "time_per_iteration": 2.813662052154541 }, { "auxiliary_loss_clip": 0.0117127, "auxiliary_loss_mlp": 0.01033004, "balance_loss_clip": 1.05265999, "balance_loss_mlp": 1.02515996, "epoch": 0.43816509348884747, "flos": 23876214981120.0, "grad_norm": 2.092945116941535, "language_loss": 0.73894477, "learning_rate": 2.492640038606476e-06, "loss": 0.76098752, "num_input_tokens_seen": 78525150, "step": 3644, "time_per_iteration": 2.7123043537139893 }, { "auxiliary_loss_clip": 0.01179077, "auxiliary_loss_mlp": 0.01023234, "balance_loss_clip": 1.0511179, "balance_loss_mlp": 1.0151751, "epoch": 0.4382853363794866, "flos": 14684448533760.0, "grad_norm": 1.940693627326946, "language_loss": 0.78381658, "learning_rate": 2.491885030856608e-06, "loss": 0.80583966, "num_input_tokens_seen": 78543245, "step": 3645, "time_per_iteration": 2.7310197353363037 }, { "auxiliary_loss_clip": 0.01180334, "auxiliary_loss_mlp": 0.01030567, "balance_loss_clip": 1.05516505, "balance_loss_mlp": 1.02312231, "epoch": 0.43840557927012563, "flos": 17165301177600.0, "grad_norm": 2.5066659541155283, "language_loss": 0.83206427, "learning_rate": 2.4911299484881713e-06, "loss": 0.85417324, "num_input_tokens_seen": 78560775, "step": 3646, "time_per_iteration": 2.7287464141845703 }, { "auxiliary_loss_clip": 0.01170227, "auxiliary_loss_mlp": 0.01026309, "balance_loss_clip": 1.05140793, "balance_loss_mlp": 1.01859665, "epoch": 0.43852582216076474, "flos": 19390685316480.0, "grad_norm": 2.844128062180726, "language_loss": 0.81097394, "learning_rate": 2.490374791615712e-06, "loss": 0.83293927, "num_input_tokens_seen": 78580800, "step": 3647, "time_per_iteration": 2.7348432540893555 }, { "auxiliary_loss_clip": 0.01189084, "auxiliary_loss_mlp": 0.01060088, "balance_loss_clip": 1.0557189, "balance_loss_mlp": 1.02201891, "epoch": 0.43864606505140386, "flos": 18075133699200.0, "grad_norm": 2.612490145587614, "language_loss": 0.77862412, "learning_rate": 2.4896195603537867e-06, "loss": 0.80111581, "num_input_tokens_seen": 78595410, "step": 3648, "time_per_iteration": 2.6504058837890625 }, { "auxiliary_loss_clip": 0.01169581, "auxiliary_loss_mlp": 0.01032112, "balance_loss_clip": 1.05765033, "balance_loss_mlp": 1.0240953, "epoch": 0.4387663079420429, "flos": 19644896845440.0, "grad_norm": 2.2440109971680693, "language_loss": 0.73706263, "learning_rate": 2.488864254816964e-06, "loss": 0.75907958, "num_input_tokens_seen": 78614100, "step": 3649, "time_per_iteration": 2.8199660778045654 }, { "auxiliary_loss_clip": 0.01182179, "auxiliary_loss_mlp": 0.01034276, "balance_loss_clip": 1.05388916, "balance_loss_mlp": 1.0248524, "epoch": 0.438886550832682, "flos": 19719339782400.0, "grad_norm": 2.5817829539309254, "language_loss": 0.68201077, "learning_rate": 2.4881088751198218e-06, "loss": 0.70417535, "num_input_tokens_seen": 78632260, "step": 3650, "time_per_iteration": 2.644444704055786 }, { "auxiliary_loss_clip": 0.01180483, "auxiliary_loss_mlp": 0.01027048, "balance_loss_clip": 1.05381095, "balance_loss_mlp": 1.01869166, "epoch": 0.43900679372332113, "flos": 14536675981440.0, "grad_norm": 3.0673128421378393, "language_loss": 0.65014267, "learning_rate": 2.4873534213769517e-06, "loss": 0.67221802, "num_input_tokens_seen": 78647490, "step": 3651, "time_per_iteration": 2.770263195037842 }, { "auxiliary_loss_clip": 0.01168036, "auxiliary_loss_mlp": 0.01030657, "balance_loss_clip": 1.05495596, "balance_loss_mlp": 1.02266717, "epoch": 0.4391270366139602, "flos": 24056234968320.0, "grad_norm": 1.9886707765005207, "language_loss": 0.71953106, "learning_rate": 2.4865978937029547e-06, "loss": 0.74151802, "num_input_tokens_seen": 78666470, "step": 3652, "time_per_iteration": 2.835165500640869 }, { "auxiliary_loss_clip": 0.01164471, "auxiliary_loss_mlp": 0.01035602, "balance_loss_clip": 1.05562925, "balance_loss_mlp": 1.02705479, "epoch": 0.4392472795045993, "flos": 31538510363520.0, "grad_norm": 2.010987830821423, "language_loss": 0.66401196, "learning_rate": 2.485842292212445e-06, "loss": 0.68601274, "num_input_tokens_seen": 78687685, "step": 3653, "time_per_iteration": 2.9029541015625 }, { "auxiliary_loss_clip": 0.01188198, "auxiliary_loss_mlp": 0.01030195, "balance_loss_clip": 1.05682445, "balance_loss_mlp": 1.02223814, "epoch": 0.4393675223952384, "flos": 14866300114560.0, "grad_norm": 1.8997061949951757, "language_loss": 0.80628389, "learning_rate": 2.485086617020045e-06, "loss": 0.82846791, "num_input_tokens_seen": 78706180, "step": 3654, "time_per_iteration": 2.704324722290039 }, { "auxiliary_loss_clip": 0.01169248, "auxiliary_loss_mlp": 0.01030926, "balance_loss_clip": 1.05158103, "balance_loss_mlp": 1.02273035, "epoch": 0.43948776528587746, "flos": 14825900292480.0, "grad_norm": 2.114947654816444, "language_loss": 0.82198036, "learning_rate": 2.4843308682403903e-06, "loss": 0.8439821, "num_input_tokens_seen": 78723095, "step": 3655, "time_per_iteration": 2.6867001056671143 }, { "auxiliary_loss_clip": 0.011824, "auxiliary_loss_mlp": 0.01025763, "balance_loss_clip": 1.05191755, "balance_loss_mlp": 1.01820493, "epoch": 0.4396080081765166, "flos": 13914523486080.0, "grad_norm": 1.8699479600462612, "language_loss": 0.8284719, "learning_rate": 2.4835750459881294e-06, "loss": 0.85055357, "num_input_tokens_seen": 78739720, "step": 3656, "time_per_iteration": 2.6492819786071777 }, { "auxiliary_loss_clip": 0.01175019, "auxiliary_loss_mlp": 0.01029433, "balance_loss_clip": 1.0563221, "balance_loss_mlp": 1.02090955, "epoch": 0.43972825106715563, "flos": 18222978078720.0, "grad_norm": 2.023965354600933, "language_loss": 0.81521195, "learning_rate": 2.4828191503779177e-06, "loss": 0.83725649, "num_input_tokens_seen": 78757820, "step": 3657, "time_per_iteration": 2.839367151260376 }, { "auxiliary_loss_clip": 0.01172603, "auxiliary_loss_mlp": 0.01026532, "balance_loss_clip": 1.05163026, "balance_loss_mlp": 1.0186646, "epoch": 0.43984849395779474, "flos": 16873239692160.0, "grad_norm": 2.5478801485141154, "language_loss": 0.89356411, "learning_rate": 2.482063181524425e-06, "loss": 0.91555548, "num_input_tokens_seen": 78773720, "step": 3658, "time_per_iteration": 2.7599411010742188 }, { "auxiliary_loss_clip": 0.01187167, "auxiliary_loss_mlp": 0.01031658, "balance_loss_clip": 1.0550319, "balance_loss_mlp": 1.0230329, "epoch": 0.43996873684843385, "flos": 18691504104960.0, "grad_norm": 3.5456664332893895, "language_loss": 0.81194723, "learning_rate": 2.4813071395423307e-06, "loss": 0.83413547, "num_input_tokens_seen": 78791285, "step": 3659, "time_per_iteration": 3.617483615875244 }, { "auxiliary_loss_clip": 0.01179596, "auxiliary_loss_mlp": 0.01029803, "balance_loss_clip": 1.05183959, "balance_loss_mlp": 1.02114272, "epoch": 0.4400889797390729, "flos": 23653460787840.0, "grad_norm": 3.561562055506441, "language_loss": 0.64757544, "learning_rate": 2.4805510245463263e-06, "loss": 0.66966939, "num_input_tokens_seen": 78811440, "step": 3660, "time_per_iteration": 2.7554237842559814 }, { "auxiliary_loss_clip": 0.01182903, "auxiliary_loss_mlp": 0.01031165, "balance_loss_clip": 1.05480194, "balance_loss_mlp": 1.02304101, "epoch": 0.440209222629712, "flos": 23149203707520.0, "grad_norm": 1.9212950207381763, "language_loss": 0.60712814, "learning_rate": 2.4797948366511137e-06, "loss": 0.62926883, "num_input_tokens_seen": 78831150, "step": 3661, "time_per_iteration": 3.6172876358032227 }, { "auxiliary_loss_clip": 0.01174862, "auxiliary_loss_mlp": 0.01028864, "balance_loss_clip": 1.05463243, "balance_loss_mlp": 1.02025151, "epoch": 0.4403294655203511, "flos": 24823394668800.0, "grad_norm": 2.2424029724572194, "language_loss": 0.76599008, "learning_rate": 2.4790385759714055e-06, "loss": 0.78802735, "num_input_tokens_seen": 78850215, "step": 3662, "time_per_iteration": 2.886981725692749 }, { "auxiliary_loss_clip": 0.01175882, "auxiliary_loss_mlp": 0.01025373, "balance_loss_clip": 1.0529139, "balance_loss_mlp": 1.01738012, "epoch": 0.4404497084109902, "flos": 22565080736640.0, "grad_norm": 1.8885718436111596, "language_loss": 0.70995522, "learning_rate": 2.478282242621926e-06, "loss": 0.73196781, "num_input_tokens_seen": 78870675, "step": 3663, "time_per_iteration": 3.679041862487793 }, { "auxiliary_loss_clip": 0.01083963, "auxiliary_loss_mlp": 0.01000906, "balance_loss_clip": 1.02335143, "balance_loss_mlp": 0.99945211, "epoch": 0.4405699513016293, "flos": 64967073448320.0, "grad_norm": 0.8540406980094282, "language_loss": 0.59527135, "learning_rate": 2.477525836717411e-06, "loss": 0.61612004, "num_input_tokens_seen": 78938440, "step": 3664, "time_per_iteration": 3.422426462173462 }, { "auxiliary_loss_clip": 0.01180414, "auxiliary_loss_mlp": 0.01032451, "balance_loss_clip": 1.05363393, "balance_loss_mlp": 1.02370679, "epoch": 0.4406901941922684, "flos": 35661952978560.0, "grad_norm": 3.0934386878197317, "language_loss": 0.79929256, "learning_rate": 2.476769358372606e-06, "loss": 0.82142115, "num_input_tokens_seen": 78960090, "step": 3665, "time_per_iteration": 3.758594036102295 }, { "auxiliary_loss_clip": 0.01176319, "auxiliary_loss_mlp": 0.01025309, "balance_loss_clip": 1.05923963, "balance_loss_mlp": 1.01778054, "epoch": 0.44081043708290746, "flos": 18040767361920.0, "grad_norm": 2.2974758881364146, "language_loss": 0.7506057, "learning_rate": 2.4760128077022683e-06, "loss": 0.77262199, "num_input_tokens_seen": 78978225, "step": 3666, "time_per_iteration": 2.736607551574707 }, { "auxiliary_loss_clip": 0.01164657, "auxiliary_loss_mlp": 0.01026451, "balance_loss_clip": 1.05422664, "balance_loss_mlp": 1.0186131, "epoch": 0.44093067997354657, "flos": 30153507799680.0, "grad_norm": 5.5549417680827835, "language_loss": 0.68423522, "learning_rate": 2.4752561848211672e-06, "loss": 0.70614624, "num_input_tokens_seen": 79000625, "step": 3667, "time_per_iteration": 2.8234901428222656 }, { "auxiliary_loss_clip": 0.01177757, "auxiliary_loss_mlp": 0.01026087, "balance_loss_clip": 1.05672693, "balance_loss_mlp": 1.01849294, "epoch": 0.4410509228641857, "flos": 23255068066560.0, "grad_norm": 2.2261085621025045, "language_loss": 0.71259409, "learning_rate": 2.4744994898440797e-06, "loss": 0.73463255, "num_input_tokens_seen": 79019415, "step": 3668, "time_per_iteration": 2.718508720397949 }, { "auxiliary_loss_clip": 0.01178225, "auxiliary_loss_mlp": 0.01031237, "balance_loss_clip": 1.0527693, "balance_loss_mlp": 1.02315474, "epoch": 0.44117116575482473, "flos": 19500571998720.0, "grad_norm": 2.0267047490813264, "language_loss": 0.83275568, "learning_rate": 2.473742722885797e-06, "loss": 0.85485035, "num_input_tokens_seen": 79038435, "step": 3669, "time_per_iteration": 2.7151196002960205 }, { "auxiliary_loss_clip": 0.01178417, "auxiliary_loss_mlp": 0.01061486, "balance_loss_clip": 1.05361605, "balance_loss_mlp": 1.02306223, "epoch": 0.44129140864546385, "flos": 27053124353280.0, "grad_norm": 2.4407551303546384, "language_loss": 0.65207684, "learning_rate": 2.4729858840611197e-06, "loss": 0.67447591, "num_input_tokens_seen": 79057345, "step": 3670, "time_per_iteration": 2.6985130310058594 }, { "auxiliary_loss_clip": 0.01181187, "auxiliary_loss_mlp": 0.01025879, "balance_loss_clip": 1.05338883, "balance_loss_mlp": 1.01792145, "epoch": 0.4414116515361029, "flos": 26102101910400.0, "grad_norm": 2.3131990345259434, "language_loss": 0.72801352, "learning_rate": 2.4722289734848605e-06, "loss": 0.75008416, "num_input_tokens_seen": 79077810, "step": 3671, "time_per_iteration": 2.722869873046875 }, { "auxiliary_loss_clip": 0.01168841, "auxiliary_loss_mlp": 0.0103142, "balance_loss_clip": 1.05387843, "balance_loss_mlp": 1.02357626, "epoch": 0.441531894426742, "flos": 21906083865600.0, "grad_norm": 2.071684639617397, "language_loss": 0.77890337, "learning_rate": 2.471471991271841e-06, "loss": 0.80090594, "num_input_tokens_seen": 79094935, "step": 3672, "time_per_iteration": 2.7514941692352295 }, { "auxiliary_loss_clip": 0.0117398, "auxiliary_loss_mlp": 0.01027768, "balance_loss_clip": 1.05254233, "balance_loss_mlp": 1.01993585, "epoch": 0.4416521373173811, "flos": 23437099215360.0, "grad_norm": 2.194482533573756, "language_loss": 0.79551792, "learning_rate": 2.470714937536896e-06, "loss": 0.8175354, "num_input_tokens_seen": 79113660, "step": 3673, "time_per_iteration": 2.723891496658325 }, { "auxiliary_loss_clip": 0.01168921, "auxiliary_loss_mlp": 0.01027096, "balance_loss_clip": 1.05420864, "balance_loss_mlp": 1.01817966, "epoch": 0.4417723802080202, "flos": 20334345471360.0, "grad_norm": 2.5965455848935903, "language_loss": 0.70146197, "learning_rate": 2.469957812394868e-06, "loss": 0.72342217, "num_input_tokens_seen": 79132470, "step": 3674, "time_per_iteration": 2.784224033355713 }, { "auxiliary_loss_clip": 0.01185214, "auxiliary_loss_mlp": 0.01031318, "balance_loss_clip": 1.05629134, "balance_loss_mlp": 1.02328324, "epoch": 0.4418926230986593, "flos": 18880682060160.0, "grad_norm": 1.9136953703224604, "language_loss": 0.76196897, "learning_rate": 2.4692006159606148e-06, "loss": 0.78413427, "num_input_tokens_seen": 79150000, "step": 3675, "time_per_iteration": 2.723874092102051 }, { "auxiliary_loss_clip": 0.0118332, "auxiliary_loss_mlp": 0.01029885, "balance_loss_clip": 1.05316317, "balance_loss_mlp": 1.02172542, "epoch": 0.4420128659892984, "flos": 19464409981440.0, "grad_norm": 1.8441913138400894, "language_loss": 0.79004061, "learning_rate": 2.468443348349e-06, "loss": 0.81217271, "num_input_tokens_seen": 79167875, "step": 3676, "time_per_iteration": 2.7552475929260254 }, { "auxiliary_loss_clip": 0.01171691, "auxiliary_loss_mlp": 0.01025476, "balance_loss_clip": 1.05643821, "balance_loss_mlp": 1.01766491, "epoch": 0.44213310887993745, "flos": 17894359526400.0, "grad_norm": 4.393790819699078, "language_loss": 0.82286763, "learning_rate": 2.467686009674902e-06, "loss": 0.84483927, "num_input_tokens_seen": 79182325, "step": 3677, "time_per_iteration": 2.6795413494110107 }, { "auxiliary_loss_clip": 0.01177619, "auxiliary_loss_mlp": 0.01024611, "balance_loss_clip": 1.05359828, "balance_loss_mlp": 1.01646876, "epoch": 0.44225335177057656, "flos": 19204667758080.0, "grad_norm": 2.314960693281025, "language_loss": 0.85150546, "learning_rate": 2.466928600053209e-06, "loss": 0.87352777, "num_input_tokens_seen": 79197630, "step": 3678, "time_per_iteration": 2.8362603187561035 }, { "auxiliary_loss_clip": 0.01175455, "auxiliary_loss_mlp": 0.0103363, "balance_loss_clip": 1.05268872, "balance_loss_mlp": 1.02543426, "epoch": 0.4423735946612157, "flos": 23471321898240.0, "grad_norm": 2.162689037804125, "language_loss": 0.71437597, "learning_rate": 2.466171119598818e-06, "loss": 0.73646688, "num_input_tokens_seen": 79217600, "step": 3679, "time_per_iteration": 2.728849411010742 }, { "auxiliary_loss_clip": 0.01185651, "auxiliary_loss_mlp": 0.01029337, "balance_loss_clip": 1.05147815, "balance_loss_mlp": 1.02102852, "epoch": 0.44249383755185473, "flos": 26685398868480.0, "grad_norm": 1.8241186994015561, "language_loss": 0.76860881, "learning_rate": 2.465413568426639e-06, "loss": 0.79075873, "num_input_tokens_seen": 79238550, "step": 3680, "time_per_iteration": 2.749351739883423 }, { "auxiliary_loss_clip": 0.01173788, "auxiliary_loss_mlp": 0.01031268, "balance_loss_clip": 1.05145979, "balance_loss_mlp": 1.02374589, "epoch": 0.44261408044249384, "flos": 23147659422720.0, "grad_norm": 1.812674051435108, "language_loss": 0.81664526, "learning_rate": 2.464655946651591e-06, "loss": 0.83869576, "num_input_tokens_seen": 79257555, "step": 3681, "time_per_iteration": 2.730137586593628 }, { "auxiliary_loss_clip": 0.01180827, "auxiliary_loss_mlp": 0.01031895, "balance_loss_clip": 1.05289364, "balance_loss_mlp": 1.02390814, "epoch": 0.44273432333313295, "flos": 24462564595200.0, "grad_norm": 7.592324242779041, "language_loss": 0.80921769, "learning_rate": 2.4638982543886065e-06, "loss": 0.8313449, "num_input_tokens_seen": 79277595, "step": 3682, "time_per_iteration": 2.8422353267669678 }, { "auxiliary_loss_clip": 0.01184501, "auxiliary_loss_mlp": 0.01031442, "balance_loss_clip": 1.05531788, "balance_loss_mlp": 1.02345479, "epoch": 0.442854566223772, "flos": 17528932512000.0, "grad_norm": 2.5299494315137014, "language_loss": 0.87600011, "learning_rate": 2.4631404917526254e-06, "loss": 0.89815956, "num_input_tokens_seen": 79294550, "step": 3683, "time_per_iteration": 2.6046390533447266 }, { "auxiliary_loss_clip": 0.0117547, "auxiliary_loss_mlp": 0.01028772, "balance_loss_clip": 1.05384183, "balance_loss_mlp": 1.02070165, "epoch": 0.4429748091144111, "flos": 24896293320960.0, "grad_norm": 2.152575717339572, "language_loss": 0.79088378, "learning_rate": 2.4623826588586e-06, "loss": 0.81292617, "num_input_tokens_seen": 79314820, "step": 3684, "time_per_iteration": 2.745396614074707 }, { "auxiliary_loss_clip": 0.01174376, "auxiliary_loss_mlp": 0.01031139, "balance_loss_clip": 1.05290473, "balance_loss_mlp": 1.0224731, "epoch": 0.4430950520050502, "flos": 21614704738560.0, "grad_norm": 1.6992381461048234, "language_loss": 0.82938755, "learning_rate": 2.461624755821492e-06, "loss": 0.85144269, "num_input_tokens_seen": 79334300, "step": 3685, "time_per_iteration": 3.5738139152526855 }, { "auxiliary_loss_clip": 0.0117447, "auxiliary_loss_mlp": 0.01035103, "balance_loss_clip": 1.05409658, "balance_loss_mlp": 1.02629983, "epoch": 0.4432152948956893, "flos": 24572271709440.0, "grad_norm": 1.815134869513665, "language_loss": 0.76403332, "learning_rate": 2.4608667827562763e-06, "loss": 0.78612906, "num_input_tokens_seen": 79353630, "step": 3686, "time_per_iteration": 2.63588809967041 }, { "auxiliary_loss_clip": 0.01188285, "auxiliary_loss_mlp": 0.01033311, "balance_loss_clip": 1.05631232, "balance_loss_mlp": 1.02491295, "epoch": 0.4433355377863284, "flos": 21762261809280.0, "grad_norm": 2.524779713245117, "language_loss": 0.89862335, "learning_rate": 2.460108739777936e-06, "loss": 0.92083925, "num_input_tokens_seen": 79372765, "step": 3687, "time_per_iteration": 3.5092082023620605 }, { "auxiliary_loss_clip": 0.01177013, "auxiliary_loss_mlp": 0.01028114, "balance_loss_clip": 1.05473197, "balance_loss_mlp": 1.01949501, "epoch": 0.44345578067696745, "flos": 20084479488000.0, "grad_norm": 1.7256487794086393, "language_loss": 0.76522195, "learning_rate": 2.4593506270014656e-06, "loss": 0.78727317, "num_input_tokens_seen": 79391735, "step": 3688, "time_per_iteration": 2.530566692352295 }, { "auxiliary_loss_clip": 0.01181881, "auxiliary_loss_mlp": 0.01026746, "balance_loss_clip": 1.05304027, "balance_loss_mlp": 1.0187825, "epoch": 0.44357602356760656, "flos": 24169497528960.0, "grad_norm": 2.1469972034637648, "language_loss": 0.81781542, "learning_rate": 2.45859244454187e-06, "loss": 0.83990175, "num_input_tokens_seen": 79411525, "step": 3689, "time_per_iteration": 3.4181337356567383 }, { "auxiliary_loss_clip": 0.01177544, "auxiliary_loss_mlp": 0.01026356, "balance_loss_clip": 1.05273008, "balance_loss_mlp": 1.0183388, "epoch": 0.44369626645824567, "flos": 22707717644160.0, "grad_norm": 1.89748284871233, "language_loss": 0.66134465, "learning_rate": 2.4578341925141655e-06, "loss": 0.68338364, "num_input_tokens_seen": 79430740, "step": 3690, "time_per_iteration": 2.6778860092163086 }, { "auxiliary_loss_clip": 0.01187962, "auxiliary_loss_mlp": 0.01027127, "balance_loss_clip": 1.05502093, "balance_loss_mlp": 1.01846623, "epoch": 0.4438165093488847, "flos": 38030225420160.0, "grad_norm": 4.606794501299761, "language_loss": 0.72340989, "learning_rate": 2.457075871033378e-06, "loss": 0.74556077, "num_input_tokens_seen": 79452615, "step": 3691, "time_per_iteration": 2.7405505180358887 }, { "auxiliary_loss_clip": 0.01173305, "auxiliary_loss_mlp": 0.01029224, "balance_loss_clip": 1.05387032, "balance_loss_mlp": 1.02133274, "epoch": 0.44393675223952384, "flos": 15523213996800.0, "grad_norm": 4.347309904034511, "language_loss": 0.88457394, "learning_rate": 2.4563174802145445e-06, "loss": 0.90659928, "num_input_tokens_seen": 79469865, "step": 3692, "time_per_iteration": 3.6492741107940674 }, { "auxiliary_loss_clip": 0.01084948, "auxiliary_loss_mlp": 0.01007134, "balance_loss_clip": 1.02188444, "balance_loss_mlp": 1.0055846, "epoch": 0.44405699513016295, "flos": 64574893779840.0, "grad_norm": 0.7143488842039485, "language_loss": 0.4857789, "learning_rate": 2.455559020172712e-06, "loss": 0.50669968, "num_input_tokens_seen": 79537220, "step": 3693, "time_per_iteration": 3.468982458114624 }, { "auxiliary_loss_clip": 0.01173956, "auxiliary_loss_mlp": 0.01028734, "balance_loss_clip": 1.05798268, "balance_loss_mlp": 1.02077699, "epoch": 0.444177238020802, "flos": 23987394552960.0, "grad_norm": 2.5648049656320993, "language_loss": 0.89896375, "learning_rate": 2.4548004910229385e-06, "loss": 0.92099071, "num_input_tokens_seen": 79554795, "step": 3694, "time_per_iteration": 2.8180313110351562 }, { "auxiliary_loss_clip": 0.01182063, "auxiliary_loss_mlp": 0.01056345, "balance_loss_clip": 1.05302191, "balance_loss_mlp": 1.01900625, "epoch": 0.4442974809114411, "flos": 22563069575040.0, "grad_norm": 2.316353329186112, "language_loss": 0.87109387, "learning_rate": 2.4540418928802913e-06, "loss": 0.89347804, "num_input_tokens_seen": 79573530, "step": 3695, "time_per_iteration": 2.815584659576416 }, { "auxiliary_loss_clip": 0.01179563, "auxiliary_loss_mlp": 0.01029104, "balance_loss_clip": 1.05501628, "balance_loss_mlp": 1.02030683, "epoch": 0.4444177238020802, "flos": 17675699483520.0, "grad_norm": 2.552781323680816, "language_loss": 0.65901589, "learning_rate": 2.4532832258598506e-06, "loss": 0.68110251, "num_input_tokens_seen": 79591360, "step": 3696, "time_per_iteration": 2.7869696617126465 }, { "auxiliary_loss_clip": 0.01180471, "auxiliary_loss_mlp": 0.01027023, "balance_loss_clip": 1.05286241, "balance_loss_mlp": 1.01966214, "epoch": 0.4445379666927193, "flos": 28621594609920.0, "grad_norm": 1.8576298847634383, "language_loss": 0.80843449, "learning_rate": 2.4525244900767047e-06, "loss": 0.83050942, "num_input_tokens_seen": 79612175, "step": 3697, "time_per_iteration": 2.8019165992736816 }, { "auxiliary_loss_clip": 0.01084696, "auxiliary_loss_mlp": 0.0100345, "balance_loss_clip": 1.02511263, "balance_loss_mlp": 1.00193024, "epoch": 0.4446582095833584, "flos": 70487370115200.0, "grad_norm": 0.790277375765083, "language_loss": 0.60539931, "learning_rate": 2.4517656856459536e-06, "loss": 0.62628078, "num_input_tokens_seen": 79678020, "step": 3698, "time_per_iteration": 3.454115629196167 }, { "auxiliary_loss_clip": 0.01177662, "auxiliary_loss_mlp": 0.01027724, "balance_loss_clip": 1.05055737, "balance_loss_mlp": 1.01974869, "epoch": 0.4447784524739975, "flos": 26505199313280.0, "grad_norm": 1.6332634969866437, "language_loss": 0.68234348, "learning_rate": 2.4510068126827073e-06, "loss": 0.70439732, "num_input_tokens_seen": 79699020, "step": 3699, "time_per_iteration": 2.6756739616394043 }, { "auxiliary_loss_clip": 0.01178085, "auxiliary_loss_mlp": 0.0103078, "balance_loss_clip": 1.05467272, "balance_loss_mlp": 1.02266812, "epoch": 0.44489869536463655, "flos": 11656209553920.0, "grad_norm": 2.4138100072806554, "language_loss": 0.8212111, "learning_rate": 2.450247871302086e-06, "loss": 0.84329969, "num_input_tokens_seen": 79716795, "step": 3700, "time_per_iteration": 2.7286629676818848 }, { "auxiliary_loss_clip": 0.01182592, "auxiliary_loss_mlp": 0.01029333, "balance_loss_clip": 1.05223763, "balance_loss_mlp": 1.02104235, "epoch": 0.44501893825527566, "flos": 20448469958400.0, "grad_norm": 2.4105649700186214, "language_loss": 0.83166641, "learning_rate": 2.44948886161922e-06, "loss": 0.85378575, "num_input_tokens_seen": 79735810, "step": 3701, "time_per_iteration": 2.6458513736724854 }, { "auxiliary_loss_clip": 0.01183714, "auxiliary_loss_mlp": 0.01024357, "balance_loss_clip": 1.05449295, "balance_loss_mlp": 1.01622748, "epoch": 0.4451391811459148, "flos": 18261079430400.0, "grad_norm": 1.7920886041719153, "language_loss": 0.84891754, "learning_rate": 2.4487297837492524e-06, "loss": 0.87099826, "num_input_tokens_seen": 79754975, "step": 3702, "time_per_iteration": 2.7648673057556152 }, { "auxiliary_loss_clip": 0.01173274, "auxiliary_loss_mlp": 0.01024334, "balance_loss_clip": 1.05331886, "balance_loss_mlp": 1.01598954, "epoch": 0.44525942403655383, "flos": 16910155895040.0, "grad_norm": 1.950448850449, "language_loss": 0.62216353, "learning_rate": 2.4479706378073323e-06, "loss": 0.64413965, "num_input_tokens_seen": 79773515, "step": 3703, "time_per_iteration": 2.7513444423675537 }, { "auxiliary_loss_clip": 0.01166154, "auxiliary_loss_mlp": 0.01029565, "balance_loss_clip": 1.05326128, "balance_loss_mlp": 1.02121985, "epoch": 0.44537966692719294, "flos": 23258838994560.0, "grad_norm": 1.6310537883338945, "language_loss": 0.83831745, "learning_rate": 2.447211423908623e-06, "loss": 0.86027461, "num_input_tokens_seen": 79793560, "step": 3704, "time_per_iteration": 2.81095290184021 }, { "auxiliary_loss_clip": 0.01180675, "auxiliary_loss_mlp": 0.01028643, "balance_loss_clip": 1.05242085, "balance_loss_mlp": 1.02097535, "epoch": 0.445499909817832, "flos": 21724160457600.0, "grad_norm": 12.478069405453445, "language_loss": 0.74900717, "learning_rate": 2.4464521421682966e-06, "loss": 0.77110034, "num_input_tokens_seen": 79811150, "step": 3705, "time_per_iteration": 2.7457275390625 }, { "auxiliary_loss_clip": 0.01172719, "auxiliary_loss_mlp": 0.01021794, "balance_loss_clip": 1.052809, "balance_loss_mlp": 1.01413488, "epoch": 0.4456201527084711, "flos": 23987969170560.0, "grad_norm": 1.4652439984343575, "language_loss": 0.87816489, "learning_rate": 2.4456927927015345e-06, "loss": 0.90011001, "num_input_tokens_seen": 79832190, "step": 3706, "time_per_iteration": 2.988140344619751 }, { "auxiliary_loss_clip": 0.01182516, "auxiliary_loss_mlp": 0.0103097, "balance_loss_clip": 1.0549053, "balance_loss_mlp": 1.02183294, "epoch": 0.4457403955991102, "flos": 18807065136000.0, "grad_norm": 2.527849030830163, "language_loss": 0.76471949, "learning_rate": 2.4449333756235307e-06, "loss": 0.78685427, "num_input_tokens_seen": 79848905, "step": 3707, "time_per_iteration": 2.912554979324341 }, { "auxiliary_loss_clip": 0.01183503, "auxiliary_loss_mlp": 0.01028, "balance_loss_clip": 1.05362105, "balance_loss_mlp": 1.02020371, "epoch": 0.4458606384897493, "flos": 19207756327680.0, "grad_norm": 8.25655986742649, "language_loss": 0.78933442, "learning_rate": 2.4441738910494876e-06, "loss": 0.81144941, "num_input_tokens_seen": 79863640, "step": 3708, "time_per_iteration": 2.7853305339813232 }, { "auxiliary_loss_clip": 0.01183444, "auxiliary_loss_mlp": 0.01031399, "balance_loss_clip": 1.05323231, "balance_loss_mlp": 1.02324462, "epoch": 0.4459808813803884, "flos": 21361283308800.0, "grad_norm": 2.06555507957946, "language_loss": 0.82743597, "learning_rate": 2.4434143390946176e-06, "loss": 0.8495844, "num_input_tokens_seen": 79882450, "step": 3709, "time_per_iteration": 2.8674697875976562 }, { "auxiliary_loss_clip": 0.01172613, "auxiliary_loss_mlp": 0.01024442, "balance_loss_clip": 1.05338001, "balance_loss_mlp": 1.01698589, "epoch": 0.4461011242710275, "flos": 23288967527040.0, "grad_norm": 2.7624907179166787, "language_loss": 0.85251075, "learning_rate": 2.4426547198741457e-06, "loss": 0.87448132, "num_input_tokens_seen": 79900655, "step": 3710, "time_per_iteration": 2.904374122619629 }, { "auxiliary_loss_clip": 0.01170537, "auxiliary_loss_mlp": 0.01036432, "balance_loss_clip": 1.05623388, "balance_loss_mlp": 1.02845657, "epoch": 0.44622136716166655, "flos": 20193001453440.0, "grad_norm": 2.416870248158959, "language_loss": 0.74585778, "learning_rate": 2.441895033503305e-06, "loss": 0.76792747, "num_input_tokens_seen": 79918575, "step": 3711, "time_per_iteration": 3.8684771060943604 }, { "auxiliary_loss_clip": 0.01179534, "auxiliary_loss_mlp": 0.01032267, "balance_loss_clip": 1.05386245, "balance_loss_mlp": 1.02386856, "epoch": 0.44634161005230566, "flos": 21283033530240.0, "grad_norm": 1.7496978631256674, "language_loss": 0.82209897, "learning_rate": 2.4411352800973375e-06, "loss": 0.84421706, "num_input_tokens_seen": 79937010, "step": 3712, "time_per_iteration": 3.8748619556427 }, { "auxiliary_loss_clip": 0.0116943, "auxiliary_loss_mlp": 0.01027975, "balance_loss_clip": 1.05204749, "balance_loss_mlp": 1.01970756, "epoch": 0.44646185294294477, "flos": 22929358515840.0, "grad_norm": 2.7152497357549925, "language_loss": 0.75623441, "learning_rate": 2.4403754597715005e-06, "loss": 0.77820843, "num_input_tokens_seen": 79956455, "step": 3713, "time_per_iteration": 2.787473678588867 }, { "auxiliary_loss_clip": 0.0118467, "auxiliary_loss_mlp": 0.01030629, "balance_loss_clip": 1.05392647, "balance_loss_mlp": 1.02214122, "epoch": 0.4465820958335838, "flos": 22637692080000.0, "grad_norm": 2.6194393673366445, "language_loss": 0.92473018, "learning_rate": 2.4396155726410553e-06, "loss": 0.9468832, "num_input_tokens_seen": 79975065, "step": 3714, "time_per_iteration": 2.7586660385131836 }, { "auxiliary_loss_clip": 0.01185068, "auxiliary_loss_mlp": 0.01025752, "balance_loss_clip": 1.0535183, "balance_loss_mlp": 1.01768136, "epoch": 0.44670233872422294, "flos": 22672525294080.0, "grad_norm": 2.2739380118679935, "language_loss": 0.90558088, "learning_rate": 2.438855618821278e-06, "loss": 0.92768908, "num_input_tokens_seen": 79990865, "step": 3715, "time_per_iteration": 3.6738080978393555 }, { "auxiliary_loss_clip": 0.01172776, "auxiliary_loss_mlp": 0.01027014, "balance_loss_clip": 1.05050778, "balance_loss_mlp": 1.01908672, "epoch": 0.44682258161486205, "flos": 23582178247680.0, "grad_norm": 1.6791925038147275, "language_loss": 0.6766938, "learning_rate": 2.4380955984274517e-06, "loss": 0.69869167, "num_input_tokens_seen": 80009520, "step": 3716, "time_per_iteration": 2.811467170715332 }, { "auxiliary_loss_clip": 0.01182392, "auxiliary_loss_mlp": 0.01032957, "balance_loss_clip": 1.05416179, "balance_loss_mlp": 1.02461219, "epoch": 0.4469428245055011, "flos": 26501356558080.0, "grad_norm": 1.9190460871559134, "language_loss": 0.7731846, "learning_rate": 2.4373355115748716e-06, "loss": 0.79533815, "num_input_tokens_seen": 80030350, "step": 3717, "time_per_iteration": 2.82489013671875 }, { "auxiliary_loss_clip": 0.01169829, "auxiliary_loss_mlp": 0.01027645, "balance_loss_clip": 1.05373108, "balance_loss_mlp": 1.01966429, "epoch": 0.4470630673961402, "flos": 21504925797120.0, "grad_norm": 5.061787967157699, "language_loss": 0.72016823, "learning_rate": 2.436575358378842e-06, "loss": 0.74214303, "num_input_tokens_seen": 80049840, "step": 3718, "time_per_iteration": 3.708913803100586 }, { "auxiliary_loss_clip": 0.01182051, "auxiliary_loss_mlp": 0.01027722, "balance_loss_clip": 1.05418015, "balance_loss_mlp": 1.01963997, "epoch": 0.44718331028677927, "flos": 16173986653440.0, "grad_norm": 2.7498087919972813, "language_loss": 0.83477199, "learning_rate": 2.4358151389546782e-06, "loss": 0.85686976, "num_input_tokens_seen": 80066525, "step": 3719, "time_per_iteration": 2.7875475883483887 }, { "auxiliary_loss_clip": 0.01182349, "auxiliary_loss_mlp": 0.0102594, "balance_loss_clip": 1.05282354, "balance_loss_mlp": 1.01825702, "epoch": 0.4473035531774184, "flos": 19681238430720.0, "grad_norm": 2.467776787423783, "language_loss": 0.75651962, "learning_rate": 2.4350548534177035e-06, "loss": 0.77860248, "num_input_tokens_seen": 80083355, "step": 3720, "time_per_iteration": 2.7804622650146484 }, { "auxiliary_loss_clip": 0.01173153, "auxiliary_loss_mlp": 0.01030342, "balance_loss_clip": 1.05372465, "balance_loss_mlp": 1.02229548, "epoch": 0.4474237960680575, "flos": 41427590515200.0, "grad_norm": 1.9545338371454157, "language_loss": 0.66450787, "learning_rate": 2.434294501883254e-06, "loss": 0.68654281, "num_input_tokens_seen": 80106450, "step": 3721, "time_per_iteration": 2.9261958599090576 }, { "auxiliary_loss_clip": 0.01173117, "auxiliary_loss_mlp": 0.01035225, "balance_loss_clip": 1.05577302, "balance_loss_mlp": 1.02735114, "epoch": 0.44754403895869654, "flos": 22891328991360.0, "grad_norm": 1.6608811121032079, "language_loss": 0.65801483, "learning_rate": 2.433534084466674e-06, "loss": 0.68009824, "num_input_tokens_seen": 80125670, "step": 3722, "time_per_iteration": 2.992985486984253 }, { "auxiliary_loss_clip": 0.01178148, "auxiliary_loss_mlp": 0.01029902, "balance_loss_clip": 1.05131638, "balance_loss_mlp": 1.02190924, "epoch": 0.44766428184933565, "flos": 25630271832960.0, "grad_norm": 1.9876866960574775, "language_loss": 0.70725334, "learning_rate": 2.4327736012833178e-06, "loss": 0.72933388, "num_input_tokens_seen": 80147390, "step": 3723, "time_per_iteration": 2.7841317653656006 }, { "auxiliary_loss_clip": 0.01175728, "auxiliary_loss_mlp": 0.01024625, "balance_loss_clip": 1.05139124, "balance_loss_mlp": 1.0160836, "epoch": 0.44778452473997477, "flos": 20448972748800.0, "grad_norm": 2.386888520810542, "language_loss": 0.76642239, "learning_rate": 2.4320130524485506e-06, "loss": 0.78842592, "num_input_tokens_seen": 80166185, "step": 3724, "time_per_iteration": 2.789079189300537 }, { "auxiliary_loss_clip": 0.01169794, "auxiliary_loss_mlp": 0.01029204, "balance_loss_clip": 1.05441999, "balance_loss_mlp": 1.02145553, "epoch": 0.4479047676306138, "flos": 21975462984960.0, "grad_norm": 1.6525633890851297, "language_loss": 0.79756105, "learning_rate": 2.431252438077746e-06, "loss": 0.81955099, "num_input_tokens_seen": 80185685, "step": 3725, "time_per_iteration": 2.782125473022461 }, { "auxiliary_loss_clip": 0.0118086, "auxiliary_loss_mlp": 0.01053278, "balance_loss_clip": 1.05190921, "balance_loss_mlp": 1.016119, "epoch": 0.44802501052125293, "flos": 21467219495040.0, "grad_norm": 2.8043180005315924, "language_loss": 0.77051854, "learning_rate": 2.4304917582862906e-06, "loss": 0.79285991, "num_input_tokens_seen": 80204865, "step": 3726, "time_per_iteration": 2.8464436531066895 }, { "auxiliary_loss_clip": 0.01181974, "auxiliary_loss_mlp": 0.01025966, "balance_loss_clip": 1.05410671, "balance_loss_mlp": 1.01860499, "epoch": 0.44814525341189204, "flos": 22126970551680.0, "grad_norm": 2.4178266016593875, "language_loss": 0.87858391, "learning_rate": 2.4297310131895774e-06, "loss": 0.90066332, "num_input_tokens_seen": 80223410, "step": 3727, "time_per_iteration": 2.896144151687622 }, { "auxiliary_loss_clip": 0.01179128, "auxiliary_loss_mlp": 0.01028434, "balance_loss_clip": 1.05327845, "balance_loss_mlp": 1.02072084, "epoch": 0.4482654963025311, "flos": 16653933204480.0, "grad_norm": 4.1757912403043775, "language_loss": 0.74638414, "learning_rate": 2.4289702029030113e-06, "loss": 0.7684598, "num_input_tokens_seen": 80240880, "step": 3728, "time_per_iteration": 2.7531282901763916 }, { "auxiliary_loss_clip": 0.01180635, "auxiliary_loss_mlp": 0.01030639, "balance_loss_clip": 1.05539513, "balance_loss_mlp": 1.0224402, "epoch": 0.4483857391931702, "flos": 18841251905280.0, "grad_norm": 2.445797109949506, "language_loss": 0.8325156, "learning_rate": 2.4282093275420057e-06, "loss": 0.85462838, "num_input_tokens_seen": 80259910, "step": 3729, "time_per_iteration": 2.775247097015381 }, { "auxiliary_loss_clip": 0.01181473, "auxiliary_loss_mlp": 0.01027, "balance_loss_clip": 1.05338645, "balance_loss_mlp": 1.01833904, "epoch": 0.4485059820838093, "flos": 20372590477440.0, "grad_norm": 1.9945687241699894, "language_loss": 0.70646143, "learning_rate": 2.4274483872219863e-06, "loss": 0.72854614, "num_input_tokens_seen": 80277270, "step": 3730, "time_per_iteration": 2.7036736011505127 }, { "auxiliary_loss_clip": 0.01178991, "auxiliary_loss_mlp": 0.01026637, "balance_loss_clip": 1.05394876, "balance_loss_mlp": 1.0187546, "epoch": 0.4486262249744484, "flos": 20047742853120.0, "grad_norm": 2.647500371281863, "language_loss": 0.93679714, "learning_rate": 2.426687382058386e-06, "loss": 0.95885342, "num_input_tokens_seen": 80295550, "step": 3731, "time_per_iteration": 2.762796401977539 }, { "auxiliary_loss_clip": 0.01087836, "auxiliary_loss_mlp": 0.01004057, "balance_loss_clip": 1.02758884, "balance_loss_mlp": 1.00254941, "epoch": 0.4487464678650875, "flos": 64595684776320.0, "grad_norm": 0.8728014808712112, "language_loss": 0.59807479, "learning_rate": 2.425926312166649e-06, "loss": 0.61899376, "num_input_tokens_seen": 80348425, "step": 3732, "time_per_iteration": 3.187138319015503 }, { "auxiliary_loss_clip": 0.01179436, "auxiliary_loss_mlp": 0.01024664, "balance_loss_clip": 1.0533042, "balance_loss_mlp": 1.01625967, "epoch": 0.4488667107557266, "flos": 20769798049920.0, "grad_norm": 2.5237015046241984, "language_loss": 0.73808789, "learning_rate": 2.42516517766223e-06, "loss": 0.76012886, "num_input_tokens_seen": 80366505, "step": 3733, "time_per_iteration": 2.749103307723999 }, { "auxiliary_loss_clip": 0.01183985, "auxiliary_loss_mlp": 0.01029139, "balance_loss_clip": 1.05541635, "balance_loss_mlp": 1.02149141, "epoch": 0.44898695364636565, "flos": 23951735326080.0, "grad_norm": 2.794253710463276, "language_loss": 0.68277991, "learning_rate": 2.4244039786605907e-06, "loss": 0.70491111, "num_input_tokens_seen": 80387510, "step": 3734, "time_per_iteration": 2.768922805786133 }, { "auxiliary_loss_clip": 0.01171584, "auxiliary_loss_mlp": 0.01026487, "balance_loss_clip": 1.05214322, "balance_loss_mlp": 1.01839554, "epoch": 0.44910719653700476, "flos": 18624351628800.0, "grad_norm": 2.680593831246311, "language_loss": 0.82776576, "learning_rate": 2.4236427152772055e-06, "loss": 0.84974647, "num_input_tokens_seen": 80405915, "step": 3735, "time_per_iteration": 2.776413917541504 }, { "auxiliary_loss_clip": 0.01081729, "auxiliary_loss_mlp": 0.01001984, "balance_loss_clip": 1.03012919, "balance_loss_mlp": 1.00052381, "epoch": 0.4492274394276438, "flos": 57033435749760.0, "grad_norm": 0.8803288889150948, "language_loss": 0.57357967, "learning_rate": 2.422881387627557e-06, "loss": 0.59441674, "num_input_tokens_seen": 80458365, "step": 3736, "time_per_iteration": 3.100391387939453 }, { "auxiliary_loss_clip": 0.01177937, "auxiliary_loss_mlp": 0.01028417, "balance_loss_clip": 1.05438399, "balance_loss_mlp": 1.02013791, "epoch": 0.4493476823182829, "flos": 23254888498560.0, "grad_norm": 1.7342191378216434, "language_loss": 0.77519703, "learning_rate": 2.422119995827139e-06, "loss": 0.79726058, "num_input_tokens_seen": 80478490, "step": 3737, "time_per_iteration": 2.782386302947998 }, { "auxiliary_loss_clip": 0.01182856, "auxiliary_loss_mlp": 0.01036406, "balance_loss_clip": 1.05473912, "balance_loss_mlp": 1.02795362, "epoch": 0.44946792520892204, "flos": 15815131827840.0, "grad_norm": 3.0777044260579065, "language_loss": 0.74134165, "learning_rate": 2.4213585399914528e-06, "loss": 0.76353431, "num_input_tokens_seen": 80495695, "step": 3738, "time_per_iteration": 4.661665439605713 }, { "auxiliary_loss_clip": 0.01178413, "auxiliary_loss_mlp": 0.01028624, "balance_loss_clip": 1.05408728, "balance_loss_mlp": 1.02055311, "epoch": 0.4495881680995611, "flos": 19610063631360.0, "grad_norm": 2.5067585957511778, "language_loss": 0.85488379, "learning_rate": 2.4205970202360113e-06, "loss": 0.8769542, "num_input_tokens_seen": 80515260, "step": 3739, "time_per_iteration": 2.9141414165496826 }, { "auxiliary_loss_clip": 0.01162424, "auxiliary_loss_mlp": 0.01025975, "balance_loss_clip": 1.05368841, "balance_loss_mlp": 1.01754689, "epoch": 0.4497084109902002, "flos": 26031465815040.0, "grad_norm": 7.2694511159645625, "language_loss": 0.78224492, "learning_rate": 2.4198354366763354e-06, "loss": 0.80412889, "num_input_tokens_seen": 80533900, "step": 3740, "time_per_iteration": 2.8554253578186035 }, { "auxiliary_loss_clip": 0.0117926, "auxiliary_loss_mlp": 0.01023182, "balance_loss_clip": 1.05440068, "balance_loss_mlp": 1.01540351, "epoch": 0.4498286538808393, "flos": 14793688771200.0, "grad_norm": 1.8288098174827105, "language_loss": 0.78364623, "learning_rate": 2.4190737894279587e-06, "loss": 0.80567062, "num_input_tokens_seen": 80551270, "step": 3741, "time_per_iteration": 3.5601606369018555 }, { "auxiliary_loss_clip": 0.01168807, "auxiliary_loss_mlp": 0.01027112, "balance_loss_clip": 1.05457616, "balance_loss_mlp": 1.01917219, "epoch": 0.44994889677147837, "flos": 15450171690240.0, "grad_norm": 3.6103448052294884, "language_loss": 0.80101371, "learning_rate": 2.4183120786064203e-06, "loss": 0.82297289, "num_input_tokens_seen": 80568145, "step": 3742, "time_per_iteration": 2.881425619125366 }, { "auxiliary_loss_clip": 0.01182131, "auxiliary_loss_mlp": 0.01063961, "balance_loss_clip": 1.0548439, "balance_loss_mlp": 1.02655685, "epoch": 0.4500691396621175, "flos": 21798316085760.0, "grad_norm": 4.968277719792922, "language_loss": 0.85929465, "learning_rate": 2.417550304327273e-06, "loss": 0.88175559, "num_input_tokens_seen": 80586185, "step": 3743, "time_per_iteration": 3.823413610458374 }, { "auxiliary_loss_clip": 0.01187246, "auxiliary_loss_mlp": 0.01028733, "balance_loss_clip": 1.05545318, "balance_loss_mlp": 1.02029896, "epoch": 0.4501893825527566, "flos": 32382016421760.0, "grad_norm": 1.7653202761006022, "language_loss": 0.75755525, "learning_rate": 2.4167884667060763e-06, "loss": 0.77971506, "num_input_tokens_seen": 80608895, "step": 3744, "time_per_iteration": 2.8640120029449463 }, { "auxiliary_loss_clip": 0.01178557, "auxiliary_loss_mlp": 0.01033657, "balance_loss_clip": 1.05430627, "balance_loss_mlp": 1.02472186, "epoch": 0.45030962544339564, "flos": 16544944362240.0, "grad_norm": 2.4907630382845114, "language_loss": 0.87602007, "learning_rate": 2.4160265658584e-06, "loss": 0.89814222, "num_input_tokens_seen": 80623785, "step": 3745, "time_per_iteration": 2.781771659851074 }, { "auxiliary_loss_clip": 0.01184874, "auxiliary_loss_mlp": 0.01033528, "balance_loss_clip": 1.05413365, "balance_loss_mlp": 1.02484941, "epoch": 0.45042986833403476, "flos": 19573039687680.0, "grad_norm": 1.9156186300278768, "language_loss": 0.68490291, "learning_rate": 2.4152646018998253e-06, "loss": 0.70708692, "num_input_tokens_seen": 80642735, "step": 3746, "time_per_iteration": 2.757380962371826 }, { "auxiliary_loss_clip": 0.01178707, "auxiliary_loss_mlp": 0.01020912, "balance_loss_clip": 1.05414426, "balance_loss_mlp": 1.01290667, "epoch": 0.45055011122467387, "flos": 23112467072640.0, "grad_norm": 1.8798144174980413, "language_loss": 0.7200284, "learning_rate": 2.4145025749459403e-06, "loss": 0.7420246, "num_input_tokens_seen": 80663760, "step": 3747, "time_per_iteration": 2.772425651550293 }, { "auxiliary_loss_clip": 0.011627, "auxiliary_loss_mlp": 0.01025858, "balance_loss_clip": 1.05626166, "balance_loss_mlp": 1.01731038, "epoch": 0.4506703541153129, "flos": 19934623946880.0, "grad_norm": 2.033816951797605, "language_loss": 0.69588709, "learning_rate": 2.413740485112344e-06, "loss": 0.7177726, "num_input_tokens_seen": 80682100, "step": 3748, "time_per_iteration": 2.754953622817993 }, { "auxiliary_loss_clip": 0.01172242, "auxiliary_loss_mlp": 0.01028618, "balance_loss_clip": 1.05462539, "balance_loss_mlp": 1.02119112, "epoch": 0.45079059700595203, "flos": 19499530504320.0, "grad_norm": 3.252843883894269, "language_loss": 0.82265007, "learning_rate": 2.412978332514646e-06, "loss": 0.84465873, "num_input_tokens_seen": 80700880, "step": 3749, "time_per_iteration": 2.70831298828125 }, { "auxiliary_loss_clip": 0.01184216, "auxiliary_loss_mlp": 0.01030577, "balance_loss_clip": 1.05863035, "balance_loss_mlp": 1.02200627, "epoch": 0.4509108398965911, "flos": 27636313570560.0, "grad_norm": 3.0804256978146647, "language_loss": 0.7227667, "learning_rate": 2.4122161172684623e-06, "loss": 0.74491459, "num_input_tokens_seen": 80721675, "step": 3750, "time_per_iteration": 2.7368319034576416 }, { "auxiliary_loss_clip": 0.01177944, "auxiliary_loss_mlp": 0.01021666, "balance_loss_clip": 1.05345249, "balance_loss_mlp": 1.01274323, "epoch": 0.4510310827872302, "flos": 20995712640000.0, "grad_norm": 14.894202696608076, "language_loss": 0.84864944, "learning_rate": 2.4114538394894216e-06, "loss": 0.87064552, "num_input_tokens_seen": 80739315, "step": 3751, "time_per_iteration": 2.785489082336426 }, { "auxiliary_loss_clip": 0.01178195, "auxiliary_loss_mlp": 0.0102595, "balance_loss_clip": 1.05274463, "balance_loss_mlp": 1.01818991, "epoch": 0.4511513256778693, "flos": 16216684945920.0, "grad_norm": 1.8106805720497334, "language_loss": 0.83343649, "learning_rate": 2.410691499293161e-06, "loss": 0.85547793, "num_input_tokens_seen": 80757470, "step": 3752, "time_per_iteration": 2.6996710300445557 }, { "auxiliary_loss_clip": 0.01179085, "auxiliary_loss_mlp": 0.01022872, "balance_loss_clip": 1.05301213, "balance_loss_mlp": 1.01486731, "epoch": 0.45127156856850836, "flos": 25186702780800.0, "grad_norm": 1.7761520871958667, "language_loss": 0.74012762, "learning_rate": 2.409929096795326e-06, "loss": 0.76214719, "num_input_tokens_seen": 80777840, "step": 3753, "time_per_iteration": 2.7423200607299805 }, { "auxiliary_loss_clip": 0.0118493, "auxiliary_loss_mlp": 0.0102507, "balance_loss_clip": 1.05552197, "balance_loss_mlp": 1.01674891, "epoch": 0.4513918114591475, "flos": 20412523422720.0, "grad_norm": 9.241241822613226, "language_loss": 0.79203832, "learning_rate": 2.409166632111573e-06, "loss": 0.81413829, "num_input_tokens_seen": 80795975, "step": 3754, "time_per_iteration": 2.671271800994873 }, { "auxiliary_loss_clip": 0.01186157, "auxiliary_loss_mlp": 0.01025777, "balance_loss_clip": 1.05264103, "balance_loss_mlp": 1.01677632, "epoch": 0.4515120543497866, "flos": 26648482665600.0, "grad_norm": 2.130714089535036, "language_loss": 0.80504835, "learning_rate": 2.4084041053575674e-06, "loss": 0.82716775, "num_input_tokens_seen": 80815395, "step": 3755, "time_per_iteration": 2.757657527923584 }, { "auxiliary_loss_clip": 0.01183944, "auxiliary_loss_mlp": 0.0102672, "balance_loss_clip": 1.05626678, "balance_loss_mlp": 1.01863742, "epoch": 0.45163229724042564, "flos": 20595093275520.0, "grad_norm": 3.164813808240444, "language_loss": 0.72635669, "learning_rate": 2.4076415166489834e-06, "loss": 0.74846333, "num_input_tokens_seen": 80834805, "step": 3756, "time_per_iteration": 2.7184135913848877 }, { "auxiliary_loss_clip": 0.0117475, "auxiliary_loss_mlp": 0.01023727, "balance_loss_clip": 1.05179691, "balance_loss_mlp": 1.01579356, "epoch": 0.45175254013106475, "flos": 21689004021120.0, "grad_norm": 1.8672715308640269, "language_loss": 0.79140854, "learning_rate": 2.406878866101506e-06, "loss": 0.81339329, "num_input_tokens_seen": 80853770, "step": 3757, "time_per_iteration": 2.681213617324829 }, { "auxiliary_loss_clip": 0.01185217, "auxiliary_loss_mlp": 0.01027006, "balance_loss_clip": 1.05501342, "balance_loss_mlp": 1.01953113, "epoch": 0.45187278302170386, "flos": 18878850466560.0, "grad_norm": 3.7270995485887646, "language_loss": 0.78333664, "learning_rate": 2.4061161538308273e-06, "loss": 0.8054589, "num_input_tokens_seen": 80870615, "step": 3758, "time_per_iteration": 2.686241865158081 }, { "auxiliary_loss_clip": 0.01180365, "auxiliary_loss_mlp": 0.01024556, "balance_loss_clip": 1.05519462, "balance_loss_mlp": 1.01636016, "epoch": 0.4519930259123429, "flos": 18582479349120.0, "grad_norm": 1.9154756906795245, "language_loss": 0.89216697, "learning_rate": 2.4053533799526523e-06, "loss": 0.91421616, "num_input_tokens_seen": 80886335, "step": 3759, "time_per_iteration": 2.6603949069976807 }, { "auxiliary_loss_clip": 0.01170528, "auxiliary_loss_mlp": 0.01025041, "balance_loss_clip": 1.05222988, "balance_loss_mlp": 1.01723897, "epoch": 0.452113268802982, "flos": 25192377129600.0, "grad_norm": 1.8033298956244466, "language_loss": 0.86206949, "learning_rate": 2.404590544582691e-06, "loss": 0.88402522, "num_input_tokens_seen": 80904570, "step": 3760, "time_per_iteration": 2.7138519287109375 }, { "auxiliary_loss_clip": 0.0118018, "auxiliary_loss_mlp": 0.01028335, "balance_loss_clip": 1.056391, "balance_loss_mlp": 1.01930523, "epoch": 0.45223351169362114, "flos": 39378922312320.0, "grad_norm": 1.6258770871039345, "language_loss": 0.81193608, "learning_rate": 2.403827647836666e-06, "loss": 0.83402121, "num_input_tokens_seen": 80925125, "step": 3761, "time_per_iteration": 2.848201036453247 }, { "auxiliary_loss_clip": 0.01184746, "auxiliary_loss_mlp": 0.01025491, "balance_loss_clip": 1.05308604, "balance_loss_mlp": 1.01768827, "epoch": 0.4523537545842602, "flos": 21582169994880.0, "grad_norm": 2.0996773176541215, "language_loss": 0.69673491, "learning_rate": 2.4030646898303075e-06, "loss": 0.71883732, "num_input_tokens_seen": 80946615, "step": 3762, "time_per_iteration": 2.7513110637664795 }, { "auxiliary_loss_clip": 0.01180533, "auxiliary_loss_mlp": 0.01029946, "balance_loss_clip": 1.05254221, "balance_loss_mlp": 1.02188182, "epoch": 0.4524739974748993, "flos": 28439527547520.0, "grad_norm": 2.390110667603125, "language_loss": 0.81967211, "learning_rate": 2.4023016706793566e-06, "loss": 0.84177691, "num_input_tokens_seen": 80966410, "step": 3763, "time_per_iteration": 3.668736219406128 }, { "auxiliary_loss_clip": 0.01090766, "auxiliary_loss_mlp": 0.01003371, "balance_loss_clip": 1.02326941, "balance_loss_mlp": 1.00197589, "epoch": 0.4525942403655384, "flos": 61556492148480.0, "grad_norm": 0.7774578292132676, "language_loss": 0.56847101, "learning_rate": 2.401538590499561e-06, "loss": 0.58941239, "num_input_tokens_seen": 81026865, "step": 3764, "time_per_iteration": 3.276575803756714 }, { "auxiliary_loss_clip": 0.01184086, "auxiliary_loss_mlp": 0.01054386, "balance_loss_clip": 1.05396008, "balance_loss_mlp": 1.01432419, "epoch": 0.45271448325617747, "flos": 27529838680320.0, "grad_norm": 2.125331659897881, "language_loss": 0.71819711, "learning_rate": 2.400775449406682e-06, "loss": 0.74058187, "num_input_tokens_seen": 81050060, "step": 3765, "time_per_iteration": 3.7315409183502197 }, { "auxiliary_loss_clip": 0.01181173, "auxiliary_loss_mlp": 0.01029604, "balance_loss_clip": 1.05302823, "balance_loss_mlp": 1.02192068, "epoch": 0.4528347261468166, "flos": 22452608275200.0, "grad_norm": 2.2606014235083762, "language_loss": 0.72682405, "learning_rate": 2.400012247516485e-06, "loss": 0.74893183, "num_input_tokens_seen": 81070625, "step": 3766, "time_per_iteration": 2.681331157684326 }, { "auxiliary_loss_clip": 0.01178607, "auxiliary_loss_mlp": 0.01030663, "balance_loss_clip": 1.05498075, "balance_loss_mlp": 1.02264011, "epoch": 0.45295496903745563, "flos": 21103875469440.0, "grad_norm": 1.8332230195245731, "language_loss": 0.90484542, "learning_rate": 2.3992489849447484e-06, "loss": 0.92693818, "num_input_tokens_seen": 81089080, "step": 3767, "time_per_iteration": 3.646304130554199 }, { "auxiliary_loss_clip": 0.01176985, "auxiliary_loss_mlp": 0.01032212, "balance_loss_clip": 1.05151927, "balance_loss_mlp": 1.0245409, "epoch": 0.45307521192809475, "flos": 23221168606080.0, "grad_norm": 1.6552234047354792, "language_loss": 0.79213452, "learning_rate": 2.3984856618072584e-06, "loss": 0.81422651, "num_input_tokens_seen": 81109115, "step": 3768, "time_per_iteration": 2.774970293045044 }, { "auxiliary_loss_clip": 0.01180961, "auxiliary_loss_mlp": 0.01028713, "balance_loss_clip": 1.05630088, "balance_loss_mlp": 1.02117324, "epoch": 0.45319545481873386, "flos": 15560094286080.0, "grad_norm": 2.3324665579635178, "language_loss": 0.7410028, "learning_rate": 2.3977222782198098e-06, "loss": 0.76309955, "num_input_tokens_seen": 81127750, "step": 3769, "time_per_iteration": 3.6550440788269043 }, { "auxiliary_loss_clip": 0.01166863, "auxiliary_loss_mlp": 0.01029113, "balance_loss_clip": 1.05321193, "balance_loss_mlp": 1.02093565, "epoch": 0.4533156977093729, "flos": 21944759834880.0, "grad_norm": 1.9792535431561284, "language_loss": 0.7556988, "learning_rate": 2.3969588342982077e-06, "loss": 0.77765858, "num_input_tokens_seen": 81147125, "step": 3770, "time_per_iteration": 2.7156810760498047 }, { "auxiliary_loss_clip": 0.01181046, "auxiliary_loss_mlp": 0.01023326, "balance_loss_clip": 1.05556524, "balance_loss_mlp": 1.01546371, "epoch": 0.453435940600012, "flos": 24242180699520.0, "grad_norm": 1.5940639334545752, "language_loss": 0.72707009, "learning_rate": 2.396195330158267e-06, "loss": 0.74911386, "num_input_tokens_seen": 81167015, "step": 3771, "time_per_iteration": 2.769318103790283 }, { "auxiliary_loss_clip": 0.01181393, "auxiliary_loss_mlp": 0.01030247, "balance_loss_clip": 1.05093408, "balance_loss_mlp": 1.02161682, "epoch": 0.45355618349065113, "flos": 23440367352960.0, "grad_norm": 2.0211375115355157, "language_loss": 0.7951659, "learning_rate": 2.3954317659158094e-06, "loss": 0.81728232, "num_input_tokens_seen": 81187350, "step": 3772, "time_per_iteration": 2.6755518913269043 }, { "auxiliary_loss_clip": 0.01087791, "auxiliary_loss_mlp": 0.01001514, "balance_loss_clip": 1.02367449, "balance_loss_mlp": 1.00013769, "epoch": 0.4536764263812902, "flos": 66903161448960.0, "grad_norm": 0.8855762888860588, "language_loss": 0.57006508, "learning_rate": 2.394668141686667e-06, "loss": 0.59095812, "num_input_tokens_seen": 81249315, "step": 3773, "time_per_iteration": 3.286201000213623 }, { "auxiliary_loss_clip": 0.01177549, "auxiliary_loss_mlp": 0.01023926, "balance_loss_clip": 1.05344677, "balance_loss_mlp": 1.01646399, "epoch": 0.4537966692719293, "flos": 42739766254080.0, "grad_norm": 2.2314099108413576, "language_loss": 0.69846356, "learning_rate": 2.3939044575866813e-06, "loss": 0.7204783, "num_input_tokens_seen": 81272065, "step": 3774, "time_per_iteration": 2.8184568881988525 }, { "auxiliary_loss_clip": 0.01174507, "auxiliary_loss_mlp": 0.01060515, "balance_loss_clip": 1.05363798, "balance_loss_mlp": 1.02168596, "epoch": 0.4539169121625684, "flos": 35549480517120.0, "grad_norm": 3.5878925888784226, "language_loss": 0.75503057, "learning_rate": 2.3931407137317024e-06, "loss": 0.77738076, "num_input_tokens_seen": 81292220, "step": 3775, "time_per_iteration": 2.830143451690674 }, { "auxiliary_loss_clip": 0.01172836, "auxiliary_loss_mlp": 0.01030824, "balance_loss_clip": 1.05148017, "balance_loss_mlp": 1.02312934, "epoch": 0.45403715505320746, "flos": 18514716341760.0, "grad_norm": 2.2944841149521698, "language_loss": 0.84929705, "learning_rate": 2.3923769102375907e-06, "loss": 0.87133366, "num_input_tokens_seen": 81311085, "step": 3776, "time_per_iteration": 2.6758360862731934 }, { "auxiliary_loss_clip": 0.01173244, "auxiliary_loss_mlp": 0.01025281, "balance_loss_clip": 1.05148602, "balance_loss_mlp": 1.01724029, "epoch": 0.4541573979438466, "flos": 25045825639680.0, "grad_norm": 3.642758318576156, "language_loss": 0.78379929, "learning_rate": 2.391613047220213e-06, "loss": 0.80578458, "num_input_tokens_seen": 81330985, "step": 3777, "time_per_iteration": 2.84879732131958 }, { "auxiliary_loss_clip": 0.01174513, "auxiliary_loss_mlp": 0.01025412, "balance_loss_clip": 1.05113029, "balance_loss_mlp": 1.01725864, "epoch": 0.4542776408344857, "flos": 18332397884160.0, "grad_norm": 2.2122077705457186, "language_loss": 0.79062057, "learning_rate": 2.390849124795447e-06, "loss": 0.81261992, "num_input_tokens_seen": 81346985, "step": 3778, "time_per_iteration": 2.662686347961426 }, { "auxiliary_loss_clip": 0.01183666, "auxiliary_loss_mlp": 0.01025752, "balance_loss_clip": 1.05268323, "balance_loss_mlp": 1.01735353, "epoch": 0.45439788372512474, "flos": 20701173116160.0, "grad_norm": 3.7429085389790213, "language_loss": 0.84321988, "learning_rate": 2.3900851430791804e-06, "loss": 0.86531413, "num_input_tokens_seen": 81365005, "step": 3779, "time_per_iteration": 2.6093406677246094 }, { "auxiliary_loss_clip": 0.01186746, "auxiliary_loss_mlp": 0.01027741, "balance_loss_clip": 1.0537405, "balance_loss_mlp": 1.01946187, "epoch": 0.45451812661576385, "flos": 22309432663680.0, "grad_norm": 2.622690131851081, "language_loss": 0.84625852, "learning_rate": 2.389321102187307e-06, "loss": 0.86840343, "num_input_tokens_seen": 81383785, "step": 3780, "time_per_iteration": 2.666262626647949 }, { "auxiliary_loss_clip": 0.01172929, "auxiliary_loss_mlp": 0.01060492, "balance_loss_clip": 1.05058098, "balance_loss_mlp": 1.02274525, "epoch": 0.4546383695064029, "flos": 21763303303680.0, "grad_norm": 1.6893644350661972, "language_loss": 0.81812322, "learning_rate": 2.3885570022357326e-06, "loss": 0.84045744, "num_input_tokens_seen": 81402915, "step": 3781, "time_per_iteration": 2.6917319297790527 }, { "auxiliary_loss_clip": 0.0108883, "auxiliary_loss_mlp": 0.0100529, "balance_loss_clip": 1.0229578, "balance_loss_mlp": 1.00374079, "epoch": 0.454758612397042, "flos": 64242755694720.0, "grad_norm": 0.8024174424159639, "language_loss": 0.6082927, "learning_rate": 2.38779284334037e-06, "loss": 0.62923384, "num_input_tokens_seen": 81467890, "step": 3782, "time_per_iteration": 3.331373929977417 }, { "auxiliary_loss_clip": 0.01159283, "auxiliary_loss_mlp": 0.01026623, "balance_loss_clip": 1.05199575, "balance_loss_mlp": 1.01848686, "epoch": 0.4548788552876811, "flos": 27304175485440.0, "grad_norm": 2.1699623672175425, "language_loss": 0.78931111, "learning_rate": 2.387028625617141e-06, "loss": 0.81117022, "num_input_tokens_seen": 81487105, "step": 3783, "time_per_iteration": 2.7817327976226807 }, { "auxiliary_loss_clip": 0.01163983, "auxiliary_loss_mlp": 0.01026127, "balance_loss_clip": 1.05109179, "balance_loss_mlp": 1.01787746, "epoch": 0.4549990981783202, "flos": 22857142222080.0, "grad_norm": 2.1802321746647846, "language_loss": 0.84538627, "learning_rate": 2.3862643491819766e-06, "loss": 0.8672874, "num_input_tokens_seen": 81505670, "step": 3784, "time_per_iteration": 2.6998960971832275 }, { "auxiliary_loss_clip": 0.01176628, "auxiliary_loss_mlp": 0.01024632, "balance_loss_clip": 1.05065155, "balance_loss_mlp": 1.01648974, "epoch": 0.4551193410689593, "flos": 23258587599360.0, "grad_norm": 1.7762675427641925, "language_loss": 0.84288436, "learning_rate": 2.3855000141508186e-06, "loss": 0.86489695, "num_input_tokens_seen": 81525825, "step": 3785, "time_per_iteration": 2.6176562309265137 }, { "auxiliary_loss_clip": 0.01179828, "auxiliary_loss_mlp": 0.01025976, "balance_loss_clip": 1.05455565, "balance_loss_mlp": 1.01723206, "epoch": 0.4552395839595984, "flos": 20777519473920.0, "grad_norm": 2.143409506199595, "language_loss": 0.84147865, "learning_rate": 2.3847356206396143e-06, "loss": 0.86353672, "num_input_tokens_seen": 81543135, "step": 3786, "time_per_iteration": 2.728079319000244 }, { "auxiliary_loss_clip": 0.01182273, "auxiliary_loss_mlp": 0.01032986, "balance_loss_clip": 1.05328512, "balance_loss_mlp": 1.02501059, "epoch": 0.45535982685023746, "flos": 23257510191360.0, "grad_norm": 1.7397714548159515, "language_loss": 0.78616476, "learning_rate": 2.3839711687643227e-06, "loss": 0.80831736, "num_input_tokens_seen": 81564360, "step": 3787, "time_per_iteration": 2.6752138137817383 }, { "auxiliary_loss_clip": 0.01180701, "auxiliary_loss_mlp": 0.0102872, "balance_loss_clip": 1.05427456, "balance_loss_mlp": 1.02010703, "epoch": 0.45548006974087657, "flos": 19646117907840.0, "grad_norm": 3.7242405784484274, "language_loss": 0.74102867, "learning_rate": 2.38320665864091e-06, "loss": 0.76312292, "num_input_tokens_seen": 81583710, "step": 3788, "time_per_iteration": 2.6961207389831543 }, { "auxiliary_loss_clip": 0.01166878, "auxiliary_loss_mlp": 0.01028013, "balance_loss_clip": 1.05006814, "balance_loss_mlp": 1.01966286, "epoch": 0.4556003126315157, "flos": 20047778766720.0, "grad_norm": 1.84417887560819, "language_loss": 0.82159334, "learning_rate": 2.3824420903853516e-06, "loss": 0.84354228, "num_input_tokens_seen": 81602175, "step": 3789, "time_per_iteration": 3.7438576221466064 }, { "auxiliary_loss_clip": 0.01178794, "auxiliary_loss_mlp": 0.01024713, "balance_loss_clip": 1.0546453, "balance_loss_mlp": 1.01718497, "epoch": 0.45572055552215474, "flos": 22959738443520.0, "grad_norm": 2.507299552945029, "language_loss": 0.8228817, "learning_rate": 2.3816774641136324e-06, "loss": 0.84491676, "num_input_tokens_seen": 81619430, "step": 3790, "time_per_iteration": 2.714599370956421 }, { "auxiliary_loss_clip": 0.01176793, "auxiliary_loss_mlp": 0.01062077, "balance_loss_clip": 1.05172968, "balance_loss_mlp": 1.02505684, "epoch": 0.45584079841279385, "flos": 33109925535360.0, "grad_norm": 3.17688126765538, "language_loss": 0.71067119, "learning_rate": 2.380912779941745e-06, "loss": 0.73305982, "num_input_tokens_seen": 81642550, "step": 3791, "time_per_iteration": 3.6721317768096924 }, { "auxiliary_loss_clip": 0.0118135, "auxiliary_loss_mlp": 0.01025262, "balance_loss_clip": 1.05043292, "balance_loss_mlp": 1.0165534, "epoch": 0.45596104130343296, "flos": 27272179445760.0, "grad_norm": 2.244833797774885, "language_loss": 0.8303231, "learning_rate": 2.3801480379856918e-06, "loss": 0.85238922, "num_input_tokens_seen": 81664260, "step": 3792, "time_per_iteration": 3.577894926071167 }, { "auxiliary_loss_clip": 0.01176707, "auxiliary_loss_mlp": 0.01032233, "balance_loss_clip": 1.05316138, "balance_loss_mlp": 1.02365005, "epoch": 0.456081284194072, "flos": 21579799697280.0, "grad_norm": 1.8506087445339217, "language_loss": 0.83700573, "learning_rate": 2.379383238361484e-06, "loss": 0.85909522, "num_input_tokens_seen": 81683620, "step": 3793, "time_per_iteration": 2.756488084793091 }, { "auxiliary_loss_clip": 0.01176767, "auxiliary_loss_mlp": 0.01025373, "balance_loss_clip": 1.05008936, "balance_loss_mlp": 1.01806009, "epoch": 0.4562015270847111, "flos": 35918822113920.0, "grad_norm": 1.9675148800915807, "language_loss": 0.79552913, "learning_rate": 2.3786183811851407e-06, "loss": 0.81755048, "num_input_tokens_seen": 81704325, "step": 3794, "time_per_iteration": 2.8085830211639404 }, { "auxiliary_loss_clip": 0.01185085, "auxiliary_loss_mlp": 0.01024863, "balance_loss_clip": 1.05457532, "balance_loss_mlp": 1.01729977, "epoch": 0.45632176997535023, "flos": 13589783602560.0, "grad_norm": 1.7445742592338325, "language_loss": 0.80184388, "learning_rate": 2.3778534665726892e-06, "loss": 0.82394326, "num_input_tokens_seen": 81721155, "step": 3795, "time_per_iteration": 2.6656134128570557 }, { "auxiliary_loss_clip": 0.01169247, "auxiliary_loss_mlp": 0.01027529, "balance_loss_clip": 1.0512805, "balance_loss_mlp": 1.01978648, "epoch": 0.4564420128659893, "flos": 32635401937920.0, "grad_norm": 1.9379488541505128, "language_loss": 0.72598135, "learning_rate": 2.377088494640168e-06, "loss": 0.74794912, "num_input_tokens_seen": 81742905, "step": 3796, "time_per_iteration": 3.809849739074707 }, { "auxiliary_loss_clip": 0.01175696, "auxiliary_loss_mlp": 0.01027195, "balance_loss_clip": 1.0532124, "balance_loss_mlp": 1.01945806, "epoch": 0.4565622557566284, "flos": 20377690208640.0, "grad_norm": 1.8909352209712893, "language_loss": 0.78040171, "learning_rate": 2.3763234655036216e-06, "loss": 0.80243063, "num_input_tokens_seen": 81762105, "step": 3797, "time_per_iteration": 2.830871820449829 }, { "auxiliary_loss_clip": 0.01174178, "auxiliary_loss_mlp": 0.01031974, "balance_loss_clip": 1.05195606, "balance_loss_mlp": 1.02423191, "epoch": 0.45668249864726745, "flos": 25374372364800.0, "grad_norm": 2.03676101644837, "language_loss": 0.86870116, "learning_rate": 2.3755583792791046e-06, "loss": 0.89076269, "num_input_tokens_seen": 81781975, "step": 3798, "time_per_iteration": 2.7417960166931152 }, { "auxiliary_loss_clip": 0.01179292, "auxiliary_loss_mlp": 0.0102637, "balance_loss_clip": 1.05177569, "balance_loss_mlp": 1.01784062, "epoch": 0.45680274153790656, "flos": 15559806977280.0, "grad_norm": 1.9724079773184704, "language_loss": 0.74547362, "learning_rate": 2.3747932360826803e-06, "loss": 0.7675302, "num_input_tokens_seen": 81798905, "step": 3799, "time_per_iteration": 2.8345580101013184 }, { "auxiliary_loss_clip": 0.01178708, "auxiliary_loss_mlp": 0.01029493, "balance_loss_clip": 1.05405736, "balance_loss_mlp": 1.02100253, "epoch": 0.4569229844285457, "flos": 19792884879360.0, "grad_norm": 2.1675909860908726, "language_loss": 0.82161981, "learning_rate": 2.3740280360304205e-06, "loss": 0.84370184, "num_input_tokens_seen": 81816630, "step": 3800, "time_per_iteration": 2.6484599113464355 }, { "auxiliary_loss_clip": 0.0117228, "auxiliary_loss_mlp": 0.01022935, "balance_loss_clip": 1.05391717, "balance_loss_mlp": 1.01469111, "epoch": 0.45704322731918473, "flos": 24093941270400.0, "grad_norm": 1.7601358214903342, "language_loss": 0.68053782, "learning_rate": 2.3732627792384038e-06, "loss": 0.70248997, "num_input_tokens_seen": 81837700, "step": 3801, "time_per_iteration": 2.976976156234741 }, { "auxiliary_loss_clip": 0.01181183, "auxiliary_loss_mlp": 0.01024231, "balance_loss_clip": 1.05042124, "balance_loss_mlp": 1.01614904, "epoch": 0.45716347020982384, "flos": 31317803245440.0, "grad_norm": 2.0920665202287783, "language_loss": 0.75542688, "learning_rate": 2.3724974658227207e-06, "loss": 0.77748108, "num_input_tokens_seen": 81858490, "step": 3802, "time_per_iteration": 2.7413933277130127 }, { "auxiliary_loss_clip": 0.01175887, "auxiliary_loss_mlp": 0.01050341, "balance_loss_clip": 1.05238771, "balance_loss_mlp": 1.0149374, "epoch": 0.45728371310046295, "flos": 26501392471680.0, "grad_norm": 2.034034396667641, "language_loss": 0.71019483, "learning_rate": 2.3717320958994687e-06, "loss": 0.73245716, "num_input_tokens_seen": 81876050, "step": 3803, "time_per_iteration": 2.6772921085357666 }, { "auxiliary_loss_clip": 0.01176179, "auxiliary_loss_mlp": 0.01023936, "balance_loss_clip": 1.05101001, "balance_loss_mlp": 1.01569235, "epoch": 0.457403955991102, "flos": 17929408222080.0, "grad_norm": 2.3933354576342256, "language_loss": 0.70570385, "learning_rate": 2.3709666695847534e-06, "loss": 0.727705, "num_input_tokens_seen": 81894230, "step": 3804, "time_per_iteration": 2.7034502029418945 }, { "auxiliary_loss_clip": 0.01168545, "auxiliary_loss_mlp": 0.01026692, "balance_loss_clip": 1.05475867, "balance_loss_mlp": 1.0183351, "epoch": 0.4575241988817411, "flos": 42230660837760.0, "grad_norm": 1.9564199213218427, "language_loss": 0.70507705, "learning_rate": 2.370201186994689e-06, "loss": 0.72702944, "num_input_tokens_seen": 81917915, "step": 3805, "time_per_iteration": 2.8884968757629395 }, { "auxiliary_loss_clip": 0.01171004, "auxiliary_loss_mlp": 0.01034457, "balance_loss_clip": 1.05424476, "balance_loss_mlp": 1.02597558, "epoch": 0.45764444177238023, "flos": 30117309868800.0, "grad_norm": 2.3338922167772336, "language_loss": 0.69995439, "learning_rate": 2.369435648245399e-06, "loss": 0.722009, "num_input_tokens_seen": 81938130, "step": 3806, "time_per_iteration": 2.8089749813079834 }, { "auxiliary_loss_clip": 0.01174449, "auxiliary_loss_mlp": 0.01028293, "balance_loss_clip": 1.05218947, "balance_loss_mlp": 1.01926923, "epoch": 0.4577646846630193, "flos": 24060293205120.0, "grad_norm": 1.7693033788826587, "language_loss": 0.8549003, "learning_rate": 2.368670053453015e-06, "loss": 0.87692779, "num_input_tokens_seen": 81959820, "step": 3807, "time_per_iteration": 2.774481773376465 }, { "auxiliary_loss_clip": 0.01185305, "auxiliary_loss_mlp": 0.01032391, "balance_loss_clip": 1.05486691, "balance_loss_mlp": 1.02409387, "epoch": 0.4578849275536584, "flos": 17418578952960.0, "grad_norm": 2.5255565589697673, "language_loss": 0.74340951, "learning_rate": 2.3679044027336757e-06, "loss": 0.76558644, "num_input_tokens_seen": 81975710, "step": 3808, "time_per_iteration": 2.6429550647735596 }, { "auxiliary_loss_clip": 0.01184427, "auxiliary_loss_mlp": 0.01029027, "balance_loss_clip": 1.05280137, "balance_loss_mlp": 1.02023578, "epoch": 0.4580051704442975, "flos": 13510169107200.0, "grad_norm": 3.6758378397998643, "language_loss": 0.69021285, "learning_rate": 2.3671386962035326e-06, "loss": 0.71234739, "num_input_tokens_seen": 81993180, "step": 3809, "time_per_iteration": 2.6480302810668945 }, { "auxiliary_loss_clip": 0.01181415, "auxiliary_loss_mlp": 0.01028819, "balance_loss_clip": 1.05373192, "balance_loss_mlp": 1.02046227, "epoch": 0.45812541333493656, "flos": 18037606965120.0, "grad_norm": 3.017973041326797, "language_loss": 0.68642175, "learning_rate": 2.3663729339787405e-06, "loss": 0.70852417, "num_input_tokens_seen": 82010115, "step": 3810, "time_per_iteration": 2.730515241622925 }, { "auxiliary_loss_clip": 0.01185527, "auxiliary_loss_mlp": 0.0103012, "balance_loss_clip": 1.05388296, "balance_loss_mlp": 1.02156699, "epoch": 0.45824565622557567, "flos": 20222196232320.0, "grad_norm": 3.3153768259014487, "language_loss": 0.73455405, "learning_rate": 2.365607116175466e-06, "loss": 0.75671053, "num_input_tokens_seen": 82025540, "step": 3811, "time_per_iteration": 2.6501364707946777 }, { "auxiliary_loss_clip": 0.01184381, "auxiliary_loss_mlp": 0.01027178, "balance_loss_clip": 1.05427349, "balance_loss_mlp": 1.01900601, "epoch": 0.4583658991162148, "flos": 19864885691520.0, "grad_norm": 2.8899289829574384, "language_loss": 0.66737723, "learning_rate": 2.3648412429098825e-06, "loss": 0.68949282, "num_input_tokens_seen": 82043890, "step": 3812, "time_per_iteration": 2.6478629112243652 }, { "auxiliary_loss_clip": 0.01170613, "auxiliary_loss_mlp": 0.01036156, "balance_loss_clip": 1.05501914, "balance_loss_mlp": 1.02728057, "epoch": 0.45848614200685384, "flos": 21029935322880.0, "grad_norm": 2.064909686366204, "language_loss": 0.8224203, "learning_rate": 2.364075314298172e-06, "loss": 0.84448797, "num_input_tokens_seen": 82061345, "step": 3813, "time_per_iteration": 2.751887798309326 }, { "auxiliary_loss_clip": 0.01184469, "auxiliary_loss_mlp": 0.01061403, "balance_loss_clip": 1.05379522, "balance_loss_mlp": 1.0256635, "epoch": 0.45860638489749295, "flos": 21069293650560.0, "grad_norm": 2.1418463117012987, "language_loss": 0.70470446, "learning_rate": 2.3633093304565267e-06, "loss": 0.7271632, "num_input_tokens_seen": 82080400, "step": 3814, "time_per_iteration": 2.6916263103485107 }, { "auxiliary_loss_clip": 0.01190148, "auxiliary_loss_mlp": 0.01032661, "balance_loss_clip": 1.05617046, "balance_loss_mlp": 1.02419126, "epoch": 0.458726627788132, "flos": 26833889692800.0, "grad_norm": 2.0029075908748384, "language_loss": 0.6298728, "learning_rate": 2.3625432915011443e-06, "loss": 0.65210086, "num_input_tokens_seen": 82102310, "step": 3815, "time_per_iteration": 3.587552547454834 }, { "auxiliary_loss_clip": 0.01175506, "auxiliary_loss_mlp": 0.01027656, "balance_loss_clip": 1.0547905, "balance_loss_mlp": 1.01861989, "epoch": 0.4588468706787711, "flos": 24097927680000.0, "grad_norm": 6.761633464817552, "language_loss": 0.65373087, "learning_rate": 2.3617771975482334e-06, "loss": 0.67576241, "num_input_tokens_seen": 82121140, "step": 3816, "time_per_iteration": 2.654756546020508 }, { "auxiliary_loss_clip": 0.01165102, "auxiliary_loss_mlp": 0.01027746, "balance_loss_clip": 1.05234885, "balance_loss_mlp": 1.01935911, "epoch": 0.4589671135694102, "flos": 17889331622400.0, "grad_norm": 1.6086218718400531, "language_loss": 0.74539709, "learning_rate": 2.3610110487140083e-06, "loss": 0.76732552, "num_input_tokens_seen": 82139575, "step": 3817, "time_per_iteration": 2.741982936859131 }, { "auxiliary_loss_clip": 0.01175184, "auxiliary_loss_mlp": 0.01031187, "balance_loss_clip": 1.05324578, "balance_loss_mlp": 1.02266908, "epoch": 0.4590873564600493, "flos": 25626967781760.0, "grad_norm": 1.863953884163544, "language_loss": 0.80780554, "learning_rate": 2.360244845114695e-06, "loss": 0.82986915, "num_input_tokens_seen": 82159195, "step": 3818, "time_per_iteration": 4.63935661315918 }, { "auxiliary_loss_clip": 0.01170976, "auxiliary_loss_mlp": 0.01027301, "balance_loss_clip": 1.05266094, "balance_loss_mlp": 1.01923609, "epoch": 0.4592075993506884, "flos": 18514788168960.0, "grad_norm": 2.2438530869026, "language_loss": 0.68697411, "learning_rate": 2.3594785868665245e-06, "loss": 0.70895684, "num_input_tokens_seen": 82175500, "step": 3819, "time_per_iteration": 2.61379075050354 }, { "auxiliary_loss_clip": 0.01176451, "auxiliary_loss_mlp": 0.01056254, "balance_loss_clip": 1.0530529, "balance_loss_mlp": 1.01824296, "epoch": 0.4593278422413275, "flos": 20631111638400.0, "grad_norm": 2.4578332141273633, "language_loss": 0.80557537, "learning_rate": 2.3587122740857386e-06, "loss": 0.82790244, "num_input_tokens_seen": 82192600, "step": 3820, "time_per_iteration": 2.755754232406616 }, { "auxiliary_loss_clip": 0.01176037, "auxiliary_loss_mlp": 0.0102323, "balance_loss_clip": 1.05025268, "balance_loss_mlp": 1.01535654, "epoch": 0.45944808513196655, "flos": 21358517961600.0, "grad_norm": 1.585371508229338, "language_loss": 0.78027475, "learning_rate": 2.357945906888586e-06, "loss": 0.80226743, "num_input_tokens_seen": 82212040, "step": 3821, "time_per_iteration": 2.731334686279297 }, { "auxiliary_loss_clip": 0.01181418, "auxiliary_loss_mlp": 0.01027546, "balance_loss_clip": 1.0536375, "balance_loss_mlp": 1.01912975, "epoch": 0.45956832802260567, "flos": 21427789340160.0, "grad_norm": 2.704553418045979, "language_loss": 0.79914492, "learning_rate": 2.357179485391324e-06, "loss": 0.82123458, "num_input_tokens_seen": 82229895, "step": 3822, "time_per_iteration": 3.6961355209350586 }, { "auxiliary_loss_clip": 0.01181324, "auxiliary_loss_mlp": 0.01026662, "balance_loss_clip": 1.05350459, "balance_loss_mlp": 1.01874089, "epoch": 0.4596885709132448, "flos": 22382654538240.0, "grad_norm": 2.146596891714686, "language_loss": 0.85733354, "learning_rate": 2.3564130097102173e-06, "loss": 0.87941343, "num_input_tokens_seen": 82249550, "step": 3823, "time_per_iteration": 2.669794797897339 }, { "auxiliary_loss_clip": 0.01171661, "auxiliary_loss_mlp": 0.01031355, "balance_loss_clip": 1.05498135, "balance_loss_mlp": 1.02358842, "epoch": 0.45980881380388383, "flos": 28981957806720.0, "grad_norm": 1.7872133355624265, "language_loss": 0.74962699, "learning_rate": 2.355646479961541e-06, "loss": 0.77165711, "num_input_tokens_seen": 82268860, "step": 3824, "time_per_iteration": 2.72060227394104 }, { "auxiliary_loss_clip": 0.01183069, "auxiliary_loss_mlp": 0.01032995, "balance_loss_clip": 1.05311608, "balance_loss_mlp": 1.0241555, "epoch": 0.45992905669452294, "flos": 33396599980800.0, "grad_norm": 1.8392346860621602, "language_loss": 0.71622926, "learning_rate": 2.354879896261576e-06, "loss": 0.73838985, "num_input_tokens_seen": 82289070, "step": 3825, "time_per_iteration": 2.768007278442383 }, { "auxiliary_loss_clip": 0.01167384, "auxiliary_loss_mlp": 0.01033814, "balance_loss_clip": 1.05222535, "balance_loss_mlp": 1.02560067, "epoch": 0.46004929958516205, "flos": 36318184502400.0, "grad_norm": 2.094160769606136, "language_loss": 0.57537878, "learning_rate": 2.3541132587266133e-06, "loss": 0.59739077, "num_input_tokens_seen": 82311790, "step": 3826, "time_per_iteration": 2.8740947246551514 }, { "auxiliary_loss_clip": 0.01179812, "auxiliary_loss_mlp": 0.01028811, "balance_loss_clip": 1.05398965, "balance_loss_mlp": 1.02018034, "epoch": 0.4601695424758011, "flos": 17238451224960.0, "grad_norm": 1.9638942604537941, "language_loss": 0.69393879, "learning_rate": 2.3533465674729515e-06, "loss": 0.71602499, "num_input_tokens_seen": 82329020, "step": 3827, "time_per_iteration": 2.757467269897461 }, { "auxiliary_loss_clip": 0.01185229, "auxiliary_loss_mlp": 0.01032273, "balance_loss_clip": 1.05498672, "balance_loss_mlp": 1.02392244, "epoch": 0.4602897853664402, "flos": 15888425529600.0, "grad_norm": 2.6200332082288122, "language_loss": 0.72966802, "learning_rate": 2.352579822616895e-06, "loss": 0.75184298, "num_input_tokens_seen": 82346455, "step": 3828, "time_per_iteration": 2.6778616905212402 }, { "auxiliary_loss_clip": 0.01178239, "auxiliary_loss_mlp": 0.01023584, "balance_loss_clip": 1.05231392, "balance_loss_mlp": 1.01560855, "epoch": 0.4604100282570793, "flos": 25412617370880.0, "grad_norm": 1.7246839938360774, "language_loss": 0.78150278, "learning_rate": 2.351813024274761e-06, "loss": 0.80352104, "num_input_tokens_seen": 82367810, "step": 3829, "time_per_iteration": 2.744110584259033 }, { "auxiliary_loss_clip": 0.0117983, "auxiliary_loss_mlp": 0.01031267, "balance_loss_clip": 1.05512166, "balance_loss_mlp": 1.0233041, "epoch": 0.4605302711477184, "flos": 27630711048960.0, "grad_norm": 2.0313057415547955, "language_loss": 0.74025095, "learning_rate": 2.3510461725628693e-06, "loss": 0.76236188, "num_input_tokens_seen": 82388275, "step": 3830, "time_per_iteration": 2.8486223220825195 }, { "auxiliary_loss_clip": 0.01175731, "auxiliary_loss_mlp": 0.01029036, "balance_loss_clip": 1.05423141, "balance_loss_mlp": 1.02123332, "epoch": 0.4606505140383575, "flos": 23839657914240.0, "grad_norm": 2.050765301077008, "language_loss": 0.70997888, "learning_rate": 2.350279267597554e-06, "loss": 0.73202658, "num_input_tokens_seen": 82408915, "step": 3831, "time_per_iteration": 2.7983765602111816 }, { "auxiliary_loss_clip": 0.01180345, "auxiliary_loss_mlp": 0.01027251, "balance_loss_clip": 1.05290079, "balance_loss_mlp": 1.01876903, "epoch": 0.46077075692899655, "flos": 16107013745280.0, "grad_norm": 2.387565026798546, "language_loss": 0.82922018, "learning_rate": 2.3495123094951515e-06, "loss": 0.85129607, "num_input_tokens_seen": 82427260, "step": 3832, "time_per_iteration": 2.901243209838867 }, { "auxiliary_loss_clip": 0.01169785, "auxiliary_loss_mlp": 0.01027349, "balance_loss_clip": 1.05235493, "balance_loss_mlp": 1.01889682, "epoch": 0.46089099981963566, "flos": 48798147634560.0, "grad_norm": 2.2450413810670566, "language_loss": 0.75505066, "learning_rate": 2.34874529837201e-06, "loss": 0.77702206, "num_input_tokens_seen": 82450805, "step": 3833, "time_per_iteration": 2.9142274856567383 }, { "auxiliary_loss_clip": 0.01165499, "auxiliary_loss_mlp": 0.01028342, "balance_loss_clip": 1.05420935, "balance_loss_mlp": 1.02022362, "epoch": 0.46101124271027477, "flos": 19099234362240.0, "grad_norm": 2.0337477578401724, "language_loss": 0.78806108, "learning_rate": 2.347978234344483e-06, "loss": 0.80999947, "num_input_tokens_seen": 82467010, "step": 3834, "time_per_iteration": 2.8133599758148193 }, { "auxiliary_loss_clip": 0.01185824, "auxiliary_loss_mlp": 0.01032596, "balance_loss_clip": 1.05541587, "balance_loss_mlp": 1.02407241, "epoch": 0.4611314856009138, "flos": 39347931853440.0, "grad_norm": 1.9188888693867674, "language_loss": 0.68797946, "learning_rate": 2.347211117528935e-06, "loss": 0.71016371, "num_input_tokens_seen": 82489310, "step": 3835, "time_per_iteration": 2.905675172805786 }, { "auxiliary_loss_clip": 0.01186532, "auxiliary_loss_mlp": 0.01027884, "balance_loss_clip": 1.05957532, "balance_loss_mlp": 1.01944959, "epoch": 0.46125172849155294, "flos": 20810772489600.0, "grad_norm": 2.164478439652794, "language_loss": 0.71776587, "learning_rate": 2.3464439480417374e-06, "loss": 0.73991001, "num_input_tokens_seen": 82508830, "step": 3836, "time_per_iteration": 2.7282001972198486 }, { "auxiliary_loss_clip": 0.01183772, "auxiliary_loss_mlp": 0.01028344, "balance_loss_clip": 1.05339575, "balance_loss_mlp": 1.01981163, "epoch": 0.46137197138219205, "flos": 17930808852480.0, "grad_norm": 4.065108238048805, "language_loss": 0.7656014, "learning_rate": 2.3456767259992676e-06, "loss": 0.78772253, "num_input_tokens_seen": 82526475, "step": 3837, "time_per_iteration": 2.667724847793579 }, { "auxiliary_loss_clip": 0.01183835, "auxiliary_loss_mlp": 0.01054715, "balance_loss_clip": 1.05200064, "balance_loss_mlp": 1.01728058, "epoch": 0.4614922142728311, "flos": 16836610798080.0, "grad_norm": 2.6124723173999063, "language_loss": 0.89100349, "learning_rate": 2.3449094515179135e-06, "loss": 0.91338897, "num_input_tokens_seen": 82543935, "step": 3838, "time_per_iteration": 2.630537271499634 }, { "auxiliary_loss_clip": 0.01181257, "auxiliary_loss_mlp": 0.01032267, "balance_loss_clip": 1.05315137, "balance_loss_mlp": 1.0238328, "epoch": 0.4616124571634702, "flos": 26614906427520.0, "grad_norm": 1.9386438719348609, "language_loss": 0.81790602, "learning_rate": 2.34414212471407e-06, "loss": 0.84004128, "num_input_tokens_seen": 82563730, "step": 3839, "time_per_iteration": 2.807616710662842 }, { "auxiliary_loss_clip": 0.01186889, "auxiliary_loss_mlp": 0.01026708, "balance_loss_clip": 1.05389047, "balance_loss_mlp": 1.01846457, "epoch": 0.4617327000541093, "flos": 20340127560960.0, "grad_norm": 2.2280974995237974, "language_loss": 0.72868109, "learning_rate": 2.3433747457041394e-06, "loss": 0.75081706, "num_input_tokens_seen": 82582435, "step": 3840, "time_per_iteration": 2.6922435760498047 }, { "auxiliary_loss_clip": 0.01172746, "auxiliary_loss_mlp": 0.01027668, "balance_loss_clip": 1.05459106, "balance_loss_mlp": 1.01962137, "epoch": 0.4618529429447484, "flos": 29570749545600.0, "grad_norm": 2.2665636657420807, "language_loss": 0.85190272, "learning_rate": 2.342607314604533e-06, "loss": 0.87390685, "num_input_tokens_seen": 82602185, "step": 3841, "time_per_iteration": 3.7907633781433105 }, { "auxiliary_loss_clip": 0.0117934, "auxiliary_loss_mlp": 0.01034508, "balance_loss_clip": 1.0544492, "balance_loss_mlp": 1.02628875, "epoch": 0.4619731858353875, "flos": 19787030962560.0, "grad_norm": 1.7482346440978234, "language_loss": 0.84314901, "learning_rate": 2.3418398315316694e-06, "loss": 0.86528754, "num_input_tokens_seen": 82620005, "step": 3842, "time_per_iteration": 2.7239325046539307 }, { "auxiliary_loss_clip": 0.01185562, "auxiliary_loss_mlp": 0.01033887, "balance_loss_clip": 1.05621803, "balance_loss_mlp": 1.02503622, "epoch": 0.4620934287260266, "flos": 18951138587520.0, "grad_norm": 2.690874899073226, "language_loss": 0.78828365, "learning_rate": 2.3410722966019755e-06, "loss": 0.81047809, "num_input_tokens_seen": 82635120, "step": 3843, "time_per_iteration": 2.765958070755005 }, { "auxiliary_loss_clip": 0.01181162, "auxiliary_loss_mlp": 0.01029904, "balance_loss_clip": 1.05443335, "balance_loss_mlp": 1.02152956, "epoch": 0.46221367161666566, "flos": 37341674634240.0, "grad_norm": 1.8938189131783085, "language_loss": 0.6549226, "learning_rate": 2.3403047099318848e-06, "loss": 0.67703331, "num_input_tokens_seen": 82659190, "step": 3844, "time_per_iteration": 4.72355318069458 }, { "auxiliary_loss_clip": 0.01169869, "auxiliary_loss_mlp": 0.01030053, "balance_loss_clip": 1.05472279, "balance_loss_mlp": 1.02178597, "epoch": 0.46233391450730477, "flos": 14428549065600.0, "grad_norm": 2.095957254067218, "language_loss": 0.75107157, "learning_rate": 2.3395370716378405e-06, "loss": 0.77307075, "num_input_tokens_seen": 82676635, "step": 3845, "time_per_iteration": 2.7038042545318604 }, { "auxiliary_loss_clip": 0.01185418, "auxiliary_loss_mlp": 0.01029272, "balance_loss_clip": 1.05420065, "balance_loss_mlp": 1.02084374, "epoch": 0.4624541573979438, "flos": 22493044010880.0, "grad_norm": 2.171314136459523, "language_loss": 0.72218025, "learning_rate": 2.338769381836292e-06, "loss": 0.74432719, "num_input_tokens_seen": 82696245, "step": 3846, "time_per_iteration": 2.817199945449829 }, { "auxiliary_loss_clip": 0.01169683, "auxiliary_loss_mlp": 0.01032673, "balance_loss_clip": 1.05404687, "balance_loss_mlp": 1.02408981, "epoch": 0.46257440028858293, "flos": 14465070218880.0, "grad_norm": 2.6061909675195425, "language_loss": 0.73003912, "learning_rate": 2.3380016406436984e-06, "loss": 0.75206268, "num_input_tokens_seen": 82713725, "step": 3847, "time_per_iteration": 3.730623245239258 }, { "auxiliary_loss_clip": 0.01169699, "auxiliary_loss_mlp": 0.01027879, "balance_loss_clip": 1.05633283, "balance_loss_mlp": 1.01930785, "epoch": 0.46269464317922204, "flos": 23332204523520.0, "grad_norm": 2.0332708205872287, "language_loss": 0.81464064, "learning_rate": 2.337233848176524e-06, "loss": 0.83661646, "num_input_tokens_seen": 82731495, "step": 3848, "time_per_iteration": 2.8750321865081787 }, { "auxiliary_loss_clip": 0.01165315, "auxiliary_loss_mlp": 0.01025187, "balance_loss_clip": 1.05449069, "balance_loss_mlp": 1.01694334, "epoch": 0.4628148860698611, "flos": 18552027594240.0, "grad_norm": 2.1003760207371553, "language_loss": 0.83165264, "learning_rate": 2.3364660045512435e-06, "loss": 0.85355771, "num_input_tokens_seen": 82750255, "step": 3849, "time_per_iteration": 2.9086925983428955 }, { "auxiliary_loss_clip": 0.01080797, "auxiliary_loss_mlp": 0.01003843, "balance_loss_clip": 1.02317584, "balance_loss_mlp": 1.0024066, "epoch": 0.4629351289605002, "flos": 70667569670400.0, "grad_norm": 0.7384501266473003, "language_loss": 0.58168709, "learning_rate": 2.335698109884337e-06, "loss": 0.60253352, "num_input_tokens_seen": 82815460, "step": 3850, "time_per_iteration": 3.4586565494537354 }, { "auxiliary_loss_clip": 0.01090208, "auxiliary_loss_mlp": 0.01004725, "balance_loss_clip": 1.03297758, "balance_loss_mlp": 1.00322855, "epoch": 0.4630553718511393, "flos": 59687200465920.0, "grad_norm": 0.7909148387928417, "language_loss": 0.59816563, "learning_rate": 2.334930164292294e-06, "loss": 0.619115, "num_input_tokens_seen": 82878010, "step": 3851, "time_per_iteration": 3.4906394481658936 }, { "auxiliary_loss_clip": 0.01168093, "auxiliary_loss_mlp": 0.01030094, "balance_loss_clip": 1.05536914, "balance_loss_mlp": 1.02175546, "epoch": 0.4631756147417784, "flos": 15960605909760.0, "grad_norm": 2.386892959268762, "language_loss": 0.80636209, "learning_rate": 2.334162167891612e-06, "loss": 0.82834399, "num_input_tokens_seen": 82895275, "step": 3852, "time_per_iteration": 2.817561626434326 }, { "auxiliary_loss_clip": 0.01181836, "auxiliary_loss_mlp": 0.01028819, "balance_loss_clip": 1.05543756, "balance_loss_mlp": 1.02030158, "epoch": 0.4632958576324175, "flos": 16472907636480.0, "grad_norm": 2.5910999457655817, "language_loss": 0.75049627, "learning_rate": 2.333394120798795e-06, "loss": 0.7726028, "num_input_tokens_seen": 82914010, "step": 3853, "time_per_iteration": 2.8101749420166016 }, { "auxiliary_loss_clip": 0.01180394, "auxiliary_loss_mlp": 0.01025742, "balance_loss_clip": 1.0534023, "balance_loss_mlp": 1.01767707, "epoch": 0.4634161005230566, "flos": 22346492520960.0, "grad_norm": 2.295217482603641, "language_loss": 0.719455, "learning_rate": 2.3326260231303545e-06, "loss": 0.74151635, "num_input_tokens_seen": 82932610, "step": 3854, "time_per_iteration": 2.7776169776916504 }, { "auxiliary_loss_clip": 0.01180895, "auxiliary_loss_mlp": 0.01025751, "balance_loss_clip": 1.05351126, "balance_loss_mlp": 1.01884866, "epoch": 0.46353634341369565, "flos": 15742233175680.0, "grad_norm": 1.7317369329448897, "language_loss": 0.86793876, "learning_rate": 2.331857875002811e-06, "loss": 0.89000523, "num_input_tokens_seen": 82951210, "step": 3855, "time_per_iteration": 2.748683214187622 }, { "auxiliary_loss_clip": 0.01180528, "auxiliary_loss_mlp": 0.01026612, "balance_loss_clip": 1.05701602, "balance_loss_mlp": 1.01798701, "epoch": 0.46365658630433476, "flos": 28329820433280.0, "grad_norm": 1.8870772657093733, "language_loss": 0.76525271, "learning_rate": 2.3310896765326916e-06, "loss": 0.78732413, "num_input_tokens_seen": 82972210, "step": 3856, "time_per_iteration": 2.749094247817993 }, { "auxiliary_loss_clip": 0.01172054, "auxiliary_loss_mlp": 0.01028015, "balance_loss_clip": 1.05566847, "balance_loss_mlp": 1.01925337, "epoch": 0.46377682919497387, "flos": 24608074590720.0, "grad_norm": 3.922190109949323, "language_loss": 0.84112, "learning_rate": 2.330321427836531e-06, "loss": 0.86312068, "num_input_tokens_seen": 82994080, "step": 3857, "time_per_iteration": 2.9204938411712646 }, { "auxiliary_loss_clip": 0.01180072, "auxiliary_loss_mlp": 0.01032397, "balance_loss_clip": 1.05558813, "balance_loss_mlp": 1.02473509, "epoch": 0.4638970720856129, "flos": 19060953442560.0, "grad_norm": 1.9275728218169088, "language_loss": 0.8250339, "learning_rate": 2.3295531290308733e-06, "loss": 0.84715855, "num_input_tokens_seen": 83012230, "step": 3858, "time_per_iteration": 2.7385458946228027 }, { "auxiliary_loss_clip": 0.01190079, "auxiliary_loss_mlp": 0.01056795, "balance_loss_clip": 1.05773783, "balance_loss_mlp": 1.01930141, "epoch": 0.46401731497625204, "flos": 18471012468480.0, "grad_norm": 3.5427301274735443, "language_loss": 0.75063741, "learning_rate": 2.3287847802322678e-06, "loss": 0.77310616, "num_input_tokens_seen": 83027800, "step": 3859, "time_per_iteration": 2.7528939247131348 }, { "auxiliary_loss_clip": 0.01183339, "auxiliary_loss_mlp": 0.01034846, "balance_loss_clip": 1.05451429, "balance_loss_mlp": 1.02636445, "epoch": 0.4641375578668911, "flos": 26067053214720.0, "grad_norm": 1.8214230651003995, "language_loss": 0.83957672, "learning_rate": 2.3280163815572723e-06, "loss": 0.86175859, "num_input_tokens_seen": 83048395, "step": 3860, "time_per_iteration": 2.94238018989563 }, { "auxiliary_loss_clip": 0.01172129, "auxiliary_loss_mlp": 0.01031019, "balance_loss_clip": 1.05402839, "balance_loss_mlp": 1.02268028, "epoch": 0.4642578007575302, "flos": 19570382081280.0, "grad_norm": 2.1335713538731844, "language_loss": 0.77102691, "learning_rate": 2.3272479331224522e-06, "loss": 0.7930584, "num_input_tokens_seen": 83065825, "step": 3861, "time_per_iteration": 2.813295602798462 }, { "auxiliary_loss_clip": 0.01185792, "auxiliary_loss_mlp": 0.01025856, "balance_loss_clip": 1.05330515, "balance_loss_mlp": 1.01800573, "epoch": 0.4643780436481693, "flos": 28186249772160.0, "grad_norm": 1.6995987072017755, "language_loss": 0.78360337, "learning_rate": 2.3264794350443817e-06, "loss": 0.80571985, "num_input_tokens_seen": 83087920, "step": 3862, "time_per_iteration": 2.7705607414245605 }, { "auxiliary_loss_clip": 0.01183639, "auxiliary_loss_mlp": 0.01026512, "balance_loss_clip": 1.0526849, "balance_loss_mlp": 1.0184294, "epoch": 0.46449828653880837, "flos": 25375270204800.0, "grad_norm": 2.0377685673986092, "language_loss": 0.78810561, "learning_rate": 2.3257108874396396e-06, "loss": 0.81020713, "num_input_tokens_seen": 83109015, "step": 3863, "time_per_iteration": 2.816828489303589 }, { "auxiliary_loss_clip": 0.01178693, "auxiliary_loss_mlp": 0.01030945, "balance_loss_clip": 1.05345893, "balance_loss_mlp": 1.02261806, "epoch": 0.4646185294294475, "flos": 16034330574720.0, "grad_norm": 2.272515515259748, "language_loss": 0.73941803, "learning_rate": 2.3249422904248152e-06, "loss": 0.76151443, "num_input_tokens_seen": 83127450, "step": 3864, "time_per_iteration": 2.7593960762023926 }, { "auxiliary_loss_clip": 0.01183638, "auxiliary_loss_mlp": 0.01027676, "balance_loss_clip": 1.05395138, "balance_loss_mlp": 1.01950455, "epoch": 0.4647387723200866, "flos": 26363101109760.0, "grad_norm": 1.595816994786183, "language_loss": 0.87138844, "learning_rate": 2.324173644116504e-06, "loss": 0.89350158, "num_input_tokens_seen": 83150300, "step": 3865, "time_per_iteration": 2.7938756942749023 }, { "auxiliary_loss_clip": 0.01179682, "auxiliary_loss_mlp": 0.01033645, "balance_loss_clip": 1.05577087, "balance_loss_mlp": 1.02517557, "epoch": 0.46485901521072565, "flos": 27160209774720.0, "grad_norm": 1.883957013158374, "language_loss": 0.81488383, "learning_rate": 2.3234049486313087e-06, "loss": 0.83701712, "num_input_tokens_seen": 83171750, "step": 3866, "time_per_iteration": 2.7728734016418457 }, { "auxiliary_loss_clip": 0.01178994, "auxiliary_loss_mlp": 0.01027994, "balance_loss_clip": 1.05175316, "balance_loss_mlp": 1.02010214, "epoch": 0.46497925810136476, "flos": 24279851088000.0, "grad_norm": 4.987042866525958, "language_loss": 0.75787807, "learning_rate": 2.322636204085839e-06, "loss": 0.77994788, "num_input_tokens_seen": 83191820, "step": 3867, "time_per_iteration": 3.734858989715576 }, { "auxiliary_loss_clip": 0.01173091, "auxiliary_loss_mlp": 0.01029648, "balance_loss_clip": 1.05448771, "balance_loss_mlp": 1.02176774, "epoch": 0.46509950099200387, "flos": 16253134272000.0, "grad_norm": 2.3511551591925315, "language_loss": 0.7909615, "learning_rate": 2.3218674105967143e-06, "loss": 0.812989, "num_input_tokens_seen": 83210085, "step": 3868, "time_per_iteration": 2.7610602378845215 }, { "auxiliary_loss_clip": 0.01172019, "auxiliary_loss_mlp": 0.01023493, "balance_loss_clip": 1.05385661, "balance_loss_mlp": 1.01531553, "epoch": 0.4652197438826429, "flos": 23442270773760.0, "grad_norm": 1.5317079331854573, "language_loss": 0.83476186, "learning_rate": 2.3210985682805593e-06, "loss": 0.85671699, "num_input_tokens_seen": 83231865, "step": 3869, "time_per_iteration": 2.809345245361328 }, { "auxiliary_loss_clip": 0.01187432, "auxiliary_loss_mlp": 0.01026637, "balance_loss_clip": 1.05725265, "balance_loss_mlp": 1.01799989, "epoch": 0.46533998677328203, "flos": 16216397637120.0, "grad_norm": 2.489224774143291, "language_loss": 0.68368661, "learning_rate": 2.320329677254007e-06, "loss": 0.70582736, "num_input_tokens_seen": 83249195, "step": 3870, "time_per_iteration": 4.4815452098846436 }, { "auxiliary_loss_clip": 0.01186781, "auxiliary_loss_mlp": 0.01027157, "balance_loss_clip": 1.05656922, "balance_loss_mlp": 1.0188359, "epoch": 0.46546022966392114, "flos": 21141869080320.0, "grad_norm": 2.4498484546057457, "language_loss": 0.72700626, "learning_rate": 2.319560737633697e-06, "loss": 0.74914569, "num_input_tokens_seen": 83267915, "step": 3871, "time_per_iteration": 2.7128515243530273 }, { "auxiliary_loss_clip": 0.01185213, "auxiliary_loss_mlp": 0.01029105, "balance_loss_clip": 1.05623472, "balance_loss_mlp": 1.02002764, "epoch": 0.4655804725545602, "flos": 41171942442240.0, "grad_norm": 1.842795432833291, "language_loss": 0.68400669, "learning_rate": 2.3187917495362775e-06, "loss": 0.70614994, "num_input_tokens_seen": 83292325, "step": 3872, "time_per_iteration": 2.8805348873138428 }, { "auxiliary_loss_clip": 0.01171953, "auxiliary_loss_mlp": 0.01026356, "balance_loss_clip": 1.05654359, "balance_loss_mlp": 1.01828527, "epoch": 0.4657007154451993, "flos": 19570956698880.0, "grad_norm": 3.2357701137447252, "language_loss": 0.76989275, "learning_rate": 2.318022713078403e-06, "loss": 0.7918759, "num_input_tokens_seen": 83306905, "step": 3873, "time_per_iteration": 2.695538282394409 }, { "auxiliary_loss_clip": 0.01176558, "auxiliary_loss_mlp": 0.01024461, "balance_loss_clip": 1.05255032, "balance_loss_mlp": 1.01645577, "epoch": 0.4658209583358384, "flos": 15517826956800.0, "grad_norm": 2.0801823448577395, "language_loss": 0.85501915, "learning_rate": 2.3172536283767354e-06, "loss": 0.8770293, "num_input_tokens_seen": 83320665, "step": 3874, "time_per_iteration": 3.6353538036346436 }, { "auxiliary_loss_clip": 0.01171731, "auxiliary_loss_mlp": 0.01029066, "balance_loss_clip": 1.05625224, "balance_loss_mlp": 1.0206145, "epoch": 0.4659412012264775, "flos": 14903180403840.0, "grad_norm": 1.854118624058924, "language_loss": 0.81527883, "learning_rate": 2.3164844955479447e-06, "loss": 0.83728683, "num_input_tokens_seen": 83336475, "step": 3875, "time_per_iteration": 2.7341983318328857 }, { "auxiliary_loss_clip": 0.01183861, "auxiliary_loss_mlp": 0.01026598, "balance_loss_clip": 1.05707121, "balance_loss_mlp": 1.0181222, "epoch": 0.4660614441171166, "flos": 24425612478720.0, "grad_norm": 1.5522217440518749, "language_loss": 0.70459175, "learning_rate": 2.3157153147087082e-06, "loss": 0.72669637, "num_input_tokens_seen": 83358365, "step": 3876, "time_per_iteration": 2.8241796493530273 }, { "auxiliary_loss_clip": 0.0117969, "auxiliary_loss_mlp": 0.01031814, "balance_loss_clip": 1.05613101, "balance_loss_mlp": 1.02409506, "epoch": 0.46618168700775564, "flos": 22091095843200.0, "grad_norm": 1.7570104566547144, "language_loss": 0.83121288, "learning_rate": 2.314946085975709e-06, "loss": 0.85332787, "num_input_tokens_seen": 83377345, "step": 3877, "time_per_iteration": 2.7599735260009766 }, { "auxiliary_loss_clip": 0.01164801, "auxiliary_loss_mlp": 0.0103203, "balance_loss_clip": 1.05540812, "balance_loss_mlp": 1.02414441, "epoch": 0.46630192989839475, "flos": 26176975810560.0, "grad_norm": 1.6839388089730571, "language_loss": 0.82643104, "learning_rate": 2.3141768094656393e-06, "loss": 0.8483994, "num_input_tokens_seen": 83395920, "step": 3878, "time_per_iteration": 2.7797722816467285 }, { "auxiliary_loss_clip": 0.01172517, "auxiliary_loss_mlp": 0.01026425, "balance_loss_clip": 1.05281663, "balance_loss_mlp": 1.01885545, "epoch": 0.46642217278903386, "flos": 11509622150400.0, "grad_norm": 2.3195393754895006, "language_loss": 0.83529162, "learning_rate": 2.3134074852951966e-06, "loss": 0.85728109, "num_input_tokens_seen": 83412510, "step": 3879, "time_per_iteration": 2.8134357929229736 }, { "auxiliary_loss_clip": 0.01173758, "auxiliary_loss_mlp": 0.01029166, "balance_loss_clip": 1.0584898, "balance_loss_mlp": 1.02071977, "epoch": 0.4665424156796729, "flos": 32306819299200.0, "grad_norm": 10.303480937032978, "language_loss": 0.77669013, "learning_rate": 2.312638113581088e-06, "loss": 0.79871929, "num_input_tokens_seen": 83432995, "step": 3880, "time_per_iteration": 2.858436107635498 }, { "auxiliary_loss_clip": 0.01182271, "auxiliary_loss_mlp": 0.01031908, "balance_loss_clip": 1.05366111, "balance_loss_mlp": 1.02350998, "epoch": 0.46666265857031203, "flos": 18436179254400.0, "grad_norm": 2.597205555412809, "language_loss": 0.78521943, "learning_rate": 2.311868694440027e-06, "loss": 0.80736125, "num_input_tokens_seen": 83447415, "step": 3881, "time_per_iteration": 2.7336068153381348 }, { "auxiliary_loss_clip": 0.01085223, "auxiliary_loss_mlp": 0.01002754, "balance_loss_clip": 1.02163529, "balance_loss_mlp": 1.00132942, "epoch": 0.46678290146095114, "flos": 68438989221120.0, "grad_norm": 0.7392280207092603, "language_loss": 0.62479329, "learning_rate": 2.3110992279887323e-06, "loss": 0.64567304, "num_input_tokens_seen": 83519340, "step": 3882, "time_per_iteration": 3.352262020111084 }, { "auxiliary_loss_clip": 0.01182491, "auxiliary_loss_mlp": 0.01032419, "balance_loss_clip": 1.05663848, "balance_loss_mlp": 1.02380633, "epoch": 0.4669031443515902, "flos": 17712507945600.0, "grad_norm": 2.936158867700055, "language_loss": 0.85028088, "learning_rate": 2.310329714343932e-06, "loss": 0.87243003, "num_input_tokens_seen": 83535490, "step": 3883, "time_per_iteration": 2.7526562213897705 }, { "auxiliary_loss_clip": 0.01179148, "auxiliary_loss_mlp": 0.01029326, "balance_loss_clip": 1.058887, "balance_loss_mlp": 1.02080238, "epoch": 0.4670233872422293, "flos": 23947748916480.0, "grad_norm": 2.0791937006025627, "language_loss": 0.8182987, "learning_rate": 2.309560153622361e-06, "loss": 0.84038341, "num_input_tokens_seen": 83552400, "step": 3884, "time_per_iteration": 2.7702765464782715 }, { "auxiliary_loss_clip": 0.01175691, "auxiliary_loss_mlp": 0.01031259, "balance_loss_clip": 1.05584192, "balance_loss_mlp": 1.02226496, "epoch": 0.4671436301328684, "flos": 28111268131200.0, "grad_norm": 2.024998545937851, "language_loss": 0.74158186, "learning_rate": 2.3087905459407602e-06, "loss": 0.76365137, "num_input_tokens_seen": 83571340, "step": 3885, "time_per_iteration": 2.9315028190612793 }, { "auxiliary_loss_clip": 0.01083528, "auxiliary_loss_mlp": 0.01002664, "balance_loss_clip": 1.02017593, "balance_loss_mlp": 1.00114369, "epoch": 0.46726387302350747, "flos": 69369684566400.0, "grad_norm": 0.8159844426838698, "language_loss": 0.62877488, "learning_rate": 2.3080208914158795e-06, "loss": 0.64963686, "num_input_tokens_seen": 83634340, "step": 3886, "time_per_iteration": 3.376927137374878 }, { "auxiliary_loss_clip": 0.01183091, "auxiliary_loss_mlp": 0.01027363, "balance_loss_clip": 1.06029725, "balance_loss_mlp": 1.01910758, "epoch": 0.4673841159141466, "flos": 25519666878720.0, "grad_norm": 2.35973823391309, "language_loss": 0.72558951, "learning_rate": 2.3072511901644753e-06, "loss": 0.74769408, "num_input_tokens_seen": 83653410, "step": 3887, "time_per_iteration": 2.8434860706329346 }, { "auxiliary_loss_clip": 0.01185157, "auxiliary_loss_mlp": 0.01027608, "balance_loss_clip": 1.05682683, "balance_loss_mlp": 1.0196681, "epoch": 0.4675043588047857, "flos": 24499265316480.0, "grad_norm": 1.967434562814711, "language_loss": 0.80832207, "learning_rate": 2.306481442303309e-06, "loss": 0.83044976, "num_input_tokens_seen": 83672985, "step": 3888, "time_per_iteration": 2.770429849624634 }, { "auxiliary_loss_clip": 0.01182914, "auxiliary_loss_mlp": 0.0103126, "balance_loss_clip": 1.0535388, "balance_loss_mlp": 1.02291548, "epoch": 0.46762460169542475, "flos": 20960771685120.0, "grad_norm": 3.569318306725812, "language_loss": 0.73215652, "learning_rate": 2.3057116479491515e-06, "loss": 0.75429833, "num_input_tokens_seen": 83692395, "step": 3889, "time_per_iteration": 2.7314260005950928 }, { "auxiliary_loss_clip": 0.01178361, "auxiliary_loss_mlp": 0.01029489, "balance_loss_clip": 1.05244553, "balance_loss_mlp": 1.02055991, "epoch": 0.46774484458606386, "flos": 19171666137600.0, "grad_norm": 2.0228718373014734, "language_loss": 0.76044273, "learning_rate": 2.30494180721878e-06, "loss": 0.78252119, "num_input_tokens_seen": 83709735, "step": 3890, "time_per_iteration": 2.7263152599334717 }, { "auxiliary_loss_clip": 0.01182993, "auxiliary_loss_mlp": 0.01034534, "balance_loss_clip": 1.05670524, "balance_loss_mlp": 1.02583146, "epoch": 0.4678650874767029, "flos": 17967689141760.0, "grad_norm": 2.481831583138062, "language_loss": 0.89761031, "learning_rate": 2.3041719202289794e-06, "loss": 0.9197855, "num_input_tokens_seen": 83725910, "step": 3891, "time_per_iteration": 2.725280523300171 }, { "auxiliary_loss_clip": 0.0118115, "auxiliary_loss_mlp": 0.01025197, "balance_loss_clip": 1.05387461, "balance_loss_mlp": 1.01747847, "epoch": 0.467985330367342, "flos": 21360816432000.0, "grad_norm": 2.0666943961905457, "language_loss": 0.80457532, "learning_rate": 2.30340198709654e-06, "loss": 0.82663882, "num_input_tokens_seen": 83745745, "step": 3892, "time_per_iteration": 2.6899399757385254 }, { "auxiliary_loss_clip": 0.0118408, "auxiliary_loss_mlp": 0.01030226, "balance_loss_clip": 1.05419517, "balance_loss_mlp": 1.02158284, "epoch": 0.46810557325798113, "flos": 20521835487360.0, "grad_norm": 2.047307894040422, "language_loss": 0.74477196, "learning_rate": 2.3026320079382605e-06, "loss": 0.76691502, "num_input_tokens_seen": 83762680, "step": 3893, "time_per_iteration": 2.7223503589630127 }, { "auxiliary_loss_clip": 0.01185895, "auxiliary_loss_mlp": 0.01028129, "balance_loss_clip": 1.05587113, "balance_loss_mlp": 1.02023172, "epoch": 0.4682258161486202, "flos": 30117848572800.0, "grad_norm": 1.8287344485886212, "language_loss": 0.76120901, "learning_rate": 2.3018619828709454e-06, "loss": 0.78334916, "num_input_tokens_seen": 83784220, "step": 3894, "time_per_iteration": 3.644557476043701 }, { "auxiliary_loss_clip": 0.01180221, "auxiliary_loss_mlp": 0.01061245, "balance_loss_clip": 1.05558419, "balance_loss_mlp": 1.02117825, "epoch": 0.4683460590392593, "flos": 25293357239040.0, "grad_norm": 2.9500871733417786, "language_loss": 0.82497883, "learning_rate": 2.3010919120114084e-06, "loss": 0.84739351, "num_input_tokens_seen": 83800750, "step": 3895, "time_per_iteration": 2.7262303829193115 }, { "auxiliary_loss_clip": 0.01177579, "auxiliary_loss_mlp": 0.01025324, "balance_loss_clip": 1.05124068, "balance_loss_mlp": 1.01712787, "epoch": 0.4684663019298984, "flos": 15368330551680.0, "grad_norm": 2.5086092976191, "language_loss": 0.65306604, "learning_rate": 2.3003217954764672e-06, "loss": 0.67509508, "num_input_tokens_seen": 83815455, "step": 3896, "time_per_iteration": 3.5364787578582764 }, { "auxiliary_loss_clip": 0.01185242, "auxiliary_loss_mlp": 0.01031263, "balance_loss_clip": 1.05375171, "balance_loss_mlp": 1.02272153, "epoch": 0.46858654482053747, "flos": 27778842737280.0, "grad_norm": 1.6935712379286088, "language_loss": 0.79427767, "learning_rate": 2.299551633382949e-06, "loss": 0.81644273, "num_input_tokens_seen": 83835765, "step": 3897, "time_per_iteration": 2.703343152999878 }, { "auxiliary_loss_clip": 0.01171378, "auxiliary_loss_mlp": 0.01027237, "balance_loss_clip": 1.05407524, "balance_loss_mlp": 1.01853418, "epoch": 0.4687067877111766, "flos": 18040623707520.0, "grad_norm": 2.29947207705324, "language_loss": 0.85217243, "learning_rate": 2.2987814258476854e-06, "loss": 0.87415856, "num_input_tokens_seen": 83853565, "step": 3898, "time_per_iteration": 2.719944715499878 }, { "auxiliary_loss_clip": 0.01177919, "auxiliary_loss_mlp": 0.01030361, "balance_loss_clip": 1.05595887, "balance_loss_mlp": 1.02171206, "epoch": 0.4688270306018157, "flos": 16977380198400.0, "grad_norm": 2.9439610854813565, "language_loss": 0.67903686, "learning_rate": 2.2980111729875177e-06, "loss": 0.70111972, "num_input_tokens_seen": 83869815, "step": 3899, "time_per_iteration": 3.8010878562927246 }, { "auxiliary_loss_clip": 0.01175407, "auxiliary_loss_mlp": 0.01027426, "balance_loss_clip": 1.05540729, "balance_loss_mlp": 1.01917672, "epoch": 0.46894727349245474, "flos": 17821640442240.0, "grad_norm": 1.7285509402688544, "language_loss": 0.82241589, "learning_rate": 2.2972408749192917e-06, "loss": 0.84444422, "num_input_tokens_seen": 83887545, "step": 3900, "time_per_iteration": 2.6563937664031982 }, { "auxiliary_loss_clip": 0.0117685, "auxiliary_loss_mlp": 0.01057615, "balance_loss_clip": 1.05366254, "balance_loss_mlp": 1.01900601, "epoch": 0.46906751638309385, "flos": 21471349559040.0, "grad_norm": 2.1933932069266757, "language_loss": 0.66952109, "learning_rate": 2.296470531759861e-06, "loss": 0.6918658, "num_input_tokens_seen": 83905645, "step": 3901, "time_per_iteration": 2.803342580795288 }, { "auxiliary_loss_clip": 0.01170855, "auxiliary_loss_mlp": 0.01037508, "balance_loss_clip": 1.05374801, "balance_loss_mlp": 1.02906203, "epoch": 0.46918775927373296, "flos": 20337829090560.0, "grad_norm": 5.1974766419080805, "language_loss": 0.79307246, "learning_rate": 2.2957001436260866e-06, "loss": 0.8151561, "num_input_tokens_seen": 83922705, "step": 3902, "time_per_iteration": 2.8694026470184326 }, { "auxiliary_loss_clip": 0.01175476, "auxiliary_loss_mlp": 0.01033982, "balance_loss_clip": 1.05474579, "balance_loss_mlp": 1.02514255, "epoch": 0.469308002164372, "flos": 18403249461120.0, "grad_norm": 1.6196252446808426, "language_loss": 0.73071408, "learning_rate": 2.294929710634836e-06, "loss": 0.75280863, "num_input_tokens_seen": 83940795, "step": 3903, "time_per_iteration": 2.7771782875061035 }, { "auxiliary_loss_clip": 0.01180712, "auxiliary_loss_mlp": 0.01026612, "balance_loss_clip": 1.05388951, "balance_loss_mlp": 1.01811838, "epoch": 0.46942824505501113, "flos": 37962067363200.0, "grad_norm": 1.9498098453521304, "language_loss": 0.61310267, "learning_rate": 2.2941592329029823e-06, "loss": 0.63517588, "num_input_tokens_seen": 83961900, "step": 3904, "time_per_iteration": 2.8393924236297607 }, { "auxiliary_loss_clip": 0.0117807, "auxiliary_loss_mlp": 0.01031728, "balance_loss_clip": 1.05492771, "balance_loss_mlp": 1.02310944, "epoch": 0.46954848794565024, "flos": 21872507627520.0, "grad_norm": 2.1755542261160814, "language_loss": 0.7934643, "learning_rate": 2.2933887105474067e-06, "loss": 0.81556237, "num_input_tokens_seen": 83980075, "step": 3905, "time_per_iteration": 2.963968276977539 }, { "auxiliary_loss_clip": 0.01175997, "auxiliary_loss_mlp": 0.01027461, "balance_loss_clip": 1.053859, "balance_loss_mlp": 1.02016568, "epoch": 0.4696687308362893, "flos": 22016545165440.0, "grad_norm": 1.9988093730767025, "language_loss": 0.81587052, "learning_rate": 2.2926181436849974e-06, "loss": 0.83790517, "num_input_tokens_seen": 83999430, "step": 3906, "time_per_iteration": 2.813709020614624 }, { "auxiliary_loss_clip": 0.01175923, "auxiliary_loss_mlp": 0.0102665, "balance_loss_clip": 1.05218196, "balance_loss_mlp": 1.01784611, "epoch": 0.4697889737269284, "flos": 21613663244160.0, "grad_norm": 1.6429326503594746, "language_loss": 0.72808301, "learning_rate": 2.2918475324326478e-06, "loss": 0.75010872, "num_input_tokens_seen": 84019150, "step": 3907, "time_per_iteration": 2.806800127029419 }, { "auxiliary_loss_clip": 0.01185739, "auxiliary_loss_mlp": 0.01059411, "balance_loss_clip": 1.05583286, "balance_loss_mlp": 1.01894879, "epoch": 0.46990921661756746, "flos": 25228323665280.0, "grad_norm": 2.1908406294916936, "language_loss": 0.91789401, "learning_rate": 2.2910768769072603e-06, "loss": 0.94034541, "num_input_tokens_seen": 84037930, "step": 3908, "time_per_iteration": 2.8271684646606445 }, { "auxiliary_loss_clip": 0.01175344, "auxiliary_loss_mlp": 0.01031408, "balance_loss_clip": 1.05300927, "balance_loss_mlp": 1.02292621, "epoch": 0.47002945950820657, "flos": 13844031045120.0, "grad_norm": 2.033007159352802, "language_loss": 0.7585603, "learning_rate": 2.2903061772257417e-06, "loss": 0.78062785, "num_input_tokens_seen": 84055915, "step": 3909, "time_per_iteration": 2.757927894592285 }, { "auxiliary_loss_clip": 0.01179989, "auxiliary_loss_mlp": 0.01032452, "balance_loss_clip": 1.05472183, "balance_loss_mlp": 1.02353513, "epoch": 0.4701497023988457, "flos": 26247001374720.0, "grad_norm": 1.6698057395912311, "language_loss": 0.78495598, "learning_rate": 2.289535433505007e-06, "loss": 0.80708039, "num_input_tokens_seen": 84077270, "step": 3910, "time_per_iteration": 2.891791820526123 }, { "auxiliary_loss_clip": 0.01180286, "auxiliary_loss_mlp": 0.01027738, "balance_loss_clip": 1.05379367, "balance_loss_mlp": 1.01958692, "epoch": 0.47026994528948474, "flos": 25629517647360.0, "grad_norm": 2.150684192244086, "language_loss": 0.63470733, "learning_rate": 2.2887646458619767e-06, "loss": 0.65678751, "num_input_tokens_seen": 84098635, "step": 3911, "time_per_iteration": 2.78525972366333 }, { "auxiliary_loss_clip": 0.01180434, "auxiliary_loss_mlp": 0.01028443, "balance_loss_clip": 1.05511737, "balance_loss_mlp": 1.01975834, "epoch": 0.47039018818012385, "flos": 20554406144640.0, "grad_norm": 1.992693259250436, "language_loss": 0.76794213, "learning_rate": 2.2879938144135797e-06, "loss": 0.79003096, "num_input_tokens_seen": 84114740, "step": 3912, "time_per_iteration": 2.8438713550567627 }, { "auxiliary_loss_clip": 0.01175342, "auxiliary_loss_mlp": 0.01057182, "balance_loss_clip": 1.05518258, "balance_loss_mlp": 1.02113557, "epoch": 0.47051043107076296, "flos": 21577249831680.0, "grad_norm": 1.9727186914077042, "language_loss": 0.75640833, "learning_rate": 2.2872229392767496e-06, "loss": 0.77873355, "num_input_tokens_seen": 84134845, "step": 3913, "time_per_iteration": 2.884444236755371 }, { "auxiliary_loss_clip": 0.01183539, "auxiliary_loss_mlp": 0.01025967, "balance_loss_clip": 1.05354488, "balance_loss_mlp": 1.01845121, "epoch": 0.470630673961402, "flos": 18953185662720.0, "grad_norm": 1.7324183479354438, "language_loss": 0.74776989, "learning_rate": 2.286452020568428e-06, "loss": 0.76986492, "num_input_tokens_seen": 84152920, "step": 3914, "time_per_iteration": 2.799967050552368 }, { "auxiliary_loss_clip": 0.01187923, "auxiliary_loss_mlp": 0.01027624, "balance_loss_clip": 1.05448258, "balance_loss_mlp": 1.01921976, "epoch": 0.4707509168520411, "flos": 19938969492480.0, "grad_norm": 1.8214800157519164, "language_loss": 0.73116887, "learning_rate": 2.2856810584055637e-06, "loss": 0.75332427, "num_input_tokens_seen": 84170455, "step": 3915, "time_per_iteration": 2.6719751358032227 }, { "auxiliary_loss_clip": 0.01178394, "auxiliary_loss_mlp": 0.01027292, "balance_loss_clip": 1.05142927, "balance_loss_mlp": 1.0189538, "epoch": 0.47087115974268023, "flos": 40118754741120.0, "grad_norm": 1.8966713996166944, "language_loss": 0.67788631, "learning_rate": 2.2849100529051085e-06, "loss": 0.69994318, "num_input_tokens_seen": 84197390, "step": 3916, "time_per_iteration": 2.880030632019043 }, { "auxiliary_loss_clip": 0.01180824, "auxiliary_loss_mlp": 0.01037715, "balance_loss_clip": 1.05208826, "balance_loss_mlp": 1.02971005, "epoch": 0.4709914026333193, "flos": 13552723745280.0, "grad_norm": 2.610669391213229, "language_loss": 0.80097103, "learning_rate": 2.284139004184026e-06, "loss": 0.82315642, "num_input_tokens_seen": 84214620, "step": 3917, "time_per_iteration": 2.7153689861297607 }, { "auxiliary_loss_clip": 0.0118478, "auxiliary_loss_mlp": 0.01031384, "balance_loss_clip": 1.05312777, "balance_loss_mlp": 1.02300382, "epoch": 0.4711116455239584, "flos": 19974628719360.0, "grad_norm": 2.7421869973644366, "language_loss": 0.74763179, "learning_rate": 2.2833679123592814e-06, "loss": 0.76979339, "num_input_tokens_seen": 84231880, "step": 3918, "time_per_iteration": 2.7357213497161865 }, { "auxiliary_loss_clip": 0.01175413, "auxiliary_loss_mlp": 0.01035873, "balance_loss_clip": 1.05349493, "balance_loss_mlp": 1.02740312, "epoch": 0.4712318884145975, "flos": 32124824064000.0, "grad_norm": 3.4251786174507606, "language_loss": 0.63086081, "learning_rate": 2.2825967775478508e-06, "loss": 0.65297371, "num_input_tokens_seen": 84252980, "step": 3919, "time_per_iteration": 2.8897321224212646 }, { "auxiliary_loss_clip": 0.01180495, "auxiliary_loss_mlp": 0.010279, "balance_loss_clip": 1.05056632, "balance_loss_mlp": 1.01956129, "epoch": 0.47135213130523657, "flos": 20047850593920.0, "grad_norm": 2.358466336399503, "language_loss": 0.83328557, "learning_rate": 2.2818255998667135e-06, "loss": 0.85536951, "num_input_tokens_seen": 84271490, "step": 3920, "time_per_iteration": 3.6035654544830322 }, { "auxiliary_loss_clip": 0.01176742, "auxiliary_loss_mlp": 0.01029266, "balance_loss_clip": 1.05307186, "balance_loss_mlp": 1.02098691, "epoch": 0.4714723741958757, "flos": 19426990988160.0, "grad_norm": 1.7453701972035065, "language_loss": 0.78916818, "learning_rate": 2.2810543794328566e-06, "loss": 0.81122828, "num_input_tokens_seen": 84290525, "step": 3921, "time_per_iteration": 2.799285888671875 }, { "auxiliary_loss_clip": 0.01186171, "auxiliary_loss_mlp": 0.0102792, "balance_loss_clip": 1.05540061, "balance_loss_mlp": 1.01958108, "epoch": 0.4715926170865148, "flos": 20373883367040.0, "grad_norm": 1.7457665723115783, "language_loss": 0.82420927, "learning_rate": 2.2802831163632735e-06, "loss": 0.84635019, "num_input_tokens_seen": 84309245, "step": 3922, "time_per_iteration": 5.238365650177002 }, { "auxiliary_loss_clip": 0.01169448, "auxiliary_loss_mlp": 0.01026919, "balance_loss_clip": 1.05285788, "balance_loss_mlp": 1.01823473, "epoch": 0.47171285997715384, "flos": 22672884430080.0, "grad_norm": 1.5871333237506104, "language_loss": 0.74549556, "learning_rate": 2.279511810774965e-06, "loss": 0.76745927, "num_input_tokens_seen": 84330775, "step": 3923, "time_per_iteration": 2.839245080947876 }, { "auxiliary_loss_clip": 0.01182197, "auxiliary_loss_mlp": 0.01028548, "balance_loss_clip": 1.05145359, "balance_loss_mlp": 1.02034009, "epoch": 0.47183310286779295, "flos": 21105419754240.0, "grad_norm": 1.8390156394616368, "language_loss": 0.71756709, "learning_rate": 2.2787404627849364e-06, "loss": 0.73967451, "num_input_tokens_seen": 84349985, "step": 3924, "time_per_iteration": 2.69075870513916 }, { "auxiliary_loss_clip": 0.01175503, "auxiliary_loss_mlp": 0.01022644, "balance_loss_clip": 1.05252278, "balance_loss_mlp": 1.01374495, "epoch": 0.471953345758432, "flos": 21726566668800.0, "grad_norm": 1.743287296789059, "language_loss": 0.78989297, "learning_rate": 2.277969072510202e-06, "loss": 0.81187439, "num_input_tokens_seen": 84368965, "step": 3925, "time_per_iteration": 3.81003475189209 }, { "auxiliary_loss_clip": 0.01172246, "auxiliary_loss_mlp": 0.01029475, "balance_loss_clip": 1.05095327, "balance_loss_mlp": 1.02106524, "epoch": 0.4720735886490711, "flos": 19861078849920.0, "grad_norm": 1.6951233610176846, "language_loss": 0.814421, "learning_rate": 2.2771976400677803e-06, "loss": 0.8364383, "num_input_tokens_seen": 84387795, "step": 3926, "time_per_iteration": 2.7981579303741455 }, { "auxiliary_loss_clip": 0.01161432, "auxiliary_loss_mlp": 0.01028291, "balance_loss_clip": 1.05321884, "balance_loss_mlp": 1.02038741, "epoch": 0.47219383153971023, "flos": 19171809792000.0, "grad_norm": 2.1202595039095207, "language_loss": 0.79241812, "learning_rate": 2.2764261655746965e-06, "loss": 0.81431538, "num_input_tokens_seen": 84405290, "step": 3927, "time_per_iteration": 2.8236565589904785 }, { "auxiliary_loss_clip": 0.01172605, "auxiliary_loss_mlp": 0.01027114, "balance_loss_clip": 1.0537951, "balance_loss_mlp": 1.01854277, "epoch": 0.4723140744303493, "flos": 23224005780480.0, "grad_norm": 1.7252306250603109, "language_loss": 0.75696772, "learning_rate": 2.2756546491479832e-06, "loss": 0.77896494, "num_input_tokens_seen": 84426205, "step": 3928, "time_per_iteration": 2.867982864379883 }, { "auxiliary_loss_clip": 0.01184148, "auxiliary_loss_mlp": 0.01057274, "balance_loss_clip": 1.05223787, "balance_loss_mlp": 1.01994586, "epoch": 0.4724343173209884, "flos": 18223265387520.0, "grad_norm": 2.7742073207261213, "language_loss": 0.79872286, "learning_rate": 2.274883090904679e-06, "loss": 0.82113707, "num_input_tokens_seen": 84443970, "step": 3929, "time_per_iteration": 2.7205424308776855 }, { "auxiliary_loss_clip": 0.01184085, "auxiliary_loss_mlp": 0.01026481, "balance_loss_clip": 1.05321217, "balance_loss_mlp": 1.01779628, "epoch": 0.4725545602116275, "flos": 21251037490560.0, "grad_norm": 4.41939532237853, "language_loss": 0.67815524, "learning_rate": 2.2741114909618283e-06, "loss": 0.70026088, "num_input_tokens_seen": 84459865, "step": 3930, "time_per_iteration": 2.83561372756958 }, { "auxiliary_loss_clip": 0.01175448, "auxiliary_loss_mlp": 0.01026841, "balance_loss_clip": 1.05326116, "balance_loss_mlp": 1.01835942, "epoch": 0.47267480310226656, "flos": 21434002392960.0, "grad_norm": 1.710147183047731, "language_loss": 0.71983892, "learning_rate": 2.2733398494364828e-06, "loss": 0.74186182, "num_input_tokens_seen": 84479110, "step": 3931, "time_per_iteration": 2.734389305114746 }, { "auxiliary_loss_clip": 0.01169243, "auxiliary_loss_mlp": 0.01031112, "balance_loss_clip": 1.05347669, "balance_loss_mlp": 1.0226897, "epoch": 0.47279504599290567, "flos": 18770508069120.0, "grad_norm": 2.5543803168779986, "language_loss": 0.84097326, "learning_rate": 2.272568166445699e-06, "loss": 0.86297679, "num_input_tokens_seen": 84497675, "step": 3932, "time_per_iteration": 2.8856890201568604 }, { "auxiliary_loss_clip": 0.01177757, "auxiliary_loss_mlp": 0.01027703, "balance_loss_clip": 1.05160475, "balance_loss_mlp": 1.01984739, "epoch": 0.4729152888835448, "flos": 21105742976640.0, "grad_norm": 2.7377304611207345, "language_loss": 0.64476776, "learning_rate": 2.271796442106541e-06, "loss": 0.66682243, "num_input_tokens_seen": 84517030, "step": 3933, "time_per_iteration": 2.703080654144287 }, { "auxiliary_loss_clip": 0.01082645, "auxiliary_loss_mlp": 0.01002239, "balance_loss_clip": 1.02373445, "balance_loss_mlp": 1.00071871, "epoch": 0.47303553177418384, "flos": 70201877840640.0, "grad_norm": 0.8005230627594695, "language_loss": 0.56485242, "learning_rate": 2.271024676536079e-06, "loss": 0.58570123, "num_input_tokens_seen": 84577290, "step": 3934, "time_per_iteration": 3.374880075454712 }, { "auxiliary_loss_clip": 0.01184254, "auxiliary_loss_mlp": 0.01031286, "balance_loss_clip": 1.05716729, "balance_loss_mlp": 1.02252388, "epoch": 0.47315577466482295, "flos": 22455122227200.0, "grad_norm": 9.370278372732747, "language_loss": 0.73799348, "learning_rate": 2.2702528698513894e-06, "loss": 0.76014888, "num_input_tokens_seen": 84598415, "step": 3935, "time_per_iteration": 2.907552719116211 }, { "auxiliary_loss_clip": 0.01180955, "auxiliary_loss_mlp": 0.01025726, "balance_loss_clip": 1.05330789, "balance_loss_mlp": 1.01740503, "epoch": 0.47327601755546206, "flos": 24352857480960.0, "grad_norm": 1.8882308876548586, "language_loss": 0.78631878, "learning_rate": 2.269481022169554e-06, "loss": 0.80838561, "num_input_tokens_seen": 84617010, "step": 3936, "time_per_iteration": 2.8154966831207275 }, { "auxiliary_loss_clip": 0.01182884, "auxiliary_loss_mlp": 0.01027533, "balance_loss_clip": 1.05148363, "balance_loss_mlp": 1.01880074, "epoch": 0.4733962604461011, "flos": 22926772736640.0, "grad_norm": 1.8327145138484258, "language_loss": 0.80915236, "learning_rate": 2.2687091336076614e-06, "loss": 0.83125657, "num_input_tokens_seen": 84636350, "step": 3937, "time_per_iteration": 2.8578593730926514 }, { "auxiliary_loss_clip": 0.01176627, "auxiliary_loss_mlp": 0.0103487, "balance_loss_clip": 1.05239654, "balance_loss_mlp": 1.02702022, "epoch": 0.4735165033367402, "flos": 18327369980160.0, "grad_norm": 3.8507201664749577, "language_loss": 0.79912829, "learning_rate": 2.267937204282807e-06, "loss": 0.82124329, "num_input_tokens_seen": 84653490, "step": 3938, "time_per_iteration": 2.7331433296203613 }, { "auxiliary_loss_clip": 0.011876, "auxiliary_loss_mlp": 0.01025148, "balance_loss_clip": 1.05639267, "balance_loss_mlp": 1.01622486, "epoch": 0.4736367462273793, "flos": 23037018554880.0, "grad_norm": 2.280093054090627, "language_loss": 0.79029107, "learning_rate": 2.2671652343120926e-06, "loss": 0.81241852, "num_input_tokens_seen": 84673965, "step": 3939, "time_per_iteration": 2.944822311401367 }, { "auxiliary_loss_clip": 0.0118236, "auxiliary_loss_mlp": 0.01024915, "balance_loss_clip": 1.05290008, "balance_loss_mlp": 1.01687431, "epoch": 0.4737569891180184, "flos": 25374336451200.0, "grad_norm": 1.8022655942715924, "language_loss": 0.80714273, "learning_rate": 2.2663932238126236e-06, "loss": 0.82921541, "num_input_tokens_seen": 84692525, "step": 3940, "time_per_iteration": 2.7429468631744385 }, { "auxiliary_loss_clip": 0.01178643, "auxiliary_loss_mlp": 0.01020802, "balance_loss_clip": 1.05205989, "balance_loss_mlp": 1.01260662, "epoch": 0.4738772320086575, "flos": 25849326925440.0, "grad_norm": 1.8544331125088849, "language_loss": 0.80267203, "learning_rate": 2.265621172901515e-06, "loss": 0.8246665, "num_input_tokens_seen": 84715640, "step": 3941, "time_per_iteration": 2.7889554500579834 }, { "auxiliary_loss_clip": 0.01188774, "auxiliary_loss_mlp": 0.01031672, "balance_loss_clip": 1.05698609, "balance_loss_mlp": 1.02360737, "epoch": 0.47399747489929656, "flos": 27564420499200.0, "grad_norm": 1.969915205611548, "language_loss": 0.71172142, "learning_rate": 2.2648490816958854e-06, "loss": 0.73392582, "num_input_tokens_seen": 84736635, "step": 3942, "time_per_iteration": 2.844696521759033 }, { "auxiliary_loss_clip": 0.01181347, "auxiliary_loss_mlp": 0.01029256, "balance_loss_clip": 1.05389297, "balance_loss_mlp": 1.02035093, "epoch": 0.47411771778993567, "flos": 24863650836480.0, "grad_norm": 5.742061027099506, "language_loss": 0.73071867, "learning_rate": 2.264076950312861e-06, "loss": 0.75282472, "num_input_tokens_seen": 84755445, "step": 3943, "time_per_iteration": 2.8278558254241943 }, { "auxiliary_loss_clip": 0.01181095, "auxiliary_loss_mlp": 0.01028814, "balance_loss_clip": 1.0535686, "balance_loss_mlp": 1.02025437, "epoch": 0.4742379606805748, "flos": 22748009725440.0, "grad_norm": 2.414442017937272, "language_loss": 0.82543397, "learning_rate": 2.2633047788695727e-06, "loss": 0.84753311, "num_input_tokens_seen": 84775750, "step": 3944, "time_per_iteration": 2.8727517127990723 }, { "auxiliary_loss_clip": 0.01175846, "auxiliary_loss_mlp": 0.01025084, "balance_loss_clip": 1.05432296, "balance_loss_mlp": 1.01752925, "epoch": 0.47435820357121383, "flos": 19681130689920.0, "grad_norm": 1.864839555412806, "language_loss": 0.6431402, "learning_rate": 2.262532567483159e-06, "loss": 0.66514957, "num_input_tokens_seen": 84794310, "step": 3945, "time_per_iteration": 2.7963638305664062 }, { "auxiliary_loss_clip": 0.01187114, "auxiliary_loss_mlp": 0.01062223, "balance_loss_clip": 1.05569792, "balance_loss_mlp": 1.02386343, "epoch": 0.47447844646185294, "flos": 25228718714880.0, "grad_norm": 2.1751468754611096, "language_loss": 0.80629766, "learning_rate": 2.2617603162707635e-06, "loss": 0.82879102, "num_input_tokens_seen": 84814720, "step": 3946, "time_per_iteration": 3.7269539833068848 }, { "auxiliary_loss_clip": 0.01185758, "auxiliary_loss_mlp": 0.01026163, "balance_loss_clip": 1.05503011, "balance_loss_mlp": 1.01828599, "epoch": 0.47459868935249205, "flos": 24570619683840.0, "grad_norm": 1.8545178912087774, "language_loss": 0.82822847, "learning_rate": 2.2609880253495363e-06, "loss": 0.85034764, "num_input_tokens_seen": 84834355, "step": 3947, "time_per_iteration": 2.876065731048584 }, { "auxiliary_loss_clip": 0.01187764, "auxiliary_loss_mlp": 0.0102947, "balance_loss_clip": 1.0560832, "balance_loss_mlp": 1.02099979, "epoch": 0.4747189322431311, "flos": 20558500295040.0, "grad_norm": 2.4060511788631276, "language_loss": 0.86680788, "learning_rate": 2.260215694836633e-06, "loss": 0.88898015, "num_input_tokens_seen": 84853530, "step": 3948, "time_per_iteration": 4.773800849914551 }, { "auxiliary_loss_clip": 0.01177202, "auxiliary_loss_mlp": 0.0105789, "balance_loss_clip": 1.05482364, "balance_loss_mlp": 1.01819909, "epoch": 0.4748391751337702, "flos": 25995231970560.0, "grad_norm": 1.9819945680699464, "language_loss": 0.64859259, "learning_rate": 2.2594433248492157e-06, "loss": 0.67094356, "num_input_tokens_seen": 84872505, "step": 3949, "time_per_iteration": 2.9353344440460205 }, { "auxiliary_loss_clip": 0.01187099, "auxiliary_loss_mlp": 0.01031716, "balance_loss_clip": 1.05487394, "balance_loss_mlp": 1.02236438, "epoch": 0.47495941802440933, "flos": 22821052032000.0, "grad_norm": 1.9388886935782865, "language_loss": 0.80318195, "learning_rate": 2.2586709155044527e-06, "loss": 0.82537007, "num_input_tokens_seen": 84893105, "step": 3950, "time_per_iteration": 2.807169198989868 }, { "auxiliary_loss_clip": 0.01183788, "auxiliary_loss_mlp": 0.01026281, "balance_loss_clip": 1.05325103, "balance_loss_mlp": 1.01796007, "epoch": 0.4750796609150484, "flos": 27891782075520.0, "grad_norm": 1.6818525153128268, "language_loss": 0.76312184, "learning_rate": 2.2578984669195167e-06, "loss": 0.78522259, "num_input_tokens_seen": 84914070, "step": 3951, "time_per_iteration": 2.763395309448242 }, { "auxiliary_loss_clip": 0.01176351, "auxiliary_loss_mlp": 0.01023156, "balance_loss_clip": 1.05092037, "balance_loss_mlp": 1.01501441, "epoch": 0.4751999038056875, "flos": 35660085471360.0, "grad_norm": 1.937244272764647, "language_loss": 0.68320978, "learning_rate": 2.2571259792115887e-06, "loss": 0.70520484, "num_input_tokens_seen": 84935290, "step": 3952, "time_per_iteration": 3.6575796604156494 }, { "auxiliary_loss_clip": 0.01177232, "auxiliary_loss_mlp": 0.01028527, "balance_loss_clip": 1.05379987, "balance_loss_mlp": 1.02098107, "epoch": 0.4753201466963266, "flos": 22090880361600.0, "grad_norm": 1.8448577022550956, "language_loss": 0.79477119, "learning_rate": 2.2563534524978544e-06, "loss": 0.81682873, "num_input_tokens_seen": 84952760, "step": 3953, "time_per_iteration": 2.899500846862793 }, { "auxiliary_loss_clip": 0.0116963, "auxiliary_loss_mlp": 0.01023513, "balance_loss_clip": 1.05361509, "balance_loss_mlp": 1.01552546, "epoch": 0.47544038958696566, "flos": 30190854965760.0, "grad_norm": 5.252686320897502, "language_loss": 0.70896465, "learning_rate": 2.2555808868955052e-06, "loss": 0.73089612, "num_input_tokens_seen": 84974890, "step": 3954, "time_per_iteration": 2.821742534637451 }, { "auxiliary_loss_clip": 0.01171089, "auxiliary_loss_mlp": 0.0103591, "balance_loss_clip": 1.05322289, "balance_loss_mlp": 1.02667737, "epoch": 0.47556063247760477, "flos": 23472219738240.0, "grad_norm": 2.6742091394812797, "language_loss": 0.7380625, "learning_rate": 2.254808282521738e-06, "loss": 0.76013249, "num_input_tokens_seen": 84993640, "step": 3955, "time_per_iteration": 2.9189188480377197 }, { "auxiliary_loss_clip": 0.01176237, "auxiliary_loss_mlp": 0.0105785, "balance_loss_clip": 1.05214608, "balance_loss_mlp": 1.02068293, "epoch": 0.4756808753682438, "flos": 25155209531520.0, "grad_norm": 2.5869971128677043, "language_loss": 0.80824959, "learning_rate": 2.2540356394937573e-06, "loss": 0.83059043, "num_input_tokens_seen": 85012340, "step": 3956, "time_per_iteration": 2.8576443195343018 }, { "auxiliary_loss_clip": 0.0118133, "auxiliary_loss_mlp": 0.01027167, "balance_loss_clip": 1.05499339, "balance_loss_mlp": 1.01850605, "epoch": 0.47580111825888294, "flos": 15669729573120.0, "grad_norm": 21.284732633042168, "language_loss": 0.83787996, "learning_rate": 2.253262957928772e-06, "loss": 0.85996497, "num_input_tokens_seen": 85029225, "step": 3957, "time_per_iteration": 2.826627254486084 }, { "auxiliary_loss_clip": 0.01172364, "auxiliary_loss_mlp": 0.01028069, "balance_loss_clip": 1.05208993, "balance_loss_mlp": 1.01956308, "epoch": 0.47592136114952205, "flos": 17636556637440.0, "grad_norm": 1.6829274473513285, "language_loss": 0.71980917, "learning_rate": 2.2524902379439976e-06, "loss": 0.74181342, "num_input_tokens_seen": 85047895, "step": 3958, "time_per_iteration": 2.7211782932281494 }, { "auxiliary_loss_clip": 0.01082577, "auxiliary_loss_mlp": 0.01001292, "balance_loss_clip": 1.03005171, "balance_loss_mlp": 0.99982613, "epoch": 0.4760416040401611, "flos": 61417159292160.0, "grad_norm": 0.7415908258872429, "language_loss": 0.6367473, "learning_rate": 2.251717479656655e-06, "loss": 0.65758598, "num_input_tokens_seen": 85112690, "step": 3959, "time_per_iteration": 3.406252145767212 }, { "auxiliary_loss_clip": 0.01187137, "auxiliary_loss_mlp": 0.01029855, "balance_loss_clip": 1.05510306, "balance_loss_mlp": 1.02114034, "epoch": 0.4761618469308002, "flos": 18405871153920.0, "grad_norm": 2.095200322775709, "language_loss": 0.76061356, "learning_rate": 2.2509446831839704e-06, "loss": 0.78278351, "num_input_tokens_seen": 85132130, "step": 3960, "time_per_iteration": 2.7792160511016846 }, { "auxiliary_loss_clip": 0.01178638, "auxiliary_loss_mlp": 0.01028971, "balance_loss_clip": 1.05226517, "balance_loss_mlp": 1.02074528, "epoch": 0.4762820898214393, "flos": 18040911016320.0, "grad_norm": 3.6478130334319148, "language_loss": 0.82363826, "learning_rate": 2.250171848643177e-06, "loss": 0.84571439, "num_input_tokens_seen": 85149420, "step": 3961, "time_per_iteration": 2.7870566844940186 }, { "auxiliary_loss_clip": 0.01170618, "auxiliary_loss_mlp": 0.01027453, "balance_loss_clip": 1.05294549, "balance_loss_mlp": 1.02033663, "epoch": 0.4764023327120784, "flos": 19318253541120.0, "grad_norm": 1.7398232937869924, "language_loss": 0.86088264, "learning_rate": 2.249398976151513e-06, "loss": 0.88286334, "num_input_tokens_seen": 85166970, "step": 3962, "time_per_iteration": 2.7649786472320557 }, { "auxiliary_loss_clip": 0.01180924, "auxiliary_loss_mlp": 0.01032444, "balance_loss_clip": 1.05214453, "balance_loss_mlp": 1.02361679, "epoch": 0.4765225756027175, "flos": 22747255539840.0, "grad_norm": 5.804595707459842, "language_loss": 0.78784478, "learning_rate": 2.248626065826223e-06, "loss": 0.80997849, "num_input_tokens_seen": 85185175, "step": 3963, "time_per_iteration": 2.721691608428955 }, { "auxiliary_loss_clip": 0.0108346, "auxiliary_loss_mlp": 0.01005309, "balance_loss_clip": 1.02038467, "balance_loss_mlp": 1.00378859, "epoch": 0.4766428184933566, "flos": 65933392106880.0, "grad_norm": 0.7645000247270942, "language_loss": 0.62576187, "learning_rate": 2.2478531177845564e-06, "loss": 0.6466496, "num_input_tokens_seen": 85246170, "step": 3964, "time_per_iteration": 3.313261032104492 }, { "auxiliary_loss_clip": 0.0118087, "auxiliary_loss_mlp": 0.01033302, "balance_loss_clip": 1.05590653, "balance_loss_mlp": 1.0249697, "epoch": 0.47676306138399566, "flos": 24136495908480.0, "grad_norm": 1.9196167667605724, "language_loss": 0.84902978, "learning_rate": 2.247080132143769e-06, "loss": 0.87117147, "num_input_tokens_seen": 85268525, "step": 3965, "time_per_iteration": 2.871953010559082 }, { "auxiliary_loss_clip": 0.01175088, "auxiliary_loss_mlp": 0.0102723, "balance_loss_clip": 1.05311728, "balance_loss_mlp": 1.01873088, "epoch": 0.47688330427463477, "flos": 12604322995200.0, "grad_norm": 2.139337311401645, "language_loss": 0.69382226, "learning_rate": 2.246307109021121e-06, "loss": 0.71584547, "num_input_tokens_seen": 85285930, "step": 3966, "time_per_iteration": 2.817148447036743 }, { "auxiliary_loss_clip": 0.01172117, "auxiliary_loss_mlp": 0.01031237, "balance_loss_clip": 1.05094278, "balance_loss_mlp": 1.02280951, "epoch": 0.4770035471652739, "flos": 21390585828480.0, "grad_norm": 1.8640575493986753, "language_loss": 0.82336783, "learning_rate": 2.2455340485338817e-06, "loss": 0.84540141, "num_input_tokens_seen": 85303565, "step": 3967, "time_per_iteration": 2.9418094158172607 }, { "auxiliary_loss_clip": 0.01180452, "auxiliary_loss_mlp": 0.01032957, "balance_loss_clip": 1.05166674, "balance_loss_mlp": 1.02476144, "epoch": 0.47712379005591293, "flos": 25156251025920.0, "grad_norm": 1.9721425537602348, "language_loss": 0.67613041, "learning_rate": 2.244760950799322e-06, "loss": 0.69826448, "num_input_tokens_seen": 85321835, "step": 3968, "time_per_iteration": 2.9061481952667236 }, { "auxiliary_loss_clip": 0.01160119, "auxiliary_loss_mlp": 0.01027068, "balance_loss_clip": 1.05192518, "balance_loss_mlp": 1.01925945, "epoch": 0.47724403294655204, "flos": 22054323294720.0, "grad_norm": 2.360593492963748, "language_loss": 0.72401136, "learning_rate": 2.2439878159347203e-06, "loss": 0.74588323, "num_input_tokens_seen": 85341260, "step": 3969, "time_per_iteration": 2.899249792098999 }, { "auxiliary_loss_clip": 0.01082506, "auxiliary_loss_mlp": 0.01004861, "balance_loss_clip": 1.01951015, "balance_loss_mlp": 1.00327587, "epoch": 0.4773642758371911, "flos": 70229387658240.0, "grad_norm": 0.7279140795444137, "language_loss": 0.55234492, "learning_rate": 2.2432146440573616e-06, "loss": 0.57321858, "num_input_tokens_seen": 85407220, "step": 3970, "time_per_iteration": 3.40527081489563 }, { "auxiliary_loss_clip": 0.01176172, "auxiliary_loss_mlp": 0.01025762, "balance_loss_clip": 1.0544759, "balance_loss_mlp": 1.01724482, "epoch": 0.4774845187278302, "flos": 23548602009600.0, "grad_norm": 2.0062819174856266, "language_loss": 0.66564441, "learning_rate": 2.242441435284534e-06, "loss": 0.68766373, "num_input_tokens_seen": 85426095, "step": 3971, "time_per_iteration": 2.724453926086426 }, { "auxiliary_loss_clip": 0.01179758, "auxiliary_loss_mlp": 0.01026125, "balance_loss_clip": 1.05291939, "balance_loss_mlp": 1.01768494, "epoch": 0.4776047616184693, "flos": 23075371301760.0, "grad_norm": 5.839927162024223, "language_loss": 0.85279042, "learning_rate": 2.2416681897335337e-06, "loss": 0.8748492, "num_input_tokens_seen": 85444245, "step": 3972, "time_per_iteration": 3.8154690265655518 }, { "auxiliary_loss_clip": 0.0116927, "auxiliary_loss_mlp": 0.01028876, "balance_loss_clip": 1.05347645, "balance_loss_mlp": 1.02125287, "epoch": 0.4777250045091084, "flos": 31898119374720.0, "grad_norm": 2.0837194155457355, "language_loss": 0.66963041, "learning_rate": 2.240894907521661e-06, "loss": 0.69161189, "num_input_tokens_seen": 85463325, "step": 3973, "time_per_iteration": 4.055041313171387 }, { "auxiliary_loss_clip": 0.01176677, "auxiliary_loss_mlp": 0.01024406, "balance_loss_clip": 1.05321205, "balance_loss_mlp": 1.01660085, "epoch": 0.4778452473997475, "flos": 24278163148800.0, "grad_norm": 2.3876751843668105, "language_loss": 0.63856256, "learning_rate": 2.240121588766223e-06, "loss": 0.66057336, "num_input_tokens_seen": 85483375, "step": 3974, "time_per_iteration": 3.7422428131103516 }, { "auxiliary_loss_clip": 0.01169895, "auxiliary_loss_mlp": 0.01026976, "balance_loss_clip": 1.05123258, "balance_loss_mlp": 1.01922727, "epoch": 0.4779654902903866, "flos": 31575031516800.0, "grad_norm": 2.562655326718579, "language_loss": 0.71568996, "learning_rate": 2.239348233584531e-06, "loss": 0.73765868, "num_input_tokens_seen": 85504230, "step": 3975, "time_per_iteration": 2.865642786026001 }, { "auxiliary_loss_clip": 0.01179773, "auxiliary_loss_mlp": 0.01029738, "balance_loss_clip": 1.05277538, "balance_loss_mlp": 1.02144146, "epoch": 0.47808573318102565, "flos": 19500428344320.0, "grad_norm": 2.145640684363773, "language_loss": 0.80936933, "learning_rate": 2.2385748420939013e-06, "loss": 0.83146441, "num_input_tokens_seen": 85523425, "step": 3976, "time_per_iteration": 2.7372305393218994 }, { "auxiliary_loss_clip": 0.01182044, "auxiliary_loss_mlp": 0.01025241, "balance_loss_clip": 1.05599236, "balance_loss_mlp": 1.01799893, "epoch": 0.47820597607166476, "flos": 22601135013120.0, "grad_norm": 2.2049434400626224, "language_loss": 0.72866857, "learning_rate": 2.2378014144116583e-06, "loss": 0.75074142, "num_input_tokens_seen": 85542235, "step": 3977, "time_per_iteration": 3.6592509746551514 }, { "auxiliary_loss_clip": 0.01182531, "auxiliary_loss_mlp": 0.01032988, "balance_loss_clip": 1.05288017, "balance_loss_mlp": 1.02511418, "epoch": 0.4783262189623039, "flos": 23003011353600.0, "grad_norm": 1.9371147945025595, "language_loss": 0.79662365, "learning_rate": 2.23702795065513e-06, "loss": 0.81877887, "num_input_tokens_seen": 85561815, "step": 3978, "time_per_iteration": 2.8355588912963867 }, { "auxiliary_loss_clip": 0.01083036, "auxiliary_loss_mlp": 0.0100226, "balance_loss_clip": 1.01972198, "balance_loss_mlp": 1.00069809, "epoch": 0.47844646185294293, "flos": 49772801226240.0, "grad_norm": 0.9885498349451431, "language_loss": 0.67519081, "learning_rate": 2.2362544509416493e-06, "loss": 0.69604373, "num_input_tokens_seen": 85613930, "step": 3979, "time_per_iteration": 3.1754090785980225 }, { "auxiliary_loss_clip": 0.01169958, "auxiliary_loss_mlp": 0.01021913, "balance_loss_clip": 1.05080938, "balance_loss_mlp": 1.01418853, "epoch": 0.47856670474358204, "flos": 20229558520320.0, "grad_norm": 2.1868221817052604, "language_loss": 0.82760763, "learning_rate": 2.2354809153885572e-06, "loss": 0.84952641, "num_input_tokens_seen": 85631000, "step": 3980, "time_per_iteration": 2.785722255706787 }, { "auxiliary_loss_clip": 0.01179634, "auxiliary_loss_mlp": 0.01025237, "balance_loss_clip": 1.05285227, "balance_loss_mlp": 1.0169102, "epoch": 0.47868694763422115, "flos": 20990936131200.0, "grad_norm": 2.076264364580135, "language_loss": 0.8333571, "learning_rate": 2.234707344113197e-06, "loss": 0.85540581, "num_input_tokens_seen": 85649095, "step": 3981, "time_per_iteration": 2.7786595821380615 }, { "auxiliary_loss_clip": 0.01178673, "auxiliary_loss_mlp": 0.01027988, "balance_loss_clip": 1.0524317, "balance_loss_mlp": 1.02055573, "epoch": 0.4788071905248602, "flos": 19026551191680.0, "grad_norm": 1.8077844394847564, "language_loss": 0.77719969, "learning_rate": 2.233933737232919e-06, "loss": 0.79926634, "num_input_tokens_seen": 85666875, "step": 3982, "time_per_iteration": 2.711097478866577 }, { "auxiliary_loss_clip": 0.01165567, "auxiliary_loss_mlp": 0.01055444, "balance_loss_clip": 1.05266881, "balance_loss_mlp": 1.01774442, "epoch": 0.4789274334154993, "flos": 23002221254400.0, "grad_norm": 1.8011014721243612, "language_loss": 0.78388262, "learning_rate": 2.2331600948650793e-06, "loss": 0.80609268, "num_input_tokens_seen": 85687020, "step": 3983, "time_per_iteration": 2.9672000408172607 }, { "auxiliary_loss_clip": 0.01163163, "auxiliary_loss_mlp": 0.01059505, "balance_loss_clip": 1.05415702, "balance_loss_mlp": 1.02100968, "epoch": 0.4790476763061384, "flos": 23075586783360.0, "grad_norm": 1.6122007282194686, "language_loss": 0.80328, "learning_rate": 2.2323864171270386e-06, "loss": 0.82550675, "num_input_tokens_seen": 85708290, "step": 3984, "time_per_iteration": 2.859679937362671 }, { "auxiliary_loss_clip": 0.01176933, "auxiliary_loss_mlp": 0.01028342, "balance_loss_clip": 1.05351448, "balance_loss_mlp": 1.01989627, "epoch": 0.4791679191967775, "flos": 21179288073600.0, "grad_norm": 1.8612831278209645, "language_loss": 0.72385085, "learning_rate": 2.231612704136164e-06, "loss": 0.74590361, "num_input_tokens_seen": 85728660, "step": 3985, "time_per_iteration": 2.860266923904419 }, { "auxiliary_loss_clip": 0.01177148, "auxiliary_loss_mlp": 0.01024071, "balance_loss_clip": 1.05415928, "balance_loss_mlp": 1.01605392, "epoch": 0.4792881620874166, "flos": 22301495758080.0, "grad_norm": 2.991225651177725, "language_loss": 0.74858111, "learning_rate": 2.2308389560098253e-06, "loss": 0.77059329, "num_input_tokens_seen": 85745035, "step": 3986, "time_per_iteration": 2.7170519828796387 }, { "auxiliary_loss_clip": 0.01175108, "auxiliary_loss_mlp": 0.01022299, "balance_loss_clip": 1.05391765, "balance_loss_mlp": 1.01433015, "epoch": 0.47940840497805565, "flos": 17420877423360.0, "grad_norm": 1.899149456205117, "language_loss": 0.77243292, "learning_rate": 2.2300651728654008e-06, "loss": 0.79440701, "num_input_tokens_seen": 85760295, "step": 3987, "time_per_iteration": 2.7226388454437256 }, { "auxiliary_loss_clip": 0.01079247, "auxiliary_loss_mlp": 0.01047395, "balance_loss_clip": 1.0219214, "balance_loss_mlp": 1.00470352, "epoch": 0.47952864786869476, "flos": 65358175708800.0, "grad_norm": 1.068649330126633, "language_loss": 0.60180783, "learning_rate": 2.229291354820272e-06, "loss": 0.62307423, "num_input_tokens_seen": 85821305, "step": 3988, "time_per_iteration": 3.2875092029571533 }, { "auxiliary_loss_clip": 0.01181475, "auxiliary_loss_mlp": 0.01033617, "balance_loss_clip": 1.05519509, "balance_loss_mlp": 1.02510595, "epoch": 0.47964889075933387, "flos": 16799802336000.0, "grad_norm": 2.7259115191740366, "language_loss": 0.75386345, "learning_rate": 2.228517501991828e-06, "loss": 0.77601445, "num_input_tokens_seen": 85840105, "step": 3989, "time_per_iteration": 2.7665462493896484 }, { "auxiliary_loss_clip": 0.01080353, "auxiliary_loss_mlp": 0.01002169, "balance_loss_clip": 1.02270031, "balance_loss_mlp": 1.00080431, "epoch": 0.4797691336499729, "flos": 70079244808320.0, "grad_norm": 0.815833627382138, "language_loss": 0.6110037, "learning_rate": 2.22774361449746e-06, "loss": 0.6318289, "num_input_tokens_seen": 85896585, "step": 3990, "time_per_iteration": 3.3920788764953613 }, { "auxiliary_loss_clip": 0.01165188, "auxiliary_loss_mlp": 0.01024885, "balance_loss_clip": 1.05338359, "balance_loss_mlp": 1.01655281, "epoch": 0.47988937654061203, "flos": 18953329317120.0, "grad_norm": 2.994003023724289, "language_loss": 0.70779467, "learning_rate": 2.2269696924545668e-06, "loss": 0.72969544, "num_input_tokens_seen": 85914415, "step": 3991, "time_per_iteration": 2.8651864528656006 }, { "auxiliary_loss_clip": 0.01174154, "auxiliary_loss_mlp": 0.01023511, "balance_loss_clip": 1.05582047, "balance_loss_mlp": 1.0161413, "epoch": 0.48000961943125114, "flos": 14461981649280.0, "grad_norm": 3.3552246547403426, "language_loss": 0.77594531, "learning_rate": 2.2261957359805523e-06, "loss": 0.7979219, "num_input_tokens_seen": 85931650, "step": 3992, "time_per_iteration": 2.809690237045288 }, { "auxiliary_loss_clip": 0.01182454, "auxiliary_loss_mlp": 0.01027326, "balance_loss_clip": 1.05275643, "balance_loss_mlp": 1.01929092, "epoch": 0.4801298623218902, "flos": 27051149105280.0, "grad_norm": 3.01872727964803, "language_loss": 0.73965025, "learning_rate": 2.225421745192823e-06, "loss": 0.76174796, "num_input_tokens_seen": 85951805, "step": 3993, "time_per_iteration": 2.8208394050598145 }, { "auxiliary_loss_clip": 0.0117862, "auxiliary_loss_mlp": 0.01026589, "balance_loss_clip": 1.05456471, "balance_loss_mlp": 1.01851261, "epoch": 0.4802501052125293, "flos": 26355236031360.0, "grad_norm": 2.123962031096543, "language_loss": 0.78097701, "learning_rate": 2.2246477202087955e-06, "loss": 0.80302912, "num_input_tokens_seen": 85972485, "step": 3994, "time_per_iteration": 2.831911563873291 }, { "auxiliary_loss_clip": 0.01176464, "auxiliary_loss_mlp": 0.01030471, "balance_loss_clip": 1.0524323, "balance_loss_mlp": 1.02292466, "epoch": 0.4803703481031684, "flos": 20993916960000.0, "grad_norm": 13.656443406277873, "language_loss": 0.83077973, "learning_rate": 2.223873661145887e-06, "loss": 0.85284901, "num_input_tokens_seen": 85992540, "step": 3995, "time_per_iteration": 2.925621509552002 }, { "auxiliary_loss_clip": 0.01169483, "auxiliary_loss_mlp": 0.01055517, "balance_loss_clip": 1.05262482, "balance_loss_mlp": 1.01765537, "epoch": 0.4804905909938075, "flos": 20703722981760.0, "grad_norm": 1.9683228354431999, "language_loss": 0.71261334, "learning_rate": 2.2230995681215226e-06, "loss": 0.73486328, "num_input_tokens_seen": 86012065, "step": 3996, "time_per_iteration": 2.849548578262329 }, { "auxiliary_loss_clip": 0.01170707, "auxiliary_loss_mlp": 0.01026068, "balance_loss_clip": 1.05423808, "balance_loss_mlp": 1.01741958, "epoch": 0.4806108338844466, "flos": 16654831044480.0, "grad_norm": 2.498166530950348, "language_loss": 0.77782047, "learning_rate": 2.2223254412531305e-06, "loss": 0.79978818, "num_input_tokens_seen": 86029435, "step": 3997, "time_per_iteration": 2.8183515071868896 }, { "auxiliary_loss_clip": 0.01161037, "auxiliary_loss_mlp": 0.0102478, "balance_loss_clip": 1.04781079, "balance_loss_mlp": 1.01723993, "epoch": 0.4807310767750857, "flos": 20011329440640.0, "grad_norm": 6.568012979953175, "language_loss": 0.81898129, "learning_rate": 2.221551280658146e-06, "loss": 0.84083945, "num_input_tokens_seen": 86048495, "step": 3998, "time_per_iteration": 3.871800184249878 }, { "auxiliary_loss_clip": 0.01167162, "auxiliary_loss_mlp": 0.01023679, "balance_loss_clip": 1.05540073, "balance_loss_mlp": 1.01624668, "epoch": 0.48085131966572475, "flos": 23185257984000.0, "grad_norm": 1.741273041122503, "language_loss": 0.73797566, "learning_rate": 2.2207770864540085e-06, "loss": 0.75988406, "num_input_tokens_seen": 86067470, "step": 3999, "time_per_iteration": 4.045331239700317 }, { "auxiliary_loss_clip": 0.01167741, "auxiliary_loss_mlp": 0.0102315, "balance_loss_clip": 1.05152011, "balance_loss_mlp": 1.01543093, "epoch": 0.48097156255636386, "flos": 20558643949440.0, "grad_norm": 1.908912565972414, "language_loss": 0.73013359, "learning_rate": 2.220002858758162e-06, "loss": 0.75204253, "num_input_tokens_seen": 86085460, "step": 4000, "time_per_iteration": 3.671708345413208 }, { "auxiliary_loss_clip": 0.01082643, "auxiliary_loss_mlp": 0.01003089, "balance_loss_clip": 1.02000332, "balance_loss_mlp": 1.00157535, "epoch": 0.481091805447003, "flos": 70511608817280.0, "grad_norm": 0.8880112064848117, "language_loss": 0.60873234, "learning_rate": 2.2192285976880573e-06, "loss": 0.62958968, "num_input_tokens_seen": 86149715, "step": 4001, "time_per_iteration": 3.377824306488037 }, { "auxiliary_loss_clip": 0.01172909, "auxiliary_loss_mlp": 0.01070987, "balance_loss_clip": 1.05213976, "balance_loss_mlp": 1.02970958, "epoch": 0.48121204833764203, "flos": 36428214839040.0, "grad_norm": 1.6277341038295943, "language_loss": 0.8102653, "learning_rate": 2.2184543033611485e-06, "loss": 0.83270419, "num_input_tokens_seen": 86170795, "step": 4002, "time_per_iteration": 3.002718210220337 }, { "auxiliary_loss_clip": 0.01177513, "auxiliary_loss_mlp": 0.01022057, "balance_loss_clip": 1.05022764, "balance_loss_mlp": 1.01388478, "epoch": 0.48133229122828114, "flos": 27490264871040.0, "grad_norm": 2.2951677682245326, "language_loss": 0.82059622, "learning_rate": 2.2176799758948957e-06, "loss": 0.84259188, "num_input_tokens_seen": 86190955, "step": 4003, "time_per_iteration": 3.805093765258789 }, { "auxiliary_loss_clip": 0.0117327, "auxiliary_loss_mlp": 0.01034458, "balance_loss_clip": 1.0561626, "balance_loss_mlp": 1.02590466, "epoch": 0.4814525341189202, "flos": 43072802179200.0, "grad_norm": 2.525179649601361, "language_loss": 0.73076749, "learning_rate": 2.2169056154067635e-06, "loss": 0.75284481, "num_input_tokens_seen": 86214875, "step": 4004, "time_per_iteration": 2.968693494796753 }, { "auxiliary_loss_clip": 0.01177816, "auxiliary_loss_mlp": 0.01053618, "balance_loss_clip": 1.05362749, "balance_loss_mlp": 1.01671457, "epoch": 0.4815727770095593, "flos": 24236901400320.0, "grad_norm": 1.93918192950976, "language_loss": 0.82284367, "learning_rate": 2.216131222014222e-06, "loss": 0.84515804, "num_input_tokens_seen": 86232950, "step": 4005, "time_per_iteration": 2.7821362018585205 }, { "auxiliary_loss_clip": 0.0116901, "auxiliary_loss_mlp": 0.01028835, "balance_loss_clip": 1.05526912, "balance_loss_mlp": 1.02112782, "epoch": 0.4816930199001984, "flos": 18113630100480.0, "grad_norm": 2.530347700577987, "language_loss": 0.80650485, "learning_rate": 2.2153567958347455e-06, "loss": 0.82848328, "num_input_tokens_seen": 86249160, "step": 4006, "time_per_iteration": 2.882204055786133 }, { "auxiliary_loss_clip": 0.0117777, "auxiliary_loss_mlp": 0.01027548, "balance_loss_clip": 1.0557909, "balance_loss_mlp": 1.01971567, "epoch": 0.48181326279083747, "flos": 17274720983040.0, "grad_norm": 2.1758755197308894, "language_loss": 0.7960996, "learning_rate": 2.214582336985815e-06, "loss": 0.81815279, "num_input_tokens_seen": 86267060, "step": 4007, "time_per_iteration": 2.693971872329712 }, { "auxiliary_loss_clip": 0.01168863, "auxiliary_loss_mlp": 0.01031889, "balance_loss_clip": 1.05224895, "balance_loss_mlp": 1.02378821, "epoch": 0.4819335056814766, "flos": 14903252231040.0, "grad_norm": 2.37932109566224, "language_loss": 0.66548598, "learning_rate": 2.2138078455849142e-06, "loss": 0.68749344, "num_input_tokens_seen": 86285055, "step": 4008, "time_per_iteration": 2.765109062194824 }, { "auxiliary_loss_clip": 0.01180839, "auxiliary_loss_mlp": 0.01028312, "balance_loss_clip": 1.05315828, "balance_loss_mlp": 1.02104032, "epoch": 0.4820537485721157, "flos": 19244888012160.0, "grad_norm": 3.16023231827346, "language_loss": 0.78741544, "learning_rate": 2.2130333217495334e-06, "loss": 0.80950701, "num_input_tokens_seen": 86304225, "step": 4009, "time_per_iteration": 2.7235496044158936 }, { "auxiliary_loss_clip": 0.01172633, "auxiliary_loss_mlp": 0.0102485, "balance_loss_clip": 1.05190063, "balance_loss_mlp": 1.01610589, "epoch": 0.48217399146275475, "flos": 16033791870720.0, "grad_norm": 2.7176517088543166, "language_loss": 0.68079507, "learning_rate": 2.2122587655971665e-06, "loss": 0.70276988, "num_input_tokens_seen": 86319170, "step": 4010, "time_per_iteration": 2.740482807159424 }, { "auxiliary_loss_clip": 0.01173643, "auxiliary_loss_mlp": 0.01026259, "balance_loss_clip": 1.05190444, "balance_loss_mlp": 1.01861811, "epoch": 0.48229423435339386, "flos": 24134197438080.0, "grad_norm": 1.8665932702391672, "language_loss": 0.64098108, "learning_rate": 2.211484177245314e-06, "loss": 0.66298014, "num_input_tokens_seen": 86338760, "step": 4011, "time_per_iteration": 2.7744007110595703 }, { "auxiliary_loss_clip": 0.01182063, "auxiliary_loss_mlp": 0.01019212, "balance_loss_clip": 1.05318761, "balance_loss_mlp": 1.01171947, "epoch": 0.48241447724403297, "flos": 23805435231360.0, "grad_norm": 2.071605741304652, "language_loss": 0.72478497, "learning_rate": 2.21070955681148e-06, "loss": 0.74679774, "num_input_tokens_seen": 86357865, "step": 4012, "time_per_iteration": 2.6865456104278564 }, { "auxiliary_loss_clip": 0.01164789, "auxiliary_loss_mlp": 0.01029225, "balance_loss_clip": 1.05372405, "balance_loss_mlp": 1.02161956, "epoch": 0.482534720134672, "flos": 23110312256640.0, "grad_norm": 1.8486735726711068, "language_loss": 0.78274256, "learning_rate": 2.209934904413174e-06, "loss": 0.80468273, "num_input_tokens_seen": 86379470, "step": 4013, "time_per_iteration": 2.7778944969177246 }, { "auxiliary_loss_clip": 0.01169871, "auxiliary_loss_mlp": 0.01033132, "balance_loss_clip": 1.05449247, "balance_loss_mlp": 1.02509749, "epoch": 0.48265496302531113, "flos": 20923819568640.0, "grad_norm": 3.254733442299739, "language_loss": 0.72396249, "learning_rate": 2.2091602201679095e-06, "loss": 0.74599254, "num_input_tokens_seen": 86399080, "step": 4014, "time_per_iteration": 2.831733226776123 }, { "auxiliary_loss_clip": 0.01173213, "auxiliary_loss_mlp": 0.01024295, "balance_loss_clip": 1.05221891, "balance_loss_mlp": 1.01620686, "epoch": 0.48277520591595025, "flos": 15231152511360.0, "grad_norm": 2.267059439261722, "language_loss": 0.83027238, "learning_rate": 2.208385504193206e-06, "loss": 0.85224742, "num_input_tokens_seen": 86416580, "step": 4015, "time_per_iteration": 2.7283999919891357 }, { "auxiliary_loss_clip": 0.01179691, "auxiliary_loss_mlp": 0.01020312, "balance_loss_clip": 1.05060148, "balance_loss_mlp": 1.01268244, "epoch": 0.4828954488065893, "flos": 17858664385920.0, "grad_norm": 4.635193118427357, "language_loss": 0.80876517, "learning_rate": 2.2076107566065873e-06, "loss": 0.83076525, "num_input_tokens_seen": 86434365, "step": 4016, "time_per_iteration": 2.6726720333099365 }, { "auxiliary_loss_clip": 0.01180783, "auxiliary_loss_mlp": 0.01027565, "balance_loss_clip": 1.05303717, "balance_loss_mlp": 1.01994181, "epoch": 0.4830156916972284, "flos": 32087405070720.0, "grad_norm": 2.5709730916086633, "language_loss": 0.75832719, "learning_rate": 2.2068359775255816e-06, "loss": 0.78041065, "num_input_tokens_seen": 86452675, "step": 4017, "time_per_iteration": 2.8192176818847656 }, { "auxiliary_loss_clip": 0.011645, "auxiliary_loss_mlp": 0.01023089, "balance_loss_clip": 1.05093026, "balance_loss_mlp": 1.01528692, "epoch": 0.48313593458786747, "flos": 21871717528320.0, "grad_norm": 12.62070041357154, "language_loss": 0.79121786, "learning_rate": 2.206061167067723e-06, "loss": 0.81309378, "num_input_tokens_seen": 86470785, "step": 4018, "time_per_iteration": 2.84395694732666 }, { "auxiliary_loss_clip": 0.01172277, "auxiliary_loss_mlp": 0.01024328, "balance_loss_clip": 1.05295801, "balance_loss_mlp": 1.01620436, "epoch": 0.4832561774785066, "flos": 22601206840320.0, "grad_norm": 2.248139995330448, "language_loss": 0.80006427, "learning_rate": 2.205286325350549e-06, "loss": 0.82203031, "num_input_tokens_seen": 86489850, "step": 4019, "time_per_iteration": 2.7269537448883057 }, { "auxiliary_loss_clip": 0.011673, "auxiliary_loss_mlp": 0.0102751, "balance_loss_clip": 1.0511409, "balance_loss_mlp": 1.01973724, "epoch": 0.4833764203691457, "flos": 13437342282240.0, "grad_norm": 2.3131261078916685, "language_loss": 0.72573709, "learning_rate": 2.204511452491603e-06, "loss": 0.74768519, "num_input_tokens_seen": 86506475, "step": 4020, "time_per_iteration": 2.759697675704956 }, { "auxiliary_loss_clip": 0.01176379, "auxiliary_loss_mlp": 0.01023926, "balance_loss_clip": 1.05291939, "balance_loss_mlp": 1.01637435, "epoch": 0.48349666325978474, "flos": 44128036955520.0, "grad_norm": 1.8117910499626129, "language_loss": 0.7529884, "learning_rate": 2.2037365486084316e-06, "loss": 0.77499145, "num_input_tokens_seen": 86529715, "step": 4021, "time_per_iteration": 2.912036895751953 }, { "auxiliary_loss_clip": 0.01181671, "auxiliary_loss_mlp": 0.01028719, "balance_loss_clip": 1.05221725, "balance_loss_mlp": 1.02095222, "epoch": 0.48361690615042385, "flos": 26028377245440.0, "grad_norm": 1.9249305179614085, "language_loss": 0.78210104, "learning_rate": 2.2029616138185886e-06, "loss": 0.804205, "num_input_tokens_seen": 86548715, "step": 4022, "time_per_iteration": 2.7473111152648926 }, { "auxiliary_loss_clip": 0.01167588, "auxiliary_loss_mlp": 0.01029758, "balance_loss_clip": 1.05332959, "balance_loss_mlp": 1.02192593, "epoch": 0.48373714904106296, "flos": 22273306560000.0, "grad_norm": 1.7544610293068956, "language_loss": 0.82868826, "learning_rate": 2.202186648239629e-06, "loss": 0.85066164, "num_input_tokens_seen": 86568650, "step": 4023, "time_per_iteration": 2.757185935974121 }, { "auxiliary_loss_clip": 0.011712, "auxiliary_loss_mlp": 0.01027141, "balance_loss_clip": 1.05035853, "balance_loss_mlp": 1.01948214, "epoch": 0.483857391931702, "flos": 28292293699200.0, "grad_norm": 1.813492789962035, "language_loss": 0.71777296, "learning_rate": 2.201411651989117e-06, "loss": 0.73975641, "num_input_tokens_seen": 86590630, "step": 4024, "time_per_iteration": 3.800018072128296 }, { "auxiliary_loss_clip": 0.01168813, "auxiliary_loss_mlp": 0.01054038, "balance_loss_clip": 1.05121422, "balance_loss_mlp": 1.0162518, "epoch": 0.48397763482234113, "flos": 27418048577280.0, "grad_norm": 1.9119767416634255, "language_loss": 0.78194654, "learning_rate": 2.2006366251846167e-06, "loss": 0.80417508, "num_input_tokens_seen": 86611270, "step": 4025, "time_per_iteration": 2.8571887016296387 }, { "auxiliary_loss_clip": 0.01174689, "auxiliary_loss_mlp": 0.01025156, "balance_loss_clip": 1.05393291, "balance_loss_mlp": 1.01792634, "epoch": 0.48409787771298024, "flos": 16797252470400.0, "grad_norm": 2.0058241111095123, "language_loss": 0.75542772, "learning_rate": 2.1998615679436997e-06, "loss": 0.77742624, "num_input_tokens_seen": 86628810, "step": 4026, "time_per_iteration": 4.670713186264038 }, { "auxiliary_loss_clip": 0.01180406, "auxiliary_loss_mlp": 0.010272, "balance_loss_clip": 1.05174875, "balance_loss_mlp": 1.01864648, "epoch": 0.4842181206036193, "flos": 25083496028160.0, "grad_norm": 2.393021094221106, "language_loss": 0.773871, "learning_rate": 2.199086480383942e-06, "loss": 0.79594707, "num_input_tokens_seen": 86648185, "step": 4027, "time_per_iteration": 2.809067726135254 }, { "auxiliary_loss_clip": 0.01189411, "auxiliary_loss_mlp": 0.01032182, "balance_loss_clip": 1.05654812, "balance_loss_mlp": 1.02343774, "epoch": 0.4843383634942584, "flos": 30372311496960.0, "grad_norm": 3.4343517539561526, "language_loss": 0.67786217, "learning_rate": 2.1983113626229234e-06, "loss": 0.70007807, "num_input_tokens_seen": 86667435, "step": 4028, "time_per_iteration": 2.966118335723877 }, { "auxiliary_loss_clip": 0.01168283, "auxiliary_loss_mlp": 0.01059001, "balance_loss_clip": 1.05299807, "balance_loss_mlp": 1.01885068, "epoch": 0.4844586063848975, "flos": 20413564917120.0, "grad_norm": 1.6610085979735525, "language_loss": 0.78794849, "learning_rate": 2.1975362147782293e-06, "loss": 0.81022131, "num_input_tokens_seen": 86686630, "step": 4029, "time_per_iteration": 2.843942165374756 }, { "auxiliary_loss_clip": 0.01085529, "auxiliary_loss_mlp": 0.01007708, "balance_loss_clip": 1.02868986, "balance_loss_mlp": 1.0064503, "epoch": 0.48457884927553657, "flos": 70303722854400.0, "grad_norm": 0.6892225095638694, "language_loss": 0.54143238, "learning_rate": 2.196761036967448e-06, "loss": 0.5623647, "num_input_tokens_seen": 86754595, "step": 4030, "time_per_iteration": 4.326223850250244 }, { "auxiliary_loss_clip": 0.01170454, "auxiliary_loss_mlp": 0.0102422, "balance_loss_clip": 1.0494957, "balance_loss_mlp": 1.0169425, "epoch": 0.4846990921661757, "flos": 19934516206080.0, "grad_norm": 5.533550318379303, "language_loss": 0.77373844, "learning_rate": 2.1959858293081743e-06, "loss": 0.79568523, "num_input_tokens_seen": 86773730, "step": 4031, "time_per_iteration": 2.790104389190674 }, { "auxiliary_loss_clip": 0.01170084, "auxiliary_loss_mlp": 0.01030201, "balance_loss_clip": 1.05436492, "balance_loss_mlp": 1.02258372, "epoch": 0.4848193350568148, "flos": 23075945919360.0, "grad_norm": 1.6405843175941563, "language_loss": 0.76089811, "learning_rate": 2.1952105919180056e-06, "loss": 0.78290099, "num_input_tokens_seen": 86792985, "step": 4032, "time_per_iteration": 2.829463005065918 }, { "auxiliary_loss_clip": 0.01173173, "auxiliary_loss_mlp": 0.0102752, "balance_loss_clip": 1.05360985, "balance_loss_mlp": 1.01982534, "epoch": 0.48493957794745385, "flos": 22455481363200.0, "grad_norm": 3.439947748106399, "language_loss": 0.68129474, "learning_rate": 2.1944353249145456e-06, "loss": 0.70330167, "num_input_tokens_seen": 86812095, "step": 4033, "time_per_iteration": 2.746196746826172 }, { "auxiliary_loss_clip": 0.01181393, "auxiliary_loss_mlp": 0.01023182, "balance_loss_clip": 1.0545491, "balance_loss_mlp": 1.01608288, "epoch": 0.48505982083809296, "flos": 25046112948480.0, "grad_norm": 2.0713818804506503, "language_loss": 0.74777633, "learning_rate": 2.193660028415401e-06, "loss": 0.76982212, "num_input_tokens_seen": 86832875, "step": 4034, "time_per_iteration": 2.785405397415161 }, { "auxiliary_loss_clip": 0.01167914, "auxiliary_loss_mlp": 0.01024438, "balance_loss_clip": 1.05225182, "balance_loss_mlp": 1.01672566, "epoch": 0.485180063728732, "flos": 26761386090240.0, "grad_norm": 1.95718032318338, "language_loss": 0.82279599, "learning_rate": 2.1928847025381852e-06, "loss": 0.84471953, "num_input_tokens_seen": 86853480, "step": 4035, "time_per_iteration": 2.814882755279541 }, { "auxiliary_loss_clip": 0.01175923, "auxiliary_loss_mlp": 0.01025889, "balance_loss_clip": 1.05003428, "balance_loss_mlp": 1.0174911, "epoch": 0.4853003066193711, "flos": 24059143969920.0, "grad_norm": 1.7769891963681361, "language_loss": 0.83771956, "learning_rate": 2.192109347400512e-06, "loss": 0.85973769, "num_input_tokens_seen": 86873695, "step": 4036, "time_per_iteration": 2.758516788482666 }, { "auxiliary_loss_clip": 0.0117814, "auxiliary_loss_mlp": 0.01030909, "balance_loss_clip": 1.05363035, "balance_loss_mlp": 1.02243316, "epoch": 0.48542054951001024, "flos": 23076376882560.0, "grad_norm": 1.6873432499697338, "language_loss": 0.78661609, "learning_rate": 2.191333963120004e-06, "loss": 0.80870658, "num_input_tokens_seen": 86892675, "step": 4037, "time_per_iteration": 2.7756834030151367 }, { "auxiliary_loss_clip": 0.01171165, "auxiliary_loss_mlp": 0.01028342, "balance_loss_clip": 1.05030179, "balance_loss_mlp": 1.02042663, "epoch": 0.4855407924006493, "flos": 25664889565440.0, "grad_norm": 6.831849644832923, "language_loss": 0.70229638, "learning_rate": 2.190558549814286e-06, "loss": 0.72429144, "num_input_tokens_seen": 86912835, "step": 4038, "time_per_iteration": 2.8845558166503906 }, { "auxiliary_loss_clip": 0.01174544, "auxiliary_loss_mlp": 0.01026803, "balance_loss_clip": 1.05356658, "balance_loss_mlp": 1.018893, "epoch": 0.4856610352912884, "flos": 23987933256960.0, "grad_norm": 1.732812265928087, "language_loss": 0.79384398, "learning_rate": 2.1897831076009872e-06, "loss": 0.81585741, "num_input_tokens_seen": 86932475, "step": 4039, "time_per_iteration": 2.8420469760894775 }, { "auxiliary_loss_clip": 0.01177806, "auxiliary_loss_mlp": 0.01026067, "balance_loss_clip": 1.05208898, "balance_loss_mlp": 1.01873553, "epoch": 0.4857812781819275, "flos": 24096814358400.0, "grad_norm": 1.6448798976324674, "language_loss": 0.80015564, "learning_rate": 2.1890076365977426e-06, "loss": 0.8221944, "num_input_tokens_seen": 86952300, "step": 4040, "time_per_iteration": 2.8363664150238037 }, { "auxiliary_loss_clip": 0.01084934, "auxiliary_loss_mlp": 0.01001079, "balance_loss_clip": 1.02737522, "balance_loss_mlp": 0.99980372, "epoch": 0.48590152107256657, "flos": 56266635185280.0, "grad_norm": 0.8592663530137988, "language_loss": 0.52754599, "learning_rate": 2.188232136922189e-06, "loss": 0.54840612, "num_input_tokens_seen": 87010420, "step": 4041, "time_per_iteration": 3.219264507293701 }, { "auxiliary_loss_clip": 0.01161797, "auxiliary_loss_mlp": 0.01027825, "balance_loss_clip": 1.05207372, "balance_loss_mlp": 1.01893759, "epoch": 0.4860217639632057, "flos": 20046988667520.0, "grad_norm": 2.003589892497396, "language_loss": 0.76130784, "learning_rate": 2.187456608691971e-06, "loss": 0.78320402, "num_input_tokens_seen": 87029295, "step": 4042, "time_per_iteration": 2.9138362407684326 }, { "auxiliary_loss_clip": 0.01177128, "auxiliary_loss_mlp": 0.01031864, "balance_loss_clip": 1.05545664, "balance_loss_mlp": 1.02488184, "epoch": 0.4861420068538448, "flos": 17822143232640.0, "grad_norm": 2.1739475410104623, "language_loss": 0.87719929, "learning_rate": 2.1866810520247334e-06, "loss": 0.89928925, "num_input_tokens_seen": 87048165, "step": 4043, "time_per_iteration": 2.7594187259674072 }, { "auxiliary_loss_clip": 0.01182569, "auxiliary_loss_mlp": 0.01024992, "balance_loss_clip": 1.05335844, "balance_loss_mlp": 1.01674318, "epoch": 0.48626224974448384, "flos": 26250125857920.0, "grad_norm": 2.1604520479771927, "language_loss": 0.65178621, "learning_rate": 2.185905467038129e-06, "loss": 0.67386186, "num_input_tokens_seen": 87067070, "step": 4044, "time_per_iteration": 2.745185613632202 }, { "auxiliary_loss_clip": 0.01179117, "auxiliary_loss_mlp": 0.01030299, "balance_loss_clip": 1.05367494, "balance_loss_mlp": 1.02285743, "epoch": 0.48638249263512295, "flos": 22054502862720.0, "grad_norm": 1.7764794252099538, "language_loss": 0.77626479, "learning_rate": 2.1851298538498127e-06, "loss": 0.79835892, "num_input_tokens_seen": 87086785, "step": 4045, "time_per_iteration": 2.7663466930389404 }, { "auxiliary_loss_clip": 0.01184819, "auxiliary_loss_mlp": 0.01058039, "balance_loss_clip": 1.05496228, "balance_loss_mlp": 1.01829171, "epoch": 0.48650273552576206, "flos": 25119945354240.0, "grad_norm": 2.014940735441128, "language_loss": 0.80165291, "learning_rate": 2.184354212577446e-06, "loss": 0.82408154, "num_input_tokens_seen": 87107090, "step": 4046, "time_per_iteration": 2.9271533489227295 }, { "auxiliary_loss_clip": 0.01185082, "auxiliary_loss_mlp": 0.01026888, "balance_loss_clip": 1.05395937, "balance_loss_mlp": 1.01865673, "epoch": 0.4866229784164011, "flos": 17456931699840.0, "grad_norm": 2.9065034537052963, "language_loss": 0.63260949, "learning_rate": 2.1835785433386907e-06, "loss": 0.65472925, "num_input_tokens_seen": 87125905, "step": 4047, "time_per_iteration": 2.7820801734924316 }, { "auxiliary_loss_clip": 0.01163883, "auxiliary_loss_mlp": 0.01027786, "balance_loss_clip": 1.05352306, "balance_loss_mlp": 1.01935148, "epoch": 0.48674322130704023, "flos": 23331127115520.0, "grad_norm": 1.8363910446326366, "language_loss": 0.65582216, "learning_rate": 2.182802846251216e-06, "loss": 0.67773885, "num_input_tokens_seen": 87146175, "step": 4048, "time_per_iteration": 2.8268015384674072 }, { "auxiliary_loss_clip": 0.01177569, "auxiliary_loss_mlp": 0.01028295, "balance_loss_clip": 1.05193841, "balance_loss_mlp": 1.01992631, "epoch": 0.4868634641976793, "flos": 28804344030720.0, "grad_norm": 1.7702063123622096, "language_loss": 0.72117168, "learning_rate": 2.182027121432696e-06, "loss": 0.74323034, "num_input_tokens_seen": 87166800, "step": 4049, "time_per_iteration": 2.8613853454589844 }, { "auxiliary_loss_clip": 0.01185475, "auxiliary_loss_mlp": 0.01031109, "balance_loss_clip": 1.05389071, "balance_loss_mlp": 1.02294326, "epoch": 0.4869837070883184, "flos": 19025976574080.0, "grad_norm": 2.637087734691529, "language_loss": 0.81985694, "learning_rate": 2.1812513690008054e-06, "loss": 0.84202278, "num_input_tokens_seen": 87185920, "step": 4050, "time_per_iteration": 3.9218525886535645 }, { "auxiliary_loss_clip": 0.01184791, "auxiliary_loss_mlp": 0.01030854, "balance_loss_clip": 1.05431533, "balance_loss_mlp": 1.02254546, "epoch": 0.4871039499789575, "flos": 15121409483520.0, "grad_norm": 2.3983332229986076, "language_loss": 0.79652834, "learning_rate": 2.180475589073227e-06, "loss": 0.81868482, "num_input_tokens_seen": 87203620, "step": 4051, "time_per_iteration": 3.7604315280914307 }, { "auxiliary_loss_clip": 0.0116845, "auxiliary_loss_mlp": 0.01024307, "balance_loss_clip": 1.05049706, "balance_loss_mlp": 1.01661229, "epoch": 0.48722419286959656, "flos": 26174066808960.0, "grad_norm": 1.8727615435461016, "language_loss": 0.73411375, "learning_rate": 2.1796997817676456e-06, "loss": 0.75604129, "num_input_tokens_seen": 87224630, "step": 4052, "time_per_iteration": 3.852053642272949 }, { "auxiliary_loss_clip": 0.01180403, "auxiliary_loss_mlp": 0.01059296, "balance_loss_clip": 1.05342102, "balance_loss_mlp": 1.02046144, "epoch": 0.4873444357602357, "flos": 24026142349440.0, "grad_norm": 1.4596395746054838, "language_loss": 0.67231011, "learning_rate": 2.1789239472017494e-06, "loss": 0.6947071, "num_input_tokens_seen": 87246280, "step": 4053, "time_per_iteration": 2.8092269897460938 }, { "auxiliary_loss_clip": 0.01173142, "auxiliary_loss_mlp": 0.01029645, "balance_loss_clip": 1.05352283, "balance_loss_mlp": 1.02111602, "epoch": 0.4874646786508748, "flos": 22820441500800.0, "grad_norm": 2.24568790563111, "language_loss": 0.73316222, "learning_rate": 2.1781480854932326e-06, "loss": 0.75519013, "num_input_tokens_seen": 87266045, "step": 4054, "time_per_iteration": 2.7485618591308594 }, { "auxiliary_loss_clip": 0.01168157, "auxiliary_loss_mlp": 0.01030361, "balance_loss_clip": 1.05407202, "balance_loss_mlp": 1.02242172, "epoch": 0.48758492154151384, "flos": 21287594557440.0, "grad_norm": 1.9544463518671773, "language_loss": 0.79251146, "learning_rate": 2.1773721967597933e-06, "loss": 0.81449664, "num_input_tokens_seen": 87284495, "step": 4055, "time_per_iteration": 2.788424491882324 }, { "auxiliary_loss_clip": 0.01078746, "auxiliary_loss_mlp": 0.01000691, "balance_loss_clip": 1.02598536, "balance_loss_mlp": 0.99934429, "epoch": 0.48770516443215295, "flos": 62244109180800.0, "grad_norm": 0.8546310867262299, "language_loss": 0.57385254, "learning_rate": 2.1765962811191322e-06, "loss": 0.59464693, "num_input_tokens_seen": 87338960, "step": 4056, "time_per_iteration": 4.046236991882324 }, { "auxiliary_loss_clip": 0.01077471, "auxiliary_loss_mlp": 0.01002684, "balance_loss_clip": 1.02540219, "balance_loss_mlp": 1.00146794, "epoch": 0.48782540732279206, "flos": 66133451882880.0, "grad_norm": 0.8224079326508952, "language_loss": 0.62039155, "learning_rate": 2.1758203386889566e-06, "loss": 0.64119309, "num_input_tokens_seen": 87401730, "step": 4057, "time_per_iteration": 3.3101773262023926 }, { "auxiliary_loss_clip": 0.0117631, "auxiliary_loss_mlp": 0.01062133, "balance_loss_clip": 1.0572598, "balance_loss_mlp": 1.02605927, "epoch": 0.4879456502134311, "flos": 14607922608000.0, "grad_norm": 3.9848316297153423, "language_loss": 0.85141855, "learning_rate": 2.1750443695869746e-06, "loss": 0.8738029, "num_input_tokens_seen": 87417300, "step": 4058, "time_per_iteration": 2.7801716327667236 }, { "auxiliary_loss_clip": 0.01177819, "auxiliary_loss_mlp": 0.01027884, "balance_loss_clip": 1.05006468, "balance_loss_mlp": 1.0203377, "epoch": 0.4880658931040702, "flos": 19500464257920.0, "grad_norm": 1.780528152823735, "language_loss": 0.86091709, "learning_rate": 2.174268373930901e-06, "loss": 0.88297415, "num_input_tokens_seen": 87434815, "step": 4059, "time_per_iteration": 2.655111074447632 }, { "auxiliary_loss_clip": 0.01165073, "auxiliary_loss_mlp": 0.01061356, "balance_loss_clip": 1.05328119, "balance_loss_mlp": 1.0219388, "epoch": 0.48818613599470934, "flos": 16723060928640.0, "grad_norm": 2.0959478727192815, "language_loss": 0.79440892, "learning_rate": 2.1734923518384537e-06, "loss": 0.81667322, "num_input_tokens_seen": 87451420, "step": 4060, "time_per_iteration": 2.944730520248413 }, { "auxiliary_loss_clip": 0.01158947, "auxiliary_loss_mlp": 0.01032101, "balance_loss_clip": 1.0532831, "balance_loss_mlp": 1.02433491, "epoch": 0.4883063788853484, "flos": 26756932803840.0, "grad_norm": 7.2361016506123415, "language_loss": 0.82520461, "learning_rate": 2.1727163034273547e-06, "loss": 0.84711516, "num_input_tokens_seen": 87469585, "step": 4061, "time_per_iteration": 2.867307424545288 }, { "auxiliary_loss_clip": 0.01182465, "auxiliary_loss_mlp": 0.01026773, "balance_loss_clip": 1.05448341, "balance_loss_mlp": 1.0188576, "epoch": 0.4884266217759875, "flos": 16763388923520.0, "grad_norm": 2.0107615170668076, "language_loss": 0.78925717, "learning_rate": 2.17194022881533e-06, "loss": 0.81134957, "num_input_tokens_seen": 87485675, "step": 4062, "time_per_iteration": 2.6750435829162598 }, { "auxiliary_loss_clip": 0.01181845, "auxiliary_loss_mlp": 0.01032602, "balance_loss_clip": 1.0552696, "balance_loss_mlp": 1.02357149, "epoch": 0.4885468646666266, "flos": 24207132003840.0, "grad_norm": 2.7502809729315603, "language_loss": 0.67443252, "learning_rate": 2.1711641281201092e-06, "loss": 0.69657707, "num_input_tokens_seen": 87505605, "step": 4063, "time_per_iteration": 2.817237377166748 }, { "auxiliary_loss_clip": 0.01177233, "auxiliary_loss_mlp": 0.01029376, "balance_loss_clip": 1.0543437, "balance_loss_mlp": 1.021312, "epoch": 0.48866710755726567, "flos": 14610795696000.0, "grad_norm": 2.138208824226634, "language_loss": 0.79283333, "learning_rate": 2.1703880014594264e-06, "loss": 0.81489944, "num_input_tokens_seen": 87523195, "step": 4064, "time_per_iteration": 2.7014667987823486 }, { "auxiliary_loss_clip": 0.01162154, "auxiliary_loss_mlp": 0.01025667, "balance_loss_clip": 1.05403566, "balance_loss_mlp": 1.0183177, "epoch": 0.4887873504479048, "flos": 28804451771520.0, "grad_norm": 1.9914020298640063, "language_loss": 0.73920512, "learning_rate": 2.1696118489510182e-06, "loss": 0.76108325, "num_input_tokens_seen": 87544125, "step": 4065, "time_per_iteration": 2.9067440032958984 }, { "auxiliary_loss_clip": 0.01179542, "auxiliary_loss_mlp": 0.0105671, "balance_loss_clip": 1.05545926, "balance_loss_mlp": 1.01908588, "epoch": 0.48890759333854383, "flos": 22784387224320.0, "grad_norm": 1.8018721838532903, "language_loss": 0.72819406, "learning_rate": 2.1688356707126286e-06, "loss": 0.75055659, "num_input_tokens_seen": 87563745, "step": 4066, "time_per_iteration": 2.764864206314087 }, { "auxiliary_loss_clip": 0.01169906, "auxiliary_loss_mlp": 0.01033997, "balance_loss_clip": 1.05484116, "balance_loss_mlp": 1.02595592, "epoch": 0.48902783622918294, "flos": 17786088956160.0, "grad_norm": 8.975929332218554, "language_loss": 0.698313, "learning_rate": 2.168059466862001e-06, "loss": 0.72035205, "num_input_tokens_seen": 87581895, "step": 4067, "time_per_iteration": 2.8153154850006104 }, { "auxiliary_loss_clip": 0.01174688, "auxiliary_loss_mlp": 0.01027718, "balance_loss_clip": 1.04895496, "balance_loss_mlp": 1.02010643, "epoch": 0.48914807911982205, "flos": 22310294590080.0, "grad_norm": 2.112020442240096, "language_loss": 0.81628239, "learning_rate": 2.167283237516887e-06, "loss": 0.83830643, "num_input_tokens_seen": 87600170, "step": 4068, "time_per_iteration": 2.7321252822875977 }, { "auxiliary_loss_clip": 0.01176999, "auxiliary_loss_mlp": 0.01029222, "balance_loss_clip": 1.05142605, "balance_loss_mlp": 1.02119374, "epoch": 0.4892683220104611, "flos": 16363020954240.0, "grad_norm": 1.8048697902390125, "language_loss": 0.74600023, "learning_rate": 2.1665069827950383e-06, "loss": 0.76806235, "num_input_tokens_seen": 87617455, "step": 4069, "time_per_iteration": 2.7355785369873047 }, { "auxiliary_loss_clip": 0.01175695, "auxiliary_loss_mlp": 0.01029339, "balance_loss_clip": 1.05482852, "balance_loss_mlp": 1.02185595, "epoch": 0.4893885649011002, "flos": 15739144606080.0, "grad_norm": 2.850078167165207, "language_loss": 0.86809641, "learning_rate": 2.1657307028142126e-06, "loss": 0.89014673, "num_input_tokens_seen": 87634995, "step": 4070, "time_per_iteration": 2.7850029468536377 }, { "auxiliary_loss_clip": 0.0117265, "auxiliary_loss_mlp": 0.01026724, "balance_loss_clip": 1.05259395, "balance_loss_mlp": 1.01849842, "epoch": 0.48950880779173933, "flos": 28581984887040.0, "grad_norm": 2.1158678962701614, "language_loss": 0.6785562, "learning_rate": 2.164954397692171e-06, "loss": 0.70054996, "num_input_tokens_seen": 87654420, "step": 4071, "time_per_iteration": 2.7794742584228516 }, { "auxiliary_loss_clip": 0.01086392, "auxiliary_loss_mlp": 0.01005412, "balance_loss_clip": 1.02352023, "balance_loss_mlp": 1.00387466, "epoch": 0.4896290506823784, "flos": 66186310746240.0, "grad_norm": 1.2432597183592227, "language_loss": 0.77299732, "learning_rate": 2.164178067546678e-06, "loss": 0.79391533, "num_input_tokens_seen": 87713585, "step": 4072, "time_per_iteration": 3.452704429626465 }, { "auxiliary_loss_clip": 0.0118036, "auxiliary_loss_mlp": 0.01031584, "balance_loss_clip": 1.05226433, "balance_loss_mlp": 1.02323961, "epoch": 0.4897492935730175, "flos": 12531065207040.0, "grad_norm": 5.185445361742552, "language_loss": 0.91218793, "learning_rate": 2.163401712495504e-06, "loss": 0.93430734, "num_input_tokens_seen": 87731280, "step": 4073, "time_per_iteration": 2.7980098724365234 }, { "auxiliary_loss_clip": 0.01173822, "auxiliary_loss_mlp": 0.01031086, "balance_loss_clip": 1.05595112, "balance_loss_mlp": 1.022771, "epoch": 0.4898695364636566, "flos": 23476816679040.0, "grad_norm": 1.7153722906264424, "language_loss": 0.79315555, "learning_rate": 2.1626253326564194e-06, "loss": 0.81520468, "num_input_tokens_seen": 87750230, "step": 4074, "time_per_iteration": 2.925116777420044 }, { "auxiliary_loss_clip": 0.01173864, "auxiliary_loss_mlp": 0.01028076, "balance_loss_clip": 1.05210364, "balance_loss_mlp": 1.01945686, "epoch": 0.48998977935429566, "flos": 27160209774720.0, "grad_norm": 1.7083046605524852, "language_loss": 0.76675516, "learning_rate": 2.161848928147201e-06, "loss": 0.78877455, "num_input_tokens_seen": 87770500, "step": 4075, "time_per_iteration": 3.79101300239563 }, { "auxiliary_loss_clip": 0.01176218, "auxiliary_loss_mlp": 0.01024804, "balance_loss_clip": 1.05296898, "balance_loss_mlp": 1.01634014, "epoch": 0.4901100222449348, "flos": 20339588856960.0, "grad_norm": 2.295717850118098, "language_loss": 0.80855811, "learning_rate": 2.161072499085629e-06, "loss": 0.83056837, "num_input_tokens_seen": 87789495, "step": 4076, "time_per_iteration": 2.778984308242798 }, { "auxiliary_loss_clip": 0.01178803, "auxiliary_loss_mlp": 0.01032892, "balance_loss_clip": 1.05691743, "balance_loss_mlp": 1.02506614, "epoch": 0.4902302651355739, "flos": 30446359384320.0, "grad_norm": 1.619246063491755, "language_loss": 0.832241, "learning_rate": 2.160296045589487e-06, "loss": 0.85435796, "num_input_tokens_seen": 87812955, "step": 4077, "time_per_iteration": 4.10127067565918 }, { "auxiliary_loss_clip": 0.01179007, "auxiliary_loss_mlp": 0.0103484, "balance_loss_clip": 1.05497158, "balance_loss_mlp": 1.02638173, "epoch": 0.49035050802621294, "flos": 19174180089600.0, "grad_norm": 2.067610619161259, "language_loss": 0.69979399, "learning_rate": 2.159519567776562e-06, "loss": 0.72193253, "num_input_tokens_seen": 87832605, "step": 4078, "time_per_iteration": 3.8832643032073975 }, { "auxiliary_loss_clip": 0.01176886, "auxiliary_loss_mlp": 0.01028637, "balance_loss_clip": 1.05382323, "balance_loss_mlp": 1.02029228, "epoch": 0.49047075091685205, "flos": 22228489365120.0, "grad_norm": 2.740872307059372, "language_loss": 0.70846021, "learning_rate": 2.1587430657646463e-06, "loss": 0.73051542, "num_input_tokens_seen": 87846040, "step": 4079, "time_per_iteration": 2.861027717590332 }, { "auxiliary_loss_clip": 0.01174626, "auxiliary_loss_mlp": 0.01027546, "balance_loss_clip": 1.05344033, "balance_loss_mlp": 1.01975513, "epoch": 0.4905909938074911, "flos": 20156516213760.0, "grad_norm": 1.9685544230491423, "language_loss": 0.78052878, "learning_rate": 2.157966539671533e-06, "loss": 0.80255049, "num_input_tokens_seen": 87865680, "step": 4080, "time_per_iteration": 2.7700576782226562 }, { "auxiliary_loss_clip": 0.01174975, "auxiliary_loss_mlp": 0.01022715, "balance_loss_clip": 1.05359185, "balance_loss_mlp": 1.01500177, "epoch": 0.4907112366981302, "flos": 17202217380480.0, "grad_norm": 4.2097532377630404, "language_loss": 0.67150331, "learning_rate": 2.157189989615021e-06, "loss": 0.69348025, "num_input_tokens_seen": 87884270, "step": 4081, "time_per_iteration": 2.7515110969543457 }, { "auxiliary_loss_clip": 0.01182568, "auxiliary_loss_mlp": 0.01056648, "balance_loss_clip": 1.05508161, "balance_loss_mlp": 1.01736283, "epoch": 0.4908314795887693, "flos": 21688968107520.0, "grad_norm": 1.94929670553126, "language_loss": 0.75492454, "learning_rate": 2.156413415712913e-06, "loss": 0.77731669, "num_input_tokens_seen": 87906320, "step": 4082, "time_per_iteration": 3.6306240558624268 }, { "auxiliary_loss_clip": 0.01178567, "auxiliary_loss_mlp": 0.01052443, "balance_loss_clip": 1.0528779, "balance_loss_mlp": 1.01339006, "epoch": 0.4909517224794084, "flos": 26213676531840.0, "grad_norm": 1.6087653012353513, "language_loss": 0.78613114, "learning_rate": 2.155636818083014e-06, "loss": 0.80844128, "num_input_tokens_seen": 87927690, "step": 4083, "time_per_iteration": 2.7430741786956787 }, { "auxiliary_loss_clip": 0.0116817, "auxiliary_loss_mlp": 0.01026235, "balance_loss_clip": 1.05189145, "balance_loss_mlp": 1.01875496, "epoch": 0.4910719653700475, "flos": 23148377694720.0, "grad_norm": 1.7957934197089613, "language_loss": 0.84347796, "learning_rate": 2.154860196843134e-06, "loss": 0.86542201, "num_input_tokens_seen": 87946885, "step": 4084, "time_per_iteration": 2.781630277633667 }, { "auxiliary_loss_clip": 0.01181843, "auxiliary_loss_mlp": 0.0103031, "balance_loss_clip": 1.05235386, "balance_loss_mlp": 1.02216804, "epoch": 0.4911922082606866, "flos": 23331845387520.0, "grad_norm": 2.047594529871099, "language_loss": 0.76818413, "learning_rate": 2.154083552111085e-06, "loss": 0.79030567, "num_input_tokens_seen": 87966055, "step": 4085, "time_per_iteration": 2.742039442062378 }, { "auxiliary_loss_clip": 0.01183117, "auxiliary_loss_mlp": 0.01028711, "balance_loss_clip": 1.05177093, "balance_loss_mlp": 1.02058077, "epoch": 0.49131245115132566, "flos": 29203239542400.0, "grad_norm": 8.100791783719526, "language_loss": 0.81466895, "learning_rate": 2.1533068840046834e-06, "loss": 0.83678722, "num_input_tokens_seen": 87986320, "step": 4086, "time_per_iteration": 2.766115427017212 }, { "auxiliary_loss_clip": 0.01170657, "auxiliary_loss_mlp": 0.01059576, "balance_loss_clip": 1.05341017, "balance_loss_mlp": 1.01994288, "epoch": 0.49143269404196477, "flos": 20147465986560.0, "grad_norm": 2.882881353453331, "language_loss": 0.61551833, "learning_rate": 2.152530192641749e-06, "loss": 0.63782072, "num_input_tokens_seen": 88001230, "step": 4087, "time_per_iteration": 2.6837830543518066 }, { "auxiliary_loss_clip": 0.01183298, "auxiliary_loss_mlp": 0.01030668, "balance_loss_clip": 1.05473936, "balance_loss_mlp": 1.02255905, "epoch": 0.4915529369326039, "flos": 24389809597440.0, "grad_norm": 2.018607908930055, "language_loss": 0.72522789, "learning_rate": 2.1517534781401068e-06, "loss": 0.74736756, "num_input_tokens_seen": 88019110, "step": 4088, "time_per_iteration": 2.687879800796509 }, { "auxiliary_loss_clip": 0.01177095, "auxiliary_loss_mlp": 0.01028912, "balance_loss_clip": 1.05205536, "balance_loss_mlp": 1.02105606, "epoch": 0.49167317982324293, "flos": 10524305197440.0, "grad_norm": 1.9465935020744982, "language_loss": 0.69236916, "learning_rate": 2.150976740617581e-06, "loss": 0.7144292, "num_input_tokens_seen": 88035670, "step": 4089, "time_per_iteration": 2.637967348098755 }, { "auxiliary_loss_clip": 0.01179628, "auxiliary_loss_mlp": 0.01026517, "balance_loss_clip": 1.05409729, "balance_loss_mlp": 1.01851749, "epoch": 0.49179342271388204, "flos": 25593427457280.0, "grad_norm": 7.3399341931130735, "language_loss": 0.70829189, "learning_rate": 2.150199980192006e-06, "loss": 0.73035336, "num_input_tokens_seen": 88054790, "step": 4090, "time_per_iteration": 2.7755982875823975 }, { "auxiliary_loss_clip": 0.011754, "auxiliary_loss_mlp": 0.01026558, "balance_loss_clip": 1.05517304, "balance_loss_mlp": 1.01867223, "epoch": 0.49191366560452116, "flos": 21102043875840.0, "grad_norm": 2.531605513765391, "language_loss": 0.81052887, "learning_rate": 2.1494231969812114e-06, "loss": 0.83254844, "num_input_tokens_seen": 88073780, "step": 4091, "time_per_iteration": 2.658738613128662 }, { "auxiliary_loss_clip": 0.01176524, "auxiliary_loss_mlp": 0.01032205, "balance_loss_clip": 1.05478156, "balance_loss_mlp": 1.02378869, "epoch": 0.4920339084951602, "flos": 26067520091520.0, "grad_norm": 2.810865998828359, "language_loss": 0.81746197, "learning_rate": 2.1486463911030372e-06, "loss": 0.83954924, "num_input_tokens_seen": 88094430, "step": 4092, "time_per_iteration": 2.832288980484009 }, { "auxiliary_loss_clip": 0.01174068, "auxiliary_loss_mlp": 0.01026223, "balance_loss_clip": 1.05151415, "balance_loss_mlp": 1.01843905, "epoch": 0.4921541513857993, "flos": 25081269384960.0, "grad_norm": 2.2306613340659527, "language_loss": 0.74454153, "learning_rate": 2.147869562675324e-06, "loss": 0.76654446, "num_input_tokens_seen": 88113400, "step": 4093, "time_per_iteration": 2.757826089859009 }, { "auxiliary_loss_clip": 0.0117804, "auxiliary_loss_mlp": 0.01026819, "balance_loss_clip": 1.05261588, "balance_loss_mlp": 1.01884103, "epoch": 0.49227439427643843, "flos": 24389809597440.0, "grad_norm": 1.930692342441121, "language_loss": 0.72681785, "learning_rate": 2.147092711815915e-06, "loss": 0.74886644, "num_input_tokens_seen": 88132750, "step": 4094, "time_per_iteration": 2.802122116088867 }, { "auxiliary_loss_clip": 0.01171128, "auxiliary_loss_mlp": 0.01032672, "balance_loss_clip": 1.05390477, "balance_loss_mlp": 1.02482831, "epoch": 0.4923946371670775, "flos": 11363753018880.0, "grad_norm": 2.7935904449970947, "language_loss": 0.86005694, "learning_rate": 2.1463158386426593e-06, "loss": 0.88209492, "num_input_tokens_seen": 88150560, "step": 4095, "time_per_iteration": 2.709562063217163 }, { "auxiliary_loss_clip": 0.01181948, "auxiliary_loss_mlp": 0.01027339, "balance_loss_clip": 1.05527556, "balance_loss_mlp": 1.01903594, "epoch": 0.4925148800577166, "flos": 30445964334720.0, "grad_norm": 2.320778997580296, "language_loss": 0.7730428, "learning_rate": 2.145538943273407e-06, "loss": 0.79513568, "num_input_tokens_seen": 88170835, "step": 4096, "time_per_iteration": 2.771270275115967 }, { "auxiliary_loss_clip": 0.01182918, "auxiliary_loss_mlp": 0.01027289, "balance_loss_clip": 1.05423164, "balance_loss_mlp": 1.01937938, "epoch": 0.49263512294835565, "flos": 20850454039680.0, "grad_norm": 1.7864493749945445, "language_loss": 0.71932292, "learning_rate": 2.144762025826013e-06, "loss": 0.74142504, "num_input_tokens_seen": 88189925, "step": 4097, "time_per_iteration": 2.649797201156616 }, { "auxiliary_loss_clip": 0.01184239, "auxiliary_loss_mlp": 0.01030059, "balance_loss_clip": 1.05558002, "balance_loss_mlp": 1.02238798, "epoch": 0.49275536583899476, "flos": 23767477534080.0, "grad_norm": 4.496932732312619, "language_loss": 0.87109983, "learning_rate": 2.143985086418334e-06, "loss": 0.89324278, "num_input_tokens_seen": 88205105, "step": 4098, "time_per_iteration": 2.6893036365509033 }, { "auxiliary_loss_clip": 0.01176137, "auxiliary_loss_mlp": 0.01030917, "balance_loss_clip": 1.05179322, "balance_loss_mlp": 1.02307892, "epoch": 0.4928756087296339, "flos": 22273522041600.0, "grad_norm": 1.4817394498184522, "language_loss": 0.76435781, "learning_rate": 2.1432081251682324e-06, "loss": 0.78642833, "num_input_tokens_seen": 88225475, "step": 4099, "time_per_iteration": 2.74340558052063 }, { "auxiliary_loss_clip": 0.01178105, "auxiliary_loss_mlp": 0.01025843, "balance_loss_clip": 1.05672431, "balance_loss_mlp": 1.01792789, "epoch": 0.49299585162027293, "flos": 19645471463040.0, "grad_norm": 1.6571788415203814, "language_loss": 0.8682363, "learning_rate": 2.142431142193572e-06, "loss": 0.89027578, "num_input_tokens_seen": 88243255, "step": 4100, "time_per_iteration": 2.6264634132385254 }, { "auxiliary_loss_clip": 0.01178474, "auxiliary_loss_mlp": 0.01029072, "balance_loss_clip": 1.05260432, "balance_loss_mlp": 1.02124035, "epoch": 0.49311609451091204, "flos": 38837138497920.0, "grad_norm": 2.7052650520222503, "language_loss": 0.71171552, "learning_rate": 2.1416541376122207e-06, "loss": 0.73379099, "num_input_tokens_seen": 88263435, "step": 4101, "time_per_iteration": 2.820962905883789 }, { "auxiliary_loss_clip": 0.01180085, "auxiliary_loss_mlp": 0.01030399, "balance_loss_clip": 1.05140603, "balance_loss_mlp": 1.02200055, "epoch": 0.49323633740155115, "flos": 28329102161280.0, "grad_norm": 2.467349055687541, "language_loss": 0.72882569, "learning_rate": 2.1408771115420496e-06, "loss": 0.75093055, "num_input_tokens_seen": 88283295, "step": 4102, "time_per_iteration": 3.598848342895508 }, { "auxiliary_loss_clip": 0.01169887, "auxiliary_loss_mlp": 0.01027395, "balance_loss_clip": 1.05436921, "balance_loss_mlp": 1.02002776, "epoch": 0.4933565802921902, "flos": 21135584200320.0, "grad_norm": 1.8939786346983245, "language_loss": 0.64673394, "learning_rate": 2.140100064100932e-06, "loss": 0.66870677, "num_input_tokens_seen": 88299270, "step": 4103, "time_per_iteration": 3.7261953353881836 }, { "auxiliary_loss_clip": 0.01174338, "auxiliary_loss_mlp": 0.01028655, "balance_loss_clip": 1.05211115, "balance_loss_mlp": 1.02101052, "epoch": 0.4934768231828293, "flos": 18039007595520.0, "grad_norm": 2.6910277257628175, "language_loss": 0.76173973, "learning_rate": 2.139322995406746e-06, "loss": 0.78376973, "num_input_tokens_seen": 88316905, "step": 4104, "time_per_iteration": 3.613988161087036 }, { "auxiliary_loss_clip": 0.01183634, "auxiliary_loss_mlp": 0.01028631, "balance_loss_clip": 1.05557489, "balance_loss_mlp": 1.02061367, "epoch": 0.4935970660734684, "flos": 23469957181440.0, "grad_norm": 2.2216661409819958, "language_loss": 0.79589486, "learning_rate": 2.1385459055773727e-06, "loss": 0.81801748, "num_input_tokens_seen": 88335095, "step": 4105, "time_per_iteration": 2.8562917709350586 }, { "auxiliary_loss_clip": 0.01157863, "auxiliary_loss_mlp": 0.01058555, "balance_loss_clip": 1.05386925, "balance_loss_mlp": 1.01796114, "epoch": 0.4937173089641075, "flos": 64479258840960.0, "grad_norm": 2.5633269053621346, "language_loss": 0.73965746, "learning_rate": 2.137768794730696e-06, "loss": 0.76182163, "num_input_tokens_seen": 88358545, "step": 4106, "time_per_iteration": 3.157581329345703 }, { "auxiliary_loss_clip": 0.01181443, "auxiliary_loss_mlp": 0.01029544, "balance_loss_clip": 1.05639577, "balance_loss_mlp": 1.02143812, "epoch": 0.4938375518547466, "flos": 22346025644160.0, "grad_norm": 1.7592173491020715, "language_loss": 0.80446392, "learning_rate": 2.1369916629846026e-06, "loss": 0.82657379, "num_input_tokens_seen": 88378295, "step": 4107, "time_per_iteration": 2.841827392578125 }, { "auxiliary_loss_clip": 0.01175741, "auxiliary_loss_mlp": 0.01029311, "balance_loss_clip": 1.05287898, "balance_loss_mlp": 1.02146661, "epoch": 0.4939577947453857, "flos": 17858700299520.0, "grad_norm": 2.4169964190158364, "language_loss": 0.74810785, "learning_rate": 2.136214510456983e-06, "loss": 0.77015841, "num_input_tokens_seen": 88396750, "step": 4108, "time_per_iteration": 3.7944860458374023 }, { "auxiliary_loss_clip": 0.01089272, "auxiliary_loss_mlp": 0.01049177, "balance_loss_clip": 1.02476931, "balance_loss_mlp": 1.00401044, "epoch": 0.49407803763602476, "flos": 70066746875520.0, "grad_norm": 0.908781906469262, "language_loss": 0.63171178, "learning_rate": 2.1354373372657296e-06, "loss": 0.65309626, "num_input_tokens_seen": 88455190, "step": 4109, "time_per_iteration": 3.4409093856811523 }, { "auxiliary_loss_clip": 0.01181804, "auxiliary_loss_mlp": 0.0103226, "balance_loss_clip": 1.05560589, "balance_loss_mlp": 1.02452898, "epoch": 0.49419828052666387, "flos": 24317485562880.0, "grad_norm": 6.942882240575707, "language_loss": 0.71073562, "learning_rate": 2.1346601435287404e-06, "loss": 0.73287624, "num_input_tokens_seen": 88477460, "step": 4110, "time_per_iteration": 2.82024884223938 }, { "auxiliary_loss_clip": 0.01173021, "auxiliary_loss_mlp": 0.01031555, "balance_loss_clip": 1.0508424, "balance_loss_mlp": 1.02385676, "epoch": 0.494318523417303, "flos": 29386060790400.0, "grad_norm": 1.8493584592450947, "language_loss": 0.8015312, "learning_rate": 2.1338829293639144e-06, "loss": 0.82357693, "num_input_tokens_seen": 88497820, "step": 4111, "time_per_iteration": 2.75970458984375 }, { "auxiliary_loss_clip": 0.01166852, "auxiliary_loss_mlp": 0.01029627, "balance_loss_clip": 1.05314016, "balance_loss_mlp": 1.0221169, "epoch": 0.49443876630794203, "flos": 15268284195840.0, "grad_norm": 2.11249333038284, "language_loss": 0.83276868, "learning_rate": 2.1331056948891547e-06, "loss": 0.85473347, "num_input_tokens_seen": 88514920, "step": 4112, "time_per_iteration": 2.805208206176758 }, { "auxiliary_loss_clip": 0.01168517, "auxiliary_loss_mlp": 0.01026845, "balance_loss_clip": 1.05035615, "balance_loss_mlp": 1.01916742, "epoch": 0.49455900919858115, "flos": 12347453859840.0, "grad_norm": 2.108387253584433, "language_loss": 0.76569045, "learning_rate": 2.1323284402223666e-06, "loss": 0.78764415, "num_input_tokens_seen": 88530910, "step": 4113, "time_per_iteration": 2.715247869491577 }, { "auxiliary_loss_clip": 0.01179433, "auxiliary_loss_mlp": 0.0106299, "balance_loss_clip": 1.05636084, "balance_loss_mlp": 1.02234817, "epoch": 0.4946792520892202, "flos": 22779610715520.0, "grad_norm": 1.8519708700362638, "language_loss": 0.88536304, "learning_rate": 2.1315511654814597e-06, "loss": 0.9077872, "num_input_tokens_seen": 88549320, "step": 4114, "time_per_iteration": 2.803098201751709 }, { "auxiliary_loss_clip": 0.01166424, "auxiliary_loss_mlp": 0.01022175, "balance_loss_clip": 1.0526104, "balance_loss_mlp": 1.0150044, "epoch": 0.4947994949798593, "flos": 23148126299520.0, "grad_norm": 2.1082057774993026, "language_loss": 0.78202629, "learning_rate": 2.1307738707843456e-06, "loss": 0.80391228, "num_input_tokens_seen": 88568985, "step": 4115, "time_per_iteration": 2.7224173545837402 }, { "auxiliary_loss_clip": 0.01184444, "auxiliary_loss_mlp": 0.01028231, "balance_loss_clip": 1.05574119, "balance_loss_mlp": 1.02019632, "epoch": 0.4949197378704984, "flos": 23659997063040.0, "grad_norm": 2.2202326024887364, "language_loss": 0.69001526, "learning_rate": 2.1299965562489385e-06, "loss": 0.71214205, "num_input_tokens_seen": 88588790, "step": 4116, "time_per_iteration": 2.7798924446105957 }, { "auxiliary_loss_clip": 0.01177026, "auxiliary_loss_mlp": 0.01026856, "balance_loss_clip": 1.0542618, "balance_loss_mlp": 1.01865399, "epoch": 0.4950399807611375, "flos": 26911493026560.0, "grad_norm": 1.5445144864121299, "language_loss": 0.78686166, "learning_rate": 2.129219221993158e-06, "loss": 0.80890048, "num_input_tokens_seen": 88613575, "step": 4117, "time_per_iteration": 2.792217493057251 }, { "auxiliary_loss_clip": 0.01084292, "auxiliary_loss_mlp": 0.01006413, "balance_loss_clip": 1.02558541, "balance_loss_mlp": 1.00493455, "epoch": 0.4951602236517766, "flos": 67315270187520.0, "grad_norm": 0.7851297357941073, "language_loss": 0.59855461, "learning_rate": 2.128441868134924e-06, "loss": 0.61946166, "num_input_tokens_seen": 88675510, "step": 4118, "time_per_iteration": 3.3637216091156006 }, { "auxiliary_loss_clip": 0.01173585, "auxiliary_loss_mlp": 0.01029702, "balance_loss_clip": 1.05169106, "balance_loss_mlp": 1.02194726, "epoch": 0.4952804665424157, "flos": 19901442758400.0, "grad_norm": 2.7491725483106118, "language_loss": 0.82774043, "learning_rate": 2.1276644947921606e-06, "loss": 0.84977335, "num_input_tokens_seen": 88694425, "step": 4119, "time_per_iteration": 2.7368836402893066 }, { "auxiliary_loss_clip": 0.01174932, "auxiliary_loss_mlp": 0.01029108, "balance_loss_clip": 1.05215776, "balance_loss_mlp": 1.02071571, "epoch": 0.49540070943305475, "flos": 18806813740800.0, "grad_norm": 2.9876064431811673, "language_loss": 0.82576257, "learning_rate": 2.126887102082795e-06, "loss": 0.847803, "num_input_tokens_seen": 88714450, "step": 4120, "time_per_iteration": 2.68676495552063 }, { "auxiliary_loss_clip": 0.0117118, "auxiliary_loss_mlp": 0.01029401, "balance_loss_clip": 1.05322671, "balance_loss_mlp": 1.02187943, "epoch": 0.49552095232369386, "flos": 24934179191040.0, "grad_norm": 1.6992734734231947, "language_loss": 0.70453614, "learning_rate": 2.126109690124757e-06, "loss": 0.72654194, "num_input_tokens_seen": 88735265, "step": 4121, "time_per_iteration": 2.826464891433716 }, { "auxiliary_loss_clip": 0.01171599, "auxiliary_loss_mlp": 0.01029218, "balance_loss_clip": 1.0529716, "balance_loss_mlp": 1.02143383, "epoch": 0.495641195214333, "flos": 22857249962880.0, "grad_norm": 1.79100267353277, "language_loss": 0.71134442, "learning_rate": 2.1253322590359786e-06, "loss": 0.73335254, "num_input_tokens_seen": 88754600, "step": 4122, "time_per_iteration": 2.7867212295532227 }, { "auxiliary_loss_clip": 0.01177478, "auxiliary_loss_mlp": 0.01027277, "balance_loss_clip": 1.05499482, "balance_loss_mlp": 1.01884258, "epoch": 0.49576143810497203, "flos": 25769748343680.0, "grad_norm": 3.2019428128875735, "language_loss": 0.74258244, "learning_rate": 2.124554808934397e-06, "loss": 0.76463002, "num_input_tokens_seen": 88775180, "step": 4123, "time_per_iteration": 2.841068744659424 }, { "auxiliary_loss_clip": 0.01165113, "auxiliary_loss_mlp": 0.01026468, "balance_loss_clip": 1.05305183, "balance_loss_mlp": 1.01906466, "epoch": 0.49588168099561114, "flos": 22128838058880.0, "grad_norm": 2.2204849958531483, "language_loss": 0.73168337, "learning_rate": 2.1237773399379496e-06, "loss": 0.75359917, "num_input_tokens_seen": 88796145, "step": 4124, "time_per_iteration": 2.8580799102783203 }, { "auxiliary_loss_clip": 0.01178348, "auxiliary_loss_mlp": 0.01022557, "balance_loss_clip": 1.05064571, "balance_loss_mlp": 1.01472509, "epoch": 0.49600192388625025, "flos": 24387331559040.0, "grad_norm": 1.6905580628992192, "language_loss": 0.87074912, "learning_rate": 2.122999852164578e-06, "loss": 0.89275813, "num_input_tokens_seen": 88816765, "step": 4125, "time_per_iteration": 2.731159210205078 }, { "auxiliary_loss_clip": 0.01170705, "auxiliary_loss_mlp": 0.01022424, "balance_loss_clip": 1.05697119, "balance_loss_mlp": 1.01421082, "epoch": 0.4961221667768893, "flos": 22857429530880.0, "grad_norm": 2.9241633922426185, "language_loss": 0.58630049, "learning_rate": 2.122222345732227e-06, "loss": 0.60823178, "num_input_tokens_seen": 88836680, "step": 4126, "time_per_iteration": 2.8653616905212402 }, { "auxiliary_loss_clip": 0.01173063, "auxiliary_loss_mlp": 0.01025875, "balance_loss_clip": 1.0529263, "balance_loss_mlp": 1.01754534, "epoch": 0.4962424096675284, "flos": 17858089768320.0, "grad_norm": 1.7563621608030908, "language_loss": 0.82841551, "learning_rate": 2.121444820758843e-06, "loss": 0.85040486, "num_input_tokens_seen": 88855320, "step": 4127, "time_per_iteration": 3.710891008377075 }, { "auxiliary_loss_clip": 0.01168821, "auxiliary_loss_mlp": 0.01027654, "balance_loss_clip": 1.05436158, "balance_loss_mlp": 1.0192318, "epoch": 0.49636265255816747, "flos": 21793611404160.0, "grad_norm": 3.0190923044807407, "language_loss": 0.78170979, "learning_rate": 2.120667277362376e-06, "loss": 0.80367458, "num_input_tokens_seen": 88874035, "step": 4128, "time_per_iteration": 2.8246212005615234 }, { "auxiliary_loss_clip": 0.01182326, "auxiliary_loss_mlp": 0.01029504, "balance_loss_clip": 1.05421805, "balance_loss_mlp": 1.02156806, "epoch": 0.4964828954488066, "flos": 16358603581440.0, "grad_norm": 2.394815826544656, "language_loss": 0.84872699, "learning_rate": 2.1198897156607796e-06, "loss": 0.87084526, "num_input_tokens_seen": 88891390, "step": 4129, "time_per_iteration": 3.604665994644165 }, { "auxiliary_loss_clip": 0.01182601, "auxiliary_loss_mlp": 0.0102587, "balance_loss_clip": 1.05213916, "balance_loss_mlp": 1.01810932, "epoch": 0.4966031383394457, "flos": 24711101775360.0, "grad_norm": 2.0896303181867304, "language_loss": 0.73772824, "learning_rate": 2.1191121357720085e-06, "loss": 0.75981295, "num_input_tokens_seen": 88909450, "step": 4130, "time_per_iteration": 3.6663522720336914 }, { "auxiliary_loss_clip": 0.01162749, "auxiliary_loss_mlp": 0.01030217, "balance_loss_clip": 1.05398107, "balance_loss_mlp": 1.02265954, "epoch": 0.49672338123008475, "flos": 22930615491840.0, "grad_norm": 1.703433997380418, "language_loss": 0.74813831, "learning_rate": 2.1183345378140206e-06, "loss": 0.77006805, "num_input_tokens_seen": 88929195, "step": 4131, "time_per_iteration": 2.7428174018859863 }, { "auxiliary_loss_clip": 0.01085383, "auxiliary_loss_mlp": 0.01003932, "balance_loss_clip": 1.02185631, "balance_loss_mlp": 1.00256085, "epoch": 0.49684362412072386, "flos": 65976736844160.0, "grad_norm": 0.8626604316012249, "language_loss": 0.61882675, "learning_rate": 2.1175569219047783e-06, "loss": 0.6397199, "num_input_tokens_seen": 88990635, "step": 4132, "time_per_iteration": 3.3694841861724854 }, { "auxiliary_loss_clip": 0.01179424, "auxiliary_loss_mlp": 0.01025939, "balance_loss_clip": 1.05273116, "balance_loss_mlp": 1.01837564, "epoch": 0.49696386701136297, "flos": 19971288754560.0, "grad_norm": 2.270091363822697, "language_loss": 0.73451096, "learning_rate": 2.1167792881622437e-06, "loss": 0.75656462, "num_input_tokens_seen": 89009655, "step": 4133, "time_per_iteration": 2.681349515914917 }, { "auxiliary_loss_clip": 0.01168387, "auxiliary_loss_mlp": 0.01030846, "balance_loss_clip": 1.05259037, "balance_loss_mlp": 1.02329421, "epoch": 0.497084109902002, "flos": 24750819239040.0, "grad_norm": 1.7883621242867287, "language_loss": 0.81219882, "learning_rate": 2.116001636704384e-06, "loss": 0.83419108, "num_input_tokens_seen": 89030040, "step": 4134, "time_per_iteration": 3.609992504119873 }, { "auxiliary_loss_clip": 0.01178087, "auxiliary_loss_mlp": 0.01028352, "balance_loss_clip": 1.0535903, "balance_loss_mlp": 1.02023423, "epoch": 0.49720435279264114, "flos": 21871825269120.0, "grad_norm": 2.2671698080368996, "language_loss": 0.79969621, "learning_rate": 2.1152239676491685e-06, "loss": 0.82176059, "num_input_tokens_seen": 89048145, "step": 4135, "time_per_iteration": 2.7251827716827393 }, { "auxiliary_loss_clip": 0.01179426, "auxiliary_loss_mlp": 0.01027403, "balance_loss_clip": 1.05225205, "balance_loss_mlp": 1.01989245, "epoch": 0.49732459568328025, "flos": 23805794367360.0, "grad_norm": 1.8125954306078798, "language_loss": 0.73179936, "learning_rate": 2.114446281114569e-06, "loss": 0.75386769, "num_input_tokens_seen": 89067165, "step": 4136, "time_per_iteration": 2.7782397270202637 }, { "auxiliary_loss_clip": 0.01166031, "auxiliary_loss_mlp": 0.01022573, "balance_loss_clip": 1.05160141, "balance_loss_mlp": 1.01531565, "epoch": 0.4974448385739193, "flos": 20047742853120.0, "grad_norm": 2.002590102282928, "language_loss": 0.75705016, "learning_rate": 2.1136685772185587e-06, "loss": 0.77893615, "num_input_tokens_seen": 89086190, "step": 4137, "time_per_iteration": 2.8759498596191406 }, { "auxiliary_loss_clip": 0.01174268, "auxiliary_loss_mlp": 0.01053111, "balance_loss_clip": 1.049577, "balance_loss_mlp": 1.01462424, "epoch": 0.4975650814645584, "flos": 24821347593600.0, "grad_norm": 1.7787447693235936, "language_loss": 0.77970088, "learning_rate": 2.1128908560791163e-06, "loss": 0.80197471, "num_input_tokens_seen": 89106020, "step": 4138, "time_per_iteration": 2.7889254093170166 }, { "auxiliary_loss_clip": 0.0117989, "auxiliary_loss_mlp": 0.01028598, "balance_loss_clip": 1.05306578, "balance_loss_mlp": 1.02105212, "epoch": 0.4976853243551975, "flos": 19829477859840.0, "grad_norm": 2.1367128303168896, "language_loss": 0.78512359, "learning_rate": 2.1121131178142203e-06, "loss": 0.80720848, "num_input_tokens_seen": 89125385, "step": 4139, "time_per_iteration": 2.762813091278076 }, { "auxiliary_loss_clip": 0.01170714, "auxiliary_loss_mlp": 0.01026577, "balance_loss_clip": 1.04917228, "balance_loss_mlp": 1.01878047, "epoch": 0.4978055672458366, "flos": 23142990654720.0, "grad_norm": 1.9400737994090735, "language_loss": 0.82667619, "learning_rate": 2.1113353625418544e-06, "loss": 0.84864902, "num_input_tokens_seen": 89143935, "step": 4140, "time_per_iteration": 2.7761332988739014 }, { "auxiliary_loss_clip": 0.01172302, "auxiliary_loss_mlp": 0.01029098, "balance_loss_clip": 1.05561149, "balance_loss_mlp": 1.02233005, "epoch": 0.4979258101364757, "flos": 15559914718080.0, "grad_norm": 1.719607378609079, "language_loss": 0.79010034, "learning_rate": 2.1105575903800017e-06, "loss": 0.81211436, "num_input_tokens_seen": 89162655, "step": 4141, "time_per_iteration": 2.830413341522217 }, { "auxiliary_loss_clip": 0.0117904, "auxiliary_loss_mlp": 0.01027654, "balance_loss_clip": 1.05090284, "balance_loss_mlp": 1.01986957, "epoch": 0.4980460530271148, "flos": 26356169784960.0, "grad_norm": 1.7944749333468222, "language_loss": 0.85808098, "learning_rate": 2.1097798014466502e-06, "loss": 0.88014793, "num_input_tokens_seen": 89182255, "step": 4142, "time_per_iteration": 2.769751787185669 }, { "auxiliary_loss_clip": 0.01178382, "auxiliary_loss_mlp": 0.01024309, "balance_loss_clip": 1.05142272, "balance_loss_mlp": 1.0163691, "epoch": 0.49816629591775385, "flos": 17274541415040.0, "grad_norm": 2.357331240975015, "language_loss": 0.58846986, "learning_rate": 2.109001995859791e-06, "loss": 0.61049676, "num_input_tokens_seen": 89201155, "step": 4143, "time_per_iteration": 2.7153453826904297 }, { "auxiliary_loss_clip": 0.01079597, "auxiliary_loss_mlp": 0.01002815, "balance_loss_clip": 1.0198046, "balance_loss_mlp": 1.00155783, "epoch": 0.49828653880839296, "flos": 64930947344640.0, "grad_norm": 0.7932433176315897, "language_loss": 0.60032523, "learning_rate": 2.108224173737415e-06, "loss": 0.6211493, "num_input_tokens_seen": 89264455, "step": 4144, "time_per_iteration": 3.3107640743255615 }, { "auxiliary_loss_clip": 0.01173372, "auxiliary_loss_mlp": 0.01028319, "balance_loss_clip": 1.050758, "balance_loss_mlp": 1.02028167, "epoch": 0.498406781699032, "flos": 27484806003840.0, "grad_norm": 13.972606262828096, "language_loss": 0.76414526, "learning_rate": 2.1074463351975183e-06, "loss": 0.78616214, "num_input_tokens_seen": 89283340, "step": 4145, "time_per_iteration": 2.789297342300415 }, { "auxiliary_loss_clip": 0.0117326, "auxiliary_loss_mlp": 0.01027806, "balance_loss_clip": 1.05051661, "balance_loss_mlp": 1.02061486, "epoch": 0.49852702458967113, "flos": 31499870307840.0, "grad_norm": 1.836853332604728, "language_loss": 0.71627283, "learning_rate": 2.106668480358098e-06, "loss": 0.73828346, "num_input_tokens_seen": 89303565, "step": 4146, "time_per_iteration": 2.8186776638031006 }, { "auxiliary_loss_clip": 0.01182104, "auxiliary_loss_mlp": 0.01025254, "balance_loss_clip": 1.05358887, "balance_loss_mlp": 1.01640284, "epoch": 0.49864726748031024, "flos": 22852868503680.0, "grad_norm": 2.2741462035993623, "language_loss": 0.70950598, "learning_rate": 2.105890609337154e-06, "loss": 0.73157954, "num_input_tokens_seen": 89322080, "step": 4147, "time_per_iteration": 2.688066244125366 }, { "auxiliary_loss_clip": 0.01079995, "auxiliary_loss_mlp": 0.01003503, "balance_loss_clip": 1.01842666, "balance_loss_mlp": 1.00229263, "epoch": 0.4987675103709493, "flos": 70405708544640.0, "grad_norm": 0.6900186559422504, "language_loss": 0.63765514, "learning_rate": 2.1051127222526883e-06, "loss": 0.65849018, "num_input_tokens_seen": 89394195, "step": 4148, "time_per_iteration": 3.3949947357177734 }, { "auxiliary_loss_clip": 0.01174558, "auxiliary_loss_mlp": 0.0102917, "balance_loss_clip": 1.05421376, "balance_loss_mlp": 1.02235103, "epoch": 0.4988877532615884, "flos": 28767571482240.0, "grad_norm": 1.599281803464836, "language_loss": 0.80680525, "learning_rate": 2.1043348192227067e-06, "loss": 0.82884252, "num_input_tokens_seen": 89414565, "step": 4149, "time_per_iteration": 2.8122763633728027 }, { "auxiliary_loss_clip": 0.01162437, "auxiliary_loss_mlp": 0.01030245, "balance_loss_clip": 1.05289149, "balance_loss_mlp": 1.02248728, "epoch": 0.4990079961522275, "flos": 16872700988160.0, "grad_norm": 1.8461050732507576, "language_loss": 0.62007117, "learning_rate": 2.1035569003652156e-06, "loss": 0.64199805, "num_input_tokens_seen": 89433195, "step": 4150, "time_per_iteration": 2.756056070327759 }, { "auxiliary_loss_clip": 0.01168729, "auxiliary_loss_mlp": 0.01035571, "balance_loss_clip": 1.05539966, "balance_loss_mlp": 1.0271368, "epoch": 0.4991282390428666, "flos": 13291042187520.0, "grad_norm": 2.8728933165375894, "language_loss": 0.81709599, "learning_rate": 2.1027789657982255e-06, "loss": 0.83913898, "num_input_tokens_seen": 89447410, "step": 4151, "time_per_iteration": 2.839719533920288 }, { "auxiliary_loss_clip": 0.01165364, "auxiliary_loss_mlp": 0.01026358, "balance_loss_clip": 1.05008006, "balance_loss_mlp": 1.01871634, "epoch": 0.4992484819335057, "flos": 21537496454400.0, "grad_norm": 2.027917428165433, "language_loss": 0.77195734, "learning_rate": 2.1020010156397482e-06, "loss": 0.7938745, "num_input_tokens_seen": 89464630, "step": 4152, "time_per_iteration": 2.705303907394409 }, { "auxiliary_loss_clip": 0.01178291, "auxiliary_loss_mlp": 0.01028664, "balance_loss_clip": 1.05323052, "balance_loss_mlp": 1.02056646, "epoch": 0.4993687248241448, "flos": 24860095390080.0, "grad_norm": 2.7465110055459987, "language_loss": 0.77452719, "learning_rate": 2.101223050007797e-06, "loss": 0.79659677, "num_input_tokens_seen": 89483180, "step": 4153, "time_per_iteration": 2.7201473712921143 }, { "auxiliary_loss_clip": 0.0107812, "auxiliary_loss_mlp": 0.010007, "balance_loss_clip": 1.01709712, "balance_loss_mlp": 0.99943656, "epoch": 0.49948896771478385, "flos": 62941602453120.0, "grad_norm": 0.8240125649964672, "language_loss": 0.5380137, "learning_rate": 2.1004450690203904e-06, "loss": 0.55880189, "num_input_tokens_seen": 89539260, "step": 4154, "time_per_iteration": 4.23703145980835 }, { "auxiliary_loss_clip": 0.01077972, "auxiliary_loss_mlp": 0.01000706, "balance_loss_clip": 1.01695454, "balance_loss_mlp": 0.99947804, "epoch": 0.49960921060542296, "flos": 68284213516800.0, "grad_norm": 0.8930127415697834, "language_loss": 0.63275373, "learning_rate": 2.099667072795546e-06, "loss": 0.65354061, "num_input_tokens_seen": 89601380, "step": 4155, "time_per_iteration": 4.261589527130127 }, { "auxiliary_loss_clip": 0.0117078, "auxiliary_loss_mlp": 0.0102677, "balance_loss_clip": 1.04869008, "balance_loss_mlp": 1.01897407, "epoch": 0.49972945349606207, "flos": 23659350618240.0, "grad_norm": 1.8302441504421252, "language_loss": 0.79827416, "learning_rate": 2.0988890614512864e-06, "loss": 0.82024956, "num_input_tokens_seen": 89621270, "step": 4156, "time_per_iteration": 3.734043598175049 }, { "auxiliary_loss_clip": 0.01177656, "auxiliary_loss_mlp": 0.01027662, "balance_loss_clip": 1.05474424, "balance_loss_mlp": 1.02011609, "epoch": 0.4998496963867011, "flos": 19755825022080.0, "grad_norm": 7.999827505774498, "language_loss": 0.84212613, "learning_rate": 2.098111035105635e-06, "loss": 0.86417925, "num_input_tokens_seen": 89639695, "step": 4157, "time_per_iteration": 2.793485164642334 }, { "auxiliary_loss_clip": 0.01166196, "auxiliary_loss_mlp": 0.0102332, "balance_loss_clip": 1.05347753, "balance_loss_mlp": 1.01570892, "epoch": 0.49996993927734024, "flos": 22265728790400.0, "grad_norm": 1.8669878388582353, "language_loss": 0.73301166, "learning_rate": 2.0973329938766176e-06, "loss": 0.75490677, "num_input_tokens_seen": 89657125, "step": 4158, "time_per_iteration": 2.7760322093963623 }, { "auxiliary_loss_clip": 0.01187847, "auxiliary_loss_mlp": 0.01030284, "balance_loss_clip": 1.05665708, "balance_loss_mlp": 1.02194512, "epoch": 0.5000901821679793, "flos": 23327212533120.0, "grad_norm": 3.915772292753397, "language_loss": 0.79085267, "learning_rate": 2.0965549378822618e-06, "loss": 0.81303406, "num_input_tokens_seen": 89678415, "step": 4159, "time_per_iteration": 2.7344117164611816 }, { "auxiliary_loss_clip": 0.01159305, "auxiliary_loss_mlp": 0.01034226, "balance_loss_clip": 1.05478096, "balance_loss_mlp": 1.02605987, "epoch": 0.5002104250586185, "flos": 20339014239360.0, "grad_norm": 2.1829849544854962, "language_loss": 0.8382585, "learning_rate": 2.095776867240599e-06, "loss": 0.86019379, "num_input_tokens_seen": 89695405, "step": 4160, "time_per_iteration": 3.635434627532959 }, { "auxiliary_loss_clip": 0.01168189, "auxiliary_loss_mlp": 0.01032995, "balance_loss_clip": 1.05401683, "balance_loss_mlp": 1.02538347, "epoch": 0.5003306679492575, "flos": 13991372634240.0, "grad_norm": 2.265458681600492, "language_loss": 0.82737809, "learning_rate": 2.094998782069661e-06, "loss": 0.84938997, "num_input_tokens_seen": 89713110, "step": 4161, "time_per_iteration": 2.743084192276001 }, { "auxiliary_loss_clip": 0.01178438, "auxiliary_loss_mlp": 0.01026062, "balance_loss_clip": 1.05263519, "balance_loss_mlp": 1.01815295, "epoch": 0.5004509108398966, "flos": 27672762896640.0, "grad_norm": 2.3190458489585857, "language_loss": 0.75849384, "learning_rate": 2.0942206824874845e-06, "loss": 0.78053886, "num_input_tokens_seen": 89735885, "step": 4162, "time_per_iteration": 2.7302324771881104 }, { "auxiliary_loss_clip": 0.01169393, "auxiliary_loss_mlp": 0.01025102, "balance_loss_clip": 1.04860377, "balance_loss_mlp": 1.01726961, "epoch": 0.5005711537305357, "flos": 14976186796800.0, "grad_norm": 2.597845528076464, "language_loss": 0.79086244, "learning_rate": 2.093442568612105e-06, "loss": 0.81280744, "num_input_tokens_seen": 89753690, "step": 4163, "time_per_iteration": 2.639324188232422 }, { "auxiliary_loss_clip": 0.01180858, "auxiliary_loss_mlp": 0.01026659, "balance_loss_clip": 1.05172873, "balance_loss_mlp": 1.018803, "epoch": 0.5006913966211748, "flos": 26503259978880.0, "grad_norm": 1.6982884053846872, "language_loss": 0.84976345, "learning_rate": 2.0926644405615613e-06, "loss": 0.87183857, "num_input_tokens_seen": 89774590, "step": 4164, "time_per_iteration": 2.699902057647705 }, { "auxiliary_loss_clip": 0.01163785, "auxiliary_loss_mlp": 0.01026414, "balance_loss_clip": 1.04980063, "balance_loss_mlp": 1.01899922, "epoch": 0.5008116395118138, "flos": 20449295971200.0, "grad_norm": 1.8538397580685062, "language_loss": 0.81412458, "learning_rate": 2.091886298453897e-06, "loss": 0.83602655, "num_input_tokens_seen": 89792775, "step": 4165, "time_per_iteration": 2.7209372520446777 }, { "auxiliary_loss_clip": 0.01176767, "auxiliary_loss_mlp": 0.01032603, "balance_loss_clip": 1.05429161, "balance_loss_mlp": 1.02523315, "epoch": 0.500931882402453, "flos": 21579871524480.0, "grad_norm": 2.253127662108382, "language_loss": 0.72986686, "learning_rate": 2.091108142407153e-06, "loss": 0.75196064, "num_input_tokens_seen": 89811515, "step": 4166, "time_per_iteration": 2.690819263458252 }, { "auxiliary_loss_clip": 0.01083044, "auxiliary_loss_mlp": 0.01006225, "balance_loss_clip": 1.02683258, "balance_loss_mlp": 1.00485981, "epoch": 0.5010521252930921, "flos": 57785011925760.0, "grad_norm": 0.8465088398651037, "language_loss": 0.62352902, "learning_rate": 2.090329972539377e-06, "loss": 0.6444217, "num_input_tokens_seen": 89870080, "step": 4167, "time_per_iteration": 3.263035774230957 }, { "auxiliary_loss_clip": 0.01150257, "auxiliary_loss_mlp": 0.01026266, "balance_loss_clip": 1.05227304, "balance_loss_mlp": 1.01804042, "epoch": 0.5011723681837311, "flos": 18625500864000.0, "grad_norm": 1.7858128579349704, "language_loss": 0.68403333, "learning_rate": 2.089551788968616e-06, "loss": 0.70579851, "num_input_tokens_seen": 89888045, "step": 4168, "time_per_iteration": 2.825953960418701 }, { "auxiliary_loss_clip": 0.01076818, "auxiliary_loss_mlp": 0.01000961, "balance_loss_clip": 1.01609719, "balance_loss_mlp": 0.99967921, "epoch": 0.5012926110743702, "flos": 55883146608000.0, "grad_norm": 0.8358168514293101, "language_loss": 0.60726476, "learning_rate": 2.08877359181292e-06, "loss": 0.62804246, "num_input_tokens_seen": 89944610, "step": 4169, "time_per_iteration": 3.2411131858825684 }, { "auxiliary_loss_clip": 0.01173299, "auxiliary_loss_mlp": 0.01021235, "balance_loss_clip": 1.04992914, "balance_loss_mlp": 1.01401365, "epoch": 0.5014128539650093, "flos": 24238266117120.0, "grad_norm": 2.2241512212226238, "language_loss": 0.86115563, "learning_rate": 2.0879953811903396e-06, "loss": 0.88310093, "num_input_tokens_seen": 89959495, "step": 4170, "time_per_iteration": 2.6810221672058105 }, { "auxiliary_loss_clip": 0.0117272, "auxiliary_loss_mlp": 0.01028038, "balance_loss_clip": 1.05038106, "balance_loss_mlp": 1.01977062, "epoch": 0.5015330968556484, "flos": 27527468382720.0, "grad_norm": 2.3024481970159423, "language_loss": 0.78400952, "learning_rate": 2.08721715721893e-06, "loss": 0.8060171, "num_input_tokens_seen": 89978820, "step": 4171, "time_per_iteration": 2.775724411010742 }, { "auxiliary_loss_clip": 0.0117371, "auxiliary_loss_mlp": 0.01024044, "balance_loss_clip": 1.05064702, "balance_loss_mlp": 1.01640248, "epoch": 0.5016533397462875, "flos": 23800802376960.0, "grad_norm": 1.911229479667614, "language_loss": 0.77148616, "learning_rate": 2.0864389200167477e-06, "loss": 0.79346365, "num_input_tokens_seen": 89997075, "step": 4172, "time_per_iteration": 2.7151269912719727 }, { "auxiliary_loss_clip": 0.01176993, "auxiliary_loss_mlp": 0.01063105, "balance_loss_clip": 1.05193305, "balance_loss_mlp": 1.01962733, "epoch": 0.5017735826369266, "flos": 25295009264640.0, "grad_norm": 1.8243023629650494, "language_loss": 0.78939342, "learning_rate": 2.0856606697018504e-06, "loss": 0.8117944, "num_input_tokens_seen": 90015085, "step": 4173, "time_per_iteration": 2.783005952835083 }, { "auxiliary_loss_clip": 0.01169626, "auxiliary_loss_mlp": 0.01024786, "balance_loss_clip": 1.051296, "balance_loss_mlp": 1.01696026, "epoch": 0.5018938255275657, "flos": 16873203778560.0, "grad_norm": 1.9977936496920383, "language_loss": 0.73401928, "learning_rate": 2.084882406392297e-06, "loss": 0.75596344, "num_input_tokens_seen": 90033045, "step": 4174, "time_per_iteration": 2.6457509994506836 }, { "auxiliary_loss_clip": 0.01177275, "auxiliary_loss_mlp": 0.01026672, "balance_loss_clip": 1.05238295, "balance_loss_mlp": 1.01879263, "epoch": 0.5020140684182047, "flos": 25515429073920.0, "grad_norm": 2.595498592786813, "language_loss": 0.71018583, "learning_rate": 2.0841041302061496e-06, "loss": 0.7322253, "num_input_tokens_seen": 90052505, "step": 4175, "time_per_iteration": 2.7304654121398926 }, { "auxiliary_loss_clip": 0.01166457, "auxiliary_loss_mlp": 0.01031187, "balance_loss_clip": 1.05238581, "balance_loss_mlp": 1.02337897, "epoch": 0.5021343113088439, "flos": 23659278791040.0, "grad_norm": 2.1270649943791997, "language_loss": 0.75454521, "learning_rate": 2.083325841261473e-06, "loss": 0.77652168, "num_input_tokens_seen": 90071565, "step": 4176, "time_per_iteration": 2.6776533126831055 }, { "auxiliary_loss_clip": 0.01170022, "auxiliary_loss_mlp": 0.01025832, "balance_loss_clip": 1.05197167, "balance_loss_mlp": 1.01830399, "epoch": 0.502254554199483, "flos": 24534673148160.0, "grad_norm": 2.3715930766594275, "language_loss": 0.66498494, "learning_rate": 2.0825475396763322e-06, "loss": 0.68694353, "num_input_tokens_seen": 90092215, "step": 4177, "time_per_iteration": 2.7600390911102295 }, { "auxiliary_loss_clip": 0.01150986, "auxiliary_loss_mlp": 0.01025496, "balance_loss_clip": 1.05235791, "balance_loss_mlp": 1.01820612, "epoch": 0.502374797090122, "flos": 34240285607040.0, "grad_norm": 3.155742749737611, "language_loss": 0.65640235, "learning_rate": 2.081769225568796e-06, "loss": 0.6781671, "num_input_tokens_seen": 90114665, "step": 4178, "time_per_iteration": 2.881028890609741 }, { "auxiliary_loss_clip": 0.01175947, "auxiliary_loss_mlp": 0.01030787, "balance_loss_clip": 1.04881036, "balance_loss_mlp": 1.02221632, "epoch": 0.5024950399807612, "flos": 26031106679040.0, "grad_norm": 1.483573254408268, "language_loss": 0.75965154, "learning_rate": 2.0809908990569327e-06, "loss": 0.78171891, "num_input_tokens_seen": 90136445, "step": 4179, "time_per_iteration": 2.648561954498291 }, { "auxiliary_loss_clip": 0.01172584, "auxiliary_loss_mlp": 0.0103742, "balance_loss_clip": 1.05318344, "balance_loss_mlp": 1.02986264, "epoch": 0.5026152828714002, "flos": 21252438120960.0, "grad_norm": 1.7891369964254378, "language_loss": 0.79109317, "learning_rate": 2.0802125602588146e-06, "loss": 0.8131932, "num_input_tokens_seen": 90155710, "step": 4180, "time_per_iteration": 3.759983777999878 }, { "auxiliary_loss_clip": 0.01181642, "auxiliary_loss_mlp": 0.01026849, "balance_loss_clip": 1.05340135, "balance_loss_mlp": 1.01933861, "epoch": 0.5027355257620393, "flos": 30956111245440.0, "grad_norm": 1.8827508941651967, "language_loss": 0.66260183, "learning_rate": 2.0794342092925146e-06, "loss": 0.68468678, "num_input_tokens_seen": 90176845, "step": 4181, "time_per_iteration": 3.7329342365264893 }, { "auxiliary_loss_clip": 0.01179063, "auxiliary_loss_mlp": 0.01029074, "balance_loss_clip": 1.05328608, "balance_loss_mlp": 1.02114916, "epoch": 0.5028557686526784, "flos": 24791147233920.0, "grad_norm": 2.7935293896413764, "language_loss": 0.68200314, "learning_rate": 2.078655846276108e-06, "loss": 0.70408452, "num_input_tokens_seen": 90197175, "step": 4182, "time_per_iteration": 3.6426467895507812 }, { "auxiliary_loss_clip": 0.01167315, "auxiliary_loss_mlp": 0.01025067, "balance_loss_clip": 1.05068028, "balance_loss_mlp": 1.01787281, "epoch": 0.5029760115433175, "flos": 22966992990720.0, "grad_norm": 2.426807998418673, "language_loss": 0.69161087, "learning_rate": 2.0778774713276727e-06, "loss": 0.71353471, "num_input_tokens_seen": 90216650, "step": 4183, "time_per_iteration": 2.7393040657043457 }, { "auxiliary_loss_clip": 0.01174054, "auxiliary_loss_mlp": 0.01027236, "balance_loss_clip": 1.05145764, "balance_loss_mlp": 1.01864076, "epoch": 0.5030962544339566, "flos": 15305164485120.0, "grad_norm": 2.1937655474661786, "language_loss": 0.67654949, "learning_rate": 2.077099084565287e-06, "loss": 0.69856244, "num_input_tokens_seen": 90234055, "step": 4184, "time_per_iteration": 2.749924659729004 }, { "auxiliary_loss_clip": 0.01169264, "auxiliary_loss_mlp": 0.01027612, "balance_loss_clip": 1.05075788, "balance_loss_mlp": 1.01976228, "epoch": 0.5032164973245957, "flos": 24494847943680.0, "grad_norm": 2.6524205931106386, "language_loss": 0.65785551, "learning_rate": 2.0763206861070313e-06, "loss": 0.67982429, "num_input_tokens_seen": 90253115, "step": 4185, "time_per_iteration": 2.824913501739502 }, { "auxiliary_loss_clip": 0.01179745, "auxiliary_loss_mlp": 0.01024946, "balance_loss_clip": 1.05206251, "balance_loss_mlp": 1.01672053, "epoch": 0.5033367402152348, "flos": 16213452721920.0, "grad_norm": 2.2002470301079957, "language_loss": 0.75447559, "learning_rate": 2.0755422760709876e-06, "loss": 0.77652252, "num_input_tokens_seen": 90270515, "step": 4186, "time_per_iteration": 3.6242151260375977 }, { "auxiliary_loss_clip": 0.01161613, "auxiliary_loss_mlp": 0.01023168, "balance_loss_clip": 1.0520221, "balance_loss_mlp": 1.01590168, "epoch": 0.5034569831058738, "flos": 21391375927680.0, "grad_norm": 2.219662784782375, "language_loss": 0.77248001, "learning_rate": 2.0747638545752417e-06, "loss": 0.79432786, "num_input_tokens_seen": 90289075, "step": 4187, "time_per_iteration": 2.746242046356201 }, { "auxiliary_loss_clip": 0.01170492, "auxiliary_loss_mlp": 0.01026789, "balance_loss_clip": 1.05325389, "balance_loss_mlp": 1.0193032, "epoch": 0.503577225996513, "flos": 20558751690240.0, "grad_norm": 2.136580862279332, "language_loss": 0.83316034, "learning_rate": 2.073985421737878e-06, "loss": 0.85513318, "num_input_tokens_seen": 90306385, "step": 4188, "time_per_iteration": 2.7313790321350098 }, { "auxiliary_loss_clip": 0.01175906, "auxiliary_loss_mlp": 0.01024455, "balance_loss_clip": 1.05056834, "balance_loss_mlp": 1.01672995, "epoch": 0.5036974688871521, "flos": 27229157930880.0, "grad_norm": 4.471741341040498, "language_loss": 0.73505616, "learning_rate": 2.0732069776769844e-06, "loss": 0.75705969, "num_input_tokens_seen": 90323795, "step": 4189, "time_per_iteration": 2.7236833572387695 }, { "auxiliary_loss_clip": 0.01179078, "auxiliary_loss_mlp": 0.01026021, "balance_loss_clip": 1.05243206, "balance_loss_mlp": 1.0182786, "epoch": 0.5038177117777911, "flos": 20412164286720.0, "grad_norm": 2.308548823083823, "language_loss": 0.72949851, "learning_rate": 2.072428522510651e-06, "loss": 0.75154948, "num_input_tokens_seen": 90340360, "step": 4190, "time_per_iteration": 2.759890556335449 }, { "auxiliary_loss_clip": 0.01165788, "auxiliary_loss_mlp": 0.01027741, "balance_loss_clip": 1.05326343, "balance_loss_mlp": 1.01972437, "epoch": 0.5039379546684303, "flos": 21907987286400.0, "grad_norm": 2.387080158285159, "language_loss": 0.76266551, "learning_rate": 2.071650056356968e-06, "loss": 0.78460079, "num_input_tokens_seen": 90357900, "step": 4191, "time_per_iteration": 2.7675657272338867 }, { "auxiliary_loss_clip": 0.01179216, "auxiliary_loss_mlp": 0.01030032, "balance_loss_clip": 1.05231726, "balance_loss_mlp": 1.02231359, "epoch": 0.5040581975590693, "flos": 20010718909440.0, "grad_norm": 1.9051880653315374, "language_loss": 0.80080491, "learning_rate": 2.070871579334028e-06, "loss": 0.82289737, "num_input_tokens_seen": 90377010, "step": 4192, "time_per_iteration": 2.7106871604919434 }, { "auxiliary_loss_clip": 0.01175869, "auxiliary_loss_mlp": 0.01025351, "balance_loss_clip": 1.04996741, "balance_loss_mlp": 1.01773381, "epoch": 0.5041784404497084, "flos": 20959837931520.0, "grad_norm": 1.8420106913341427, "language_loss": 0.72021353, "learning_rate": 2.0700930915599264e-06, "loss": 0.74222577, "num_input_tokens_seen": 90396740, "step": 4193, "time_per_iteration": 2.684946060180664 }, { "auxiliary_loss_clip": 0.01175746, "auxiliary_loss_mlp": 0.01023092, "balance_loss_clip": 1.05035448, "balance_loss_mlp": 1.01558471, "epoch": 0.5042986833403476, "flos": 12495082757760.0, "grad_norm": 1.9664923575958042, "language_loss": 0.78551924, "learning_rate": 2.0693145931527583e-06, "loss": 0.80750763, "num_input_tokens_seen": 90413220, "step": 4194, "time_per_iteration": 2.762852430343628 }, { "auxiliary_loss_clip": 0.01166732, "auxiliary_loss_mlp": 0.01026149, "balance_loss_clip": 1.05002224, "balance_loss_mlp": 1.01816785, "epoch": 0.5044189262309866, "flos": 29202305788800.0, "grad_norm": 1.6305646369368991, "language_loss": 0.78288865, "learning_rate": 2.068536084230622e-06, "loss": 0.80481744, "num_input_tokens_seen": 90435085, "step": 4195, "time_per_iteration": 2.77386474609375 }, { "auxiliary_loss_clip": 0.01174972, "auxiliary_loss_mlp": 0.01024583, "balance_loss_clip": 1.051494, "balance_loss_mlp": 1.01653004, "epoch": 0.5045391691216257, "flos": 23873198238720.0, "grad_norm": 2.135759690369232, "language_loss": 0.88963258, "learning_rate": 2.067757564911616e-06, "loss": 0.91162813, "num_input_tokens_seen": 90453660, "step": 4196, "time_per_iteration": 2.7029526233673096 }, { "auxiliary_loss_clip": 0.01179178, "auxiliary_loss_mlp": 0.01055877, "balance_loss_clip": 1.05156803, "balance_loss_mlp": 1.01665521, "epoch": 0.5046594120122648, "flos": 24644990793600.0, "grad_norm": 1.964425390859592, "language_loss": 0.92535555, "learning_rate": 2.0669790353138407e-06, "loss": 0.9477061, "num_input_tokens_seen": 90472625, "step": 4197, "time_per_iteration": 2.803107738494873 }, { "auxiliary_loss_clip": 0.01165258, "auxiliary_loss_mlp": 0.01063243, "balance_loss_clip": 1.05194712, "balance_loss_mlp": 1.02293444, "epoch": 0.5047796549029039, "flos": 23362835846400.0, "grad_norm": 4.993269801007412, "language_loss": 0.73438871, "learning_rate": 2.0662004955553995e-06, "loss": 0.75667369, "num_input_tokens_seen": 90492325, "step": 4198, "time_per_iteration": 2.7794458866119385 }, { "auxiliary_loss_clip": 0.01168453, "auxiliary_loss_mlp": 0.01023506, "balance_loss_clip": 1.05056667, "balance_loss_mlp": 1.01589477, "epoch": 0.5048998977935429, "flos": 17304095329920.0, "grad_norm": 4.226257587880192, "language_loss": 0.76352787, "learning_rate": 2.065421945754395e-06, "loss": 0.78544748, "num_input_tokens_seen": 90510055, "step": 4199, "time_per_iteration": 2.757922887802124 }, { "auxiliary_loss_clip": 0.01164366, "auxiliary_loss_mlp": 0.01029497, "balance_loss_clip": 1.05105352, "balance_loss_mlp": 1.02204323, "epoch": 0.505020140684182, "flos": 34856979235200.0, "grad_norm": 1.779825672982625, "language_loss": 0.77941519, "learning_rate": 2.0646433860289344e-06, "loss": 0.80135381, "num_input_tokens_seen": 90528980, "step": 4200, "time_per_iteration": 2.806856870651245 }, { "auxiliary_loss_clip": 0.01177355, "auxiliary_loss_mlp": 0.01068244, "balance_loss_clip": 1.05044031, "balance_loss_mlp": 1.02724957, "epoch": 0.5051403835748212, "flos": 24863974058880.0, "grad_norm": 1.8188983254633602, "language_loss": 0.82549429, "learning_rate": 2.0638648164971233e-06, "loss": 0.84795028, "num_input_tokens_seen": 90547445, "step": 4201, "time_per_iteration": 2.697178602218628 }, { "auxiliary_loss_clip": 0.01172415, "auxiliary_loss_mlp": 0.01026068, "balance_loss_clip": 1.05295634, "balance_loss_mlp": 1.01849484, "epoch": 0.5052606264654602, "flos": 20959694277120.0, "grad_norm": 2.3109706967708443, "language_loss": 0.88688266, "learning_rate": 2.06308623727707e-06, "loss": 0.90886748, "num_input_tokens_seen": 90567545, "step": 4202, "time_per_iteration": 2.769638776779175 }, { "auxiliary_loss_clip": 0.01173461, "auxiliary_loss_mlp": 0.01026184, "balance_loss_clip": 1.05052733, "balance_loss_mlp": 1.01851845, "epoch": 0.5053808693560993, "flos": 19642382893440.0, "grad_norm": 2.6653218370249507, "language_loss": 0.75902712, "learning_rate": 2.0623076484868846e-06, "loss": 0.78102362, "num_input_tokens_seen": 90585000, "step": 4203, "time_per_iteration": 2.715670108795166 }, { "auxiliary_loss_clip": 0.01072942, "auxiliary_loss_mlp": 0.01005156, "balance_loss_clip": 1.01732731, "balance_loss_mlp": 1.00377929, "epoch": 0.5055011122467384, "flos": 67504915019520.0, "grad_norm": 0.8373091935333173, "language_loss": 0.60680765, "learning_rate": 2.061529050244679e-06, "loss": 0.62758863, "num_input_tokens_seen": 90644745, "step": 4204, "time_per_iteration": 3.32002592086792 }, { "auxiliary_loss_clip": 0.01171632, "auxiliary_loss_mlp": 0.01026911, "balance_loss_clip": 1.04948151, "balance_loss_mlp": 1.01905537, "epoch": 0.5056213551373775, "flos": 16872952383360.0, "grad_norm": 1.77491997421195, "language_loss": 0.74190921, "learning_rate": 2.060750442668565e-06, "loss": 0.76389468, "num_input_tokens_seen": 90662500, "step": 4205, "time_per_iteration": 3.5840249061584473 }, { "auxiliary_loss_clip": 0.01176222, "auxiliary_loss_mlp": 0.01025463, "balance_loss_clip": 1.05447507, "balance_loss_mlp": 1.01751804, "epoch": 0.5057415980280165, "flos": 15334179696000.0, "grad_norm": 2.243294093990185, "language_loss": 0.63893664, "learning_rate": 2.059971825876657e-06, "loss": 0.66095352, "num_input_tokens_seen": 90677010, "step": 4206, "time_per_iteration": 2.7548348903656006 }, { "auxiliary_loss_clip": 0.01177742, "auxiliary_loss_mlp": 0.0102266, "balance_loss_clip": 1.05285192, "balance_loss_mlp": 1.01542366, "epoch": 0.5058618409186557, "flos": 19025976574080.0, "grad_norm": 1.7561158732946913, "language_loss": 0.76722872, "learning_rate": 2.0591931999870713e-06, "loss": 0.78923273, "num_input_tokens_seen": 90695935, "step": 4207, "time_per_iteration": 3.370990514755249 }, { "auxiliary_loss_clip": 0.01071777, "auxiliary_loss_mlp": 0.01004561, "balance_loss_clip": 1.01630902, "balance_loss_mlp": 1.00333273, "epoch": 0.5059820838092948, "flos": 63453114080640.0, "grad_norm": 0.8385726477514732, "language_loss": 0.57561374, "learning_rate": 2.0584145651179234e-06, "loss": 0.59637707, "num_input_tokens_seen": 90751645, "step": 4208, "time_per_iteration": 3.9599087238311768 }, { "auxiliary_loss_clip": 0.01173072, "auxiliary_loss_mlp": 0.01060506, "balance_loss_clip": 1.05305135, "balance_loss_mlp": 1.02015495, "epoch": 0.5061023266999338, "flos": 15441803821440.0, "grad_norm": 2.324210292457775, "language_loss": 0.79701817, "learning_rate": 2.0576359213873327e-06, "loss": 0.81935394, "num_input_tokens_seen": 90766795, "step": 4209, "time_per_iteration": 2.5503721237182617 }, { "auxiliary_loss_clip": 0.01177953, "auxiliary_loss_mlp": 0.01022463, "balance_loss_clip": 1.04961228, "balance_loss_mlp": 1.01463056, "epoch": 0.506222569590573, "flos": 22451063990400.0, "grad_norm": 2.318387488590779, "language_loss": 0.71029496, "learning_rate": 2.056857268913419e-06, "loss": 0.73229909, "num_input_tokens_seen": 90786845, "step": 4210, "time_per_iteration": 2.5798168182373047 }, { "auxiliary_loss_clip": 0.01175584, "auxiliary_loss_mlp": 0.01025781, "balance_loss_clip": 1.05247939, "balance_loss_mlp": 1.01838398, "epoch": 0.506342812481212, "flos": 17558665994880.0, "grad_norm": 2.297712653153157, "language_loss": 0.83832407, "learning_rate": 2.056078607814303e-06, "loss": 0.86033773, "num_input_tokens_seen": 90802630, "step": 4211, "time_per_iteration": 2.520185947418213 }, { "auxiliary_loss_clip": 0.01175698, "auxiliary_loss_mlp": 0.01025179, "balance_loss_clip": 1.05294704, "balance_loss_mlp": 1.01772881, "epoch": 0.5064630553718511, "flos": 23402050519680.0, "grad_norm": 2.6176690314696924, "language_loss": 0.78682071, "learning_rate": 2.055299938208106e-06, "loss": 0.80882955, "num_input_tokens_seen": 90823620, "step": 4212, "time_per_iteration": 3.4211697578430176 }, { "auxiliary_loss_clip": 0.01179672, "auxiliary_loss_mlp": 0.01030169, "balance_loss_clip": 1.05396914, "balance_loss_mlp": 1.02215815, "epoch": 0.5065832982624903, "flos": 23987035416960.0, "grad_norm": 1.6776773180793305, "language_loss": 0.86302507, "learning_rate": 2.0545212602129526e-06, "loss": 0.88512337, "num_input_tokens_seen": 90843475, "step": 4213, "time_per_iteration": 2.7470853328704834 }, { "auxiliary_loss_clip": 0.01168308, "auxiliary_loss_mlp": 0.01026109, "balance_loss_clip": 1.05164373, "balance_loss_mlp": 1.01815236, "epoch": 0.5067035411531293, "flos": 21503058289920.0, "grad_norm": 6.937723809203545, "language_loss": 0.66618645, "learning_rate": 2.0537425739469673e-06, "loss": 0.68813062, "num_input_tokens_seen": 90862410, "step": 4214, "time_per_iteration": 2.7406790256500244 }, { "auxiliary_loss_clip": 0.0107522, "auxiliary_loss_mlp": 0.01002442, "balance_loss_clip": 1.01423144, "balance_loss_mlp": 1.00108266, "epoch": 0.5068237840437684, "flos": 65934397687680.0, "grad_norm": 0.8357532407292129, "language_loss": 0.59443259, "learning_rate": 2.052963879528276e-06, "loss": 0.61520922, "num_input_tokens_seen": 90922280, "step": 4215, "time_per_iteration": 3.332637071609497 }, { "auxiliary_loss_clip": 0.01172496, "auxiliary_loss_mlp": 0.01031695, "balance_loss_clip": 1.05111539, "balance_loss_mlp": 1.02355945, "epoch": 0.5069440269344075, "flos": 27264206626560.0, "grad_norm": 5.650790691144346, "language_loss": 0.76459074, "learning_rate": 2.052185177075007e-06, "loss": 0.78663266, "num_input_tokens_seen": 90941850, "step": 4216, "time_per_iteration": 2.8425326347351074 }, { "auxiliary_loss_clip": 0.01178486, "auxiliary_loss_mlp": 0.01024335, "balance_loss_clip": 1.05409849, "balance_loss_mlp": 1.0163238, "epoch": 0.5070642698250466, "flos": 23366319465600.0, "grad_norm": 1.9440088996614922, "language_loss": 0.8280853, "learning_rate": 2.051406466705288e-06, "loss": 0.85011351, "num_input_tokens_seen": 90961390, "step": 4217, "time_per_iteration": 2.7313246726989746 }, { "auxiliary_loss_clip": 0.01175879, "auxiliary_loss_mlp": 0.01025423, "balance_loss_clip": 1.04954958, "balance_loss_mlp": 1.01844931, "epoch": 0.5071845127156857, "flos": 20340127560960.0, "grad_norm": 2.3198553190110727, "language_loss": 0.80767012, "learning_rate": 2.0506277485372486e-06, "loss": 0.82968318, "num_input_tokens_seen": 90980215, "step": 4218, "time_per_iteration": 2.6351089477539062 }, { "auxiliary_loss_clip": 0.01171426, "auxiliary_loss_mlp": 0.01029446, "balance_loss_clip": 1.05260158, "balance_loss_mlp": 1.02104759, "epoch": 0.5073047556063248, "flos": 12092955022080.0, "grad_norm": 2.4265419376767414, "language_loss": 0.67569143, "learning_rate": 2.04984902268902e-06, "loss": 0.69770014, "num_input_tokens_seen": 90997415, "step": 4219, "time_per_iteration": 2.697965383529663 }, { "auxiliary_loss_clip": 0.01181615, "auxiliary_loss_mlp": 0.0102574, "balance_loss_clip": 1.05124366, "balance_loss_mlp": 1.01747334, "epoch": 0.5074249984969639, "flos": 19682854542720.0, "grad_norm": 2.234799980554639, "language_loss": 0.75247103, "learning_rate": 2.0490702892787345e-06, "loss": 0.7745446, "num_input_tokens_seen": 91016475, "step": 4220, "time_per_iteration": 2.62412166595459 }, { "auxiliary_loss_clip": 0.01168479, "auxiliary_loss_mlp": 0.01026838, "balance_loss_clip": 1.0501529, "balance_loss_mlp": 1.01951861, "epoch": 0.5075452413876029, "flos": 28765703975040.0, "grad_norm": 1.6134723711149288, "language_loss": 0.62377155, "learning_rate": 2.0482915484245246e-06, "loss": 0.64572477, "num_input_tokens_seen": 91038095, "step": 4221, "time_per_iteration": 2.732001781463623 }, { "auxiliary_loss_clip": 0.01165443, "auxiliary_loss_mlp": 0.01024776, "balance_loss_clip": 1.05446661, "balance_loss_mlp": 1.0167954, "epoch": 0.5076654842782421, "flos": 20339445202560.0, "grad_norm": 2.9970444039351976, "language_loss": 0.83865845, "learning_rate": 2.047512800244526e-06, "loss": 0.86056066, "num_input_tokens_seen": 91053360, "step": 4222, "time_per_iteration": 2.818626880645752 }, { "auxiliary_loss_clip": 0.01174082, "auxiliary_loss_mlp": 0.01032386, "balance_loss_clip": 1.05102348, "balance_loss_mlp": 1.02441049, "epoch": 0.5077857271688812, "flos": 26359653404160.0, "grad_norm": 2.318818569772645, "language_loss": 0.79164606, "learning_rate": 2.046734044856873e-06, "loss": 0.81371069, "num_input_tokens_seen": 91072770, "step": 4223, "time_per_iteration": 2.733773708343506 }, { "auxiliary_loss_clip": 0.01172753, "auxiliary_loss_mlp": 0.010347, "balance_loss_clip": 1.0502305, "balance_loss_mlp": 1.02706492, "epoch": 0.5079059700595202, "flos": 21798962530560.0, "grad_norm": 1.8970172155067166, "language_loss": 0.8128624, "learning_rate": 2.045955282379702e-06, "loss": 0.83493698, "num_input_tokens_seen": 91091430, "step": 4224, "time_per_iteration": 2.745518445968628 }, { "auxiliary_loss_clip": 0.01173103, "auxiliary_loss_mlp": 0.01032665, "balance_loss_clip": 1.05065918, "balance_loss_mlp": 1.02453518, "epoch": 0.5080262129501594, "flos": 13187943175680.0, "grad_norm": 2.9168740032936995, "language_loss": 0.76215553, "learning_rate": 2.045176512931152e-06, "loss": 0.78421319, "num_input_tokens_seen": 91106060, "step": 4225, "time_per_iteration": 2.690294027328491 }, { "auxiliary_loss_clip": 0.01171141, "auxiliary_loss_mlp": 0.01027254, "balance_loss_clip": 1.05149889, "balance_loss_mlp": 1.02023256, "epoch": 0.5081464558407984, "flos": 25301473712640.0, "grad_norm": 2.053420636725352, "language_loss": 0.75679183, "learning_rate": 2.0443977366293604e-06, "loss": 0.77877575, "num_input_tokens_seen": 91124100, "step": 4226, "time_per_iteration": 2.8051631450653076 }, { "auxiliary_loss_clip": 0.01165504, "auxiliary_loss_mlp": 0.01026459, "balance_loss_clip": 1.05282521, "balance_loss_mlp": 1.01834106, "epoch": 0.5082666987314375, "flos": 30951226995840.0, "grad_norm": 1.7503486814743772, "language_loss": 0.76814461, "learning_rate": 2.043618953592468e-06, "loss": 0.79006422, "num_input_tokens_seen": 91146555, "step": 4227, "time_per_iteration": 2.9646992683410645 }, { "auxiliary_loss_clip": 0.01171175, "auxiliary_loss_mlp": 0.01026989, "balance_loss_clip": 1.05305338, "balance_loss_mlp": 1.01923418, "epoch": 0.5083869416220766, "flos": 19682495406720.0, "grad_norm": 13.0204676537076, "language_loss": 0.81267548, "learning_rate": 2.0428401639386144e-06, "loss": 0.83465707, "num_input_tokens_seen": 91167120, "step": 4228, "time_per_iteration": 2.789304733276367 }, { "auxiliary_loss_clip": 0.01068923, "auxiliary_loss_mlp": 0.01000556, "balance_loss_clip": 1.01520705, "balance_loss_mlp": 0.99927473, "epoch": 0.5085071845127157, "flos": 71817535589760.0, "grad_norm": 0.8220679442840849, "language_loss": 0.58077741, "learning_rate": 2.042061367785943e-06, "loss": 0.60147214, "num_input_tokens_seen": 91220260, "step": 4229, "time_per_iteration": 3.252997398376465 }, { "auxiliary_loss_clip": 0.01172209, "auxiliary_loss_mlp": 0.01027924, "balance_loss_clip": 1.05112422, "balance_loss_mlp": 1.02019286, "epoch": 0.5086274274033548, "flos": 35951608252800.0, "grad_norm": 2.6281344609900787, "language_loss": 0.75076818, "learning_rate": 2.041282565252594e-06, "loss": 0.77276945, "num_input_tokens_seen": 91240425, "step": 4230, "time_per_iteration": 2.9113404750823975 }, { "auxiliary_loss_clip": 0.01168859, "auxiliary_loss_mlp": 0.01022513, "balance_loss_clip": 1.05124974, "balance_loss_mlp": 1.01477289, "epoch": 0.5087476702939938, "flos": 23513732881920.0, "grad_norm": 4.26769524228433, "language_loss": 0.77415502, "learning_rate": 2.040503756456714e-06, "loss": 0.79606879, "num_input_tokens_seen": 91259635, "step": 4231, "time_per_iteration": 3.6962740421295166 }, { "auxiliary_loss_clip": 0.01171613, "auxiliary_loss_mlp": 0.01033896, "balance_loss_clip": 1.05143607, "balance_loss_mlp": 1.02604914, "epoch": 0.508867913184633, "flos": 15122091841920.0, "grad_norm": 2.429225819591287, "language_loss": 0.79105324, "learning_rate": 2.0397249415164456e-06, "loss": 0.81310833, "num_input_tokens_seen": 91276990, "step": 4232, "time_per_iteration": 2.7006101608276367 }, { "auxiliary_loss_clip": 0.01166333, "auxiliary_loss_mlp": 0.01026065, "balance_loss_clip": 1.05065608, "balance_loss_mlp": 1.01788116, "epoch": 0.508988156075272, "flos": 25885309374720.0, "grad_norm": 2.7146382058060827, "language_loss": 0.80217183, "learning_rate": 2.0389461205499354e-06, "loss": 0.82409573, "num_input_tokens_seen": 91296125, "step": 4233, "time_per_iteration": 3.7965943813323975 }, { "auxiliary_loss_clip": 0.01169708, "auxiliary_loss_mlp": 0.01024903, "balance_loss_clip": 1.05022621, "balance_loss_mlp": 1.01746476, "epoch": 0.5091083989659111, "flos": 13844857057920.0, "grad_norm": 1.9239103834869862, "language_loss": 0.73340392, "learning_rate": 2.03816729367533e-06, "loss": 0.75535011, "num_input_tokens_seen": 91314280, "step": 4234, "time_per_iteration": 3.902134895324707 }, { "auxiliary_loss_clip": 0.01174954, "auxiliary_loss_mlp": 0.01028236, "balance_loss_clip": 1.05265999, "balance_loss_mlp": 1.0208447, "epoch": 0.5092286418565503, "flos": 21104881050240.0, "grad_norm": 2.3104015323023632, "language_loss": 0.714432, "learning_rate": 2.0373884610107765e-06, "loss": 0.7364639, "num_input_tokens_seen": 91334595, "step": 4235, "time_per_iteration": 2.7284674644470215 }, { "auxiliary_loss_clip": 0.01176062, "auxiliary_loss_mlp": 0.01032981, "balance_loss_clip": 1.04925132, "balance_loss_mlp": 1.02509558, "epoch": 0.5093488847471893, "flos": 18621298972800.0, "grad_norm": 2.858806852672945, "language_loss": 0.69652104, "learning_rate": 2.0366096226744225e-06, "loss": 0.71861148, "num_input_tokens_seen": 91349790, "step": 4236, "time_per_iteration": 2.6942877769470215 }, { "auxiliary_loss_clip": 0.01168973, "auxiliary_loss_mlp": 0.01033872, "balance_loss_clip": 1.05219746, "balance_loss_mlp": 1.02647817, "epoch": 0.5094691276378284, "flos": 23803783205760.0, "grad_norm": 1.9159671223534485, "language_loss": 0.769521, "learning_rate": 2.035830778784418e-06, "loss": 0.79154944, "num_input_tokens_seen": 91370465, "step": 4237, "time_per_iteration": 2.6940879821777344 }, { "auxiliary_loss_clip": 0.01178652, "auxiliary_loss_mlp": 0.01026869, "balance_loss_clip": 1.05649519, "balance_loss_mlp": 1.01907289, "epoch": 0.5095893705284675, "flos": 17420410546560.0, "grad_norm": 2.913531021159584, "language_loss": 0.8010484, "learning_rate": 2.0350519294589134e-06, "loss": 0.82310367, "num_input_tokens_seen": 91388505, "step": 4238, "time_per_iteration": 3.8028478622436523 }, { "auxiliary_loss_clip": 0.011641, "auxiliary_loss_mlp": 0.01027846, "balance_loss_clip": 1.05095625, "balance_loss_mlp": 1.01926279, "epoch": 0.5097096134191066, "flos": 25849362839040.0, "grad_norm": 1.631899982319937, "language_loss": 0.82692593, "learning_rate": 2.0342730748160588e-06, "loss": 0.84884536, "num_input_tokens_seen": 91408970, "step": 4239, "time_per_iteration": 2.8251848220825195 }, { "auxiliary_loss_clip": 0.01172573, "auxiliary_loss_mlp": 0.01028618, "balance_loss_clip": 1.05027866, "balance_loss_mlp": 1.02042282, "epoch": 0.5098298563097456, "flos": 27745122844800.0, "grad_norm": 2.0720270153016638, "language_loss": 0.70811248, "learning_rate": 2.033494214974006e-06, "loss": 0.73012441, "num_input_tokens_seen": 91430115, "step": 4240, "time_per_iteration": 2.9319567680358887 }, { "auxiliary_loss_clip": 0.01164531, "auxiliary_loss_mlp": 0.01022574, "balance_loss_clip": 1.05370998, "balance_loss_mlp": 1.01500976, "epoch": 0.5099500992003848, "flos": 21358913011200.0, "grad_norm": 2.117799494457203, "language_loss": 0.84190023, "learning_rate": 2.0327153500509067e-06, "loss": 0.86377132, "num_input_tokens_seen": 91449140, "step": 4241, "time_per_iteration": 2.7772862911224365 }, { "auxiliary_loss_clip": 0.01173763, "auxiliary_loss_mlp": 0.01030238, "balance_loss_clip": 1.05280995, "balance_loss_mlp": 1.02238226, "epoch": 0.5100703420910239, "flos": 19865999013120.0, "grad_norm": 1.8680096257983152, "language_loss": 0.84752023, "learning_rate": 2.031936480164916e-06, "loss": 0.86956024, "num_input_tokens_seen": 91466880, "step": 4242, "time_per_iteration": 2.6871378421783447 }, { "auxiliary_loss_clip": 0.01167527, "auxiliary_loss_mlp": 0.01027053, "balance_loss_clip": 1.05204201, "balance_loss_mlp": 1.01956081, "epoch": 0.5101905849816629, "flos": 24648797635200.0, "grad_norm": 2.3415157975077436, "language_loss": 0.80118859, "learning_rate": 2.0311576054341857e-06, "loss": 0.82313436, "num_input_tokens_seen": 91487495, "step": 4243, "time_per_iteration": 2.8345696926116943 }, { "auxiliary_loss_clip": 0.01181998, "auxiliary_loss_mlp": 0.0103102, "balance_loss_clip": 1.05601811, "balance_loss_mlp": 1.02346492, "epoch": 0.5103108278723021, "flos": 22930076787840.0, "grad_norm": 5.978994866727083, "language_loss": 0.62802029, "learning_rate": 2.0303787259768715e-06, "loss": 0.65015042, "num_input_tokens_seen": 91508395, "step": 4244, "time_per_iteration": 2.7482776641845703 }, { "auxiliary_loss_clip": 0.01175701, "auxiliary_loss_mlp": 0.01023582, "balance_loss_clip": 1.05465269, "balance_loss_mlp": 1.01621211, "epoch": 0.5104310707629411, "flos": 21506613736320.0, "grad_norm": 2.4332180454482626, "language_loss": 0.69454014, "learning_rate": 2.0295998419111294e-06, "loss": 0.71653295, "num_input_tokens_seen": 91525685, "step": 4245, "time_per_iteration": 2.8033463954925537 }, { "auxiliary_loss_clip": 0.01168535, "auxiliary_loss_mlp": 0.01032607, "balance_loss_clip": 1.0545392, "balance_loss_mlp": 1.02487648, "epoch": 0.5105513136535802, "flos": 14903180403840.0, "grad_norm": 3.10277492482178, "language_loss": 0.737993, "learning_rate": 2.028820953355115e-06, "loss": 0.7600044, "num_input_tokens_seen": 91543785, "step": 4246, "time_per_iteration": 2.7192723751068115 }, { "auxiliary_loss_clip": 0.01180005, "auxiliary_loss_mlp": 0.01029416, "balance_loss_clip": 1.05207467, "balance_loss_mlp": 1.0211904, "epoch": 0.5106715565442194, "flos": 22602212421120.0, "grad_norm": 2.3970037775731856, "language_loss": 0.78910315, "learning_rate": 2.0280420604269834e-06, "loss": 0.8111974, "num_input_tokens_seen": 91563325, "step": 4247, "time_per_iteration": 2.7706050872802734 }, { "auxiliary_loss_clip": 0.0107545, "auxiliary_loss_mlp": 0.01002883, "balance_loss_clip": 1.01673388, "balance_loss_mlp": 1.00175655, "epoch": 0.5107917994348584, "flos": 71027645558400.0, "grad_norm": 0.7094820190694311, "language_loss": 0.58967459, "learning_rate": 2.027263163244895e-06, "loss": 0.6104579, "num_input_tokens_seen": 91632450, "step": 4248, "time_per_iteration": 3.3926074504852295 }, { "auxiliary_loss_clip": 0.01174908, "auxiliary_loss_mlp": 0.01028793, "balance_loss_clip": 1.05306363, "balance_loss_mlp": 1.02180123, "epoch": 0.5109120423254975, "flos": 24827416992000.0, "grad_norm": 1.6708753823756144, "language_loss": 0.74497843, "learning_rate": 2.026484261927005e-06, "loss": 0.76701546, "num_input_tokens_seen": 91651945, "step": 4249, "time_per_iteration": 2.7094151973724365 }, { "auxiliary_loss_clip": 0.01184384, "auxiliary_loss_mlp": 0.0103494, "balance_loss_clip": 1.05601454, "balance_loss_mlp": 1.02662563, "epoch": 0.5110322852161366, "flos": 21247661612160.0, "grad_norm": 2.1114100984749493, "language_loss": 0.74200952, "learning_rate": 2.025705356591475e-06, "loss": 0.76420277, "num_input_tokens_seen": 91669635, "step": 4250, "time_per_iteration": 2.748627185821533 }, { "auxiliary_loss_clip": 0.01072016, "auxiliary_loss_mlp": 0.01047114, "balance_loss_clip": 1.01725078, "balance_loss_mlp": 1.00215232, "epoch": 0.5111525281067757, "flos": 66457114358400.0, "grad_norm": 0.7614782166867679, "language_loss": 0.5793556, "learning_rate": 2.024926447356462e-06, "loss": 0.60054696, "num_input_tokens_seen": 91731920, "step": 4251, "time_per_iteration": 3.2116634845733643 }, { "auxiliary_loss_clip": 0.01180602, "auxiliary_loss_mlp": 0.01030929, "balance_loss_clip": 1.05460119, "balance_loss_mlp": 1.02238762, "epoch": 0.5112727709974147, "flos": 14866731077760.0, "grad_norm": 1.9894970942016292, "language_loss": 0.78308344, "learning_rate": 2.024147534340127e-06, "loss": 0.80519879, "num_input_tokens_seen": 91749780, "step": 4252, "time_per_iteration": 2.71696138381958 }, { "auxiliary_loss_clip": 0.01170698, "auxiliary_loss_mlp": 0.0102194, "balance_loss_clip": 1.04998195, "balance_loss_mlp": 1.01447773, "epoch": 0.5113930138880539, "flos": 21177600134400.0, "grad_norm": 1.7982256792545945, "language_loss": 0.79795849, "learning_rate": 2.02336861766063e-06, "loss": 0.81988478, "num_input_tokens_seen": 91768840, "step": 4253, "time_per_iteration": 2.8885271549224854 }, { "auxiliary_loss_clip": 0.011857, "auxiliary_loss_mlp": 0.01027256, "balance_loss_clip": 1.05549169, "balance_loss_mlp": 1.01926315, "epoch": 0.511513256778693, "flos": 20409111630720.0, "grad_norm": 1.9068003696709575, "language_loss": 0.78840256, "learning_rate": 2.0225896974361327e-06, "loss": 0.81053215, "num_input_tokens_seen": 91788945, "step": 4254, "time_per_iteration": 2.714080572128296 }, { "auxiliary_loss_clip": 0.0107316, "auxiliary_loss_mlp": 0.01003668, "balance_loss_clip": 1.01715386, "balance_loss_mlp": 1.00238073, "epoch": 0.511633499669332, "flos": 69879975131520.0, "grad_norm": 0.8603544236546087, "language_loss": 0.59908998, "learning_rate": 2.0218107737847962e-06, "loss": 0.61985826, "num_input_tokens_seen": 91850990, "step": 4255, "time_per_iteration": 3.3478965759277344 }, { "auxiliary_loss_clip": 0.01178945, "auxiliary_loss_mlp": 0.0102265, "balance_loss_clip": 1.05328798, "balance_loss_mlp": 1.01474607, "epoch": 0.5117537425599712, "flos": 24097855852800.0, "grad_norm": 2.3521285571326365, "language_loss": 0.74806249, "learning_rate": 2.0210318468247826e-06, "loss": 0.77007842, "num_input_tokens_seen": 91869960, "step": 4256, "time_per_iteration": 2.7372090816497803 }, { "auxiliary_loss_clip": 0.01172431, "auxiliary_loss_mlp": 0.0102583, "balance_loss_clip": 1.05014443, "balance_loss_mlp": 1.0182842, "epoch": 0.5118739854506102, "flos": 20959550622720.0, "grad_norm": 1.9016762635598752, "language_loss": 0.82069659, "learning_rate": 2.020252916674255e-06, "loss": 0.84267926, "num_input_tokens_seen": 91889075, "step": 4257, "time_per_iteration": 3.7452237606048584 }, { "auxiliary_loss_clip": 0.01178808, "auxiliary_loss_mlp": 0.01026294, "balance_loss_clip": 1.05201948, "balance_loss_mlp": 1.01831281, "epoch": 0.5119942283412493, "flos": 17457326749440.0, "grad_norm": 2.2632047610624864, "language_loss": 0.81190479, "learning_rate": 2.019473983451375e-06, "loss": 0.83395576, "num_input_tokens_seen": 91907495, "step": 4258, "time_per_iteration": 2.707838773727417 }, { "auxiliary_loss_clip": 0.01178491, "auxiliary_loss_mlp": 0.01029684, "balance_loss_clip": 1.05385709, "balance_loss_mlp": 1.02176881, "epoch": 0.5121144712318885, "flos": 21066743784960.0, "grad_norm": 1.845051782027591, "language_loss": 0.71368003, "learning_rate": 2.0186950472743076e-06, "loss": 0.73576176, "num_input_tokens_seen": 91927400, "step": 4259, "time_per_iteration": 3.857825517654419 }, { "auxiliary_loss_clip": 0.01179258, "auxiliary_loss_mlp": 0.01027086, "balance_loss_clip": 1.05229616, "balance_loss_mlp": 1.0189743, "epoch": 0.5122347141225275, "flos": 19860791541120.0, "grad_norm": 1.9118554566329242, "language_loss": 0.73856562, "learning_rate": 2.0179161082612162e-06, "loss": 0.76062906, "num_input_tokens_seen": 91946790, "step": 4260, "time_per_iteration": 2.6742560863494873 }, { "auxiliary_loss_clip": 0.01172551, "auxiliary_loss_mlp": 0.01023652, "balance_loss_clip": 1.05369985, "balance_loss_mlp": 1.01592112, "epoch": 0.5123549570131666, "flos": 22528487756160.0, "grad_norm": 2.371422932043145, "language_loss": 0.72804958, "learning_rate": 2.017137166530266e-06, "loss": 0.75001156, "num_input_tokens_seen": 91966325, "step": 4261, "time_per_iteration": 3.8481225967407227 }, { "auxiliary_loss_clip": 0.0117876, "auxiliary_loss_mlp": 0.01027548, "balance_loss_clip": 1.05315852, "balance_loss_mlp": 1.01963258, "epoch": 0.5124751999038056, "flos": 20333375804160.0, "grad_norm": 2.0506388294818105, "language_loss": 0.80389708, "learning_rate": 2.0163582221996213e-06, "loss": 0.82596016, "num_input_tokens_seen": 91984700, "step": 4262, "time_per_iteration": 2.8062350749969482 }, { "auxiliary_loss_clip": 0.01174239, "auxiliary_loss_mlp": 0.01026944, "balance_loss_clip": 1.05305326, "balance_loss_mlp": 1.01863539, "epoch": 0.5125954427944448, "flos": 39785970211200.0, "grad_norm": 2.175087175289031, "language_loss": 0.68426514, "learning_rate": 2.015579275387446e-06, "loss": 0.70627701, "num_input_tokens_seen": 92010020, "step": 4263, "time_per_iteration": 2.895540952682495 }, { "auxiliary_loss_clip": 0.01166907, "auxiliary_loss_mlp": 0.01030388, "balance_loss_clip": 1.05375862, "balance_loss_mlp": 1.02236509, "epoch": 0.5127156856850839, "flos": 29205394358400.0, "grad_norm": 1.8652975278490345, "language_loss": 0.68813288, "learning_rate": 2.0148003262119085e-06, "loss": 0.71010578, "num_input_tokens_seen": 92030990, "step": 4264, "time_per_iteration": 2.7866053581237793 }, { "auxiliary_loss_clip": 0.01173355, "auxiliary_loss_mlp": 0.010261, "balance_loss_clip": 1.05488658, "balance_loss_mlp": 1.01795244, "epoch": 0.5128359285757229, "flos": 13553693412480.0, "grad_norm": 2.135458166262933, "language_loss": 0.76370847, "learning_rate": 2.0140213747911728e-06, "loss": 0.785703, "num_input_tokens_seen": 92049525, "step": 4265, "time_per_iteration": 3.607282876968384 }, { "auxiliary_loss_clip": 0.01172873, "auxiliary_loss_mlp": 0.0102924, "balance_loss_clip": 1.05494142, "balance_loss_mlp": 1.02115774, "epoch": 0.5129561714663621, "flos": 25192089820800.0, "grad_norm": 1.9487982307494671, "language_loss": 0.80482531, "learning_rate": 2.013242421243406e-06, "loss": 0.82684642, "num_input_tokens_seen": 92068430, "step": 4266, "time_per_iteration": 2.834918737411499 }, { "auxiliary_loss_clip": 0.01168019, "auxiliary_loss_mlp": 0.01031643, "balance_loss_clip": 1.05333841, "balance_loss_mlp": 1.02401614, "epoch": 0.5130764143570011, "flos": 18150223080960.0, "grad_norm": 1.7190932807245283, "language_loss": 0.79013807, "learning_rate": 2.012463465686774e-06, "loss": 0.81213462, "num_input_tokens_seen": 92088180, "step": 4267, "time_per_iteration": 2.82918643951416 }, { "auxiliary_loss_clip": 0.01078912, "auxiliary_loss_mlp": 0.0101121, "balance_loss_clip": 1.02517319, "balance_loss_mlp": 1.00983322, "epoch": 0.5131966572476402, "flos": 59794896418560.0, "grad_norm": 0.8234972718608171, "language_loss": 0.54762501, "learning_rate": 2.0116845082394446e-06, "loss": 0.56852621, "num_input_tokens_seen": 92153015, "step": 4268, "time_per_iteration": 3.3487939834594727 }, { "auxiliary_loss_clip": 0.01180723, "auxiliary_loss_mlp": 0.01025413, "balance_loss_clip": 1.05227542, "balance_loss_mlp": 1.01748538, "epoch": 0.5133169001382794, "flos": 18515219132160.0, "grad_norm": 2.5945383317009543, "language_loss": 0.78690237, "learning_rate": 2.0109055490195836e-06, "loss": 0.80896378, "num_input_tokens_seen": 92171470, "step": 4269, "time_per_iteration": 2.718266725540161 }, { "auxiliary_loss_clip": 0.01170212, "auxiliary_loss_mlp": 0.01033732, "balance_loss_clip": 1.05160582, "balance_loss_mlp": 1.02616215, "epoch": 0.5134371430289184, "flos": 15523537219200.0, "grad_norm": 2.0610334089423157, "language_loss": 0.64032626, "learning_rate": 2.0101265881453605e-06, "loss": 0.66236567, "num_input_tokens_seen": 92189945, "step": 4270, "time_per_iteration": 2.801860809326172 }, { "auxiliary_loss_clip": 0.01170766, "auxiliary_loss_mlp": 0.01031898, "balance_loss_clip": 1.05714297, "balance_loss_mlp": 1.02402449, "epoch": 0.5135573859195575, "flos": 21433786911360.0, "grad_norm": 2.094309928783237, "language_loss": 0.78213376, "learning_rate": 2.009347625734941e-06, "loss": 0.80416036, "num_input_tokens_seen": 92209855, "step": 4271, "time_per_iteration": 2.7673826217651367 }, { "auxiliary_loss_clip": 0.01185221, "auxiliary_loss_mlp": 0.01029089, "balance_loss_clip": 1.05680108, "balance_loss_mlp": 1.02146006, "epoch": 0.5136776288101966, "flos": 17712651600000.0, "grad_norm": 2.5242279006314625, "language_loss": 0.75027001, "learning_rate": 2.0085686619064954e-06, "loss": 0.77241313, "num_input_tokens_seen": 92226295, "step": 4272, "time_per_iteration": 2.6770834922790527 }, { "auxiliary_loss_clip": 0.01182889, "auxiliary_loss_mlp": 0.01029037, "balance_loss_clip": 1.05482686, "balance_loss_mlp": 1.02069235, "epoch": 0.5137978717008357, "flos": 16581680997120.0, "grad_norm": 2.329223939246576, "language_loss": 0.82951403, "learning_rate": 2.00778969677819e-06, "loss": 0.85163331, "num_input_tokens_seen": 92243330, "step": 4273, "time_per_iteration": 2.729029893875122 }, { "auxiliary_loss_clip": 0.01172527, "auxiliary_loss_mlp": 0.01028876, "balance_loss_clip": 1.05273533, "balance_loss_mlp": 1.02102637, "epoch": 0.5139181145914747, "flos": 20668243322880.0, "grad_norm": 1.7590519383175498, "language_loss": 0.63921529, "learning_rate": 2.0070107304681934e-06, "loss": 0.66122937, "num_input_tokens_seen": 92262285, "step": 4274, "time_per_iteration": 2.6523337364196777 }, { "auxiliary_loss_clip": 0.01169868, "auxiliary_loss_mlp": 0.01024253, "balance_loss_clip": 1.05442929, "balance_loss_mlp": 1.016469, "epoch": 0.5140383574821139, "flos": 32926996546560.0, "grad_norm": 2.0457605342476097, "language_loss": 0.78219259, "learning_rate": 2.006231763094675e-06, "loss": 0.80413383, "num_input_tokens_seen": 92283305, "step": 4275, "time_per_iteration": 2.867539882659912 }, { "auxiliary_loss_clip": 0.01173278, "auxiliary_loss_mlp": 0.01027666, "balance_loss_clip": 1.05670094, "balance_loss_mlp": 1.01997721, "epoch": 0.514158600372753, "flos": 19537093152000.0, "grad_norm": 2.0143620472985817, "language_loss": 0.87046266, "learning_rate": 2.0054527947758027e-06, "loss": 0.89247215, "num_input_tokens_seen": 92302105, "step": 4276, "time_per_iteration": 2.7207067012786865 }, { "auxiliary_loss_clip": 0.01074454, "auxiliary_loss_mlp": 0.01003105, "balance_loss_clip": 1.01679134, "balance_loss_mlp": 1.00180531, "epoch": 0.514278843263392, "flos": 62523855279360.0, "grad_norm": 0.7280051452680235, "language_loss": 0.55863237, "learning_rate": 2.004673825629746e-06, "loss": 0.57940799, "num_input_tokens_seen": 92362885, "step": 4277, "time_per_iteration": 3.224771738052368 }, { "auxiliary_loss_clip": 0.01169427, "auxiliary_loss_mlp": 0.01027691, "balance_loss_clip": 1.0520041, "balance_loss_mlp": 1.01969171, "epoch": 0.5143990861540312, "flos": 25882328545920.0, "grad_norm": 2.6395171983403554, "language_loss": 0.72479856, "learning_rate": 2.0038948557746744e-06, "loss": 0.74676973, "num_input_tokens_seen": 92384740, "step": 4278, "time_per_iteration": 2.7976973056793213 }, { "auxiliary_loss_clip": 0.01172862, "auxiliary_loss_mlp": 0.01025836, "balance_loss_clip": 1.05191505, "balance_loss_mlp": 1.01833725, "epoch": 0.5145193290446702, "flos": 23330660238720.0, "grad_norm": 1.7701884461025588, "language_loss": 0.75222361, "learning_rate": 2.0031158853287558e-06, "loss": 0.77421057, "num_input_tokens_seen": 92405175, "step": 4279, "time_per_iteration": 2.7131192684173584 }, { "auxiliary_loss_clip": 0.01173601, "auxiliary_loss_mlp": 0.01032303, "balance_loss_clip": 1.05379844, "balance_loss_mlp": 1.02414334, "epoch": 0.5146395719353093, "flos": 22856603518080.0, "grad_norm": 3.2888906990969917, "language_loss": 0.70708001, "learning_rate": 2.0023369144101593e-06, "loss": 0.72913909, "num_input_tokens_seen": 92423345, "step": 4280, "time_per_iteration": 2.7662057876586914 }, { "auxiliary_loss_clip": 0.01168423, "auxiliary_loss_mlp": 0.0103036, "balance_loss_clip": 1.05348206, "balance_loss_mlp": 1.02271235, "epoch": 0.5147598148259485, "flos": 26391577616640.0, "grad_norm": 1.8300935565735053, "language_loss": 0.77163494, "learning_rate": 2.0015579431370555e-06, "loss": 0.79362285, "num_input_tokens_seen": 92445025, "step": 4281, "time_per_iteration": 2.7991724014282227 }, { "auxiliary_loss_clip": 0.01175668, "auxiliary_loss_mlp": 0.01025057, "balance_loss_clip": 1.05366087, "balance_loss_mlp": 1.01783013, "epoch": 0.5148800577165875, "flos": 29965694561280.0, "grad_norm": 2.122619518478832, "language_loss": 0.69899821, "learning_rate": 2.000778971627612e-06, "loss": 0.72100544, "num_input_tokens_seen": 92464490, "step": 4282, "time_per_iteration": 2.847773551940918 }, { "auxiliary_loss_clip": 0.01172786, "auxiliary_loss_mlp": 0.01033164, "balance_loss_clip": 1.05612624, "balance_loss_mlp": 1.02535903, "epoch": 0.5150003006072266, "flos": 17931383470080.0, "grad_norm": 2.9032085476761234, "language_loss": 0.90370584, "learning_rate": 2e-06, "loss": 0.9257654, "num_input_tokens_seen": 92482085, "step": 4283, "time_per_iteration": 3.7014904022216797 }, { "auxiliary_loss_clip": 0.01178712, "auxiliary_loss_mlp": 0.01023683, "balance_loss_clip": 1.0536921, "balance_loss_mlp": 1.01616716, "epoch": 0.5151205434978657, "flos": 18478733892480.0, "grad_norm": 1.804889116778494, "language_loss": 0.85537225, "learning_rate": 1.9992210283723878e-06, "loss": 0.87739623, "num_input_tokens_seen": 92499325, "step": 4284, "time_per_iteration": 3.831207513809204 }, { "auxiliary_loss_clip": 0.01177577, "auxiliary_loss_mlp": 0.01024623, "balance_loss_clip": 1.0531776, "balance_loss_mlp": 1.01704717, "epoch": 0.5152407863885048, "flos": 25341263003520.0, "grad_norm": 1.5532609286984826, "language_loss": 0.79488528, "learning_rate": 1.9984420568629448e-06, "loss": 0.81690723, "num_input_tokens_seen": 92522090, "step": 4285, "time_per_iteration": 2.8278844356536865 }, { "auxiliary_loss_clip": 0.01178965, "auxiliary_loss_mlp": 0.01024903, "balance_loss_clip": 1.05350471, "balance_loss_mlp": 1.01773238, "epoch": 0.5153610292791438, "flos": 18329740277760.0, "grad_norm": 2.1157323568222473, "language_loss": 0.78600341, "learning_rate": 1.9976630855898405e-06, "loss": 0.80804205, "num_input_tokens_seen": 92539845, "step": 4286, "time_per_iteration": 3.7750184535980225 }, { "auxiliary_loss_clip": 0.01168901, "auxiliary_loss_mlp": 0.01025038, "balance_loss_clip": 1.04928112, "balance_loss_mlp": 1.01761103, "epoch": 0.515481272169783, "flos": 30409945971840.0, "grad_norm": 2.1793417682640754, "language_loss": 0.74864936, "learning_rate": 1.9968841146712445e-06, "loss": 0.77058876, "num_input_tokens_seen": 92559460, "step": 4287, "time_per_iteration": 2.7994749546051025 }, { "auxiliary_loss_clip": 0.01167187, "auxiliary_loss_mlp": 0.01059533, "balance_loss_clip": 1.05628633, "balance_loss_mlp": 1.02232671, "epoch": 0.5156015150604221, "flos": 23037305863680.0, "grad_norm": 1.657331183664844, "language_loss": 0.71793205, "learning_rate": 1.996105144225326e-06, "loss": 0.74019921, "num_input_tokens_seen": 92579695, "step": 4288, "time_per_iteration": 2.8850622177124023 }, { "auxiliary_loss_clip": 0.01175323, "auxiliary_loss_mlp": 0.01036028, "balance_loss_clip": 1.0530417, "balance_loss_mlp": 1.02788615, "epoch": 0.5157217579510611, "flos": 17858556645120.0, "grad_norm": 1.9716080736820536, "language_loss": 0.79255468, "learning_rate": 1.995326174370254e-06, "loss": 0.81466818, "num_input_tokens_seen": 92598795, "step": 4289, "time_per_iteration": 2.6034655570983887 }, { "auxiliary_loss_clip": 0.01171741, "auxiliary_loss_mlp": 0.01056981, "balance_loss_clip": 1.04960179, "balance_loss_mlp": 1.01957774, "epoch": 0.5158420008417003, "flos": 19171486569600.0, "grad_norm": 1.9680635879360062, "language_loss": 0.7328881, "learning_rate": 1.994547205224197e-06, "loss": 0.75517535, "num_input_tokens_seen": 92617700, "step": 4290, "time_per_iteration": 3.5171868801116943 }, { "auxiliary_loss_clip": 0.01169081, "auxiliary_loss_mlp": 0.01027352, "balance_loss_clip": 1.05340433, "balance_loss_mlp": 1.01965117, "epoch": 0.5159622437323393, "flos": 22419534827520.0, "grad_norm": 1.9909372373954057, "language_loss": 0.67787707, "learning_rate": 1.993768236905325e-06, "loss": 0.69984144, "num_input_tokens_seen": 92638370, "step": 4291, "time_per_iteration": 2.705660820007324 }, { "auxiliary_loss_clip": 0.01171146, "auxiliary_loss_mlp": 0.01025376, "balance_loss_clip": 1.05255806, "balance_loss_mlp": 1.01747799, "epoch": 0.5160824866229784, "flos": 24603010773120.0, "grad_norm": 2.6164087590011387, "language_loss": 0.66664779, "learning_rate": 1.992989269531807e-06, "loss": 0.68861294, "num_input_tokens_seen": 92657180, "step": 4292, "time_per_iteration": 2.746319532394409 }, { "auxiliary_loss_clip": 0.01174128, "auxiliary_loss_mlp": 0.01029604, "balance_loss_clip": 1.05225456, "balance_loss_mlp": 1.020926, "epoch": 0.5162027295136175, "flos": 18002737837440.0, "grad_norm": 2.9883052785030157, "language_loss": 0.6835106, "learning_rate": 1.99221030322181e-06, "loss": 0.70554793, "num_input_tokens_seen": 92673985, "step": 4293, "time_per_iteration": 2.637471914291382 }, { "auxiliary_loss_clip": 0.01175792, "auxiliary_loss_mlp": 0.01023933, "balance_loss_clip": 1.05230927, "balance_loss_mlp": 1.01685512, "epoch": 0.5163229724042566, "flos": 27344611221120.0, "grad_norm": 1.7290765856324057, "language_loss": 0.80757445, "learning_rate": 1.991431338093505e-06, "loss": 0.82957172, "num_input_tokens_seen": 92696340, "step": 4294, "time_per_iteration": 2.787691354751587 }, { "auxiliary_loss_clip": 0.0117278, "auxiliary_loss_mlp": 0.01024282, "balance_loss_clip": 1.05486012, "balance_loss_mlp": 1.01655138, "epoch": 0.5164432152948957, "flos": 21762764599680.0, "grad_norm": 1.7550190512977357, "language_loss": 0.79274452, "learning_rate": 1.9906523742650587e-06, "loss": 0.81471509, "num_input_tokens_seen": 92715200, "step": 4295, "time_per_iteration": 2.7559924125671387 }, { "auxiliary_loss_clip": 0.01178993, "auxiliary_loss_mlp": 0.01032264, "balance_loss_clip": 1.05081654, "balance_loss_mlp": 1.02429461, "epoch": 0.5165634581855347, "flos": 25550334115200.0, "grad_norm": 2.003967401515992, "language_loss": 0.77331865, "learning_rate": 1.9898734118546397e-06, "loss": 0.7954312, "num_input_tokens_seen": 92735150, "step": 4296, "time_per_iteration": 2.7274866104125977 }, { "auxiliary_loss_clip": 0.01168449, "auxiliary_loss_mlp": 0.01027193, "balance_loss_clip": 1.05520964, "balance_loss_mlp": 1.01950419, "epoch": 0.5166837010761739, "flos": 19901191363200.0, "grad_norm": 1.945386274385366, "language_loss": 0.80205554, "learning_rate": 1.989094450980416e-06, "loss": 0.82401192, "num_input_tokens_seen": 92755250, "step": 4297, "time_per_iteration": 2.837460994720459 }, { "auxiliary_loss_clip": 0.01176847, "auxiliary_loss_mlp": 0.01023351, "balance_loss_clip": 1.05378747, "balance_loss_mlp": 1.01584053, "epoch": 0.516803943966813, "flos": 26646076454400.0, "grad_norm": 1.8707439693710455, "language_loss": 0.76722151, "learning_rate": 1.9883154917605556e-06, "loss": 0.78922349, "num_input_tokens_seen": 92774460, "step": 4298, "time_per_iteration": 2.785989284515381 }, { "auxiliary_loss_clip": 0.0117609, "auxiliary_loss_mlp": 0.01025166, "balance_loss_clip": 1.0502249, "balance_loss_mlp": 1.01755428, "epoch": 0.516924186857452, "flos": 19682854542720.0, "grad_norm": 2.2439276720748964, "language_loss": 0.83324295, "learning_rate": 1.9875365343132262e-06, "loss": 0.85525554, "num_input_tokens_seen": 92791580, "step": 4299, "time_per_iteration": 2.7812702655792236 }, { "auxiliary_loss_clip": 0.01176687, "auxiliary_loss_mlp": 0.01059403, "balance_loss_clip": 1.0544728, "balance_loss_mlp": 1.02212739, "epoch": 0.5170444297480912, "flos": 15956583586560.0, "grad_norm": 2.320536602856899, "language_loss": 0.8483901, "learning_rate": 1.9867575787565946e-06, "loss": 0.87075102, "num_input_tokens_seen": 92806240, "step": 4300, "time_per_iteration": 2.6791281700134277 }, { "auxiliary_loss_clip": 0.01176855, "auxiliary_loss_mlp": 0.01027272, "balance_loss_clip": 1.05319357, "balance_loss_mlp": 1.01941562, "epoch": 0.5171646726387302, "flos": 14174157968640.0, "grad_norm": 2.0375454920471747, "language_loss": 0.86023551, "learning_rate": 1.9859786252088275e-06, "loss": 0.88227677, "num_input_tokens_seen": 92823420, "step": 4301, "time_per_iteration": 2.6719682216644287 }, { "auxiliary_loss_clip": 0.01172046, "auxiliary_loss_mlp": 0.01025533, "balance_loss_clip": 1.05103683, "balance_loss_mlp": 1.01706934, "epoch": 0.5172849155293693, "flos": 23578550974080.0, "grad_norm": 2.708866361820843, "language_loss": 0.66930419, "learning_rate": 1.9851996737880914e-06, "loss": 0.69128001, "num_input_tokens_seen": 92838605, "step": 4302, "time_per_iteration": 2.728193521499634 }, { "auxiliary_loss_clip": 0.01180451, "auxiliary_loss_mlp": 0.0103011, "balance_loss_clip": 1.05277467, "balance_loss_mlp": 1.02137184, "epoch": 0.5174051584200084, "flos": 14283541860480.0, "grad_norm": 2.0161808796556087, "language_loss": 0.74145567, "learning_rate": 1.9844207246125537e-06, "loss": 0.76356125, "num_input_tokens_seen": 92855185, "step": 4303, "time_per_iteration": 2.694002151489258 }, { "auxiliary_loss_clip": 0.01172998, "auxiliary_loss_mlp": 0.01023729, "balance_loss_clip": 1.05409265, "balance_loss_mlp": 1.01610541, "epoch": 0.5175254013106475, "flos": 37889384192640.0, "grad_norm": 2.1544881084726293, "language_loss": 0.68219876, "learning_rate": 1.983641777800379e-06, "loss": 0.70416605, "num_input_tokens_seen": 92877830, "step": 4304, "time_per_iteration": 3.0130414962768555 }, { "auxiliary_loss_clip": 0.01075103, "auxiliary_loss_mlp": 0.01002576, "balance_loss_clip": 1.01366138, "balance_loss_mlp": 1.00127649, "epoch": 0.5176456442012866, "flos": 68549737829760.0, "grad_norm": 0.7412915447048277, "language_loss": 0.58749378, "learning_rate": 1.9828628334697343e-06, "loss": 0.60827059, "num_input_tokens_seen": 92945040, "step": 4305, "time_per_iteration": 3.4983296394348145 }, { "auxiliary_loss_clip": 0.01075094, "auxiliary_loss_mlp": 0.01005341, "balance_loss_clip": 1.01319957, "balance_loss_mlp": 1.00414932, "epoch": 0.5177658870919257, "flos": 64084137235200.0, "grad_norm": 0.76293266421102, "language_loss": 0.5466243, "learning_rate": 1.982083891738784e-06, "loss": 0.56742865, "num_input_tokens_seen": 93005910, "step": 4306, "time_per_iteration": 3.325519561767578 }, { "auxiliary_loss_clip": 0.0116823, "auxiliary_loss_mlp": 0.0102151, "balance_loss_clip": 1.05345368, "balance_loss_mlp": 1.01426458, "epoch": 0.5178861299825648, "flos": 26651248012800.0, "grad_norm": 1.4840978600591292, "language_loss": 0.82911694, "learning_rate": 1.9813049527256923e-06, "loss": 0.85101432, "num_input_tokens_seen": 93026305, "step": 4307, "time_per_iteration": 2.9126970767974854 }, { "auxiliary_loss_clip": 0.0117075, "auxiliary_loss_mlp": 0.01031128, "balance_loss_clip": 1.0527463, "balance_loss_mlp": 1.02316475, "epoch": 0.5180063728732038, "flos": 17931886260480.0, "grad_norm": 2.753322651694152, "language_loss": 0.82105899, "learning_rate": 1.9805260165486252e-06, "loss": 0.84307778, "num_input_tokens_seen": 93045675, "step": 4308, "time_per_iteration": 2.768470525741577 }, { "auxiliary_loss_clip": 0.01175096, "auxiliary_loss_mlp": 0.01024188, "balance_loss_clip": 1.05361247, "balance_loss_mlp": 1.01657653, "epoch": 0.518126615763843, "flos": 19500895221120.0, "grad_norm": 2.0515154866844902, "language_loss": 0.85876667, "learning_rate": 1.9797470833257457e-06, "loss": 0.88075948, "num_input_tokens_seen": 93065375, "step": 4309, "time_per_iteration": 3.630643367767334 }, { "auxiliary_loss_clip": 0.01176932, "auxiliary_loss_mlp": 0.0102888, "balance_loss_clip": 1.05393112, "balance_loss_mlp": 1.02085125, "epoch": 0.5182468586544821, "flos": 20704082117760.0, "grad_norm": 4.042982426459342, "language_loss": 0.77500367, "learning_rate": 1.9789681531752177e-06, "loss": 0.7970618, "num_input_tokens_seen": 93085595, "step": 4310, "time_per_iteration": 3.8774075508117676 }, { "auxiliary_loss_clip": 0.01166243, "auxiliary_loss_mlp": 0.01025887, "balance_loss_clip": 1.0523293, "balance_loss_mlp": 1.01888657, "epoch": 0.5183671015451211, "flos": 23112107936640.0, "grad_norm": 48.522637261961336, "language_loss": 0.7268939, "learning_rate": 1.978189226215204e-06, "loss": 0.74881518, "num_input_tokens_seen": 93106140, "step": 4311, "time_per_iteration": 2.7780911922454834 }, { "auxiliary_loss_clip": 0.01178002, "auxiliary_loss_mlp": 0.01024959, "balance_loss_clip": 1.05194175, "balance_loss_mlp": 1.01731193, "epoch": 0.5184873444357603, "flos": 17597090568960.0, "grad_norm": 2.155533427833921, "language_loss": 0.77194786, "learning_rate": 1.9774103025638675e-06, "loss": 0.7939775, "num_input_tokens_seen": 93124265, "step": 4312, "time_per_iteration": 3.5665204524993896 }, { "auxiliary_loss_clip": 0.01170516, "auxiliary_loss_mlp": 0.01030444, "balance_loss_clip": 1.05723786, "balance_loss_mlp": 1.02246881, "epoch": 0.5186075873263993, "flos": 24936800883840.0, "grad_norm": 1.5021551041774028, "language_loss": 0.76211572, "learning_rate": 1.9766313823393696e-06, "loss": 0.78412533, "num_input_tokens_seen": 93145130, "step": 4313, "time_per_iteration": 2.823291063308716 }, { "auxiliary_loss_clip": 0.01165397, "auxiliary_loss_mlp": 0.01023469, "balance_loss_clip": 1.0528177, "balance_loss_mlp": 1.01557136, "epoch": 0.5187278302170384, "flos": 15190106244480.0, "grad_norm": 3.2936007693277185, "language_loss": 0.69087768, "learning_rate": 1.975852465659873e-06, "loss": 0.71276629, "num_input_tokens_seen": 93161110, "step": 4314, "time_per_iteration": 2.779636859893799 }, { "auxiliary_loss_clip": 0.01178584, "auxiliary_loss_mlp": 0.01028723, "balance_loss_clip": 1.05368257, "balance_loss_mlp": 1.02110589, "epoch": 0.5188480731076776, "flos": 25009412227200.0, "grad_norm": 2.5730743468162562, "language_loss": 0.69673061, "learning_rate": 1.9750735526435377e-06, "loss": 0.7188037, "num_input_tokens_seen": 93178055, "step": 4315, "time_per_iteration": 2.7957851886749268 }, { "auxiliary_loss_clip": 0.01168323, "auxiliary_loss_mlp": 0.01027199, "balance_loss_clip": 1.05163908, "balance_loss_mlp": 1.01980531, "epoch": 0.5189683159983166, "flos": 24790141653120.0, "grad_norm": 6.788740409477078, "language_loss": 0.79751492, "learning_rate": 1.974294643408525e-06, "loss": 0.81947011, "num_input_tokens_seen": 93195850, "step": 4316, "time_per_iteration": 3.6191487312316895 }, { "auxiliary_loss_clip": 0.01180496, "auxiliary_loss_mlp": 0.01027273, "balance_loss_clip": 1.05245376, "balance_loss_mlp": 1.0197506, "epoch": 0.5190885588889557, "flos": 24754266944640.0, "grad_norm": 2.1504801693367854, "language_loss": 0.66981924, "learning_rate": 1.9735157380729947e-06, "loss": 0.69189692, "num_input_tokens_seen": 93216260, "step": 4317, "time_per_iteration": 2.7324070930480957 }, { "auxiliary_loss_clip": 0.01173353, "auxiliary_loss_mlp": 0.01025878, "balance_loss_clip": 1.05056596, "balance_loss_mlp": 1.01792717, "epoch": 0.5192088017795948, "flos": 24712646060160.0, "grad_norm": 1.9456554902681045, "language_loss": 0.84179074, "learning_rate": 1.9727368367551053e-06, "loss": 0.863783, "num_input_tokens_seen": 93234810, "step": 4318, "time_per_iteration": 2.700373649597168 }, { "auxiliary_loss_clip": 0.01163788, "auxiliary_loss_mlp": 0.01023181, "balance_loss_clip": 1.05162978, "balance_loss_mlp": 1.01548004, "epoch": 0.5193290446702339, "flos": 27229588894080.0, "grad_norm": 1.9277919230412681, "language_loss": 0.68311775, "learning_rate": 1.9719579395730164e-06, "loss": 0.70498747, "num_input_tokens_seen": 93254185, "step": 4319, "time_per_iteration": 2.8271946907043457 }, { "auxiliary_loss_clip": 0.01180946, "auxiliary_loss_mlp": 0.01026909, "balance_loss_clip": 1.05472374, "balance_loss_mlp": 1.01904082, "epoch": 0.5194492875608729, "flos": 11473352392320.0, "grad_norm": 2.119582495492831, "language_loss": 0.93480206, "learning_rate": 1.9711790466448854e-06, "loss": 0.95688063, "num_input_tokens_seen": 93268205, "step": 4320, "time_per_iteration": 2.7451589107513428 }, { "auxiliary_loss_clip": 0.01171691, "auxiliary_loss_mlp": 0.01031658, "balance_loss_clip": 1.05367315, "balance_loss_mlp": 1.02356982, "epoch": 0.5195695304515121, "flos": 20338906498560.0, "grad_norm": 4.262428042942364, "language_loss": 0.7152437, "learning_rate": 1.9704001580888704e-06, "loss": 0.73727721, "num_input_tokens_seen": 93286945, "step": 4321, "time_per_iteration": 2.809828519821167 }, { "auxiliary_loss_clip": 0.01167091, "auxiliary_loss_mlp": 0.01054374, "balance_loss_clip": 1.04999256, "balance_loss_mlp": 1.01994014, "epoch": 0.5196897733421512, "flos": 20048317470720.0, "grad_norm": 2.5684730234223117, "language_loss": 0.86458063, "learning_rate": 1.9696212740231283e-06, "loss": 0.88679528, "num_input_tokens_seen": 93305595, "step": 4322, "time_per_iteration": 2.8245694637298584 }, { "auxiliary_loss_clip": 0.01181389, "auxiliary_loss_mlp": 0.0102954, "balance_loss_clip": 1.05196428, "balance_loss_mlp": 1.02087307, "epoch": 0.5198100162327902, "flos": 23805507058560.0, "grad_norm": 1.9783601710798093, "language_loss": 0.82060444, "learning_rate": 1.9688423945658146e-06, "loss": 0.84271371, "num_input_tokens_seen": 93326460, "step": 4323, "time_per_iteration": 2.7585341930389404 }, { "auxiliary_loss_clip": 0.01161274, "auxiliary_loss_mlp": 0.01029623, "balance_loss_clip": 1.05163693, "balance_loss_mlp": 1.02147555, "epoch": 0.5199302591234293, "flos": 24023951619840.0, "grad_norm": 2.164666074763173, "language_loss": 0.72146726, "learning_rate": 1.9680635198350845e-06, "loss": 0.74337626, "num_input_tokens_seen": 93346170, "step": 4324, "time_per_iteration": 2.8057055473327637 }, { "auxiliary_loss_clip": 0.0117955, "auxiliary_loss_mlp": 0.01029887, "balance_loss_clip": 1.05448639, "balance_loss_mlp": 1.02160215, "epoch": 0.5200505020140684, "flos": 26359366095360.0, "grad_norm": 2.5148729332497144, "language_loss": 0.72771043, "learning_rate": 1.967284649949093e-06, "loss": 0.74980485, "num_input_tokens_seen": 93365380, "step": 4325, "time_per_iteration": 2.7190496921539307 }, { "auxiliary_loss_clip": 0.01164861, "auxiliary_loss_mlp": 0.01019848, "balance_loss_clip": 1.04953718, "balance_loss_mlp": 1.01191449, "epoch": 0.5201707449047075, "flos": 39604262284800.0, "grad_norm": 2.4182823914402762, "language_loss": 0.72347468, "learning_rate": 1.966505785025994e-06, "loss": 0.74532175, "num_input_tokens_seen": 93387285, "step": 4326, "time_per_iteration": 2.8804149627685547 }, { "auxiliary_loss_clip": 0.0116967, "auxiliary_loss_mlp": 0.01030477, "balance_loss_clip": 1.05337036, "balance_loss_mlp": 1.02276373, "epoch": 0.5202909877953465, "flos": 53682788292480.0, "grad_norm": 1.9683181505359388, "language_loss": 0.76198518, "learning_rate": 1.965726925183941e-06, "loss": 0.78398669, "num_input_tokens_seen": 93410390, "step": 4327, "time_per_iteration": 3.076730728149414 }, { "auxiliary_loss_clip": 0.01177663, "auxiliary_loss_mlp": 0.0102656, "balance_loss_clip": 1.05202091, "balance_loss_mlp": 1.01894808, "epoch": 0.5204112306859857, "flos": 19537021324800.0, "grad_norm": 1.908798845416237, "language_loss": 0.85177207, "learning_rate": 1.964948070541087e-06, "loss": 0.87381434, "num_input_tokens_seen": 93429050, "step": 4328, "time_per_iteration": 2.7773056030273438 }, { "auxiliary_loss_clip": 0.01166272, "auxiliary_loss_mlp": 0.01023957, "balance_loss_clip": 1.04977512, "balance_loss_mlp": 1.0162499, "epoch": 0.5205314735766248, "flos": 15304697608320.0, "grad_norm": 3.9110245984038077, "language_loss": 0.69034493, "learning_rate": 1.9641692212155816e-06, "loss": 0.71224725, "num_input_tokens_seen": 93446815, "step": 4329, "time_per_iteration": 2.731942892074585 }, { "auxiliary_loss_clip": 0.0116136, "auxiliary_loss_mlp": 0.01029641, "balance_loss_clip": 1.05169141, "balance_loss_mlp": 1.02171981, "epoch": 0.5206517164672638, "flos": 59263701160320.0, "grad_norm": 2.0042585817340566, "language_loss": 0.72709697, "learning_rate": 1.9633903773255777e-06, "loss": 0.74900699, "num_input_tokens_seen": 93469130, "step": 4330, "time_per_iteration": 3.0579171180725098 }, { "auxiliary_loss_clip": 0.01176057, "auxiliary_loss_mlp": 0.01023697, "balance_loss_clip": 1.05023301, "balance_loss_mlp": 1.01554894, "epoch": 0.520771959357903, "flos": 26871129118080.0, "grad_norm": 1.8771813424104167, "language_loss": 0.74374688, "learning_rate": 1.9626115389892237e-06, "loss": 0.76574445, "num_input_tokens_seen": 93489920, "step": 4331, "time_per_iteration": 2.717622995376587 }, { "auxiliary_loss_clip": 0.01173946, "auxiliary_loss_mlp": 0.01025675, "balance_loss_clip": 1.05111265, "balance_loss_mlp": 1.01807523, "epoch": 0.520892202248542, "flos": 26907075653760.0, "grad_norm": 2.0371127713026427, "language_loss": 0.8573792, "learning_rate": 1.96183270632467e-06, "loss": 0.87937546, "num_input_tokens_seen": 93509770, "step": 4332, "time_per_iteration": 2.8338212966918945 }, { "auxiliary_loss_clip": 0.01167889, "auxiliary_loss_mlp": 0.01056668, "balance_loss_clip": 1.05287218, "balance_loss_mlp": 1.02109599, "epoch": 0.5210124451391811, "flos": 25849434666240.0, "grad_norm": 1.7934122227306917, "language_loss": 0.78926033, "learning_rate": 1.9610538794500644e-06, "loss": 0.81150591, "num_input_tokens_seen": 93529320, "step": 4333, "time_per_iteration": 2.784280776977539 }, { "auxiliary_loss_clip": 0.01073888, "auxiliary_loss_mlp": 0.01001638, "balance_loss_clip": 1.01667655, "balance_loss_mlp": 1.00027323, "epoch": 0.5211326880298203, "flos": 70553804319360.0, "grad_norm": 0.772851397486495, "language_loss": 0.59366155, "learning_rate": 1.9602750584835542e-06, "loss": 0.61441678, "num_input_tokens_seen": 93595255, "step": 4334, "time_per_iteration": 3.388970136642456 }, { "auxiliary_loss_clip": 0.01169504, "auxiliary_loss_mlp": 0.01030232, "balance_loss_clip": 1.05120301, "balance_loss_mlp": 1.02275157, "epoch": 0.5212529309204593, "flos": 15628898787840.0, "grad_norm": 2.088133736817676, "language_loss": 0.8277992, "learning_rate": 1.959496243543286e-06, "loss": 0.84979659, "num_input_tokens_seen": 93613135, "step": 4335, "time_per_iteration": 3.643268585205078 }, { "auxiliary_loss_clip": 0.01180525, "auxiliary_loss_mlp": 0.01028263, "balance_loss_clip": 1.05605555, "balance_loss_mlp": 1.02060413, "epoch": 0.5213731738110984, "flos": 26242655829120.0, "grad_norm": 2.1446372774741644, "language_loss": 0.79563934, "learning_rate": 1.9587174347474057e-06, "loss": 0.81772721, "num_input_tokens_seen": 93629645, "step": 4336, "time_per_iteration": 3.7357282638549805 }, { "auxiliary_loss_clip": 0.01157214, "auxiliary_loss_mlp": 0.0102574, "balance_loss_clip": 1.05374193, "balance_loss_mlp": 1.01803887, "epoch": 0.5214934167017375, "flos": 19418407637760.0, "grad_norm": 2.340874174912514, "language_loss": 0.82700711, "learning_rate": 1.9579386322140574e-06, "loss": 0.8488366, "num_input_tokens_seen": 93645325, "step": 4337, "time_per_iteration": 2.7513985633850098 }, { "auxiliary_loss_clip": 0.01181973, "auxiliary_loss_mlp": 0.01050689, "balance_loss_clip": 1.05354857, "balance_loss_mlp": 1.01339483, "epoch": 0.5216136595923766, "flos": 30955788023040.0, "grad_norm": 1.7938721878411892, "language_loss": 0.80949438, "learning_rate": 1.9571598360613854e-06, "loss": 0.83182096, "num_input_tokens_seen": 93668200, "step": 4338, "time_per_iteration": 3.706141471862793 }, { "auxiliary_loss_clip": 0.0116125, "auxiliary_loss_mlp": 0.0102858, "balance_loss_clip": 1.05073714, "balance_loss_mlp": 1.02002716, "epoch": 0.5217339024830157, "flos": 21945047143680.0, "grad_norm": 2.060300369300897, "language_loss": 0.69848549, "learning_rate": 1.956381046407532e-06, "loss": 0.72038388, "num_input_tokens_seen": 93688495, "step": 4339, "time_per_iteration": 2.7859904766082764 }, { "auxiliary_loss_clip": 0.01168351, "auxiliary_loss_mlp": 0.01027741, "balance_loss_clip": 1.05303645, "balance_loss_mlp": 1.02005744, "epoch": 0.5218541453736548, "flos": 20923209037440.0, "grad_norm": 1.6927623626621466, "language_loss": 0.86381781, "learning_rate": 1.9556022633706394e-06, "loss": 0.88577873, "num_input_tokens_seen": 93707285, "step": 4340, "time_per_iteration": 2.7361574172973633 }, { "auxiliary_loss_clip": 0.01165786, "auxiliary_loss_mlp": 0.01024342, "balance_loss_clip": 1.05165935, "balance_loss_mlp": 1.01683152, "epoch": 0.5219743882642939, "flos": 23951663498880.0, "grad_norm": 1.7999014866939862, "language_loss": 0.79904974, "learning_rate": 1.954823487068848e-06, "loss": 0.8209511, "num_input_tokens_seen": 93727495, "step": 4341, "time_per_iteration": 2.719996690750122 }, { "auxiliary_loss_clip": 0.01176869, "auxiliary_loss_mlp": 0.01028145, "balance_loss_clip": 1.05498004, "balance_loss_mlp": 1.01997304, "epoch": 0.5220946311549329, "flos": 28799280213120.0, "grad_norm": 1.6679339929206014, "language_loss": 0.80963749, "learning_rate": 1.9540447176202976e-06, "loss": 0.83168763, "num_input_tokens_seen": 93748740, "step": 4342, "time_per_iteration": 3.853243112564087 }, { "auxiliary_loss_clip": 0.01072344, "auxiliary_loss_mlp": 0.01004155, "balance_loss_clip": 1.01612341, "balance_loss_mlp": 1.00280213, "epoch": 0.5222148740455721, "flos": 67189369017600.0, "grad_norm": 0.8731751959908752, "language_loss": 0.60695457, "learning_rate": 1.9532659551431272e-06, "loss": 0.62771952, "num_input_tokens_seen": 93815770, "step": 4343, "time_per_iteration": 3.461514472961426 }, { "auxiliary_loss_clip": 0.01177918, "auxiliary_loss_mlp": 0.01026269, "balance_loss_clip": 1.05257833, "balance_loss_mlp": 1.01869905, "epoch": 0.5223351169362112, "flos": 61856164339200.0, "grad_norm": 1.542064611809904, "language_loss": 0.67571282, "learning_rate": 1.9524871997554744e-06, "loss": 0.69775474, "num_input_tokens_seen": 93843530, "step": 4344, "time_per_iteration": 3.0596344470977783 }, { "auxiliary_loss_clip": 0.01178247, "auxiliary_loss_mlp": 0.01020395, "balance_loss_clip": 1.0538888, "balance_loss_mlp": 1.01281357, "epoch": 0.5224553598268502, "flos": 14647388676480.0, "grad_norm": 3.595730376652932, "language_loss": 0.80580771, "learning_rate": 1.951708451575475e-06, "loss": 0.82779408, "num_input_tokens_seen": 93860595, "step": 4345, "time_per_iteration": 2.6433072090148926 }, { "auxiliary_loss_clip": 0.01179187, "auxiliary_loss_mlp": 0.01027457, "balance_loss_clip": 1.05316591, "balance_loss_mlp": 1.01978016, "epoch": 0.5225756027174894, "flos": 14826043946880.0, "grad_norm": 2.2547239236563774, "language_loss": 0.82388151, "learning_rate": 1.9509297107212657e-06, "loss": 0.84594798, "num_input_tokens_seen": 93877365, "step": 4346, "time_per_iteration": 2.7087535858154297 }, { "auxiliary_loss_clip": 0.0117966, "auxiliary_loss_mlp": 0.01026573, "balance_loss_clip": 1.05381262, "balance_loss_mlp": 1.01819897, "epoch": 0.5226958456081284, "flos": 23512009029120.0, "grad_norm": 2.2055814555675792, "language_loss": 0.79207301, "learning_rate": 1.95015097731098e-06, "loss": 0.81413531, "num_input_tokens_seen": 93896855, "step": 4347, "time_per_iteration": 2.6718623638153076 }, { "auxiliary_loss_clip": 0.01180739, "auxiliary_loss_mlp": 0.01029451, "balance_loss_clip": 1.05306089, "balance_loss_mlp": 1.02163124, "epoch": 0.5228160884987675, "flos": 19062928690560.0, "grad_norm": 2.2901018013887726, "language_loss": 0.81619853, "learning_rate": 1.949372251462751e-06, "loss": 0.83830047, "num_input_tokens_seen": 93914270, "step": 4348, "time_per_iteration": 2.6140849590301514 }, { "auxiliary_loss_clip": 0.01170152, "auxiliary_loss_mlp": 0.01056707, "balance_loss_clip": 1.05283213, "balance_loss_mlp": 1.02197564, "epoch": 0.5229363313894067, "flos": 21063224252160.0, "grad_norm": 2.1102294624165325, "language_loss": 0.82996023, "learning_rate": 1.9485935332947124e-06, "loss": 0.85222876, "num_input_tokens_seen": 93932180, "step": 4349, "time_per_iteration": 2.8379483222961426 }, { "auxiliary_loss_clip": 0.01165251, "auxiliary_loss_mlp": 0.0102544, "balance_loss_clip": 1.04974937, "balance_loss_mlp": 1.01827264, "epoch": 0.5230565742800457, "flos": 14830389492480.0, "grad_norm": 2.0725032471173113, "language_loss": 0.83459622, "learning_rate": 1.947814822924993e-06, "loss": 0.85650313, "num_input_tokens_seen": 93949690, "step": 4350, "time_per_iteration": 2.7194886207580566 }, { "auxiliary_loss_clip": 0.01178592, "auxiliary_loss_mlp": 0.01030061, "balance_loss_clip": 1.05421972, "balance_loss_mlp": 1.02239633, "epoch": 0.5231768171706848, "flos": 25813021253760.0, "grad_norm": 3.877325886147242, "language_loss": 0.82896411, "learning_rate": 1.9470361204717236e-06, "loss": 0.85105067, "num_input_tokens_seen": 93968830, "step": 4351, "time_per_iteration": 2.768423318862915 }, { "auxiliary_loss_clip": 0.01169749, "auxiliary_loss_mlp": 0.01057127, "balance_loss_clip": 1.05250156, "balance_loss_mlp": 1.0205369, "epoch": 0.5232970600613239, "flos": 22743807834240.0, "grad_norm": 1.5591221817716905, "language_loss": 0.80758071, "learning_rate": 1.9462574260530326e-06, "loss": 0.82984942, "num_input_tokens_seen": 93989110, "step": 4352, "time_per_iteration": 2.7241532802581787 }, { "auxiliary_loss_clip": 0.01167486, "auxiliary_loss_mlp": 0.01027138, "balance_loss_clip": 1.05138981, "balance_loss_mlp": 1.01919293, "epoch": 0.523417302951963, "flos": 17310703432320.0, "grad_norm": 1.908924790041283, "language_loss": 0.81244183, "learning_rate": 1.9454787397870472e-06, "loss": 0.83438808, "num_input_tokens_seen": 94006430, "step": 4353, "time_per_iteration": 2.7217769622802734 }, { "auxiliary_loss_clip": 0.01153765, "auxiliary_loss_mlp": 0.01028167, "balance_loss_clip": 1.05230641, "balance_loss_mlp": 1.02005482, "epoch": 0.523537545842602, "flos": 18551740285440.0, "grad_norm": 3.944832240979386, "language_loss": 0.72024935, "learning_rate": 1.944700061791894e-06, "loss": 0.74206871, "num_input_tokens_seen": 94024825, "step": 4354, "time_per_iteration": 2.7746994495391846 }, { "auxiliary_loss_clip": 0.01172966, "auxiliary_loss_mlp": 0.01029919, "balance_loss_clip": 1.05231643, "balance_loss_mlp": 1.02234364, "epoch": 0.5236577887332411, "flos": 19719267955200.0, "grad_norm": 2.160362514934899, "language_loss": 0.65636301, "learning_rate": 1.943921392185698e-06, "loss": 0.67839187, "num_input_tokens_seen": 94043450, "step": 4355, "time_per_iteration": 2.703155279159546 }, { "auxiliary_loss_clip": 0.01176616, "auxiliary_loss_mlp": 0.01028129, "balance_loss_clip": 1.0519619, "balance_loss_mlp": 1.01996946, "epoch": 0.5237780316238803, "flos": 23550218121600.0, "grad_norm": 2.372814493552575, "language_loss": 0.7779144, "learning_rate": 1.9431427310865814e-06, "loss": 0.79996181, "num_input_tokens_seen": 94063055, "step": 4356, "time_per_iteration": 2.7480647563934326 }, { "auxiliary_loss_clip": 0.01153012, "auxiliary_loss_mlp": 0.01026871, "balance_loss_clip": 1.05125141, "balance_loss_mlp": 1.0196712, "epoch": 0.5238982745145193, "flos": 22491894775680.0, "grad_norm": 1.892653679800113, "language_loss": 0.78697121, "learning_rate": 1.942364078612667e-06, "loss": 0.80877, "num_input_tokens_seen": 94081785, "step": 4357, "time_per_iteration": 2.6981594562530518 }, { "auxiliary_loss_clip": 0.01173609, "auxiliary_loss_mlp": 0.01024143, "balance_loss_clip": 1.05016756, "balance_loss_mlp": 1.01665115, "epoch": 0.5240185174051584, "flos": 27088927234560.0, "grad_norm": 1.756461032273824, "language_loss": 0.75889373, "learning_rate": 1.9415854348820765e-06, "loss": 0.78087121, "num_input_tokens_seen": 94101635, "step": 4358, "time_per_iteration": 2.764090061187744 }, { "auxiliary_loss_clip": 0.01179267, "auxiliary_loss_mlp": 0.01027441, "balance_loss_clip": 1.05170846, "balance_loss_mlp": 1.01960921, "epoch": 0.5241387602957975, "flos": 22674680110080.0, "grad_norm": 2.4246277343381983, "language_loss": 0.68598586, "learning_rate": 1.940806800012929e-06, "loss": 0.70805287, "num_input_tokens_seen": 94121705, "step": 4359, "time_per_iteration": 2.7481045722961426 }, { "auxiliary_loss_clip": 0.01162879, "auxiliary_loss_mlp": 0.01060322, "balance_loss_clip": 1.05389214, "balance_loss_mlp": 1.02305746, "epoch": 0.5242590031864366, "flos": 40553453134080.0, "grad_norm": 1.5583171077762226, "language_loss": 0.63711548, "learning_rate": 1.9400281741233432e-06, "loss": 0.65934747, "num_input_tokens_seen": 94146595, "step": 4360, "time_per_iteration": 2.9634745121002197 }, { "auxiliary_loss_clip": 0.01070475, "auxiliary_loss_mlp": 0.0100153, "balance_loss_clip": 1.01904726, "balance_loss_mlp": 1.00009918, "epoch": 0.5243792460770756, "flos": 66676313105280.0, "grad_norm": 0.655502975356132, "language_loss": 0.52493143, "learning_rate": 1.939249557331435e-06, "loss": 0.54565156, "num_input_tokens_seen": 94212410, "step": 4361, "time_per_iteration": 4.2798380851745605 }, { "auxiliary_loss_clip": 0.01177047, "auxiliary_loss_mlp": 0.01024796, "balance_loss_clip": 1.05332637, "balance_loss_mlp": 1.01736069, "epoch": 0.5244994889677148, "flos": 28183663992960.0, "grad_norm": 1.8369041053633919, "language_loss": 0.72697771, "learning_rate": 1.938470949755321e-06, "loss": 0.74899614, "num_input_tokens_seen": 94232290, "step": 4362, "time_per_iteration": 3.768996477127075 }, { "auxiliary_loss_clip": 0.01074338, "auxiliary_loss_mlp": 0.01002669, "balance_loss_clip": 1.01539195, "balance_loss_mlp": 1.00155401, "epoch": 0.5246197318583539, "flos": 65950379239680.0, "grad_norm": 0.8105275566871842, "language_loss": 0.55622399, "learning_rate": 1.937692351513115e-06, "loss": 0.57699406, "num_input_tokens_seen": 94291285, "step": 4363, "time_per_iteration": 3.2898731231689453 }, { "auxiliary_loss_clip": 0.01177763, "auxiliary_loss_mlp": 0.0102572, "balance_loss_clip": 1.05097866, "balance_loss_mlp": 1.01865733, "epoch": 0.5247399747489929, "flos": 21033490769280.0, "grad_norm": 1.796864745776621, "language_loss": 0.80488551, "learning_rate": 1.9369137627229297e-06, "loss": 0.82692039, "num_input_tokens_seen": 94309685, "step": 4364, "time_per_iteration": 3.7645983695983887 }, { "auxiliary_loss_clip": 0.01171575, "auxiliary_loss_mlp": 0.01029499, "balance_loss_clip": 1.05164695, "balance_loss_mlp": 1.02173829, "epoch": 0.5248602176396321, "flos": 19025940660480.0, "grad_norm": 2.2685291775740217, "language_loss": 0.88495302, "learning_rate": 1.936135183502877e-06, "loss": 0.90696377, "num_input_tokens_seen": 94326985, "step": 4365, "time_per_iteration": 2.7365150451660156 }, { "auxiliary_loss_clip": 0.01175309, "auxiliary_loss_mlp": 0.01028651, "balance_loss_clip": 1.05474937, "balance_loss_mlp": 1.02090788, "epoch": 0.5249804605302711, "flos": 22200084685440.0, "grad_norm": 2.055556296531988, "language_loss": 0.80545938, "learning_rate": 1.935356613971066e-06, "loss": 0.82749891, "num_input_tokens_seen": 94347645, "step": 4366, "time_per_iteration": 2.8211417198181152 }, { "auxiliary_loss_clip": 0.01165599, "auxiliary_loss_mlp": 0.0105341, "balance_loss_clip": 1.04839325, "balance_loss_mlp": 1.01754022, "epoch": 0.5251007034209102, "flos": 23805686626560.0, "grad_norm": 2.40435089948136, "language_loss": 0.76718771, "learning_rate": 1.9345780542456047e-06, "loss": 0.78937781, "num_input_tokens_seen": 94367020, "step": 4367, "time_per_iteration": 2.6827096939086914 }, { "auxiliary_loss_clip": 0.01164654, "auxiliary_loss_mlp": 0.01024286, "balance_loss_clip": 1.05035341, "balance_loss_mlp": 1.01612282, "epoch": 0.5252209463115494, "flos": 23294605962240.0, "grad_norm": 1.889288851544301, "language_loss": 0.71373194, "learning_rate": 1.9337995044446007e-06, "loss": 0.73562133, "num_input_tokens_seen": 94385860, "step": 4368, "time_per_iteration": 2.7389261722564697 }, { "auxiliary_loss_clip": 0.01176265, "auxiliary_loss_mlp": 0.01031007, "balance_loss_clip": 1.05008054, "balance_loss_mlp": 1.02338338, "epoch": 0.5253411892021884, "flos": 19828687760640.0, "grad_norm": 2.036543142096224, "language_loss": 0.80102247, "learning_rate": 1.9330209646861596e-06, "loss": 0.82309514, "num_input_tokens_seen": 94405010, "step": 4369, "time_per_iteration": 3.7308599948883057 }, { "auxiliary_loss_clip": 0.01166939, "auxiliary_loss_mlp": 0.01022955, "balance_loss_clip": 1.04960752, "balance_loss_mlp": 1.01508677, "epoch": 0.5254614320928275, "flos": 24133730561280.0, "grad_norm": 1.8438103972702202, "language_loss": 0.77829862, "learning_rate": 1.9322424350883843e-06, "loss": 0.8001976, "num_input_tokens_seen": 94426845, "step": 4370, "time_per_iteration": 2.82544207572937 }, { "auxiliary_loss_clip": 0.01173714, "auxiliary_loss_mlp": 0.0102836, "balance_loss_clip": 1.05296552, "balance_loss_mlp": 1.02151406, "epoch": 0.5255816749834666, "flos": 24644954880000.0, "grad_norm": 1.761233331965697, "language_loss": 0.78925949, "learning_rate": 1.931463915769379e-06, "loss": 0.81128025, "num_input_tokens_seen": 94446960, "step": 4371, "time_per_iteration": 2.8117244243621826 }, { "auxiliary_loss_clip": 0.01165192, "auxiliary_loss_mlp": 0.01024836, "balance_loss_clip": 1.0533607, "balance_loss_mlp": 1.01659274, "epoch": 0.5257019178741057, "flos": 14136595320960.0, "grad_norm": 2.280778585227101, "language_loss": 0.74228877, "learning_rate": 1.930685406847242e-06, "loss": 0.764189, "num_input_tokens_seen": 94461535, "step": 4372, "time_per_iteration": 2.7760679721832275 }, { "auxiliary_loss_clip": 0.01166727, "auxiliary_loss_mlp": 0.01023759, "balance_loss_clip": 1.04995322, "balance_loss_mlp": 1.01649344, "epoch": 0.5258221607647448, "flos": 23548961145600.0, "grad_norm": 1.6021073422332623, "language_loss": 0.82038689, "learning_rate": 1.9299069084400734e-06, "loss": 0.84229177, "num_input_tokens_seen": 94482395, "step": 4373, "time_per_iteration": 2.7891058921813965 }, { "auxiliary_loss_clip": 0.01164473, "auxiliary_loss_mlp": 0.01028111, "balance_loss_clip": 1.05219519, "balance_loss_mlp": 1.02073193, "epoch": 0.5259424036553839, "flos": 24966103403520.0, "grad_norm": 2.3274167742863785, "language_loss": 0.69878793, "learning_rate": 1.9291284206659717e-06, "loss": 0.72071379, "num_input_tokens_seen": 94500580, "step": 4374, "time_per_iteration": 2.940133571624756 }, { "auxiliary_loss_clip": 0.01179704, "auxiliary_loss_mlp": 0.01026086, "balance_loss_clip": 1.05333877, "balance_loss_mlp": 1.01853418, "epoch": 0.526062646546023, "flos": 28763908295040.0, "grad_norm": 2.418270061689653, "language_loss": 0.71414024, "learning_rate": 1.928349943643032e-06, "loss": 0.73619813, "num_input_tokens_seen": 94519680, "step": 4375, "time_per_iteration": 2.839909791946411 }, { "auxiliary_loss_clip": 0.01172877, "auxiliary_loss_mlp": 0.01024967, "balance_loss_clip": 1.05349874, "balance_loss_mlp": 1.01706386, "epoch": 0.526182889436662, "flos": 22821375254400.0, "grad_norm": 5.355928776394613, "language_loss": 0.8184908, "learning_rate": 1.9275714774893493e-06, "loss": 0.84046924, "num_input_tokens_seen": 94539135, "step": 4376, "time_per_iteration": 2.7808351516723633 }, { "auxiliary_loss_clip": 0.01160116, "auxiliary_loss_mlp": 0.01027432, "balance_loss_clip": 1.05134547, "balance_loss_mlp": 1.01926613, "epoch": 0.5263031323273012, "flos": 22929466256640.0, "grad_norm": 2.318293205752956, "language_loss": 0.72489488, "learning_rate": 1.9267930223230154e-06, "loss": 0.74677038, "num_input_tokens_seen": 94557610, "step": 4377, "time_per_iteration": 2.8323540687561035 }, { "auxiliary_loss_clip": 0.01173782, "auxiliary_loss_mlp": 0.01027391, "balance_loss_clip": 1.05272698, "balance_loss_mlp": 1.01977992, "epoch": 0.5264233752179402, "flos": 17748634049280.0, "grad_norm": 2.123592698835842, "language_loss": 0.77548516, "learning_rate": 1.9260145782621224e-06, "loss": 0.79749686, "num_input_tokens_seen": 94575390, "step": 4378, "time_per_iteration": 2.8367059230804443 }, { "auxiliary_loss_clip": 0.01169413, "auxiliary_loss_mlp": 0.01027267, "balance_loss_clip": 1.05431461, "balance_loss_mlp": 1.01997709, "epoch": 0.5265436181085793, "flos": 24421626069120.0, "grad_norm": 1.9465271810119402, "language_loss": 0.88067603, "learning_rate": 1.925236145424758e-06, "loss": 0.90264285, "num_input_tokens_seen": 94594210, "step": 4379, "time_per_iteration": 2.7811741828918457 }, { "auxiliary_loss_clip": 0.01074879, "auxiliary_loss_mlp": 0.01002284, "balance_loss_clip": 1.01619887, "balance_loss_mlp": 1.00117576, "epoch": 0.5266638609992185, "flos": 69207298156800.0, "grad_norm": 0.6908857631479548, "language_loss": 0.57554829, "learning_rate": 1.924457723929012e-06, "loss": 0.59631991, "num_input_tokens_seen": 94665020, "step": 4380, "time_per_iteration": 3.391148805618286 }, { "auxiliary_loss_clip": 0.01173232, "auxiliary_loss_mlp": 0.01022322, "balance_loss_clip": 1.0514276, "balance_loss_mlp": 1.01479363, "epoch": 0.5267841038898575, "flos": 20738699850240.0, "grad_norm": 3.2770001732688785, "language_loss": 0.82548445, "learning_rate": 1.9236793138929685e-06, "loss": 0.84744, "num_input_tokens_seen": 94684290, "step": 4381, "time_per_iteration": 2.7365410327911377 }, { "auxiliary_loss_clip": 0.01180345, "auxiliary_loss_mlp": 0.01026049, "balance_loss_clip": 1.05342615, "balance_loss_mlp": 1.01859879, "epoch": 0.5269043467804966, "flos": 17234392988160.0, "grad_norm": 2.082509965672172, "language_loss": 0.81015539, "learning_rate": 1.9229009154347133e-06, "loss": 0.8322193, "num_input_tokens_seen": 94701880, "step": 4382, "time_per_iteration": 2.696613311767578 }, { "auxiliary_loss_clip": 0.01152925, "auxiliary_loss_mlp": 0.01054877, "balance_loss_clip": 1.05127048, "balance_loss_mlp": 1.01746356, "epoch": 0.5270245896711357, "flos": 18223157646720.0, "grad_norm": 2.108714893904034, "language_loss": 0.80285287, "learning_rate": 1.922122528672327e-06, "loss": 0.82493085, "num_input_tokens_seen": 94720545, "step": 4383, "time_per_iteration": 2.8101022243499756 }, { "auxiliary_loss_clip": 0.01173041, "auxiliary_loss_mlp": 0.01025787, "balance_loss_clip": 1.0493294, "balance_loss_mlp": 1.01847088, "epoch": 0.5271448325617748, "flos": 21287558643840.0, "grad_norm": 5.415294981469229, "language_loss": 0.78937626, "learning_rate": 1.9213441537238914e-06, "loss": 0.81136453, "num_input_tokens_seen": 94737420, "step": 4384, "time_per_iteration": 2.699549436569214 }, { "auxiliary_loss_clip": 0.01075372, "auxiliary_loss_mlp": 0.01000416, "balance_loss_clip": 1.02451539, "balance_loss_mlp": 0.99923021, "epoch": 0.5272650754524139, "flos": 65495497403520.0, "grad_norm": 0.8456407530482979, "language_loss": 0.57365227, "learning_rate": 1.920565790707485e-06, "loss": 0.59441012, "num_input_tokens_seen": 94802810, "step": 4385, "time_per_iteration": 3.492260456085205 }, { "auxiliary_loss_clip": 0.01174195, "auxiliary_loss_mlp": 0.01028443, "balance_loss_clip": 1.05334437, "balance_loss_mlp": 1.02033675, "epoch": 0.527385318343053, "flos": 19676426008320.0, "grad_norm": 2.037789538268549, "language_loss": 0.65938258, "learning_rate": 1.9197874397411853e-06, "loss": 0.681409, "num_input_tokens_seen": 94819440, "step": 4386, "time_per_iteration": 2.794463634490967 }, { "auxiliary_loss_clip": 0.01171991, "auxiliary_loss_mlp": 0.01028247, "balance_loss_clip": 1.05248952, "balance_loss_mlp": 1.02047408, "epoch": 0.5275055612336921, "flos": 12712018947840.0, "grad_norm": 4.024332240119191, "language_loss": 0.66384012, "learning_rate": 1.919009100943067e-06, "loss": 0.68584251, "num_input_tokens_seen": 94835130, "step": 4387, "time_per_iteration": 3.7428641319274902 }, { "auxiliary_loss_clip": 0.01175454, "auxiliary_loss_mlp": 0.01026346, "balance_loss_clip": 1.05213201, "balance_loss_mlp": 1.01890135, "epoch": 0.5276258041243311, "flos": 17749029098880.0, "grad_norm": 2.0967728468291473, "language_loss": 0.65854883, "learning_rate": 1.9182307744312043e-06, "loss": 0.68056685, "num_input_tokens_seen": 94852235, "step": 4388, "time_per_iteration": 3.811432361602783 }, { "auxiliary_loss_clip": 0.01175096, "auxiliary_loss_mlp": 0.01024826, "balance_loss_clip": 1.05190957, "balance_loss_mlp": 1.01659453, "epoch": 0.5277460470149702, "flos": 22710447077760.0, "grad_norm": 3.1692591159473, "language_loss": 0.76560515, "learning_rate": 1.9174524603236676e-06, "loss": 0.78760439, "num_input_tokens_seen": 94871185, "step": 4389, "time_per_iteration": 2.7004635334014893 }, { "auxiliary_loss_clip": 0.01168241, "auxiliary_loss_mlp": 0.01024422, "balance_loss_clip": 1.05079246, "balance_loss_mlp": 1.0165422, "epoch": 0.5278662899056094, "flos": 19902699734400.0, "grad_norm": 1.9507654762636653, "language_loss": 0.76003754, "learning_rate": 1.916674158738527e-06, "loss": 0.78196418, "num_input_tokens_seen": 94890090, "step": 4390, "time_per_iteration": 3.632251739501953 }, { "auxiliary_loss_clip": 0.01165322, "auxiliary_loss_mlp": 0.01059038, "balance_loss_clip": 1.05295277, "balance_loss_mlp": 1.02106857, "epoch": 0.5279865327962484, "flos": 18005215875840.0, "grad_norm": 1.7558092718976699, "language_loss": 0.60027409, "learning_rate": 1.9158958697938506e-06, "loss": 0.62251765, "num_input_tokens_seen": 94908470, "step": 4391, "time_per_iteration": 2.8160669803619385 }, { "auxiliary_loss_clip": 0.01167768, "auxiliary_loss_mlp": 0.01027474, "balance_loss_clip": 1.0526166, "balance_loss_mlp": 1.02002072, "epoch": 0.5281067756868875, "flos": 15924443892480.0, "grad_norm": 2.4365784164596613, "language_loss": 0.86126125, "learning_rate": 1.9151175936077032e-06, "loss": 0.8832137, "num_input_tokens_seen": 94923440, "step": 4392, "time_per_iteration": 2.7504923343658447 }, { "auxiliary_loss_clip": 0.01169259, "auxiliary_loss_mlp": 0.01024939, "balance_loss_clip": 1.05129743, "balance_loss_mlp": 1.01775932, "epoch": 0.5282270185775266, "flos": 19426488197760.0, "grad_norm": 1.5745715962857207, "language_loss": 0.79344004, "learning_rate": 1.9143393302981507e-06, "loss": 0.81538206, "num_input_tokens_seen": 94941125, "step": 4393, "time_per_iteration": 2.8002572059631348 }, { "auxiliary_loss_clip": 0.01175388, "auxiliary_loss_mlp": 0.01023411, "balance_loss_clip": 1.05315948, "balance_loss_mlp": 1.01584744, "epoch": 0.5283472614681657, "flos": 16399613934720.0, "grad_norm": 1.7732950350165713, "language_loss": 0.83270073, "learning_rate": 1.913561079983252e-06, "loss": 0.85468864, "num_input_tokens_seen": 94959950, "step": 4394, "time_per_iteration": 2.7048580646514893 }, { "auxiliary_loss_clip": 0.01181595, "auxiliary_loss_mlp": 0.01034702, "balance_loss_clip": 1.05534601, "balance_loss_mlp": 1.02645254, "epoch": 0.5284675043588047, "flos": 26760524163840.0, "grad_norm": 2.1136930844177795, "language_loss": 0.74806213, "learning_rate": 1.9127828427810693e-06, "loss": 0.77022505, "num_input_tokens_seen": 94980515, "step": 4395, "time_per_iteration": 3.7265186309814453 }, { "auxiliary_loss_clip": 0.01176138, "auxiliary_loss_mlp": 0.0102526, "balance_loss_clip": 1.05247426, "balance_loss_mlp": 1.01779187, "epoch": 0.5285877472494439, "flos": 19899898473600.0, "grad_norm": 1.9492233951671007, "language_loss": 0.80697536, "learning_rate": 1.9120046188096607e-06, "loss": 0.82898927, "num_input_tokens_seen": 94998560, "step": 4396, "time_per_iteration": 2.801323413848877 }, { "auxiliary_loss_clip": 0.01170218, "auxiliary_loss_mlp": 0.01023469, "balance_loss_clip": 1.05403495, "balance_loss_mlp": 1.015872, "epoch": 0.528707990140083, "flos": 20011257613440.0, "grad_norm": 2.2806045981815473, "language_loss": 0.74345887, "learning_rate": 1.9112264081870804e-06, "loss": 0.76539576, "num_input_tokens_seen": 95016950, "step": 4397, "time_per_iteration": 2.6744422912597656 }, { "auxiliary_loss_clip": 0.01165242, "auxiliary_loss_mlp": 0.01031626, "balance_loss_clip": 1.05102265, "balance_loss_mlp": 1.02385378, "epoch": 0.528828233030722, "flos": 20667956014080.0, "grad_norm": 2.1506588579120187, "language_loss": 0.75925148, "learning_rate": 1.9104482110313843e-06, "loss": 0.78122008, "num_input_tokens_seen": 95036540, "step": 4398, "time_per_iteration": 2.7853448390960693 }, { "auxiliary_loss_clip": 0.01174339, "auxiliary_loss_mlp": 0.0102941, "balance_loss_clip": 1.05290651, "balance_loss_mlp": 1.02148879, "epoch": 0.5289484759213612, "flos": 25192448956800.0, "grad_norm": 2.0753332553446855, "language_loss": 0.74177718, "learning_rate": 1.909670027460623e-06, "loss": 0.76381469, "num_input_tokens_seen": 95053840, "step": 4399, "time_per_iteration": 2.696180820465088 }, { "auxiliary_loss_clip": 0.01177453, "auxiliary_loss_mlp": 0.0102783, "balance_loss_clip": 1.05419409, "balance_loss_mlp": 1.02027845, "epoch": 0.5290687188120002, "flos": 31139255715840.0, "grad_norm": 1.9413283334139253, "language_loss": 0.71964526, "learning_rate": 1.908891857592847e-06, "loss": 0.74169815, "num_input_tokens_seen": 95074910, "step": 4400, "time_per_iteration": 2.7593564987182617 }, { "auxiliary_loss_clip": 0.01161973, "auxiliary_loss_mlp": 0.01026342, "balance_loss_clip": 1.05247951, "balance_loss_mlp": 1.01851535, "epoch": 0.5291889617026393, "flos": 20119851406080.0, "grad_norm": 2.228543979619498, "language_loss": 0.90124553, "learning_rate": 1.9081137015461034e-06, "loss": 0.92312866, "num_input_tokens_seen": 95090985, "step": 4401, "time_per_iteration": 2.704263687133789 }, { "auxiliary_loss_clip": 0.01160765, "auxiliary_loss_mlp": 0.01024286, "balance_loss_clip": 1.05547237, "balance_loss_mlp": 1.01642418, "epoch": 0.5293092045932785, "flos": 19643747610240.0, "grad_norm": 1.858552372233661, "language_loss": 0.90644443, "learning_rate": 1.9073355594384383e-06, "loss": 0.9282949, "num_input_tokens_seen": 95109225, "step": 4402, "time_per_iteration": 2.7367477416992188 }, { "auxiliary_loss_clip": 0.01160812, "auxiliary_loss_mlp": 0.01025642, "balance_loss_clip": 1.05265498, "balance_loss_mlp": 1.0181495, "epoch": 0.5294294474839175, "flos": 24317736958080.0, "grad_norm": 2.4270283711536114, "language_loss": 0.80508155, "learning_rate": 1.906557431387895e-06, "loss": 0.82694602, "num_input_tokens_seen": 95128215, "step": 4403, "time_per_iteration": 2.8318474292755127 }, { "auxiliary_loss_clip": 0.01165881, "auxiliary_loss_mlp": 0.0102669, "balance_loss_clip": 1.05497479, "balance_loss_mlp": 1.01900697, "epoch": 0.5295496903745566, "flos": 18875941464960.0, "grad_norm": 2.013383773457175, "language_loss": 0.79030573, "learning_rate": 1.905779317512516e-06, "loss": 0.81223148, "num_input_tokens_seen": 95145760, "step": 4404, "time_per_iteration": 2.7252328395843506 }, { "auxiliary_loss_clip": 0.01170447, "auxiliary_loss_mlp": 0.01027044, "balance_loss_clip": 1.05041897, "balance_loss_mlp": 1.01934326, "epoch": 0.5296699332651957, "flos": 20923101296640.0, "grad_norm": 1.9509407029752739, "language_loss": 0.80451721, "learning_rate": 1.9050012179303385e-06, "loss": 0.82649207, "num_input_tokens_seen": 95164270, "step": 4405, "time_per_iteration": 2.685905694961548 }, { "auxiliary_loss_clip": 0.01178642, "auxiliary_loss_mlp": 0.01031134, "balance_loss_clip": 1.05358756, "balance_loss_mlp": 1.02315891, "epoch": 0.5297901761558348, "flos": 22046745525120.0, "grad_norm": 2.4088911388716294, "language_loss": 0.6937139, "learning_rate": 1.904223132759401e-06, "loss": 0.71581167, "num_input_tokens_seen": 95182870, "step": 4406, "time_per_iteration": 2.6498630046844482 }, { "auxiliary_loss_clip": 0.01176096, "auxiliary_loss_mlp": 0.01027716, "balance_loss_clip": 1.05248857, "balance_loss_mlp": 1.02026486, "epoch": 0.5299104190464738, "flos": 21798495653760.0, "grad_norm": 2.330285836809705, "language_loss": 0.69019783, "learning_rate": 1.9034450621177383e-06, "loss": 0.71223599, "num_input_tokens_seen": 95201190, "step": 4407, "time_per_iteration": 2.7379488945007324 }, { "auxiliary_loss_clip": 0.01174091, "auxiliary_loss_mlp": 0.01027334, "balance_loss_clip": 1.05429339, "balance_loss_mlp": 1.01915956, "epoch": 0.530030661937113, "flos": 14720790119040.0, "grad_norm": 2.0803945133100754, "language_loss": 0.70960295, "learning_rate": 1.9026670061233824e-06, "loss": 0.73161721, "num_input_tokens_seen": 95218625, "step": 4408, "time_per_iteration": 2.75225567817688 }, { "auxiliary_loss_clip": 0.01167546, "auxiliary_loss_mlp": 0.01023293, "balance_loss_clip": 1.05144823, "balance_loss_mlp": 1.01591969, "epoch": 0.5301509048277521, "flos": 21251504367360.0, "grad_norm": 1.7534527385991772, "language_loss": 0.8101837, "learning_rate": 1.901888964894365e-06, "loss": 0.83209205, "num_input_tokens_seen": 95237665, "step": 4409, "time_per_iteration": 2.7883636951446533 }, { "auxiliary_loss_clip": 0.01181528, "auxiliary_loss_mlp": 0.01024194, "balance_loss_clip": 1.05281067, "balance_loss_mlp": 1.01586723, "epoch": 0.5302711477183911, "flos": 25957058791680.0, "grad_norm": 2.081744183909436, "language_loss": 0.67777067, "learning_rate": 1.9011109385487134e-06, "loss": 0.69982791, "num_input_tokens_seen": 95258915, "step": 4410, "time_per_iteration": 2.6442737579345703 }, { "auxiliary_loss_clip": 0.01180656, "auxiliary_loss_mlp": 0.01024606, "balance_loss_clip": 1.05285478, "balance_loss_mlp": 1.01693439, "epoch": 0.5303913906090303, "flos": 22273126992000.0, "grad_norm": 2.5193784642053365, "language_loss": 0.66198289, "learning_rate": 1.900332927204454e-06, "loss": 0.68403554, "num_input_tokens_seen": 95277365, "step": 4411, "time_per_iteration": 2.6525676250457764 }, { "auxiliary_loss_clip": 0.01177689, "auxiliary_loss_mlp": 0.01023754, "balance_loss_clip": 1.05287242, "balance_loss_mlp": 1.01550198, "epoch": 0.5305116334996693, "flos": 24936010784640.0, "grad_norm": 1.8929176166215393, "language_loss": 0.76657033, "learning_rate": 1.8995549309796097e-06, "loss": 0.78858483, "num_input_tokens_seen": 95296670, "step": 4412, "time_per_iteration": 2.7507410049438477 }, { "auxiliary_loss_clip": 0.01182765, "auxiliary_loss_mlp": 0.0103134, "balance_loss_clip": 1.05507851, "balance_loss_mlp": 1.02366853, "epoch": 0.5306318763903084, "flos": 20189338266240.0, "grad_norm": 1.8682977437420436, "language_loss": 0.76808274, "learning_rate": 1.8987769499922028e-06, "loss": 0.79022378, "num_input_tokens_seen": 95315640, "step": 4413, "time_per_iteration": 3.5834503173828125 }, { "auxiliary_loss_clip": 0.01172068, "auxiliary_loss_mlp": 0.01052029, "balance_loss_clip": 1.05255747, "balance_loss_mlp": 1.01729345, "epoch": 0.5307521192809476, "flos": 20266366982400.0, "grad_norm": 2.164414427594786, "language_loss": 0.71229434, "learning_rate": 1.897998984360252e-06, "loss": 0.73453534, "num_input_tokens_seen": 95334610, "step": 4414, "time_per_iteration": 3.681694746017456 }, { "auxiliary_loss_clip": 0.01171112, "auxiliary_loss_mlp": 0.01025702, "balance_loss_clip": 1.05362427, "balance_loss_mlp": 1.01836467, "epoch": 0.5308723621715866, "flos": 28844276976000.0, "grad_norm": 1.343571149958297, "language_loss": 0.78557366, "learning_rate": 1.897221034201775e-06, "loss": 0.80754179, "num_input_tokens_seen": 95358350, "step": 4415, "time_per_iteration": 3.7850401401519775 }, { "auxiliary_loss_clip": 0.01167132, "auxiliary_loss_mlp": 0.01025842, "balance_loss_clip": 1.05049074, "balance_loss_mlp": 1.01861823, "epoch": 0.5309926050622257, "flos": 27457766040960.0, "grad_norm": 1.5973497352009365, "language_loss": 0.66578293, "learning_rate": 1.8964430996347842e-06, "loss": 0.68771267, "num_input_tokens_seen": 95379900, "step": 4416, "time_per_iteration": 2.833237409591675 }, { "auxiliary_loss_clip": 0.01173939, "auxiliary_loss_mlp": 0.0102589, "balance_loss_clip": 1.05545866, "balance_loss_mlp": 1.01770616, "epoch": 0.5311128479528648, "flos": 20514545026560.0, "grad_norm": 1.7245687838204, "language_loss": 0.82181787, "learning_rate": 1.8956651807772931e-06, "loss": 0.84381616, "num_input_tokens_seen": 95397935, "step": 4417, "time_per_iteration": 2.7431046962738037 }, { "auxiliary_loss_clip": 0.01170966, "auxiliary_loss_mlp": 0.01029438, "balance_loss_clip": 1.05281687, "balance_loss_mlp": 1.02187443, "epoch": 0.5312330908435039, "flos": 21397660807680.0, "grad_norm": 5.617690238253875, "language_loss": 0.8405534, "learning_rate": 1.8948872777473115e-06, "loss": 0.86255753, "num_input_tokens_seen": 95415890, "step": 4418, "time_per_iteration": 2.805666923522949 }, { "auxiliary_loss_clip": 0.01173204, "auxiliary_loss_mlp": 0.01028513, "balance_loss_clip": 1.05425596, "balance_loss_mlp": 1.02019191, "epoch": 0.531353333734143, "flos": 24717350741760.0, "grad_norm": 1.8054104866594272, "language_loss": 0.63705289, "learning_rate": 1.8941093906628458e-06, "loss": 0.65907001, "num_input_tokens_seen": 95433675, "step": 4419, "time_per_iteration": 2.8317267894744873 }, { "auxiliary_loss_clip": 0.01167625, "auxiliary_loss_mlp": 0.01023857, "balance_loss_clip": 1.0520401, "balance_loss_mlp": 1.01622152, "epoch": 0.531473576624782, "flos": 30480689808000.0, "grad_norm": 1.8396131046090527, "language_loss": 0.70669806, "learning_rate": 1.893331519641902e-06, "loss": 0.7286129, "num_input_tokens_seen": 95455820, "step": 4420, "time_per_iteration": 2.9389894008636475 }, { "auxiliary_loss_clip": 0.01162426, "auxiliary_loss_mlp": 0.01025354, "balance_loss_clip": 1.05259669, "balance_loss_mlp": 1.01694357, "epoch": 0.5315938195154212, "flos": 23002975440000.0, "grad_norm": 2.2827769178179755, "language_loss": 0.74560082, "learning_rate": 1.8925536648024815e-06, "loss": 0.76747864, "num_input_tokens_seen": 95473240, "step": 4421, "time_per_iteration": 3.6890199184417725 }, { "auxiliary_loss_clip": 0.01180061, "auxiliary_loss_mlp": 0.01027294, "balance_loss_clip": 1.0533973, "balance_loss_mlp": 1.01966453, "epoch": 0.5317140624060602, "flos": 22748584343040.0, "grad_norm": 2.067067183336292, "language_loss": 0.75849319, "learning_rate": 1.8917758262625849e-06, "loss": 0.78056669, "num_input_tokens_seen": 95493480, "step": 4422, "time_per_iteration": 2.7763354778289795 }, { "auxiliary_loss_clip": 0.01168712, "auxiliary_loss_mlp": 0.01024445, "balance_loss_clip": 1.05406642, "balance_loss_mlp": 1.01688755, "epoch": 0.5318343052966993, "flos": 22821087945600.0, "grad_norm": 1.812145964409065, "language_loss": 0.8079536, "learning_rate": 1.8909980041402089e-06, "loss": 0.82988513, "num_input_tokens_seen": 95512075, "step": 4423, "time_per_iteration": 2.735823392868042 }, { "auxiliary_loss_clip": 0.01169442, "auxiliary_loss_mlp": 0.01027903, "balance_loss_clip": 1.05142093, "balance_loss_mlp": 1.01970768, "epoch": 0.5319545481873384, "flos": 13626089274240.0, "grad_norm": 2.669558698163176, "language_loss": 0.65895432, "learning_rate": 1.8902201985533494e-06, "loss": 0.68092775, "num_input_tokens_seen": 95529340, "step": 4424, "time_per_iteration": 2.706655502319336 }, { "auxiliary_loss_clip": 0.01170811, "auxiliary_loss_mlp": 0.01028757, "balance_loss_clip": 1.05289721, "balance_loss_mlp": 1.02162838, "epoch": 0.5320747910779775, "flos": 22162522037760.0, "grad_norm": 2.3895509284312255, "language_loss": 0.75375211, "learning_rate": 1.8894424096199983e-06, "loss": 0.77574778, "num_input_tokens_seen": 95548545, "step": 4425, "time_per_iteration": 2.7525970935821533 }, { "auxiliary_loss_clip": 0.01176564, "auxiliary_loss_mlp": 0.01028581, "balance_loss_clip": 1.05517244, "balance_loss_mlp": 1.02022481, "epoch": 0.5321950339686166, "flos": 18588081870720.0, "grad_norm": 1.88444368929623, "language_loss": 0.85868621, "learning_rate": 1.8886646374581463e-06, "loss": 0.88073766, "num_input_tokens_seen": 95567770, "step": 4426, "time_per_iteration": 2.730830669403076 }, { "auxiliary_loss_clip": 0.01175503, "auxiliary_loss_mlp": 0.01030459, "balance_loss_clip": 1.05304897, "balance_loss_mlp": 1.02248991, "epoch": 0.5323152768592557, "flos": 22856818999680.0, "grad_norm": 1.641342112629709, "language_loss": 0.71064949, "learning_rate": 1.8878868821857795e-06, "loss": 0.73270905, "num_input_tokens_seen": 95587420, "step": 4427, "time_per_iteration": 2.742506504058838 }, { "auxiliary_loss_clip": 0.01166671, "auxiliary_loss_mlp": 0.01034517, "balance_loss_clip": 1.05323052, "balance_loss_mlp": 1.02643502, "epoch": 0.5324355197498948, "flos": 33948690998400.0, "grad_norm": 2.2793007409159163, "language_loss": 0.75225139, "learning_rate": 1.8871091439208838e-06, "loss": 0.77426332, "num_input_tokens_seen": 95609030, "step": 4428, "time_per_iteration": 2.791760206222534 }, { "auxiliary_loss_clip": 0.01165996, "auxiliary_loss_mlp": 0.01030171, "balance_loss_clip": 1.05348992, "balance_loss_mlp": 1.02170122, "epoch": 0.5325557626405338, "flos": 23256720092160.0, "grad_norm": 2.058975042189157, "language_loss": 0.77146322, "learning_rate": 1.8863314227814414e-06, "loss": 0.79342484, "num_input_tokens_seen": 95627340, "step": 4429, "time_per_iteration": 2.8437247276306152 }, { "auxiliary_loss_clip": 0.01181298, "auxiliary_loss_mlp": 0.01025579, "balance_loss_clip": 1.05394673, "balance_loss_mlp": 1.01777685, "epoch": 0.532676005531173, "flos": 26718687797760.0, "grad_norm": 2.5218848662640663, "language_loss": 0.48977572, "learning_rate": 1.8855537188854313e-06, "loss": 0.51184452, "num_input_tokens_seen": 95646315, "step": 4430, "time_per_iteration": 2.8345601558685303 }, { "auxiliary_loss_clip": 0.01176211, "auxiliary_loss_mlp": 0.01026224, "balance_loss_clip": 1.05021012, "balance_loss_mlp": 1.01843381, "epoch": 0.5327962484218121, "flos": 17894610921600.0, "grad_norm": 2.061281705843125, "language_loss": 0.78193271, "learning_rate": 1.8847760323508315e-06, "loss": 0.80395699, "num_input_tokens_seen": 95665220, "step": 4431, "time_per_iteration": 2.6839816570281982 }, { "auxiliary_loss_clip": 0.01166373, "auxiliary_loss_mlp": 0.01027208, "balance_loss_clip": 1.05204988, "balance_loss_mlp": 1.01998425, "epoch": 0.5329164913124511, "flos": 17925385898880.0, "grad_norm": 1.6043793608184382, "language_loss": 0.75379342, "learning_rate": 1.883998363295616e-06, "loss": 0.77572924, "num_input_tokens_seen": 95682700, "step": 4432, "time_per_iteration": 2.7874248027801514 }, { "auxiliary_loss_clip": 0.01074599, "auxiliary_loss_mlp": 0.01003632, "balance_loss_clip": 1.02091742, "balance_loss_mlp": 1.00254714, "epoch": 0.5330367342030903, "flos": 57254178781440.0, "grad_norm": 0.8772235715944886, "language_loss": 0.62601155, "learning_rate": 1.8832207118377565e-06, "loss": 0.64679384, "num_input_tokens_seen": 95738070, "step": 4433, "time_per_iteration": 3.2371084690093994 }, { "auxiliary_loss_clip": 0.01175869, "auxiliary_loss_mlp": 0.01027164, "balance_loss_clip": 1.05204225, "balance_loss_mlp": 1.01980877, "epoch": 0.5331569770937293, "flos": 17420518287360.0, "grad_norm": 2.1572996212579705, "language_loss": 0.6963082, "learning_rate": 1.882443078095222e-06, "loss": 0.71833849, "num_input_tokens_seen": 95756950, "step": 4434, "time_per_iteration": 2.7957582473754883 }, { "auxiliary_loss_clip": 0.0107699, "auxiliary_loss_mlp": 0.01002075, "balance_loss_clip": 1.0213474, "balance_loss_mlp": 1.00090063, "epoch": 0.5332772199843684, "flos": 56750783627520.0, "grad_norm": 0.8668924633681979, "language_loss": 0.66769803, "learning_rate": 1.8816654621859794e-06, "loss": 0.68848866, "num_input_tokens_seen": 95816615, "step": 4435, "time_per_iteration": 3.2076830863952637 }, { "auxiliary_loss_clip": 0.0117552, "auxiliary_loss_mlp": 0.01028739, "balance_loss_clip": 1.05203283, "balance_loss_mlp": 1.02090693, "epoch": 0.5333974628750076, "flos": 18697753071360.0, "grad_norm": 2.2836943681400057, "language_loss": 0.72484434, "learning_rate": 1.8808878642279915e-06, "loss": 0.74688685, "num_input_tokens_seen": 95832020, "step": 4436, "time_per_iteration": 2.7104976177215576 }, { "auxiliary_loss_clip": 0.01175617, "auxiliary_loss_mlp": 0.01029312, "balance_loss_clip": 1.05414629, "balance_loss_mlp": 1.02136111, "epoch": 0.5335177057656466, "flos": 23805507058560.0, "grad_norm": 2.5128555253594356, "language_loss": 0.65221196, "learning_rate": 1.8801102843392209e-06, "loss": 0.67426127, "num_input_tokens_seen": 95851425, "step": 4437, "time_per_iteration": 2.7269725799560547 }, { "auxiliary_loss_clip": 0.01167864, "auxiliary_loss_mlp": 0.0102969, "balance_loss_clip": 1.05110621, "balance_loss_mlp": 1.02185822, "epoch": 0.5336379486562857, "flos": 25078683605760.0, "grad_norm": 1.7718500702882236, "language_loss": 0.85234463, "learning_rate": 1.8793327226376238e-06, "loss": 0.87432009, "num_input_tokens_seen": 95870745, "step": 4438, "time_per_iteration": 3.830977439880371 }, { "auxiliary_loss_clip": 0.0117992, "auxiliary_loss_mlp": 0.01029843, "balance_loss_clip": 1.05396295, "balance_loss_mlp": 1.02190351, "epoch": 0.5337581915469248, "flos": 21396691140480.0, "grad_norm": 2.7220560518862174, "language_loss": 0.80178726, "learning_rate": 1.8785551792411569e-06, "loss": 0.8238849, "num_input_tokens_seen": 95889755, "step": 4439, "time_per_iteration": 2.7492449283599854 }, { "auxiliary_loss_clip": 0.01172832, "auxiliary_loss_mlp": 0.010264, "balance_loss_clip": 1.05269408, "balance_loss_mlp": 1.01916993, "epoch": 0.5338784344375639, "flos": 14865905064960.0, "grad_norm": 2.1446381344559806, "language_loss": 0.82870793, "learning_rate": 1.8777776542677733e-06, "loss": 0.8507002, "num_input_tokens_seen": 95907805, "step": 4440, "time_per_iteration": 2.8534929752349854 }, { "auxiliary_loss_clip": 0.01168959, "auxiliary_loss_mlp": 0.01021082, "balance_loss_clip": 1.05220747, "balance_loss_mlp": 1.01356602, "epoch": 0.5339986773282029, "flos": 20813501923200.0, "grad_norm": 1.9483733319552443, "language_loss": 0.73098385, "learning_rate": 1.8770001478354216e-06, "loss": 0.75288427, "num_input_tokens_seen": 95927480, "step": 4441, "time_per_iteration": 4.744765520095825 }, { "auxiliary_loss_clip": 0.01173229, "auxiliary_loss_mlp": 0.01025058, "balance_loss_clip": 1.05414581, "balance_loss_mlp": 1.01644516, "epoch": 0.5341189202188421, "flos": 17969089772160.0, "grad_norm": 2.163632938300293, "language_loss": 0.83597863, "learning_rate": 1.8762226600620504e-06, "loss": 0.85796148, "num_input_tokens_seen": 95946095, "step": 4442, "time_per_iteration": 2.726261615753174 }, { "auxiliary_loss_clip": 0.01184261, "auxiliary_loss_mlp": 0.01031774, "balance_loss_clip": 1.05634284, "balance_loss_mlp": 1.02362013, "epoch": 0.5342391631094812, "flos": 11031866328960.0, "grad_norm": 2.8093687537281156, "language_loss": 0.58922625, "learning_rate": 1.8754451910656031e-06, "loss": 0.61138666, "num_input_tokens_seen": 95959995, "step": 4443, "time_per_iteration": 2.6624228954315186 }, { "auxiliary_loss_clip": 0.01176995, "auxiliary_loss_mlp": 0.01025379, "balance_loss_clip": 1.05502629, "balance_loss_mlp": 1.0178746, "epoch": 0.5343594060001202, "flos": 15339135772800.0, "grad_norm": 1.9125837738889961, "language_loss": 0.82925546, "learning_rate": 1.8746677409640212e-06, "loss": 0.85127914, "num_input_tokens_seen": 95977095, "step": 4444, "time_per_iteration": 2.8167240619659424 }, { "auxiliary_loss_clip": 0.01179633, "auxiliary_loss_mlp": 0.01030528, "balance_loss_clip": 1.05456173, "balance_loss_mlp": 1.02284455, "epoch": 0.5344796488907594, "flos": 26900898514560.0, "grad_norm": 3.058246648564391, "language_loss": 0.84904641, "learning_rate": 1.8738903098752432e-06, "loss": 0.87114799, "num_input_tokens_seen": 95996225, "step": 4445, "time_per_iteration": 2.762343168258667 }, { "auxiliary_loss_clip": 0.0117627, "auxiliary_loss_mlp": 0.0102859, "balance_loss_clip": 1.0559088, "balance_loss_mlp": 1.02097201, "epoch": 0.5345998917813984, "flos": 25411216740480.0, "grad_norm": 2.125646795394592, "language_loss": 0.73149025, "learning_rate": 1.8731128979172052e-06, "loss": 0.75353885, "num_input_tokens_seen": 96015425, "step": 4446, "time_per_iteration": 2.775108814239502 }, { "auxiliary_loss_clip": 0.0116654, "auxiliary_loss_mlp": 0.01026494, "balance_loss_clip": 1.05009091, "balance_loss_mlp": 1.01860201, "epoch": 0.5347201346720375, "flos": 32853379622400.0, "grad_norm": 2.328217939955704, "language_loss": 0.67914557, "learning_rate": 1.8723355052078394e-06, "loss": 0.70107591, "num_input_tokens_seen": 96035460, "step": 4447, "time_per_iteration": 3.93646502494812 }, { "auxiliary_loss_clip": 0.01176479, "auxiliary_loss_mlp": 0.01026549, "balance_loss_clip": 1.05466592, "balance_loss_mlp": 1.01835954, "epoch": 0.5348403775626767, "flos": 17967940536960.0, "grad_norm": 2.071383347230443, "language_loss": 0.77154803, "learning_rate": 1.8715581318650765e-06, "loss": 0.79357839, "num_input_tokens_seen": 96054515, "step": 4448, "time_per_iteration": 2.7531065940856934 }, { "auxiliary_loss_clip": 0.01181214, "auxiliary_loss_mlp": 0.0102933, "balance_loss_clip": 1.05594587, "balance_loss_mlp": 1.02143574, "epoch": 0.5349606204533157, "flos": 17603339535360.0, "grad_norm": 2.9538393134530803, "language_loss": 0.81586635, "learning_rate": 1.8707807780068422e-06, "loss": 0.83797181, "num_input_tokens_seen": 96072330, "step": 4449, "time_per_iteration": 2.7542264461517334 }, { "auxiliary_loss_clip": 0.0116994, "auxiliary_loss_mlp": 0.01026137, "balance_loss_clip": 1.04949212, "balance_loss_mlp": 1.01794147, "epoch": 0.5350808633439548, "flos": 29167831710720.0, "grad_norm": 2.1304908060895467, "language_loss": 0.66230172, "learning_rate": 1.8700034437510611e-06, "loss": 0.68426245, "num_input_tokens_seen": 96092425, "step": 4450, "time_per_iteration": 2.867838144302368 }, { "auxiliary_loss_clip": 0.01162671, "auxiliary_loss_mlp": 0.01028533, "balance_loss_clip": 1.05265462, "balance_loss_mlp": 1.02034926, "epoch": 0.5352011062345938, "flos": 19499997381120.0, "grad_norm": 2.5675698684906134, "language_loss": 0.81892854, "learning_rate": 1.8692261292156549e-06, "loss": 0.84084058, "num_input_tokens_seen": 96111660, "step": 4451, "time_per_iteration": 2.8543941974639893 }, { "auxiliary_loss_clip": 0.01179153, "auxiliary_loss_mlp": 0.01029068, "balance_loss_clip": 1.05496693, "balance_loss_mlp": 1.02158427, "epoch": 0.535321349125233, "flos": 23477642691840.0, "grad_norm": 1.9286293410896886, "language_loss": 0.81141651, "learning_rate": 1.8684488345185401e-06, "loss": 0.83349872, "num_input_tokens_seen": 96131835, "step": 4452, "time_per_iteration": 2.736074924468994 }, { "auxiliary_loss_clip": 0.01181895, "auxiliary_loss_mlp": 0.01030101, "balance_loss_clip": 1.05544972, "balance_loss_mlp": 1.02174425, "epoch": 0.535441592015872, "flos": 20478059786880.0, "grad_norm": 2.3695296381506052, "language_loss": 0.78823215, "learning_rate": 1.8676715597776332e-06, "loss": 0.81035215, "num_input_tokens_seen": 96150180, "step": 4453, "time_per_iteration": 2.642721652984619 }, { "auxiliary_loss_clip": 0.01159963, "auxiliary_loss_mlp": 0.01024068, "balance_loss_clip": 1.05328858, "balance_loss_mlp": 1.01664138, "epoch": 0.5355618349065111, "flos": 19573147428480.0, "grad_norm": 1.800564901570183, "language_loss": 0.76290494, "learning_rate": 1.8668943051108455e-06, "loss": 0.78474522, "num_input_tokens_seen": 96167485, "step": 4454, "time_per_iteration": 2.7954530715942383 }, { "auxiliary_loss_clip": 0.01171646, "auxiliary_loss_mlp": 0.01031308, "balance_loss_clip": 1.05320203, "balance_loss_mlp": 1.02300549, "epoch": 0.5356820777971503, "flos": 24024633978240.0, "grad_norm": 2.0412466884144207, "language_loss": 0.76324618, "learning_rate": 1.8661170706360856e-06, "loss": 0.78527582, "num_input_tokens_seen": 96186650, "step": 4455, "time_per_iteration": 2.757098436355591 }, { "auxiliary_loss_clip": 0.0117267, "auxiliary_loss_mlp": 0.0102669, "balance_loss_clip": 1.05218661, "balance_loss_mlp": 1.01954961, "epoch": 0.5358023206877893, "flos": 20884676722560.0, "grad_norm": 1.610917620216123, "language_loss": 0.81633079, "learning_rate": 1.8653398564712594e-06, "loss": 0.83832443, "num_input_tokens_seen": 96205595, "step": 4456, "time_per_iteration": 2.7793338298797607 }, { "auxiliary_loss_clip": 0.0116895, "auxiliary_loss_mlp": 0.01032443, "balance_loss_clip": 1.05152452, "balance_loss_mlp": 1.02459347, "epoch": 0.5359225635784284, "flos": 22418996123520.0, "grad_norm": 1.6210572752954788, "language_loss": 0.82380688, "learning_rate": 1.8645626627342704e-06, "loss": 0.84582078, "num_input_tokens_seen": 96226360, "step": 4457, "time_per_iteration": 2.720081329345703 }, { "auxiliary_loss_clip": 0.01178152, "auxiliary_loss_mlp": 0.0102565, "balance_loss_clip": 1.05252373, "balance_loss_mlp": 1.01814032, "epoch": 0.5360428064690675, "flos": 24097784025600.0, "grad_norm": 2.798184779596833, "language_loss": 0.81110936, "learning_rate": 1.8637854895430172e-06, "loss": 0.83314741, "num_input_tokens_seen": 96245625, "step": 4458, "time_per_iteration": 2.78916335105896 }, { "auxiliary_loss_clip": 0.0116261, "auxiliary_loss_mlp": 0.01023442, "balance_loss_clip": 1.05141068, "balance_loss_mlp": 1.0148052, "epoch": 0.5361630493597066, "flos": 21434505183360.0, "grad_norm": 2.2557320726291215, "language_loss": 0.69194984, "learning_rate": 1.8630083370153978e-06, "loss": 0.71381032, "num_input_tokens_seen": 96265265, "step": 4459, "time_per_iteration": 2.6807098388671875 }, { "auxiliary_loss_clip": 0.01077996, "auxiliary_loss_mlp": 0.01000637, "balance_loss_clip": 1.02077842, "balance_loss_mlp": 0.99931997, "epoch": 0.5362832922503457, "flos": 68888696520960.0, "grad_norm": 0.868184958237176, "language_loss": 0.55390555, "learning_rate": 1.8622312052693041e-06, "loss": 0.57469189, "num_input_tokens_seen": 96326445, "step": 4460, "time_per_iteration": 3.502281904220581 }, { "auxiliary_loss_clip": 0.01168466, "auxiliary_loss_mlp": 0.01017345, "balance_loss_clip": 1.05027568, "balance_loss_mlp": 1.01034999, "epoch": 0.5364035351409848, "flos": 9793702563840.0, "grad_norm": 2.266473552449144, "language_loss": 0.71544182, "learning_rate": 1.8614540944226267e-06, "loss": 0.73729992, "num_input_tokens_seen": 96343115, "step": 4461, "time_per_iteration": 2.7838146686553955 }, { "auxiliary_loss_clip": 0.01166968, "auxiliary_loss_mlp": 0.01029226, "balance_loss_clip": 1.05219781, "balance_loss_mlp": 1.02166772, "epoch": 0.5365237780316239, "flos": 23290080848640.0, "grad_norm": 1.9861850439596747, "language_loss": 0.6831094, "learning_rate": 1.8606770045932537e-06, "loss": 0.70507133, "num_input_tokens_seen": 96362230, "step": 4462, "time_per_iteration": 2.721665859222412 }, { "auxiliary_loss_clip": 0.01166905, "auxiliary_loss_mlp": 0.01025941, "balance_loss_clip": 1.05387974, "balance_loss_mlp": 1.01731634, "epoch": 0.5366440209222629, "flos": 26578133879040.0, "grad_norm": 2.565166993806122, "language_loss": 0.81258893, "learning_rate": 1.859899935899068e-06, "loss": 0.83451742, "num_input_tokens_seen": 96382085, "step": 4463, "time_per_iteration": 2.854053497314453 }, { "auxiliary_loss_clip": 0.01168778, "auxiliary_loss_mlp": 0.01031141, "balance_loss_clip": 1.05409002, "balance_loss_mlp": 1.02318954, "epoch": 0.5367642638129021, "flos": 19608052469760.0, "grad_norm": 1.7011927432173604, "language_loss": 0.79094648, "learning_rate": 1.8591228884579506e-06, "loss": 0.81294566, "num_input_tokens_seen": 96400580, "step": 4464, "time_per_iteration": 2.8768441677093506 }, { "auxiliary_loss_clip": 0.01171108, "auxiliary_loss_mlp": 0.0102897, "balance_loss_clip": 1.05161238, "balance_loss_mlp": 1.0209589, "epoch": 0.5368845067035412, "flos": 23915214172800.0, "grad_norm": 3.2400200142319173, "language_loss": 0.82268184, "learning_rate": 1.8583458623877795e-06, "loss": 0.84468263, "num_input_tokens_seen": 96419680, "step": 4465, "time_per_iteration": 3.838975191116333 }, { "auxiliary_loss_clip": 0.01176289, "auxiliary_loss_mlp": 0.01031972, "balance_loss_clip": 1.05228448, "balance_loss_mlp": 1.02405655, "epoch": 0.5370047495941802, "flos": 16873131951360.0, "grad_norm": 2.009425697426401, "language_loss": 0.73953605, "learning_rate": 1.8575688578064281e-06, "loss": 0.76161873, "num_input_tokens_seen": 96437805, "step": 4466, "time_per_iteration": 3.577840805053711 }, { "auxiliary_loss_clip": 0.01176595, "auxiliary_loss_mlp": 0.01024435, "balance_loss_clip": 1.05386806, "balance_loss_mlp": 1.01685905, "epoch": 0.5371249924848194, "flos": 20740926493440.0, "grad_norm": 2.023211743713731, "language_loss": 0.76827538, "learning_rate": 1.8567918748317674e-06, "loss": 0.79028571, "num_input_tokens_seen": 96457155, "step": 4467, "time_per_iteration": 3.7516024112701416 }, { "auxiliary_loss_clip": 0.01172242, "auxiliary_loss_mlp": 0.01028609, "balance_loss_clip": 1.05302942, "balance_loss_mlp": 1.02047884, "epoch": 0.5372452353754584, "flos": 17968120104960.0, "grad_norm": 1.9348606656467353, "language_loss": 0.8290472, "learning_rate": 1.8560149135816659e-06, "loss": 0.85105568, "num_input_tokens_seen": 96473990, "step": 4468, "time_per_iteration": 2.809648036956787 }, { "auxiliary_loss_clip": 0.01171239, "auxiliary_loss_mlp": 0.01021933, "balance_loss_clip": 1.05097115, "balance_loss_mlp": 1.01421404, "epoch": 0.5373654782660975, "flos": 15377021642880.0, "grad_norm": 2.483062301913845, "language_loss": 0.84628928, "learning_rate": 1.8552379741739873e-06, "loss": 0.86822104, "num_input_tokens_seen": 96491335, "step": 4469, "time_per_iteration": 2.72641921043396 }, { "auxiliary_loss_clip": 0.01076081, "auxiliary_loss_mlp": 0.01042728, "balance_loss_clip": 1.02188027, "balance_loss_mlp": 0.99832726, "epoch": 0.5374857211567367, "flos": 69000091574400.0, "grad_norm": 1.6403525344444236, "language_loss": 0.55575073, "learning_rate": 1.8544610567265935e-06, "loss": 0.57693875, "num_input_tokens_seen": 96545275, "step": 4470, "time_per_iteration": 3.259629249572754 }, { "auxiliary_loss_clip": 0.01173008, "auxiliary_loss_mlp": 0.01063004, "balance_loss_clip": 1.05465269, "balance_loss_mlp": 1.0243783, "epoch": 0.5376059640473757, "flos": 15085355207040.0, "grad_norm": 1.9050021241344035, "language_loss": 0.83197004, "learning_rate": 1.853684161357341e-06, "loss": 0.85433018, "num_input_tokens_seen": 96562935, "step": 4471, "time_per_iteration": 2.7688121795654297 }, { "auxiliary_loss_clip": 0.01172598, "auxiliary_loss_mlp": 0.01055522, "balance_loss_clip": 1.05209398, "balance_loss_mlp": 1.01805282, "epoch": 0.5377262069380148, "flos": 19792597570560.0, "grad_norm": 1.8548958070724926, "language_loss": 0.76593375, "learning_rate": 1.852907288184085e-06, "loss": 0.78821492, "num_input_tokens_seen": 96581820, "step": 4472, "time_per_iteration": 2.8088741302490234 }, { "auxiliary_loss_clip": 0.01168128, "auxiliary_loss_mlp": 0.01024082, "balance_loss_clip": 1.05164456, "balance_loss_mlp": 1.01552272, "epoch": 0.5378464498286539, "flos": 30003077640960.0, "grad_norm": 2.3785803211404994, "language_loss": 0.70180881, "learning_rate": 1.8521304373246762e-06, "loss": 0.72373092, "num_input_tokens_seen": 96602865, "step": 4473, "time_per_iteration": 3.7506723403930664 }, { "auxiliary_loss_clip": 0.01180217, "auxiliary_loss_mlp": 0.01031245, "balance_loss_clip": 1.05512929, "balance_loss_mlp": 1.02273917, "epoch": 0.537966692719293, "flos": 21251217058560.0, "grad_norm": 2.516583650808495, "language_loss": 0.89257228, "learning_rate": 1.8513536088969626e-06, "loss": 0.91468692, "num_input_tokens_seen": 96620530, "step": 4474, "time_per_iteration": 2.7904739379882812 }, { "auxiliary_loss_clip": 0.01178571, "auxiliary_loss_mlp": 0.01031258, "balance_loss_clip": 1.05476451, "balance_loss_mlp": 1.02316022, "epoch": 0.538086935609932, "flos": 21543170803200.0, "grad_norm": 1.7010402031539082, "language_loss": 0.80372649, "learning_rate": 1.8505768030187884e-06, "loss": 0.82582474, "num_input_tokens_seen": 96640660, "step": 4475, "time_per_iteration": 2.7369964122772217 }, { "auxiliary_loss_clip": 0.01166559, "auxiliary_loss_mlp": 0.01025923, "balance_loss_clip": 1.05247545, "balance_loss_mlp": 1.0182637, "epoch": 0.5382071785005712, "flos": 22747219626240.0, "grad_norm": 1.6475493910289711, "language_loss": 0.79990482, "learning_rate": 1.849800019807995e-06, "loss": 0.82182968, "num_input_tokens_seen": 96661885, "step": 4476, "time_per_iteration": 2.856013059616089 }, { "auxiliary_loss_clip": 0.0116858, "auxiliary_loss_mlp": 0.01030201, "balance_loss_clip": 1.05286145, "balance_loss_mlp": 1.02253604, "epoch": 0.5383274213912103, "flos": 24934574240640.0, "grad_norm": 2.2072229502596366, "language_loss": 0.71027732, "learning_rate": 1.8490232593824186e-06, "loss": 0.73226517, "num_input_tokens_seen": 96678340, "step": 4477, "time_per_iteration": 2.763977289199829 }, { "auxiliary_loss_clip": 0.01169645, "auxiliary_loss_mlp": 0.01025012, "balance_loss_clip": 1.05304575, "balance_loss_mlp": 1.01713848, "epoch": 0.5384476642818493, "flos": 22310186849280.0, "grad_norm": 1.7108450553791408, "language_loss": 0.84950143, "learning_rate": 1.8482465218598935e-06, "loss": 0.87144804, "num_input_tokens_seen": 96698285, "step": 4478, "time_per_iteration": 2.7895984649658203 }, { "auxiliary_loss_clip": 0.0117086, "auxiliary_loss_mlp": 0.01029233, "balance_loss_clip": 1.05436277, "balance_loss_mlp": 1.02046514, "epoch": 0.5385679071724885, "flos": 22711021695360.0, "grad_norm": 1.6745574716146696, "language_loss": 0.83585751, "learning_rate": 1.8474698073582508e-06, "loss": 0.85785842, "num_input_tokens_seen": 96719655, "step": 4479, "time_per_iteration": 2.8115453720092773 }, { "auxiliary_loss_clip": 0.0117602, "auxiliary_loss_mlp": 0.01028534, "balance_loss_clip": 1.05285239, "balance_loss_mlp": 1.02070189, "epoch": 0.5386881500631275, "flos": 15953746412160.0, "grad_norm": 2.061679210529953, "language_loss": 0.8677054, "learning_rate": 1.8466931159953166e-06, "loss": 0.8897509, "num_input_tokens_seen": 96736290, "step": 4480, "time_per_iteration": 2.861546277999878 }, { "auxiliary_loss_clip": 0.01174339, "auxiliary_loss_mlp": 0.010303, "balance_loss_clip": 1.05358469, "balance_loss_mlp": 1.02215862, "epoch": 0.5388083929537666, "flos": 24060041809920.0, "grad_norm": 4.86420273140471, "language_loss": 0.84026557, "learning_rate": 1.8459164478889158e-06, "loss": 0.86231196, "num_input_tokens_seen": 96757685, "step": 4481, "time_per_iteration": 2.8587896823883057 }, { "auxiliary_loss_clip": 0.01161721, "auxiliary_loss_mlp": 0.01026732, "balance_loss_clip": 1.05023241, "balance_loss_mlp": 1.01913857, "epoch": 0.5389286358444056, "flos": 22236893147520.0, "grad_norm": 2.2188147341616893, "language_loss": 0.76405811, "learning_rate": 1.8451398031568663e-06, "loss": 0.78594273, "num_input_tokens_seen": 96777310, "step": 4482, "time_per_iteration": 2.9240429401397705 }, { "auxiliary_loss_clip": 0.01169396, "auxiliary_loss_mlp": 0.01033012, "balance_loss_clip": 1.05476749, "balance_loss_mlp": 1.02468491, "epoch": 0.5390488787350448, "flos": 24281718595200.0, "grad_norm": 1.5323513606331842, "language_loss": 0.74456489, "learning_rate": 1.844363181916986e-06, "loss": 0.76658893, "num_input_tokens_seen": 96798035, "step": 4483, "time_per_iteration": 2.8530077934265137 }, { "auxiliary_loss_clip": 0.01176379, "auxiliary_loss_mlp": 0.01029283, "balance_loss_clip": 1.05377913, "balance_loss_mlp": 1.02102745, "epoch": 0.5391691216256839, "flos": 16581393688320.0, "grad_norm": 2.282581695369296, "language_loss": 0.83084762, "learning_rate": 1.8435865842870868e-06, "loss": 0.85290426, "num_input_tokens_seen": 96815975, "step": 4484, "time_per_iteration": 2.715111017227173 }, { "auxiliary_loss_clip": 0.01165467, "auxiliary_loss_mlp": 0.01056176, "balance_loss_clip": 1.05190754, "balance_loss_mlp": 1.01836145, "epoch": 0.5392893645163229, "flos": 23330049707520.0, "grad_norm": 2.539316317029254, "language_loss": 0.72082204, "learning_rate": 1.8428100103849787e-06, "loss": 0.74303842, "num_input_tokens_seen": 96835770, "step": 4485, "time_per_iteration": 2.832498073577881 }, { "auxiliary_loss_clip": 0.01172869, "auxiliary_loss_mlp": 0.01028655, "balance_loss_clip": 1.05607104, "balance_loss_mlp": 1.02082944, "epoch": 0.5394096074069621, "flos": 15669801400320.0, "grad_norm": 4.2880109051913795, "language_loss": 0.72965068, "learning_rate": 1.842033460328467e-06, "loss": 0.75166595, "num_input_tokens_seen": 96854490, "step": 4486, "time_per_iteration": 2.767418622970581 }, { "auxiliary_loss_clip": 0.011767, "auxiliary_loss_mlp": 0.01053397, "balance_loss_clip": 1.05409074, "balance_loss_mlp": 1.01580453, "epoch": 0.5395298502976011, "flos": 22893447893760.0, "grad_norm": 1.6194996256125955, "language_loss": 0.75020987, "learning_rate": 1.8412569342353541e-06, "loss": 0.77251083, "num_input_tokens_seen": 96874645, "step": 4487, "time_per_iteration": 2.8432180881500244 }, { "auxiliary_loss_clip": 0.0117853, "auxiliary_loss_mlp": 0.01028707, "balance_loss_clip": 1.05462277, "balance_loss_mlp": 1.0203445, "epoch": 0.5396500931882402, "flos": 23842135952640.0, "grad_norm": 1.71031480564065, "language_loss": 0.84838617, "learning_rate": 1.840480432223438e-06, "loss": 0.87045848, "num_input_tokens_seen": 96893650, "step": 4488, "time_per_iteration": 2.7605295181274414 }, { "auxiliary_loss_clip": 0.01174146, "auxiliary_loss_mlp": 0.01025834, "balance_loss_clip": 1.05194068, "balance_loss_mlp": 1.01798415, "epoch": 0.5397703360788794, "flos": 26322988596480.0, "grad_norm": 1.9739637402434211, "language_loss": 0.7736358, "learning_rate": 1.8397039544105131e-06, "loss": 0.79563558, "num_input_tokens_seen": 96912735, "step": 4489, "time_per_iteration": 2.7498347759246826 }, { "auxiliary_loss_clip": 0.01169543, "auxiliary_loss_mlp": 0.01031316, "balance_loss_clip": 1.05225897, "balance_loss_mlp": 1.02328181, "epoch": 0.5398905789695184, "flos": 21214588164480.0, "grad_norm": 3.8471132891996125, "language_loss": 0.69743228, "learning_rate": 1.8389275009143711e-06, "loss": 0.71944082, "num_input_tokens_seen": 96932475, "step": 4490, "time_per_iteration": 2.7771825790405273 }, { "auxiliary_loss_clip": 0.01174991, "auxiliary_loss_mlp": 0.01024838, "balance_loss_clip": 1.05129409, "balance_loss_mlp": 1.01759636, "epoch": 0.5400108218601575, "flos": 25080335631360.0, "grad_norm": 4.279192928835243, "language_loss": 0.73694265, "learning_rate": 1.8381510718527988e-06, "loss": 0.75894094, "num_input_tokens_seen": 96952085, "step": 4491, "time_per_iteration": 3.629112720489502 }, { "auxiliary_loss_clip": 0.01178135, "auxiliary_loss_mlp": 0.01029642, "balance_loss_clip": 1.05374146, "balance_loss_mlp": 1.02159595, "epoch": 0.5401310647507966, "flos": 26357498588160.0, "grad_norm": 1.9605222449324244, "language_loss": 0.63163376, "learning_rate": 1.8373746673435812e-06, "loss": 0.65371156, "num_input_tokens_seen": 96973110, "step": 4492, "time_per_iteration": 2.7183618545532227 }, { "auxiliary_loss_clip": 0.01181597, "auxiliary_loss_mlp": 0.01035988, "balance_loss_clip": 1.05572152, "balance_loss_mlp": 1.02782202, "epoch": 0.5402513076414357, "flos": 27855332749440.0, "grad_norm": 1.6514444040336411, "language_loss": 0.7900852, "learning_rate": 1.8365982875044964e-06, "loss": 0.81226099, "num_input_tokens_seen": 96993420, "step": 4493, "time_per_iteration": 4.700037240982056 }, { "auxiliary_loss_clip": 0.01181706, "auxiliary_loss_mlp": 0.0105629, "balance_loss_clip": 1.05359125, "balance_loss_mlp": 1.01914752, "epoch": 0.5403715505320748, "flos": 22893771116160.0, "grad_norm": 2.456014986512284, "language_loss": 0.76331615, "learning_rate": 1.8358219324533217e-06, "loss": 0.78569609, "num_input_tokens_seen": 97013685, "step": 4494, "time_per_iteration": 2.7252461910247803 }, { "auxiliary_loss_clip": 0.01168529, "auxiliary_loss_mlp": 0.01026638, "balance_loss_clip": 1.05109239, "balance_loss_mlp": 1.01922357, "epoch": 0.5404917934227139, "flos": 30224143895040.0, "grad_norm": 1.7047000940016899, "language_loss": 0.70260942, "learning_rate": 1.8350456023078292e-06, "loss": 0.7245611, "num_input_tokens_seen": 97036060, "step": 4495, "time_per_iteration": 2.8700461387634277 }, { "auxiliary_loss_clip": 0.01183124, "auxiliary_loss_mlp": 0.01027615, "balance_loss_clip": 1.05284095, "balance_loss_mlp": 1.01882958, "epoch": 0.540612036313353, "flos": 19938502615680.0, "grad_norm": 3.0410492958864372, "language_loss": 0.78161287, "learning_rate": 1.8342692971857874e-06, "loss": 0.80372024, "num_input_tokens_seen": 97055260, "step": 4496, "time_per_iteration": 2.71116042137146 }, { "auxiliary_loss_clip": 0.01168748, "auxiliary_loss_mlp": 0.01025325, "balance_loss_clip": 1.05153739, "balance_loss_mlp": 1.01736128, "epoch": 0.540732279203992, "flos": 24279599692800.0, "grad_norm": 2.280306087430748, "language_loss": 0.70393372, "learning_rate": 1.833493017204962e-06, "loss": 0.72587442, "num_input_tokens_seen": 97075365, "step": 4497, "time_per_iteration": 2.7393884658813477 }, { "auxiliary_loss_clip": 0.01178404, "auxiliary_loss_mlp": 0.01026584, "balance_loss_clip": 1.05156076, "balance_loss_mlp": 1.01875746, "epoch": 0.5408525220946312, "flos": 20193216935040.0, "grad_norm": 2.0305202788830554, "language_loss": 0.78009903, "learning_rate": 1.8327167624831134e-06, "loss": 0.80214894, "num_input_tokens_seen": 97093095, "step": 4498, "time_per_iteration": 2.7251603603363037 }, { "auxiliary_loss_clip": 0.01179064, "auxiliary_loss_mlp": 0.01029584, "balance_loss_clip": 1.05426049, "balance_loss_mlp": 1.02210402, "epoch": 0.5409727649852702, "flos": 24134448833280.0, "grad_norm": 1.6620214119471541, "language_loss": 0.70971274, "learning_rate": 1.831940533137999e-06, "loss": 0.73179913, "num_input_tokens_seen": 97112000, "step": 4499, "time_per_iteration": 3.6999588012695312 }, { "auxiliary_loss_clip": 0.01172137, "auxiliary_loss_mlp": 0.01032102, "balance_loss_clip": 1.05353332, "balance_loss_mlp": 1.0240072, "epoch": 0.5410930078759093, "flos": 23912700220800.0, "grad_norm": 1.627582172454016, "language_loss": 0.7222963, "learning_rate": 1.8311643292873718e-06, "loss": 0.74433863, "num_input_tokens_seen": 97130820, "step": 4500, "time_per_iteration": 2.7092647552490234 }, { "auxiliary_loss_clip": 0.01169603, "auxiliary_loss_mlp": 0.01027995, "balance_loss_clip": 1.05078006, "balance_loss_mlp": 1.02044272, "epoch": 0.5412132507665485, "flos": 21105132445440.0, "grad_norm": 2.0285778432763566, "language_loss": 0.8810485, "learning_rate": 1.8303881510489818e-06, "loss": 0.90302444, "num_input_tokens_seen": 97149210, "step": 4501, "time_per_iteration": 2.7107372283935547 }, { "auxiliary_loss_clip": 0.01173994, "auxiliary_loss_mlp": 0.01028208, "balance_loss_clip": 1.05329037, "balance_loss_mlp": 1.0200417, "epoch": 0.5413334936571875, "flos": 30227340205440.0, "grad_norm": 2.4785181955935225, "language_loss": 0.69106436, "learning_rate": 1.829611998540574e-06, "loss": 0.71308637, "num_input_tokens_seen": 97170415, "step": 4502, "time_per_iteration": 2.812225818634033 }, { "auxiliary_loss_clip": 0.01173558, "auxiliary_loss_mlp": 0.01058107, "balance_loss_clip": 1.05006266, "balance_loss_mlp": 1.02110541, "epoch": 0.5414537365478266, "flos": 24279635606400.0, "grad_norm": 2.0826701810857915, "language_loss": 0.79680765, "learning_rate": 1.8288358718798914e-06, "loss": 0.81912428, "num_input_tokens_seen": 97189605, "step": 4503, "time_per_iteration": 2.7877275943756104 }, { "auxiliary_loss_clip": 0.01172038, "auxiliary_loss_mlp": 0.01055916, "balance_loss_clip": 1.05283725, "balance_loss_mlp": 1.01830733, "epoch": 0.5415739794384657, "flos": 16654543735680.0, "grad_norm": 1.6427907812442697, "language_loss": 0.72489405, "learning_rate": 1.8280597711846703e-06, "loss": 0.74717367, "num_input_tokens_seen": 97207845, "step": 4504, "time_per_iteration": 2.8274617195129395 }, { "auxiliary_loss_clip": 0.01171999, "auxiliary_loss_mlp": 0.01028082, "balance_loss_clip": 1.05285478, "balance_loss_mlp": 1.0201484, "epoch": 0.5416942223291048, "flos": 23185724860800.0, "grad_norm": 2.207761678961384, "language_loss": 0.83382857, "learning_rate": 1.8272836965726455e-06, "loss": 0.85582936, "num_input_tokens_seen": 97226780, "step": 4505, "time_per_iteration": 2.910062074661255 }, { "auxiliary_loss_clip": 0.01168737, "auxiliary_loss_mlp": 0.01032372, "balance_loss_clip": 1.05592299, "balance_loss_mlp": 1.02406931, "epoch": 0.5418144652197439, "flos": 20303247271680.0, "grad_norm": 1.7097839783855826, "language_loss": 0.78226399, "learning_rate": 1.8265076481615461e-06, "loss": 0.8042751, "num_input_tokens_seen": 97246695, "step": 4506, "time_per_iteration": 2.7571768760681152 }, { "auxiliary_loss_clip": 0.01173156, "auxiliary_loss_mlp": 0.01025046, "balance_loss_clip": 1.05613708, "balance_loss_mlp": 1.01724946, "epoch": 0.541934708110383, "flos": 12458633431680.0, "grad_norm": 2.2795390167169582, "language_loss": 0.87575471, "learning_rate": 1.8257316260690987e-06, "loss": 0.89773673, "num_input_tokens_seen": 97264480, "step": 4507, "time_per_iteration": 2.8309860229492188 }, { "auxiliary_loss_clip": 0.01173409, "auxiliary_loss_mlp": 0.01028257, "balance_loss_clip": 1.05028772, "balance_loss_mlp": 1.02093136, "epoch": 0.5420549510010221, "flos": 21253802837760.0, "grad_norm": 1.49890322982977, "language_loss": 0.75896549, "learning_rate": 1.8249556304130254e-06, "loss": 0.78098214, "num_input_tokens_seen": 97285760, "step": 4508, "time_per_iteration": 2.688361644744873 }, { "auxiliary_loss_clip": 0.01163654, "auxiliary_loss_mlp": 0.01030178, "balance_loss_clip": 1.05161345, "balance_loss_mlp": 1.02178597, "epoch": 0.5421751938916611, "flos": 29490524519040.0, "grad_norm": 2.8482096206828857, "language_loss": 0.68612564, "learning_rate": 1.824179661311044e-06, "loss": 0.70806396, "num_input_tokens_seen": 97304510, "step": 4509, "time_per_iteration": 2.7911345958709717 }, { "auxiliary_loss_clip": 0.01167014, "auxiliary_loss_mlp": 0.01028231, "balance_loss_clip": 1.05181825, "balance_loss_mlp": 1.02010059, "epoch": 0.5422954367823003, "flos": 18734238311040.0, "grad_norm": 2.0922046648822645, "language_loss": 0.79772401, "learning_rate": 1.823403718880868e-06, "loss": 0.8196764, "num_input_tokens_seen": 97323270, "step": 4510, "time_per_iteration": 2.7831544876098633 }, { "auxiliary_loss_clip": 0.01176083, "auxiliary_loss_mlp": 0.0102628, "balance_loss_clip": 1.05358684, "balance_loss_mlp": 1.01771498, "epoch": 0.5424156796729394, "flos": 39969006940800.0, "grad_norm": 1.625155108426562, "language_loss": 0.66442716, "learning_rate": 1.822627803240207e-06, "loss": 0.68645084, "num_input_tokens_seen": 97345600, "step": 4511, "time_per_iteration": 2.9031128883361816 }, { "auxiliary_loss_clip": 0.01174577, "auxiliary_loss_mlp": 0.01031183, "balance_loss_clip": 1.05406296, "balance_loss_mlp": 1.02297211, "epoch": 0.5425359225635784, "flos": 11546538353280.0, "grad_norm": 2.5847905022902826, "language_loss": 0.85312831, "learning_rate": 1.8218519145067675e-06, "loss": 0.87518597, "num_input_tokens_seen": 97361220, "step": 4512, "time_per_iteration": 2.737236738204956 }, { "auxiliary_loss_clip": 0.01166881, "auxiliary_loss_mlp": 0.01027519, "balance_loss_clip": 1.05352306, "balance_loss_mlp": 1.01993752, "epoch": 0.5426561654542175, "flos": 20229702174720.0, "grad_norm": 1.8048973685105105, "language_loss": 0.89743114, "learning_rate": 1.8210760527982508e-06, "loss": 0.91937518, "num_input_tokens_seen": 97381505, "step": 4513, "time_per_iteration": 2.7400999069213867 }, { "auxiliary_loss_clip": 0.01175299, "auxiliary_loss_mlp": 0.01049033, "balance_loss_clip": 1.05449641, "balance_loss_mlp": 1.01321697, "epoch": 0.5427764083448566, "flos": 21871681614720.0, "grad_norm": 5.131622122070606, "language_loss": 0.75328177, "learning_rate": 1.8203002182323552e-06, "loss": 0.77552509, "num_input_tokens_seen": 97399060, "step": 4514, "time_per_iteration": 2.778494119644165 }, { "auxiliary_loss_clip": 0.01172624, "auxiliary_loss_mlp": 0.01025291, "balance_loss_clip": 1.05343473, "balance_loss_mlp": 1.01742315, "epoch": 0.5428966512354957, "flos": 19640946349440.0, "grad_norm": 1.8586448034257985, "language_loss": 0.76013368, "learning_rate": 1.819524410926773e-06, "loss": 0.78211284, "num_input_tokens_seen": 97416740, "step": 4515, "time_per_iteration": 2.776193857192993 }, { "auxiliary_loss_clip": 0.01162863, "auxiliary_loss_mlp": 0.01028509, "balance_loss_clip": 1.05388522, "balance_loss_mlp": 1.02041507, "epoch": 0.5430168941261347, "flos": 22382187661440.0, "grad_norm": 1.581016125142236, "language_loss": 0.76954025, "learning_rate": 1.8187486309991944e-06, "loss": 0.79145396, "num_input_tokens_seen": 97437620, "step": 4516, "time_per_iteration": 2.800083637237549 }, { "auxiliary_loss_clip": 0.01183186, "auxiliary_loss_mlp": 0.01024824, "balance_loss_clip": 1.05613112, "balance_loss_mlp": 1.01759434, "epoch": 0.5431371370167739, "flos": 18764187275520.0, "grad_norm": 2.954684873256488, "language_loss": 0.77812409, "learning_rate": 1.817972878567304e-06, "loss": 0.80020416, "num_input_tokens_seen": 97456275, "step": 4517, "time_per_iteration": 3.6948206424713135 }, { "auxiliary_loss_clip": 0.0117515, "auxiliary_loss_mlp": 0.01028329, "balance_loss_clip": 1.05126858, "balance_loss_mlp": 1.02052665, "epoch": 0.543257379907413, "flos": 18806023641600.0, "grad_norm": 1.7675534375895177, "language_loss": 0.76593077, "learning_rate": 1.8171971537487834e-06, "loss": 0.78796554, "num_input_tokens_seen": 97474925, "step": 4518, "time_per_iteration": 2.712717294692993 }, { "auxiliary_loss_clip": 0.01180623, "auxiliary_loss_mlp": 0.01033586, "balance_loss_clip": 1.05212891, "balance_loss_mlp": 1.02585578, "epoch": 0.543377622798052, "flos": 17493381025920.0, "grad_norm": 1.976994473330852, "language_loss": 0.80724859, "learning_rate": 1.8164214566613093e-06, "loss": 0.82939065, "num_input_tokens_seen": 97493550, "step": 4519, "time_per_iteration": 4.642422676086426 }, { "auxiliary_loss_clip": 0.01177877, "auxiliary_loss_mlp": 0.01025039, "balance_loss_clip": 1.05232561, "balance_loss_mlp": 1.01763606, "epoch": 0.5434978656886912, "flos": 18989311766400.0, "grad_norm": 2.9245494178432505, "language_loss": 0.66042858, "learning_rate": 1.8156457874225547e-06, "loss": 0.68245775, "num_input_tokens_seen": 97512010, "step": 4520, "time_per_iteration": 2.728823184967041 }, { "auxiliary_loss_clip": 0.01165079, "auxiliary_loss_mlp": 0.01028743, "balance_loss_clip": 1.0524323, "balance_loss_mlp": 1.02089357, "epoch": 0.5436181085793302, "flos": 17274936464640.0, "grad_norm": 6.88949360436057, "language_loss": 0.8056426, "learning_rate": 1.814870146150187e-06, "loss": 0.82758081, "num_input_tokens_seen": 97530120, "step": 4521, "time_per_iteration": 2.730217933654785 }, { "auxiliary_loss_clip": 0.01180915, "auxiliary_loss_mlp": 0.01027199, "balance_loss_clip": 1.05401659, "balance_loss_mlp": 1.0189321, "epoch": 0.5437383514699693, "flos": 19098587917440.0, "grad_norm": 2.8411448661226975, "language_loss": 0.78912914, "learning_rate": 1.814094532961871e-06, "loss": 0.81121033, "num_input_tokens_seen": 97548695, "step": 4522, "time_per_iteration": 2.7509727478027344 }, { "auxiliary_loss_clip": 0.01169931, "auxiliary_loss_mlp": 0.01025861, "balance_loss_clip": 1.05194461, "balance_loss_mlp": 1.0178206, "epoch": 0.5438585943606085, "flos": 22602715211520.0, "grad_norm": 2.0600476306456166, "language_loss": 0.8412919, "learning_rate": 1.8133189479752666e-06, "loss": 0.86324984, "num_input_tokens_seen": 97567625, "step": 4523, "time_per_iteration": 2.77384090423584 }, { "auxiliary_loss_clip": 0.01179366, "auxiliary_loss_mlp": 0.01027207, "balance_loss_clip": 1.05423832, "balance_loss_mlp": 1.01914811, "epoch": 0.5439788372512475, "flos": 21798495653760.0, "grad_norm": 1.9911224862236374, "language_loss": 0.8160032, "learning_rate": 1.8125433913080292e-06, "loss": 0.83806896, "num_input_tokens_seen": 97585325, "step": 4524, "time_per_iteration": 2.720964193344116 }, { "auxiliary_loss_clip": 0.01146197, "auxiliary_loss_mlp": 0.01028024, "balance_loss_clip": 1.05139983, "balance_loss_mlp": 1.02001894, "epoch": 0.5440990801418866, "flos": 16399362539520.0, "grad_norm": 2.380132438478261, "language_loss": 0.82684451, "learning_rate": 1.811767863077811e-06, "loss": 0.84858668, "num_input_tokens_seen": 97604275, "step": 4525, "time_per_iteration": 3.85060715675354 }, { "auxiliary_loss_clip": 0.01156008, "auxiliary_loss_mlp": 0.01026696, "balance_loss_clip": 1.05334771, "balance_loss_mlp": 1.01917732, "epoch": 0.5442193230325257, "flos": 21615638492160.0, "grad_norm": 1.778210497219667, "language_loss": 0.78533971, "learning_rate": 1.8109923634022577e-06, "loss": 0.80716676, "num_input_tokens_seen": 97624300, "step": 4526, "time_per_iteration": 3.077106475830078 }, { "auxiliary_loss_clip": 0.01180744, "auxiliary_loss_mlp": 0.01023812, "balance_loss_clip": 1.05281484, "balance_loss_mlp": 1.01600075, "epoch": 0.5443395659231648, "flos": 15481198062720.0, "grad_norm": 10.339294841473121, "language_loss": 0.8656373, "learning_rate": 1.8102168923990128e-06, "loss": 0.88768291, "num_input_tokens_seen": 97637845, "step": 4527, "time_per_iteration": 2.6725809574127197 }, { "auxiliary_loss_clip": 0.01178007, "auxiliary_loss_mlp": 0.01059373, "balance_loss_clip": 1.05350053, "balance_loss_mlp": 1.02145267, "epoch": 0.5444598088138038, "flos": 18770436241920.0, "grad_norm": 1.7895139232725907, "language_loss": 0.80029535, "learning_rate": 1.809441450185714e-06, "loss": 0.82266927, "num_input_tokens_seen": 97656330, "step": 4528, "time_per_iteration": 2.7058591842651367 }, { "auxiliary_loss_clip": 0.01177926, "auxiliary_loss_mlp": 0.01032159, "balance_loss_clip": 1.05154598, "balance_loss_mlp": 1.0240823, "epoch": 0.544580051704443, "flos": 21142335957120.0, "grad_norm": 2.3490019013321217, "language_loss": 0.73409891, "learning_rate": 1.8086660368799958e-06, "loss": 0.75619972, "num_input_tokens_seen": 97674380, "step": 4529, "time_per_iteration": 2.7932255268096924 }, { "auxiliary_loss_clip": 0.01173362, "auxiliary_loss_mlp": 0.01022726, "balance_loss_clip": 1.05247808, "balance_loss_mlp": 1.01487637, "epoch": 0.5447002945950821, "flos": 32491508054400.0, "grad_norm": 2.819294605655455, "language_loss": 0.77457678, "learning_rate": 1.807890652599488e-06, "loss": 0.7965377, "num_input_tokens_seen": 97698765, "step": 4530, "time_per_iteration": 2.8118274211883545 }, { "auxiliary_loss_clip": 0.01177087, "auxiliary_loss_mlp": 0.01030319, "balance_loss_clip": 1.05325246, "balance_loss_mlp": 1.02321458, "epoch": 0.5448205374857211, "flos": 11798307757440.0, "grad_norm": 2.072882447424922, "language_loss": 0.82789445, "learning_rate": 1.8071152974618156e-06, "loss": 0.84996843, "num_input_tokens_seen": 97716565, "step": 4531, "time_per_iteration": 2.7698729038238525 }, { "auxiliary_loss_clip": 0.01172501, "auxiliary_loss_mlp": 0.01051492, "balance_loss_clip": 1.05206239, "balance_loss_mlp": 1.01432621, "epoch": 0.5449407803763603, "flos": 24133766474880.0, "grad_norm": 2.504266346345784, "language_loss": 0.78270024, "learning_rate": 1.806339971584599e-06, "loss": 0.80494022, "num_input_tokens_seen": 97733225, "step": 4532, "time_per_iteration": 2.7414724826812744 }, { "auxiliary_loss_clip": 0.01178667, "auxiliary_loss_mlp": 0.01022207, "balance_loss_clip": 1.05205083, "balance_loss_mlp": 1.01457763, "epoch": 0.5450610232669993, "flos": 23258551685760.0, "grad_norm": 1.7454887012031761, "language_loss": 0.85578054, "learning_rate": 1.8055646750854546e-06, "loss": 0.87778926, "num_input_tokens_seen": 97752735, "step": 4533, "time_per_iteration": 2.704214334487915 }, { "auxiliary_loss_clip": 0.01176025, "auxiliary_loss_mlp": 0.01030575, "balance_loss_clip": 1.05390751, "balance_loss_mlp": 1.02285588, "epoch": 0.5451812661576384, "flos": 17785083375360.0, "grad_norm": 2.249170697410394, "language_loss": 0.81828666, "learning_rate": 1.8047894080819945e-06, "loss": 0.84035265, "num_input_tokens_seen": 97769985, "step": 4534, "time_per_iteration": 2.6928176879882812 }, { "auxiliary_loss_clip": 0.01079023, "auxiliary_loss_mlp": 0.01007829, "balance_loss_clip": 1.02113366, "balance_loss_mlp": 1.00679803, "epoch": 0.5453015090482776, "flos": 71062586513280.0, "grad_norm": 0.7280551640285202, "language_loss": 0.63247657, "learning_rate": 1.8040141706918258e-06, "loss": 0.65334511, "num_input_tokens_seen": 97831225, "step": 4535, "time_per_iteration": 3.3563477993011475 }, { "auxiliary_loss_clip": 0.01174937, "auxiliary_loss_mlp": 0.01026434, "balance_loss_clip": 1.05410659, "balance_loss_mlp": 1.01878631, "epoch": 0.5454217519389166, "flos": 25552201622400.0, "grad_norm": 1.7524812147165933, "language_loss": 0.76911277, "learning_rate": 1.8032389630325525e-06, "loss": 0.79112649, "num_input_tokens_seen": 97849975, "step": 4536, "time_per_iteration": 2.731527090072632 }, { "auxiliary_loss_clip": 0.01173328, "auxiliary_loss_mlp": 0.0102834, "balance_loss_clip": 1.05212569, "balance_loss_mlp": 1.0206337, "epoch": 0.5455419948295557, "flos": 23658345037440.0, "grad_norm": 1.7020726963717445, "language_loss": 0.75636363, "learning_rate": 1.8024637852217707e-06, "loss": 0.77838027, "num_input_tokens_seen": 97869700, "step": 4537, "time_per_iteration": 2.694622039794922 }, { "auxiliary_loss_clip": 0.01171701, "auxiliary_loss_mlp": 0.01028507, "balance_loss_clip": 1.05235243, "balance_loss_mlp": 1.0208236, "epoch": 0.5456622377201948, "flos": 23403989854080.0, "grad_norm": 1.8137298506525157, "language_loss": 0.84926772, "learning_rate": 1.8016886373770766e-06, "loss": 0.87126982, "num_input_tokens_seen": 97888215, "step": 4538, "time_per_iteration": 2.726024627685547 }, { "auxiliary_loss_clip": 0.01174193, "auxiliary_loss_mlp": 0.01030173, "balance_loss_clip": 1.05507898, "balance_loss_mlp": 1.02256155, "epoch": 0.5457824806108339, "flos": 23988040997760.0, "grad_norm": 3.1080909418923186, "language_loss": 0.78871036, "learning_rate": 1.8009135196160579e-06, "loss": 0.810754, "num_input_tokens_seen": 97907090, "step": 4539, "time_per_iteration": 2.6960604190826416 }, { "auxiliary_loss_clip": 0.01167108, "auxiliary_loss_mlp": 0.01029581, "balance_loss_clip": 1.05352783, "balance_loss_mlp": 1.02193379, "epoch": 0.545902723501473, "flos": 22565870835840.0, "grad_norm": 1.7999224927479094, "language_loss": 0.84211302, "learning_rate": 1.8001384320563e-06, "loss": 0.86407995, "num_input_tokens_seen": 97927345, "step": 4540, "time_per_iteration": 2.8412532806396484 }, { "auxiliary_loss_clip": 0.0107938, "auxiliary_loss_mlp": 0.01003877, "balance_loss_clip": 1.02110672, "balance_loss_mlp": 1.00272691, "epoch": 0.5460229663921121, "flos": 55198399685760.0, "grad_norm": 0.7856974155085957, "language_loss": 0.57793784, "learning_rate": 1.7993633748153833e-06, "loss": 0.59877038, "num_input_tokens_seen": 97981950, "step": 4541, "time_per_iteration": 3.1562423706054688 }, { "auxiliary_loss_clip": 0.01177888, "auxiliary_loss_mlp": 0.01030066, "balance_loss_clip": 1.05194581, "balance_loss_mlp": 1.02175725, "epoch": 0.5461432092827512, "flos": 15413866018560.0, "grad_norm": 2.08398210565816, "language_loss": 0.72987938, "learning_rate": 1.7985883480108834e-06, "loss": 0.75195891, "num_input_tokens_seen": 97999585, "step": 4542, "time_per_iteration": 2.678144931793213 }, { "auxiliary_loss_clip": 0.01173577, "auxiliary_loss_mlp": 0.01030202, "balance_loss_clip": 1.05411172, "balance_loss_mlp": 1.02222717, "epoch": 0.5462634521733902, "flos": 24024921287040.0, "grad_norm": 1.6740874571117264, "language_loss": 0.72433496, "learning_rate": 1.797813351760371e-06, "loss": 0.74637276, "num_input_tokens_seen": 98021290, "step": 4543, "time_per_iteration": 3.740175724029541 }, { "auxiliary_loss_clip": 0.01178486, "auxiliary_loss_mlp": 0.01024141, "balance_loss_clip": 1.05194938, "balance_loss_mlp": 1.01595783, "epoch": 0.5463836950640293, "flos": 22820944291200.0, "grad_norm": 2.034004908563902, "language_loss": 0.77728337, "learning_rate": 1.7970383861814116e-06, "loss": 0.79930961, "num_input_tokens_seen": 98041060, "step": 4544, "time_per_iteration": 3.7544174194335938 }, { "auxiliary_loss_clip": 0.01179344, "auxiliary_loss_mlp": 0.01031797, "balance_loss_clip": 1.0572536, "balance_loss_mlp": 1.02375078, "epoch": 0.5465039379546685, "flos": 20448290390400.0, "grad_norm": 2.1960388104036874, "language_loss": 0.74081892, "learning_rate": 1.7962634513915684e-06, "loss": 0.76293027, "num_input_tokens_seen": 98058410, "step": 4545, "time_per_iteration": 3.6407337188720703 }, { "auxiliary_loss_clip": 0.01177181, "auxiliary_loss_mlp": 0.01027036, "balance_loss_clip": 1.05273187, "balance_loss_mlp": 1.01898324, "epoch": 0.5466241808453075, "flos": 17343310003200.0, "grad_norm": 1.5631384818446856, "language_loss": 0.79343897, "learning_rate": 1.7954885475083969e-06, "loss": 0.81548113, "num_input_tokens_seen": 98076080, "step": 4546, "time_per_iteration": 2.76088809967041 }, { "auxiliary_loss_clip": 0.0118564, "auxiliary_loss_mlp": 0.01029725, "balance_loss_clip": 1.0567708, "balance_loss_mlp": 1.02190483, "epoch": 0.5467444237359466, "flos": 21617039122560.0, "grad_norm": 2.3310007605081013, "language_loss": 0.72985983, "learning_rate": 1.7947136746494513e-06, "loss": 0.75201344, "num_input_tokens_seen": 98096995, "step": 4547, "time_per_iteration": 2.8481333255767822 }, { "auxiliary_loss_clip": 0.01175273, "auxiliary_loss_mlp": 0.01025558, "balance_loss_clip": 1.05416882, "balance_loss_mlp": 1.01739836, "epoch": 0.5468646666265857, "flos": 24170467196160.0, "grad_norm": 2.622836796034203, "language_loss": 0.88162756, "learning_rate": 1.793938832932277e-06, "loss": 0.90363586, "num_input_tokens_seen": 98115105, "step": 4548, "time_per_iteration": 2.775607109069824 }, { "auxiliary_loss_clip": 0.01177304, "auxiliary_loss_mlp": 0.01031787, "balance_loss_clip": 1.05192637, "balance_loss_mlp": 1.02367485, "epoch": 0.5469849095172248, "flos": 27527001505920.0, "grad_norm": 1.9519995088593922, "language_loss": 0.70345569, "learning_rate": 1.7931640224744185e-06, "loss": 0.7255466, "num_input_tokens_seen": 98135655, "step": 4549, "time_per_iteration": 2.908489227294922 }, { "auxiliary_loss_clip": 0.01161494, "auxiliary_loss_mlp": 0.01023648, "balance_loss_clip": 1.05123305, "balance_loss_mlp": 1.01622152, "epoch": 0.5471051524078638, "flos": 27964680727680.0, "grad_norm": 1.812229689270211, "language_loss": 0.73728073, "learning_rate": 1.7923892433934127e-06, "loss": 0.75913215, "num_input_tokens_seen": 98156730, "step": 4550, "time_per_iteration": 2.894470453262329 }, { "auxiliary_loss_clip": 0.01175751, "auxiliary_loss_mlp": 0.01058592, "balance_loss_clip": 1.05331945, "balance_loss_mlp": 1.02202892, "epoch": 0.547225395298503, "flos": 18150510389760.0, "grad_norm": 1.63549600689501, "language_loss": 0.78715074, "learning_rate": 1.7916144958067939e-06, "loss": 0.8094942, "num_input_tokens_seen": 98174590, "step": 4551, "time_per_iteration": 3.785620927810669 }, { "auxiliary_loss_clip": 0.01175402, "auxiliary_loss_mlp": 0.01029726, "balance_loss_clip": 1.05219889, "balance_loss_mlp": 1.02232933, "epoch": 0.5473456381891421, "flos": 21361498790400.0, "grad_norm": 1.8085912992928819, "language_loss": 0.79023063, "learning_rate": 1.7908397798320905e-06, "loss": 0.81228197, "num_input_tokens_seen": 98194325, "step": 4552, "time_per_iteration": 2.8740038871765137 }, { "auxiliary_loss_clip": 0.01182191, "auxiliary_loss_mlp": 0.01054081, "balance_loss_clip": 1.05809689, "balance_loss_mlp": 1.01866424, "epoch": 0.5474658810797811, "flos": 19932145908480.0, "grad_norm": 5.789425277387135, "language_loss": 0.74882519, "learning_rate": 1.7900650955868265e-06, "loss": 0.77118796, "num_input_tokens_seen": 98213970, "step": 4553, "time_per_iteration": 2.7544050216674805 }, { "auxiliary_loss_clip": 0.01177891, "auxiliary_loss_mlp": 0.01058197, "balance_loss_clip": 1.0562793, "balance_loss_mlp": 1.0224067, "epoch": 0.5475861239704203, "flos": 50476217264640.0, "grad_norm": 1.3074254498976112, "language_loss": 0.76636744, "learning_rate": 1.7892904431885202e-06, "loss": 0.7887283, "num_input_tokens_seen": 98241145, "step": 4554, "time_per_iteration": 2.994725227355957 }, { "auxiliary_loss_clip": 0.01168415, "auxiliary_loss_mlp": 0.01021448, "balance_loss_clip": 1.05148315, "balance_loss_mlp": 1.01405704, "epoch": 0.5477063668610593, "flos": 20705123612160.0, "grad_norm": 11.843390082765797, "language_loss": 0.75736499, "learning_rate": 1.788515822754686e-06, "loss": 0.77926362, "num_input_tokens_seen": 98261565, "step": 4555, "time_per_iteration": 2.7183151245117188 }, { "auxiliary_loss_clip": 0.01174023, "auxiliary_loss_mlp": 0.01029344, "balance_loss_clip": 1.0537467, "balance_loss_mlp": 1.02106512, "epoch": 0.5478266097516984, "flos": 19609740408960.0, "grad_norm": 2.152382594975041, "language_loss": 0.78637421, "learning_rate": 1.7877412344028335e-06, "loss": 0.8084079, "num_input_tokens_seen": 98281370, "step": 4556, "time_per_iteration": 2.730970621109009 }, { "auxiliary_loss_clip": 0.01179517, "auxiliary_loss_mlp": 0.01028381, "balance_loss_clip": 1.05315149, "balance_loss_mlp": 1.02070713, "epoch": 0.5479468526423376, "flos": 12896599962240.0, "grad_norm": 32.93440053687744, "language_loss": 0.77096051, "learning_rate": 1.7869666782504668e-06, "loss": 0.7930395, "num_input_tokens_seen": 98297950, "step": 4557, "time_per_iteration": 2.6016385555267334 }, { "auxiliary_loss_clip": 0.0116145, "auxiliary_loss_mlp": 0.01026629, "balance_loss_clip": 1.04984379, "balance_loss_mlp": 1.01874924, "epoch": 0.5480670955329766, "flos": 18588800142720.0, "grad_norm": 3.2762524965873374, "language_loss": 0.68965137, "learning_rate": 1.7861921544150867e-06, "loss": 0.71153212, "num_input_tokens_seen": 98316800, "step": 4558, "time_per_iteration": 2.753471851348877 }, { "auxiliary_loss_clip": 0.01152117, "auxiliary_loss_mlp": 0.01054947, "balance_loss_clip": 1.0532887, "balance_loss_mlp": 1.01871681, "epoch": 0.5481873384236157, "flos": 15954608338560.0, "grad_norm": 1.7972171177810428, "language_loss": 0.76112449, "learning_rate": 1.7854176630141856e-06, "loss": 0.78319514, "num_input_tokens_seen": 98333935, "step": 4559, "time_per_iteration": 2.8104019165039062 }, { "auxiliary_loss_clip": 0.01182914, "auxiliary_loss_mlp": 0.01031824, "balance_loss_clip": 1.05475092, "balance_loss_mlp": 1.02349091, "epoch": 0.5483075813142548, "flos": 22783812606720.0, "grad_norm": 2.4863364379711563, "language_loss": 0.84441733, "learning_rate": 1.784643204165255e-06, "loss": 0.86656475, "num_input_tokens_seen": 98353255, "step": 4560, "time_per_iteration": 2.718810558319092 }, { "auxiliary_loss_clip": 0.01172891, "auxiliary_loss_mlp": 0.0102616, "balance_loss_clip": 1.05423927, "balance_loss_mlp": 1.018435, "epoch": 0.5484278242048939, "flos": 19317212046720.0, "grad_norm": 1.882262076812531, "language_loss": 0.77667379, "learning_rate": 1.7838687779857783e-06, "loss": 0.79866433, "num_input_tokens_seen": 98371130, "step": 4561, "time_per_iteration": 2.6942646503448486 }, { "auxiliary_loss_clip": 0.01163858, "auxiliary_loss_mlp": 0.01029249, "balance_loss_clip": 1.05080223, "balance_loss_mlp": 1.02163792, "epoch": 0.5485480670955329, "flos": 22816024128000.0, "grad_norm": 1.765845414364701, "language_loss": 0.64203966, "learning_rate": 1.7830943845932366e-06, "loss": 0.66397077, "num_input_tokens_seen": 98390455, "step": 4562, "time_per_iteration": 2.7662010192871094 }, { "auxiliary_loss_clip": 0.01176263, "auxiliary_loss_mlp": 0.01027358, "balance_loss_clip": 1.05332994, "balance_loss_mlp": 1.01956761, "epoch": 0.5486683099861721, "flos": 22671304231680.0, "grad_norm": 1.9767576148042334, "language_loss": 0.75011897, "learning_rate": 1.7823200241051044e-06, "loss": 0.77215517, "num_input_tokens_seen": 98409370, "step": 4563, "time_per_iteration": 2.8204362392425537 }, { "auxiliary_loss_clip": 0.01179135, "auxiliary_loss_mlp": 0.01028342, "balance_loss_clip": 1.05255139, "balance_loss_mlp": 1.0207665, "epoch": 0.5487885528768112, "flos": 23149383275520.0, "grad_norm": 1.893746015941662, "language_loss": 0.80412209, "learning_rate": 1.7815456966388513e-06, "loss": 0.82619685, "num_input_tokens_seen": 98428465, "step": 4564, "time_per_iteration": 2.7496798038482666 }, { "auxiliary_loss_clip": 0.01170868, "auxiliary_loss_mlp": 0.01025316, "balance_loss_clip": 1.05248296, "balance_loss_mlp": 1.01675701, "epoch": 0.5489087957674502, "flos": 22053928245120.0, "grad_norm": 2.171335139521908, "language_loss": 0.8129251, "learning_rate": 1.780771402311943e-06, "loss": 0.83488691, "num_input_tokens_seen": 98447300, "step": 4565, "time_per_iteration": 2.8494532108306885 }, { "auxiliary_loss_clip": 0.01174831, "auxiliary_loss_mlp": 0.01028827, "balance_loss_clip": 1.05478978, "balance_loss_mlp": 1.0205543, "epoch": 0.5490290386580894, "flos": 24315977191680.0, "grad_norm": 2.4955630462600005, "language_loss": 0.78560936, "learning_rate": 1.7799971412418374e-06, "loss": 0.80764598, "num_input_tokens_seen": 98468695, "step": 4566, "time_per_iteration": 2.769176483154297 }, { "auxiliary_loss_clip": 0.01172058, "auxiliary_loss_mlp": 0.01032397, "balance_loss_clip": 1.05500722, "balance_loss_mlp": 1.02365327, "epoch": 0.5491492815487284, "flos": 18294942977280.0, "grad_norm": 2.9103132779008685, "language_loss": 0.74122149, "learning_rate": 1.7792229135459918e-06, "loss": 0.76326609, "num_input_tokens_seen": 98485345, "step": 4567, "time_per_iteration": 2.7302002906799316 }, { "auxiliary_loss_clip": 0.01071659, "auxiliary_loss_mlp": 0.01009541, "balance_loss_clip": 1.02519917, "balance_loss_mlp": 1.00837922, "epoch": 0.5492695244393675, "flos": 64550257050240.0, "grad_norm": 0.7444737509862557, "language_loss": 0.61608624, "learning_rate": 1.7784487193418538e-06, "loss": 0.63689828, "num_input_tokens_seen": 98543195, "step": 4568, "time_per_iteration": 3.218092679977417 }, { "auxiliary_loss_clip": 0.01159664, "auxiliary_loss_mlp": 0.01023092, "balance_loss_clip": 1.05144227, "balance_loss_mlp": 1.01462865, "epoch": 0.5493897673300067, "flos": 17379579761280.0, "grad_norm": 1.832959664293024, "language_loss": 0.60770673, "learning_rate": 1.7776745587468698e-06, "loss": 0.6295343, "num_input_tokens_seen": 98560620, "step": 4569, "time_per_iteration": 3.712754964828491 }, { "auxiliary_loss_clip": 0.01177263, "auxiliary_loss_mlp": 0.01028373, "balance_loss_clip": 1.05009675, "balance_loss_mlp": 1.02066362, "epoch": 0.5495100102206457, "flos": 19901765980800.0, "grad_norm": 3.0615177141174197, "language_loss": 0.81782991, "learning_rate": 1.7769004318784776e-06, "loss": 0.83988631, "num_input_tokens_seen": 98578265, "step": 4570, "time_per_iteration": 3.718766450881958 }, { "auxiliary_loss_clip": 0.01173318, "auxiliary_loss_mlp": 0.01028398, "balance_loss_clip": 1.05050159, "balance_loss_mlp": 1.02065825, "epoch": 0.5496302531112848, "flos": 16727190992640.0, "grad_norm": 1.9077238329260062, "language_loss": 0.80486107, "learning_rate": 1.776126338854113e-06, "loss": 0.82687825, "num_input_tokens_seen": 98596055, "step": 4571, "time_per_iteration": 3.690538167953491 }, { "auxiliary_loss_clip": 0.01168827, "auxiliary_loss_mlp": 0.01023307, "balance_loss_clip": 1.05154133, "balance_loss_mlp": 1.0153383, "epoch": 0.5497504960019239, "flos": 24572343536640.0, "grad_norm": 1.6213569198592237, "language_loss": 0.84486693, "learning_rate": 1.7753522797912044e-06, "loss": 0.86678827, "num_input_tokens_seen": 98616140, "step": 4572, "time_per_iteration": 2.723973035812378 }, { "auxiliary_loss_clip": 0.01179877, "auxiliary_loss_mlp": 0.01030404, "balance_loss_clip": 1.05313778, "balance_loss_mlp": 1.0223341, "epoch": 0.549870738892563, "flos": 15450494912640.0, "grad_norm": 2.448358138315349, "language_loss": 0.69756329, "learning_rate": 1.7745782548071765e-06, "loss": 0.71966612, "num_input_tokens_seen": 98633035, "step": 4573, "time_per_iteration": 2.7885172367095947 }, { "auxiliary_loss_clip": 0.01164007, "auxiliary_loss_mlp": 0.0102619, "balance_loss_clip": 1.05309296, "balance_loss_mlp": 1.01863801, "epoch": 0.549990981783202, "flos": 21069114082560.0, "grad_norm": 1.6353689945436596, "language_loss": 0.74324137, "learning_rate": 1.7738042640194482e-06, "loss": 0.76514339, "num_input_tokens_seen": 98652700, "step": 4574, "time_per_iteration": 2.7183351516723633 }, { "auxiliary_loss_clip": 0.0117789, "auxiliary_loss_mlp": 0.01026455, "balance_loss_clip": 1.05177295, "balance_loss_mlp": 1.01809835, "epoch": 0.5501112246738411, "flos": 21395901041280.0, "grad_norm": 4.136370875512619, "language_loss": 0.70467854, "learning_rate": 1.7730303075454335e-06, "loss": 0.726722, "num_input_tokens_seen": 98671590, "step": 4575, "time_per_iteration": 2.7092902660369873 }, { "auxiliary_loss_clip": 0.01175728, "auxiliary_loss_mlp": 0.01030818, "balance_loss_clip": 1.05457735, "balance_loss_mlp": 1.02221727, "epoch": 0.5502314675644803, "flos": 17456931699840.0, "grad_norm": 2.380350092532602, "language_loss": 0.84844923, "learning_rate": 1.7722563855025402e-06, "loss": 0.87051475, "num_input_tokens_seen": 98689620, "step": 4576, "time_per_iteration": 2.8005480766296387 }, { "auxiliary_loss_clip": 0.01176919, "auxiliary_loss_mlp": 0.01027248, "balance_loss_clip": 1.05219388, "balance_loss_mlp": 1.01920688, "epoch": 0.5503517104551193, "flos": 24310410583680.0, "grad_norm": 2.367762428708443, "language_loss": 0.71087217, "learning_rate": 1.7714824980081721e-06, "loss": 0.73291385, "num_input_tokens_seen": 98708915, "step": 4577, "time_per_iteration": 3.7930009365081787 }, { "auxiliary_loss_clip": 0.01173536, "auxiliary_loss_mlp": 0.01027402, "balance_loss_clip": 1.05442739, "balance_loss_mlp": 1.01970124, "epoch": 0.5504719533457584, "flos": 22419427086720.0, "grad_norm": 10.20601943066966, "language_loss": 0.73925221, "learning_rate": 1.7707086451797276e-06, "loss": 0.76126158, "num_input_tokens_seen": 98729790, "step": 4578, "time_per_iteration": 2.8175339698791504 }, { "auxiliary_loss_clip": 0.01079535, "auxiliary_loss_mlp": 0.01002529, "balance_loss_clip": 1.02470303, "balance_loss_mlp": 1.0013485, "epoch": 0.5505921962363975, "flos": 67294155968640.0, "grad_norm": 0.7002352060839455, "language_loss": 0.5232234, "learning_rate": 1.7699348271345993e-06, "loss": 0.54404402, "num_input_tokens_seen": 98792415, "step": 4579, "time_per_iteration": 3.2843761444091797 }, { "auxiliary_loss_clip": 0.0108235, "auxiliary_loss_mlp": 0.01000684, "balance_loss_clip": 1.02285922, "balance_loss_mlp": 0.99947435, "epoch": 0.5507124391270366, "flos": 45685125578880.0, "grad_norm": 0.7127214415797251, "language_loss": 0.54420948, "learning_rate": 1.7691610439901753e-06, "loss": 0.56503975, "num_input_tokens_seen": 98855350, "step": 4580, "time_per_iteration": 3.4433653354644775 }, { "auxiliary_loss_clip": 0.01175618, "auxiliary_loss_mlp": 0.01025372, "balance_loss_clip": 1.05155981, "balance_loss_mlp": 1.01759994, "epoch": 0.5508326820176757, "flos": 22273845264000.0, "grad_norm": 2.013044644331211, "language_loss": 0.75482702, "learning_rate": 1.7683872958638367e-06, "loss": 0.77683699, "num_input_tokens_seen": 98874230, "step": 4581, "time_per_iteration": 2.7144179344177246 }, { "auxiliary_loss_clip": 0.01171423, "auxiliary_loss_mlp": 0.01026989, "balance_loss_clip": 1.05148482, "balance_loss_mlp": 1.01940751, "epoch": 0.5509529249083148, "flos": 20012442762240.0, "grad_norm": 1.8607194557509874, "language_loss": 0.84614241, "learning_rate": 1.7676135828729614e-06, "loss": 0.86812651, "num_input_tokens_seen": 98893940, "step": 4582, "time_per_iteration": 2.851482629776001 }, { "auxiliary_loss_clip": 0.01173449, "auxiliary_loss_mlp": 0.01025683, "balance_loss_clip": 1.05122924, "balance_loss_mlp": 1.01748455, "epoch": 0.5510731677989539, "flos": 21834801325440.0, "grad_norm": 6.865129695084954, "language_loss": 0.82747221, "learning_rate": 1.7668399051349205e-06, "loss": 0.84946358, "num_input_tokens_seen": 98913620, "step": 4583, "time_per_iteration": 2.804680347442627 }, { "auxiliary_loss_clip": 0.01170447, "auxiliary_loss_mlp": 0.01032892, "balance_loss_clip": 1.05367398, "balance_loss_mlp": 1.02423716, "epoch": 0.5511934106895929, "flos": 21467901853440.0, "grad_norm": 1.9605457379099545, "language_loss": 0.83546305, "learning_rate": 1.766066262767081e-06, "loss": 0.8574965, "num_input_tokens_seen": 98931460, "step": 4584, "time_per_iteration": 2.833406448364258 }, { "auxiliary_loss_clip": 0.0116751, "auxiliary_loss_mlp": 0.0103028, "balance_loss_clip": 1.05258846, "balance_loss_mlp": 1.02290726, "epoch": 0.5513136535802321, "flos": 21068934514560.0, "grad_norm": 1.9965273656647402, "language_loss": 0.77222896, "learning_rate": 1.765292655886803e-06, "loss": 0.79420686, "num_input_tokens_seen": 98950105, "step": 4585, "time_per_iteration": 2.797027349472046 }, { "auxiliary_loss_clip": 0.01178106, "auxiliary_loss_mlp": 0.01026176, "balance_loss_clip": 1.05332506, "balance_loss_mlp": 1.01857007, "epoch": 0.5514338964708712, "flos": 27815004754560.0, "grad_norm": 1.8857249890987968, "language_loss": 0.70883912, "learning_rate": 1.764519084611443e-06, "loss": 0.73088193, "num_input_tokens_seen": 98970560, "step": 4586, "time_per_iteration": 2.7914202213287354 }, { "auxiliary_loss_clip": 0.01174598, "auxiliary_loss_mlp": 0.01021837, "balance_loss_clip": 1.05422187, "balance_loss_mlp": 1.01336122, "epoch": 0.5515541393615102, "flos": 21908525990400.0, "grad_norm": 1.7916443817562748, "language_loss": 0.77799559, "learning_rate": 1.7637455490583505e-06, "loss": 0.79995996, "num_input_tokens_seen": 98989885, "step": 4587, "time_per_iteration": 2.7327308654785156 }, { "auxiliary_loss_clip": 0.01176116, "auxiliary_loss_mlp": 0.01026722, "balance_loss_clip": 1.05315971, "balance_loss_mlp": 1.01916742, "epoch": 0.5516743822521494, "flos": 20485422074880.0, "grad_norm": 2.036309930271722, "language_loss": 0.77866083, "learning_rate": 1.7629720493448701e-06, "loss": 0.80068922, "num_input_tokens_seen": 99007180, "step": 4588, "time_per_iteration": 2.7696096897125244 }, { "auxiliary_loss_clip": 0.01179086, "auxiliary_loss_mlp": 0.0102703, "balance_loss_clip": 1.05305505, "balance_loss_mlp": 1.01861942, "epoch": 0.5517946251427884, "flos": 14940383915520.0, "grad_norm": 1.80300419701195, "language_loss": 0.85465515, "learning_rate": 1.7621985855883418e-06, "loss": 0.87671626, "num_input_tokens_seen": 99023880, "step": 4589, "time_per_iteration": 2.710350751876831 }, { "auxiliary_loss_clip": 0.01167297, "auxiliary_loss_mlp": 0.01024411, "balance_loss_clip": 1.0505904, "balance_loss_mlp": 1.01657879, "epoch": 0.5519148680334275, "flos": 18404865573120.0, "grad_norm": 1.9815954601263506, "language_loss": 0.72441995, "learning_rate": 1.7614251579060983e-06, "loss": 0.746337, "num_input_tokens_seen": 99042475, "step": 4590, "time_per_iteration": 2.844615936279297 }, { "auxiliary_loss_clip": 0.01170742, "auxiliary_loss_mlp": 0.01026711, "balance_loss_clip": 1.05117035, "balance_loss_mlp": 1.01877773, "epoch": 0.5520351109240667, "flos": 25113337251840.0, "grad_norm": 1.6259789337129866, "language_loss": 0.84342897, "learning_rate": 1.76065176641547e-06, "loss": 0.86540353, "num_input_tokens_seen": 99065185, "step": 4591, "time_per_iteration": 2.832695484161377 }, { "auxiliary_loss_clip": 0.01173875, "auxiliary_loss_mlp": 0.01033456, "balance_loss_clip": 1.04841757, "balance_loss_mlp": 1.0257194, "epoch": 0.5521553538147057, "flos": 21069545045760.0, "grad_norm": 1.7351500032611682, "language_loss": 0.77953863, "learning_rate": 1.759878411233777e-06, "loss": 0.8016119, "num_input_tokens_seen": 99083645, "step": 4592, "time_per_iteration": 2.7988948822021484 }, { "auxiliary_loss_clip": 0.01174027, "auxiliary_loss_mlp": 0.01032392, "balance_loss_clip": 1.05316293, "balance_loss_mlp": 1.02413058, "epoch": 0.5522755967053448, "flos": 18879999701760.0, "grad_norm": 2.0550795760103275, "language_loss": 0.7574479, "learning_rate": 1.7591050924783388e-06, "loss": 0.77951205, "num_input_tokens_seen": 99100835, "step": 4593, "time_per_iteration": 2.658045530319214 }, { "auxiliary_loss_clip": 0.01074413, "auxiliary_loss_mlp": 0.01006838, "balance_loss_clip": 1.02098107, "balance_loss_mlp": 1.00580645, "epoch": 0.5523958395959839, "flos": 64675622494080.0, "grad_norm": 0.8444397182904537, "language_loss": 0.57936943, "learning_rate": 1.7583318102664661e-06, "loss": 0.60018194, "num_input_tokens_seen": 99168400, "step": 4594, "time_per_iteration": 3.4203884601593018 }, { "auxiliary_loss_clip": 0.01179575, "auxiliary_loss_mlp": 0.01024788, "balance_loss_clip": 1.05089068, "balance_loss_mlp": 1.01643741, "epoch": 0.552516082486623, "flos": 10889732211840.0, "grad_norm": 1.7945124785456428, "language_loss": 0.78876305, "learning_rate": 1.757558564715466e-06, "loss": 0.81080675, "num_input_tokens_seen": 99186475, "step": 4595, "time_per_iteration": 3.7142467498779297 }, { "auxiliary_loss_clip": 0.01175837, "auxiliary_loss_mlp": 0.01028327, "balance_loss_clip": 1.05168128, "balance_loss_mlp": 1.0203346, "epoch": 0.552636325377262, "flos": 22199797376640.0, "grad_norm": 3.4948465132413946, "language_loss": 0.73813152, "learning_rate": 1.7567853559426386e-06, "loss": 0.76017314, "num_input_tokens_seen": 99203525, "step": 4596, "time_per_iteration": 3.616952419281006 }, { "auxiliary_loss_clip": 0.01176087, "auxiliary_loss_mlp": 0.01023761, "balance_loss_clip": 1.051893, "balance_loss_mlp": 1.01579523, "epoch": 0.5527565682679012, "flos": 23988184652160.0, "grad_norm": 2.1637828566518853, "language_loss": 0.74894404, "learning_rate": 1.7560121840652797e-06, "loss": 0.77094245, "num_input_tokens_seen": 99222910, "step": 4597, "time_per_iteration": 3.689600944519043 }, { "auxiliary_loss_clip": 0.0115812, "auxiliary_loss_mlp": 0.01028208, "balance_loss_clip": 1.05055714, "balance_loss_mlp": 1.02067959, "epoch": 0.5528768111585403, "flos": 19719267955200.0, "grad_norm": 1.8995825100339407, "language_loss": 0.69262224, "learning_rate": 1.7552390492006782e-06, "loss": 0.71448559, "num_input_tokens_seen": 99241230, "step": 4598, "time_per_iteration": 2.7972145080566406 }, { "auxiliary_loss_clip": 0.0117561, "auxiliary_loss_mlp": 0.01058507, "balance_loss_clip": 1.05206275, "balance_loss_mlp": 1.02156138, "epoch": 0.5529970540491793, "flos": 26215975002240.0, "grad_norm": 3.443322016580294, "language_loss": 0.65052849, "learning_rate": 1.7544659514661184e-06, "loss": 0.67286968, "num_input_tokens_seen": 99264320, "step": 4599, "time_per_iteration": 2.8041720390319824 }, { "auxiliary_loss_clip": 0.0117235, "auxiliary_loss_mlp": 0.01025988, "balance_loss_clip": 1.05242646, "balance_loss_mlp": 1.01829898, "epoch": 0.5531172969398185, "flos": 24425971614720.0, "grad_norm": 2.259796360225873, "language_loss": 0.79731506, "learning_rate": 1.7536928909788786e-06, "loss": 0.81929845, "num_input_tokens_seen": 99283625, "step": 4600, "time_per_iteration": 2.819079637527466 }, { "auxiliary_loss_clip": 0.01076885, "auxiliary_loss_mlp": 0.01007857, "balance_loss_clip": 1.01930714, "balance_loss_mlp": 1.00662923, "epoch": 0.5532375398304575, "flos": 64907316195840.0, "grad_norm": 0.8803900024613035, "language_loss": 0.61935711, "learning_rate": 1.752919867856231e-06, "loss": 0.64020455, "num_input_tokens_seen": 99335270, "step": 4601, "time_per_iteration": 3.15179705619812 }, { "auxiliary_loss_clip": 0.01170586, "auxiliary_loss_mlp": 0.01024592, "balance_loss_clip": 1.05444956, "balance_loss_mlp": 1.01721895, "epoch": 0.5533577827210966, "flos": 19683105937920.0, "grad_norm": 1.6392096999599042, "language_loss": 0.7881543, "learning_rate": 1.7521468822154436e-06, "loss": 0.81010616, "num_input_tokens_seen": 99354185, "step": 4602, "time_per_iteration": 2.8447391986846924 }, { "auxiliary_loss_clip": 0.01167441, "auxiliary_loss_mlp": 0.01026481, "balance_loss_clip": 1.05218434, "balance_loss_mlp": 1.01870847, "epoch": 0.5534780256117358, "flos": 32306496076800.0, "grad_norm": 1.730486902832306, "language_loss": 0.75159812, "learning_rate": 1.751373934173777e-06, "loss": 0.77353734, "num_input_tokens_seen": 99376930, "step": 4603, "time_per_iteration": 3.728694200515747 }, { "auxiliary_loss_clip": 0.01179748, "auxiliary_loss_mlp": 0.01028591, "balance_loss_clip": 1.05183172, "balance_loss_mlp": 1.02047849, "epoch": 0.5535982685023748, "flos": 23222425582080.0, "grad_norm": 3.7222877255999407, "language_loss": 0.72927582, "learning_rate": 1.750601023848487e-06, "loss": 0.75135922, "num_input_tokens_seen": 99397655, "step": 4604, "time_per_iteration": 2.697103977203369 }, { "auxiliary_loss_clip": 0.01177029, "auxiliary_loss_mlp": 0.01053475, "balance_loss_clip": 1.0525142, "balance_loss_mlp": 1.01809645, "epoch": 0.5537185113930139, "flos": 24352534258560.0, "grad_norm": 1.7606515726848742, "language_loss": 0.73820072, "learning_rate": 1.749828151356823e-06, "loss": 0.76050574, "num_input_tokens_seen": 99417850, "step": 4605, "time_per_iteration": 2.7290103435516357 }, { "auxiliary_loss_clip": 0.01170179, "auxiliary_loss_mlp": 0.010267, "balance_loss_clip": 1.05101776, "balance_loss_mlp": 1.0191958, "epoch": 0.553838754283653, "flos": 23549068886400.0, "grad_norm": 1.5858904339373534, "language_loss": 0.75948882, "learning_rate": 1.7490553168160297e-06, "loss": 0.78145754, "num_input_tokens_seen": 99438920, "step": 4606, "time_per_iteration": 2.6686761379241943 }, { "auxiliary_loss_clip": 0.01170644, "auxiliary_loss_mlp": 0.01023949, "balance_loss_clip": 1.0502857, "balance_loss_mlp": 1.01676357, "epoch": 0.5539589971742921, "flos": 17275044205440.0, "grad_norm": 1.9024716034966458, "language_loss": 0.76161516, "learning_rate": 1.748282520343345e-06, "loss": 0.78356105, "num_input_tokens_seen": 99457950, "step": 4607, "time_per_iteration": 2.757884979248047 }, { "auxiliary_loss_clip": 0.0118376, "auxiliary_loss_mlp": 0.01026082, "balance_loss_clip": 1.05480611, "balance_loss_mlp": 1.01795769, "epoch": 0.5540792400649311, "flos": 27564169104000.0, "grad_norm": 1.8394572148858064, "language_loss": 0.78428411, "learning_rate": 1.7475097620560023e-06, "loss": 0.80638254, "num_input_tokens_seen": 99478015, "step": 4608, "time_per_iteration": 2.7912535667419434 }, { "auxiliary_loss_clip": 0.01177195, "auxiliary_loss_mlp": 0.01024842, "balance_loss_clip": 1.05246723, "balance_loss_mlp": 1.01748109, "epoch": 0.5541994829555702, "flos": 23878657105920.0, "grad_norm": 1.7361960878623166, "language_loss": 0.70632923, "learning_rate": 1.746737042071228e-06, "loss": 0.72834963, "num_input_tokens_seen": 99496520, "step": 4609, "time_per_iteration": 2.719843864440918 }, { "auxiliary_loss_clip": 0.01164834, "auxiliary_loss_mlp": 0.0102328, "balance_loss_clip": 1.04998827, "balance_loss_mlp": 1.01541245, "epoch": 0.5543197258462094, "flos": 20115721342080.0, "grad_norm": 2.0087656548712816, "language_loss": 0.79099935, "learning_rate": 1.7459643605062424e-06, "loss": 0.81288052, "num_input_tokens_seen": 99513780, "step": 4610, "time_per_iteration": 2.788362741470337 }, { "auxiliary_loss_clip": 0.01161478, "auxiliary_loss_mlp": 0.01025247, "balance_loss_clip": 1.05111575, "balance_loss_mlp": 1.01751614, "epoch": 0.5544399687368484, "flos": 20916565021440.0, "grad_norm": 1.6182532286378966, "language_loss": 0.80569494, "learning_rate": 1.745191717478262e-06, "loss": 0.82756221, "num_input_tokens_seen": 99532360, "step": 4611, "time_per_iteration": 2.7633533477783203 }, { "auxiliary_loss_clip": 0.01164145, "auxiliary_loss_mlp": 0.01023474, "balance_loss_clip": 1.04996085, "balance_loss_mlp": 1.01552224, "epoch": 0.5545602116274875, "flos": 25518661297920.0, "grad_norm": 1.8491473076150742, "language_loss": 0.79760337, "learning_rate": 1.7444191131044948e-06, "loss": 0.81947958, "num_input_tokens_seen": 99552635, "step": 4612, "time_per_iteration": 2.8417768478393555 }, { "auxiliary_loss_clip": 0.0117143, "auxiliary_loss_mlp": 0.01026689, "balance_loss_clip": 1.05276227, "balance_loss_mlp": 1.01889825, "epoch": 0.5546804545181266, "flos": 20995568985600.0, "grad_norm": 1.688543031761769, "language_loss": 0.73365164, "learning_rate": 1.7436465475021456e-06, "loss": 0.75563276, "num_input_tokens_seen": 99572685, "step": 4613, "time_per_iteration": 2.858739137649536 }, { "auxiliary_loss_clip": 0.01160236, "auxiliary_loss_mlp": 0.01024943, "balance_loss_clip": 1.05078983, "balance_loss_mlp": 1.01714659, "epoch": 0.5548006974087657, "flos": 26833638297600.0, "grad_norm": 2.208221533215043, "language_loss": 0.71392763, "learning_rate": 1.7428740207884111e-06, "loss": 0.7357794, "num_input_tokens_seen": 99593565, "step": 4614, "time_per_iteration": 2.8516077995300293 }, { "auxiliary_loss_clip": 0.01164195, "auxiliary_loss_mlp": 0.0102821, "balance_loss_clip": 1.0502007, "balance_loss_mlp": 1.02021444, "epoch": 0.5549209402994048, "flos": 33656414031360.0, "grad_norm": 2.0818125780317276, "language_loss": 0.61248034, "learning_rate": 1.7421015330804833e-06, "loss": 0.63440436, "num_input_tokens_seen": 99613485, "step": 4615, "time_per_iteration": 2.8543550968170166 }, { "auxiliary_loss_clip": 0.01176061, "auxiliary_loss_mlp": 0.01024701, "balance_loss_clip": 1.05069888, "balance_loss_mlp": 1.01694608, "epoch": 0.5550411831900439, "flos": 23769524609280.0, "grad_norm": 2.0344380496821515, "language_loss": 0.72509295, "learning_rate": 1.7413290844955475e-06, "loss": 0.74710053, "num_input_tokens_seen": 99633515, "step": 4616, "time_per_iteration": 2.7941415309906006 }, { "auxiliary_loss_clip": 0.01169751, "auxiliary_loss_mlp": 0.01031512, "balance_loss_clip": 1.05502915, "balance_loss_mlp": 1.02389479, "epoch": 0.555161426080683, "flos": 21651189978240.0, "grad_norm": 1.684548814650208, "language_loss": 0.78078783, "learning_rate": 1.7405566751507843e-06, "loss": 0.80280048, "num_input_tokens_seen": 99651560, "step": 4617, "time_per_iteration": 2.7854583263397217 }, { "auxiliary_loss_clip": 0.01167019, "auxiliary_loss_mlp": 0.01025796, "balance_loss_clip": 1.05166698, "balance_loss_mlp": 1.01785028, "epoch": 0.555281668971322, "flos": 49563116605440.0, "grad_norm": 1.739228183191114, "language_loss": 0.67680788, "learning_rate": 1.7397843051633668e-06, "loss": 0.69873601, "num_input_tokens_seen": 99674255, "step": 4618, "time_per_iteration": 3.032428503036499 }, { "auxiliary_loss_clip": 0.01170901, "auxiliary_loss_mlp": 0.01024919, "balance_loss_clip": 1.05157888, "balance_loss_mlp": 1.01733732, "epoch": 0.5554019118619612, "flos": 20741608851840.0, "grad_norm": 1.6705739839932794, "language_loss": 0.71601605, "learning_rate": 1.739011974650464e-06, "loss": 0.73797429, "num_input_tokens_seen": 99693585, "step": 4619, "time_per_iteration": 2.81233811378479 }, { "auxiliary_loss_clip": 0.01168131, "auxiliary_loss_mlp": 0.01031074, "balance_loss_clip": 1.05160558, "balance_loss_mlp": 1.02293754, "epoch": 0.5555221547526003, "flos": 25483217552640.0, "grad_norm": 1.7260648801957008, "language_loss": 0.76553273, "learning_rate": 1.7382396837292365e-06, "loss": 0.78752476, "num_input_tokens_seen": 99714045, "step": 4620, "time_per_iteration": 2.8650922775268555 }, { "auxiliary_loss_clip": 0.01181175, "auxiliary_loss_mlp": 0.01032749, "balance_loss_clip": 1.05448413, "balance_loss_mlp": 1.02453506, "epoch": 0.5556423976432393, "flos": 21762513204480.0, "grad_norm": 1.6759704375672897, "language_loss": 0.73781359, "learning_rate": 1.737467432516841e-06, "loss": 0.75995284, "num_input_tokens_seen": 99734145, "step": 4621, "time_per_iteration": 2.7671828269958496 }, { "auxiliary_loss_clip": 0.0117297, "auxiliary_loss_mlp": 0.01033508, "balance_loss_clip": 1.05245888, "balance_loss_mlp": 1.02544963, "epoch": 0.5557626405338785, "flos": 24900171989760.0, "grad_norm": 3.6116065163260664, "language_loss": 0.74303299, "learning_rate": 1.7366952211304274e-06, "loss": 0.7650978, "num_input_tokens_seen": 99751990, "step": 4622, "time_per_iteration": 4.604590892791748 }, { "auxiliary_loss_clip": 0.01164966, "auxiliary_loss_mlp": 0.01021753, "balance_loss_clip": 1.05174851, "balance_loss_mlp": 1.01433253, "epoch": 0.5558828834245175, "flos": 18697501676160.0, "grad_norm": 2.153695362035074, "language_loss": 0.83499569, "learning_rate": 1.735923049687139e-06, "loss": 0.85686284, "num_input_tokens_seen": 99768565, "step": 4623, "time_per_iteration": 3.7545485496520996 }, { "auxiliary_loss_clip": 0.01168877, "auxiliary_loss_mlp": 0.01033371, "balance_loss_clip": 1.05274284, "balance_loss_mlp": 1.02562273, "epoch": 0.5560031263151566, "flos": 27272179445760.0, "grad_norm": 1.4848334885380954, "language_loss": 0.7391293, "learning_rate": 1.7351509183041144e-06, "loss": 0.76115179, "num_input_tokens_seen": 99788895, "step": 4624, "time_per_iteration": 2.7927706241607666 }, { "auxiliary_loss_clip": 0.01181314, "auxiliary_loss_mlp": 0.01025532, "balance_loss_clip": 1.05388522, "balance_loss_mlp": 1.01795673, "epoch": 0.5561233692057957, "flos": 23403738458880.0, "grad_norm": 1.7229286620510607, "language_loss": 0.71722269, "learning_rate": 1.7343788270984852e-06, "loss": 0.73929119, "num_input_tokens_seen": 99808035, "step": 4625, "time_per_iteration": 2.6751046180725098 }, { "auxiliary_loss_clip": 0.01171851, "auxiliary_loss_mlp": 0.01026883, "balance_loss_clip": 1.05405664, "balance_loss_mlp": 1.01943278, "epoch": 0.5562436120964348, "flos": 37670867804160.0, "grad_norm": 2.6854560624485195, "language_loss": 0.74647379, "learning_rate": 1.7336067761873764e-06, "loss": 0.76846111, "num_input_tokens_seen": 99830460, "step": 4626, "time_per_iteration": 2.92767596244812 }, { "auxiliary_loss_clip": 0.01183761, "auxiliary_loss_mlp": 0.01028709, "balance_loss_clip": 1.05378366, "balance_loss_mlp": 1.02053154, "epoch": 0.5563638549870739, "flos": 25155245445120.0, "grad_norm": 1.9568339187324326, "language_loss": 0.75943655, "learning_rate": 1.7328347656879076e-06, "loss": 0.78156126, "num_input_tokens_seen": 99850320, "step": 4627, "time_per_iteration": 2.8797969818115234 }, { "auxiliary_loss_clip": 0.01172123, "auxiliary_loss_mlp": 0.0102834, "balance_loss_clip": 1.05457783, "balance_loss_mlp": 1.02034676, "epoch": 0.556484097877713, "flos": 13581810783360.0, "grad_norm": 2.6363843598055072, "language_loss": 0.683792, "learning_rate": 1.7320627957171927e-06, "loss": 0.7057966, "num_input_tokens_seen": 99864980, "step": 4628, "time_per_iteration": 2.6777517795562744 }, { "auxiliary_loss_clip": 0.01180985, "auxiliary_loss_mlp": 0.01027396, "balance_loss_clip": 1.05543804, "balance_loss_mlp": 1.01975465, "epoch": 0.5566043407683521, "flos": 24681368292480.0, "grad_norm": 1.8963900325157255, "language_loss": 0.81479299, "learning_rate": 1.7312908663923382e-06, "loss": 0.83687675, "num_input_tokens_seen": 99881155, "step": 4629, "time_per_iteration": 2.883553981781006 }, { "auxiliary_loss_clip": 0.01172619, "auxiliary_loss_mlp": 0.01030836, "balance_loss_clip": 1.0527935, "balance_loss_mlp": 1.02260435, "epoch": 0.5567245836589911, "flos": 20588161950720.0, "grad_norm": 2.1493024483000758, "language_loss": 0.67223001, "learning_rate": 1.7305189778304463e-06, "loss": 0.69426453, "num_input_tokens_seen": 99899330, "step": 4630, "time_per_iteration": 3.676651954650879 }, { "auxiliary_loss_clip": 0.01169565, "auxiliary_loss_mlp": 0.01028108, "balance_loss_clip": 1.05283129, "balance_loss_mlp": 1.02027321, "epoch": 0.5568448265496303, "flos": 20704189858560.0, "grad_norm": 1.8741673904058502, "language_loss": 0.79754913, "learning_rate": 1.729747130148611e-06, "loss": 0.81952584, "num_input_tokens_seen": 99918525, "step": 4631, "time_per_iteration": 2.702349901199341 }, { "auxiliary_loss_clip": 0.01177599, "auxiliary_loss_mlp": 0.01026904, "balance_loss_clip": 1.05429721, "balance_loss_mlp": 1.01876235, "epoch": 0.5569650694402694, "flos": 25302910256640.0, "grad_norm": 2.2097265021611583, "language_loss": 0.76866937, "learning_rate": 1.7289753234639208e-06, "loss": 0.79071438, "num_input_tokens_seen": 99937500, "step": 4632, "time_per_iteration": 2.811396598815918 }, { "auxiliary_loss_clip": 0.01178246, "auxiliary_loss_mlp": 0.01025954, "balance_loss_clip": 1.05207288, "balance_loss_mlp": 1.01793671, "epoch": 0.5570853123309084, "flos": 19712623939200.0, "grad_norm": 2.6627874277591297, "language_loss": 0.76839018, "learning_rate": 1.7282035578934592e-06, "loss": 0.79043221, "num_input_tokens_seen": 99955665, "step": 4633, "time_per_iteration": 2.6394593715667725 }, { "auxiliary_loss_clip": 0.01165016, "auxiliary_loss_mlp": 0.01022594, "balance_loss_clip": 1.05264056, "balance_loss_mlp": 1.01456547, "epoch": 0.5572055552215476, "flos": 16108091153280.0, "grad_norm": 1.726175946664406, "language_loss": 0.78810143, "learning_rate": 1.727431833554301e-06, "loss": 0.80997759, "num_input_tokens_seen": 99974140, "step": 4634, "time_per_iteration": 2.7605173587799072 }, { "auxiliary_loss_clip": 0.01169054, "auxiliary_loss_mlp": 0.01025262, "balance_loss_clip": 1.05200505, "balance_loss_mlp": 1.0172931, "epoch": 0.5573257981121866, "flos": 17128815937920.0, "grad_norm": 1.821094168328765, "language_loss": 0.77315605, "learning_rate": 1.7266601505635175e-06, "loss": 0.79509926, "num_input_tokens_seen": 99991480, "step": 4635, "time_per_iteration": 2.66570782661438 }, { "auxiliary_loss_clip": 0.01175682, "auxiliary_loss_mlp": 0.01028454, "balance_loss_clip": 1.05358171, "balance_loss_mlp": 1.02082419, "epoch": 0.5574460410028257, "flos": 18807029222400.0, "grad_norm": 3.3631012835689265, "language_loss": 0.75533879, "learning_rate": 1.7258885090381717e-06, "loss": 0.77738011, "num_input_tokens_seen": 100009520, "step": 4636, "time_per_iteration": 2.716428279876709 }, { "auxiliary_loss_clip": 0.01172648, "auxiliary_loss_mlp": 0.01027423, "balance_loss_clip": 1.0516727, "balance_loss_mlp": 1.01975787, "epoch": 0.5575662838934649, "flos": 29642678530560.0, "grad_norm": 3.258744987660693, "language_loss": 0.78570008, "learning_rate": 1.7251169090953213e-06, "loss": 0.80770087, "num_input_tokens_seen": 100029995, "step": 4637, "time_per_iteration": 2.7049264907836914 }, { "auxiliary_loss_clip": 0.01175985, "auxiliary_loss_mlp": 0.01026776, "balance_loss_clip": 1.05269396, "balance_loss_mlp": 1.01933098, "epoch": 0.5576865267841039, "flos": 22054466949120.0, "grad_norm": 5.5888253803865755, "language_loss": 0.76744032, "learning_rate": 1.7243453508520168e-06, "loss": 0.78946793, "num_input_tokens_seen": 100046980, "step": 4638, "time_per_iteration": 2.7947816848754883 }, { "auxiliary_loss_clip": 0.01172971, "auxiliary_loss_mlp": 0.01032437, "balance_loss_clip": 1.05142701, "balance_loss_mlp": 1.024122, "epoch": 0.557806769674743, "flos": 17196040241280.0, "grad_norm": 2.0550901780180078, "language_loss": 0.84470403, "learning_rate": 1.7235738344253038e-06, "loss": 0.86675811, "num_input_tokens_seen": 100060610, "step": 4639, "time_per_iteration": 2.727487564086914 }, { "auxiliary_loss_clip": 0.01172131, "auxiliary_loss_mlp": 0.01026775, "balance_loss_clip": 1.0529232, "balance_loss_mlp": 1.0188241, "epoch": 0.557927012565382, "flos": 24712717887360.0, "grad_norm": 2.016894660500113, "language_loss": 0.83006489, "learning_rate": 1.72280235993222e-06, "loss": 0.852054, "num_input_tokens_seen": 100078915, "step": 4640, "time_per_iteration": 2.6895527839660645 }, { "auxiliary_loss_clip": 0.01170976, "auxiliary_loss_mlp": 0.01056047, "balance_loss_clip": 1.05109799, "balance_loss_mlp": 1.01923656, "epoch": 0.5580472554560212, "flos": 16983090460800.0, "grad_norm": 2.414887429468081, "language_loss": 0.6995343, "learning_rate": 1.722030927489798e-06, "loss": 0.7218045, "num_input_tokens_seen": 100096195, "step": 4641, "time_per_iteration": 2.711885452270508 }, { "auxiliary_loss_clip": 0.01168739, "auxiliary_loss_mlp": 0.01027046, "balance_loss_clip": 1.05309105, "balance_loss_mlp": 1.01952362, "epoch": 0.5581674983466602, "flos": 23509100027520.0, "grad_norm": 1.6543130863183215, "language_loss": 0.74330342, "learning_rate": 1.7212595372150634e-06, "loss": 0.76526117, "num_input_tokens_seen": 100116175, "step": 4642, "time_per_iteration": 2.7314717769622803 }, { "auxiliary_loss_clip": 0.01178914, "auxiliary_loss_mlp": 0.01029374, "balance_loss_clip": 1.05409622, "balance_loss_mlp": 1.02142215, "epoch": 0.5582877412372993, "flos": 13480291969920.0, "grad_norm": 3.0501227310152705, "language_loss": 0.7295121, "learning_rate": 1.720488189225035e-06, "loss": 0.75159502, "num_input_tokens_seen": 100133875, "step": 4643, "time_per_iteration": 2.6537532806396484 }, { "auxiliary_loss_clip": 0.01176894, "auxiliary_loss_mlp": 0.01029725, "balance_loss_clip": 1.05218434, "balance_loss_mlp": 1.02189302, "epoch": 0.5584079841279385, "flos": 21903605827200.0, "grad_norm": 3.2304903810893237, "language_loss": 0.79477847, "learning_rate": 1.7197168836367265e-06, "loss": 0.8168447, "num_input_tokens_seen": 100150685, "step": 4644, "time_per_iteration": 2.7473180294036865 }, { "auxiliary_loss_clip": 0.01171009, "auxiliary_loss_mlp": 0.01065574, "balance_loss_clip": 1.05080879, "balance_loss_mlp": 1.02702618, "epoch": 0.5585282270185775, "flos": 18843550375680.0, "grad_norm": 1.8979833707788785, "language_loss": 0.81781346, "learning_rate": 1.7189456205671433e-06, "loss": 0.84017932, "num_input_tokens_seen": 100169530, "step": 4645, "time_per_iteration": 2.74074387550354 }, { "auxiliary_loss_clip": 0.01183799, "auxiliary_loss_mlp": 0.01031068, "balance_loss_clip": 1.05489254, "balance_loss_mlp": 1.02318215, "epoch": 0.5586484699092166, "flos": 21868449390720.0, "grad_norm": 1.9749805070106425, "language_loss": 0.82768768, "learning_rate": 1.7181744001332866e-06, "loss": 0.84983641, "num_input_tokens_seen": 100188140, "step": 4646, "time_per_iteration": 2.788557529449463 }, { "auxiliary_loss_clip": 0.01177013, "auxiliary_loss_mlp": 0.01025411, "balance_loss_clip": 1.05444884, "balance_loss_mlp": 1.01803184, "epoch": 0.5587687127998557, "flos": 22893232412160.0, "grad_norm": 2.150965155672515, "language_loss": 0.63488543, "learning_rate": 1.7174032224521493e-06, "loss": 0.6569097, "num_input_tokens_seen": 100206850, "step": 4647, "time_per_iteration": 2.6736671924591064 }, { "auxiliary_loss_clip": 0.01171611, "auxiliary_loss_mlp": 0.01027161, "balance_loss_clip": 1.05081177, "balance_loss_mlp": 1.01977587, "epoch": 0.5588889556904948, "flos": 20303067703680.0, "grad_norm": 1.557065719103377, "language_loss": 0.69797492, "learning_rate": 1.7166320876407184e-06, "loss": 0.71996266, "num_input_tokens_seen": 100226270, "step": 4648, "time_per_iteration": 3.607022523880005 }, { "auxiliary_loss_clip": 0.01183372, "auxiliary_loss_mlp": 0.01056419, "balance_loss_clip": 1.05674875, "balance_loss_mlp": 1.02060807, "epoch": 0.5590091985811338, "flos": 16472153450880.0, "grad_norm": 2.0717770664766455, "language_loss": 0.68004274, "learning_rate": 1.7158609958159742e-06, "loss": 0.70244074, "num_input_tokens_seen": 100243675, "step": 4649, "time_per_iteration": 4.614681959152222 }, { "auxiliary_loss_clip": 0.0117447, "auxiliary_loss_mlp": 0.0103007, "balance_loss_clip": 1.05453169, "balance_loss_mlp": 1.0220654, "epoch": 0.559129441471773, "flos": 14532186781440.0, "grad_norm": 3.7189626161320377, "language_loss": 0.78664911, "learning_rate": 1.7150899470948911e-06, "loss": 0.80869448, "num_input_tokens_seen": 100258940, "step": 4650, "time_per_iteration": 2.735854387283325 }, { "auxiliary_loss_clip": 0.01076328, "auxiliary_loss_mlp": 0.0100137, "balance_loss_clip": 1.02259302, "balance_loss_mlp": 1.00026762, "epoch": 0.5592496843624121, "flos": 60521009852160.0, "grad_norm": 0.7989929558373983, "language_loss": 0.56623018, "learning_rate": 1.7143189415944365e-06, "loss": 0.58700716, "num_input_tokens_seen": 100323400, "step": 4651, "time_per_iteration": 3.3851239681243896 }, { "auxiliary_loss_clip": 0.01172278, "auxiliary_loss_mlp": 0.01024902, "balance_loss_clip": 1.05269623, "balance_loss_mlp": 1.0166471, "epoch": 0.5593699272530511, "flos": 20886256920960.0, "grad_norm": 1.8162145913221657, "language_loss": 0.76546061, "learning_rate": 1.7135479794315714e-06, "loss": 0.78743243, "num_input_tokens_seen": 100340355, "step": 4652, "time_per_iteration": 2.7240896224975586 }, { "auxiliary_loss_clip": 0.01167577, "auxiliary_loss_mlp": 0.01024833, "balance_loss_clip": 1.05354309, "balance_loss_mlp": 1.01735854, "epoch": 0.5594901701436903, "flos": 12896743616640.0, "grad_norm": 1.9413884088459032, "language_loss": 0.78961635, "learning_rate": 1.7127770607232502e-06, "loss": 0.81154042, "num_input_tokens_seen": 100358900, "step": 4653, "time_per_iteration": 2.862382173538208 }, { "auxiliary_loss_clip": 0.01172434, "auxiliary_loss_mlp": 0.01032705, "balance_loss_clip": 1.05135131, "balance_loss_mlp": 1.02470875, "epoch": 0.5596104130343293, "flos": 23112107936640.0, "grad_norm": 1.8237359760150642, "language_loss": 0.7974242, "learning_rate": 1.7120061855864204e-06, "loss": 0.81947553, "num_input_tokens_seen": 100378910, "step": 4654, "time_per_iteration": 2.8295514583587646 }, { "auxiliary_loss_clip": 0.01173825, "auxiliary_loss_mlp": 0.01028027, "balance_loss_clip": 1.05437887, "balance_loss_mlp": 1.01995635, "epoch": 0.5597306559249684, "flos": 25957812977280.0, "grad_norm": 2.6184703444239608, "language_loss": 0.71438885, "learning_rate": 1.7112353541380233e-06, "loss": 0.7364074, "num_input_tokens_seen": 100398770, "step": 4655, "time_per_iteration": 3.849895477294922 }, { "auxiliary_loss_clip": 0.01176376, "auxiliary_loss_mlp": 0.01029319, "balance_loss_clip": 1.05589736, "balance_loss_mlp": 1.02096236, "epoch": 0.5598508988156076, "flos": 22492289825280.0, "grad_norm": 1.5960626803616522, "language_loss": 0.71985519, "learning_rate": 1.7104645664949931e-06, "loss": 0.74191213, "num_input_tokens_seen": 100421240, "step": 4656, "time_per_iteration": 2.8735313415527344 }, { "auxiliary_loss_clip": 0.01176484, "auxiliary_loss_mlp": 0.01032014, "balance_loss_clip": 1.05298924, "balance_loss_mlp": 1.02395558, "epoch": 0.5599711417062466, "flos": 23112538899840.0, "grad_norm": 1.908214179894068, "language_loss": 0.7168237, "learning_rate": 1.7096938227742584e-06, "loss": 0.73890865, "num_input_tokens_seen": 100442370, "step": 4657, "time_per_iteration": 2.8266923427581787 }, { "auxiliary_loss_clip": 0.0117729, "auxiliary_loss_mlp": 0.01027993, "balance_loss_clip": 1.05255914, "balance_loss_mlp": 1.02085304, "epoch": 0.5600913845968857, "flos": 22339345714560.0, "grad_norm": 2.2116951412111066, "language_loss": 0.84245038, "learning_rate": 1.70892312309274e-06, "loss": 0.86450326, "num_input_tokens_seen": 100460260, "step": 4658, "time_per_iteration": 2.874797821044922 }, { "auxiliary_loss_clip": 0.01175433, "auxiliary_loss_mlp": 0.01026595, "balance_loss_clip": 1.05118513, "balance_loss_mlp": 1.01869404, "epoch": 0.5602116274875248, "flos": 17633791290240.0, "grad_norm": 2.0944205636736966, "language_loss": 0.68312198, "learning_rate": 1.7081524675673523e-06, "loss": 0.70514226, "num_input_tokens_seen": 100475750, "step": 4659, "time_per_iteration": 2.7968897819519043 }, { "auxiliary_loss_clip": 0.01077629, "auxiliary_loss_mlp": 0.01001682, "balance_loss_clip": 1.02210236, "balance_loss_mlp": 1.00069809, "epoch": 0.5603318703781639, "flos": 70115945529600.0, "grad_norm": 0.7750497021852478, "language_loss": 0.59593779, "learning_rate": 1.7073818563150026e-06, "loss": 0.61673087, "num_input_tokens_seen": 100537830, "step": 4660, "time_per_iteration": 3.411824941635132 }, { "auxiliary_loss_clip": 0.01173405, "auxiliary_loss_mlp": 0.01028594, "balance_loss_clip": 1.05362153, "balance_loss_mlp": 1.02057159, "epoch": 0.560452113268803, "flos": 18545850455040.0, "grad_norm": 2.3163887697001466, "language_loss": 0.86432302, "learning_rate": 1.7066112894525935e-06, "loss": 0.886343, "num_input_tokens_seen": 100555910, "step": 4661, "time_per_iteration": 2.908478260040283 }, { "auxiliary_loss_clip": 0.01166829, "auxiliary_loss_mlp": 0.01024335, "balance_loss_clip": 1.05411983, "balance_loss_mlp": 1.01649117, "epoch": 0.5605723561594421, "flos": 25264665250560.0, "grad_norm": 1.702368177311979, "language_loss": 0.72962922, "learning_rate": 1.7058407670970177e-06, "loss": 0.7515409, "num_input_tokens_seen": 100577385, "step": 4662, "time_per_iteration": 2.7846107482910156 }, { "auxiliary_loss_clip": 0.01181005, "auxiliary_loss_mlp": 0.0103213, "balance_loss_clip": 1.05447721, "balance_loss_mlp": 1.02461946, "epoch": 0.5606925990500812, "flos": 20594949621120.0, "grad_norm": 1.7754753422961962, "language_loss": 0.6124953, "learning_rate": 1.7050702893651643e-06, "loss": 0.63462663, "num_input_tokens_seen": 100596965, "step": 4663, "time_per_iteration": 2.812443733215332 }, { "auxiliary_loss_clip": 0.01177068, "auxiliary_loss_mlp": 0.01027616, "balance_loss_clip": 1.05569112, "balance_loss_mlp": 1.02007604, "epoch": 0.5608128419407202, "flos": 35006044677120.0, "grad_norm": 2.2111926999966185, "language_loss": 0.75686395, "learning_rate": 1.7042998563739134e-06, "loss": 0.77891088, "num_input_tokens_seen": 100615315, "step": 4664, "time_per_iteration": 2.8121767044067383 }, { "auxiliary_loss_clip": 0.01183414, "auxiliary_loss_mlp": 0.01025286, "balance_loss_clip": 1.05515885, "balance_loss_mlp": 1.01704216, "epoch": 0.5609330848313594, "flos": 24639819235200.0, "grad_norm": 3.5215673620500163, "language_loss": 0.71725333, "learning_rate": 1.703529468240139e-06, "loss": 0.73934031, "num_input_tokens_seen": 100634185, "step": 4665, "time_per_iteration": 2.8232626914978027 }, { "auxiliary_loss_clip": 0.01168664, "auxiliary_loss_mlp": 0.01026548, "balance_loss_clip": 1.05335498, "balance_loss_mlp": 1.01887643, "epoch": 0.5610533277219985, "flos": 18762894385920.0, "grad_norm": 3.2071852340798532, "language_loss": 0.73788422, "learning_rate": 1.7027591250807088e-06, "loss": 0.75983632, "num_input_tokens_seen": 100651360, "step": 4666, "time_per_iteration": 2.7742764949798584 }, { "auxiliary_loss_clip": 0.01182097, "auxiliary_loss_mlp": 0.01026008, "balance_loss_clip": 1.055462, "balance_loss_mlp": 1.01859951, "epoch": 0.5611735706126375, "flos": 15012384727680.0, "grad_norm": 2.110775595113035, "language_loss": 0.84291542, "learning_rate": 1.7019888270124825e-06, "loss": 0.86499649, "num_input_tokens_seen": 100668525, "step": 4667, "time_per_iteration": 2.762155771255493 }, { "auxiliary_loss_clip": 0.01182689, "auxiliary_loss_mlp": 0.01033352, "balance_loss_clip": 1.05811214, "balance_loss_mlp": 1.02574635, "epoch": 0.5612938135032767, "flos": 16468167041280.0, "grad_norm": 1.8747309295205858, "language_loss": 0.82050824, "learning_rate": 1.7012185741523147e-06, "loss": 0.84266865, "num_input_tokens_seen": 100684850, "step": 4668, "time_per_iteration": 2.789856195449829 }, { "auxiliary_loss_clip": 0.01180757, "auxiliary_loss_mlp": 0.01023185, "balance_loss_clip": 1.05477476, "balance_loss_mlp": 1.01566887, "epoch": 0.5614140563939157, "flos": 25666433850240.0, "grad_norm": 2.7979345797798985, "language_loss": 0.62794012, "learning_rate": 1.7004483666170514e-06, "loss": 0.64997959, "num_input_tokens_seen": 100705345, "step": 4669, "time_per_iteration": 2.717777729034424 }, { "auxiliary_loss_clip": 0.01175304, "auxiliary_loss_mlp": 0.01025245, "balance_loss_clip": 1.05289984, "balance_loss_mlp": 1.0178299, "epoch": 0.5615342992845548, "flos": 24717566223360.0, "grad_norm": 2.354376668607738, "language_loss": 0.80192494, "learning_rate": 1.699678204523533e-06, "loss": 0.8239305, "num_input_tokens_seen": 100725210, "step": 4670, "time_per_iteration": 2.88374400138855 }, { "auxiliary_loss_clip": 0.01178767, "auxiliary_loss_mlp": 0.01027676, "balance_loss_clip": 1.05791569, "balance_loss_mlp": 1.01999855, "epoch": 0.5616545421751938, "flos": 22015934634240.0, "grad_norm": 3.086075702096142, "language_loss": 0.69370061, "learning_rate": 1.6989080879885918e-06, "loss": 0.715765, "num_input_tokens_seen": 100743070, "step": 4671, "time_per_iteration": 2.7323503494262695 }, { "auxiliary_loss_clip": 0.01076568, "auxiliary_loss_mlp": 0.01003412, "balance_loss_clip": 1.02215302, "balance_loss_mlp": 1.00230908, "epoch": 0.561774785065833, "flos": 53760358690560.0, "grad_norm": 0.8958322953935285, "language_loss": 0.609676, "learning_rate": 1.6981380171290544e-06, "loss": 0.63047588, "num_input_tokens_seen": 100804095, "step": 4672, "time_per_iteration": 3.319091320037842 }, { "auxiliary_loss_clip": 0.01169643, "auxiliary_loss_mlp": 0.01025927, "balance_loss_clip": 1.05169225, "balance_loss_mlp": 1.01794624, "epoch": 0.5618950279564721, "flos": 19750007018880.0, "grad_norm": 2.126270356222238, "language_loss": 0.74232543, "learning_rate": 1.6973679920617396e-06, "loss": 0.76428115, "num_input_tokens_seen": 100821630, "step": 4673, "time_per_iteration": 2.700118064880371 }, { "auxiliary_loss_clip": 0.01170966, "auxiliary_loss_mlp": 0.01023286, "balance_loss_clip": 1.05507565, "balance_loss_mlp": 1.01515627, "epoch": 0.5620152708471111, "flos": 16800592435200.0, "grad_norm": 2.277273961288019, "language_loss": 0.8530038, "learning_rate": 1.6965980129034603e-06, "loss": 0.87494636, "num_input_tokens_seen": 100839015, "step": 4674, "time_per_iteration": 4.6743552684783936 }, { "auxiliary_loss_clip": 0.01172012, "auxiliary_loss_mlp": 0.01025974, "balance_loss_clip": 1.05412102, "balance_loss_mlp": 1.01871467, "epoch": 0.5621355137377503, "flos": 26797799502720.0, "grad_norm": 1.6619791958822405, "language_loss": 0.7655009, "learning_rate": 1.6958280797710209e-06, "loss": 0.78748077, "num_input_tokens_seen": 100860940, "step": 4675, "time_per_iteration": 3.780743360519409 }, { "auxiliary_loss_clip": 0.01073488, "auxiliary_loss_mlp": 0.01000447, "balance_loss_clip": 1.01779258, "balance_loss_mlp": 0.99934447, "epoch": 0.5622557566283893, "flos": 61207046686080.0, "grad_norm": 0.7208003260232105, "language_loss": 0.54708695, "learning_rate": 1.6950581927812198e-06, "loss": 0.56782627, "num_input_tokens_seen": 100920510, "step": 4676, "time_per_iteration": 3.1626296043395996 }, { "auxiliary_loss_clip": 0.01173912, "auxiliary_loss_mlp": 0.01026974, "balance_loss_clip": 1.05143654, "balance_loss_mlp": 1.01965773, "epoch": 0.5623759995190284, "flos": 26468534505600.0, "grad_norm": 2.0098923280056225, "language_loss": 0.79007137, "learning_rate": 1.6942883520508486e-06, "loss": 0.81208026, "num_input_tokens_seen": 100939245, "step": 4677, "time_per_iteration": 2.7355589866638184 }, { "auxiliary_loss_clip": 0.01179117, "auxiliary_loss_mlp": 0.01026332, "balance_loss_clip": 1.05386996, "balance_loss_mlp": 1.01863706, "epoch": 0.5624962424096676, "flos": 19390900798080.0, "grad_norm": 2.5500765245862973, "language_loss": 0.77356654, "learning_rate": 1.693518557696691e-06, "loss": 0.79562104, "num_input_tokens_seen": 100958385, "step": 4678, "time_per_iteration": 2.695697546005249 }, { "auxiliary_loss_clip": 0.01172837, "auxiliary_loss_mlp": 0.0102846, "balance_loss_clip": 1.05147088, "balance_loss_mlp": 1.02058649, "epoch": 0.5626164853003066, "flos": 20667345482880.0, "grad_norm": 1.940776712171775, "language_loss": 0.89417028, "learning_rate": 1.6927488098355252e-06, "loss": 0.91618329, "num_input_tokens_seen": 100976015, "step": 4679, "time_per_iteration": 2.6577537059783936 }, { "auxiliary_loss_clip": 0.010783, "auxiliary_loss_mlp": 0.01000712, "balance_loss_clip": 1.0184536, "balance_loss_mlp": 0.99966925, "epoch": 0.5627367281909457, "flos": 62766071665920.0, "grad_norm": 0.9179073211047794, "language_loss": 0.63135099, "learning_rate": 1.6919791085841201e-06, "loss": 0.65214109, "num_input_tokens_seen": 101033425, "step": 4680, "time_per_iteration": 3.2282145023345947 }, { "auxiliary_loss_clip": 0.01170964, "auxiliary_loss_mlp": 0.01027364, "balance_loss_clip": 1.05113637, "balance_loss_mlp": 1.01905501, "epoch": 0.5628569710815848, "flos": 12787144243200.0, "grad_norm": 3.296832549411404, "language_loss": 0.79023123, "learning_rate": 1.6912094540592396e-06, "loss": 0.81221449, "num_input_tokens_seen": 101048945, "step": 4681, "time_per_iteration": 2.704998254776001 }, { "auxiliary_loss_clip": 0.01173302, "auxiliary_loss_mlp": 0.0102715, "balance_loss_clip": 1.05162168, "balance_loss_mlp": 1.01950216, "epoch": 0.5629772139722239, "flos": 13762082165760.0, "grad_norm": 2.6700274312623042, "language_loss": 0.81289077, "learning_rate": 1.6904398463776393e-06, "loss": 0.83489525, "num_input_tokens_seen": 101062745, "step": 4682, "time_per_iteration": 3.6524291038513184 }, { "auxiliary_loss_clip": 0.0117439, "auxiliary_loss_mlp": 0.0102205, "balance_loss_clip": 1.04936528, "balance_loss_mlp": 1.01468921, "epoch": 0.5630974568628629, "flos": 21467830026240.0, "grad_norm": 1.669866716558194, "language_loss": 0.72871304, "learning_rate": 1.6896702856560683e-06, "loss": 0.75067741, "num_input_tokens_seen": 101081840, "step": 4683, "time_per_iteration": 2.7188704013824463 }, { "auxiliary_loss_clip": 0.01169391, "auxiliary_loss_mlp": 0.0103351, "balance_loss_clip": 1.05177903, "balance_loss_mlp": 1.02554715, "epoch": 0.5632176997535021, "flos": 14245907385600.0, "grad_norm": 2.4604414082192463, "language_loss": 0.69667757, "learning_rate": 1.6889007720112677e-06, "loss": 0.71870655, "num_input_tokens_seen": 101099585, "step": 4684, "time_per_iteration": 2.750990867614746 }, { "auxiliary_loss_clip": 0.01176195, "auxiliary_loss_mlp": 0.01024675, "balance_loss_clip": 1.05311155, "balance_loss_mlp": 1.01709914, "epoch": 0.5633379426441412, "flos": 20812244947200.0, "grad_norm": 1.6353638976044955, "language_loss": 0.7688269, "learning_rate": 1.6881313055599734e-06, "loss": 0.79083562, "num_input_tokens_seen": 101119515, "step": 4685, "time_per_iteration": 2.746400833129883 }, { "auxiliary_loss_clip": 0.01162736, "auxiliary_loss_mlp": 0.01027887, "balance_loss_clip": 1.05250287, "balance_loss_mlp": 1.01994741, "epoch": 0.5634581855347802, "flos": 22600883617920.0, "grad_norm": 2.300852235984994, "language_loss": 0.82203621, "learning_rate": 1.6873618864189117e-06, "loss": 0.84394246, "num_input_tokens_seen": 101135285, "step": 4686, "time_per_iteration": 2.7622294425964355 }, { "auxiliary_loss_clip": 0.01174441, "auxiliary_loss_mlp": 0.01024781, "balance_loss_clip": 1.05102825, "balance_loss_mlp": 1.01714599, "epoch": 0.5635784284254194, "flos": 21506972872320.0, "grad_norm": 2.7792690322840947, "language_loss": 0.78381103, "learning_rate": 1.686592514704803e-06, "loss": 0.80580324, "num_input_tokens_seen": 101152680, "step": 4687, "time_per_iteration": 2.749558210372925 }, { "auxiliary_loss_clip": 0.01167885, "auxiliary_loss_mlp": 0.01022582, "balance_loss_clip": 1.05168235, "balance_loss_mlp": 1.01520324, "epoch": 0.5636986713160584, "flos": 19827466698240.0, "grad_norm": 2.2773085955588384, "language_loss": 0.71196198, "learning_rate": 1.685823190534361e-06, "loss": 0.73386669, "num_input_tokens_seen": 101170920, "step": 4688, "time_per_iteration": 2.78570556640625 }, { "auxiliary_loss_clip": 0.01183189, "auxiliary_loss_mlp": 0.01025616, "balance_loss_clip": 1.05388379, "balance_loss_mlp": 1.01784325, "epoch": 0.5638189142066975, "flos": 19792453916160.0, "grad_norm": 1.7213851245841516, "language_loss": 0.83886695, "learning_rate": 1.6850539140242907e-06, "loss": 0.860955, "num_input_tokens_seen": 101190180, "step": 4689, "time_per_iteration": 2.669611692428589 }, { "auxiliary_loss_clip": 0.0117695, "auxiliary_loss_mlp": 0.0102364, "balance_loss_clip": 1.05149138, "balance_loss_mlp": 1.01592755, "epoch": 0.5639391570973367, "flos": 22893771116160.0, "grad_norm": 2.0277732629756975, "language_loss": 0.82243907, "learning_rate": 1.684284685291292e-06, "loss": 0.84444493, "num_input_tokens_seen": 101211825, "step": 4690, "time_per_iteration": 2.7679717540740967 }, { "auxiliary_loss_clip": 0.01179697, "auxiliary_loss_mlp": 0.01023247, "balance_loss_clip": 1.05327415, "balance_loss_mlp": 1.01512241, "epoch": 0.5640593999879757, "flos": 23727077712000.0, "grad_norm": 2.005566173940264, "language_loss": 0.8112092, "learning_rate": 1.683515504452055e-06, "loss": 0.8332386, "num_input_tokens_seen": 101229200, "step": 4691, "time_per_iteration": 2.7304952144622803 }, { "auxiliary_loss_clip": 0.01164998, "auxiliary_loss_mlp": 0.01026499, "balance_loss_clip": 1.0518012, "balance_loss_mlp": 1.01826811, "epoch": 0.5641796428786148, "flos": 22710123855360.0, "grad_norm": 2.2004652048263225, "language_loss": 0.66447771, "learning_rate": 1.6827463716232648e-06, "loss": 0.68639278, "num_input_tokens_seen": 101249860, "step": 4692, "time_per_iteration": 2.8752100467681885 }, { "auxiliary_loss_clip": 0.01175042, "auxiliary_loss_mlp": 0.01054183, "balance_loss_clip": 1.05317473, "balance_loss_mlp": 1.01866937, "epoch": 0.5642998857692539, "flos": 19791987039360.0, "grad_norm": 1.7049136553836446, "language_loss": 0.76000261, "learning_rate": 1.6819772869215972e-06, "loss": 0.78229487, "num_input_tokens_seen": 101268940, "step": 4693, "time_per_iteration": 2.699099540710449 }, { "auxiliary_loss_clip": 0.01176037, "auxiliary_loss_mlp": 0.01020599, "balance_loss_clip": 1.05288351, "balance_loss_mlp": 1.01320767, "epoch": 0.564420128659893, "flos": 23185904428800.0, "grad_norm": 1.7152158117708993, "language_loss": 0.82067025, "learning_rate": 1.6812082504637228e-06, "loss": 0.84263659, "num_input_tokens_seen": 101290260, "step": 4694, "time_per_iteration": 2.7893762588500977 }, { "auxiliary_loss_clip": 0.0117193, "auxiliary_loss_mlp": 0.0102303, "balance_loss_clip": 1.05266905, "balance_loss_mlp": 1.01581788, "epoch": 0.564540371550532, "flos": 23258264376960.0, "grad_norm": 1.520098576853909, "language_loss": 0.74295771, "learning_rate": 1.6804392623663025e-06, "loss": 0.76490736, "num_input_tokens_seen": 101311465, "step": 4695, "time_per_iteration": 2.8244214057922363 }, { "auxiliary_loss_clip": 0.01168234, "auxiliary_loss_mlp": 0.01024209, "balance_loss_clip": 1.05030608, "balance_loss_mlp": 1.01663327, "epoch": 0.5646606144411712, "flos": 25010058672000.0, "grad_norm": 1.858587038371832, "language_loss": 0.78284138, "learning_rate": 1.6796703227459935e-06, "loss": 0.80476582, "num_input_tokens_seen": 101329420, "step": 4696, "time_per_iteration": 2.8188235759735107 }, { "auxiliary_loss_clip": 0.01160151, "auxiliary_loss_mlp": 0.01021373, "balance_loss_clip": 1.05108523, "balance_loss_mlp": 1.01387155, "epoch": 0.5647808573318103, "flos": 36539645806080.0, "grad_norm": 1.7780572263739325, "language_loss": 0.76329088, "learning_rate": 1.6789014317194407e-06, "loss": 0.78510612, "num_input_tokens_seen": 101350900, "step": 4697, "time_per_iteration": 2.970506191253662 }, { "auxiliary_loss_clip": 0.01180641, "auxiliary_loss_mlp": 0.01029854, "balance_loss_clip": 1.05453587, "balance_loss_mlp": 1.02115726, "epoch": 0.5649011002224493, "flos": 22528451842560.0, "grad_norm": 2.6186467042434103, "language_loss": 0.72935081, "learning_rate": 1.6781325894032853e-06, "loss": 0.75145578, "num_input_tokens_seen": 101369860, "step": 4698, "time_per_iteration": 2.97829532623291 }, { "auxiliary_loss_clip": 0.01164712, "auxiliary_loss_mlp": 0.01022117, "balance_loss_clip": 1.05110228, "balance_loss_mlp": 1.01436257, "epoch": 0.5650213431130885, "flos": 18515147304960.0, "grad_norm": 2.597238798571865, "language_loss": 0.91804236, "learning_rate": 1.6773637959141608e-06, "loss": 0.93991065, "num_input_tokens_seen": 101386835, "step": 4699, "time_per_iteration": 2.8787503242492676 }, { "auxiliary_loss_clip": 0.01165105, "auxiliary_loss_mlp": 0.01024005, "balance_loss_clip": 1.05318546, "balance_loss_mlp": 1.01620293, "epoch": 0.5651415860037275, "flos": 17526310819200.0, "grad_norm": 2.041412374930473, "language_loss": 0.66742021, "learning_rate": 1.6765950513686915e-06, "loss": 0.68931133, "num_input_tokens_seen": 101404945, "step": 4700, "time_per_iteration": 2.8546462059020996 }, { "auxiliary_loss_clip": 0.01167869, "auxiliary_loss_mlp": 0.01028326, "balance_loss_clip": 1.05125546, "balance_loss_mlp": 1.02001667, "epoch": 0.5652618288943666, "flos": 25520026014720.0, "grad_norm": 4.2424691898743365, "language_loss": 0.76232052, "learning_rate": 1.675826355883496e-06, "loss": 0.78428257, "num_input_tokens_seen": 101424160, "step": 4701, "time_per_iteration": 4.650137662887573 }, { "auxiliary_loss_clip": 0.01165948, "auxiliary_loss_mlp": 0.01025221, "balance_loss_clip": 1.0518899, "balance_loss_mlp": 1.01760364, "epoch": 0.5653820717850057, "flos": 19683105937920.0, "grad_norm": 1.8532497683937867, "language_loss": 0.79520744, "learning_rate": 1.6750577095751848e-06, "loss": 0.81711912, "num_input_tokens_seen": 101443270, "step": 4702, "time_per_iteration": 2.729954481124878 }, { "auxiliary_loss_clip": 0.01174799, "auxiliary_loss_mlp": 0.01022195, "balance_loss_clip": 1.05113125, "balance_loss_mlp": 1.01497054, "epoch": 0.5655023146756448, "flos": 26979722910720.0, "grad_norm": 1.6675584172735918, "language_loss": 0.72697562, "learning_rate": 1.6742891125603605e-06, "loss": 0.74894553, "num_input_tokens_seen": 101464175, "step": 4703, "time_per_iteration": 2.707026481628418 }, { "auxiliary_loss_clip": 0.01173209, "auxiliary_loss_mlp": 0.01023493, "balance_loss_clip": 1.05288625, "balance_loss_mlp": 1.01533926, "epoch": 0.5656225575662839, "flos": 27669351104640.0, "grad_norm": 1.7579112895366276, "language_loss": 0.72123885, "learning_rate": 1.6735205649556185e-06, "loss": 0.7432059, "num_input_tokens_seen": 101484045, "step": 4704, "time_per_iteration": 2.754896402359009 }, { "auxiliary_loss_clip": 0.0117008, "auxiliary_loss_mlp": 0.01027677, "balance_loss_clip": 1.05049396, "balance_loss_mlp": 1.02000272, "epoch": 0.5657428004569229, "flos": 24349732997760.0, "grad_norm": 1.5464112895723965, "language_loss": 0.84753031, "learning_rate": 1.6727520668775476e-06, "loss": 0.86950791, "num_input_tokens_seen": 101504330, "step": 4705, "time_per_iteration": 2.7920355796813965 }, { "auxiliary_loss_clip": 0.01180267, "auxiliary_loss_mlp": 0.01023474, "balance_loss_clip": 1.05217767, "balance_loss_mlp": 1.01565099, "epoch": 0.5658630433475621, "flos": 21944041562880.0, "grad_norm": 1.523209640901517, "language_loss": 0.75301313, "learning_rate": 1.6719836184427275e-06, "loss": 0.77505058, "num_input_tokens_seen": 101524635, "step": 4706, "time_per_iteration": 2.7010481357574463 }, { "auxiliary_loss_clip": 0.01168094, "auxiliary_loss_mlp": 0.01025659, "balance_loss_clip": 1.04936743, "balance_loss_mlp": 1.01848245, "epoch": 0.5659832862382012, "flos": 30409012218240.0, "grad_norm": 2.094885547266622, "language_loss": 0.645769, "learning_rate": 1.671215219767733e-06, "loss": 0.66770655, "num_input_tokens_seen": 101544095, "step": 4707, "time_per_iteration": 2.9268336296081543 }, { "auxiliary_loss_clip": 0.01167495, "auxiliary_loss_mlp": 0.01027383, "balance_loss_clip": 1.05283523, "balance_loss_mlp": 1.01948285, "epoch": 0.5661035291288402, "flos": 13188194570880.0, "grad_norm": 2.155962109701133, "language_loss": 0.75887591, "learning_rate": 1.670446870969127e-06, "loss": 0.78082466, "num_input_tokens_seen": 101561760, "step": 4708, "time_per_iteration": 3.757235288619995 }, { "auxiliary_loss_clip": 0.01171647, "auxiliary_loss_mlp": 0.01028482, "balance_loss_clip": 1.04905128, "balance_loss_mlp": 1.02073908, "epoch": 0.5662237720194794, "flos": 16143032108160.0, "grad_norm": 1.9824588688747324, "language_loss": 0.79636526, "learning_rate": 1.6696785721634685e-06, "loss": 0.81836653, "num_input_tokens_seen": 101576245, "step": 4709, "time_per_iteration": 2.6997859477996826 }, { "auxiliary_loss_clip": 0.01179566, "auxiliary_loss_mlp": 0.0102991, "balance_loss_clip": 1.05524051, "balance_loss_mlp": 1.02210784, "epoch": 0.5663440149101184, "flos": 17676848718720.0, "grad_norm": 2.1494811199738306, "language_loss": 0.74011075, "learning_rate": 1.6689103234673086e-06, "loss": 0.76220554, "num_input_tokens_seen": 101594565, "step": 4710, "time_per_iteration": 2.8505587577819824 }, { "auxiliary_loss_clip": 0.01173111, "auxiliary_loss_mlp": 0.01029384, "balance_loss_clip": 1.05307269, "balance_loss_mlp": 1.02106953, "epoch": 0.5664642578007575, "flos": 23368330627200.0, "grad_norm": 1.9991581956828537, "language_loss": 0.77769065, "learning_rate": 1.668142124997189e-06, "loss": 0.79971558, "num_input_tokens_seen": 101614225, "step": 4711, "time_per_iteration": 2.782471179962158 }, { "auxiliary_loss_clip": 0.01068671, "auxiliary_loss_mlp": 0.0100535, "balance_loss_clip": 1.01922464, "balance_loss_mlp": 1.00437212, "epoch": 0.5665845006913967, "flos": 65516470945920.0, "grad_norm": 0.7269511865536814, "language_loss": 0.59736753, "learning_rate": 1.6673739768696453e-06, "loss": 0.61810768, "num_input_tokens_seen": 101680795, "step": 4712, "time_per_iteration": 3.4079718589782715 }, { "auxiliary_loss_clip": 0.01178031, "auxiliary_loss_mlp": 0.01026368, "balance_loss_clip": 1.05259538, "balance_loss_mlp": 1.01900649, "epoch": 0.5667047435820357, "flos": 26140885620480.0, "grad_norm": 1.7348799803902584, "language_loss": 0.77711064, "learning_rate": 1.6666058792012052e-06, "loss": 0.79915464, "num_input_tokens_seen": 101701680, "step": 4713, "time_per_iteration": 2.6976864337921143 }, { "auxiliary_loss_clip": 0.0107541, "auxiliary_loss_mlp": 0.01004449, "balance_loss_clip": 1.01587629, "balance_loss_mlp": 1.00341153, "epoch": 0.5668249864726748, "flos": 71866949725440.0, "grad_norm": 1.1347502590203769, "language_loss": 0.68727356, "learning_rate": 1.6658378321083878e-06, "loss": 0.70807213, "num_input_tokens_seen": 101766010, "step": 4714, "time_per_iteration": 3.3406059741973877 }, { "auxiliary_loss_clip": 0.01165402, "auxiliary_loss_mlp": 0.01029477, "balance_loss_clip": 1.05122471, "balance_loss_mlp": 1.021788, "epoch": 0.5669452293633139, "flos": 22195667312640.0, "grad_norm": 2.4619045714603076, "language_loss": 0.82588303, "learning_rate": 1.6650698357077055e-06, "loss": 0.84783179, "num_input_tokens_seen": 101783055, "step": 4715, "time_per_iteration": 2.823110580444336 }, { "auxiliary_loss_clip": 0.01175855, "auxiliary_loss_mlp": 0.01025809, "balance_loss_clip": 1.05161214, "balance_loss_mlp": 1.01845968, "epoch": 0.567065472253953, "flos": 18223193560320.0, "grad_norm": 3.828069251756495, "language_loss": 0.81231999, "learning_rate": 1.6643018901156632e-06, "loss": 0.83433664, "num_input_tokens_seen": 101802150, "step": 4716, "time_per_iteration": 2.6966171264648438 }, { "auxiliary_loss_clip": 0.01174656, "auxiliary_loss_mlp": 0.0102069, "balance_loss_clip": 1.05150795, "balance_loss_mlp": 1.01307285, "epoch": 0.567185715144592, "flos": 20371548983040.0, "grad_norm": 2.502182022834008, "language_loss": 0.79002476, "learning_rate": 1.6635339954487566e-06, "loss": 0.81197822, "num_input_tokens_seen": 101818025, "step": 4717, "time_per_iteration": 2.759996175765991 }, { "auxiliary_loss_clip": 0.01175963, "auxiliary_loss_mlp": 0.01027365, "balance_loss_clip": 1.05101824, "balance_loss_mlp": 1.01981318, "epoch": 0.5673059580352312, "flos": 23221348174080.0, "grad_norm": 1.9449580599767649, "language_loss": 0.82364917, "learning_rate": 1.6627661518234765e-06, "loss": 0.84568238, "num_input_tokens_seen": 101837280, "step": 4718, "time_per_iteration": 2.6991937160491943 }, { "auxiliary_loss_clip": 0.01169696, "auxiliary_loss_mlp": 0.01027493, "balance_loss_clip": 1.05419064, "balance_loss_mlp": 1.01920819, "epoch": 0.5674262009258703, "flos": 21719599430400.0, "grad_norm": 11.36213377362955, "language_loss": 0.8554858, "learning_rate": 1.661998359356302e-06, "loss": 0.87745774, "num_input_tokens_seen": 101856310, "step": 4719, "time_per_iteration": 2.8303122520446777 }, { "auxiliary_loss_clip": 0.01072594, "auxiliary_loss_mlp": 0.01000153, "balance_loss_clip": 1.01544309, "balance_loss_mlp": 0.99913943, "epoch": 0.5675464438165093, "flos": 67470369114240.0, "grad_norm": 0.7486669280389133, "language_loss": 0.55761957, "learning_rate": 1.6612306181637077e-06, "loss": 0.57834697, "num_input_tokens_seen": 101915635, "step": 4720, "time_per_iteration": 3.347973108291626 }, { "auxiliary_loss_clip": 0.01167255, "auxiliary_loss_mlp": 0.01028708, "balance_loss_clip": 1.05068481, "balance_loss_mlp": 1.02054787, "epoch": 0.5676666867071485, "flos": 18879173688960.0, "grad_norm": 1.9814608732695902, "language_loss": 0.65824175, "learning_rate": 1.6604629283621598e-06, "loss": 0.68020141, "num_input_tokens_seen": 101933565, "step": 4721, "time_per_iteration": 2.733820676803589 }, { "auxiliary_loss_clip": 0.01180527, "auxiliary_loss_mlp": 0.01023425, "balance_loss_clip": 1.05292988, "balance_loss_mlp": 1.0156467, "epoch": 0.5677869295977875, "flos": 33546778744320.0, "grad_norm": 2.0912532170345557, "language_loss": 0.74317384, "learning_rate": 1.6596952900681152e-06, "loss": 0.76521337, "num_input_tokens_seen": 101954325, "step": 4722, "time_per_iteration": 2.784097671508789 }, { "auxiliary_loss_clip": 0.01156805, "auxiliary_loss_mlp": 0.01029972, "balance_loss_clip": 1.05247974, "balance_loss_mlp": 1.02110291, "epoch": 0.5679071724884266, "flos": 28037256157440.0, "grad_norm": 2.301889939176439, "language_loss": 0.82029784, "learning_rate": 1.658927703398025e-06, "loss": 0.84216559, "num_input_tokens_seen": 101974390, "step": 4723, "time_per_iteration": 2.832472562789917 }, { "auxiliary_loss_clip": 0.01165311, "auxiliary_loss_mlp": 0.01024806, "balance_loss_clip": 1.04766083, "balance_loss_mlp": 1.0163095, "epoch": 0.5680274153790658, "flos": 23550110380800.0, "grad_norm": 2.503507841501097, "language_loss": 0.77779233, "learning_rate": 1.6581601684683309e-06, "loss": 0.79969352, "num_input_tokens_seen": 101994815, "step": 4724, "time_per_iteration": 2.8168439865112305 }, { "auxiliary_loss_clip": 0.01174866, "auxiliary_loss_mlp": 0.01030026, "balance_loss_clip": 1.05143309, "balance_loss_mlp": 1.02151477, "epoch": 0.5681476582697048, "flos": 22455158140800.0, "grad_norm": 3.133336084087448, "language_loss": 0.68473566, "learning_rate": 1.6573926853954674e-06, "loss": 0.70678455, "num_input_tokens_seen": 102012400, "step": 4725, "time_per_iteration": 2.7232553958892822 }, { "auxiliary_loss_clip": 0.01167, "auxiliary_loss_mlp": 0.01026935, "balance_loss_clip": 1.05052185, "balance_loss_mlp": 1.01934147, "epoch": 0.5682679011603439, "flos": 19536913584000.0, "grad_norm": 1.7380977963074224, "language_loss": 0.83192968, "learning_rate": 1.6566252542958608e-06, "loss": 0.85386908, "num_input_tokens_seen": 102031900, "step": 4726, "time_per_iteration": 2.7292072772979736 }, { "auxiliary_loss_clip": 0.01163299, "auxiliary_loss_mlp": 0.01027746, "balance_loss_clip": 1.05165279, "balance_loss_mlp": 1.02003336, "epoch": 0.568388144050983, "flos": 28765488493440.0, "grad_norm": 2.076413733152613, "language_loss": 0.78233784, "learning_rate": 1.6558578752859305e-06, "loss": 0.80424827, "num_input_tokens_seen": 102050860, "step": 4727, "time_per_iteration": 5.631314754486084 }, { "auxiliary_loss_clip": 0.01166338, "auxiliary_loss_mlp": 0.01020482, "balance_loss_clip": 1.05052817, "balance_loss_mlp": 1.01289415, "epoch": 0.5685083869416221, "flos": 21209452519680.0, "grad_norm": 1.7501780858822573, "language_loss": 0.78789377, "learning_rate": 1.6550905484820865e-06, "loss": 0.809762, "num_input_tokens_seen": 102069320, "step": 4728, "time_per_iteration": 2.7766025066375732 }, { "auxiliary_loss_clip": 0.01178572, "auxiliary_loss_mlp": 0.01026887, "balance_loss_clip": 1.05003738, "balance_loss_mlp": 1.01903081, "epoch": 0.5686286298322611, "flos": 24827021942400.0, "grad_norm": 2.617464917850123, "language_loss": 0.78780693, "learning_rate": 1.6543232740007328e-06, "loss": 0.80986154, "num_input_tokens_seen": 102086435, "step": 4729, "time_per_iteration": 2.682605504989624 }, { "auxiliary_loss_clip": 0.01174109, "auxiliary_loss_mlp": 0.01023678, "balance_loss_clip": 1.05109882, "balance_loss_mlp": 1.01618528, "epoch": 0.5687488727229003, "flos": 26615121909120.0, "grad_norm": 2.277899393234205, "language_loss": 0.67408383, "learning_rate": 1.653556051958263e-06, "loss": 0.69606173, "num_input_tokens_seen": 102106115, "step": 4730, "time_per_iteration": 2.7728259563446045 }, { "auxiliary_loss_clip": 0.01147251, "auxiliary_loss_mlp": 0.01030316, "balance_loss_clip": 1.05336618, "balance_loss_mlp": 1.0224843, "epoch": 0.5688691156135394, "flos": 20808725414400.0, "grad_norm": 1.7753243259015359, "language_loss": 0.73859906, "learning_rate": 1.6527888824710642e-06, "loss": 0.76037472, "num_input_tokens_seen": 102125715, "step": 4731, "time_per_iteration": 2.7675297260284424 }, { "auxiliary_loss_clip": 0.01163001, "auxiliary_loss_mlp": 0.01030478, "balance_loss_clip": 1.05084372, "balance_loss_mlp": 1.02269936, "epoch": 0.5689893585041784, "flos": 25880963829120.0, "grad_norm": 2.4267028954561756, "language_loss": 0.76803052, "learning_rate": 1.6520217656555166e-06, "loss": 0.78996527, "num_input_tokens_seen": 102145005, "step": 4732, "time_per_iteration": 2.816972017288208 }, { "auxiliary_loss_clip": 0.01161182, "auxiliary_loss_mlp": 0.01027693, "balance_loss_clip": 1.05110502, "balance_loss_mlp": 1.02038574, "epoch": 0.5691096013948175, "flos": 23477463123840.0, "grad_norm": 9.573107886303767, "language_loss": 0.70506489, "learning_rate": 1.65125470162799e-06, "loss": 0.72695363, "num_input_tokens_seen": 102165360, "step": 4733, "time_per_iteration": 2.7132298946380615 }, { "auxiliary_loss_clip": 0.01168681, "auxiliary_loss_mlp": 0.01029413, "balance_loss_clip": 1.04921412, "balance_loss_mlp": 1.02136016, "epoch": 0.5692298442854566, "flos": 18075600576000.0, "grad_norm": 3.136981913941833, "language_loss": 0.69949985, "learning_rate": 1.6504876905048485e-06, "loss": 0.72148079, "num_input_tokens_seen": 102182320, "step": 4734, "time_per_iteration": 3.560384511947632 }, { "auxiliary_loss_clip": 0.01172989, "auxiliary_loss_mlp": 0.01032114, "balance_loss_clip": 1.05002594, "balance_loss_mlp": 1.02445769, "epoch": 0.5693500871760957, "flos": 23039317025280.0, "grad_norm": 1.7177338147122936, "language_loss": 0.72222197, "learning_rate": 1.6497207324024464e-06, "loss": 0.74427301, "num_input_tokens_seen": 102201220, "step": 4735, "time_per_iteration": 2.600975513458252 }, { "auxiliary_loss_clip": 0.01175224, "auxiliary_loss_mlp": 0.01026323, "balance_loss_clip": 1.0498414, "balance_loss_mlp": 1.01815724, "epoch": 0.5694703300667348, "flos": 18989670902400.0, "grad_norm": 2.3936353325246427, "language_loss": 0.8276, "learning_rate": 1.6489538274371305e-06, "loss": 0.84961545, "num_input_tokens_seen": 102219825, "step": 4736, "time_per_iteration": 2.6025149822235107 }, { "auxiliary_loss_clip": 0.01170264, "auxiliary_loss_mlp": 0.01022334, "balance_loss_clip": 1.05217743, "balance_loss_mlp": 1.01529217, "epoch": 0.5695905729573739, "flos": 21908705558400.0, "grad_norm": 3.859579663278508, "language_loss": 0.83297491, "learning_rate": 1.6481869757252396e-06, "loss": 0.8549009, "num_input_tokens_seen": 102238160, "step": 4737, "time_per_iteration": 2.5972445011138916 }, { "auxiliary_loss_clip": 0.01169053, "auxiliary_loss_mlp": 0.010268, "balance_loss_clip": 1.04926097, "balance_loss_mlp": 1.01936698, "epoch": 0.569710815848013, "flos": 28476659232000.0, "grad_norm": 1.5182121238548447, "language_loss": 0.71774286, "learning_rate": 1.647420177383105e-06, "loss": 0.73970139, "num_input_tokens_seen": 102261030, "step": 4738, "time_per_iteration": 2.6171114444732666 }, { "auxiliary_loss_clip": 0.0116914, "auxiliary_loss_mlp": 0.01023719, "balance_loss_clip": 1.05130899, "balance_loss_mlp": 1.01675725, "epoch": 0.569831058738652, "flos": 28366162018560.0, "grad_norm": 1.7166719985714678, "language_loss": 0.7273159, "learning_rate": 1.646653432527049e-06, "loss": 0.74924445, "num_input_tokens_seen": 102281670, "step": 4739, "time_per_iteration": 2.6246516704559326 }, { "auxiliary_loss_clip": 0.01168066, "auxiliary_loss_mlp": 0.01029494, "balance_loss_clip": 1.05044365, "balance_loss_mlp": 1.02213287, "epoch": 0.5699513016292912, "flos": 25849973370240.0, "grad_norm": 1.5562182510513574, "language_loss": 0.74548185, "learning_rate": 1.645886741273387e-06, "loss": 0.76745749, "num_input_tokens_seen": 102303485, "step": 4740, "time_per_iteration": 2.606748580932617 }, { "auxiliary_loss_clip": 0.01162291, "auxiliary_loss_mlp": 0.01027862, "balance_loss_clip": 1.05195999, "balance_loss_mlp": 1.01998854, "epoch": 0.5700715445199303, "flos": 18037858360320.0, "grad_norm": 2.0401841698111842, "language_loss": 0.73898578, "learning_rate": 1.645120103738424e-06, "loss": 0.76088727, "num_input_tokens_seen": 102320995, "step": 4741, "time_per_iteration": 2.5429024696350098 }, { "auxiliary_loss_clip": 0.01162215, "auxiliary_loss_mlp": 0.01051746, "balance_loss_clip": 1.04930484, "balance_loss_mlp": 1.01723862, "epoch": 0.5701917874105693, "flos": 11473352392320.0, "grad_norm": 3.0807658989872926, "language_loss": 0.83976758, "learning_rate": 1.6443535200384591e-06, "loss": 0.86190724, "num_input_tokens_seen": 102339170, "step": 4742, "time_per_iteration": 2.602778911590576 }, { "auxiliary_loss_clip": 0.01175347, "auxiliary_loss_mlp": 0.01028644, "balance_loss_clip": 1.05091369, "balance_loss_mlp": 1.02099037, "epoch": 0.5703120303012085, "flos": 21761759018880.0, "grad_norm": 1.6701205398735037, "language_loss": 0.70479035, "learning_rate": 1.6435869902897827e-06, "loss": 0.72683024, "num_input_tokens_seen": 102357750, "step": 4743, "time_per_iteration": 2.7874739170074463 }, { "auxiliary_loss_clip": 0.01073351, "auxiliary_loss_mlp": 0.01001036, "balance_loss_clip": 1.0190767, "balance_loss_mlp": 0.9998855, "epoch": 0.5704322731918475, "flos": 56746258513920.0, "grad_norm": 0.7974960419076247, "language_loss": 0.61929256, "learning_rate": 1.6428205146086764e-06, "loss": 0.64003646, "num_input_tokens_seen": 102419730, "step": 4744, "time_per_iteration": 3.3807790279388428 }, { "auxiliary_loss_clip": 0.01175685, "auxiliary_loss_mlp": 0.01027746, "balance_loss_clip": 1.05073655, "balance_loss_mlp": 1.01999998, "epoch": 0.5705525160824866, "flos": 20741141975040.0, "grad_norm": 2.913621954432866, "language_loss": 0.70951408, "learning_rate": 1.6420540931114142e-06, "loss": 0.73154843, "num_input_tokens_seen": 102440320, "step": 4745, "time_per_iteration": 2.8728363513946533 }, { "auxiliary_loss_clip": 0.01172535, "auxiliary_loss_mlp": 0.01022959, "balance_loss_clip": 1.05093408, "balance_loss_mlp": 1.01527894, "epoch": 0.5706727589731257, "flos": 18771262254720.0, "grad_norm": 1.9624928215683304, "language_loss": 0.79049361, "learning_rate": 1.6412877259142616e-06, "loss": 0.8124485, "num_input_tokens_seen": 102460240, "step": 4746, "time_per_iteration": 2.844853401184082 }, { "auxiliary_loss_clip": 0.0116881, "auxiliary_loss_mlp": 0.01029739, "balance_loss_clip": 1.05295634, "balance_loss_mlp": 1.02189493, "epoch": 0.5707930018637648, "flos": 27634733372160.0, "grad_norm": 1.9695851885176743, "language_loss": 0.740228, "learning_rate": 1.6405214131334757e-06, "loss": 0.76221347, "num_input_tokens_seen": 102478765, "step": 4747, "time_per_iteration": 2.8329622745513916 }, { "auxiliary_loss_clip": 0.01161426, "auxiliary_loss_mlp": 0.01031773, "balance_loss_clip": 1.05233145, "balance_loss_mlp": 1.02423334, "epoch": 0.5709132447544039, "flos": 27597673514880.0, "grad_norm": 2.034887158584849, "language_loss": 0.79512608, "learning_rate": 1.6397551548853052e-06, "loss": 0.81705809, "num_input_tokens_seen": 102496930, "step": 4748, "time_per_iteration": 2.7928593158721924 }, { "auxiliary_loss_clip": 0.01170129, "auxiliary_loss_mlp": 0.01025864, "balance_loss_clip": 1.05261016, "balance_loss_mlp": 1.0173943, "epoch": 0.571033487645043, "flos": 21686095019520.0, "grad_norm": 1.7441263167116798, "language_loss": 0.71163738, "learning_rate": 1.6389889512859917e-06, "loss": 0.73359728, "num_input_tokens_seen": 102516590, "step": 4749, "time_per_iteration": 2.6288044452667236 }, { "auxiliary_loss_clip": 0.01070121, "auxiliary_loss_mlp": 0.01002846, "balance_loss_clip": 1.01690197, "balance_loss_mlp": 1.00182045, "epoch": 0.5711537305356821, "flos": 70181445980160.0, "grad_norm": 0.847724411352738, "language_loss": 0.60404003, "learning_rate": 1.638222802451767e-06, "loss": 0.62476969, "num_input_tokens_seen": 102578070, "step": 4750, "time_per_iteration": 3.380490779876709 }, { "auxiliary_loss_clip": 0.01165294, "auxiliary_loss_mlp": 0.01023453, "balance_loss_clip": 1.04940712, "balance_loss_mlp": 1.01646113, "epoch": 0.5712739734263211, "flos": 24717494396160.0, "grad_norm": 2.191037487645527, "language_loss": 0.75032014, "learning_rate": 1.6374567084988561e-06, "loss": 0.77220762, "num_input_tokens_seen": 102599255, "step": 4751, "time_per_iteration": 2.6693978309631348 }, { "auxiliary_loss_clip": 0.01175915, "auxiliary_loss_mlp": 0.01026476, "balance_loss_clip": 1.05513096, "balance_loss_mlp": 1.01875722, "epoch": 0.5713942163169603, "flos": 26578169792640.0, "grad_norm": 1.837693053323261, "language_loss": 0.76854336, "learning_rate": 1.6366906695434738e-06, "loss": 0.79056728, "num_input_tokens_seen": 102621775, "step": 4752, "time_per_iteration": 2.7654683589935303 }, { "auxiliary_loss_clip": 0.01172957, "auxiliary_loss_mlp": 0.01021245, "balance_loss_clip": 1.05091608, "balance_loss_mlp": 1.01401496, "epoch": 0.5715144592075994, "flos": 21142443697920.0, "grad_norm": 2.047758555047209, "language_loss": 0.86048692, "learning_rate": 1.6359246857018275e-06, "loss": 0.88242894, "num_input_tokens_seen": 102639305, "step": 4753, "time_per_iteration": 4.74817967414856 }, { "auxiliary_loss_clip": 0.01161421, "auxiliary_loss_mlp": 0.0102226, "balance_loss_clip": 1.04919112, "balance_loss_mlp": 1.015167, "epoch": 0.5716347020982384, "flos": 23330265189120.0, "grad_norm": 1.9980924721788809, "language_loss": 0.78303874, "learning_rate": 1.6351587570901178e-06, "loss": 0.80487561, "num_input_tokens_seen": 102659430, "step": 4754, "time_per_iteration": 2.717672824859619 }, { "auxiliary_loss_clip": 0.01166243, "auxiliary_loss_mlp": 0.01026048, "balance_loss_clip": 1.0512408, "balance_loss_mlp": 1.01832318, "epoch": 0.5717549449888776, "flos": 17009555806080.0, "grad_norm": 6.743805905020755, "language_loss": 0.75186253, "learning_rate": 1.634392883824534e-06, "loss": 0.77378541, "num_input_tokens_seen": 102671430, "step": 4755, "time_per_iteration": 2.8156774044036865 }, { "auxiliary_loss_clip": 0.01167363, "auxiliary_loss_mlp": 0.01026547, "balance_loss_clip": 1.05029392, "balance_loss_mlp": 1.01845884, "epoch": 0.5718751878795166, "flos": 35518130922240.0, "grad_norm": 1.615494812679231, "language_loss": 0.6806075, "learning_rate": 1.6336270660212595e-06, "loss": 0.7025466, "num_input_tokens_seen": 102693025, "step": 4756, "time_per_iteration": 2.9399333000183105 }, { "auxiliary_loss_clip": 0.01168549, "auxiliary_loss_mlp": 0.01027224, "balance_loss_clip": 1.05286181, "balance_loss_mlp": 1.01916003, "epoch": 0.5719954307701557, "flos": 38613989255040.0, "grad_norm": 2.0780978173473548, "language_loss": 0.66022378, "learning_rate": 1.6328613037964676e-06, "loss": 0.68218154, "num_input_tokens_seen": 102716090, "step": 4757, "time_per_iteration": 2.898294687271118 }, { "auxiliary_loss_clip": 0.01171064, "auxiliary_loss_mlp": 0.01025059, "balance_loss_clip": 1.0495621, "balance_loss_mlp": 1.01768613, "epoch": 0.5721156736607949, "flos": 20631111638400.0, "grad_norm": 2.29082047931319, "language_loss": 0.67563248, "learning_rate": 1.6320955972663241e-06, "loss": 0.69759375, "num_input_tokens_seen": 102735685, "step": 4758, "time_per_iteration": 2.7844080924987793 }, { "auxiliary_loss_clip": 0.01171792, "auxiliary_loss_mlp": 0.01027983, "balance_loss_clip": 1.04849339, "balance_loss_mlp": 1.01981711, "epoch": 0.5722359165514339, "flos": 37415076076800.0, "grad_norm": 1.8104868713034201, "language_loss": 0.65545297, "learning_rate": 1.6313299465469857e-06, "loss": 0.67745066, "num_input_tokens_seen": 102758415, "step": 4759, "time_per_iteration": 2.8457305431365967 }, { "auxiliary_loss_clip": 0.01169731, "auxiliary_loss_mlp": 0.01031726, "balance_loss_clip": 1.05080748, "balance_loss_mlp": 1.023453, "epoch": 0.572356159442073, "flos": 21972877205760.0, "grad_norm": 2.4461278764957677, "language_loss": 0.79603529, "learning_rate": 1.6305643517546014e-06, "loss": 0.81804991, "num_input_tokens_seen": 102773795, "step": 4760, "time_per_iteration": 3.6974103450775146 }, { "auxiliary_loss_clip": 0.01174311, "auxiliary_loss_mlp": 0.01031459, "balance_loss_clip": 1.0509665, "balance_loss_mlp": 1.02397323, "epoch": 0.5724764023327121, "flos": 19135540033920.0, "grad_norm": 1.7497325295886648, "language_loss": 0.84470451, "learning_rate": 1.629798813005311e-06, "loss": 0.86676222, "num_input_tokens_seen": 102793515, "step": 4761, "time_per_iteration": 2.722318410873413 }, { "auxiliary_loss_clip": 0.01162134, "auxiliary_loss_mlp": 0.0102796, "balance_loss_clip": 1.05123687, "balance_loss_mlp": 1.02067006, "epoch": 0.5725966452233512, "flos": 22819759142400.0, "grad_norm": 1.8647961476097623, "language_loss": 0.70821238, "learning_rate": 1.6290333304152473e-06, "loss": 0.73011327, "num_input_tokens_seen": 102813390, "step": 4762, "time_per_iteration": 2.998837471008301 }, { "auxiliary_loss_clip": 0.01165368, "auxiliary_loss_mlp": 0.01025211, "balance_loss_clip": 1.05231023, "balance_loss_mlp": 1.01678348, "epoch": 0.5727168881139902, "flos": 41496610498560.0, "grad_norm": 1.8894662014291763, "language_loss": 0.56859499, "learning_rate": 1.6282679041005314e-06, "loss": 0.59050077, "num_input_tokens_seen": 102838980, "step": 4763, "time_per_iteration": 2.965282440185547 }, { "auxiliary_loss_clip": 0.01164386, "auxiliary_loss_mlp": 0.0102469, "balance_loss_clip": 1.04958701, "balance_loss_mlp": 1.01698899, "epoch": 0.5728371310046293, "flos": 14647675985280.0, "grad_norm": 2.117722214496662, "language_loss": 0.86773849, "learning_rate": 1.6275025341772789e-06, "loss": 0.88962919, "num_input_tokens_seen": 102855285, "step": 4764, "time_per_iteration": 2.863985776901245 }, { "auxiliary_loss_clip": 0.01171813, "auxiliary_loss_mlp": 0.01024535, "balance_loss_clip": 1.05185699, "balance_loss_mlp": 1.0168047, "epoch": 0.5729573738952685, "flos": 21506613736320.0, "grad_norm": 2.070644756073604, "language_loss": 0.82012278, "learning_rate": 1.626737220761596e-06, "loss": 0.84208626, "num_input_tokens_seen": 102872750, "step": 4765, "time_per_iteration": 2.9515984058380127 }, { "auxiliary_loss_clip": 0.01172075, "auxiliary_loss_mlp": 0.01032148, "balance_loss_clip": 1.05193043, "balance_loss_mlp": 1.0240891, "epoch": 0.5730776167859075, "flos": 23621680229760.0, "grad_norm": 2.038954272901749, "language_loss": 0.78970516, "learning_rate": 1.62597196396958e-06, "loss": 0.81174731, "num_input_tokens_seen": 102890920, "step": 4766, "time_per_iteration": 2.7578134536743164 }, { "auxiliary_loss_clip": 0.0116872, "auxiliary_loss_mlp": 0.01024145, "balance_loss_clip": 1.04941297, "balance_loss_mlp": 1.01656961, "epoch": 0.5731978596765466, "flos": 25739224761600.0, "grad_norm": 2.7217649135292703, "language_loss": 0.85768694, "learning_rate": 1.6252067639173197e-06, "loss": 0.87961555, "num_input_tokens_seen": 102912830, "step": 4767, "time_per_iteration": 2.8507437705993652 }, { "auxiliary_loss_clip": 0.01171636, "auxiliary_loss_mlp": 0.0103224, "balance_loss_clip": 1.0490545, "balance_loss_mlp": 1.02446771, "epoch": 0.5733181025671857, "flos": 26359509749760.0, "grad_norm": 1.8347542448013041, "language_loss": 0.69717264, "learning_rate": 1.6244416207208956e-06, "loss": 0.71921146, "num_input_tokens_seen": 102933765, "step": 4768, "time_per_iteration": 2.8983330726623535 }, { "auxiliary_loss_clip": 0.01167542, "auxiliary_loss_mlp": 0.0102574, "balance_loss_clip": 1.05123734, "balance_loss_mlp": 1.01826262, "epoch": 0.5734383454578248, "flos": 29423874833280.0, "grad_norm": 1.6858031159704252, "language_loss": 0.73426008, "learning_rate": 1.6236765344963787e-06, "loss": 0.75619286, "num_input_tokens_seen": 102955025, "step": 4769, "time_per_iteration": 2.8401269912719727 }, { "auxiliary_loss_clip": 0.01167064, "auxiliary_loss_mlp": 0.01026523, "balance_loss_clip": 1.05030251, "balance_loss_mlp": 1.01888728, "epoch": 0.5735585883484638, "flos": 34969954487040.0, "grad_norm": 2.432565951982681, "language_loss": 0.69220805, "learning_rate": 1.6229115053598322e-06, "loss": 0.71414393, "num_input_tokens_seen": 102976780, "step": 4770, "time_per_iteration": 2.9356536865234375 }, { "auxiliary_loss_clip": 0.01173139, "auxiliary_loss_mlp": 0.0102976, "balance_loss_clip": 1.05259919, "balance_loss_mlp": 1.02173686, "epoch": 0.573678831239103, "flos": 18770759464320.0, "grad_norm": 2.9783567257746153, "language_loss": 0.72296226, "learning_rate": 1.6221465334273108e-06, "loss": 0.74499124, "num_input_tokens_seen": 102995990, "step": 4771, "time_per_iteration": 2.84110689163208 }, { "auxiliary_loss_clip": 0.01170008, "auxiliary_loss_mlp": 0.01026837, "balance_loss_clip": 1.05101633, "balance_loss_mlp": 1.01927924, "epoch": 0.5737990741297421, "flos": 25702883176320.0, "grad_norm": 2.030909685353118, "language_loss": 0.61260104, "learning_rate": 1.6213816188148593e-06, "loss": 0.63456947, "num_input_tokens_seen": 103014695, "step": 4772, "time_per_iteration": 2.86977219581604 }, { "auxiliary_loss_clip": 0.01159009, "auxiliary_loss_mlp": 0.01022333, "balance_loss_clip": 1.05118728, "balance_loss_mlp": 1.01444769, "epoch": 0.5739193170203811, "flos": 27269234530560.0, "grad_norm": 1.9756763593196123, "language_loss": 0.77028191, "learning_rate": 1.6206167616385162e-06, "loss": 0.79209536, "num_input_tokens_seen": 103035760, "step": 4773, "time_per_iteration": 2.8101539611816406 }, { "auxiliary_loss_clip": 0.01178103, "auxiliary_loss_mlp": 0.01029451, "balance_loss_clip": 1.05400372, "balance_loss_mlp": 1.02164865, "epoch": 0.5740395599110203, "flos": 12239721993600.0, "grad_norm": 2.686741026989084, "language_loss": 0.73985326, "learning_rate": 1.6198519620143078e-06, "loss": 0.76192886, "num_input_tokens_seen": 103052915, "step": 4774, "time_per_iteration": 2.8144586086273193 }, { "auxiliary_loss_clip": 0.01170495, "auxiliary_loss_mlp": 0.01026998, "balance_loss_clip": 1.05207646, "balance_loss_mlp": 1.0192852, "epoch": 0.5741598028016593, "flos": 25921399564800.0, "grad_norm": 2.124504814469815, "language_loss": 0.78170264, "learning_rate": 1.6190872200582546e-06, "loss": 0.80367756, "num_input_tokens_seen": 103074655, "step": 4775, "time_per_iteration": 2.935330390930176 }, { "auxiliary_loss_clip": 0.01164905, "auxiliary_loss_mlp": 0.01053898, "balance_loss_clip": 1.05026388, "balance_loss_mlp": 1.01656938, "epoch": 0.5742800456922984, "flos": 19244133826560.0, "grad_norm": 2.34734877524394, "language_loss": 0.78004932, "learning_rate": 1.6183225358863676e-06, "loss": 0.80223739, "num_input_tokens_seen": 103091550, "step": 4776, "time_per_iteration": 2.8197898864746094 }, { "auxiliary_loss_clip": 0.0116286, "auxiliary_loss_mlp": 0.01024026, "balance_loss_clip": 1.04975641, "balance_loss_mlp": 1.0161401, "epoch": 0.5744002885829376, "flos": 30920487932160.0, "grad_norm": 3.2856614430390647, "language_loss": 0.71824229, "learning_rate": 1.617557909614648e-06, "loss": 0.74011111, "num_input_tokens_seen": 103110985, "step": 4777, "time_per_iteration": 2.896899938583374 }, { "auxiliary_loss_clip": 0.01163766, "auxiliary_loss_mlp": 0.01025821, "balance_loss_clip": 1.04949486, "balance_loss_mlp": 1.01829267, "epoch": 0.5745205314735766, "flos": 23840017050240.0, "grad_norm": 1.9965416461217718, "language_loss": 0.8633818, "learning_rate": 1.6167933413590899e-06, "loss": 0.88527763, "num_input_tokens_seen": 103129890, "step": 4778, "time_per_iteration": 3.7488725185394287 }, { "auxiliary_loss_clip": 0.01172129, "auxiliary_loss_mlp": 0.01027642, "balance_loss_clip": 1.05124497, "balance_loss_mlp": 1.02025676, "epoch": 0.5746407743642157, "flos": 12311902373760.0, "grad_norm": 2.4627818074473984, "language_loss": 0.90553868, "learning_rate": 1.6160288312356773e-06, "loss": 0.92753637, "num_input_tokens_seen": 103147020, "step": 4779, "time_per_iteration": 4.781981945037842 }, { "auxiliary_loss_clip": 0.01176779, "auxiliary_loss_mlp": 0.01023494, "balance_loss_clip": 1.05110073, "balance_loss_mlp": 1.01611829, "epoch": 0.5747610172548548, "flos": 24133658734080.0, "grad_norm": 1.6335195355293428, "language_loss": 0.8191089, "learning_rate": 1.6152643793603857e-06, "loss": 0.84111166, "num_input_tokens_seen": 103167370, "step": 4780, "time_per_iteration": 2.794055461883545 }, { "auxiliary_loss_clip": 0.01173117, "auxiliary_loss_mlp": 0.01031784, "balance_loss_clip": 1.0492115, "balance_loss_mlp": 1.0233736, "epoch": 0.5748812601454939, "flos": 25408451393280.0, "grad_norm": 2.5925641320812574, "language_loss": 0.87705117, "learning_rate": 1.6144999858491815e-06, "loss": 0.89910018, "num_input_tokens_seen": 103186000, "step": 4781, "time_per_iteration": 2.8795342445373535 }, { "auxiliary_loss_clip": 0.01174211, "auxiliary_loss_mlp": 0.01027864, "balance_loss_clip": 1.05104804, "balance_loss_mlp": 1.02007985, "epoch": 0.575001503036133, "flos": 30624942827520.0, "grad_norm": 1.675119966965491, "language_loss": 0.85800999, "learning_rate": 1.6137356508180232e-06, "loss": 0.88003075, "num_input_tokens_seen": 103207710, "step": 4782, "time_per_iteration": 2.8572659492492676 }, { "auxiliary_loss_clip": 0.01174818, "auxiliary_loss_mlp": 0.01058072, "balance_loss_clip": 1.05018258, "balance_loss_mlp": 1.02138495, "epoch": 0.5751217459267721, "flos": 21726566668800.0, "grad_norm": 2.3429996425507156, "language_loss": 0.81451237, "learning_rate": 1.6129713743828593e-06, "loss": 0.83684129, "num_input_tokens_seen": 103226720, "step": 4783, "time_per_iteration": 2.801111936569214 }, { "auxiliary_loss_clip": 0.01171399, "auxiliary_loss_mlp": 0.01023072, "balance_loss_clip": 1.05058599, "balance_loss_mlp": 1.01537693, "epoch": 0.5752419888174112, "flos": 21651620941440.0, "grad_norm": 1.968394826782425, "language_loss": 0.75517201, "learning_rate": 1.6122071566596306e-06, "loss": 0.77711672, "num_input_tokens_seen": 103246995, "step": 4784, "time_per_iteration": 2.765474319458008 }, { "auxiliary_loss_clip": 0.01172866, "auxiliary_loss_mlp": 0.01029589, "balance_loss_clip": 1.04951572, "balance_loss_mlp": 1.021662, "epoch": 0.5753622317080502, "flos": 17775997234560.0, "grad_norm": 2.7411938446150677, "language_loss": 0.83247042, "learning_rate": 1.6114429977642674e-06, "loss": 0.85449499, "num_input_tokens_seen": 103261500, "step": 4785, "time_per_iteration": 2.8078114986419678 }, { "auxiliary_loss_clip": 0.01172931, "auxiliary_loss_mlp": 0.01026253, "balance_loss_clip": 1.05332565, "balance_loss_mlp": 1.01849818, "epoch": 0.5754824745986894, "flos": 19789616741760.0, "grad_norm": 2.1758558470198843, "language_loss": 0.7369256, "learning_rate": 1.6106788978126926e-06, "loss": 0.75891745, "num_input_tokens_seen": 103280475, "step": 4786, "time_per_iteration": 2.7900776863098145 }, { "auxiliary_loss_clip": 0.01158299, "auxiliary_loss_mlp": 0.01024763, "balance_loss_clip": 1.04820728, "balance_loss_mlp": 1.01677001, "epoch": 0.5756027174893285, "flos": 30985665160320.0, "grad_norm": 2.1461720305857073, "language_loss": 0.7880199, "learning_rate": 1.6099148569208196e-06, "loss": 0.80985045, "num_input_tokens_seen": 103297695, "step": 4787, "time_per_iteration": 3.814009428024292 }, { "auxiliary_loss_clip": 0.01166735, "auxiliary_loss_mlp": 0.01030438, "balance_loss_clip": 1.05069971, "balance_loss_mlp": 1.02199459, "epoch": 0.5757229603799675, "flos": 28546864364160.0, "grad_norm": 1.8956821469076066, "language_loss": 0.63128376, "learning_rate": 1.6091508752045523e-06, "loss": 0.65325546, "num_input_tokens_seen": 103318575, "step": 4788, "time_per_iteration": 2.8747150897979736 }, { "auxiliary_loss_clip": 0.01158123, "auxiliary_loss_mlp": 0.01026977, "balance_loss_clip": 1.0474565, "balance_loss_mlp": 1.01941276, "epoch": 0.5758432032706067, "flos": 22999024944000.0, "grad_norm": 1.581086340712874, "language_loss": 0.86349177, "learning_rate": 1.608386952779787e-06, "loss": 0.88534284, "num_input_tokens_seen": 103337945, "step": 4789, "time_per_iteration": 2.859375238418579 }, { "auxiliary_loss_clip": 0.01172949, "auxiliary_loss_mlp": 0.01028155, "balance_loss_clip": 1.05066895, "balance_loss_mlp": 1.02103841, "epoch": 0.5759634461612457, "flos": 25739727552000.0, "grad_norm": 1.580590625258962, "language_loss": 0.75048184, "learning_rate": 1.6076230897624098e-06, "loss": 0.77249289, "num_input_tokens_seen": 103360150, "step": 4790, "time_per_iteration": 2.929833173751831 }, { "auxiliary_loss_clip": 0.01172987, "auxiliary_loss_mlp": 0.01024595, "balance_loss_clip": 1.04861486, "balance_loss_mlp": 1.0168823, "epoch": 0.5760836890518848, "flos": 30591761639040.0, "grad_norm": 2.9533285910121903, "language_loss": 0.77832943, "learning_rate": 1.6068592862682974e-06, "loss": 0.80030519, "num_input_tokens_seen": 103378305, "step": 4791, "time_per_iteration": 2.7864162921905518 }, { "auxiliary_loss_clip": 0.01168825, "auxiliary_loss_mlp": 0.01022561, "balance_loss_clip": 1.05103707, "balance_loss_mlp": 1.01457381, "epoch": 0.576203931942524, "flos": 36538963447680.0, "grad_norm": 1.9009552770514373, "language_loss": 0.7377345, "learning_rate": 1.6060955424133187e-06, "loss": 0.75964838, "num_input_tokens_seen": 103399230, "step": 4792, "time_per_iteration": 2.9503493309020996 }, { "auxiliary_loss_clip": 0.01171887, "auxiliary_loss_mlp": 0.01021789, "balance_loss_clip": 1.05191302, "balance_loss_mlp": 1.01348639, "epoch": 0.576324174833163, "flos": 25516937445120.0, "grad_norm": 1.6303470882984836, "language_loss": 0.89130223, "learning_rate": 1.6053318583133332e-06, "loss": 0.91323906, "num_input_tokens_seen": 103420100, "step": 4793, "time_per_iteration": 2.7784013748168945 }, { "auxiliary_loss_clip": 0.01171455, "auxiliary_loss_mlp": 0.0102815, "balance_loss_clip": 1.05032063, "balance_loss_mlp": 1.02068162, "epoch": 0.5764444177238021, "flos": 25119262995840.0, "grad_norm": 1.9708289986331833, "language_loss": 0.74957407, "learning_rate": 1.6045682340841907e-06, "loss": 0.77157015, "num_input_tokens_seen": 103439025, "step": 4794, "time_per_iteration": 2.69380784034729 }, { "auxiliary_loss_clip": 0.01070906, "auxiliary_loss_mlp": 0.01053222, "balance_loss_clip": 1.01588297, "balance_loss_mlp": 1.00652075, "epoch": 0.5765646606144411, "flos": 62212687758720.0, "grad_norm": 0.758013798845302, "language_loss": 0.57962203, "learning_rate": 1.6038046698417336e-06, "loss": 0.60086322, "num_input_tokens_seen": 103499920, "step": 4795, "time_per_iteration": 3.3613154888153076 }, { "auxiliary_loss_clip": 0.01172975, "auxiliary_loss_mlp": 0.01028894, "balance_loss_clip": 1.0513308, "balance_loss_mlp": 1.02140713, "epoch": 0.5766849035050803, "flos": 25118760205440.0, "grad_norm": 2.972989071583824, "language_loss": 0.6870724, "learning_rate": 1.6030411657017919e-06, "loss": 0.70909113, "num_input_tokens_seen": 103519575, "step": 4796, "time_per_iteration": 2.828758478164673 }, { "auxiliary_loss_clip": 0.01164551, "auxiliary_loss_mlp": 0.01025874, "balance_loss_clip": 1.04973686, "balance_loss_mlp": 1.01894188, "epoch": 0.5768051463957193, "flos": 15991093578240.0, "grad_norm": 1.922590840571237, "language_loss": 0.84224319, "learning_rate": 1.6022777217801903e-06, "loss": 0.86414742, "num_input_tokens_seen": 103536530, "step": 4797, "time_per_iteration": 2.7795779705047607 }, { "auxiliary_loss_clip": 0.0116637, "auxiliary_loss_mlp": 0.01028276, "balance_loss_clip": 1.05170965, "balance_loss_mlp": 1.02075362, "epoch": 0.5769253892863584, "flos": 22163635359360.0, "grad_norm": 1.9693957439681689, "language_loss": 0.7366448, "learning_rate": 1.601514338192742e-06, "loss": 0.75859129, "num_input_tokens_seen": 103556460, "step": 4798, "time_per_iteration": 2.8170289993286133 }, { "auxiliary_loss_clip": 0.01169348, "auxiliary_loss_mlp": 0.0102627, "balance_loss_clip": 1.04798269, "balance_loss_mlp": 1.01895022, "epoch": 0.5770456321769976, "flos": 22856388036480.0, "grad_norm": 2.0389672167417583, "language_loss": 0.7141884, "learning_rate": 1.6007510150552514e-06, "loss": 0.73614454, "num_input_tokens_seen": 103574520, "step": 4799, "time_per_iteration": 2.784162998199463 }, { "auxiliary_loss_clip": 0.01178081, "auxiliary_loss_mlp": 0.01021646, "balance_loss_clip": 1.05123067, "balance_loss_mlp": 1.01434159, "epoch": 0.5771658750676366, "flos": 46353672489600.0, "grad_norm": 1.582115279233497, "language_loss": 0.6219219, "learning_rate": 1.599987752483515e-06, "loss": 0.64391923, "num_input_tokens_seen": 103598965, "step": 4800, "time_per_iteration": 2.9701621532440186 }, { "auxiliary_loss_clip": 0.01161074, "auxiliary_loss_mlp": 0.01028847, "balance_loss_clip": 1.0504396, "balance_loss_mlp": 1.02119339, "epoch": 0.5772861179582757, "flos": 22159972172160.0, "grad_norm": 2.9846548843836094, "language_loss": 0.68136472, "learning_rate": 1.5992245505933184e-06, "loss": 0.70326388, "num_input_tokens_seen": 103618665, "step": 4801, "time_per_iteration": 2.7478835582733154 }, { "auxiliary_loss_clip": 0.01175763, "auxiliary_loss_mlp": 0.01028105, "balance_loss_clip": 1.05094004, "balance_loss_mlp": 1.02088428, "epoch": 0.5774063608489148, "flos": 31248926916480.0, "grad_norm": 2.2545835159616012, "language_loss": 0.71576703, "learning_rate": 1.5984614095004388e-06, "loss": 0.73780566, "num_input_tokens_seen": 103639800, "step": 4802, "time_per_iteration": 2.903162956237793 }, { "auxiliary_loss_clip": 0.01166318, "auxiliary_loss_mlp": 0.0102495, "balance_loss_clip": 1.05058479, "balance_loss_mlp": 1.01708257, "epoch": 0.5775266037395539, "flos": 22527123039360.0, "grad_norm": 6.115558443384054, "language_loss": 0.80692792, "learning_rate": 1.5976983293206438e-06, "loss": 0.82884055, "num_input_tokens_seen": 103655605, "step": 4803, "time_per_iteration": 2.7457213401794434 }, { "auxiliary_loss_clip": 0.01163601, "auxiliary_loss_mlp": 0.01023092, "balance_loss_clip": 1.04800081, "balance_loss_mlp": 1.01524854, "epoch": 0.577646846630193, "flos": 21068790860160.0, "grad_norm": 1.6439682737131518, "language_loss": 0.71659726, "learning_rate": 1.5969353101696928e-06, "loss": 0.73846418, "num_input_tokens_seen": 103674045, "step": 4804, "time_per_iteration": 2.8170619010925293 }, { "auxiliary_loss_clip": 0.01169545, "auxiliary_loss_mlp": 0.01026937, "balance_loss_clip": 1.04884875, "balance_loss_mlp": 1.01979053, "epoch": 0.5777670895208321, "flos": 29714284293120.0, "grad_norm": 1.9185166655715395, "language_loss": 0.79981637, "learning_rate": 1.5961723521633341e-06, "loss": 0.82178116, "num_input_tokens_seen": 103695285, "step": 4805, "time_per_iteration": 4.734870195388794 }, { "auxiliary_loss_clip": 0.01165622, "auxiliary_loss_mlp": 0.01039195, "balance_loss_clip": 1.04993737, "balance_loss_mlp": 1.03110111, "epoch": 0.5778873324114712, "flos": 19500428344320.0, "grad_norm": 2.2968881277613695, "language_loss": 0.91145074, "learning_rate": 1.5954094554173097e-06, "loss": 0.93349892, "num_input_tokens_seen": 103713275, "step": 4806, "time_per_iteration": 2.6973953247070312 }, { "auxiliary_loss_clip": 0.01169418, "auxiliary_loss_mlp": 0.0103013, "balance_loss_clip": 1.0500443, "balance_loss_mlp": 1.02336836, "epoch": 0.5780075753021102, "flos": 14136846716160.0, "grad_norm": 1.9164839104887694, "language_loss": 0.79165745, "learning_rate": 1.5946466200473482e-06, "loss": 0.81365287, "num_input_tokens_seen": 103731185, "step": 4807, "time_per_iteration": 2.758470058441162 }, { "auxiliary_loss_clip": 0.01172712, "auxiliary_loss_mlp": 0.01026585, "balance_loss_clip": 1.05036223, "balance_loss_mlp": 1.01972795, "epoch": 0.5781278181927494, "flos": 15262178883840.0, "grad_norm": 2.0748130708648085, "language_loss": 0.83181643, "learning_rate": 1.5938838461691723e-06, "loss": 0.85380942, "num_input_tokens_seen": 103748095, "step": 4808, "time_per_iteration": 2.7843146324157715 }, { "auxiliary_loss_clip": 0.01176255, "auxiliary_loss_mlp": 0.01030458, "balance_loss_clip": 1.05216074, "balance_loss_mlp": 1.02298331, "epoch": 0.5782480610833884, "flos": 16726831856640.0, "grad_norm": 2.2700462922984834, "language_loss": 0.83078235, "learning_rate": 1.593121133898494e-06, "loss": 0.85284948, "num_input_tokens_seen": 103765300, "step": 4809, "time_per_iteration": 2.7826991081237793 }, { "auxiliary_loss_clip": 0.01176147, "auxiliary_loss_mlp": 0.01029847, "balance_loss_clip": 1.05017424, "balance_loss_mlp": 1.02230084, "epoch": 0.5783683039740275, "flos": 25482140144640.0, "grad_norm": 3.7099526874146926, "language_loss": 0.79218984, "learning_rate": 1.592358483351016e-06, "loss": 0.81424981, "num_input_tokens_seen": 103785475, "step": 4810, "time_per_iteration": 2.877974510192871 }, { "auxiliary_loss_clip": 0.01166967, "auxiliary_loss_mlp": 0.01025851, "balance_loss_clip": 1.04835153, "balance_loss_mlp": 1.01860332, "epoch": 0.5784885468646667, "flos": 18405835240320.0, "grad_norm": 1.9958138508205105, "language_loss": 0.72863531, "learning_rate": 1.5915958946424326e-06, "loss": 0.7505635, "num_input_tokens_seen": 103804160, "step": 4811, "time_per_iteration": 2.732680082321167 }, { "auxiliary_loss_clip": 0.01164685, "auxiliary_loss_mlp": 0.01063921, "balance_loss_clip": 1.04944503, "balance_loss_mlp": 1.02569175, "epoch": 0.5786087897553057, "flos": 46100717936640.0, "grad_norm": 1.7136300583373043, "language_loss": 0.74552345, "learning_rate": 1.5908333678884271e-06, "loss": 0.76780951, "num_input_tokens_seen": 103830580, "step": 4812, "time_per_iteration": 3.0005745887756348 }, { "auxiliary_loss_clip": 0.01173094, "auxiliary_loss_mlp": 0.01027355, "balance_loss_clip": 1.05282474, "balance_loss_mlp": 1.02004147, "epoch": 0.5787290326459448, "flos": 12385950261120.0, "grad_norm": 2.287928351238291, "language_loss": 0.74095231, "learning_rate": 1.5900709032046743e-06, "loss": 0.76295686, "num_input_tokens_seen": 103848655, "step": 4813, "time_per_iteration": 3.9395675659179688 }, { "auxiliary_loss_clip": 0.01165699, "auxiliary_loss_mlp": 0.0102139, "balance_loss_clip": 1.05195117, "balance_loss_mlp": 1.01422238, "epoch": 0.5788492755365839, "flos": 23290332243840.0, "grad_norm": 2.1424646056357104, "language_loss": 0.78003538, "learning_rate": 1.5893085007068391e-06, "loss": 0.80190623, "num_input_tokens_seen": 103866215, "step": 4814, "time_per_iteration": 2.7831273078918457 }, { "auxiliary_loss_clip": 0.01160069, "auxiliary_loss_mlp": 0.01024704, "balance_loss_clip": 1.05073738, "balance_loss_mlp": 1.01742899, "epoch": 0.578969518427223, "flos": 24061047390720.0, "grad_norm": 1.8892450661103475, "language_loss": 0.70764703, "learning_rate": 1.5885461605105786e-06, "loss": 0.72949481, "num_input_tokens_seen": 103887815, "step": 4815, "time_per_iteration": 2.812750816345215 }, { "auxiliary_loss_clip": 0.01170914, "auxiliary_loss_mlp": 0.01025854, "balance_loss_clip": 1.05172062, "balance_loss_mlp": 1.01806331, "epoch": 0.579089761317862, "flos": 21871825269120.0, "grad_norm": 2.002702333718975, "language_loss": 0.77268428, "learning_rate": 1.5877838827315375e-06, "loss": 0.79465193, "num_input_tokens_seen": 103906360, "step": 4816, "time_per_iteration": 2.855144500732422 }, { "auxiliary_loss_clip": 0.01172328, "auxiliary_loss_mlp": 0.01024761, "balance_loss_clip": 1.05008268, "balance_loss_mlp": 1.01693177, "epoch": 0.5792100042085012, "flos": 22929681738240.0, "grad_norm": 1.9041078796915547, "language_loss": 0.70129663, "learning_rate": 1.587021667485355e-06, "loss": 0.72326756, "num_input_tokens_seen": 103925730, "step": 4817, "time_per_iteration": 2.837606906890869 }, { "auxiliary_loss_clip": 0.01170959, "auxiliary_loss_mlp": 0.01022909, "balance_loss_clip": 1.04887426, "balance_loss_mlp": 1.01551497, "epoch": 0.5793302470991403, "flos": 21470056669440.0, "grad_norm": 2.1651706688691745, "language_loss": 0.78444707, "learning_rate": 1.5862595148876559e-06, "loss": 0.80638576, "num_input_tokens_seen": 103945835, "step": 4818, "time_per_iteration": 2.795072555541992 }, { "auxiliary_loss_clip": 0.01164881, "auxiliary_loss_mlp": 0.01020635, "balance_loss_clip": 1.05185103, "balance_loss_mlp": 1.01296043, "epoch": 0.5794504899897793, "flos": 12711013367040.0, "grad_norm": 2.133457489814734, "language_loss": 0.76289237, "learning_rate": 1.58549742505406e-06, "loss": 0.78474754, "num_input_tokens_seen": 103960580, "step": 4819, "time_per_iteration": 2.776186943054199 }, { "auxiliary_loss_clip": 0.01172625, "auxiliary_loss_mlp": 0.01027265, "balance_loss_clip": 1.04875481, "balance_loss_mlp": 1.01933801, "epoch": 0.5795707328804185, "flos": 14867054300160.0, "grad_norm": 2.4088694934417973, "language_loss": 0.75586045, "learning_rate": 1.5847353981001747e-06, "loss": 0.77785933, "num_input_tokens_seen": 103977760, "step": 4820, "time_per_iteration": 2.667250156402588 }, { "auxiliary_loss_clip": 0.01163757, "auxiliary_loss_mlp": 0.01031377, "balance_loss_clip": 1.04961586, "balance_loss_mlp": 1.02350926, "epoch": 0.5796909757710575, "flos": 36430046432640.0, "grad_norm": 1.8675013554104667, "language_loss": 0.6996274, "learning_rate": 1.5839734341415993e-06, "loss": 0.72157872, "num_input_tokens_seen": 103999960, "step": 4821, "time_per_iteration": 2.879760265350342 }, { "auxiliary_loss_clip": 0.01166358, "auxiliary_loss_mlp": 0.01025638, "balance_loss_clip": 1.05251741, "balance_loss_mlp": 1.01846194, "epoch": 0.5798112186616966, "flos": 23039891642880.0, "grad_norm": 1.6036883543620557, "language_loss": 0.76805258, "learning_rate": 1.5832115332939238e-06, "loss": 0.78997254, "num_input_tokens_seen": 104018400, "step": 4822, "time_per_iteration": 2.7787904739379883 }, { "auxiliary_loss_clip": 0.01170978, "auxiliary_loss_mlp": 0.01026926, "balance_loss_clip": 1.04976475, "balance_loss_mlp": 1.01942205, "epoch": 0.5799314615523358, "flos": 16652604401280.0, "grad_norm": 2.00584668474154, "language_loss": 0.74875641, "learning_rate": 1.5824496956727272e-06, "loss": 0.7707355, "num_input_tokens_seen": 104035605, "step": 4823, "time_per_iteration": 2.664560317993164 }, { "auxiliary_loss_clip": 0.01167361, "auxiliary_loss_mlp": 0.01026718, "balance_loss_clip": 1.04989362, "balance_loss_mlp": 1.01944065, "epoch": 0.5800517044429748, "flos": 20485673470080.0, "grad_norm": 3.5811950587751102, "language_loss": 0.7341404, "learning_rate": 1.5816879213935797e-06, "loss": 0.75608116, "num_input_tokens_seen": 104054415, "step": 4824, "time_per_iteration": 2.7899088859558105 }, { "auxiliary_loss_clip": 0.01166487, "auxiliary_loss_mlp": 0.01023825, "balance_loss_clip": 1.04994464, "balance_loss_mlp": 1.01584399, "epoch": 0.5801719473336139, "flos": 31538258968320.0, "grad_norm": 1.4755184192458162, "language_loss": 0.79432845, "learning_rate": 1.5809262105720416e-06, "loss": 0.81623149, "num_input_tokens_seen": 104075455, "step": 4825, "time_per_iteration": 2.7933101654052734 }, { "auxiliary_loss_clip": 0.01171045, "auxiliary_loss_mlp": 0.01031302, "balance_loss_clip": 1.04926276, "balance_loss_mlp": 1.02444482, "epoch": 0.580292190224253, "flos": 20375966355840.0, "grad_norm": 1.4855492181512255, "language_loss": 0.79635119, "learning_rate": 1.5801645633236644e-06, "loss": 0.81837469, "num_input_tokens_seen": 104096440, "step": 4826, "time_per_iteration": 2.8086931705474854 }, { "auxiliary_loss_clip": 0.01163359, "auxiliary_loss_mlp": 0.01025373, "balance_loss_clip": 1.04940295, "balance_loss_mlp": 1.01828623, "epoch": 0.5804124331148921, "flos": 26615373304320.0, "grad_norm": 1.8445480959604013, "language_loss": 0.77310878, "learning_rate": 1.579402979763989e-06, "loss": 0.79499614, "num_input_tokens_seen": 104116775, "step": 4827, "time_per_iteration": 2.7220003604888916 }, { "auxiliary_loss_clip": 0.0116764, "auxiliary_loss_mlp": 0.01022946, "balance_loss_clip": 1.04933381, "balance_loss_mlp": 1.01594234, "epoch": 0.5805326760055312, "flos": 13478496289920.0, "grad_norm": 2.4043851508156995, "language_loss": 0.81296098, "learning_rate": 1.578641460008548e-06, "loss": 0.83486676, "num_input_tokens_seen": 104134510, "step": 4828, "time_per_iteration": 2.816812515258789 }, { "auxiliary_loss_clip": 0.01166872, "auxiliary_loss_mlp": 0.01027458, "balance_loss_clip": 1.0487119, "balance_loss_mlp": 1.02044559, "epoch": 0.5806529188961702, "flos": 12091374823680.0, "grad_norm": 2.1124388014629822, "language_loss": 0.68469083, "learning_rate": 1.5778800041728613e-06, "loss": 0.7066341, "num_input_tokens_seen": 104150800, "step": 4829, "time_per_iteration": 2.6786553859710693 }, { "auxiliary_loss_clip": 0.0116444, "auxiliary_loss_mlp": 0.01024474, "balance_loss_clip": 1.04819679, "balance_loss_mlp": 1.01713109, "epoch": 0.5807731617868094, "flos": 26214107495040.0, "grad_norm": 1.767124452166208, "language_loss": 0.66340482, "learning_rate": 1.577118612372443e-06, "loss": 0.68529397, "num_input_tokens_seen": 104172640, "step": 4830, "time_per_iteration": 3.7465672492980957 }, { "auxiliary_loss_clip": 0.01163722, "auxiliary_loss_mlp": 0.01051953, "balance_loss_clip": 1.04955363, "balance_loss_mlp": 1.01333952, "epoch": 0.5808934046774484, "flos": 37962139190400.0, "grad_norm": 1.6994893101802602, "language_loss": 0.70499593, "learning_rate": 1.5763572847227943e-06, "loss": 0.72715271, "num_input_tokens_seen": 104193525, "step": 4831, "time_per_iteration": 3.8028299808502197 }, { "auxiliary_loss_clip": 0.01166767, "auxiliary_loss_mlp": 0.01025414, "balance_loss_clip": 1.04622459, "balance_loss_mlp": 1.01864624, "epoch": 0.5810136475680875, "flos": 20485853038080.0, "grad_norm": 1.832223271079879, "language_loss": 0.81425798, "learning_rate": 1.5755960213394091e-06, "loss": 0.83617985, "num_input_tokens_seen": 104210625, "step": 4832, "time_per_iteration": 2.6767642498016357 }, { "auxiliary_loss_clip": 0.01167324, "auxiliary_loss_mlp": 0.01021733, "balance_loss_clip": 1.04973435, "balance_loss_mlp": 1.0150156, "epoch": 0.5811338904587267, "flos": 17530153574400.0, "grad_norm": 2.4355226477626966, "language_loss": 0.78336221, "learning_rate": 1.5748348223377703e-06, "loss": 0.80525279, "num_input_tokens_seen": 104228180, "step": 4833, "time_per_iteration": 2.689453601837158 }, { "auxiliary_loss_clip": 0.01165092, "auxiliary_loss_mlp": 0.01025121, "balance_loss_clip": 1.04941058, "balance_loss_mlp": 1.01800156, "epoch": 0.5812541333493657, "flos": 19458017360640.0, "grad_norm": 1.6259620702795892, "language_loss": 0.77933842, "learning_rate": 1.5740736878333507e-06, "loss": 0.80124056, "num_input_tokens_seen": 104246020, "step": 4834, "time_per_iteration": 2.747464418411255 }, { "auxiliary_loss_clip": 0.01171426, "auxiliary_loss_mlp": 0.01028663, "balance_loss_clip": 1.04986429, "balance_loss_mlp": 1.02117348, "epoch": 0.5813743762400048, "flos": 20594949621120.0, "grad_norm": 3.28102679290538, "language_loss": 0.78047287, "learning_rate": 1.5733126179416143e-06, "loss": 0.80247378, "num_input_tokens_seen": 104260505, "step": 4835, "time_per_iteration": 2.6634397506713867 }, { "auxiliary_loss_clip": 0.01168452, "auxiliary_loss_mlp": 0.01027164, "balance_loss_clip": 1.04913592, "balance_loss_mlp": 1.02028537, "epoch": 0.5814946191306439, "flos": 33178227246720.0, "grad_norm": 2.0009097006035748, "language_loss": 0.72687423, "learning_rate": 1.5725516127780137e-06, "loss": 0.74883044, "num_input_tokens_seen": 104282640, "step": 4836, "time_per_iteration": 2.822540760040283 }, { "auxiliary_loss_clip": 0.01172512, "auxiliary_loss_mlp": 0.01022179, "balance_loss_clip": 1.04777122, "balance_loss_mlp": 1.01428771, "epoch": 0.581614862021283, "flos": 16143283503360.0, "grad_norm": 3.1411903526075204, "language_loss": 0.88568664, "learning_rate": 1.5717906724579943e-06, "loss": 0.90763354, "num_input_tokens_seen": 104299700, "step": 4837, "time_per_iteration": 2.669839382171631 }, { "auxiliary_loss_clip": 0.01171484, "auxiliary_loss_mlp": 0.01032019, "balance_loss_clip": 1.0506556, "balance_loss_mlp": 1.02450275, "epoch": 0.581735104911922, "flos": 33802642298880.0, "grad_norm": 7.2257576431205495, "language_loss": 0.68390441, "learning_rate": 1.571029797096989e-06, "loss": 0.70593941, "num_input_tokens_seen": 104320805, "step": 4838, "time_per_iteration": 2.863729953765869 }, { "auxiliary_loss_clip": 0.01170775, "auxiliary_loss_mlp": 0.01019331, "balance_loss_clip": 1.04825759, "balance_loss_mlp": 1.01180899, "epoch": 0.5818553478025612, "flos": 23331163029120.0, "grad_norm": 1.8918659288255337, "language_loss": 0.79001844, "learning_rate": 1.570268986810423e-06, "loss": 0.81191945, "num_input_tokens_seen": 104340700, "step": 4839, "time_per_iteration": 3.535923957824707 }, { "auxiliary_loss_clip": 0.01166408, "auxiliary_loss_mlp": 0.01020013, "balance_loss_clip": 1.05047464, "balance_loss_mlp": 1.01282489, "epoch": 0.5819755906932003, "flos": 20996143603200.0, "grad_norm": 2.0701777052905097, "language_loss": 0.74964106, "learning_rate": 1.5695082417137096e-06, "loss": 0.7715053, "num_input_tokens_seen": 104358575, "step": 4840, "time_per_iteration": 2.7590906620025635 }, { "auxiliary_loss_clip": 0.01162595, "auxiliary_loss_mlp": 0.01021489, "balance_loss_clip": 1.04603219, "balance_loss_mlp": 1.01499462, "epoch": 0.5820958335838393, "flos": 21431668008960.0, "grad_norm": 1.715449031881389, "language_loss": 0.75131893, "learning_rate": 1.5687475619222539e-06, "loss": 0.77315974, "num_input_tokens_seen": 104378530, "step": 4841, "time_per_iteration": 2.8204329013824463 }, { "auxiliary_loss_clip": 0.01166618, "auxiliary_loss_mlp": 0.01027425, "balance_loss_clip": 1.05190539, "balance_loss_mlp": 1.01990891, "epoch": 0.5822160764744785, "flos": 17967473660160.0, "grad_norm": 3.9055653308509855, "language_loss": 0.73919082, "learning_rate": 1.5679869475514496e-06, "loss": 0.76113123, "num_input_tokens_seen": 104395465, "step": 4842, "time_per_iteration": 2.680330753326416 }, { "auxiliary_loss_clip": 0.0117072, "auxiliary_loss_mlp": 0.010257, "balance_loss_clip": 1.04924619, "balance_loss_mlp": 1.01747501, "epoch": 0.5823363193651175, "flos": 23033858158080.0, "grad_norm": 2.4454306990544006, "language_loss": 0.81094754, "learning_rate": 1.567226398716682e-06, "loss": 0.83291173, "num_input_tokens_seen": 104415380, "step": 4843, "time_per_iteration": 2.7887494564056396 }, { "auxiliary_loss_clip": 0.01172871, "auxiliary_loss_mlp": 0.0102388, "balance_loss_clip": 1.04795241, "balance_loss_mlp": 1.0161643, "epoch": 0.5824565622557566, "flos": 32891840110080.0, "grad_norm": 1.8327589943049825, "language_loss": 0.62133664, "learning_rate": 1.566465915533326e-06, "loss": 0.64330411, "num_input_tokens_seen": 104437410, "step": 4844, "time_per_iteration": 2.793621301651001 }, { "auxiliary_loss_clip": 0.01164281, "auxiliary_loss_mlp": 0.01030886, "balance_loss_clip": 1.04750919, "balance_loss_mlp": 1.02314973, "epoch": 0.5825768051463958, "flos": 22229674513920.0, "grad_norm": 1.8310372639387322, "language_loss": 0.88058251, "learning_rate": 1.5657054981167458e-06, "loss": 0.90253425, "num_input_tokens_seen": 104456305, "step": 4845, "time_per_iteration": 2.798682928085327 }, { "auxiliary_loss_clip": 0.01165975, "auxiliary_loss_mlp": 0.01026857, "balance_loss_clip": 1.04915404, "balance_loss_mlp": 1.02060151, "epoch": 0.5826970480370348, "flos": 28001561016960.0, "grad_norm": 1.8368229478679707, "language_loss": 0.67885602, "learning_rate": 1.5649451465822965e-06, "loss": 0.70078433, "num_input_tokens_seen": 104477695, "step": 4846, "time_per_iteration": 2.8403961658477783 }, { "auxiliary_loss_clip": 0.01158822, "auxiliary_loss_mlp": 0.01027094, "balance_loss_clip": 1.04939747, "balance_loss_mlp": 1.01988459, "epoch": 0.5828172909276739, "flos": 17858053854720.0, "grad_norm": 1.59279238877155, "language_loss": 0.83655119, "learning_rate": 1.5641848610453218e-06, "loss": 0.85841036, "num_input_tokens_seen": 104496355, "step": 4847, "time_per_iteration": 2.8745124340057373 }, { "auxiliary_loss_clip": 0.01167609, "auxiliary_loss_mlp": 0.01021949, "balance_loss_clip": 1.050578, "balance_loss_mlp": 1.01412868, "epoch": 0.582937533818313, "flos": 19865244827520.0, "grad_norm": 2.049013976928205, "language_loss": 0.86034662, "learning_rate": 1.563424641621158e-06, "loss": 0.8822422, "num_input_tokens_seen": 104515535, "step": 4848, "time_per_iteration": 2.76169753074646 }, { "auxiliary_loss_clip": 0.01170754, "auxiliary_loss_mlp": 0.010278, "balance_loss_clip": 1.05048871, "balance_loss_mlp": 1.01991987, "epoch": 0.5830577767089521, "flos": 26870734068480.0, "grad_norm": 1.9510562874812596, "language_loss": 0.69699478, "learning_rate": 1.5626644884251282e-06, "loss": 0.71898031, "num_input_tokens_seen": 104535055, "step": 4849, "time_per_iteration": 2.771721601486206 }, { "auxiliary_loss_clip": 0.01168371, "auxiliary_loss_mlp": 0.01024718, "balance_loss_clip": 1.04729557, "balance_loss_mlp": 1.0177269, "epoch": 0.5831780195995911, "flos": 25298205575040.0, "grad_norm": 1.7732633932562447, "language_loss": 0.88239098, "learning_rate": 1.5619044015725488e-06, "loss": 0.90432191, "num_input_tokens_seen": 104554745, "step": 4850, "time_per_iteration": 2.7208094596862793 }, { "auxiliary_loss_clip": 0.01181748, "auxiliary_loss_mlp": 0.01027479, "balance_loss_clip": 1.05458093, "balance_loss_mlp": 1.01897335, "epoch": 0.5832982624902303, "flos": 14756988049920.0, "grad_norm": 2.7357915579255625, "language_loss": 0.86858237, "learning_rate": 1.5611443811787224e-06, "loss": 0.89067459, "num_input_tokens_seen": 104568870, "step": 4851, "time_per_iteration": 2.74159836769104 }, { "auxiliary_loss_clip": 0.01167158, "auxiliary_loss_mlp": 0.0102796, "balance_loss_clip": 1.05059326, "balance_loss_mlp": 1.0212816, "epoch": 0.5834185053808694, "flos": 20444555376000.0, "grad_norm": 2.194069412658598, "language_loss": 0.69481003, "learning_rate": 1.560384427358945e-06, "loss": 0.71676117, "num_input_tokens_seen": 104588415, "step": 4852, "time_per_iteration": 2.733975648880005 }, { "auxiliary_loss_clip": 0.0116178, "auxiliary_loss_mlp": 0.01026003, "balance_loss_clip": 1.047014, "balance_loss_mlp": 1.01845109, "epoch": 0.5835387482715084, "flos": 27200394115200.0, "grad_norm": 1.513477071832477, "language_loss": 0.72984135, "learning_rate": 1.5596245402284998e-06, "loss": 0.75171924, "num_input_tokens_seen": 104611940, "step": 4853, "time_per_iteration": 2.7437257766723633 }, { "auxiliary_loss_clip": 0.01172181, "auxiliary_loss_mlp": 0.01024614, "balance_loss_clip": 1.05116677, "balance_loss_mlp": 1.01705575, "epoch": 0.5836589911621476, "flos": 16654615562880.0, "grad_norm": 1.7134748302925233, "language_loss": 0.81782055, "learning_rate": 1.5588647199026619e-06, "loss": 0.83978856, "num_input_tokens_seen": 104629675, "step": 4854, "time_per_iteration": 2.84016752243042 }, { "auxiliary_loss_clip": 0.01177559, "auxiliary_loss_mlp": 0.0102778, "balance_loss_clip": 1.05209446, "balance_loss_mlp": 1.02034092, "epoch": 0.5837792340527866, "flos": 20446817932800.0, "grad_norm": 3.2970669231828813, "language_loss": 0.87518585, "learning_rate": 1.5581049664966956e-06, "loss": 0.89723927, "num_input_tokens_seen": 104647435, "step": 4855, "time_per_iteration": 2.717531681060791 }, { "auxiliary_loss_clip": 0.01075186, "auxiliary_loss_mlp": 0.01000643, "balance_loss_clip": 1.0187757, "balance_loss_mlp": 0.99962342, "epoch": 0.5838994769434257, "flos": 65995480765440.0, "grad_norm": 0.9912012328663949, "language_loss": 0.65102178, "learning_rate": 1.5573452801258545e-06, "loss": 0.67178005, "num_input_tokens_seen": 104694605, "step": 4856, "time_per_iteration": 3.986783742904663 }, { "auxiliary_loss_clip": 0.01175501, "auxiliary_loss_mlp": 0.01030094, "balance_loss_clip": 1.0504235, "balance_loss_mlp": 1.02202392, "epoch": 0.5840197198340649, "flos": 21470523546240.0, "grad_norm": 3.247789572658295, "language_loss": 0.63693118, "learning_rate": 1.5565856609053824e-06, "loss": 0.65898716, "num_input_tokens_seen": 104713400, "step": 4857, "time_per_iteration": 3.7684834003448486 }, { "auxiliary_loss_clip": 0.01172094, "auxiliary_loss_mlp": 0.01028735, "balance_loss_clip": 1.04986858, "balance_loss_mlp": 1.02148402, "epoch": 0.5841399627247039, "flos": 19135144984320.0, "grad_norm": 2.140655816498076, "language_loss": 0.80407304, "learning_rate": 1.5558261089505127e-06, "loss": 0.82608128, "num_input_tokens_seen": 104732130, "step": 4858, "time_per_iteration": 2.6553878784179688 }, { "auxiliary_loss_clip": 0.01169858, "auxiliary_loss_mlp": 0.01026147, "balance_loss_clip": 1.04951429, "balance_loss_mlp": 1.01945019, "epoch": 0.584260205615343, "flos": 26425692558720.0, "grad_norm": 1.772702781223426, "language_loss": 0.79804122, "learning_rate": 1.5550666243764697e-06, "loss": 0.82000136, "num_input_tokens_seen": 104750290, "step": 4859, "time_per_iteration": 2.7441320419311523 }, { "auxiliary_loss_clip": 0.01167638, "auxiliary_loss_mlp": 0.01026583, "balance_loss_clip": 1.04773188, "balance_loss_mlp": 1.01902509, "epoch": 0.584380448505982, "flos": 13881809174400.0, "grad_norm": 1.9865163665173622, "language_loss": 0.77074307, "learning_rate": 1.554307207298465e-06, "loss": 0.79268527, "num_input_tokens_seen": 104768550, "step": 4860, "time_per_iteration": 2.720506191253662 }, { "auxiliary_loss_clip": 0.01176454, "auxiliary_loss_mlp": 0.01032881, "balance_loss_clip": 1.05147219, "balance_loss_mlp": 1.02500439, "epoch": 0.5845006913966212, "flos": 21543709507200.0, "grad_norm": 2.7132172763918483, "language_loss": 0.78927153, "learning_rate": 1.553547857831704e-06, "loss": 0.81136489, "num_input_tokens_seen": 104785060, "step": 4861, "time_per_iteration": 2.788801908493042 }, { "auxiliary_loss_clip": 0.01073127, "auxiliary_loss_mlp": 0.01002931, "balance_loss_clip": 1.0158006, "balance_loss_mlp": 1.00184047, "epoch": 0.5846209342872603, "flos": 58375452712320.0, "grad_norm": 0.8843527912938534, "language_loss": 0.6410836, "learning_rate": 1.5527885760913771e-06, "loss": 0.66184413, "num_input_tokens_seen": 104834950, "step": 4862, "time_per_iteration": 3.1162357330322266 }, { "auxiliary_loss_clip": 0.0116472, "auxiliary_loss_mlp": 0.01025591, "balance_loss_clip": 1.0499711, "balance_loss_mlp": 1.01864088, "epoch": 0.5847411771778993, "flos": 18588045957120.0, "grad_norm": 2.3990981272824525, "language_loss": 0.76485229, "learning_rate": 1.552029362192668e-06, "loss": 0.78675544, "num_input_tokens_seen": 104854210, "step": 4863, "time_per_iteration": 2.6741833686828613 }, { "auxiliary_loss_clip": 0.01160682, "auxiliary_loss_mlp": 0.01026037, "balance_loss_clip": 1.04924238, "balance_loss_mlp": 1.01886058, "epoch": 0.5848614200685385, "flos": 24240780069120.0, "grad_norm": 1.7649966478702286, "language_loss": 0.72127235, "learning_rate": 1.5512702162507478e-06, "loss": 0.74313951, "num_input_tokens_seen": 104874525, "step": 4864, "time_per_iteration": 2.8101117610931396 }, { "auxiliary_loss_clip": 0.01072325, "auxiliary_loss_mlp": 0.01002799, "balance_loss_clip": 1.01576829, "balance_loss_mlp": 1.00169051, "epoch": 0.5849816629591775, "flos": 71660245933440.0, "grad_norm": 1.101086235034848, "language_loss": 0.55711281, "learning_rate": 1.5505111383807792e-06, "loss": 0.57786405, "num_input_tokens_seen": 104937195, "step": 4865, "time_per_iteration": 4.226020574569702 }, { "auxiliary_loss_clip": 0.01162866, "auxiliary_loss_mlp": 0.01027506, "balance_loss_clip": 1.04855657, "balance_loss_mlp": 1.01971531, "epoch": 0.5851019058498166, "flos": 23802095266560.0, "grad_norm": 2.2288464611135006, "language_loss": 0.80419856, "learning_rate": 1.5497521286979138e-06, "loss": 0.82610232, "num_input_tokens_seen": 104957435, "step": 4866, "time_per_iteration": 2.7390570640563965 }, { "auxiliary_loss_clip": 0.01168614, "auxiliary_loss_mlp": 0.01025954, "balance_loss_clip": 1.0485388, "balance_loss_mlp": 1.01806831, "epoch": 0.5852221487404557, "flos": 24388516707840.0, "grad_norm": 2.1730997239323147, "language_loss": 0.74235642, "learning_rate": 1.5489931873172927e-06, "loss": 0.76430213, "num_input_tokens_seen": 104978755, "step": 4867, "time_per_iteration": 2.762441873550415 }, { "auxiliary_loss_clip": 0.0114504, "auxiliary_loss_mlp": 0.0103306, "balance_loss_clip": 1.04871964, "balance_loss_mlp": 1.0254302, "epoch": 0.5853423916310948, "flos": 27271425260160.0, "grad_norm": 1.8739450986434838, "language_loss": 0.79021871, "learning_rate": 1.5482343143540467e-06, "loss": 0.81199968, "num_input_tokens_seen": 105000020, "step": 4868, "time_per_iteration": 2.912259101867676 }, { "auxiliary_loss_clip": 0.01162072, "auxiliary_loss_mlp": 0.01059749, "balance_loss_clip": 1.04619205, "balance_loss_mlp": 1.02051044, "epoch": 0.5854626345217339, "flos": 11983786611840.0, "grad_norm": 2.0838467069643585, "language_loss": 0.83091205, "learning_rate": 1.547475509923295e-06, "loss": 0.85313028, "num_input_tokens_seen": 105017060, "step": 4869, "time_per_iteration": 2.8034019470214844 }, { "auxiliary_loss_clip": 0.01074424, "auxiliary_loss_mlp": 0.01001199, "balance_loss_clip": 1.01532412, "balance_loss_mlp": 1.00008428, "epoch": 0.585582877412373, "flos": 64342335173760.0, "grad_norm": 0.7269481126915822, "language_loss": 0.56071746, "learning_rate": 1.5467167741401495e-06, "loss": 0.58147371, "num_input_tokens_seen": 105078540, "step": 4870, "time_per_iteration": 3.287606954574585 }, { "auxiliary_loss_clip": 0.01168227, "auxiliary_loss_mlp": 0.01021374, "balance_loss_clip": 1.0501852, "balance_loss_mlp": 1.01416743, "epoch": 0.5857031203030121, "flos": 17011926103680.0, "grad_norm": 8.316543671870914, "language_loss": 0.71356559, "learning_rate": 1.5459581071197083e-06, "loss": 0.73546159, "num_input_tokens_seen": 105094200, "step": 4871, "time_per_iteration": 2.6664116382598877 }, { "auxiliary_loss_clip": 0.01175002, "auxiliary_loss_mlp": 0.01026834, "balance_loss_clip": 1.05298305, "balance_loss_mlp": 1.01934433, "epoch": 0.5858233631936511, "flos": 20885682303360.0, "grad_norm": 2.0322838850572986, "language_loss": 0.83388513, "learning_rate": 1.5451995089770624e-06, "loss": 0.85590351, "num_input_tokens_seen": 105113985, "step": 4872, "time_per_iteration": 2.7386012077331543 }, { "auxiliary_loss_clip": 0.01171455, "auxiliary_loss_mlp": 0.01027173, "balance_loss_clip": 1.04870152, "balance_loss_mlp": 1.01985312, "epoch": 0.5859436060842903, "flos": 23191902000000.0, "grad_norm": 1.3170424052101308, "language_loss": 0.71887445, "learning_rate": 1.5444409798272885e-06, "loss": 0.7408607, "num_input_tokens_seen": 105138075, "step": 4873, "time_per_iteration": 2.810544490814209 }, { "auxiliary_loss_clip": 0.01168247, "auxiliary_loss_mlp": 0.01025221, "balance_loss_clip": 1.05092955, "balance_loss_mlp": 1.01763964, "epoch": 0.5860638489749294, "flos": 22492648961280.0, "grad_norm": 1.8807698925400282, "language_loss": 0.81101143, "learning_rate": 1.543682519785456e-06, "loss": 0.83294618, "num_input_tokens_seen": 105156555, "step": 4874, "time_per_iteration": 2.7945237159729004 }, { "auxiliary_loss_clip": 0.01164675, "auxiliary_loss_mlp": 0.01031628, "balance_loss_clip": 1.04828954, "balance_loss_mlp": 1.02463579, "epoch": 0.5861840918655684, "flos": 17566243764480.0, "grad_norm": 6.620696798520052, "language_loss": 0.80019724, "learning_rate": 1.5429241289666219e-06, "loss": 0.82216024, "num_input_tokens_seen": 105174055, "step": 4875, "time_per_iteration": 2.6532509326934814 }, { "auxiliary_loss_clip": 0.01162159, "auxiliary_loss_mlp": 0.01030482, "balance_loss_clip": 1.04948688, "balance_loss_mlp": 1.02359176, "epoch": 0.5863043347562076, "flos": 25556152118400.0, "grad_norm": 1.9509026657769335, "language_loss": 0.69838905, "learning_rate": 1.5421658074858342e-06, "loss": 0.72031546, "num_input_tokens_seen": 105192160, "step": 4876, "time_per_iteration": 2.786369562149048 }, { "auxiliary_loss_clip": 0.01165166, "auxiliary_loss_mlp": 0.01029612, "balance_loss_clip": 1.05107391, "balance_loss_mlp": 1.0218339, "epoch": 0.5864245776468466, "flos": 20667525050880.0, "grad_norm": 6.681703136643461, "language_loss": 0.66718888, "learning_rate": 1.5414075554581298e-06, "loss": 0.68913668, "num_input_tokens_seen": 105210205, "step": 4877, "time_per_iteration": 2.7904529571533203 }, { "auxiliary_loss_clip": 0.011755, "auxiliary_loss_mlp": 0.01029114, "balance_loss_clip": 1.04972827, "balance_loss_mlp": 1.02185392, "epoch": 0.5865448205374857, "flos": 28913907490560.0, "grad_norm": 2.3697676988216108, "language_loss": 0.78927481, "learning_rate": 1.5406493729985348e-06, "loss": 0.81132102, "num_input_tokens_seen": 105229400, "step": 4878, "time_per_iteration": 2.790564775466919 }, { "auxiliary_loss_clip": 0.01167388, "auxiliary_loss_mlp": 0.01061906, "balance_loss_clip": 1.05255926, "balance_loss_mlp": 1.02425814, "epoch": 0.5866650634281249, "flos": 25842575168640.0, "grad_norm": 2.2573488653466693, "language_loss": 0.721398, "learning_rate": 1.5398912602220644e-06, "loss": 0.74369097, "num_input_tokens_seen": 105248675, "step": 4879, "time_per_iteration": 2.840050220489502 }, { "auxiliary_loss_clip": 0.0117321, "auxiliary_loss_mlp": 0.01028308, "balance_loss_clip": 1.0504775, "balance_loss_mlp": 1.02070212, "epoch": 0.5867853063187639, "flos": 17052325925760.0, "grad_norm": 2.0781544430985126, "language_loss": 0.78855324, "learning_rate": 1.539133217243724e-06, "loss": 0.81056839, "num_input_tokens_seen": 105265695, "step": 4880, "time_per_iteration": 2.67403244972229 }, { "auxiliary_loss_clip": 0.01172074, "auxiliary_loss_mlp": 0.01025336, "balance_loss_clip": 1.05181336, "balance_loss_mlp": 1.01687837, "epoch": 0.586905549209403, "flos": 24645026707200.0, "grad_norm": 2.3095053684525393, "language_loss": 0.75950348, "learning_rate": 1.5383752441785081e-06, "loss": 0.78147763, "num_input_tokens_seen": 105284920, "step": 4881, "time_per_iteration": 2.7573330402374268 }, { "auxiliary_loss_clip": 0.01175055, "auxiliary_loss_mlp": 0.01026206, "balance_loss_clip": 1.05116081, "balance_loss_mlp": 1.01886868, "epoch": 0.5870257921000421, "flos": 14720538723840.0, "grad_norm": 2.5214776495572013, "language_loss": 0.85611784, "learning_rate": 1.5376173411414003e-06, "loss": 0.87813044, "num_input_tokens_seen": 105302960, "step": 4882, "time_per_iteration": 2.6382219791412354 }, { "auxiliary_loss_clip": 0.01170998, "auxiliary_loss_mlp": 0.010344, "balance_loss_clip": 1.05027997, "balance_loss_mlp": 1.02618086, "epoch": 0.5871460349906812, "flos": 23914998691200.0, "grad_norm": 3.390630487984454, "language_loss": 0.78984892, "learning_rate": 1.5368595082473753e-06, "loss": 0.81190288, "num_input_tokens_seen": 105321260, "step": 4883, "time_per_iteration": 4.6461405754089355 }, { "auxiliary_loss_clip": 0.01172137, "auxiliary_loss_mlp": 0.01022277, "balance_loss_clip": 1.04938447, "balance_loss_mlp": 1.01502872, "epoch": 0.5872662778813202, "flos": 22164174063360.0, "grad_norm": 1.6723221230289058, "language_loss": 0.7802065, "learning_rate": 1.5361017456113935e-06, "loss": 0.80215061, "num_input_tokens_seen": 105341610, "step": 4884, "time_per_iteration": 2.7903945446014404 }, { "auxiliary_loss_clip": 0.01172501, "auxiliary_loss_mlp": 0.01025586, "balance_loss_clip": 1.04907393, "balance_loss_mlp": 1.01747942, "epoch": 0.5873865207719594, "flos": 18441925430400.0, "grad_norm": 2.0891461565208256, "language_loss": 0.85935628, "learning_rate": 1.5353440533484085e-06, "loss": 0.88133717, "num_input_tokens_seen": 105360465, "step": 4885, "time_per_iteration": 2.632577657699585 }, { "auxiliary_loss_clip": 0.01170067, "auxiliary_loss_mlp": 0.01029163, "balance_loss_clip": 1.05068493, "balance_loss_mlp": 1.02117634, "epoch": 0.5875067636625985, "flos": 54015321427200.0, "grad_norm": 1.940559584148598, "language_loss": 0.66501552, "learning_rate": 1.534586431573361e-06, "loss": 0.68700784, "num_input_tokens_seen": 105385405, "step": 4886, "time_per_iteration": 3.001035451889038 }, { "auxiliary_loss_clip": 0.01162519, "auxiliary_loss_mlp": 0.01031533, "balance_loss_clip": 1.05067706, "balance_loss_mlp": 1.02385581, "epoch": 0.5876270065532375, "flos": 27995707100160.0, "grad_norm": 2.5705070091880597, "language_loss": 0.79233366, "learning_rate": 1.5338288804011817e-06, "loss": 0.81427407, "num_input_tokens_seen": 105404905, "step": 4887, "time_per_iteration": 2.7837605476379395 }, { "auxiliary_loss_clip": 0.01166304, "auxiliary_loss_mlp": 0.0102753, "balance_loss_clip": 1.04849255, "balance_loss_mlp": 1.01928413, "epoch": 0.5877472494438767, "flos": 21361462876800.0, "grad_norm": 1.9433732546978808, "language_loss": 0.71368927, "learning_rate": 1.533071399946791e-06, "loss": 0.73562759, "num_input_tokens_seen": 105423650, "step": 4888, "time_per_iteration": 2.7118327617645264 }, { "auxiliary_loss_clip": 0.01166536, "auxiliary_loss_mlp": 0.01025829, "balance_loss_clip": 1.0473125, "balance_loss_mlp": 1.01823556, "epoch": 0.5878674923345157, "flos": 22383013674240.0, "grad_norm": 1.9263376918777428, "language_loss": 0.57082117, "learning_rate": 1.5323139903250977e-06, "loss": 0.59274477, "num_input_tokens_seen": 105444255, "step": 4889, "time_per_iteration": 2.668678045272827 }, { "auxiliary_loss_clip": 0.01171513, "auxiliary_loss_mlp": 0.01028337, "balance_loss_clip": 1.05241704, "balance_loss_mlp": 1.02105641, "epoch": 0.5879877352251548, "flos": 21868664872320.0, "grad_norm": 1.507566724666196, "language_loss": 0.77099288, "learning_rate": 1.5315566516510002e-06, "loss": 0.79299134, "num_input_tokens_seen": 105462425, "step": 4890, "time_per_iteration": 3.6715633869171143 }, { "auxiliary_loss_clip": 0.01175114, "auxiliary_loss_mlp": 0.01029492, "balance_loss_clip": 1.04979479, "balance_loss_mlp": 1.02178514, "epoch": 0.5881079781157939, "flos": 17493811989120.0, "grad_norm": 1.987773174148579, "language_loss": 0.67873901, "learning_rate": 1.5307993840393857e-06, "loss": 0.70078504, "num_input_tokens_seen": 105480505, "step": 4891, "time_per_iteration": 2.649278402328491 }, { "auxiliary_loss_clip": 0.01168412, "auxiliary_loss_mlp": 0.01022604, "balance_loss_clip": 1.04521871, "balance_loss_mlp": 1.0158335, "epoch": 0.588228221006433, "flos": 22601853285120.0, "grad_norm": 2.015579006856351, "language_loss": 0.80792207, "learning_rate": 1.530042187605132e-06, "loss": 0.82983226, "num_input_tokens_seen": 105499760, "step": 4892, "time_per_iteration": 2.667736530303955 }, { "auxiliary_loss_clip": 0.01171481, "auxiliary_loss_mlp": 0.01057708, "balance_loss_clip": 1.04912198, "balance_loss_mlp": 1.02155256, "epoch": 0.5883484638970721, "flos": 26176939896960.0, "grad_norm": 1.680642087546075, "language_loss": 0.84172142, "learning_rate": 1.5292850624631044e-06, "loss": 0.86401331, "num_input_tokens_seen": 105521955, "step": 4893, "time_per_iteration": 2.767984628677368 }, { "auxiliary_loss_clip": 0.01165699, "auxiliary_loss_mlp": 0.01026341, "balance_loss_clip": 1.04883504, "balance_loss_mlp": 1.01886022, "epoch": 0.5884687067877111, "flos": 30443737691520.0, "grad_norm": 2.5960481088995575, "language_loss": 0.80285233, "learning_rate": 1.5285280087281593e-06, "loss": 0.82477272, "num_input_tokens_seen": 105542685, "step": 4894, "time_per_iteration": 2.7500953674316406 }, { "auxiliary_loss_clip": 0.01073345, "auxiliary_loss_mlp": 0.01000102, "balance_loss_clip": 1.01575541, "balance_loss_mlp": 0.99897575, "epoch": 0.5885889496783503, "flos": 70507550580480.0, "grad_norm": 0.6471451875525277, "language_loss": 0.56625211, "learning_rate": 1.5277710265151398e-06, "loss": 0.58698654, "num_input_tokens_seen": 105612165, "step": 4895, "time_per_iteration": 3.470531702041626 }, { "auxiliary_loss_clip": 0.01174609, "auxiliary_loss_mlp": 0.01025467, "balance_loss_clip": 1.05281687, "balance_loss_mlp": 1.01777172, "epoch": 0.5887091925689893, "flos": 19098767485440.0, "grad_norm": 6.014909715702352, "language_loss": 0.77386141, "learning_rate": 1.5270141159388803e-06, "loss": 0.79586214, "num_input_tokens_seen": 105629185, "step": 4896, "time_per_iteration": 2.718689441680908 }, { "auxiliary_loss_clip": 0.01172462, "auxiliary_loss_mlp": 0.01026944, "balance_loss_clip": 1.04802608, "balance_loss_mlp": 1.01953173, "epoch": 0.5888294354596284, "flos": 23294282739840.0, "grad_norm": 1.963529612678041, "language_loss": 0.80515146, "learning_rate": 1.526257277114203e-06, "loss": 0.82714558, "num_input_tokens_seen": 105650260, "step": 4897, "time_per_iteration": 2.6942191123962402 }, { "auxiliary_loss_clip": 0.01162814, "auxiliary_loss_mlp": 0.01026316, "balance_loss_clip": 1.04729891, "balance_loss_mlp": 1.01875508, "epoch": 0.5889496783502676, "flos": 21981532383360.0, "grad_norm": 1.8212879484815205, "language_loss": 0.79646397, "learning_rate": 1.5255005101559201e-06, "loss": 0.8183552, "num_input_tokens_seen": 105667870, "step": 4898, "time_per_iteration": 2.850280284881592 }, { "auxiliary_loss_clip": 0.01171902, "auxiliary_loss_mlp": 0.01026337, "balance_loss_clip": 1.04868162, "balance_loss_mlp": 1.0193398, "epoch": 0.5890699212409066, "flos": 21685233093120.0, "grad_norm": 2.2270023066371074, "language_loss": 0.76887667, "learning_rate": 1.524743815178833e-06, "loss": 0.79085904, "num_input_tokens_seen": 105685830, "step": 4899, "time_per_iteration": 2.673349380493164 }, { "auxiliary_loss_clip": 0.01167173, "auxiliary_loss_mlp": 0.01024088, "balance_loss_clip": 1.04641604, "balance_loss_mlp": 1.01670277, "epoch": 0.5891901641315457, "flos": 19464553635840.0, "grad_norm": 1.8077746482455588, "language_loss": 0.8111676, "learning_rate": 1.5239871922977315e-06, "loss": 0.83308017, "num_input_tokens_seen": 105705745, "step": 4900, "time_per_iteration": 2.7408032417297363 }, { "auxiliary_loss_clip": 0.01168143, "auxiliary_loss_mlp": 0.01024439, "balance_loss_clip": 1.0501579, "balance_loss_mlp": 1.01701176, "epoch": 0.5893104070221848, "flos": 19609884063360.0, "grad_norm": 1.9206760679920507, "language_loss": 0.90004486, "learning_rate": 1.523230641627394e-06, "loss": 0.92197067, "num_input_tokens_seen": 105724730, "step": 4901, "time_per_iteration": 2.656959295272827 }, { "auxiliary_loss_clip": 0.01164099, "auxiliary_loss_mlp": 0.01020879, "balance_loss_clip": 1.04860806, "balance_loss_mlp": 1.01364326, "epoch": 0.5894306499128239, "flos": 29060063930880.0, "grad_norm": 2.081592575146337, "language_loss": 0.73149502, "learning_rate": 1.5224741632825888e-06, "loss": 0.75334477, "num_input_tokens_seen": 105744920, "step": 4902, "time_per_iteration": 2.8139383792877197 }, { "auxiliary_loss_clip": 0.01176671, "auxiliary_loss_mlp": 0.01032909, "balance_loss_clip": 1.05009341, "balance_loss_mlp": 1.02542901, "epoch": 0.589550892803463, "flos": 42298890721920.0, "grad_norm": 2.2633638435322743, "language_loss": 0.69211257, "learning_rate": 1.521717757378074e-06, "loss": 0.71420836, "num_input_tokens_seen": 105765465, "step": 4903, "time_per_iteration": 2.885730743408203 }, { "auxiliary_loss_clip": 0.01173126, "auxiliary_loss_mlp": 0.01023841, "balance_loss_clip": 1.0485599, "balance_loss_mlp": 1.01543653, "epoch": 0.5896711356941021, "flos": 14137062197760.0, "grad_norm": 2.3397756194297687, "language_loss": 0.68528295, "learning_rate": 1.5209614240285943e-06, "loss": 0.70725268, "num_input_tokens_seen": 105783120, "step": 4904, "time_per_iteration": 2.79191517829895 }, { "auxiliary_loss_clip": 0.01172102, "auxiliary_loss_mlp": 0.01058095, "balance_loss_clip": 1.04817772, "balance_loss_mlp": 1.01922095, "epoch": 0.5897913785847412, "flos": 17201355454080.0, "grad_norm": 2.4435773305694544, "language_loss": 0.85022473, "learning_rate": 1.520205163348887e-06, "loss": 0.87252665, "num_input_tokens_seen": 105801055, "step": 4905, "time_per_iteration": 2.6496224403381348 }, { "auxiliary_loss_clip": 0.01073194, "auxiliary_loss_mlp": 0.01002063, "balance_loss_clip": 1.01487708, "balance_loss_mlp": 1.00078142, "epoch": 0.5899116214753802, "flos": 48794164202880.0, "grad_norm": 0.7293336709036037, "language_loss": 0.56965476, "learning_rate": 1.519448975453674e-06, "loss": 0.59040731, "num_input_tokens_seen": 105856155, "step": 4906, "time_per_iteration": 3.1971657276153564 }, { "auxiliary_loss_clip": 0.01173069, "auxiliary_loss_mlp": 0.01052957, "balance_loss_clip": 1.05212903, "balance_loss_mlp": 1.01618612, "epoch": 0.5900318643660194, "flos": 21103659987840.0, "grad_norm": 3.582685626713225, "language_loss": 0.76410979, "learning_rate": 1.5186928604576696e-06, "loss": 0.7863701, "num_input_tokens_seen": 105873350, "step": 4907, "time_per_iteration": 2.725445032119751 }, { "auxiliary_loss_clip": 0.01168278, "auxiliary_loss_mlp": 0.01028545, "balance_loss_clip": 1.04932845, "balance_loss_mlp": 1.02191997, "epoch": 0.5901521072566585, "flos": 21178390233600.0, "grad_norm": 2.26770880427766, "language_loss": 0.77239752, "learning_rate": 1.5179368184755752e-06, "loss": 0.79436576, "num_input_tokens_seen": 105891435, "step": 4908, "time_per_iteration": 2.69590425491333 }, { "auxiliary_loss_clip": 0.01164328, "auxiliary_loss_mlp": 0.01025222, "balance_loss_clip": 1.04798603, "balance_loss_mlp": 1.01789069, "epoch": 0.5902723501472975, "flos": 20225967160320.0, "grad_norm": 1.7026578084638935, "language_loss": 0.82637578, "learning_rate": 1.5171808496220821e-06, "loss": 0.84827131, "num_input_tokens_seen": 105910190, "step": 4909, "time_per_iteration": 3.638158082962036 }, { "auxiliary_loss_clip": 0.01172499, "auxiliary_loss_mlp": 0.01024602, "balance_loss_clip": 1.04954815, "balance_loss_mlp": 1.01692176, "epoch": 0.5903925930379367, "flos": 22964407211520.0, "grad_norm": 1.7993474754931742, "language_loss": 0.81151003, "learning_rate": 1.5164249540118708e-06, "loss": 0.83348095, "num_input_tokens_seen": 105929315, "step": 4910, "time_per_iteration": 3.6763129234313965 }, { "auxiliary_loss_clip": 0.01164923, "auxiliary_loss_mlp": 0.01034075, "balance_loss_clip": 1.05113888, "balance_loss_mlp": 1.026824, "epoch": 0.5905128359285757, "flos": 23367720096000.0, "grad_norm": 1.6609979766920024, "language_loss": 0.83201802, "learning_rate": 1.5156691317596093e-06, "loss": 0.85400796, "num_input_tokens_seen": 105950740, "step": 4911, "time_per_iteration": 2.723524808883667 }, { "auxiliary_loss_clip": 0.01174078, "auxiliary_loss_mlp": 0.01056326, "balance_loss_clip": 1.05112386, "balance_loss_mlp": 1.01760125, "epoch": 0.5906330788192148, "flos": 28032335994240.0, "grad_norm": 2.501150568325556, "language_loss": 0.67086399, "learning_rate": 1.5149133829799556e-06, "loss": 0.69316804, "num_input_tokens_seen": 105968735, "step": 4912, "time_per_iteration": 2.8173952102661133 }, { "auxiliary_loss_clip": 0.01175571, "auxiliary_loss_mlp": 0.01027484, "balance_loss_clip": 1.05063343, "balance_loss_mlp": 1.02022409, "epoch": 0.590753321709854, "flos": 18477943793280.0, "grad_norm": 2.1549082069159886, "language_loss": 0.81052798, "learning_rate": 1.5141577077875556e-06, "loss": 0.83255857, "num_input_tokens_seen": 105986060, "step": 4913, "time_per_iteration": 2.6579151153564453 }, { "auxiliary_loss_clip": 0.0117254, "auxiliary_loss_mlp": 0.01022576, "balance_loss_clip": 1.04921699, "balance_loss_mlp": 1.01505411, "epoch": 0.590873564600493, "flos": 16873706568960.0, "grad_norm": 2.0161286211007785, "language_loss": 0.7235024, "learning_rate": 1.5134021062970451e-06, "loss": 0.74545348, "num_input_tokens_seen": 106004440, "step": 4914, "time_per_iteration": 2.733241558074951 }, { "auxiliary_loss_clip": 0.01155908, "auxiliary_loss_mlp": 0.01024484, "balance_loss_clip": 1.05120146, "balance_loss_mlp": 1.01741505, "epoch": 0.5909938074911321, "flos": 13516166678400.0, "grad_norm": 2.0048515365973896, "language_loss": 0.81293082, "learning_rate": 1.5126465786230483e-06, "loss": 0.83473474, "num_input_tokens_seen": 106021215, "step": 4915, "time_per_iteration": 2.778007745742798 }, { "auxiliary_loss_clip": 0.01173085, "auxiliary_loss_mlp": 0.01028846, "balance_loss_clip": 1.05010235, "balance_loss_mlp": 1.02172613, "epoch": 0.5911140503817712, "flos": 26024067613440.0, "grad_norm": 1.7486387276388833, "language_loss": 0.82274806, "learning_rate": 1.5118911248801787e-06, "loss": 0.84476739, "num_input_tokens_seen": 106039225, "step": 4916, "time_per_iteration": 2.7686123847961426 }, { "auxiliary_loss_clip": 0.01163258, "auxiliary_loss_mlp": 0.01026621, "balance_loss_clip": 1.04665494, "balance_loss_mlp": 1.01942658, "epoch": 0.5912342932724103, "flos": 23258731253760.0, "grad_norm": 2.1125502866405372, "language_loss": 0.79848838, "learning_rate": 1.5111357451830364e-06, "loss": 0.82038713, "num_input_tokens_seen": 106057920, "step": 4917, "time_per_iteration": 3.598877429962158 }, { "auxiliary_loss_clip": 0.01170016, "auxiliary_loss_mlp": 0.01022145, "balance_loss_clip": 1.04823387, "balance_loss_mlp": 1.0152247, "epoch": 0.5913545361630493, "flos": 19573039687680.0, "grad_norm": 2.938878631744228, "language_loss": 0.712129, "learning_rate": 1.5103804396462131e-06, "loss": 0.73405069, "num_input_tokens_seen": 106077855, "step": 4918, "time_per_iteration": 2.600339412689209 }, { "auxiliary_loss_clip": 0.01176524, "auxiliary_loss_mlp": 0.01028572, "balance_loss_clip": 1.0503087, "balance_loss_mlp": 1.02008462, "epoch": 0.5914747790536885, "flos": 26213532877440.0, "grad_norm": 2.3431310677575334, "language_loss": 0.8022334, "learning_rate": 1.5096252083842877e-06, "loss": 0.82428443, "num_input_tokens_seen": 106097065, "step": 4919, "time_per_iteration": 2.698359489440918 }, { "auxiliary_loss_clip": 0.01169042, "auxiliary_loss_mlp": 0.0102588, "balance_loss_clip": 1.04789126, "balance_loss_mlp": 1.01813149, "epoch": 0.5915950219443276, "flos": 27417545786880.0, "grad_norm": 1.7372977146493913, "language_loss": 0.85407615, "learning_rate": 1.5088700515118285e-06, "loss": 0.87602532, "num_input_tokens_seen": 106116385, "step": 4920, "time_per_iteration": 2.9034435749053955 }, { "auxiliary_loss_clip": 0.011594, "auxiliary_loss_mlp": 0.01029468, "balance_loss_clip": 1.04893613, "balance_loss_mlp": 1.02179933, "epoch": 0.5917152648349666, "flos": 21907879545600.0, "grad_norm": 3.659146789372185, "language_loss": 0.665905, "learning_rate": 1.508114969143392e-06, "loss": 0.68779367, "num_input_tokens_seen": 106136370, "step": 4921, "time_per_iteration": 2.7480762004852295 }, { "auxiliary_loss_clip": 0.01167352, "auxiliary_loss_mlp": 0.0102201, "balance_loss_clip": 1.04687655, "balance_loss_mlp": 1.01531351, "epoch": 0.5918355077256057, "flos": 28109185142400.0, "grad_norm": 1.45199445212216, "language_loss": 0.77814174, "learning_rate": 1.5073599613935238e-06, "loss": 0.80003536, "num_input_tokens_seen": 106158490, "step": 4922, "time_per_iteration": 2.7272729873657227 }, { "auxiliary_loss_clip": 0.01167907, "auxiliary_loss_mlp": 0.0102154, "balance_loss_clip": 1.04973531, "balance_loss_mlp": 1.01406002, "epoch": 0.5919557506162448, "flos": 28183807647360.0, "grad_norm": 2.034792869565593, "language_loss": 0.57526767, "learning_rate": 1.5066050283767574e-06, "loss": 0.59716213, "num_input_tokens_seen": 106179170, "step": 4923, "time_per_iteration": 2.9110913276672363 }, { "auxiliary_loss_clip": 0.01162358, "auxiliary_loss_mlp": 0.01021735, "balance_loss_clip": 1.04885149, "balance_loss_mlp": 1.01415336, "epoch": 0.5920759935068839, "flos": 12094355652480.0, "grad_norm": 2.0810875632663324, "language_loss": 0.83408535, "learning_rate": 1.505850170207616e-06, "loss": 0.85592622, "num_input_tokens_seen": 106196035, "step": 4924, "time_per_iteration": 2.682687520980835 }, { "auxiliary_loss_clip": 0.01167018, "auxiliary_loss_mlp": 0.01022108, "balance_loss_clip": 1.04896533, "balance_loss_mlp": 1.014961, "epoch": 0.592196236397523, "flos": 29424772673280.0, "grad_norm": 2.2307054732247544, "language_loss": 0.77995855, "learning_rate": 1.505095387000611e-06, "loss": 0.80184978, "num_input_tokens_seen": 106218335, "step": 4925, "time_per_iteration": 2.739885091781616 }, { "auxiliary_loss_clip": 0.01159891, "auxiliary_loss_mlp": 0.01027694, "balance_loss_clip": 1.05062175, "balance_loss_mlp": 1.01998687, "epoch": 0.5923164792881621, "flos": 24384709866240.0, "grad_norm": 1.7618988557168136, "language_loss": 0.73827344, "learning_rate": 1.504340678870242e-06, "loss": 0.76014924, "num_input_tokens_seen": 106236550, "step": 4926, "time_per_iteration": 2.797703981399536 }, { "auxiliary_loss_clip": 0.0116681, "auxiliary_loss_mlp": 0.01028133, "balance_loss_clip": 1.04767919, "balance_loss_mlp": 1.01987743, "epoch": 0.5924367221788012, "flos": 24024238928640.0, "grad_norm": 2.1214967444849755, "language_loss": 0.89710891, "learning_rate": 1.5035860459309989e-06, "loss": 0.91905832, "num_input_tokens_seen": 106254265, "step": 4927, "time_per_iteration": 2.6762430667877197 }, { "auxiliary_loss_clip": 0.01163567, "auxiliary_loss_mlp": 0.01028344, "balance_loss_clip": 1.04856777, "balance_loss_mlp": 1.02034521, "epoch": 0.5925569650694402, "flos": 26870590414080.0, "grad_norm": 1.9516759785641762, "language_loss": 0.63643301, "learning_rate": 1.5028314882973568e-06, "loss": 0.65835214, "num_input_tokens_seen": 106274670, "step": 4928, "time_per_iteration": 2.945687770843506 }, { "auxiliary_loss_clip": 0.01165089, "auxiliary_loss_mlp": 0.01030932, "balance_loss_clip": 1.04844999, "balance_loss_mlp": 1.02265882, "epoch": 0.5926772079600794, "flos": 22302788647680.0, "grad_norm": 2.074763575270355, "language_loss": 0.84858072, "learning_rate": 1.502077006083783e-06, "loss": 0.87054092, "num_input_tokens_seen": 106293330, "step": 4929, "time_per_iteration": 2.700193405151367 }, { "auxiliary_loss_clip": 0.01172764, "auxiliary_loss_mlp": 0.01050562, "balance_loss_clip": 1.05035257, "balance_loss_mlp": 1.01349676, "epoch": 0.5927974508507184, "flos": 19865244827520.0, "grad_norm": 2.038831146509655, "language_loss": 0.76619136, "learning_rate": 1.5013225994047315e-06, "loss": 0.78842461, "num_input_tokens_seen": 106310960, "step": 4930, "time_per_iteration": 2.6787078380584717 }, { "auxiliary_loss_clip": 0.01171957, "auxiliary_loss_mlp": 0.01054642, "balance_loss_clip": 1.05088985, "balance_loss_mlp": 1.01834273, "epoch": 0.5929176937413575, "flos": 15776743167360.0, "grad_norm": 2.1127490229237753, "language_loss": 0.80748546, "learning_rate": 1.5005682683746452e-06, "loss": 0.82975149, "num_input_tokens_seen": 106329475, "step": 4931, "time_per_iteration": 2.62579607963562 }, { "auxiliary_loss_clip": 0.01172535, "auxiliary_loss_mlp": 0.01024863, "balance_loss_clip": 1.05187786, "balance_loss_mlp": 1.01775777, "epoch": 0.5930379366319967, "flos": 17601472028160.0, "grad_norm": 2.0677718925584516, "language_loss": 0.7279802, "learning_rate": 1.4998140131079553e-06, "loss": 0.74995416, "num_input_tokens_seen": 106345565, "step": 4932, "time_per_iteration": 2.680290460586548 }, { "auxiliary_loss_clip": 0.01156934, "auxiliary_loss_mlp": 0.01049925, "balance_loss_clip": 1.0494113, "balance_loss_mlp": 1.01591158, "epoch": 0.5931581795226357, "flos": 17704283731200.0, "grad_norm": 1.8576429007212738, "language_loss": 0.73479509, "learning_rate": 1.4990598337190821e-06, "loss": 0.75686371, "num_input_tokens_seen": 106361920, "step": 4933, "time_per_iteration": 2.7101683616638184 }, { "auxiliary_loss_clip": 0.01170914, "auxiliary_loss_mlp": 0.0105389, "balance_loss_clip": 1.04769361, "balance_loss_mlp": 1.01783848, "epoch": 0.5932784224132748, "flos": 24280102483200.0, "grad_norm": 2.121447597068211, "language_loss": 0.6802513, "learning_rate": 1.4983057303224338e-06, "loss": 0.70249933, "num_input_tokens_seen": 106381735, "step": 4934, "time_per_iteration": 3.7726945877075195 }, { "auxiliary_loss_clip": 0.0116073, "auxiliary_loss_mlp": 0.01024444, "balance_loss_clip": 1.05011547, "balance_loss_mlp": 1.01658797, "epoch": 0.5933986653039139, "flos": 22926700909440.0, "grad_norm": 1.6308515346763421, "language_loss": 0.87764263, "learning_rate": 1.4975517030324072e-06, "loss": 0.89949429, "num_input_tokens_seen": 106399745, "step": 4935, "time_per_iteration": 3.8174080848693848 }, { "auxiliary_loss_clip": 0.01071363, "auxiliary_loss_mlp": 0.0104905, "balance_loss_clip": 1.01432991, "balance_loss_mlp": 1.00876045, "epoch": 0.593518908194553, "flos": 71121730256640.0, "grad_norm": 0.7856390280851141, "language_loss": 0.61806393, "learning_rate": 1.4967977519633882e-06, "loss": 0.63926804, "num_input_tokens_seen": 106457205, "step": 4936, "time_per_iteration": 3.4096667766571045 }, { "auxiliary_loss_clip": 0.01163728, "auxiliary_loss_mlp": 0.0102987, "balance_loss_clip": 1.0500567, "balance_loss_mlp": 1.02166271, "epoch": 0.593639151085192, "flos": 20448649526400.0, "grad_norm": 1.9767569394101931, "language_loss": 0.77906001, "learning_rate": 1.4960438772297494e-06, "loss": 0.80099595, "num_input_tokens_seen": 106474250, "step": 4937, "time_per_iteration": 2.828266143798828 }, { "auxiliary_loss_clip": 0.01169024, "auxiliary_loss_mlp": 0.0103044, "balance_loss_clip": 1.048159, "balance_loss_mlp": 1.02309704, "epoch": 0.5937593939758312, "flos": 30883428074880.0, "grad_norm": 2.18076863044082, "language_loss": 0.73624647, "learning_rate": 1.495290078945855e-06, "loss": 0.75824112, "num_input_tokens_seen": 106494015, "step": 4938, "time_per_iteration": 2.8281521797180176 }, { "auxiliary_loss_clip": 0.01173809, "auxiliary_loss_mlp": 0.01024529, "balance_loss_clip": 1.05042958, "balance_loss_mlp": 1.01729941, "epoch": 0.5938796368664703, "flos": 36898069668480.0, "grad_norm": 2.0048757299694713, "language_loss": 0.74177253, "learning_rate": 1.4945363572260529e-06, "loss": 0.76375592, "num_input_tokens_seen": 106515010, "step": 4939, "time_per_iteration": 2.8320300579071045 }, { "auxiliary_loss_clip": 0.01168687, "auxiliary_loss_mlp": 0.01025613, "balance_loss_clip": 1.04838109, "balance_loss_mlp": 1.01826382, "epoch": 0.5939998797571093, "flos": 23842926051840.0, "grad_norm": 2.72785691170428, "language_loss": 0.68128037, "learning_rate": 1.4937827121846845e-06, "loss": 0.70322335, "num_input_tokens_seen": 106535265, "step": 4940, "time_per_iteration": 2.737473249435425 }, { "auxiliary_loss_clip": 0.01159045, "auxiliary_loss_mlp": 0.01028618, "balance_loss_clip": 1.05083156, "balance_loss_mlp": 1.0214479, "epoch": 0.5941201226477485, "flos": 25191407462400.0, "grad_norm": 1.4704299318319267, "language_loss": 0.73609376, "learning_rate": 1.4930291439360755e-06, "loss": 0.75797033, "num_input_tokens_seen": 106557830, "step": 4941, "time_per_iteration": 2.8749959468841553 }, { "auxiliary_loss_clip": 0.01171444, "auxiliary_loss_mlp": 0.01023797, "balance_loss_clip": 1.04963028, "balance_loss_mlp": 1.01608372, "epoch": 0.5942403655383875, "flos": 22418996123520.0, "grad_norm": 1.893599242000087, "language_loss": 0.79126012, "learning_rate": 1.4922756525945427e-06, "loss": 0.81321251, "num_input_tokens_seen": 106577140, "step": 4942, "time_per_iteration": 2.723480224609375 }, { "auxiliary_loss_clip": 0.01071554, "auxiliary_loss_mlp": 0.01006477, "balance_loss_clip": 1.01382875, "balance_loss_mlp": 1.00540388, "epoch": 0.5943606084290266, "flos": 67629310796160.0, "grad_norm": 0.7742574342059758, "language_loss": 0.59512317, "learning_rate": 1.4915222382743894e-06, "loss": 0.6159035, "num_input_tokens_seen": 106635975, "step": 4943, "time_per_iteration": 4.217113971710205 }, { "auxiliary_loss_clip": 0.0117329, "auxiliary_loss_mlp": 0.01029896, "balance_loss_clip": 1.05169296, "balance_loss_mlp": 1.02198982, "epoch": 0.5944808513196658, "flos": 18223157646720.0, "grad_norm": 1.9845148367754426, "language_loss": 0.7229228, "learning_rate": 1.4907689010899085e-06, "loss": 0.74495465, "num_input_tokens_seen": 106653555, "step": 4944, "time_per_iteration": 2.706008195877075 }, { "auxiliary_loss_clip": 0.01163026, "auxiliary_loss_mlp": 0.01021738, "balance_loss_clip": 1.04759777, "balance_loss_mlp": 1.01372099, "epoch": 0.5946010942103048, "flos": 24790824011520.0, "grad_norm": 3.173369069752387, "language_loss": 0.62406254, "learning_rate": 1.4900156411553804e-06, "loss": 0.64591026, "num_input_tokens_seen": 106673385, "step": 4945, "time_per_iteration": 2.738367795944214 }, { "auxiliary_loss_clip": 0.01169624, "auxiliary_loss_mlp": 0.01024814, "balance_loss_clip": 1.05000007, "balance_loss_mlp": 1.01720214, "epoch": 0.5947213371009439, "flos": 15231619388160.0, "grad_norm": 2.190235246765822, "language_loss": 0.85583359, "learning_rate": 1.4892624585850739e-06, "loss": 0.87777799, "num_input_tokens_seen": 106691740, "step": 4946, "time_per_iteration": 2.669339418411255 }, { "auxiliary_loss_clip": 0.01175684, "auxiliary_loss_mlp": 0.01024389, "balance_loss_clip": 1.04955471, "balance_loss_mlp": 1.01681328, "epoch": 0.594841579991583, "flos": 25848069949440.0, "grad_norm": 2.3762918873477576, "language_loss": 0.79777443, "learning_rate": 1.4885093534932465e-06, "loss": 0.81977516, "num_input_tokens_seen": 106709705, "step": 4947, "time_per_iteration": 2.6811399459838867 }, { "auxiliary_loss_clip": 0.01165382, "auxiliary_loss_mlp": 0.01026873, "balance_loss_clip": 1.05182242, "balance_loss_mlp": 1.01888561, "epoch": 0.5949618228822221, "flos": 23981109672960.0, "grad_norm": 2.2169774374476843, "language_loss": 0.71466386, "learning_rate": 1.4877563259941433e-06, "loss": 0.73658645, "num_input_tokens_seen": 106727560, "step": 4948, "time_per_iteration": 2.760648488998413 }, { "auxiliary_loss_clip": 0.01177062, "auxiliary_loss_mlp": 0.01026225, "balance_loss_clip": 1.0514816, "balance_loss_mlp": 1.01893544, "epoch": 0.5950820657728612, "flos": 40547491476480.0, "grad_norm": 2.060655748112329, "language_loss": 0.67889309, "learning_rate": 1.4870033762019988e-06, "loss": 0.70092595, "num_input_tokens_seen": 106747725, "step": 4949, "time_per_iteration": 2.8384711742401123 }, { "auxiliary_loss_clip": 0.01166107, "auxiliary_loss_mlp": 0.01025202, "balance_loss_clip": 1.0493319, "balance_loss_mlp": 1.0176115, "epoch": 0.5952023086635003, "flos": 23184467884800.0, "grad_norm": 1.586632968792306, "language_loss": 0.73594582, "learning_rate": 1.4862505042310334e-06, "loss": 0.75785887, "num_input_tokens_seen": 106767010, "step": 4950, "time_per_iteration": 2.7819106578826904 }, { "auxiliary_loss_clip": 0.01161705, "auxiliary_loss_mlp": 0.01026578, "balance_loss_clip": 1.05040002, "balance_loss_mlp": 1.0195713, "epoch": 0.5953225515541394, "flos": 33653289548160.0, "grad_norm": 1.895638016486721, "language_loss": 0.69564652, "learning_rate": 1.4854977101954587e-06, "loss": 0.7175293, "num_input_tokens_seen": 106789230, "step": 4951, "time_per_iteration": 2.881094217300415 }, { "auxiliary_loss_clip": 0.01170879, "auxiliary_loss_mlp": 0.01026376, "balance_loss_clip": 1.04753995, "balance_loss_mlp": 1.01843655, "epoch": 0.5954427944447784, "flos": 24459619680000.0, "grad_norm": 3.0618010478558007, "language_loss": 0.86216325, "learning_rate": 1.4847449942094716e-06, "loss": 0.88413578, "num_input_tokens_seen": 106808110, "step": 4952, "time_per_iteration": 2.816969156265259 }, { "auxiliary_loss_clip": 0.01160977, "auxiliary_loss_mlp": 0.01031018, "balance_loss_clip": 1.04821086, "balance_loss_mlp": 1.02298951, "epoch": 0.5955630373354175, "flos": 18551848026240.0, "grad_norm": 1.879065245306648, "language_loss": 0.86378813, "learning_rate": 1.4839923563872598e-06, "loss": 0.88570803, "num_input_tokens_seen": 106826650, "step": 4953, "time_per_iteration": 2.665348768234253 }, { "auxiliary_loss_clip": 0.01163576, "auxiliary_loss_mlp": 0.01024877, "balance_loss_clip": 1.0488925, "balance_loss_mlp": 1.01727128, "epoch": 0.5956832802260567, "flos": 19791699730560.0, "grad_norm": 1.9437727378263747, "language_loss": 0.75889719, "learning_rate": 1.483239796842997e-06, "loss": 0.78078175, "num_input_tokens_seen": 106844680, "step": 4954, "time_per_iteration": 2.847458839416504 }, { "auxiliary_loss_clip": 0.01164781, "auxiliary_loss_mlp": 0.01028027, "balance_loss_clip": 1.05024815, "balance_loss_mlp": 1.0208025, "epoch": 0.5958035231166957, "flos": 19750868945280.0, "grad_norm": 1.7233441786310286, "language_loss": 0.83999228, "learning_rate": 1.4824873156908462e-06, "loss": 0.86192036, "num_input_tokens_seen": 106862605, "step": 4955, "time_per_iteration": 2.6703314781188965 }, { "auxiliary_loss_clip": 0.01174766, "auxiliary_loss_mlp": 0.01060623, "balance_loss_clip": 1.0524348, "balance_loss_mlp": 1.02357543, "epoch": 0.5959237660073348, "flos": 21652806090240.0, "grad_norm": 2.101047290352209, "language_loss": 0.75676256, "learning_rate": 1.4817349130449584e-06, "loss": 0.77911645, "num_input_tokens_seen": 106882325, "step": 4956, "time_per_iteration": 2.7674734592437744 }, { "auxiliary_loss_clip": 0.01165075, "auxiliary_loss_mlp": 0.01031906, "balance_loss_clip": 1.04730952, "balance_loss_mlp": 1.02453268, "epoch": 0.5960440088979739, "flos": 21171207513600.0, "grad_norm": 4.404059459880219, "language_loss": 0.82785529, "learning_rate": 1.4809825890194717e-06, "loss": 0.84982514, "num_input_tokens_seen": 106900995, "step": 4957, "time_per_iteration": 2.6629538536071777 }, { "auxiliary_loss_clip": 0.01161525, "auxiliary_loss_mlp": 0.01022553, "balance_loss_clip": 1.04766655, "balance_loss_mlp": 1.01578522, "epoch": 0.596164251788613, "flos": 14757526753920.0, "grad_norm": 1.8369589893093647, "language_loss": 0.76877737, "learning_rate": 1.4802303437285139e-06, "loss": 0.79061812, "num_input_tokens_seen": 106918265, "step": 4958, "time_per_iteration": 2.6887905597686768 }, { "auxiliary_loss_clip": 0.01164636, "auxiliary_loss_mlp": 0.01025009, "balance_loss_clip": 1.0461607, "balance_loss_mlp": 1.01708114, "epoch": 0.596284494679252, "flos": 20485924865280.0, "grad_norm": 2.3114537433282716, "language_loss": 0.80980456, "learning_rate": 1.4794781772861994e-06, "loss": 0.83170104, "num_input_tokens_seen": 106934760, "step": 4959, "time_per_iteration": 2.779905319213867 }, { "auxiliary_loss_clip": 0.01166163, "auxiliary_loss_mlp": 0.01053421, "balance_loss_clip": 1.04975104, "balance_loss_mlp": 1.01658309, "epoch": 0.5964047375698912, "flos": 31212262108800.0, "grad_norm": 2.576701879289839, "language_loss": 0.67114246, "learning_rate": 1.4787260898066324e-06, "loss": 0.69333827, "num_input_tokens_seen": 106954760, "step": 4960, "time_per_iteration": 3.7456672191619873 }, { "auxiliary_loss_clip": 0.01171471, "auxiliary_loss_mlp": 0.01028458, "balance_loss_clip": 1.04944217, "balance_loss_mlp": 1.02105784, "epoch": 0.5965249804605303, "flos": 27483620855040.0, "grad_norm": 2.1451265348995756, "language_loss": 0.8588351, "learning_rate": 1.4779740814039023e-06, "loss": 0.88083434, "num_input_tokens_seen": 106974845, "step": 4961, "time_per_iteration": 4.701030731201172 }, { "auxiliary_loss_clip": 0.01171746, "auxiliary_loss_mlp": 0.01026154, "balance_loss_clip": 1.04858279, "balance_loss_mlp": 1.01863217, "epoch": 0.5966452233511693, "flos": 30773936442240.0, "grad_norm": 1.924235946800698, "language_loss": 0.68528461, "learning_rate": 1.4772221521920894e-06, "loss": 0.70726359, "num_input_tokens_seen": 106994870, "step": 4962, "time_per_iteration": 2.8052823543548584 }, { "auxiliary_loss_clip": 0.01166487, "auxiliary_loss_mlp": 0.01025998, "balance_loss_clip": 1.0502696, "balance_loss_mlp": 1.01879811, "epoch": 0.5967654662418085, "flos": 25481170477440.0, "grad_norm": 1.935142347612784, "language_loss": 0.74228013, "learning_rate": 1.4764703022852598e-06, "loss": 0.76420498, "num_input_tokens_seen": 107015390, "step": 4963, "time_per_iteration": 2.771674394607544 }, { "auxiliary_loss_clip": 0.01147401, "auxiliary_loss_mlp": 0.01024846, "balance_loss_clip": 1.04782081, "balance_loss_mlp": 1.01804519, "epoch": 0.5968857091324475, "flos": 19099126621440.0, "grad_norm": 1.8852848068322032, "language_loss": 0.76863575, "learning_rate": 1.4757185317974696e-06, "loss": 0.79035825, "num_input_tokens_seen": 107033775, "step": 4964, "time_per_iteration": 2.906301498413086 }, { "auxiliary_loss_clip": 0.01170679, "auxiliary_loss_mlp": 0.01030673, "balance_loss_clip": 1.04793382, "balance_loss_mlp": 1.02254891, "epoch": 0.5970059520230866, "flos": 23692711374720.0, "grad_norm": 2.6768518757036293, "language_loss": 0.70829099, "learning_rate": 1.474966840842761e-06, "loss": 0.73030454, "num_input_tokens_seen": 107053355, "step": 4965, "time_per_iteration": 2.804326295852661 }, { "auxiliary_loss_clip": 0.01175175, "auxiliary_loss_mlp": 0.01023745, "balance_loss_clip": 1.05079699, "balance_loss_mlp": 1.01636553, "epoch": 0.5971261949137258, "flos": 23185545292800.0, "grad_norm": 1.7196198601084907, "language_loss": 0.86842114, "learning_rate": 1.4742152295351655e-06, "loss": 0.8904103, "num_input_tokens_seen": 107072510, "step": 4966, "time_per_iteration": 2.9145407676696777 }, { "auxiliary_loss_clip": 0.01168043, "auxiliary_loss_mlp": 0.01054904, "balance_loss_clip": 1.04696155, "balance_loss_mlp": 1.01803803, "epoch": 0.5972464378043648, "flos": 20557710195840.0, "grad_norm": 2.685532398997019, "language_loss": 0.64106023, "learning_rate": 1.4734636979887016e-06, "loss": 0.66328979, "num_input_tokens_seen": 107089970, "step": 4967, "time_per_iteration": 2.779280424118042 }, { "auxiliary_loss_clip": 0.01170334, "auxiliary_loss_mlp": 0.01025363, "balance_loss_clip": 1.04979992, "balance_loss_mlp": 1.01759028, "epoch": 0.5973666806950039, "flos": 29387030457600.0, "grad_norm": 2.3308570552745627, "language_loss": 0.90701699, "learning_rate": 1.4727122463173755e-06, "loss": 0.92897391, "num_input_tokens_seen": 107108500, "step": 4968, "time_per_iteration": 2.8214542865753174 }, { "auxiliary_loss_clip": 0.01163674, "auxiliary_loss_mlp": 0.01024447, "balance_loss_clip": 1.04806077, "balance_loss_mlp": 1.01710367, "epoch": 0.597486923585643, "flos": 22273522041600.0, "grad_norm": 3.7260862510775086, "language_loss": 0.64297426, "learning_rate": 1.471960874635183e-06, "loss": 0.66485548, "num_input_tokens_seen": 107128060, "step": 4969, "time_per_iteration": 3.7805566787719727 }, { "auxiliary_loss_clip": 0.01166308, "auxiliary_loss_mlp": 0.01027785, "balance_loss_clip": 1.05029535, "balance_loss_mlp": 1.02041483, "epoch": 0.5976071664762821, "flos": 13772461196160.0, "grad_norm": 2.156567550970377, "language_loss": 0.7083292, "learning_rate": 1.4712095830561055e-06, "loss": 0.73027015, "num_input_tokens_seen": 107146550, "step": 4970, "time_per_iteration": 2.8277461528778076 }, { "auxiliary_loss_clip": 0.01167363, "auxiliary_loss_mlp": 0.01026276, "balance_loss_clip": 1.04804289, "balance_loss_mlp": 1.01918316, "epoch": 0.5977274093669211, "flos": 19098623831040.0, "grad_norm": 2.154068511832698, "language_loss": 0.81144273, "learning_rate": 1.4704583716941147e-06, "loss": 0.83337915, "num_input_tokens_seen": 107165415, "step": 4971, "time_per_iteration": 2.798649311065674 }, { "auxiliary_loss_clip": 0.01164861, "auxiliary_loss_mlp": 0.01027188, "balance_loss_clip": 1.05111194, "balance_loss_mlp": 1.019279, "epoch": 0.5978476522575603, "flos": 20376002269440.0, "grad_norm": 2.1379054803253377, "language_loss": 0.72228789, "learning_rate": 1.4697072406631672e-06, "loss": 0.74420846, "num_input_tokens_seen": 107185320, "step": 4972, "time_per_iteration": 2.844294548034668 }, { "auxiliary_loss_clip": 0.0116526, "auxiliary_loss_mlp": 0.01025099, "balance_loss_clip": 1.05232894, "balance_loss_mlp": 1.01649785, "epoch": 0.5979678951481994, "flos": 29023147728000.0, "grad_norm": 2.629992276811104, "language_loss": 0.72797292, "learning_rate": 1.4689561900772097e-06, "loss": 0.7498765, "num_input_tokens_seen": 107205380, "step": 4973, "time_per_iteration": 2.7958219051361084 }, { "auxiliary_loss_clip": 0.01166451, "auxiliary_loss_mlp": 0.01026872, "balance_loss_clip": 1.04813385, "balance_loss_mlp": 1.019207, "epoch": 0.5980881380388384, "flos": 17967689141760.0, "grad_norm": 2.8191958336621554, "language_loss": 0.72320437, "learning_rate": 1.4682052200501758e-06, "loss": 0.74513763, "num_input_tokens_seen": 107222585, "step": 4974, "time_per_iteration": 2.725220203399658 }, { "auxiliary_loss_clip": 0.01169364, "auxiliary_loss_mlp": 0.01026789, "balance_loss_clip": 1.04610813, "balance_loss_mlp": 1.01935017, "epoch": 0.5982083809294776, "flos": 22962827013120.0, "grad_norm": 1.92468560521614, "language_loss": 0.80073833, "learning_rate": 1.4674543306959876e-06, "loss": 0.8226999, "num_input_tokens_seen": 107242055, "step": 4975, "time_per_iteration": 2.6430470943450928 }, { "auxiliary_loss_clip": 0.01172545, "auxiliary_loss_mlp": 0.01026324, "balance_loss_clip": 1.05051351, "balance_loss_mlp": 1.01858735, "epoch": 0.5983286238201166, "flos": 20991941712000.0, "grad_norm": 2.2137874513218296, "language_loss": 0.84494692, "learning_rate": 1.4667035221285535e-06, "loss": 0.86693561, "num_input_tokens_seen": 107259695, "step": 4976, "time_per_iteration": 2.7100369930267334 }, { "auxiliary_loss_clip": 0.01167304, "auxiliary_loss_mlp": 0.01025752, "balance_loss_clip": 1.04862094, "balance_loss_mlp": 1.01833451, "epoch": 0.5984488667107557, "flos": 28183448511360.0, "grad_norm": 1.8129071481114711, "language_loss": 0.74314356, "learning_rate": 1.4659527944617715e-06, "loss": 0.76507407, "num_input_tokens_seen": 107279640, "step": 4977, "time_per_iteration": 2.68664813041687 }, { "auxiliary_loss_clip": 0.01154047, "auxiliary_loss_mlp": 0.01024293, "balance_loss_clip": 1.04879093, "balance_loss_mlp": 1.01655054, "epoch": 0.5985691096013949, "flos": 16471794314880.0, "grad_norm": 1.6828372723159566, "language_loss": 0.75908792, "learning_rate": 1.465202147809526e-06, "loss": 0.78087133, "num_input_tokens_seen": 107298135, "step": 4978, "time_per_iteration": 2.759469747543335 }, { "auxiliary_loss_clip": 0.01174445, "auxiliary_loss_mlp": 0.01023469, "balance_loss_clip": 1.0498457, "balance_loss_mlp": 1.01601839, "epoch": 0.5986893524920339, "flos": 26719046933760.0, "grad_norm": 1.8229813481706942, "language_loss": 0.76243728, "learning_rate": 1.4644515822856888e-06, "loss": 0.78441644, "num_input_tokens_seen": 107316570, "step": 4979, "time_per_iteration": 2.726862668991089 }, { "auxiliary_loss_clip": 0.01071511, "auxiliary_loss_mlp": 0.01001476, "balance_loss_clip": 1.01514316, "balance_loss_mlp": 1.00042105, "epoch": 0.598809595382673, "flos": 61608061100160.0, "grad_norm": 0.7567405585710535, "language_loss": 0.56547874, "learning_rate": 1.4637010980041215e-06, "loss": 0.58620864, "num_input_tokens_seen": 107378680, "step": 4980, "time_per_iteration": 3.3055460453033447 }, { "auxiliary_loss_clip": 0.01176377, "auxiliary_loss_mlp": 0.01027534, "balance_loss_clip": 1.05154216, "balance_loss_mlp": 1.0200181, "epoch": 0.5989298382733121, "flos": 11801719549440.0, "grad_norm": 5.18288206929954, "language_loss": 0.89890063, "learning_rate": 1.4629506950786707e-06, "loss": 0.92093974, "num_input_tokens_seen": 107394860, "step": 4981, "time_per_iteration": 2.7301433086395264 }, { "auxiliary_loss_clip": 0.01069734, "auxiliary_loss_mlp": 0.00999894, "balance_loss_clip": 1.01327956, "balance_loss_mlp": 0.9988389, "epoch": 0.5990500811639512, "flos": 60025800021120.0, "grad_norm": 0.8488581078848312, "language_loss": 0.56063181, "learning_rate": 1.4622003736231733e-06, "loss": 0.58132809, "num_input_tokens_seen": 107453850, "step": 4982, "time_per_iteration": 3.3148131370544434 }, { "auxiliary_loss_clip": 0.01167192, "auxiliary_loss_mlp": 0.0102358, "balance_loss_clip": 1.04824102, "balance_loss_mlp": 1.01557541, "epoch": 0.5991703240545903, "flos": 18222726683520.0, "grad_norm": 6.696215709591923, "language_loss": 0.80656874, "learning_rate": 1.461450133751451e-06, "loss": 0.82847655, "num_input_tokens_seen": 107471920, "step": 4983, "time_per_iteration": 2.7349233627319336 }, { "auxiliary_loss_clip": 0.01172394, "auxiliary_loss_mlp": 0.01026233, "balance_loss_clip": 1.0484736, "balance_loss_mlp": 1.01879478, "epoch": 0.5992905669452293, "flos": 27709894581120.0, "grad_norm": 1.8044015170778802, "language_loss": 0.7593872, "learning_rate": 1.4606999755773153e-06, "loss": 0.7813735, "num_input_tokens_seen": 107493125, "step": 4984, "time_per_iteration": 2.7150843143463135 }, { "auxiliary_loss_clip": 0.0117338, "auxiliary_loss_mlp": 0.01024581, "balance_loss_clip": 1.04956949, "balance_loss_mlp": 1.01746178, "epoch": 0.5994108098358685, "flos": 20449008662400.0, "grad_norm": 1.676762624365854, "language_loss": 0.82373273, "learning_rate": 1.4599498992145643e-06, "loss": 0.84571242, "num_input_tokens_seen": 107513150, "step": 4985, "time_per_iteration": 2.6706128120422363 }, { "auxiliary_loss_clip": 0.01172188, "auxiliary_loss_mlp": 0.01050571, "balance_loss_clip": 1.05007637, "balance_loss_mlp": 1.01635051, "epoch": 0.5995310527265075, "flos": 22269966595200.0, "grad_norm": 1.978505729154591, "language_loss": 0.70891011, "learning_rate": 1.4591999047769846e-06, "loss": 0.73113763, "num_input_tokens_seen": 107532005, "step": 4986, "time_per_iteration": 2.6962225437164307 }, { "auxiliary_loss_clip": 0.01157632, "auxiliary_loss_mlp": 0.01030658, "balance_loss_clip": 1.05016494, "balance_loss_mlp": 1.02281439, "epoch": 0.5996512956171466, "flos": 18916951818240.0, "grad_norm": 1.920536413396067, "language_loss": 0.7509377, "learning_rate": 1.4584499923783486e-06, "loss": 0.77282059, "num_input_tokens_seen": 107550585, "step": 4987, "time_per_iteration": 4.719727993011475 }, { "auxiliary_loss_clip": 0.01166757, "auxiliary_loss_mlp": 0.01023117, "balance_loss_clip": 1.04911399, "balance_loss_mlp": 1.01579773, "epoch": 0.5997715385077858, "flos": 15370916330880.0, "grad_norm": 1.8499694577282702, "language_loss": 0.76206374, "learning_rate": 1.457700162132419e-06, "loss": 0.78396249, "num_input_tokens_seen": 107567575, "step": 4988, "time_per_iteration": 2.7278919219970703 }, { "auxiliary_loss_clip": 0.01157867, "auxiliary_loss_mlp": 0.01024289, "balance_loss_clip": 1.04920268, "balance_loss_mlp": 1.01701152, "epoch": 0.5998917813984248, "flos": 25264844818560.0, "grad_norm": 2.2616469545217965, "language_loss": 0.72434098, "learning_rate": 1.4569504141529433e-06, "loss": 0.74616253, "num_input_tokens_seen": 107585410, "step": 4989, "time_per_iteration": 2.791685104370117 }, { "auxiliary_loss_clip": 0.01170971, "auxiliary_loss_mlp": 0.01025921, "balance_loss_clip": 1.05078375, "balance_loss_mlp": 1.01701617, "epoch": 0.6000120242890639, "flos": 22054502862720.0, "grad_norm": 2.0378922278997207, "language_loss": 0.71844047, "learning_rate": 1.456200748553658e-06, "loss": 0.74040937, "num_input_tokens_seen": 107603405, "step": 4990, "time_per_iteration": 2.6603057384490967 }, { "auxiliary_loss_clip": 0.01176607, "auxiliary_loss_mlp": 0.01027814, "balance_loss_clip": 1.05162668, "balance_loss_mlp": 1.02029228, "epoch": 0.600132267179703, "flos": 29863421562240.0, "grad_norm": 1.5346090026573247, "language_loss": 0.78490734, "learning_rate": 1.455451165448287e-06, "loss": 0.80695158, "num_input_tokens_seen": 107626060, "step": 4991, "time_per_iteration": 2.7353408336639404 }, { "auxiliary_loss_clip": 0.01164317, "auxiliary_loss_mlp": 0.01025807, "balance_loss_clip": 1.04808164, "balance_loss_mlp": 1.01840127, "epoch": 0.6002525100703421, "flos": 25045358762880.0, "grad_norm": 2.698049803062417, "language_loss": 0.73618317, "learning_rate": 1.4547016649505407e-06, "loss": 0.75808436, "num_input_tokens_seen": 107644070, "step": 4992, "time_per_iteration": 2.7075114250183105 }, { "auxiliary_loss_clip": 0.01164952, "auxiliary_loss_mlp": 0.01029335, "balance_loss_clip": 1.04996538, "balance_loss_mlp": 1.02153254, "epoch": 0.6003727529609811, "flos": 20849592113280.0, "grad_norm": 2.1245193490076932, "language_loss": 0.84661961, "learning_rate": 1.4539522471741193e-06, "loss": 0.8685624, "num_input_tokens_seen": 107661495, "step": 4993, "time_per_iteration": 277.82844376564026 }, { "auxiliary_loss_clip": 0.01175051, "auxiliary_loss_mlp": 0.01025453, "balance_loss_clip": 1.05100965, "balance_loss_mlp": 1.01744199, "epoch": 0.6004929958516203, "flos": 15594604277760.0, "grad_norm": 3.133107321303171, "language_loss": 0.70602256, "learning_rate": 1.4532029122327067e-06, "loss": 0.72802758, "num_input_tokens_seen": 107678280, "step": 4994, "time_per_iteration": 2.792638063430786 }, { "auxiliary_loss_clip": 0.01158855, "auxiliary_loss_mlp": 0.01029583, "balance_loss_clip": 1.05128264, "balance_loss_mlp": 1.02235341, "epoch": 0.6006132387422594, "flos": 21763267390080.0, "grad_norm": 4.956419306221959, "language_loss": 0.75292861, "learning_rate": 1.4524536602399783e-06, "loss": 0.77481306, "num_input_tokens_seen": 107697370, "step": 4995, "time_per_iteration": 3.769906997680664 }, { "auxiliary_loss_clip": 0.0116469, "auxiliary_loss_mlp": 0.01029245, "balance_loss_clip": 1.05048144, "balance_loss_mlp": 1.02196455, "epoch": 0.6007334816328984, "flos": 22858542852480.0, "grad_norm": 1.9846647830812827, "language_loss": 0.77456844, "learning_rate": 1.4517044913095938e-06, "loss": 0.79650778, "num_input_tokens_seen": 107717790, "step": 4996, "time_per_iteration": 2.780261993408203 }, { "auxiliary_loss_clip": 0.01171296, "auxiliary_loss_mlp": 0.01028391, "balance_loss_clip": 1.05052161, "balance_loss_mlp": 1.02089262, "epoch": 0.6008537245235376, "flos": 28324577047680.0, "grad_norm": 1.702795305996676, "language_loss": 0.81419337, "learning_rate": 1.4509554055552022e-06, "loss": 0.83619022, "num_input_tokens_seen": 107738020, "step": 4997, "time_per_iteration": 2.8252909183502197 }, { "auxiliary_loss_clip": 0.01164068, "auxiliary_loss_mlp": 0.01027279, "balance_loss_clip": 1.04924774, "balance_loss_mlp": 1.01963186, "epoch": 0.6009739674141766, "flos": 20886113266560.0, "grad_norm": 2.399945162707192, "language_loss": 0.83580691, "learning_rate": 1.450206403090439e-06, "loss": 0.85772038, "num_input_tokens_seen": 107756215, "step": 4998, "time_per_iteration": 2.7880914211273193 }, { "auxiliary_loss_clip": 0.01168022, "auxiliary_loss_mlp": 0.01026718, "balance_loss_clip": 1.05029917, "balance_loss_mlp": 1.01927316, "epoch": 0.6010942103048157, "flos": 20481004702080.0, "grad_norm": 2.134103053497071, "language_loss": 0.86282825, "learning_rate": 1.4494574840289274e-06, "loss": 0.88477564, "num_input_tokens_seen": 107773330, "step": 4999, "time_per_iteration": 2.693432331085205 }, { "auxiliary_loss_clip": 0.01175371, "auxiliary_loss_mlp": 0.01024493, "balance_loss_clip": 1.04937315, "balance_loss_mlp": 1.01610053, "epoch": 0.6012144531954549, "flos": 23805973935360.0, "grad_norm": 1.8755735169796954, "language_loss": 0.73541892, "learning_rate": 1.4487086484842782e-06, "loss": 0.75741756, "num_input_tokens_seen": 107791975, "step": 5000, "time_per_iteration": 2.814760208129883 }, { "auxiliary_loss_clip": 0.01172272, "auxiliary_loss_mlp": 0.01032363, "balance_loss_clip": 1.04928529, "balance_loss_mlp": 1.02467656, "epoch": 0.6013346960860939, "flos": 18988378012800.0, "grad_norm": 2.063844177940656, "language_loss": 0.59857064, "learning_rate": 1.4479598965700878e-06, "loss": 0.62061703, "num_input_tokens_seen": 107809240, "step": 5001, "time_per_iteration": 2.6570968627929688 }, { "auxiliary_loss_clip": 0.01164145, "auxiliary_loss_mlp": 0.01022592, "balance_loss_clip": 1.04982758, "balance_loss_mlp": 1.01472116, "epoch": 0.601454938976733, "flos": 24025316336640.0, "grad_norm": 2.5957800433851714, "language_loss": 0.68938863, "learning_rate": 1.4472112283999427e-06, "loss": 0.71125603, "num_input_tokens_seen": 107827895, "step": 5002, "time_per_iteration": 2.8022451400756836 }, { "auxiliary_loss_clip": 0.01162659, "auxiliary_loss_mlp": 0.01027466, "balance_loss_clip": 1.04822516, "balance_loss_mlp": 1.0202893, "epoch": 0.6015751818673721, "flos": 26427129102720.0, "grad_norm": 2.194835710426609, "language_loss": 0.69452226, "learning_rate": 1.4464626440874143e-06, "loss": 0.71642351, "num_input_tokens_seen": 107847010, "step": 5003, "time_per_iteration": 2.7099742889404297 }, { "auxiliary_loss_clip": 0.01171455, "auxiliary_loss_mlp": 0.01030185, "balance_loss_clip": 1.05035532, "balance_loss_mlp": 1.02255011, "epoch": 0.6016954247580112, "flos": 13115260005120.0, "grad_norm": 2.4615288011715566, "language_loss": 0.74141693, "learning_rate": 1.4457141437460636e-06, "loss": 0.76343334, "num_input_tokens_seen": 107864235, "step": 5004, "time_per_iteration": 2.6550452709198 }, { "auxiliary_loss_clip": 0.01169282, "auxiliary_loss_mlp": 0.01025659, "balance_loss_clip": 1.05079985, "balance_loss_mlp": 1.01775575, "epoch": 0.6018156676486502, "flos": 23768447201280.0, "grad_norm": 2.0974890816859464, "language_loss": 0.73284787, "learning_rate": 1.444965727489436e-06, "loss": 0.75479728, "num_input_tokens_seen": 107883680, "step": 5005, "time_per_iteration": 2.769094944000244 }, { "auxiliary_loss_clip": 0.01161525, "auxiliary_loss_mlp": 0.01032496, "balance_loss_clip": 1.04783106, "balance_loss_mlp": 1.02408552, "epoch": 0.6019359105392894, "flos": 26469360518400.0, "grad_norm": 1.9471345126331778, "language_loss": 0.63436729, "learning_rate": 1.444217395431066e-06, "loss": 0.65630746, "num_input_tokens_seen": 107906220, "step": 5006, "time_per_iteration": 2.707683563232422 }, { "auxiliary_loss_clip": 0.01069282, "auxiliary_loss_mlp": 0.01004353, "balance_loss_clip": 1.0178988, "balance_loss_mlp": 1.0032742, "epoch": 0.6020561534299285, "flos": 69190849728000.0, "grad_norm": 0.843930516487551, "language_loss": 0.55773348, "learning_rate": 1.4434691476844755e-06, "loss": 0.57846981, "num_input_tokens_seen": 107967195, "step": 5007, "time_per_iteration": 3.2304913997650146 }, { "auxiliary_loss_clip": 0.01164591, "auxiliary_loss_mlp": 0.01028635, "balance_loss_clip": 1.05105007, "balance_loss_mlp": 1.02138722, "epoch": 0.6021763963205675, "flos": 21835304115840.0, "grad_norm": 3.922023245979794, "language_loss": 0.67254305, "learning_rate": 1.4427209843631729e-06, "loss": 0.69447535, "num_input_tokens_seen": 107984245, "step": 5008, "time_per_iteration": 2.741380453109741 }, { "auxiliary_loss_clip": 0.01172624, "auxiliary_loss_mlp": 0.01052872, "balance_loss_clip": 1.05052626, "balance_loss_mlp": 1.01650524, "epoch": 0.6022966392112067, "flos": 26578636669440.0, "grad_norm": 2.0156749006997283, "language_loss": 0.81103873, "learning_rate": 1.4419729055806534e-06, "loss": 0.83329368, "num_input_tokens_seen": 108003680, "step": 5009, "time_per_iteration": 2.8303844928741455 }, { "auxiliary_loss_clip": 0.01163862, "auxiliary_loss_mlp": 0.01049879, "balance_loss_clip": 1.05188727, "balance_loss_mlp": 1.01503825, "epoch": 0.6024168821018457, "flos": 20703722981760.0, "grad_norm": 1.8962483031620276, "language_loss": 0.82068968, "learning_rate": 1.441224911450401e-06, "loss": 0.84282708, "num_input_tokens_seen": 108019635, "step": 5010, "time_per_iteration": 2.74360728263855 }, { "auxiliary_loss_clip": 0.01174656, "auxiliary_loss_mlp": 0.01023585, "balance_loss_clip": 1.05055761, "balance_loss_mlp": 1.015854, "epoch": 0.6025371249924848, "flos": 24680973242880.0, "grad_norm": 1.8303630155448156, "language_loss": 0.82350153, "learning_rate": 1.4404770020858851e-06, "loss": 0.84548396, "num_input_tokens_seen": 108039120, "step": 5011, "time_per_iteration": 2.696857213973999 }, { "auxiliary_loss_clip": 0.01164179, "auxiliary_loss_mlp": 0.01028143, "balance_loss_clip": 1.04933321, "balance_loss_mlp": 1.020818, "epoch": 0.602657367883124, "flos": 25955801815680.0, "grad_norm": 1.5952820018578155, "language_loss": 0.86294913, "learning_rate": 1.439729177600563e-06, "loss": 0.88487238, "num_input_tokens_seen": 108059615, "step": 5012, "time_per_iteration": 3.629210948944092 }, { "auxiliary_loss_clip": 0.01167553, "auxiliary_loss_mlp": 0.01021185, "balance_loss_clip": 1.04922152, "balance_loss_mlp": 1.01387191, "epoch": 0.602777610773763, "flos": 16690633925760.0, "grad_norm": 2.5214854183242434, "language_loss": 0.73197109, "learning_rate": 1.4389814381078793e-06, "loss": 0.75385845, "num_input_tokens_seen": 108078855, "step": 5013, "time_per_iteration": 4.832350730895996 }, { "auxiliary_loss_clip": 0.01151532, "auxiliary_loss_mlp": 0.01025934, "balance_loss_clip": 1.05224168, "balance_loss_mlp": 1.01886475, "epoch": 0.6028978536644021, "flos": 13334243270400.0, "grad_norm": 2.111655531135527, "language_loss": 0.8000344, "learning_rate": 1.438233783721265e-06, "loss": 0.82180905, "num_input_tokens_seen": 108095020, "step": 5014, "time_per_iteration": 2.848358631134033 }, { "auxiliary_loss_clip": 0.01162029, "auxiliary_loss_mlp": 0.01029081, "balance_loss_clip": 1.05137527, "balance_loss_mlp": 1.02085209, "epoch": 0.6030180965550412, "flos": 19644825018240.0, "grad_norm": 2.5448040482339307, "language_loss": 0.7810778, "learning_rate": 1.43748621455414e-06, "loss": 0.80298889, "num_input_tokens_seen": 108111455, "step": 5015, "time_per_iteration": 2.7264907360076904 }, { "auxiliary_loss_clip": 0.01165502, "auxiliary_loss_mlp": 0.01032415, "balance_loss_clip": 1.05055237, "balance_loss_mlp": 1.0246011, "epoch": 0.6031383394456803, "flos": 14458390289280.0, "grad_norm": 2.195607150257509, "language_loss": 0.80905247, "learning_rate": 1.4367387307199082e-06, "loss": 0.83103162, "num_input_tokens_seen": 108128305, "step": 5016, "time_per_iteration": 2.6758334636688232 }, { "auxiliary_loss_clip": 0.01165269, "auxiliary_loss_mlp": 0.0102597, "balance_loss_clip": 1.04934621, "balance_loss_mlp": 1.01858497, "epoch": 0.6032585823363193, "flos": 13917791623680.0, "grad_norm": 1.8701386714357335, "language_loss": 0.82637787, "learning_rate": 1.4359913323319632e-06, "loss": 0.84829032, "num_input_tokens_seen": 108145475, "step": 5017, "time_per_iteration": 2.861262798309326 }, { "auxiliary_loss_clip": 0.01149295, "auxiliary_loss_mlp": 0.01028663, "balance_loss_clip": 1.04861259, "balance_loss_mlp": 1.02102733, "epoch": 0.6033788252269584, "flos": 24353252530560.0, "grad_norm": 1.6888764725425367, "language_loss": 0.77542233, "learning_rate": 1.4352440195036847e-06, "loss": 0.79720187, "num_input_tokens_seen": 108165650, "step": 5018, "time_per_iteration": 2.8699185848236084 }, { "auxiliary_loss_clip": 0.01160289, "auxiliary_loss_mlp": 0.01027595, "balance_loss_clip": 1.04847383, "balance_loss_mlp": 1.02011466, "epoch": 0.6034990681175976, "flos": 25521247077120.0, "grad_norm": 1.7959249285941967, "language_loss": 0.79934609, "learning_rate": 1.4344967923484395e-06, "loss": 0.82122493, "num_input_tokens_seen": 108187620, "step": 5019, "time_per_iteration": 2.933424711227417 }, { "auxiliary_loss_clip": 0.0116716, "auxiliary_loss_mlp": 0.01023295, "balance_loss_clip": 1.0494318, "balance_loss_mlp": 1.01581466, "epoch": 0.6036193110082366, "flos": 25958387594880.0, "grad_norm": 2.027581090208895, "language_loss": 0.7194469, "learning_rate": 1.433749650979581e-06, "loss": 0.74135143, "num_input_tokens_seen": 108207605, "step": 5020, "time_per_iteration": 2.9466753005981445 }, { "auxiliary_loss_clip": 0.01167884, "auxiliary_loss_mlp": 0.01022736, "balance_loss_clip": 1.04888725, "balance_loss_mlp": 1.01500535, "epoch": 0.6037395538988757, "flos": 25593427457280.0, "grad_norm": 1.8254174780019543, "language_loss": 0.67879277, "learning_rate": 1.433002595510451e-06, "loss": 0.70069897, "num_input_tokens_seen": 108226385, "step": 5021, "time_per_iteration": 3.7132697105407715 }, { "auxiliary_loss_clip": 0.01164198, "auxiliary_loss_mlp": 0.01055311, "balance_loss_clip": 1.04968119, "balance_loss_mlp": 1.01810193, "epoch": 0.6038597967895148, "flos": 17816253402240.0, "grad_norm": 1.8685736310879617, "language_loss": 0.72143221, "learning_rate": 1.4322556260543757e-06, "loss": 0.74362731, "num_input_tokens_seen": 108242960, "step": 5022, "time_per_iteration": 2.7657663822174072 }, { "auxiliary_loss_clip": 0.01068663, "auxiliary_loss_mlp": 0.01001009, "balance_loss_clip": 1.01430476, "balance_loss_mlp": 0.999924, "epoch": 0.6039800396801539, "flos": 65169213235200.0, "grad_norm": 0.9309332381974719, "language_loss": 0.6270985, "learning_rate": 1.4315087427246703e-06, "loss": 0.6477952, "num_input_tokens_seen": 108296785, "step": 5023, "time_per_iteration": 3.2070393562316895 }, { "auxiliary_loss_clip": 0.01069515, "auxiliary_loss_mlp": 0.01000834, "balance_loss_clip": 1.01310527, "balance_loss_mlp": 0.99986279, "epoch": 0.604100282570793, "flos": 67386409073280.0, "grad_norm": 1.1642425965371286, "language_loss": 0.5847708, "learning_rate": 1.4307619456346372e-06, "loss": 0.60547429, "num_input_tokens_seen": 108341090, "step": 5024, "time_per_iteration": 2.973417282104492 }, { "auxiliary_loss_clip": 0.0116974, "auxiliary_loss_mlp": 0.01021001, "balance_loss_clip": 1.04648852, "balance_loss_mlp": 1.01397967, "epoch": 0.6042205254614321, "flos": 35297495631360.0, "grad_norm": 2.197183220273045, "language_loss": 0.7412284, "learning_rate": 1.430015234897564e-06, "loss": 0.76313579, "num_input_tokens_seen": 108364370, "step": 5025, "time_per_iteration": 2.8269996643066406 }, { "auxiliary_loss_clip": 0.0117224, "auxiliary_loss_mlp": 0.01059874, "balance_loss_clip": 1.04875708, "balance_loss_mlp": 1.02236938, "epoch": 0.6043407683520712, "flos": 45658262206080.0, "grad_norm": 6.303825349585108, "language_loss": 0.66415274, "learning_rate": 1.4292686106267274e-06, "loss": 0.68647391, "num_input_tokens_seen": 108387220, "step": 5026, "time_per_iteration": 2.8674795627593994 }, { "auxiliary_loss_clip": 0.01174427, "auxiliary_loss_mlp": 0.01026325, "balance_loss_clip": 1.04949725, "balance_loss_mlp": 1.01896358, "epoch": 0.6044610112427102, "flos": 16180020138240.0, "grad_norm": 1.5608184378363568, "language_loss": 0.77388936, "learning_rate": 1.4285220729353876e-06, "loss": 0.79589689, "num_input_tokens_seen": 108405760, "step": 5027, "time_per_iteration": 2.7255403995513916 }, { "auxiliary_loss_clip": 0.01168432, "auxiliary_loss_mlp": 0.01024789, "balance_loss_clip": 1.05014682, "balance_loss_mlp": 1.01739788, "epoch": 0.6045812541333494, "flos": 13804062186240.0, "grad_norm": 1.9196672299084656, "language_loss": 0.77963471, "learning_rate": 1.4277756219367957e-06, "loss": 0.80156696, "num_input_tokens_seen": 108422785, "step": 5028, "time_per_iteration": 2.7247660160064697 }, { "auxiliary_loss_clip": 0.01171986, "auxiliary_loss_mlp": 0.01024303, "balance_loss_clip": 1.05180967, "balance_loss_mlp": 1.01656055, "epoch": 0.6047014970239885, "flos": 19975059682560.0, "grad_norm": 2.186716466394789, "language_loss": 0.79620081, "learning_rate": 1.4270292577441864e-06, "loss": 0.81816375, "num_input_tokens_seen": 108442290, "step": 5029, "time_per_iteration": 2.9409899711608887 }, { "auxiliary_loss_clip": 0.0117165, "auxiliary_loss_mlp": 0.01023505, "balance_loss_clip": 1.04757285, "balance_loss_mlp": 1.01639378, "epoch": 0.6048217399146275, "flos": 25337097025920.0, "grad_norm": 1.8660697274469846, "language_loss": 0.7185235, "learning_rate": 1.4262829804707836e-06, "loss": 0.74047506, "num_input_tokens_seen": 108464280, "step": 5030, "time_per_iteration": 2.834071397781372 }, { "auxiliary_loss_clip": 0.01173777, "auxiliary_loss_mlp": 0.01030512, "balance_loss_clip": 1.05149436, "balance_loss_mlp": 1.02315402, "epoch": 0.6049419828052667, "flos": 26030819370240.0, "grad_norm": 1.679038446736178, "language_loss": 0.70071483, "learning_rate": 1.4255367902297958e-06, "loss": 0.72275776, "num_input_tokens_seen": 108485610, "step": 5031, "time_per_iteration": 2.6666464805603027 }, { "auxiliary_loss_clip": 0.01170684, "auxiliary_loss_mlp": 0.01027125, "balance_loss_clip": 1.05003715, "balance_loss_mlp": 1.01985586, "epoch": 0.6050622256959057, "flos": 14648106948480.0, "grad_norm": 2.157130376130479, "language_loss": 0.78883404, "learning_rate": 1.4247906871344215e-06, "loss": 0.81081212, "num_input_tokens_seen": 108501005, "step": 5032, "time_per_iteration": 2.773513078689575 }, { "auxiliary_loss_clip": 0.01160828, "auxiliary_loss_mlp": 0.01023891, "balance_loss_clip": 1.04753184, "balance_loss_mlp": 1.01672685, "epoch": 0.6051824685865448, "flos": 23331450337920.0, "grad_norm": 2.1300279971180895, "language_loss": 0.75185865, "learning_rate": 1.4240446712978415e-06, "loss": 0.77370584, "num_input_tokens_seen": 108519990, "step": 5033, "time_per_iteration": 2.773822784423828 }, { "auxiliary_loss_clip": 0.01170274, "auxiliary_loss_mlp": 0.01028105, "balance_loss_clip": 1.04779601, "balance_loss_mlp": 1.02062488, "epoch": 0.605302711477184, "flos": 27563307177600.0, "grad_norm": 1.8241149368904341, "language_loss": 0.7475996, "learning_rate": 1.423298742833227e-06, "loss": 0.7695834, "num_input_tokens_seen": 108538650, "step": 5034, "time_per_iteration": 2.7276082038879395 }, { "auxiliary_loss_clip": 0.01167015, "auxiliary_loss_mlp": 0.010249, "balance_loss_clip": 1.04713273, "balance_loss_mlp": 1.01782155, "epoch": 0.605422954367823, "flos": 15154698412800.0, "grad_norm": 4.945460688696777, "language_loss": 0.71890259, "learning_rate": 1.4225529018537352e-06, "loss": 0.74082172, "num_input_tokens_seen": 108554155, "step": 5035, "time_per_iteration": 2.7021541595458984 }, { "auxiliary_loss_clip": 0.01169563, "auxiliary_loss_mlp": 0.01026791, "balance_loss_clip": 1.04794574, "balance_loss_mlp": 1.01963806, "epoch": 0.6055431972584621, "flos": 27673912131840.0, "grad_norm": 1.7678529712786772, "language_loss": 0.78020936, "learning_rate": 1.4218071484725082e-06, "loss": 0.80217296, "num_input_tokens_seen": 108576275, "step": 5036, "time_per_iteration": 2.808255672454834 }, { "auxiliary_loss_clip": 0.01163739, "auxiliary_loss_mlp": 0.01029325, "balance_loss_clip": 1.05056524, "balance_loss_mlp": 1.02127862, "epoch": 0.6056634401491012, "flos": 19387489006080.0, "grad_norm": 2.520461905033289, "language_loss": 0.76503599, "learning_rate": 1.4210614828026786e-06, "loss": 0.78696656, "num_input_tokens_seen": 108594125, "step": 5037, "time_per_iteration": 2.841766357421875 }, { "auxiliary_loss_clip": 0.01169537, "auxiliary_loss_mlp": 0.01023122, "balance_loss_clip": 1.04694557, "balance_loss_mlp": 1.01623201, "epoch": 0.6057836830397403, "flos": 24789459294720.0, "grad_norm": 1.4756136510477758, "language_loss": 0.74346817, "learning_rate": 1.4203159049573605e-06, "loss": 0.76539481, "num_input_tokens_seen": 108615360, "step": 5038, "time_per_iteration": 3.8576583862304688 }, { "auxiliary_loss_clip": 0.01172656, "auxiliary_loss_mlp": 0.0102738, "balance_loss_clip": 1.04887033, "balance_loss_mlp": 1.01982856, "epoch": 0.6059039259303793, "flos": 20558248899840.0, "grad_norm": 2.428549294972929, "language_loss": 0.87092459, "learning_rate": 1.4195704150496593e-06, "loss": 0.8929249, "num_input_tokens_seen": 108633075, "step": 5039, "time_per_iteration": 4.519800424575806 }, { "auxiliary_loss_clip": 0.01168251, "auxiliary_loss_mlp": 0.0103094, "balance_loss_clip": 1.05224848, "balance_loss_mlp": 1.02351964, "epoch": 0.6060241688210185, "flos": 21069724613760.0, "grad_norm": 3.182433251813232, "language_loss": 0.74122876, "learning_rate": 1.4188250131926639e-06, "loss": 0.76322067, "num_input_tokens_seen": 108651875, "step": 5040, "time_per_iteration": 2.8024840354919434 }, { "auxiliary_loss_clip": 0.01170041, "auxiliary_loss_mlp": 0.0102741, "balance_loss_clip": 1.05125117, "balance_loss_mlp": 1.01947021, "epoch": 0.6061444117116576, "flos": 16361081619840.0, "grad_norm": 2.95117044395249, "language_loss": 0.80520165, "learning_rate": 1.4180796994994525e-06, "loss": 0.82717615, "num_input_tokens_seen": 108669290, "step": 5041, "time_per_iteration": 2.663485527038574 }, { "auxiliary_loss_clip": 0.01163672, "auxiliary_loss_mlp": 0.01026504, "balance_loss_clip": 1.04733419, "balance_loss_mlp": 1.0193038, "epoch": 0.6062646546022966, "flos": 21507296094720.0, "grad_norm": 1.7691859711620868, "language_loss": 0.71678448, "learning_rate": 1.4173344740830877e-06, "loss": 0.73868626, "num_input_tokens_seen": 108688420, "step": 5042, "time_per_iteration": 2.7712161540985107 }, { "auxiliary_loss_clip": 0.01159743, "auxiliary_loss_mlp": 0.01025503, "balance_loss_clip": 1.05071628, "balance_loss_mlp": 1.01793957, "epoch": 0.6063848974929358, "flos": 38983151283840.0, "grad_norm": 1.7243823776923572, "language_loss": 0.70820946, "learning_rate": 1.4165893370566206e-06, "loss": 0.73006189, "num_input_tokens_seen": 108712175, "step": 5043, "time_per_iteration": 2.8255410194396973 }, { "auxiliary_loss_clip": 0.01165326, "auxiliary_loss_mlp": 0.01024868, "balance_loss_clip": 1.0486238, "balance_loss_mlp": 1.01745582, "epoch": 0.6065051403835748, "flos": 19646584784640.0, "grad_norm": 1.9568081281584206, "language_loss": 0.77622128, "learning_rate": 1.4158442885330865e-06, "loss": 0.79812324, "num_input_tokens_seen": 108730745, "step": 5044, "time_per_iteration": 2.787071943283081 }, { "auxiliary_loss_clip": 0.01165742, "auxiliary_loss_mlp": 0.01027676, "balance_loss_clip": 1.04910612, "balance_loss_mlp": 1.02020741, "epoch": 0.6066253832742139, "flos": 23513086437120.0, "grad_norm": 2.027157195951965, "language_loss": 0.78679997, "learning_rate": 1.4150993286255094e-06, "loss": 0.80873418, "num_input_tokens_seen": 108749995, "step": 5045, "time_per_iteration": 2.6668288707733154 }, { "auxiliary_loss_clip": 0.01169627, "auxiliary_loss_mlp": 0.01025018, "balance_loss_clip": 1.04716086, "balance_loss_mlp": 1.01816308, "epoch": 0.6067456261648531, "flos": 19133708440320.0, "grad_norm": 4.892752604189573, "language_loss": 0.79640496, "learning_rate": 1.4143544574468993e-06, "loss": 0.81835139, "num_input_tokens_seen": 108768355, "step": 5046, "time_per_iteration": 2.6458542346954346 }, { "auxiliary_loss_clip": 0.01161886, "auxiliary_loss_mlp": 0.01021505, "balance_loss_clip": 1.04615188, "balance_loss_mlp": 1.01398551, "epoch": 0.6068658690554921, "flos": 20520614424960.0, "grad_norm": 4.597803326589704, "language_loss": 0.82521302, "learning_rate": 1.4136096751102523e-06, "loss": 0.84704691, "num_input_tokens_seen": 108786685, "step": 5047, "time_per_iteration": 3.691563844680786 }, { "auxiliary_loss_clip": 0.0116669, "auxiliary_loss_mlp": 0.01028642, "balance_loss_clip": 1.04918265, "balance_loss_mlp": 1.02096486, "epoch": 0.6069861119461312, "flos": 27374560185600.0, "grad_norm": 2.462676490617702, "language_loss": 0.83117753, "learning_rate": 1.4128649817285516e-06, "loss": 0.85313082, "num_input_tokens_seen": 108804820, "step": 5048, "time_per_iteration": 2.7618486881256104 }, { "auxiliary_loss_clip": 0.01169444, "auxiliary_loss_mlp": 0.01024271, "balance_loss_clip": 1.04970086, "balance_loss_mlp": 1.01696348, "epoch": 0.6071063548367702, "flos": 25626500904960.0, "grad_norm": 1.9421859438724633, "language_loss": 0.63438946, "learning_rate": 1.412120377414766e-06, "loss": 0.65632665, "num_input_tokens_seen": 108825010, "step": 5049, "time_per_iteration": 2.7818920612335205 }, { "auxiliary_loss_clip": 0.01170536, "auxiliary_loss_mlp": 0.01028608, "balance_loss_clip": 1.0499649, "balance_loss_mlp": 1.02186668, "epoch": 0.6072265977274094, "flos": 24460517520000.0, "grad_norm": 1.599363271358509, "language_loss": 0.71255654, "learning_rate": 1.4113758622818522e-06, "loss": 0.73454797, "num_input_tokens_seen": 108845075, "step": 5050, "time_per_iteration": 2.6877734661102295 }, { "auxiliary_loss_clip": 0.01170081, "auxiliary_loss_mlp": 0.01052639, "balance_loss_clip": 1.05136979, "balance_loss_mlp": 1.01408482, "epoch": 0.6073468406180484, "flos": 18149253413760.0, "grad_norm": 1.7915928337413745, "language_loss": 0.83028167, "learning_rate": 1.410631436442751e-06, "loss": 0.8525089, "num_input_tokens_seen": 108863870, "step": 5051, "time_per_iteration": 2.832239866256714 }, { "auxiliary_loss_clip": 0.01173361, "auxiliary_loss_mlp": 0.01027322, "balance_loss_clip": 1.0491091, "balance_loss_mlp": 1.01980591, "epoch": 0.6074670835086875, "flos": 20697617669760.0, "grad_norm": 4.273351601868745, "language_loss": 0.8650862, "learning_rate": 1.4098871000103936e-06, "loss": 0.88709307, "num_input_tokens_seen": 108882470, "step": 5052, "time_per_iteration": 2.725165605545044 }, { "auxiliary_loss_clip": 0.01167061, "auxiliary_loss_mlp": 0.01026432, "balance_loss_clip": 1.04928184, "balance_loss_mlp": 1.01900578, "epoch": 0.6075873263993267, "flos": 23769955572480.0, "grad_norm": 1.646489862956537, "language_loss": 0.82589912, "learning_rate": 1.409142853097693e-06, "loss": 0.84783405, "num_input_tokens_seen": 108902710, "step": 5053, "time_per_iteration": 2.8836913108825684 }, { "auxiliary_loss_clip": 0.01165776, "auxiliary_loss_mlp": 0.01029996, "balance_loss_clip": 1.04878235, "balance_loss_mlp": 1.02274764, "epoch": 0.6077075692899657, "flos": 24454484035200.0, "grad_norm": 1.9694941771638308, "language_loss": 0.79709417, "learning_rate": 1.408398695817553e-06, "loss": 0.81905186, "num_input_tokens_seen": 108919935, "step": 5054, "time_per_iteration": 2.8856444358825684 }, { "auxiliary_loss_clip": 0.01166733, "auxiliary_loss_mlp": 0.01025331, "balance_loss_clip": 1.05080914, "balance_loss_mlp": 1.01732659, "epoch": 0.6078278121806048, "flos": 27382102041600.0, "grad_norm": 1.9293470306744376, "language_loss": 0.70421445, "learning_rate": 1.4076546282828593e-06, "loss": 0.72613519, "num_input_tokens_seen": 108942790, "step": 5055, "time_per_iteration": 2.8225364685058594 }, { "auxiliary_loss_clip": 0.01170498, "auxiliary_loss_mlp": 0.0102464, "balance_loss_clip": 1.04846394, "balance_loss_mlp": 1.01731467, "epoch": 0.6079480550712439, "flos": 38436447306240.0, "grad_norm": 2.299110960017649, "language_loss": 0.66465449, "learning_rate": 1.4069106506064874e-06, "loss": 0.68660593, "num_input_tokens_seen": 108964215, "step": 5056, "time_per_iteration": 2.864122152328491 }, { "auxiliary_loss_clip": 0.01160079, "auxiliary_loss_mlp": 0.01028075, "balance_loss_clip": 1.04910111, "balance_loss_mlp": 1.02035594, "epoch": 0.608068297961883, "flos": 25336271013120.0, "grad_norm": 2.6002881201362684, "language_loss": 0.7829625, "learning_rate": 1.4061667629012989e-06, "loss": 0.80484402, "num_input_tokens_seen": 108984885, "step": 5057, "time_per_iteration": 2.7311906814575195 }, { "auxiliary_loss_clip": 0.01159839, "auxiliary_loss_mlp": 0.0102134, "balance_loss_clip": 1.05049551, "balance_loss_mlp": 1.01387429, "epoch": 0.608188540852522, "flos": 24202463235840.0, "grad_norm": 2.2395381676668444, "language_loss": 0.83406401, "learning_rate": 1.40542296528014e-06, "loss": 0.85587573, "num_input_tokens_seen": 109004545, "step": 5058, "time_per_iteration": 2.832641124725342 }, { "auxiliary_loss_clip": 0.01169984, "auxiliary_loss_mlp": 0.01024174, "balance_loss_clip": 1.04846811, "balance_loss_mlp": 1.01679754, "epoch": 0.6083087837431612, "flos": 21284146851840.0, "grad_norm": 1.9373883735733155, "language_loss": 0.76311195, "learning_rate": 1.4046792578558452e-06, "loss": 0.78505355, "num_input_tokens_seen": 109022440, "step": 5059, "time_per_iteration": 2.7077410221099854 }, { "auxiliary_loss_clip": 0.01161189, "auxiliary_loss_mlp": 0.01024812, "balance_loss_clip": 1.04843187, "balance_loss_mlp": 1.01730168, "epoch": 0.6084290266338003, "flos": 16471435178880.0, "grad_norm": 2.6040261222737353, "language_loss": 0.75916839, "learning_rate": 1.4039356407412325e-06, "loss": 0.78102839, "num_input_tokens_seen": 109035680, "step": 5060, "time_per_iteration": 2.7412872314453125 }, { "auxiliary_loss_clip": 0.01070012, "auxiliary_loss_mlp": 0.01000059, "balance_loss_clip": 1.013502, "balance_loss_mlp": 0.99889088, "epoch": 0.6085492695244393, "flos": 66443574931200.0, "grad_norm": 0.7810420818179143, "language_loss": 0.57126528, "learning_rate": 1.40319211404911e-06, "loss": 0.59196603, "num_input_tokens_seen": 109090680, "step": 5061, "time_per_iteration": 3.237614393234253 }, { "auxiliary_loss_clip": 0.01170707, "auxiliary_loss_mlp": 0.0102589, "balance_loss_clip": 1.04814315, "balance_loss_mlp": 1.01862073, "epoch": 0.6086695124150785, "flos": 23618986709760.0, "grad_norm": 1.769520201164927, "language_loss": 0.90663218, "learning_rate": 1.4024486778922691e-06, "loss": 0.92859817, "num_input_tokens_seen": 109108995, "step": 5062, "time_per_iteration": 2.7653896808624268 }, { "auxiliary_loss_clip": 0.01171155, "auxiliary_loss_mlp": 0.01026124, "balance_loss_clip": 1.04854023, "balance_loss_mlp": 1.01876903, "epoch": 0.6087897553057176, "flos": 20157054917760.0, "grad_norm": 1.8238864247234803, "language_loss": 0.77783251, "learning_rate": 1.4017053323834884e-06, "loss": 0.79980534, "num_input_tokens_seen": 109128825, "step": 5063, "time_per_iteration": 3.0392844676971436 }, { "auxiliary_loss_clip": 0.01167264, "auxiliary_loss_mlp": 0.01024906, "balance_loss_clip": 1.0480001, "balance_loss_mlp": 1.01732469, "epoch": 0.6089099981963566, "flos": 25482535194240.0, "grad_norm": 2.1014057011304543, "language_loss": 0.76149201, "learning_rate": 1.4009620776355333e-06, "loss": 0.78341377, "num_input_tokens_seen": 109150425, "step": 5064, "time_per_iteration": 4.6688642501831055 }, { "auxiliary_loss_clip": 0.01165275, "auxiliary_loss_mlp": 0.01025696, "balance_loss_clip": 1.04829884, "balance_loss_mlp": 1.01838851, "epoch": 0.6090302410869958, "flos": 25332895134720.0, "grad_norm": 2.1079088501756353, "language_loss": 0.79020447, "learning_rate": 1.4002189137611553e-06, "loss": 0.81211424, "num_input_tokens_seen": 109169765, "step": 5065, "time_per_iteration": 3.660574436187744 }, { "auxiliary_loss_clip": 0.01167771, "auxiliary_loss_mlp": 0.01024628, "balance_loss_clip": 1.04955697, "balance_loss_mlp": 1.01737452, "epoch": 0.6091504839776348, "flos": 23987358639360.0, "grad_norm": 6.530013906766686, "language_loss": 0.6977638, "learning_rate": 1.3994758408730901e-06, "loss": 0.7196877, "num_input_tokens_seen": 109188950, "step": 5066, "time_per_iteration": 2.73777437210083 }, { "auxiliary_loss_clip": 0.01168355, "auxiliary_loss_mlp": 0.01027915, "balance_loss_clip": 1.0519582, "balance_loss_mlp": 1.02023196, "epoch": 0.6092707268682739, "flos": 29643037666560.0, "grad_norm": 4.899675731575274, "language_loss": 0.76255363, "learning_rate": 1.3987328590840629e-06, "loss": 0.78451633, "num_input_tokens_seen": 109209895, "step": 5067, "time_per_iteration": 2.9722557067871094 }, { "auxiliary_loss_clip": 0.01165847, "auxiliary_loss_mlp": 0.01023898, "balance_loss_clip": 1.0476172, "balance_loss_mlp": 1.01655126, "epoch": 0.609390969758913, "flos": 24024957200640.0, "grad_norm": 1.920729637009635, "language_loss": 0.8616274, "learning_rate": 1.397989968506783e-06, "loss": 0.88352484, "num_input_tokens_seen": 109228905, "step": 5068, "time_per_iteration": 2.9489078521728516 }, { "auxiliary_loss_clip": 0.01175081, "auxiliary_loss_mlp": 0.01022916, "balance_loss_clip": 1.05062163, "balance_loss_mlp": 1.01517045, "epoch": 0.6095112126495521, "flos": 11102143288320.0, "grad_norm": 8.000189917995241, "language_loss": 0.72395849, "learning_rate": 1.3972471692539458e-06, "loss": 0.74593848, "num_input_tokens_seen": 109243620, "step": 5069, "time_per_iteration": 2.6978707313537598 }, { "auxiliary_loss_clip": 0.01161169, "auxiliary_loss_mlp": 0.01029485, "balance_loss_clip": 1.04974961, "balance_loss_mlp": 1.02208805, "epoch": 0.6096314555401912, "flos": 17265491187840.0, "grad_norm": 1.9266382597377052, "language_loss": 0.7541092, "learning_rate": 1.3965044614382348e-06, "loss": 0.7760157, "num_input_tokens_seen": 109259070, "step": 5070, "time_per_iteration": 2.8955376148223877 }, { "auxiliary_loss_clip": 0.01174174, "auxiliary_loss_mlp": 0.01027262, "balance_loss_clip": 1.0509007, "balance_loss_mlp": 1.0197401, "epoch": 0.6097516984308303, "flos": 21645910679040.0, "grad_norm": 3.3254067114695065, "language_loss": 0.75645089, "learning_rate": 1.3957618451723162e-06, "loss": 0.77846527, "num_input_tokens_seen": 109275100, "step": 5071, "time_per_iteration": 2.8587865829467773 }, { "auxiliary_loss_clip": 0.0116747, "auxiliary_loss_mlp": 0.01021592, "balance_loss_clip": 1.04772091, "balance_loss_mlp": 1.01457024, "epoch": 0.6098719413214694, "flos": 27199208966400.0, "grad_norm": 2.234839618830875, "language_loss": 0.71271312, "learning_rate": 1.3950193205688457e-06, "loss": 0.73460376, "num_input_tokens_seen": 109294825, "step": 5072, "time_per_iteration": 2.9742259979248047 }, { "auxiliary_loss_clip": 0.01163129, "auxiliary_loss_mlp": 0.01031585, "balance_loss_clip": 1.04871988, "balance_loss_mlp": 1.02439952, "epoch": 0.6099921842121084, "flos": 20412954385920.0, "grad_norm": 1.9615431269850139, "language_loss": 0.83776724, "learning_rate": 1.3942768877404627e-06, "loss": 0.85971445, "num_input_tokens_seen": 109313790, "step": 5073, "time_per_iteration": 3.8378124237060547 }, { "auxiliary_loss_clip": 0.01170858, "auxiliary_loss_mlp": 0.01025616, "balance_loss_clip": 1.0486784, "balance_loss_mlp": 1.01850176, "epoch": 0.6101124271027476, "flos": 23366139897600.0, "grad_norm": 1.6297902817311523, "language_loss": 0.73785144, "learning_rate": 1.393534546799795e-06, "loss": 0.75981617, "num_input_tokens_seen": 109333490, "step": 5074, "time_per_iteration": 2.8799774646759033 }, { "auxiliary_loss_clip": 0.0116062, "auxiliary_loss_mlp": 0.0102425, "balance_loss_clip": 1.0512538, "balance_loss_mlp": 1.01707017, "epoch": 0.6102326699933867, "flos": 26687840993280.0, "grad_norm": 2.159871947293766, "language_loss": 0.6783334, "learning_rate": 1.3927922978594536e-06, "loss": 0.70018208, "num_input_tokens_seen": 109354575, "step": 5075, "time_per_iteration": 2.818185567855835 }, { "auxiliary_loss_clip": 0.01065833, "auxiliary_loss_mlp": 0.00999692, "balance_loss_clip": 1.01329374, "balance_loss_mlp": 0.99860686, "epoch": 0.6103529128840257, "flos": 60644612551680.0, "grad_norm": 0.7711077610413049, "language_loss": 0.57417083, "learning_rate": 1.3920501410320387e-06, "loss": 0.59482598, "num_input_tokens_seen": 109410690, "step": 5076, "time_per_iteration": 3.241384983062744 }, { "auxiliary_loss_clip": 0.01165837, "auxiliary_loss_mlp": 0.01024218, "balance_loss_clip": 1.04845202, "balance_loss_mlp": 1.01682663, "epoch": 0.6104731557746649, "flos": 19021307806080.0, "grad_norm": 3.0525772992891573, "language_loss": 0.75902557, "learning_rate": 1.3913080764301333e-06, "loss": 0.78092611, "num_input_tokens_seen": 109427650, "step": 5077, "time_per_iteration": 2.794053792953491 }, { "auxiliary_loss_clip": 0.01168594, "auxiliary_loss_mlp": 0.01026681, "balance_loss_clip": 1.0482502, "balance_loss_mlp": 1.01898575, "epoch": 0.6105933986653039, "flos": 23366894083200.0, "grad_norm": 2.5348835489221635, "language_loss": 0.71329653, "learning_rate": 1.3905661041663085e-06, "loss": 0.73524934, "num_input_tokens_seen": 109448835, "step": 5078, "time_per_iteration": 2.771366834640503 }, { "auxiliary_loss_clip": 0.01171172, "auxiliary_loss_mlp": 0.01023238, "balance_loss_clip": 1.05066586, "balance_loss_mlp": 1.01557243, "epoch": 0.610713641555943, "flos": 34637565006720.0, "grad_norm": 2.824949833234117, "language_loss": 0.64985025, "learning_rate": 1.389824224353122e-06, "loss": 0.67179435, "num_input_tokens_seen": 109470425, "step": 5079, "time_per_iteration": 2.825418710708618 }, { "auxiliary_loss_clip": 0.0116612, "auxiliary_loss_mlp": 0.01023282, "balance_loss_clip": 1.04920053, "balance_loss_mlp": 1.01592124, "epoch": 0.610833884446582, "flos": 26646471504000.0, "grad_norm": 1.5278004016800542, "language_loss": 0.76494855, "learning_rate": 1.389082437103115e-06, "loss": 0.78684258, "num_input_tokens_seen": 109489695, "step": 5080, "time_per_iteration": 2.6941628456115723 }, { "auxiliary_loss_clip": 0.0116606, "auxiliary_loss_mlp": 0.0102426, "balance_loss_clip": 1.05043519, "balance_loss_mlp": 1.01656234, "epoch": 0.6109541273372212, "flos": 21215126868480.0, "grad_norm": 2.061728271282341, "language_loss": 0.78057283, "learning_rate": 1.3883407425288172e-06, "loss": 0.80247599, "num_input_tokens_seen": 109510030, "step": 5081, "time_per_iteration": 2.8511850833892822 }, { "auxiliary_loss_clip": 0.01163839, "auxiliary_loss_mlp": 0.01020803, "balance_loss_clip": 1.04898226, "balance_loss_mlp": 1.01314378, "epoch": 0.6110743702278603, "flos": 20084084438400.0, "grad_norm": 2.3563993284479423, "language_loss": 0.799137, "learning_rate": 1.3875991407427417e-06, "loss": 0.82098341, "num_input_tokens_seen": 109528255, "step": 5082, "time_per_iteration": 2.8073089122772217 }, { "auxiliary_loss_clip": 0.01070144, "auxiliary_loss_mlp": 0.01001984, "balance_loss_clip": 1.01288557, "balance_loss_mlp": 1.00074387, "epoch": 0.6111946131184993, "flos": 68302957438080.0, "grad_norm": 0.7680157907084297, "language_loss": 0.58130932, "learning_rate": 1.38685763185739e-06, "loss": 0.60203058, "num_input_tokens_seen": 109581915, "step": 5083, "time_per_iteration": 3.3315751552581787 }, { "auxiliary_loss_clip": 0.01171048, "auxiliary_loss_mlp": 0.01028701, "balance_loss_clip": 1.04836488, "balance_loss_mlp": 1.02072883, "epoch": 0.6113148560091385, "flos": 19937676602880.0, "grad_norm": 3.071581159990178, "language_loss": 0.679456, "learning_rate": 1.3861162159852476e-06, "loss": 0.70145345, "num_input_tokens_seen": 109600050, "step": 5084, "time_per_iteration": 2.71606183052063 }, { "auxiliary_loss_clip": 0.0117166, "auxiliary_loss_mlp": 0.01030964, "balance_loss_clip": 1.05025792, "balance_loss_mlp": 1.02323377, "epoch": 0.6114350988997775, "flos": 23731854220800.0, "grad_norm": 1.6603130156146315, "language_loss": 0.80013335, "learning_rate": 1.3853748932387875e-06, "loss": 0.82215965, "num_input_tokens_seen": 109620690, "step": 5085, "time_per_iteration": 2.7654812335968018 }, { "auxiliary_loss_clip": 0.01160343, "auxiliary_loss_mlp": 0.01022032, "balance_loss_clip": 1.05045986, "balance_loss_mlp": 1.01450419, "epoch": 0.6115553417904166, "flos": 24023700224640.0, "grad_norm": 2.613068864565396, "language_loss": 0.75066811, "learning_rate": 1.3846336637304671e-06, "loss": 0.77249187, "num_input_tokens_seen": 109638960, "step": 5086, "time_per_iteration": 2.7706313133239746 }, { "auxiliary_loss_clip": 0.01156041, "auxiliary_loss_mlp": 0.01024114, "balance_loss_clip": 1.04646611, "balance_loss_mlp": 1.0170505, "epoch": 0.6116755846810558, "flos": 23733542160000.0, "grad_norm": 2.173249760363939, "language_loss": 0.82649809, "learning_rate": 1.3838925275727316e-06, "loss": 0.84829962, "num_input_tokens_seen": 109659700, "step": 5087, "time_per_iteration": 2.676156520843506 }, { "auxiliary_loss_clip": 0.01171925, "auxiliary_loss_mlp": 0.01030186, "balance_loss_clip": 1.04945588, "balance_loss_mlp": 1.02297974, "epoch": 0.6117958275716948, "flos": 18661626967680.0, "grad_norm": 1.712133974740508, "language_loss": 0.78793597, "learning_rate": 1.3831514848780089e-06, "loss": 0.80995715, "num_input_tokens_seen": 109679275, "step": 5088, "time_per_iteration": 2.68118953704834 }, { "auxiliary_loss_clip": 0.01163963, "auxiliary_loss_mlp": 0.01027042, "balance_loss_clip": 1.04933286, "balance_loss_mlp": 1.01984227, "epoch": 0.6119160704623339, "flos": 16471183783680.0, "grad_norm": 2.282566934822288, "language_loss": 0.92068172, "learning_rate": 1.3824105357587152e-06, "loss": 0.94259179, "num_input_tokens_seen": 109696380, "step": 5089, "time_per_iteration": 2.743678331375122 }, { "auxiliary_loss_clip": 0.01159472, "auxiliary_loss_mlp": 0.01028477, "balance_loss_clip": 1.04616058, "balance_loss_mlp": 1.0212115, "epoch": 0.612036313352973, "flos": 23915465568000.0, "grad_norm": 1.7909116710986202, "language_loss": 0.82477307, "learning_rate": 1.381669680327253e-06, "loss": 0.84665257, "num_input_tokens_seen": 109718060, "step": 5090, "time_per_iteration": 3.792917013168335 }, { "auxiliary_loss_clip": 0.01164369, "auxiliary_loss_mlp": 0.0102956, "balance_loss_clip": 1.05382538, "balance_loss_mlp": 1.02215111, "epoch": 0.6121565562436121, "flos": 26974766833920.0, "grad_norm": 4.410905375581658, "language_loss": 0.71340889, "learning_rate": 1.380928918696008e-06, "loss": 0.73534822, "num_input_tokens_seen": 109736830, "step": 5091, "time_per_iteration": 1.9754528999328613 }, { "auxiliary_loss_clip": 0.01167349, "auxiliary_loss_mlp": 0.0102575, "balance_loss_clip": 1.04735005, "balance_loss_mlp": 1.01770926, "epoch": 0.6122767991342511, "flos": 15668867646720.0, "grad_norm": 2.4956253663789756, "language_loss": 0.71816111, "learning_rate": 1.3801882509773548e-06, "loss": 0.74009204, "num_input_tokens_seen": 109754690, "step": 5092, "time_per_iteration": 2.7215495109558105 }, { "auxiliary_loss_clip": 0.01164456, "auxiliary_loss_mlp": 0.01028309, "balance_loss_clip": 1.04808712, "balance_loss_mlp": 1.02086401, "epoch": 0.6123970420248903, "flos": 27964321591680.0, "grad_norm": 1.8739952241329012, "language_loss": 0.81673193, "learning_rate": 1.3794476772836503e-06, "loss": 0.83865958, "num_input_tokens_seen": 109775790, "step": 5093, "time_per_iteration": 2.8246991634368896 }, { "auxiliary_loss_clip": 0.01156227, "auxiliary_loss_mlp": 0.01027715, "balance_loss_clip": 1.04960704, "balance_loss_mlp": 1.01977241, "epoch": 0.6125172849155294, "flos": 21468727866240.0, "grad_norm": 1.8771208229959029, "language_loss": 0.84818971, "learning_rate": 1.3787071977272402e-06, "loss": 0.87002915, "num_input_tokens_seen": 109795050, "step": 5094, "time_per_iteration": 2.855835199356079 }, { "auxiliary_loss_clip": 0.01157691, "auxiliary_loss_mlp": 0.01026762, "balance_loss_clip": 1.05068946, "balance_loss_mlp": 1.01951993, "epoch": 0.6126375278061684, "flos": 16248321849600.0, "grad_norm": 3.4529526158154127, "language_loss": 0.71902728, "learning_rate": 1.3779668124204535e-06, "loss": 0.74087179, "num_input_tokens_seen": 109811465, "step": 5095, "time_per_iteration": 2.7610044479370117 }, { "auxiliary_loss_clip": 0.01157085, "auxiliary_loss_mlp": 0.01024231, "balance_loss_clip": 1.04814386, "balance_loss_mlp": 1.01661015, "epoch": 0.6127577706968076, "flos": 20448865008000.0, "grad_norm": 1.7967975291434706, "language_loss": 0.80878639, "learning_rate": 1.3772265214756074e-06, "loss": 0.83059955, "num_input_tokens_seen": 109831225, "step": 5096, "time_per_iteration": 2.73783016204834 }, { "auxiliary_loss_clip": 0.01172094, "auxiliary_loss_mlp": 0.01019825, "balance_loss_clip": 1.04840302, "balance_loss_mlp": 1.01260686, "epoch": 0.6128780135874466, "flos": 18260397072000.0, "grad_norm": 2.7696271566220223, "language_loss": 0.75536615, "learning_rate": 1.3764863250050025e-06, "loss": 0.77728534, "num_input_tokens_seen": 109849465, "step": 5097, "time_per_iteration": 2.730489492416382 }, { "auxiliary_loss_clip": 0.01161633, "auxiliary_loss_mlp": 0.01026709, "balance_loss_clip": 1.04803181, "balance_loss_mlp": 1.01928854, "epoch": 0.6129982564780857, "flos": 24937088192640.0, "grad_norm": 1.8478977982080114, "language_loss": 0.80632567, "learning_rate": 1.3757462231209272e-06, "loss": 0.82820916, "num_input_tokens_seen": 109869770, "step": 5098, "time_per_iteration": 2.799043655395508 }, { "auxiliary_loss_clip": 0.01161158, "auxiliary_loss_mlp": 0.0102667, "balance_loss_clip": 1.04894722, "balance_loss_mlp": 1.01928484, "epoch": 0.6131184993687249, "flos": 22492038430080.0, "grad_norm": 2.0143205560136526, "language_loss": 0.89007437, "learning_rate": 1.3750062159356525e-06, "loss": 0.91195273, "num_input_tokens_seen": 109889120, "step": 5099, "time_per_iteration": 3.766659736633301 }, { "auxiliary_loss_clip": 0.01152472, "auxiliary_loss_mlp": 0.01024771, "balance_loss_clip": 1.04735827, "balance_loss_mlp": 1.01789832, "epoch": 0.6132387422593639, "flos": 15885839750400.0, "grad_norm": 1.797388651866394, "language_loss": 0.83068991, "learning_rate": 1.3742663035614382e-06, "loss": 0.85246235, "num_input_tokens_seen": 109906490, "step": 5100, "time_per_iteration": 2.691438913345337 }, { "auxiliary_loss_clip": 0.01173253, "auxiliary_loss_mlp": 0.01030198, "balance_loss_clip": 1.04897439, "balance_loss_mlp": 1.0225091, "epoch": 0.613358985150003, "flos": 25411539962880.0, "grad_norm": 1.8572804298543548, "language_loss": 0.79980481, "learning_rate": 1.3735264861105283e-06, "loss": 0.82183933, "num_input_tokens_seen": 109927130, "step": 5101, "time_per_iteration": 2.753030300140381 }, { "auxiliary_loss_clip": 0.01162303, "auxiliary_loss_mlp": 0.01021811, "balance_loss_clip": 1.04780245, "balance_loss_mlp": 1.01484573, "epoch": 0.6134792280406421, "flos": 21361283308800.0, "grad_norm": 2.131677468604048, "language_loss": 0.78782505, "learning_rate": 1.372786763695152e-06, "loss": 0.80966616, "num_input_tokens_seen": 109945890, "step": 5102, "time_per_iteration": 2.780869245529175 }, { "auxiliary_loss_clip": 0.01171411, "auxiliary_loss_mlp": 0.01025317, "balance_loss_clip": 1.050282, "balance_loss_mlp": 1.01790822, "epoch": 0.6135994709312812, "flos": 21211248199680.0, "grad_norm": 5.743890315790567, "language_loss": 0.77364838, "learning_rate": 1.3720471364275257e-06, "loss": 0.79561567, "num_input_tokens_seen": 109965535, "step": 5103, "time_per_iteration": 2.751737117767334 }, { "auxiliary_loss_clip": 0.0115975, "auxiliary_loss_mlp": 0.01057066, "balance_loss_clip": 1.04817235, "balance_loss_mlp": 1.01991665, "epoch": 0.6137197138219203, "flos": 14794047907200.0, "grad_norm": 1.9211081801135093, "language_loss": 0.78185427, "learning_rate": 1.3713076044198486e-06, "loss": 0.80402243, "num_input_tokens_seen": 109982345, "step": 5104, "time_per_iteration": 2.7775394916534424 }, { "auxiliary_loss_clip": 0.01160954, "auxiliary_loss_mlp": 0.0102913, "balance_loss_clip": 1.04854298, "balance_loss_mlp": 1.02127397, "epoch": 0.6138399567125594, "flos": 20084515401600.0, "grad_norm": 2.299998539652303, "language_loss": 0.80854309, "learning_rate": 1.3705681677843086e-06, "loss": 0.83044398, "num_input_tokens_seen": 110000940, "step": 5105, "time_per_iteration": 2.8204357624053955 }, { "auxiliary_loss_clip": 0.01068352, "auxiliary_loss_mlp": 0.01002905, "balance_loss_clip": 1.01261616, "balance_loss_mlp": 1.00186181, "epoch": 0.6139601996031985, "flos": 60123838193280.0, "grad_norm": 0.7711477122443489, "language_loss": 0.60575759, "learning_rate": 1.3698288266330768e-06, "loss": 0.62647021, "num_input_tokens_seen": 110061565, "step": 5106, "time_per_iteration": 3.3773839473724365 }, { "auxiliary_loss_clip": 0.01161094, "auxiliary_loss_mlp": 0.01027821, "balance_loss_clip": 1.05056596, "balance_loss_mlp": 1.02086866, "epoch": 0.6140804424938375, "flos": 23586703361280.0, "grad_norm": 2.6904116304148764, "language_loss": 0.72540587, "learning_rate": 1.3690895810783113e-06, "loss": 0.74729502, "num_input_tokens_seen": 110080360, "step": 5107, "time_per_iteration": 2.780796527862549 }, { "auxiliary_loss_clip": 0.01164369, "auxiliary_loss_mlp": 0.0105615, "balance_loss_clip": 1.04813063, "balance_loss_mlp": 1.01939952, "epoch": 0.6142006853844767, "flos": 21398199511680.0, "grad_norm": 2.205958984216181, "language_loss": 0.71731865, "learning_rate": 1.3683504312321543e-06, "loss": 0.73952383, "num_input_tokens_seen": 110100695, "step": 5108, "time_per_iteration": 2.853919267654419 }, { "auxiliary_loss_clip": 0.01171309, "auxiliary_loss_mlp": 0.01025164, "balance_loss_clip": 1.04882526, "balance_loss_mlp": 1.01763034, "epoch": 0.6143209282751158, "flos": 12057367622400.0, "grad_norm": 2.035710908184897, "language_loss": 0.80275178, "learning_rate": 1.3676113772067355e-06, "loss": 0.82471651, "num_input_tokens_seen": 110117750, "step": 5109, "time_per_iteration": 2.7178080081939697 }, { "auxiliary_loss_clip": 0.0116541, "auxiliary_loss_mlp": 0.01023006, "balance_loss_clip": 1.05035853, "balance_loss_mlp": 1.01561522, "epoch": 0.6144411711657548, "flos": 25082274965760.0, "grad_norm": 3.012196667714523, "language_loss": 0.72474802, "learning_rate": 1.3668724191141671e-06, "loss": 0.74663222, "num_input_tokens_seen": 110137020, "step": 5110, "time_per_iteration": 2.8739655017852783 }, { "auxiliary_loss_clip": 0.01157018, "auxiliary_loss_mlp": 0.01026851, "balance_loss_clip": 1.05007923, "balance_loss_mlp": 1.01925159, "epoch": 0.6145614140563939, "flos": 20114069316480.0, "grad_norm": 2.289051844483998, "language_loss": 0.66249061, "learning_rate": 1.3661335570665493e-06, "loss": 0.68432933, "num_input_tokens_seen": 110154930, "step": 5111, "time_per_iteration": 2.7309329509735107 }, { "auxiliary_loss_clip": 0.01169069, "auxiliary_loss_mlp": 0.01024149, "balance_loss_clip": 1.04968715, "balance_loss_mlp": 1.01693082, "epoch": 0.614681656947033, "flos": 16800376953600.0, "grad_norm": 2.4031867982544868, "language_loss": 0.69932079, "learning_rate": 1.3653947911759676e-06, "loss": 0.72125304, "num_input_tokens_seen": 110172480, "step": 5112, "time_per_iteration": 2.712980031967163 }, { "auxiliary_loss_clip": 0.01157124, "auxiliary_loss_mlp": 0.01023825, "balance_loss_clip": 1.05162907, "balance_loss_mlp": 1.01618052, "epoch": 0.6148018998376721, "flos": 38801587011840.0, "grad_norm": 1.730180831426154, "language_loss": 0.74292451, "learning_rate": 1.3646561215544904e-06, "loss": 0.76473403, "num_input_tokens_seen": 110197120, "step": 5113, "time_per_iteration": 2.879783868789673 }, { "auxiliary_loss_clip": 0.01167205, "auxiliary_loss_mlp": 0.01023505, "balance_loss_clip": 1.04871714, "balance_loss_mlp": 1.01640654, "epoch": 0.6149221427283111, "flos": 23327032965120.0, "grad_norm": 2.2703758238132763, "language_loss": 0.79606879, "learning_rate": 1.363917548314176e-06, "loss": 0.81797588, "num_input_tokens_seen": 110216385, "step": 5114, "time_per_iteration": 2.6972241401672363 }, { "auxiliary_loss_clip": 0.01177326, "auxiliary_loss_mlp": 0.01027337, "balance_loss_clip": 1.05086374, "balance_loss_mlp": 1.01957011, "epoch": 0.6150423856189503, "flos": 22379494141440.0, "grad_norm": 1.740931495577648, "language_loss": 0.72738266, "learning_rate": 1.3631790715670626e-06, "loss": 0.74942929, "num_input_tokens_seen": 110234790, "step": 5115, "time_per_iteration": 2.8913497924804688 }, { "auxiliary_loss_clip": 0.01143762, "auxiliary_loss_mlp": 0.01023678, "balance_loss_clip": 1.04971457, "balance_loss_mlp": 1.01690423, "epoch": 0.6151626285095894, "flos": 18692078722560.0, "grad_norm": 1.7640539237714004, "language_loss": 0.85427755, "learning_rate": 1.3624406914251783e-06, "loss": 0.87595195, "num_input_tokens_seen": 110251910, "step": 5116, "time_per_iteration": 3.8449594974517822 }, { "auxiliary_loss_clip": 0.01170347, "auxiliary_loss_mlp": 0.01025058, "balance_loss_clip": 1.04810297, "balance_loss_mlp": 1.01783669, "epoch": 0.6152828714002284, "flos": 15851688894720.0, "grad_norm": 1.8784974654278397, "language_loss": 0.88241076, "learning_rate": 1.3617024080005335e-06, "loss": 0.90436488, "num_input_tokens_seen": 110268810, "step": 5117, "time_per_iteration": 4.6858437061309814 }, { "auxiliary_loss_clip": 0.01169046, "auxiliary_loss_mlp": 0.01047496, "balance_loss_clip": 1.04844332, "balance_loss_mlp": 1.01259446, "epoch": 0.6154031142908676, "flos": 24869792062080.0, "grad_norm": 1.8800265561731722, "language_loss": 0.74364662, "learning_rate": 1.3609642214051266e-06, "loss": 0.76581204, "num_input_tokens_seen": 110293035, "step": 5118, "time_per_iteration": 2.854952812194824 }, { "auxiliary_loss_clip": 0.011615, "auxiliary_loss_mlp": 0.01023465, "balance_loss_clip": 1.05177379, "balance_loss_mlp": 1.01583612, "epoch": 0.6155233571815066, "flos": 19244744357760.0, "grad_norm": 2.3843379274431777, "language_loss": 0.65751338, "learning_rate": 1.3602261317509385e-06, "loss": 0.67936301, "num_input_tokens_seen": 110309695, "step": 5119, "time_per_iteration": 2.6870572566986084 }, { "auxiliary_loss_clip": 0.01170199, "auxiliary_loss_mlp": 0.01023662, "balance_loss_clip": 1.04857075, "balance_loss_mlp": 1.01622963, "epoch": 0.6156436000721457, "flos": 18770077105920.0, "grad_norm": 3.353380043917105, "language_loss": 0.82992017, "learning_rate": 1.3594881391499387e-06, "loss": 0.85185874, "num_input_tokens_seen": 110328610, "step": 5120, "time_per_iteration": 2.7307305335998535 }, { "auxiliary_loss_clip": 0.01169307, "auxiliary_loss_mlp": 0.01025367, "balance_loss_clip": 1.05097055, "balance_loss_mlp": 1.01808643, "epoch": 0.6157638429627849, "flos": 18041198325120.0, "grad_norm": 1.7796979711702674, "language_loss": 0.79180515, "learning_rate": 1.3587502437140778e-06, "loss": 0.81375194, "num_input_tokens_seen": 110346775, "step": 5121, "time_per_iteration": 2.7168161869049072 }, { "auxiliary_loss_clip": 0.01167406, "auxiliary_loss_mlp": 0.01028425, "balance_loss_clip": 1.04723263, "balance_loss_mlp": 1.02049112, "epoch": 0.6158840858534239, "flos": 25556726736000.0, "grad_norm": 21.706752910496014, "language_loss": 0.8510325, "learning_rate": 1.3580124455552952e-06, "loss": 0.87299079, "num_input_tokens_seen": 110366140, "step": 5122, "time_per_iteration": 2.7593133449554443 }, { "auxiliary_loss_clip": 0.01169044, "auxiliary_loss_mlp": 0.01050271, "balance_loss_clip": 1.04946339, "balance_loss_mlp": 1.01577365, "epoch": 0.616004328744063, "flos": 24640788902400.0, "grad_norm": 1.8940564659175818, "language_loss": 0.87391865, "learning_rate": 1.3572747447855148e-06, "loss": 0.89611185, "num_input_tokens_seen": 110386550, "step": 5123, "time_per_iteration": 2.852853536605835 }, { "auxiliary_loss_clip": 0.01177249, "auxiliary_loss_mlp": 0.01021934, "balance_loss_clip": 1.05220747, "balance_loss_mlp": 1.014871, "epoch": 0.6161245716347021, "flos": 21689686379520.0, "grad_norm": 2.539695896438481, "language_loss": 0.69224739, "learning_rate": 1.356537141516644e-06, "loss": 0.71423918, "num_input_tokens_seen": 110403970, "step": 5124, "time_per_iteration": 2.6859323978424072 }, { "auxiliary_loss_clip": 0.01169361, "auxiliary_loss_mlp": 0.01025433, "balance_loss_clip": 1.05081975, "balance_loss_mlp": 1.01795554, "epoch": 0.6162448145253412, "flos": 35189225061120.0, "grad_norm": 2.0006665994411374, "language_loss": 0.61822939, "learning_rate": 1.3557996358605775e-06, "loss": 0.64017737, "num_input_tokens_seen": 110423890, "step": 5125, "time_per_iteration": 3.783829689025879 }, { "auxiliary_loss_clip": 0.01167105, "auxiliary_loss_mlp": 0.01023722, "balance_loss_clip": 1.04834843, "balance_loss_mlp": 1.01676345, "epoch": 0.6163650574159802, "flos": 21615279356160.0, "grad_norm": 2.0667990442153146, "language_loss": 0.69771254, "learning_rate": 1.3550622279291941e-06, "loss": 0.71962082, "num_input_tokens_seen": 110442035, "step": 5126, "time_per_iteration": 2.7598719596862793 }, { "auxiliary_loss_clip": 0.01154683, "auxiliary_loss_mlp": 0.01025912, "balance_loss_clip": 1.05046797, "balance_loss_mlp": 1.01877689, "epoch": 0.6164853003066194, "flos": 24572163968640.0, "grad_norm": 1.4161295417706796, "language_loss": 0.8339287, "learning_rate": 1.354324917834358e-06, "loss": 0.85573471, "num_input_tokens_seen": 110463280, "step": 5127, "time_per_iteration": 2.8224592208862305 }, { "auxiliary_loss_clip": 0.01151879, "auxiliary_loss_mlp": 0.01055735, "balance_loss_clip": 1.04728365, "balance_loss_mlp": 1.01974642, "epoch": 0.6166055431972585, "flos": 21835986474240.0, "grad_norm": 2.1493677895517522, "language_loss": 0.76834667, "learning_rate": 1.353587705687918e-06, "loss": 0.7904228, "num_input_tokens_seen": 110481455, "step": 5128, "time_per_iteration": 2.807833194732666 }, { "auxiliary_loss_clip": 0.0117436, "auxiliary_loss_mlp": 0.0102787, "balance_loss_clip": 1.05362201, "balance_loss_mlp": 1.02038383, "epoch": 0.6167257860878975, "flos": 17785262943360.0, "grad_norm": 2.6568359939745383, "language_loss": 0.72245276, "learning_rate": 1.3528505916017096e-06, "loss": 0.74447507, "num_input_tokens_seen": 110499155, "step": 5129, "time_per_iteration": 2.7168312072753906 }, { "auxiliary_loss_clip": 0.01174114, "auxiliary_loss_mlp": 0.01027284, "balance_loss_clip": 1.05084801, "balance_loss_mlp": 1.02017939, "epoch": 0.6168460289785367, "flos": 23214811898880.0, "grad_norm": 2.29019729371974, "language_loss": 0.88621068, "learning_rate": 1.3521135756875514e-06, "loss": 0.90822464, "num_input_tokens_seen": 110515470, "step": 5130, "time_per_iteration": 2.686204195022583 }, { "auxiliary_loss_clip": 0.01148086, "auxiliary_loss_mlp": 0.01025063, "balance_loss_clip": 1.04775834, "balance_loss_mlp": 1.01814902, "epoch": 0.6169662718691757, "flos": 26213281482240.0, "grad_norm": 1.493300382170906, "language_loss": 0.86272764, "learning_rate": 1.3513766580572496e-06, "loss": 0.88445914, "num_input_tokens_seen": 110538290, "step": 5131, "time_per_iteration": 2.8201701641082764 }, { "auxiliary_loss_clip": 0.01167613, "auxiliary_loss_mlp": 0.01023519, "balance_loss_clip": 1.04887748, "balance_loss_mlp": 1.01653314, "epoch": 0.6170865147598148, "flos": 19026120228480.0, "grad_norm": 2.1103371267142275, "language_loss": 0.77545893, "learning_rate": 1.3506398388225924e-06, "loss": 0.7973702, "num_input_tokens_seen": 110555610, "step": 5132, "time_per_iteration": 2.672624349594116 }, { "auxiliary_loss_clip": 0.01169227, "auxiliary_loss_mlp": 0.01026889, "balance_loss_clip": 1.04904437, "balance_loss_mlp": 1.01990628, "epoch": 0.617206757650454, "flos": 18260361158400.0, "grad_norm": 1.8424854430098272, "language_loss": 0.72276586, "learning_rate": 1.349903118095355e-06, "loss": 0.74472702, "num_input_tokens_seen": 110574745, "step": 5133, "time_per_iteration": 2.6280434131622314 }, { "auxiliary_loss_clip": 0.0117619, "auxiliary_loss_mlp": 0.01024116, "balance_loss_clip": 1.05184078, "balance_loss_mlp": 1.01696599, "epoch": 0.617327000541093, "flos": 18186959715840.0, "grad_norm": 1.7284575165667362, "language_loss": 0.73332554, "learning_rate": 1.349166495987298e-06, "loss": 0.7553286, "num_input_tokens_seen": 110593310, "step": 5134, "time_per_iteration": 2.6924121379852295 }, { "auxiliary_loss_clip": 0.01069755, "auxiliary_loss_mlp": 0.01008996, "balance_loss_clip": 1.01962614, "balance_loss_mlp": 1.00791121, "epoch": 0.6174472434317321, "flos": 61833796122240.0, "grad_norm": 0.8242551717050485, "language_loss": 0.6085304, "learning_rate": 1.348429972610166e-06, "loss": 0.62931788, "num_input_tokens_seen": 110657615, "step": 5135, "time_per_iteration": 3.4070706367492676 }, { "auxiliary_loss_clip": 0.01065321, "auxiliary_loss_mlp": 0.01010542, "balance_loss_clip": 1.01945376, "balance_loss_mlp": 1.00947523, "epoch": 0.6175674863223712, "flos": 71230970494080.0, "grad_norm": 0.8470092278922692, "language_loss": 0.57838786, "learning_rate": 1.3476935480756897e-06, "loss": 0.59914649, "num_input_tokens_seen": 110714365, "step": 5136, "time_per_iteration": 3.1623897552490234 }, { "auxiliary_loss_clip": 0.01159503, "auxiliary_loss_mlp": 0.01029027, "balance_loss_clip": 1.05213249, "balance_loss_mlp": 1.02163029, "epoch": 0.6176877292130103, "flos": 21835447770240.0, "grad_norm": 2.6956284864102837, "language_loss": 0.75029051, "learning_rate": 1.346957222495583e-06, "loss": 0.77217585, "num_input_tokens_seen": 110732160, "step": 5137, "time_per_iteration": 2.7452735900878906 }, { "auxiliary_loss_clip": 0.0117271, "auxiliary_loss_mlp": 0.01059338, "balance_loss_clip": 1.05143344, "balance_loss_mlp": 1.02320373, "epoch": 0.6178079721036493, "flos": 17741738638080.0, "grad_norm": 2.2410939907544645, "language_loss": 0.71269846, "learning_rate": 1.3462209959815466e-06, "loss": 0.73501897, "num_input_tokens_seen": 110746900, "step": 5138, "time_per_iteration": 2.6534342765808105 }, { "auxiliary_loss_clip": 0.01168424, "auxiliary_loss_mlp": 0.01020839, "balance_loss_clip": 1.05098867, "balance_loss_mlp": 1.01410365, "epoch": 0.6179282149942885, "flos": 22633131052800.0, "grad_norm": 2.067825470122615, "language_loss": 0.74496961, "learning_rate": 1.345484868645265e-06, "loss": 0.76686221, "num_input_tokens_seen": 110765710, "step": 5139, "time_per_iteration": 2.7883899211883545 }, { "auxiliary_loss_clip": 0.01169877, "auxiliary_loss_mlp": 0.01027288, "balance_loss_clip": 1.04937935, "balance_loss_mlp": 1.01957548, "epoch": 0.6180484578849276, "flos": 22310330503680.0, "grad_norm": 1.9869852364886427, "language_loss": 0.78489828, "learning_rate": 1.3447488405984088e-06, "loss": 0.80686998, "num_input_tokens_seen": 110783970, "step": 5140, "time_per_iteration": 2.8754422664642334 }, { "auxiliary_loss_clip": 0.01163947, "auxiliary_loss_mlp": 0.01030142, "balance_loss_clip": 1.04904985, "balance_loss_mlp": 1.02217913, "epoch": 0.6181687007755666, "flos": 35225458905600.0, "grad_norm": 2.2044456590832358, "language_loss": 0.7049467, "learning_rate": 1.3440129119526322e-06, "loss": 0.72688758, "num_input_tokens_seen": 110806395, "step": 5141, "time_per_iteration": 2.846125602722168 }, { "auxiliary_loss_clip": 0.01070006, "auxiliary_loss_mlp": 0.01001827, "balance_loss_clip": 1.01404548, "balance_loss_mlp": 1.00086713, "epoch": 0.6182889436662057, "flos": 61547370094080.0, "grad_norm": 0.8582674883367094, "language_loss": 0.51203477, "learning_rate": 1.3432770828195762e-06, "loss": 0.53275311, "num_input_tokens_seen": 110867380, "step": 5142, "time_per_iteration": 4.360620498657227 }, { "auxiliary_loss_clip": 0.01156739, "auxiliary_loss_mlp": 0.0102673, "balance_loss_clip": 1.04891145, "balance_loss_mlp": 1.01917791, "epoch": 0.6184091865568448, "flos": 19609991804160.0, "grad_norm": 2.298256938649086, "language_loss": 0.70099854, "learning_rate": 1.3425413533108635e-06, "loss": 0.72283322, "num_input_tokens_seen": 110885980, "step": 5143, "time_per_iteration": 3.733372449874878 }, { "auxiliary_loss_clip": 0.01162904, "auxiliary_loss_mlp": 0.01027612, "balance_loss_clip": 1.05323577, "balance_loss_mlp": 1.02060008, "epoch": 0.6185294294474839, "flos": 23586882929280.0, "grad_norm": 2.117423401160627, "language_loss": 0.71149731, "learning_rate": 1.341805723538105e-06, "loss": 0.73340237, "num_input_tokens_seen": 110906085, "step": 5144, "time_per_iteration": 3.707808494567871 }, { "auxiliary_loss_clip": 0.01173475, "auxiliary_loss_mlp": 0.01026026, "balance_loss_clip": 1.05044663, "balance_loss_mlp": 1.01850939, "epoch": 0.618649672338123, "flos": 26762032535040.0, "grad_norm": 1.69357263882835, "language_loss": 0.7727471, "learning_rate": 1.3410701936128948e-06, "loss": 0.79474211, "num_input_tokens_seen": 110928865, "step": 5145, "time_per_iteration": 2.813602924346924 }, { "auxiliary_loss_clip": 0.01166511, "auxiliary_loss_mlp": 0.01027108, "balance_loss_clip": 1.04991221, "balance_loss_mlp": 1.01991081, "epoch": 0.6187699152287621, "flos": 14456630522880.0, "grad_norm": 2.468387541924856, "language_loss": 0.8509683, "learning_rate": 1.340334763646812e-06, "loss": 0.87290442, "num_input_tokens_seen": 110943000, "step": 5146, "time_per_iteration": 2.6963183879852295 }, { "auxiliary_loss_clip": 0.01172308, "auxiliary_loss_mlp": 0.01034538, "balance_loss_clip": 1.04863203, "balance_loss_mlp": 1.02682793, "epoch": 0.6188901581194012, "flos": 20084766796800.0, "grad_norm": 1.6755827451073537, "language_loss": 0.74318373, "learning_rate": 1.3395994337514218e-06, "loss": 0.76525223, "num_input_tokens_seen": 110963170, "step": 5147, "time_per_iteration": 2.787755012512207 }, { "auxiliary_loss_clip": 0.01160793, "auxiliary_loss_mlp": 0.01025952, "balance_loss_clip": 1.04690051, "balance_loss_mlp": 1.0186801, "epoch": 0.6190104010100402, "flos": 25700728360320.0, "grad_norm": 1.6313563163069484, "language_loss": 0.78384054, "learning_rate": 1.3388642040382725e-06, "loss": 0.80570799, "num_input_tokens_seen": 110983595, "step": 5148, "time_per_iteration": 2.7328476905822754 }, { "auxiliary_loss_clip": 0.0117088, "auxiliary_loss_mlp": 0.01030833, "balance_loss_clip": 1.05097938, "balance_loss_mlp": 1.02328753, "epoch": 0.6191306439006794, "flos": 30442372974720.0, "grad_norm": 3.057872038228682, "language_loss": 0.8413285, "learning_rate": 1.3381290746188975e-06, "loss": 0.86334562, "num_input_tokens_seen": 111002965, "step": 5149, "time_per_iteration": 2.788074016571045 }, { "auxiliary_loss_clip": 0.01171019, "auxiliary_loss_mlp": 0.01022945, "balance_loss_clip": 1.05268955, "balance_loss_mlp": 1.01554775, "epoch": 0.6192508867913185, "flos": 26685793918080.0, "grad_norm": 1.8107179939936764, "language_loss": 0.67129719, "learning_rate": 1.3373940456048152e-06, "loss": 0.69323683, "num_input_tokens_seen": 111022990, "step": 5150, "time_per_iteration": 2.7401375770568848 }, { "auxiliary_loss_clip": 0.01171228, "auxiliary_loss_mlp": 0.01024916, "balance_loss_clip": 1.05006146, "balance_loss_mlp": 1.01799572, "epoch": 0.6193711296819575, "flos": 36722036090880.0, "grad_norm": 5.3791697272218775, "language_loss": 0.59164584, "learning_rate": 1.3366591171075299e-06, "loss": 0.61360729, "num_input_tokens_seen": 111046495, "step": 5151, "time_per_iteration": 3.7778775691986084 }, { "auxiliary_loss_clip": 0.01162096, "auxiliary_loss_mlp": 0.01025008, "balance_loss_clip": 1.04884815, "balance_loss_mlp": 1.0180676, "epoch": 0.6194913725725967, "flos": 25192556697600.0, "grad_norm": 2.1617784340984882, "language_loss": 0.90952396, "learning_rate": 1.335924289238529e-06, "loss": 0.93139499, "num_input_tokens_seen": 111065705, "step": 5152, "time_per_iteration": 2.7676124572753906 }, { "auxiliary_loss_clip": 0.01168216, "auxiliary_loss_mlp": 0.01055342, "balance_loss_clip": 1.05261195, "balance_loss_mlp": 1.01958394, "epoch": 0.6196116154632357, "flos": 21178821196800.0, "grad_norm": 1.6153820814985533, "language_loss": 0.76982975, "learning_rate": 1.3351895621092859e-06, "loss": 0.79206532, "num_input_tokens_seen": 111086050, "step": 5153, "time_per_iteration": 2.8864424228668213 }, { "auxiliary_loss_clip": 0.01142555, "auxiliary_loss_mlp": 0.01025394, "balance_loss_clip": 1.04874325, "balance_loss_mlp": 1.01813745, "epoch": 0.6197318583538748, "flos": 16253744803200.0, "grad_norm": 3.356238482545759, "language_loss": 0.76486927, "learning_rate": 1.3344549358312567e-06, "loss": 0.78654873, "num_input_tokens_seen": 111104450, "step": 5154, "time_per_iteration": 2.8282365798950195 }, { "auxiliary_loss_clip": 0.01172909, "auxiliary_loss_mlp": 0.01022317, "balance_loss_clip": 1.05081987, "balance_loss_mlp": 1.01472938, "epoch": 0.619852101244514, "flos": 24425612478720.0, "grad_norm": 2.350063927401492, "language_loss": 0.78336442, "learning_rate": 1.3337204105158852e-06, "loss": 0.80531669, "num_input_tokens_seen": 111123320, "step": 5155, "time_per_iteration": 2.7116146087646484 }, { "auxiliary_loss_clip": 0.01152676, "auxiliary_loss_mlp": 0.01027286, "balance_loss_clip": 1.04624987, "balance_loss_mlp": 1.02011597, "epoch": 0.619972344135153, "flos": 16727298733440.0, "grad_norm": 3.332976613116627, "language_loss": 0.73206359, "learning_rate": 1.332985986274597e-06, "loss": 0.75386322, "num_input_tokens_seen": 111140950, "step": 5156, "time_per_iteration": 2.6492226123809814 }, { "auxiliary_loss_clip": 0.01150371, "auxiliary_loss_mlp": 0.0105478, "balance_loss_clip": 1.0505358, "balance_loss_mlp": 1.01920283, "epoch": 0.6200925870257921, "flos": 12495190498560.0, "grad_norm": 1.9270383597582441, "language_loss": 0.75034356, "learning_rate": 1.3322516632188047e-06, "loss": 0.77239501, "num_input_tokens_seen": 111157845, "step": 5157, "time_per_iteration": 2.7782764434814453 }, { "auxiliary_loss_clip": 0.01159079, "auxiliary_loss_mlp": 0.010278, "balance_loss_clip": 1.04785347, "balance_loss_mlp": 1.0197947, "epoch": 0.6202128299164312, "flos": 26539350168960.0, "grad_norm": 2.2206305496261725, "language_loss": 0.66843009, "learning_rate": 1.3315174414599045e-06, "loss": 0.69029886, "num_input_tokens_seen": 111179165, "step": 5158, "time_per_iteration": 2.8385510444641113 }, { "auxiliary_loss_clip": 0.01164623, "auxiliary_loss_mlp": 0.01027079, "balance_loss_clip": 1.04854894, "balance_loss_mlp": 1.02022779, "epoch": 0.6203330728070703, "flos": 18770508069120.0, "grad_norm": 1.928932407963486, "language_loss": 0.75174737, "learning_rate": 1.3307833211092768e-06, "loss": 0.77366436, "num_input_tokens_seen": 111197830, "step": 5159, "time_per_iteration": 2.7205605506896973 }, { "auxiliary_loss_clip": 0.01173077, "auxiliary_loss_mlp": 0.01027243, "balance_loss_clip": 1.0507859, "balance_loss_mlp": 1.01967359, "epoch": 0.6204533156977093, "flos": 20629782835200.0, "grad_norm": 1.6423739927317504, "language_loss": 0.75484407, "learning_rate": 1.3300493022782873e-06, "loss": 0.7768473, "num_input_tokens_seen": 111218400, "step": 5160, "time_per_iteration": 2.7025156021118164 }, { "auxiliary_loss_clip": 0.0114948, "auxiliary_loss_mlp": 0.01050257, "balance_loss_clip": 1.04827189, "balance_loss_mlp": 1.01559806, "epoch": 0.6205735585883485, "flos": 17348050598400.0, "grad_norm": 1.7714494436722321, "language_loss": 0.72605252, "learning_rate": 1.3293153850782855e-06, "loss": 0.74804986, "num_input_tokens_seen": 111236720, "step": 5161, "time_per_iteration": 2.790571928024292 }, { "auxiliary_loss_clip": 0.01154812, "auxiliary_loss_mlp": 0.01029116, "balance_loss_clip": 1.04845428, "balance_loss_mlp": 1.0211587, "epoch": 0.6206938014789876, "flos": 22965017742720.0, "grad_norm": 1.9753293807433367, "language_loss": 0.70967579, "learning_rate": 1.3285815696206069e-06, "loss": 0.73151505, "num_input_tokens_seen": 111258265, "step": 5162, "time_per_iteration": 2.874319553375244 }, { "auxiliary_loss_clip": 0.01164291, "auxiliary_loss_mlp": 0.01029051, "balance_loss_clip": 1.04861403, "balance_loss_mlp": 1.02102816, "epoch": 0.6208140443696266, "flos": 23983192661760.0, "grad_norm": 1.9869985357416704, "language_loss": 0.77060503, "learning_rate": 1.32784785601657e-06, "loss": 0.7925384, "num_input_tokens_seen": 111277675, "step": 5163, "time_per_iteration": 2.754453182220459 }, { "auxiliary_loss_clip": 0.01169663, "auxiliary_loss_mlp": 0.01025223, "balance_loss_clip": 1.04903877, "balance_loss_mlp": 1.01824021, "epoch": 0.6209342872602658, "flos": 35077291303680.0, "grad_norm": 1.9059302447922766, "language_loss": 0.74094766, "learning_rate": 1.3271142443774798e-06, "loss": 0.76289654, "num_input_tokens_seen": 111299910, "step": 5164, "time_per_iteration": 2.8244876861572266 }, { "auxiliary_loss_clip": 0.01162195, "auxiliary_loss_mlp": 0.01023226, "balance_loss_clip": 1.04895794, "balance_loss_mlp": 1.01605821, "epoch": 0.6210545301509048, "flos": 26979327861120.0, "grad_norm": 2.241077923107381, "language_loss": 0.8185066, "learning_rate": 1.3263807348146228e-06, "loss": 0.84036076, "num_input_tokens_seen": 111319765, "step": 5165, "time_per_iteration": 2.703167676925659 }, { "auxiliary_loss_clip": 0.01167959, "auxiliary_loss_mlp": 0.01029261, "balance_loss_clip": 1.05147874, "balance_loss_mlp": 1.02154267, "epoch": 0.6211747730415439, "flos": 33618240852480.0, "grad_norm": 1.9916506292083698, "language_loss": 0.73465681, "learning_rate": 1.3256473274392733e-06, "loss": 0.75662899, "num_input_tokens_seen": 111341110, "step": 5166, "time_per_iteration": 2.8294832706451416 }, { "auxiliary_loss_clip": 0.01172333, "auxiliary_loss_mlp": 0.01024917, "balance_loss_clip": 1.05042613, "balance_loss_mlp": 1.01775277, "epoch": 0.6212950159321831, "flos": 34167099646080.0, "grad_norm": 2.5799740024626856, "language_loss": 0.70191193, "learning_rate": 1.3249140223626873e-06, "loss": 0.72388434, "num_input_tokens_seen": 111362730, "step": 5167, "time_per_iteration": 2.835310459136963 }, { "auxiliary_loss_clip": 0.01164407, "auxiliary_loss_mlp": 0.01024512, "balance_loss_clip": 1.04776955, "balance_loss_mlp": 1.01694512, "epoch": 0.6214152588228221, "flos": 27965758135680.0, "grad_norm": 1.8675074633140332, "language_loss": 0.75449133, "learning_rate": 1.3241808196961077e-06, "loss": 0.77638054, "num_input_tokens_seen": 111383855, "step": 5168, "time_per_iteration": 3.8541417121887207 }, { "auxiliary_loss_clip": 0.01152783, "auxiliary_loss_mlp": 0.01023296, "balance_loss_clip": 1.04743135, "balance_loss_mlp": 1.01648951, "epoch": 0.6215355017134612, "flos": 20230204965120.0, "grad_norm": 1.9371269701474993, "language_loss": 0.70657474, "learning_rate": 1.3234477195507608e-06, "loss": 0.7283355, "num_input_tokens_seen": 111402685, "step": 5169, "time_per_iteration": 3.7797772884368896 }, { "auxiliary_loss_clip": 0.01161779, "auxiliary_loss_mlp": 0.01022079, "balance_loss_clip": 1.04835808, "balance_loss_mlp": 1.01487613, "epoch": 0.6216557446041003, "flos": 41428129219200.0, "grad_norm": 2.237653310663685, "language_loss": 0.62538111, "learning_rate": 1.322714722037857e-06, "loss": 0.64721972, "num_input_tokens_seen": 111424130, "step": 5170, "time_per_iteration": 3.8820855617523193 }, { "auxiliary_loss_clip": 0.011729, "auxiliary_loss_mlp": 0.01039239, "balance_loss_clip": 1.05197704, "balance_loss_mlp": 1.0311687, "epoch": 0.6217759874947394, "flos": 27928770105600.0, "grad_norm": 2.654580685680309, "language_loss": 0.76964295, "learning_rate": 1.321981827268591e-06, "loss": 0.79176438, "num_input_tokens_seen": 111444785, "step": 5171, "time_per_iteration": 2.7780537605285645 }, { "auxiliary_loss_clip": 0.0116891, "auxiliary_loss_mlp": 0.01024563, "balance_loss_clip": 1.04836059, "balance_loss_mlp": 1.01732683, "epoch": 0.6218962303853784, "flos": 21765673601280.0, "grad_norm": 1.7358510499252824, "language_loss": 0.81567621, "learning_rate": 1.3212490353541426e-06, "loss": 0.8376109, "num_input_tokens_seen": 111467045, "step": 5172, "time_per_iteration": 2.71708607673645 }, { "auxiliary_loss_clip": 0.01174182, "auxiliary_loss_mlp": 0.01021277, "balance_loss_clip": 1.04988742, "balance_loss_mlp": 1.01369476, "epoch": 0.6220164732760175, "flos": 21246260981760.0, "grad_norm": 2.0013858411814063, "language_loss": 0.80291641, "learning_rate": 1.3205163464056762e-06, "loss": 0.824871, "num_input_tokens_seen": 111483650, "step": 5173, "time_per_iteration": 2.663607358932495 }, { "auxiliary_loss_clip": 0.01165614, "auxiliary_loss_mlp": 0.01024727, "balance_loss_clip": 1.04797053, "balance_loss_mlp": 1.01749086, "epoch": 0.6221367161666567, "flos": 26136360506880.0, "grad_norm": 1.7805346940992062, "language_loss": 0.72962308, "learning_rate": 1.319783760534339e-06, "loss": 0.75152647, "num_input_tokens_seen": 111502895, "step": 5174, "time_per_iteration": 2.732037305831909 }, { "auxiliary_loss_clip": 0.01172956, "auxiliary_loss_mlp": 0.01027644, "balance_loss_clip": 1.0518074, "balance_loss_mlp": 1.02014303, "epoch": 0.6222569590572957, "flos": 16284196558080.0, "grad_norm": 2.429360064702075, "language_loss": 0.75136518, "learning_rate": 1.319051277851266e-06, "loss": 0.77337122, "num_input_tokens_seen": 111519180, "step": 5175, "time_per_iteration": 2.6319239139556885 }, { "auxiliary_loss_clip": 0.01169835, "auxiliary_loss_mlp": 0.01028613, "balance_loss_clip": 1.04949069, "balance_loss_mlp": 1.02140963, "epoch": 0.6223772019479348, "flos": 18223840005120.0, "grad_norm": 2.0422746013282205, "language_loss": 0.84130895, "learning_rate": 1.3183188984675716e-06, "loss": 0.86329341, "num_input_tokens_seen": 111537545, "step": 5176, "time_per_iteration": 2.7236578464508057 }, { "auxiliary_loss_clip": 0.01161374, "auxiliary_loss_mlp": 0.01028758, "balance_loss_clip": 1.0490818, "balance_loss_mlp": 1.02144432, "epoch": 0.6224974448385739, "flos": 27489797994240.0, "grad_norm": 2.424764424962744, "language_loss": 0.71104264, "learning_rate": 1.3175866224943586e-06, "loss": 0.73294389, "num_input_tokens_seen": 111556265, "step": 5177, "time_per_iteration": 3.7788617610931396 }, { "auxiliary_loss_clip": 0.01171456, "auxiliary_loss_mlp": 0.01023432, "balance_loss_clip": 1.0523572, "balance_loss_mlp": 1.01603234, "epoch": 0.622617687729213, "flos": 19791951125760.0, "grad_norm": 3.853203638574874, "language_loss": 0.73337793, "learning_rate": 1.316854450042712e-06, "loss": 0.75532681, "num_input_tokens_seen": 111574205, "step": 5178, "time_per_iteration": 2.7530736923217773 }, { "auxiliary_loss_clip": 0.01171752, "auxiliary_loss_mlp": 0.01019352, "balance_loss_clip": 1.04871881, "balance_loss_mlp": 1.01217532, "epoch": 0.622737930619852, "flos": 23038886062080.0, "grad_norm": 2.270753810358547, "language_loss": 0.74383205, "learning_rate": 1.3161223812237024e-06, "loss": 0.76574314, "num_input_tokens_seen": 111593560, "step": 5179, "time_per_iteration": 2.795121908187866 }, { "auxiliary_loss_clip": 0.01171291, "auxiliary_loss_mlp": 0.01023693, "balance_loss_clip": 1.04809999, "balance_loss_mlp": 1.01602185, "epoch": 0.6228581735104912, "flos": 12634271959680.0, "grad_norm": 2.243337726046941, "language_loss": 0.85293937, "learning_rate": 1.3153904161483842e-06, "loss": 0.87488925, "num_input_tokens_seen": 111608860, "step": 5180, "time_per_iteration": 2.7321360111236572 }, { "auxiliary_loss_clip": 0.01160018, "auxiliary_loss_mlp": 0.01021574, "balance_loss_clip": 1.04857779, "balance_loss_mlp": 1.01405191, "epoch": 0.6229784164011303, "flos": 23802813538560.0, "grad_norm": 2.6266429691228366, "language_loss": 0.85202706, "learning_rate": 1.3146585549277953e-06, "loss": 0.87384295, "num_input_tokens_seen": 111627500, "step": 5181, "time_per_iteration": 2.8198134899139404 }, { "auxiliary_loss_clip": 0.01173819, "auxiliary_loss_mlp": 0.01031001, "balance_loss_clip": 1.05135679, "balance_loss_mlp": 1.02284133, "epoch": 0.6230986592917693, "flos": 22414219614720.0, "grad_norm": 3.985923913679858, "language_loss": 0.78144312, "learning_rate": 1.3139267976729591e-06, "loss": 0.80349129, "num_input_tokens_seen": 111647690, "step": 5182, "time_per_iteration": 2.68636417388916 }, { "auxiliary_loss_clip": 0.01172731, "auxiliary_loss_mlp": 0.01024265, "balance_loss_clip": 1.05079103, "balance_loss_mlp": 1.01709461, "epoch": 0.6232189021824085, "flos": 34528217028480.0, "grad_norm": 2.1101290094363083, "language_loss": 0.71668071, "learning_rate": 1.3131951444948815e-06, "loss": 0.73865068, "num_input_tokens_seen": 111667090, "step": 5183, "time_per_iteration": 2.78564715385437 }, { "auxiliary_loss_clip": 0.01170861, "auxiliary_loss_mlp": 0.01026709, "balance_loss_clip": 1.05314898, "balance_loss_mlp": 1.01938367, "epoch": 0.6233391450730476, "flos": 22237000888320.0, "grad_norm": 1.829990535543285, "language_loss": 0.76130682, "learning_rate": 1.3124635955045546e-06, "loss": 0.78328252, "num_input_tokens_seen": 111686905, "step": 5184, "time_per_iteration": 2.7565908432006836 }, { "auxiliary_loss_clip": 0.01151926, "auxiliary_loss_mlp": 0.01050657, "balance_loss_clip": 1.04880881, "balance_loss_mlp": 1.0164839, "epoch": 0.6234593879636866, "flos": 20332693445760.0, "grad_norm": 1.7871707347564845, "language_loss": 0.84105021, "learning_rate": 1.3117321508129537e-06, "loss": 0.86307603, "num_input_tokens_seen": 111704985, "step": 5185, "time_per_iteration": 2.8282575607299805 }, { "auxiliary_loss_clip": 0.01167861, "auxiliary_loss_mlp": 0.0102968, "balance_loss_clip": 1.05058038, "balance_loss_mlp": 1.02199674, "epoch": 0.6235796308543258, "flos": 20664903358080.0, "grad_norm": 4.14201896443115, "language_loss": 0.76600862, "learning_rate": 1.3110008105310388e-06, "loss": 0.78798401, "num_input_tokens_seen": 111724805, "step": 5186, "time_per_iteration": 2.7983477115631104 }, { "auxiliary_loss_clip": 0.01171799, "auxiliary_loss_mlp": 0.0102492, "balance_loss_clip": 1.04730678, "balance_loss_mlp": 1.01692653, "epoch": 0.6236998737449648, "flos": 26618641441920.0, "grad_norm": 1.6780260628200319, "language_loss": 0.78048801, "learning_rate": 1.3102695747697526e-06, "loss": 0.80245519, "num_input_tokens_seen": 111747675, "step": 5187, "time_per_iteration": 2.7634034156799316 }, { "auxiliary_loss_clip": 0.01159671, "auxiliary_loss_mlp": 0.01026603, "balance_loss_clip": 1.05269742, "balance_loss_mlp": 1.0190866, "epoch": 0.6238201166356039, "flos": 12674599954560.0, "grad_norm": 2.5715663176894568, "language_loss": 0.90829974, "learning_rate": 1.3095384436400237e-06, "loss": 0.93016249, "num_input_tokens_seen": 111759205, "step": 5188, "time_per_iteration": 2.869889736175537 }, { "auxiliary_loss_clip": 0.0117417, "auxiliary_loss_mlp": 0.01024101, "balance_loss_clip": 1.04966784, "balance_loss_mlp": 1.01688015, "epoch": 0.623940359526243, "flos": 10452160730880.0, "grad_norm": 2.4174571486861756, "language_loss": 0.82444191, "learning_rate": 1.3088074172527633e-06, "loss": 0.84642464, "num_input_tokens_seen": 111776335, "step": 5189, "time_per_iteration": 2.686478853225708 }, { "auxiliary_loss_clip": 0.0116715, "auxiliary_loss_mlp": 0.01023007, "balance_loss_clip": 1.0466423, "balance_loss_mlp": 1.01506734, "epoch": 0.6240606024168821, "flos": 29059525226880.0, "grad_norm": 2.325204038214039, "language_loss": 0.71551573, "learning_rate": 1.3080764957188684e-06, "loss": 0.73741734, "num_input_tokens_seen": 111796580, "step": 5190, "time_per_iteration": 2.7806241512298584 }, { "auxiliary_loss_clip": 0.01165195, "auxiliary_loss_mlp": 0.01025841, "balance_loss_clip": 1.05104887, "balance_loss_mlp": 1.01798487, "epoch": 0.6241808453075212, "flos": 22018089450240.0, "grad_norm": 1.8565616290125124, "language_loss": 0.70896524, "learning_rate": 1.3073456791492192e-06, "loss": 0.73087561, "num_input_tokens_seen": 111816290, "step": 5191, "time_per_iteration": 2.718845844268799 }, { "auxiliary_loss_clip": 0.011659, "auxiliary_loss_mlp": 0.01024479, "balance_loss_clip": 1.04792142, "balance_loss_mlp": 1.01729095, "epoch": 0.6243010881981603, "flos": 21138708683520.0, "grad_norm": 3.0334889791814907, "language_loss": 0.7795673, "learning_rate": 1.3066149676546801e-06, "loss": 0.80147111, "num_input_tokens_seen": 111834470, "step": 5192, "time_per_iteration": 2.823218822479248 }, { "auxiliary_loss_clip": 0.01161804, "auxiliary_loss_mlp": 0.0102399, "balance_loss_clip": 1.04998589, "balance_loss_mlp": 1.01734662, "epoch": 0.6244213310887994, "flos": 22344948236160.0, "grad_norm": 1.8454002931748112, "language_loss": 0.66639209, "learning_rate": 1.3058843613460985e-06, "loss": 0.68825001, "num_input_tokens_seen": 111852410, "step": 5193, "time_per_iteration": 2.8022563457489014 }, { "auxiliary_loss_clip": 0.01171095, "auxiliary_loss_mlp": 0.01023888, "balance_loss_clip": 1.04947948, "balance_loss_mlp": 1.01666975, "epoch": 0.6245415739794384, "flos": 15231978524160.0, "grad_norm": 1.9921632576038502, "language_loss": 0.74508578, "learning_rate": 1.3051538603343075e-06, "loss": 0.7670356, "num_input_tokens_seen": 111870340, "step": 5194, "time_per_iteration": 3.8791441917419434 }, { "auxiliary_loss_clip": 0.01169481, "auxiliary_loss_mlp": 0.01023298, "balance_loss_clip": 1.05093455, "balance_loss_mlp": 1.01560879, "epoch": 0.6246618168700776, "flos": 18879891960960.0, "grad_norm": 2.2269150893658405, "language_loss": 0.67724895, "learning_rate": 1.3044234647301235e-06, "loss": 0.69917673, "num_input_tokens_seen": 111888365, "step": 5195, "time_per_iteration": 3.676015853881836 }, { "auxiliary_loss_clip": 0.01163227, "auxiliary_loss_mlp": 0.01024727, "balance_loss_clip": 1.0469842, "balance_loss_mlp": 1.01784301, "epoch": 0.6247820597607167, "flos": 14319201087360.0, "grad_norm": 1.7895773932602694, "language_loss": 0.72347784, "learning_rate": 1.303693174644347e-06, "loss": 0.74535739, "num_input_tokens_seen": 111905840, "step": 5196, "time_per_iteration": 3.525921583175659 }, { "auxiliary_loss_clip": 0.01162873, "auxiliary_loss_mlp": 0.01034862, "balance_loss_clip": 1.05003536, "balance_loss_mlp": 1.02750659, "epoch": 0.6249023026513557, "flos": 22637979388800.0, "grad_norm": 3.489085703033225, "language_loss": 0.80294824, "learning_rate": 1.3029629901877625e-06, "loss": 0.8249256, "num_input_tokens_seen": 111925215, "step": 5197, "time_per_iteration": 2.829310894012451 }, { "auxiliary_loss_clip": 0.01177867, "auxiliary_loss_mlp": 0.0102722, "balance_loss_clip": 1.05205083, "balance_loss_mlp": 1.02008867, "epoch": 0.6250225455419949, "flos": 20266690204800.0, "grad_norm": 23.802352919966953, "language_loss": 0.77601379, "learning_rate": 1.3022329114711376e-06, "loss": 0.79806465, "num_input_tokens_seen": 111943925, "step": 5198, "time_per_iteration": 2.6934750080108643 }, { "auxiliary_loss_clip": 0.01162967, "auxiliary_loss_mlp": 0.01020852, "balance_loss_clip": 1.0514822, "balance_loss_mlp": 1.01392853, "epoch": 0.6251427884326339, "flos": 23437853400960.0, "grad_norm": 1.8026933685541702, "language_loss": 0.69300646, "learning_rate": 1.3015029386052256e-06, "loss": 0.7148447, "num_input_tokens_seen": 111964095, "step": 5199, "time_per_iteration": 2.6970324516296387 }, { "auxiliary_loss_clip": 0.01170926, "auxiliary_loss_mlp": 0.01025139, "balance_loss_clip": 1.0494715, "balance_loss_mlp": 1.01792669, "epoch": 0.625263031323273, "flos": 31723055464320.0, "grad_norm": 1.9283926866138459, "language_loss": 0.72923851, "learning_rate": 1.3007730717007622e-06, "loss": 0.75119919, "num_input_tokens_seen": 111984910, "step": 5200, "time_per_iteration": 2.799128532409668 }, { "auxiliary_loss_clip": 0.01176013, "auxiliary_loss_mlp": 0.01026567, "balance_loss_clip": 1.05069089, "balance_loss_mlp": 1.01852643, "epoch": 0.6253832742139122, "flos": 24134341092480.0, "grad_norm": 1.9106278370694723, "language_loss": 0.75622547, "learning_rate": 1.3000433108684676e-06, "loss": 0.77825129, "num_input_tokens_seen": 112005410, "step": 5201, "time_per_iteration": 2.7099127769470215 }, { "auxiliary_loss_clip": 0.01165068, "auxiliary_loss_mlp": 0.01025057, "balance_loss_clip": 1.04898238, "balance_loss_mlp": 1.01766324, "epoch": 0.6255035171045512, "flos": 27668812400640.0, "grad_norm": 3.40905465112962, "language_loss": 0.80743259, "learning_rate": 1.2993136562190467e-06, "loss": 0.82933378, "num_input_tokens_seen": 112024530, "step": 5202, "time_per_iteration": 2.7026138305664062 }, { "auxiliary_loss_clip": 0.01168749, "auxiliary_loss_mlp": 0.01025955, "balance_loss_clip": 1.04956985, "balance_loss_mlp": 1.01905584, "epoch": 0.6256237599951903, "flos": 20227798753920.0, "grad_norm": 1.4706235127432292, "language_loss": 0.7015388, "learning_rate": 1.2985841078631871e-06, "loss": 0.72348583, "num_input_tokens_seen": 112043850, "step": 5203, "time_per_iteration": 3.538416862487793 }, { "auxiliary_loss_clip": 0.01154346, "auxiliary_loss_mlp": 0.01021517, "balance_loss_clip": 1.04776096, "balance_loss_mlp": 1.01406097, "epoch": 0.6257440028858293, "flos": 24170574936960.0, "grad_norm": 2.3461488615923582, "language_loss": 0.7810117, "learning_rate": 1.2978546659115608e-06, "loss": 0.80277038, "num_input_tokens_seen": 112061930, "step": 5204, "time_per_iteration": 2.627227544784546 }, { "auxiliary_loss_clip": 0.01169753, "auxiliary_loss_mlp": 0.01025666, "balance_loss_clip": 1.05067217, "balance_loss_mlp": 1.01874936, "epoch": 0.6258642457764685, "flos": 15851940289920.0, "grad_norm": 1.8812018460883932, "language_loss": 0.85609365, "learning_rate": 1.2971253304748228e-06, "loss": 0.87804788, "num_input_tokens_seen": 112079645, "step": 5205, "time_per_iteration": 2.576279640197754 }, { "auxiliary_loss_clip": 0.0117216, "auxiliary_loss_mlp": 0.01026459, "balance_loss_clip": 1.05103016, "balance_loss_mlp": 1.01874042, "epoch": 0.6259844886671075, "flos": 11911354836480.0, "grad_norm": 1.8853959244199392, "language_loss": 0.75072879, "learning_rate": 1.296396101663614e-06, "loss": 0.77271497, "num_input_tokens_seen": 112096205, "step": 5206, "time_per_iteration": 2.4940693378448486 }, { "auxiliary_loss_clip": 0.01170658, "auxiliary_loss_mlp": 0.01025528, "balance_loss_clip": 1.05004478, "balance_loss_mlp": 1.01810157, "epoch": 0.6261047315577466, "flos": 15887958652800.0, "grad_norm": 2.023706451650895, "language_loss": 0.84345406, "learning_rate": 1.2956669795885565e-06, "loss": 0.86541587, "num_input_tokens_seen": 112112835, "step": 5207, "time_per_iteration": 2.5724034309387207 }, { "auxiliary_loss_clip": 0.01161996, "auxiliary_loss_mlp": 0.01026446, "balance_loss_clip": 1.05519056, "balance_loss_mlp": 1.01908791, "epoch": 0.6262249744483858, "flos": 31248926916480.0, "grad_norm": 1.8088877939309433, "language_loss": 0.6791051, "learning_rate": 1.294937964360259e-06, "loss": 0.70098948, "num_input_tokens_seen": 112133105, "step": 5208, "time_per_iteration": 2.61745285987854 }, { "auxiliary_loss_clip": 0.01170528, "auxiliary_loss_mlp": 0.0102807, "balance_loss_clip": 1.04934609, "balance_loss_mlp": 1.02027965, "epoch": 0.6263452173390248, "flos": 27198598435200.0, "grad_norm": 4.300054957879713, "language_loss": 0.71125323, "learning_rate": 1.2942090560893108e-06, "loss": 0.73323917, "num_input_tokens_seen": 112152510, "step": 5209, "time_per_iteration": 2.6576614379882812 }, { "auxiliary_loss_clip": 0.01173018, "auxiliary_loss_mlp": 0.01021423, "balance_loss_clip": 1.05083942, "balance_loss_mlp": 1.01440418, "epoch": 0.6264654602296639, "flos": 37342069683840.0, "grad_norm": 2.848314103438494, "language_loss": 0.60278702, "learning_rate": 1.2934802548862882e-06, "loss": 0.62473142, "num_input_tokens_seen": 112175295, "step": 5210, "time_per_iteration": 2.896669626235962 }, { "auxiliary_loss_clip": 0.01160453, "auxiliary_loss_mlp": 0.01022941, "balance_loss_clip": 1.04618645, "balance_loss_mlp": 1.01568127, "epoch": 0.626585703120303, "flos": 14756952136320.0, "grad_norm": 1.947796460105621, "language_loss": 0.83148903, "learning_rate": 1.292751560861749e-06, "loss": 0.85332298, "num_input_tokens_seen": 112190200, "step": 5211, "time_per_iteration": 2.771702527999878 }, { "auxiliary_loss_clip": 0.01174258, "auxiliary_loss_mlp": 0.01030484, "balance_loss_clip": 1.0499934, "balance_loss_mlp": 1.02302468, "epoch": 0.6267059460109421, "flos": 22347318533760.0, "grad_norm": 1.704262842201106, "language_loss": 0.79412681, "learning_rate": 1.2920229741262354e-06, "loss": 0.81617421, "num_input_tokens_seen": 112208205, "step": 5212, "time_per_iteration": 2.7621688842773438 }, { "auxiliary_loss_clip": 0.0116663, "auxiliary_loss_mlp": 0.01027163, "balance_loss_clip": 1.04863524, "balance_loss_mlp": 1.02030849, "epoch": 0.6268261889015811, "flos": 17748813617280.0, "grad_norm": 2.0154621100006382, "language_loss": 0.75416136, "learning_rate": 1.2912944947902739e-06, "loss": 0.77609926, "num_input_tokens_seen": 112224690, "step": 5213, "time_per_iteration": 2.791013240814209 }, { "auxiliary_loss_clip": 0.01172387, "auxiliary_loss_mlp": 0.01026937, "balance_loss_clip": 1.05043685, "balance_loss_mlp": 1.01936173, "epoch": 0.6269464317922203, "flos": 32846484211200.0, "grad_norm": 2.3151686490264884, "language_loss": 0.71875012, "learning_rate": 1.2905661229643742e-06, "loss": 0.7407434, "num_input_tokens_seen": 112244450, "step": 5214, "time_per_iteration": 2.921180248260498 }, { "auxiliary_loss_clip": 0.01170945, "auxiliary_loss_mlp": 0.01024422, "balance_loss_clip": 1.04768789, "balance_loss_mlp": 1.01732934, "epoch": 0.6270666746828594, "flos": 17929192740480.0, "grad_norm": 2.2589799654635514, "language_loss": 0.84727526, "learning_rate": 1.2898378587590299e-06, "loss": 0.86922896, "num_input_tokens_seen": 112261050, "step": 5215, "time_per_iteration": 2.6638426780700684 }, { "auxiliary_loss_clip": 0.01164807, "auxiliary_loss_mlp": 0.0102299, "balance_loss_clip": 1.04882193, "balance_loss_mlp": 1.01586449, "epoch": 0.6271869175734984, "flos": 17457326749440.0, "grad_norm": 1.989471211715973, "language_loss": 0.87294948, "learning_rate": 1.2891097022847173e-06, "loss": 0.89482749, "num_input_tokens_seen": 112278395, "step": 5216, "time_per_iteration": 2.684494733810425 }, { "auxiliary_loss_clip": 0.01166876, "auxiliary_loss_mlp": 0.01023995, "balance_loss_clip": 1.04916191, "balance_loss_mlp": 1.0162344, "epoch": 0.6273071604641376, "flos": 26868615166080.0, "grad_norm": 2.173560602544189, "language_loss": 0.66766989, "learning_rate": 1.2883816536518978e-06, "loss": 0.68957865, "num_input_tokens_seen": 112299535, "step": 5217, "time_per_iteration": 2.7797017097473145 }, { "auxiliary_loss_clip": 0.01164399, "auxiliary_loss_mlp": 0.0102691, "balance_loss_clip": 1.04780614, "balance_loss_mlp": 1.0196383, "epoch": 0.6274274033547766, "flos": 26062384446720.0, "grad_norm": 1.6995556846993192, "language_loss": 0.81531358, "learning_rate": 1.2876537129710155e-06, "loss": 0.83722663, "num_input_tokens_seen": 112317265, "step": 5218, "time_per_iteration": 2.7842984199523926 }, { "auxiliary_loss_clip": 0.01162242, "auxiliary_loss_mlp": 0.01026052, "balance_loss_clip": 1.05100858, "balance_loss_mlp": 1.01815486, "epoch": 0.6275476462454157, "flos": 20266259241600.0, "grad_norm": 2.08025012984182, "language_loss": 0.75393069, "learning_rate": 1.286925880352499e-06, "loss": 0.77581364, "num_input_tokens_seen": 112336125, "step": 5219, "time_per_iteration": 2.7108867168426514 }, { "auxiliary_loss_clip": 0.01161513, "auxiliary_loss_mlp": 0.01026899, "balance_loss_clip": 1.04804492, "balance_loss_mlp": 1.0192337, "epoch": 0.6276678891360549, "flos": 26320402817280.0, "grad_norm": 1.7080509505158143, "language_loss": 0.71142143, "learning_rate": 1.2861981559067592e-06, "loss": 0.73330557, "num_input_tokens_seen": 112356730, "step": 5220, "time_per_iteration": 3.744854688644409 }, { "auxiliary_loss_clip": 0.01151667, "auxiliary_loss_mlp": 0.01020001, "balance_loss_clip": 1.04807127, "balance_loss_mlp": 1.0134269, "epoch": 0.6277881320266939, "flos": 13912512324480.0, "grad_norm": 2.3520117187947154, "language_loss": 0.80718017, "learning_rate": 1.2854705397441917e-06, "loss": 0.82889688, "num_input_tokens_seen": 112372270, "step": 5221, "time_per_iteration": 3.699821710586548 }, { "auxiliary_loss_clip": 0.01157239, "auxiliary_loss_mlp": 0.01022261, "balance_loss_clip": 1.04792142, "balance_loss_mlp": 1.01514387, "epoch": 0.627908374917333, "flos": 27048922462080.0, "grad_norm": 2.2985174359510205, "language_loss": 0.77753484, "learning_rate": 1.2847430319751747e-06, "loss": 0.79932988, "num_input_tokens_seen": 112390365, "step": 5222, "time_per_iteration": 3.7144229412078857 }, { "auxiliary_loss_clip": 0.01164734, "auxiliary_loss_mlp": 0.01031993, "balance_loss_clip": 1.05114293, "balance_loss_mlp": 1.02510881, "epoch": 0.6280286178079721, "flos": 23769201386880.0, "grad_norm": 2.2654980069497017, "language_loss": 0.67330211, "learning_rate": 1.2840156327100712e-06, "loss": 0.69526935, "num_input_tokens_seen": 112407490, "step": 5223, "time_per_iteration": 2.6865408420562744 }, { "auxiliary_loss_clip": 0.01173007, "auxiliary_loss_mlp": 0.01025573, "balance_loss_clip": 1.05107093, "balance_loss_mlp": 1.01857495, "epoch": 0.6281488606986112, "flos": 26359150613760.0, "grad_norm": 1.7528972680626036, "language_loss": 0.72826636, "learning_rate": 1.2832883420592272e-06, "loss": 0.75025213, "num_input_tokens_seen": 112426385, "step": 5224, "time_per_iteration": 2.7055983543395996 }, { "auxiliary_loss_clip": 0.01162295, "auxiliary_loss_mlp": 0.0102386, "balance_loss_clip": 1.05079424, "balance_loss_mlp": 1.01669312, "epoch": 0.6282691035892503, "flos": 36137194848000.0, "grad_norm": 2.175687270252353, "language_loss": 0.64249957, "learning_rate": 1.282561160132972e-06, "loss": 0.66436112, "num_input_tokens_seen": 112446905, "step": 5225, "time_per_iteration": 2.7810349464416504 }, { "auxiliary_loss_clip": 0.01172168, "auxiliary_loss_mlp": 0.01023408, "balance_loss_clip": 1.04801273, "balance_loss_mlp": 1.01563001, "epoch": 0.6283893464798894, "flos": 26537231266560.0, "grad_norm": 1.6308536370669602, "language_loss": 0.81100655, "learning_rate": 1.2818340870416186e-06, "loss": 0.83296227, "num_input_tokens_seen": 112468040, "step": 5226, "time_per_iteration": 2.746347188949585 }, { "auxiliary_loss_clip": 0.01168315, "auxiliary_loss_mlp": 0.01023846, "balance_loss_clip": 1.0475626, "balance_loss_mlp": 1.01622868, "epoch": 0.6285095893705285, "flos": 22237216369920.0, "grad_norm": 7.327025379288335, "language_loss": 0.75888062, "learning_rate": 1.2811071228954626e-06, "loss": 0.78080225, "num_input_tokens_seen": 112486675, "step": 5227, "time_per_iteration": 2.6893601417541504 }, { "auxiliary_loss_clip": 0.01163468, "auxiliary_loss_mlp": 0.01022453, "balance_loss_clip": 1.04872978, "balance_loss_mlp": 1.01522636, "epoch": 0.6286298322611675, "flos": 26542259170560.0, "grad_norm": 1.9630268434581994, "language_loss": 0.80719304, "learning_rate": 1.2803802678047846e-06, "loss": 0.82905233, "num_input_tokens_seen": 112506825, "step": 5228, "time_per_iteration": 2.7350361347198486 }, { "auxiliary_loss_clip": 0.01169611, "auxiliary_loss_mlp": 0.0102698, "balance_loss_clip": 1.05053496, "balance_loss_mlp": 1.01907682, "epoch": 0.6287500751518067, "flos": 21795227516160.0, "grad_norm": 1.7126062563667626, "language_loss": 0.73992348, "learning_rate": 1.279653521879848e-06, "loss": 0.7618894, "num_input_tokens_seen": 112526890, "step": 5229, "time_per_iteration": 3.6641128063201904 }, { "auxiliary_loss_clip": 0.01144195, "auxiliary_loss_mlp": 0.01027617, "balance_loss_clip": 1.04661059, "balance_loss_mlp": 1.02130759, "epoch": 0.6288703180424458, "flos": 20009605587840.0, "grad_norm": 1.9543116433575376, "language_loss": 0.83583671, "learning_rate": 1.2789268852308997e-06, "loss": 0.85755491, "num_input_tokens_seen": 112542100, "step": 5230, "time_per_iteration": 2.7585692405700684 }, { "auxiliary_loss_clip": 0.01165049, "auxiliary_loss_mlp": 0.01022464, "balance_loss_clip": 1.05170739, "balance_loss_mlp": 1.01537108, "epoch": 0.6289905609330848, "flos": 22124923476480.0, "grad_norm": 3.1750926864087083, "language_loss": 0.70926571, "learning_rate": 1.2782003579681688e-06, "loss": 0.73114079, "num_input_tokens_seen": 112561630, "step": 5231, "time_per_iteration": 2.7643470764160156 }, { "auxiliary_loss_clip": 0.0117348, "auxiliary_loss_mlp": 0.01028019, "balance_loss_clip": 1.05053306, "balance_loss_mlp": 1.020944, "epoch": 0.629110803823724, "flos": 25518481729920.0, "grad_norm": 2.0946996902532176, "language_loss": 0.74454659, "learning_rate": 1.2774739402018701e-06, "loss": 0.76656157, "num_input_tokens_seen": 112582465, "step": 5232, "time_per_iteration": 2.807224750518799 }, { "auxiliary_loss_clip": 0.01165196, "auxiliary_loss_mlp": 0.01025255, "balance_loss_clip": 1.04919314, "balance_loss_mlp": 1.01760221, "epoch": 0.629231046714363, "flos": 20886616056960.0, "grad_norm": 1.6871082694918917, "language_loss": 0.73174608, "learning_rate": 1.2767476320422002e-06, "loss": 0.75365055, "num_input_tokens_seen": 112602390, "step": 5233, "time_per_iteration": 2.772840976715088 }, { "auxiliary_loss_clip": 0.01074114, "auxiliary_loss_mlp": 0.01001162, "balance_loss_clip": 1.01562691, "balance_loss_mlp": 0.99986231, "epoch": 0.6293512896050021, "flos": 65050027908480.0, "grad_norm": 0.6765276548934678, "language_loss": 0.57223725, "learning_rate": 1.2760214335993392e-06, "loss": 0.59298998, "num_input_tokens_seen": 112669035, "step": 5234, "time_per_iteration": 3.328713893890381 }, { "auxiliary_loss_clip": 0.01160487, "auxiliary_loss_mlp": 0.01031675, "balance_loss_clip": 1.04721737, "balance_loss_mlp": 1.02481449, "epoch": 0.6294715324956413, "flos": 34677857088000.0, "grad_norm": 1.86227315670544, "language_loss": 0.58709985, "learning_rate": 1.2752953449834514e-06, "loss": 0.60902148, "num_input_tokens_seen": 112691485, "step": 5235, "time_per_iteration": 2.8747451305389404 }, { "auxiliary_loss_clip": 0.01167152, "auxiliary_loss_mlp": 0.0102551, "balance_loss_clip": 1.04725337, "balance_loss_mlp": 1.01903987, "epoch": 0.6295917753862803, "flos": 22784207656320.0, "grad_norm": 2.084967219241017, "language_loss": 0.80100197, "learning_rate": 1.2745693663046836e-06, "loss": 0.82292861, "num_input_tokens_seen": 112710555, "step": 5236, "time_per_iteration": 2.6910574436187744 }, { "auxiliary_loss_clip": 0.01160567, "auxiliary_loss_mlp": 0.01023658, "balance_loss_clip": 1.04635143, "balance_loss_mlp": 1.01704454, "epoch": 0.6297120182769194, "flos": 20850454039680.0, "grad_norm": 1.7777079731174028, "language_loss": 0.81219155, "learning_rate": 1.2738434976731662e-06, "loss": 0.83403373, "num_input_tokens_seen": 112728740, "step": 5237, "time_per_iteration": 2.73518705368042 }, { "auxiliary_loss_clip": 0.01169108, "auxiliary_loss_mlp": 0.01025946, "balance_loss_clip": 1.05379522, "balance_loss_mlp": 1.0184536, "epoch": 0.6298322611675584, "flos": 19497662997120.0, "grad_norm": 1.5063285806323095, "language_loss": 0.75331914, "learning_rate": 1.2731177391990125e-06, "loss": 0.77526969, "num_input_tokens_seen": 112748665, "step": 5238, "time_per_iteration": 2.6784775257110596 }, { "auxiliary_loss_clip": 0.01163704, "auxiliary_loss_mlp": 0.01023872, "balance_loss_clip": 1.04592347, "balance_loss_mlp": 1.01692462, "epoch": 0.6299525040581976, "flos": 12604466649600.0, "grad_norm": 2.3649842062861386, "language_loss": 0.81639475, "learning_rate": 1.2723920909923203e-06, "loss": 0.83827049, "num_input_tokens_seen": 112764410, "step": 5239, "time_per_iteration": 2.635187864303589 }, { "auxiliary_loss_clip": 0.01072322, "auxiliary_loss_mlp": 0.0100103, "balance_loss_clip": 1.01527441, "balance_loss_mlp": 0.99994481, "epoch": 0.6300727469488366, "flos": 57725685636480.0, "grad_norm": 0.8601213634151242, "language_loss": 0.60451245, "learning_rate": 1.2716665531631688e-06, "loss": 0.62524605, "num_input_tokens_seen": 112818695, "step": 5240, "time_per_iteration": 3.1944921016693115 }, { "auxiliary_loss_clip": 0.0117307, "auxiliary_loss_mlp": 0.01024589, "balance_loss_clip": 1.04837918, "balance_loss_mlp": 1.01731443, "epoch": 0.6301929898394757, "flos": 22527302607360.0, "grad_norm": 1.6987950291819598, "language_loss": 0.77150887, "learning_rate": 1.270941125821623e-06, "loss": 0.79348546, "num_input_tokens_seen": 112839120, "step": 5241, "time_per_iteration": 2.6779866218566895 }, { "auxiliary_loss_clip": 0.01164604, "auxiliary_loss_mlp": 0.010234, "balance_loss_clip": 1.04721069, "balance_loss_mlp": 1.01630092, "epoch": 0.6303132327301149, "flos": 28293550675200.0, "grad_norm": 1.899218917872766, "language_loss": 0.75626874, "learning_rate": 1.2702158090777278e-06, "loss": 0.77814877, "num_input_tokens_seen": 112860210, "step": 5242, "time_per_iteration": 2.7267611026763916 }, { "auxiliary_loss_clip": 0.01160055, "auxiliary_loss_mlp": 0.01025688, "balance_loss_clip": 1.04924703, "balance_loss_mlp": 1.0183804, "epoch": 0.6304334756207539, "flos": 25264521596160.0, "grad_norm": 1.8735121902435914, "language_loss": 0.7483139, "learning_rate": 1.2694906030415148e-06, "loss": 0.7701714, "num_input_tokens_seen": 112877955, "step": 5243, "time_per_iteration": 2.7217042446136475 }, { "auxiliary_loss_clip": 0.01172909, "auxiliary_loss_mlp": 0.01027802, "balance_loss_clip": 1.04966998, "balance_loss_mlp": 1.02004719, "epoch": 0.630553718511393, "flos": 18033548728320.0, "grad_norm": 2.852831972554444, "language_loss": 0.81980342, "learning_rate": 1.2687655078229958e-06, "loss": 0.84181052, "num_input_tokens_seen": 112892285, "step": 5244, "time_per_iteration": 2.754316806793213 }, { "auxiliary_loss_clip": 0.01164217, "auxiliary_loss_mlp": 0.01027315, "balance_loss_clip": 1.05050993, "balance_loss_mlp": 1.01980519, "epoch": 0.6306739614020321, "flos": 27304103658240.0, "grad_norm": 4.312400656894653, "language_loss": 0.69134843, "learning_rate": 1.2680405235321678e-06, "loss": 0.71326369, "num_input_tokens_seen": 112913620, "step": 5245, "time_per_iteration": 2.8060739040374756 }, { "auxiliary_loss_clip": 0.01169762, "auxiliary_loss_mlp": 0.01059025, "balance_loss_clip": 1.05225384, "balance_loss_mlp": 1.02317715, "epoch": 0.6307942042926712, "flos": 15341434243200.0, "grad_norm": 2.2248974392315577, "language_loss": 0.78878188, "learning_rate": 1.267315650279011e-06, "loss": 0.81106979, "num_input_tokens_seen": 112932090, "step": 5246, "time_per_iteration": 3.6573262214660645 }, { "auxiliary_loss_clip": 0.0115743, "auxiliary_loss_mlp": 0.01022272, "balance_loss_clip": 1.04958534, "balance_loss_mlp": 1.01551604, "epoch": 0.6309144471833102, "flos": 19606400444160.0, "grad_norm": 1.9916280495955445, "language_loss": 0.74500918, "learning_rate": 1.2665908881734874e-06, "loss": 0.76680624, "num_input_tokens_seen": 112950925, "step": 5247, "time_per_iteration": 3.8048644065856934 }, { "auxiliary_loss_clip": 0.01167107, "auxiliary_loss_mlp": 0.01028152, "balance_loss_clip": 1.04930711, "balance_loss_mlp": 1.02145231, "epoch": 0.6310346900739494, "flos": 17493345112320.0, "grad_norm": 1.9790501976244028, "language_loss": 0.84682077, "learning_rate": 1.2658662373255432e-06, "loss": 0.8687734, "num_input_tokens_seen": 112969315, "step": 5248, "time_per_iteration": 3.7830893993377686 }, { "auxiliary_loss_clip": 0.01069427, "auxiliary_loss_mlp": 0.01000534, "balance_loss_clip": 1.01540613, "balance_loss_mlp": 0.99941915, "epoch": 0.6311549329645885, "flos": 55070164131840.0, "grad_norm": 0.7114332203502575, "language_loss": 0.5227896, "learning_rate": 1.2651416978451063e-06, "loss": 0.54348922, "num_input_tokens_seen": 113034700, "step": 5249, "time_per_iteration": 3.3856704235076904 }, { "auxiliary_loss_clip": 0.01174848, "auxiliary_loss_mlp": 0.01026376, "balance_loss_clip": 1.05092633, "balance_loss_mlp": 1.01917315, "epoch": 0.6312751758552275, "flos": 41902545075840.0, "grad_norm": 2.7900605828140352, "language_loss": 0.65337259, "learning_rate": 1.2644172698420903e-06, "loss": 0.67538476, "num_input_tokens_seen": 113056805, "step": 5250, "time_per_iteration": 2.9885001182556152 }, { "auxiliary_loss_clip": 0.01160704, "auxiliary_loss_mlp": 0.0102898, "balance_loss_clip": 1.04767132, "balance_loss_mlp": 1.02144015, "epoch": 0.6313954187458667, "flos": 19646800266240.0, "grad_norm": 1.8274642863518809, "language_loss": 0.84788227, "learning_rate": 1.2636929534263892e-06, "loss": 0.86977911, "num_input_tokens_seen": 113075790, "step": 5251, "time_per_iteration": 2.921908378601074 }, { "auxiliary_loss_clip": 0.01165571, "auxiliary_loss_mlp": 0.01024821, "balance_loss_clip": 1.0474087, "balance_loss_mlp": 1.01754677, "epoch": 0.6315156616365057, "flos": 22894273906560.0, "grad_norm": 1.7733771439544188, "language_loss": 0.78062046, "learning_rate": 1.2629687487078821e-06, "loss": 0.80252433, "num_input_tokens_seen": 113094600, "step": 5252, "time_per_iteration": 2.8702495098114014 }, { "auxiliary_loss_clip": 0.01170455, "auxiliary_loss_mlp": 0.0102782, "balance_loss_clip": 1.04758024, "balance_loss_mlp": 1.02016425, "epoch": 0.6316359045271448, "flos": 23726251699200.0, "grad_norm": 4.091111679829598, "language_loss": 0.76932156, "learning_rate": 1.2622446557964293e-06, "loss": 0.79130429, "num_input_tokens_seen": 113112605, "step": 5253, "time_per_iteration": 2.7292754650115967 }, { "auxiliary_loss_clip": 0.01164415, "auxiliary_loss_mlp": 0.01025986, "balance_loss_clip": 1.04569268, "balance_loss_mlp": 1.01899409, "epoch": 0.631756147417784, "flos": 33108417164160.0, "grad_norm": 1.9645563828328076, "language_loss": 0.71805876, "learning_rate": 1.261520674801876e-06, "loss": 0.73996276, "num_input_tokens_seen": 113133200, "step": 5254, "time_per_iteration": 2.8210861682891846 }, { "auxiliary_loss_clip": 0.01163614, "auxiliary_loss_mlp": 0.01022909, "balance_loss_clip": 1.05033612, "balance_loss_mlp": 1.0155983, "epoch": 0.631876390308423, "flos": 31248424126080.0, "grad_norm": 2.113517337776475, "language_loss": 0.72552419, "learning_rate": 1.2607968058340488e-06, "loss": 0.74738944, "num_input_tokens_seen": 113152895, "step": 5255, "time_per_iteration": 3.826277256011963 }, { "auxiliary_loss_clip": 0.01162391, "auxiliary_loss_mlp": 0.01026581, "balance_loss_clip": 1.04761815, "balance_loss_mlp": 1.01962829, "epoch": 0.6319966331990621, "flos": 24681152810880.0, "grad_norm": 1.6316428311042004, "language_loss": 0.73397475, "learning_rate": 1.2600730490027583e-06, "loss": 0.75586444, "num_input_tokens_seen": 113173135, "step": 5256, "time_per_iteration": 2.7891767024993896 }, { "auxiliary_loss_clip": 0.01160439, "auxiliary_loss_mlp": 0.01024026, "balance_loss_clip": 1.04859459, "balance_loss_mlp": 1.01744604, "epoch": 0.6321168760897012, "flos": 17491764913920.0, "grad_norm": 1.66917238090277, "language_loss": 0.8033666, "learning_rate": 1.2593494044177984e-06, "loss": 0.82521123, "num_input_tokens_seen": 113191440, "step": 5257, "time_per_iteration": 2.8506457805633545 }, { "auxiliary_loss_clip": 0.01175067, "auxiliary_loss_mlp": 0.01025388, "balance_loss_clip": 1.04884171, "balance_loss_mlp": 1.01732981, "epoch": 0.6322371189803403, "flos": 18295373940480.0, "grad_norm": 3.0842078452415365, "language_loss": 0.80929369, "learning_rate": 1.2586258721889448e-06, "loss": 0.83129823, "num_input_tokens_seen": 113208790, "step": 5258, "time_per_iteration": 2.6608967781066895 }, { "auxiliary_loss_clip": 0.01149733, "auxiliary_loss_mlp": 0.01022669, "balance_loss_clip": 1.04905593, "balance_loss_mlp": 1.01505435, "epoch": 0.6323573618709794, "flos": 20157270399360.0, "grad_norm": 2.1443896120411137, "language_loss": 0.81551516, "learning_rate": 1.2579024524259573e-06, "loss": 0.83723921, "num_input_tokens_seen": 113225050, "step": 5259, "time_per_iteration": 2.9410152435302734 }, { "auxiliary_loss_clip": 0.01162916, "auxiliary_loss_mlp": 0.01025517, "balance_loss_clip": 1.04831803, "balance_loss_mlp": 1.01801848, "epoch": 0.6324776047616185, "flos": 20042391726720.0, "grad_norm": 29.807257727657497, "language_loss": 0.91427344, "learning_rate": 1.2571791452385768e-06, "loss": 0.93615776, "num_input_tokens_seen": 113242315, "step": 5260, "time_per_iteration": 2.740812301635742 }, { "auxiliary_loss_clip": 0.01164608, "auxiliary_loss_mlp": 0.01026988, "balance_loss_clip": 1.04857695, "balance_loss_mlp": 1.01991248, "epoch": 0.6325978476522576, "flos": 30848235724800.0, "grad_norm": 1.8007686929973217, "language_loss": 0.76980734, "learning_rate": 1.2564559507365301e-06, "loss": 0.79172331, "num_input_tokens_seen": 113264720, "step": 5261, "time_per_iteration": 2.7754383087158203 }, { "auxiliary_loss_clip": 0.01169362, "auxiliary_loss_mlp": 0.01032849, "balance_loss_clip": 1.05138302, "balance_loss_mlp": 1.02518439, "epoch": 0.6327180905428966, "flos": 24535104111360.0, "grad_norm": 2.132559741654699, "language_loss": 0.78883362, "learning_rate": 1.2557328690295244e-06, "loss": 0.81085575, "num_input_tokens_seen": 113282910, "step": 5262, "time_per_iteration": 2.7595653533935547 }, { "auxiliary_loss_clip": 0.01162222, "auxiliary_loss_mlp": 0.01022463, "balance_loss_clip": 1.04641378, "balance_loss_mlp": 1.01542389, "epoch": 0.6328383334335358, "flos": 21575274583680.0, "grad_norm": 1.7186292603765354, "language_loss": 0.76603097, "learning_rate": 1.255009900227251e-06, "loss": 0.7878778, "num_input_tokens_seen": 113301935, "step": 5263, "time_per_iteration": 2.786707639694214 }, { "auxiliary_loss_clip": 0.01166746, "auxiliary_loss_mlp": 0.01026142, "balance_loss_clip": 1.0477978, "balance_loss_mlp": 1.01909709, "epoch": 0.6329585763241748, "flos": 22929861306240.0, "grad_norm": 3.0870050315714543, "language_loss": 0.79086286, "learning_rate": 1.254287044439383e-06, "loss": 0.81279171, "num_input_tokens_seen": 113321540, "step": 5264, "time_per_iteration": 2.6781930923461914 }, { "auxiliary_loss_clip": 0.01070158, "auxiliary_loss_mlp": 0.01001351, "balance_loss_clip": 1.01393914, "balance_loss_mlp": 1.00037384, "epoch": 0.6330788192148139, "flos": 70936897847040.0, "grad_norm": 0.7902578188310845, "language_loss": 0.54444396, "learning_rate": 1.2535643017755776e-06, "loss": 0.56515908, "num_input_tokens_seen": 113383730, "step": 5265, "time_per_iteration": 3.3481104373931885 }, { "auxiliary_loss_clip": 0.01163495, "auxiliary_loss_mlp": 0.0102815, "balance_loss_clip": 1.04880524, "balance_loss_mlp": 1.02106869, "epoch": 0.6331990621054531, "flos": 21244501215360.0, "grad_norm": 2.6781496063776786, "language_loss": 0.71977705, "learning_rate": 1.2528416723454737e-06, "loss": 0.7416935, "num_input_tokens_seen": 113400400, "step": 5266, "time_per_iteration": 2.700608968734741 }, { "auxiliary_loss_clip": 0.01164763, "auxiliary_loss_mlp": 0.01022827, "balance_loss_clip": 1.04641283, "balance_loss_mlp": 1.01586246, "epoch": 0.6333193049960921, "flos": 34459412526720.0, "grad_norm": 1.470694785282714, "language_loss": 0.70889837, "learning_rate": 1.2521191562586945e-06, "loss": 0.73077428, "num_input_tokens_seen": 113424050, "step": 5267, "time_per_iteration": 2.785839319229126 }, { "auxiliary_loss_clip": 0.01167409, "auxiliary_loss_mlp": 0.01051429, "balance_loss_clip": 1.04658914, "balance_loss_mlp": 1.01615191, "epoch": 0.6334395478867312, "flos": 18329883932160.0, "grad_norm": 1.8575351304905763, "language_loss": 0.76567912, "learning_rate": 1.2513967536248445e-06, "loss": 0.78786755, "num_input_tokens_seen": 113440370, "step": 5268, "time_per_iteration": 2.6686837673187256 }, { "auxiliary_loss_clip": 0.01165984, "auxiliary_loss_mlp": 0.01024005, "balance_loss_clip": 1.04893005, "balance_loss_mlp": 1.01634014, "epoch": 0.6335597907773702, "flos": 23623152687360.0, "grad_norm": 1.7721175675509537, "language_loss": 0.81005216, "learning_rate": 1.2506744645535117e-06, "loss": 0.8319521, "num_input_tokens_seen": 113460800, "step": 5269, "time_per_iteration": 2.714827299118042 }, { "auxiliary_loss_clip": 0.01157769, "auxiliary_loss_mlp": 0.01021178, "balance_loss_clip": 1.04533672, "balance_loss_mlp": 1.01405811, "epoch": 0.6336800336680094, "flos": 22710913954560.0, "grad_norm": 2.254935879317974, "language_loss": 0.59645486, "learning_rate": 1.249952289154267e-06, "loss": 0.61824435, "num_input_tokens_seen": 113480840, "step": 5270, "time_per_iteration": 2.805840492248535 }, { "auxiliary_loss_clip": 0.01150122, "auxiliary_loss_mlp": 0.01023779, "balance_loss_clip": 1.05201888, "balance_loss_mlp": 1.01693034, "epoch": 0.6338002765586485, "flos": 23622757637760.0, "grad_norm": 1.814544876453867, "language_loss": 0.76533198, "learning_rate": 1.2492302275366635e-06, "loss": 0.78707099, "num_input_tokens_seen": 113500515, "step": 5271, "time_per_iteration": 2.8216092586517334 }, { "auxiliary_loss_clip": 0.01164427, "auxiliary_loss_mlp": 0.01027539, "balance_loss_clip": 1.04848838, "balance_loss_mlp": 1.01926589, "epoch": 0.6339205194492875, "flos": 26505450708480.0, "grad_norm": 2.33280506313409, "language_loss": 0.65339231, "learning_rate": 1.2485082798102377e-06, "loss": 0.67531198, "num_input_tokens_seen": 113520930, "step": 5272, "time_per_iteration": 2.7953038215637207 }, { "auxiliary_loss_clip": 0.01168567, "auxiliary_loss_mlp": 0.01027151, "balance_loss_clip": 1.05088568, "balance_loss_mlp": 1.01992393, "epoch": 0.6340407623399267, "flos": 18544306170240.0, "grad_norm": 2.1649335197340425, "language_loss": 0.68732131, "learning_rate": 1.2477864460845084e-06, "loss": 0.70927846, "num_input_tokens_seen": 113537330, "step": 5273, "time_per_iteration": 4.715818166732788 }, { "auxiliary_loss_clip": 0.01167119, "auxiliary_loss_mlp": 0.01029863, "balance_loss_clip": 1.05195856, "balance_loss_mlp": 1.02208424, "epoch": 0.6341610052305657, "flos": 17712579772800.0, "grad_norm": 2.9803556766044865, "language_loss": 0.73184931, "learning_rate": 1.2470647264689776e-06, "loss": 0.75381911, "num_input_tokens_seen": 113555810, "step": 5274, "time_per_iteration": 3.686774730682373 }, { "auxiliary_loss_clip": 0.01159515, "auxiliary_loss_mlp": 0.01025171, "balance_loss_clip": 1.04566324, "balance_loss_mlp": 1.01780391, "epoch": 0.6342812481212048, "flos": 23587026583680.0, "grad_norm": 2.0438043125584127, "language_loss": 0.71197426, "learning_rate": 1.2463431210731282e-06, "loss": 0.73382115, "num_input_tokens_seen": 113575395, "step": 5275, "time_per_iteration": 2.8358097076416016 }, { "auxiliary_loss_clip": 0.01169427, "auxiliary_loss_mlp": 0.01026061, "balance_loss_clip": 1.04977846, "balance_loss_mlp": 1.01868832, "epoch": 0.634401491011844, "flos": 17821927751040.0, "grad_norm": 2.4724750826922612, "language_loss": 0.76247895, "learning_rate": 1.2456216300064289e-06, "loss": 0.78443378, "num_input_tokens_seen": 113592945, "step": 5276, "time_per_iteration": 2.6977169513702393 }, { "auxiliary_loss_clip": 0.01162085, "auxiliary_loss_mlp": 0.01029398, "balance_loss_clip": 1.04958344, "balance_loss_mlp": 1.02217412, "epoch": 0.634521733902483, "flos": 21358158825600.0, "grad_norm": 1.792038090980028, "language_loss": 0.78496385, "learning_rate": 1.244900253378328e-06, "loss": 0.80687869, "num_input_tokens_seen": 113613000, "step": 5277, "time_per_iteration": 2.7209274768829346 }, { "auxiliary_loss_clip": 0.01150269, "auxiliary_loss_mlp": 0.01021702, "balance_loss_clip": 1.04796982, "balance_loss_mlp": 1.014606, "epoch": 0.6346419767931221, "flos": 16545052103040.0, "grad_norm": 2.034765166156116, "language_loss": 0.69583857, "learning_rate": 1.2441789912982583e-06, "loss": 0.71755838, "num_input_tokens_seen": 113630085, "step": 5278, "time_per_iteration": 2.8268051147460938 }, { "auxiliary_loss_clip": 0.01169625, "auxiliary_loss_mlp": 0.01024928, "balance_loss_clip": 1.04928994, "balance_loss_mlp": 1.01689327, "epoch": 0.6347622196837612, "flos": 24350989973760.0, "grad_norm": 7.811314318236414, "language_loss": 0.64741254, "learning_rate": 1.2434578438756346e-06, "loss": 0.66935802, "num_input_tokens_seen": 113650515, "step": 5279, "time_per_iteration": 2.830284595489502 }, { "auxiliary_loss_clip": 0.01166976, "auxiliary_loss_mlp": 0.01021454, "balance_loss_clip": 1.0450722, "balance_loss_mlp": 1.01471257, "epoch": 0.6348824625744003, "flos": 64523178195840.0, "grad_norm": 2.160690614219644, "language_loss": 0.77882385, "learning_rate": 1.242736811219855e-06, "loss": 0.80070812, "num_input_tokens_seen": 113676475, "step": 5280, "time_per_iteration": 3.115169048309326 }, { "auxiliary_loss_clip": 0.0116169, "auxiliary_loss_mlp": 0.01025802, "balance_loss_clip": 1.04714072, "balance_loss_mlp": 1.01854551, "epoch": 0.6350027054650393, "flos": 28622133313920.0, "grad_norm": 2.0468102810796998, "language_loss": 0.82039773, "learning_rate": 1.2420158934402988e-06, "loss": 0.84227264, "num_input_tokens_seen": 113697090, "step": 5281, "time_per_iteration": 3.7016031742095947 }, { "auxiliary_loss_clip": 0.01151123, "auxiliary_loss_mlp": 0.01030251, "balance_loss_clip": 1.04681838, "balance_loss_mlp": 1.02276433, "epoch": 0.6351229483556785, "flos": 23002544476800.0, "grad_norm": 1.8610548436780083, "language_loss": 0.84869665, "learning_rate": 1.2412950906463286e-06, "loss": 0.87051034, "num_input_tokens_seen": 113714395, "step": 5282, "time_per_iteration": 2.8256783485412598 }, { "auxiliary_loss_clip": 0.01154448, "auxiliary_loss_mlp": 0.01027014, "balance_loss_clip": 1.04571366, "balance_loss_mlp": 1.01986992, "epoch": 0.6352431912463176, "flos": 21939300967680.0, "grad_norm": 1.8841635178606513, "language_loss": 0.90237021, "learning_rate": 1.2405744029472902e-06, "loss": 0.92418492, "num_input_tokens_seen": 113733880, "step": 5283, "time_per_iteration": 2.7607412338256836 }, { "auxiliary_loss_clip": 0.01162771, "auxiliary_loss_mlp": 0.01029943, "balance_loss_clip": 1.04937458, "balance_loss_mlp": 1.02284729, "epoch": 0.6353634341369566, "flos": 13735257684480.0, "grad_norm": 3.3296326485100014, "language_loss": 0.76120782, "learning_rate": 1.2398538304525108e-06, "loss": 0.78313506, "num_input_tokens_seen": 113752505, "step": 5284, "time_per_iteration": 2.6759889125823975 }, { "auxiliary_loss_clip": 0.01160667, "auxiliary_loss_mlp": 0.01030059, "balance_loss_clip": 1.05090165, "balance_loss_mlp": 1.02201819, "epoch": 0.6354836770275958, "flos": 19316170552320.0, "grad_norm": 2.7300766925893303, "language_loss": 0.76015425, "learning_rate": 1.2391333732713016e-06, "loss": 0.78206146, "num_input_tokens_seen": 113770310, "step": 5285, "time_per_iteration": 2.7186646461486816 }, { "auxiliary_loss_clip": 0.01161659, "auxiliary_loss_mlp": 0.01030897, "balance_loss_clip": 1.048123, "balance_loss_mlp": 1.02342272, "epoch": 0.6356039199182348, "flos": 21613375935360.0, "grad_norm": 2.2648324332485057, "language_loss": 0.78207886, "learning_rate": 1.2384130315129543e-06, "loss": 0.80400443, "num_input_tokens_seen": 113788635, "step": 5286, "time_per_iteration": 2.7643795013427734 }, { "auxiliary_loss_clip": 0.01155512, "auxiliary_loss_mlp": 0.01025651, "balance_loss_clip": 1.0488975, "balance_loss_mlp": 1.01832569, "epoch": 0.6357241628088739, "flos": 18111978074880.0, "grad_norm": 2.266534636034541, "language_loss": 0.73556721, "learning_rate": 1.2376928052867447e-06, "loss": 0.75737882, "num_input_tokens_seen": 113807755, "step": 5287, "time_per_iteration": 2.9052000045776367 }, { "auxiliary_loss_clip": 0.01165914, "auxiliary_loss_mlp": 0.01024436, "balance_loss_clip": 1.0497241, "balance_loss_mlp": 1.0170604, "epoch": 0.6358444056995131, "flos": 24935256599040.0, "grad_norm": 1.944017501982079, "language_loss": 0.77646744, "learning_rate": 1.2369726947019299e-06, "loss": 0.7983709, "num_input_tokens_seen": 113828230, "step": 5288, "time_per_iteration": 2.9575390815734863 }, { "auxiliary_loss_clip": 0.01166387, "auxiliary_loss_mlp": 0.01023213, "balance_loss_clip": 1.04688179, "balance_loss_mlp": 1.01561928, "epoch": 0.6359646485901521, "flos": 23293348986240.0, "grad_norm": 2.1495062661521067, "language_loss": 0.6726265, "learning_rate": 1.2362526998677511e-06, "loss": 0.69452244, "num_input_tokens_seen": 113844595, "step": 5289, "time_per_iteration": 2.6870131492614746 }, { "auxiliary_loss_clip": 0.01166289, "auxiliary_loss_mlp": 0.01023342, "balance_loss_clip": 1.04719412, "balance_loss_mlp": 1.01663089, "epoch": 0.6360848914807912, "flos": 20887442069760.0, "grad_norm": 2.787762972948941, "language_loss": 0.84385383, "learning_rate": 1.2355328208934301e-06, "loss": 0.86575019, "num_input_tokens_seen": 113863470, "step": 5290, "time_per_iteration": 2.7821645736694336 }, { "auxiliary_loss_clip": 0.01166211, "auxiliary_loss_mlp": 0.01053026, "balance_loss_clip": 1.04587984, "balance_loss_mlp": 1.01724994, "epoch": 0.6362051343714303, "flos": 18479775386880.0, "grad_norm": 1.8881472159780088, "language_loss": 0.72596562, "learning_rate": 1.2348130578881728e-06, "loss": 0.74815798, "num_input_tokens_seen": 113881690, "step": 5291, "time_per_iteration": 2.7398412227630615 }, { "auxiliary_loss_clip": 0.01171832, "auxiliary_loss_mlp": 0.01026993, "balance_loss_clip": 1.0477165, "balance_loss_mlp": 1.0190599, "epoch": 0.6363253772620694, "flos": 24389594115840.0, "grad_norm": 2.1658420953628976, "language_loss": 0.76105356, "learning_rate": 1.2340934109611664e-06, "loss": 0.78304183, "num_input_tokens_seen": 113902450, "step": 5292, "time_per_iteration": 2.82188081741333 }, { "auxiliary_loss_clip": 0.01169906, "auxiliary_loss_mlp": 0.01023585, "balance_loss_clip": 1.04936719, "balance_loss_mlp": 1.01586592, "epoch": 0.6364456201527084, "flos": 25958243940480.0, "grad_norm": 2.519772871942126, "language_loss": 0.68669158, "learning_rate": 1.2333738802215798e-06, "loss": 0.70862651, "num_input_tokens_seen": 113922670, "step": 5293, "time_per_iteration": 2.7207982540130615 }, { "auxiliary_loss_clip": 0.01157352, "auxiliary_loss_mlp": 0.01024113, "balance_loss_clip": 1.04914832, "balance_loss_mlp": 1.01633751, "epoch": 0.6365658630433476, "flos": 20740711011840.0, "grad_norm": 2.160887280525902, "language_loss": 0.80871111, "learning_rate": 1.2326544657785668e-06, "loss": 0.83052576, "num_input_tokens_seen": 113942360, "step": 5294, "time_per_iteration": 2.8243374824523926 }, { "auxiliary_loss_clip": 0.01157151, "auxiliary_loss_mlp": 0.0102768, "balance_loss_clip": 1.05004787, "balance_loss_mlp": 1.01985955, "epoch": 0.6366861059339867, "flos": 21434146047360.0, "grad_norm": 2.441341318976383, "language_loss": 0.74558938, "learning_rate": 1.2319351677412608e-06, "loss": 0.7674377, "num_input_tokens_seen": 113959405, "step": 5295, "time_per_iteration": 2.7883520126342773 }, { "auxiliary_loss_clip": 0.01167793, "auxiliary_loss_mlp": 0.01025299, "balance_loss_clip": 1.0485177, "balance_loss_mlp": 1.01800978, "epoch": 0.6368063488246257, "flos": 22267093507200.0, "grad_norm": 1.7559861831916135, "language_loss": 0.7446695, "learning_rate": 1.2312159862187796e-06, "loss": 0.76660037, "num_input_tokens_seen": 113977815, "step": 5296, "time_per_iteration": 2.776526927947998 }, { "auxiliary_loss_clip": 0.01176203, "auxiliary_loss_mlp": 0.0102416, "balance_loss_clip": 1.05157185, "balance_loss_mlp": 1.01622701, "epoch": 0.6369265917152649, "flos": 22420719976320.0, "grad_norm": 2.0642201161000258, "language_loss": 0.7638551, "learning_rate": 1.2304969213202217e-06, "loss": 0.78585881, "num_input_tokens_seen": 113999075, "step": 5297, "time_per_iteration": 2.6747868061065674 }, { "auxiliary_loss_clip": 0.01160012, "auxiliary_loss_mlp": 0.01028582, "balance_loss_clip": 1.04677546, "balance_loss_mlp": 1.02176881, "epoch": 0.6370468346059039, "flos": 24718176754560.0, "grad_norm": 10.817819622411943, "language_loss": 0.79514766, "learning_rate": 1.2297779731546692e-06, "loss": 0.81703365, "num_input_tokens_seen": 114018170, "step": 5298, "time_per_iteration": 3.7278189659118652 }, { "auxiliary_loss_clip": 0.01162567, "auxiliary_loss_mlp": 0.01023457, "balance_loss_clip": 1.04981697, "balance_loss_mlp": 1.01630783, "epoch": 0.637167077496543, "flos": 25296589463040.0, "grad_norm": 2.3953088671408596, "language_loss": 0.77988386, "learning_rate": 1.2290591418311853e-06, "loss": 0.8017441, "num_input_tokens_seen": 114035565, "step": 5299, "time_per_iteration": 3.688972234725952 }, { "auxiliary_loss_clip": 0.01164954, "auxiliary_loss_mlp": 0.01031936, "balance_loss_clip": 1.04763401, "balance_loss_mlp": 1.02479863, "epoch": 0.637287320387182, "flos": 27671110871040.0, "grad_norm": 1.5865829198084782, "language_loss": 0.72119272, "learning_rate": 1.2283404274588172e-06, "loss": 0.74316168, "num_input_tokens_seen": 114054510, "step": 5300, "time_per_iteration": 3.740778923034668 }, { "auxiliary_loss_clip": 0.01062279, "auxiliary_loss_mlp": 0.00999494, "balance_loss_clip": 1.01887894, "balance_loss_mlp": 0.9982363, "epoch": 0.6374075632778212, "flos": 63173406873600.0, "grad_norm": 0.7491500261915183, "language_loss": 0.52741939, "learning_rate": 1.227621830146592e-06, "loss": 0.54803711, "num_input_tokens_seen": 114109875, "step": 5301, "time_per_iteration": 3.2452354431152344 }, { "auxiliary_loss_clip": 0.01163912, "auxiliary_loss_mlp": 0.01031066, "balance_loss_clip": 1.0489893, "balance_loss_mlp": 1.02368689, "epoch": 0.6375278061684603, "flos": 25558127366400.0, "grad_norm": 4.178838181389903, "language_loss": 0.79368222, "learning_rate": 1.2269033500035217e-06, "loss": 0.81563199, "num_input_tokens_seen": 114130010, "step": 5302, "time_per_iteration": 2.7209014892578125 }, { "auxiliary_loss_clip": 0.01162786, "auxiliary_loss_mlp": 0.01027066, "balance_loss_clip": 1.04808545, "balance_loss_mlp": 1.01959193, "epoch": 0.6376480490590993, "flos": 25666362023040.0, "grad_norm": 1.8707943350423677, "language_loss": 0.73773044, "learning_rate": 1.2261849871385988e-06, "loss": 0.75962895, "num_input_tokens_seen": 114151115, "step": 5303, "time_per_iteration": 2.709402322769165 }, { "auxiliary_loss_clip": 0.01172071, "auxiliary_loss_mlp": 0.01026234, "balance_loss_clip": 1.04759276, "balance_loss_mlp": 1.01847315, "epoch": 0.6377682919497385, "flos": 31537684350720.0, "grad_norm": 2.3070451443668123, "language_loss": 0.62531722, "learning_rate": 1.2254667416607972e-06, "loss": 0.64730024, "num_input_tokens_seen": 114172715, "step": 5304, "time_per_iteration": 2.7424800395965576 }, { "auxiliary_loss_clip": 0.01167479, "auxiliary_loss_mlp": 0.01024187, "balance_loss_clip": 1.04940045, "balance_loss_mlp": 1.01704943, "epoch": 0.6378885348403776, "flos": 23039209284480.0, "grad_norm": 1.805220755033356, "language_loss": 0.82782543, "learning_rate": 1.2247486136790756e-06, "loss": 0.84974205, "num_input_tokens_seen": 114192195, "step": 5305, "time_per_iteration": 2.66825270652771 }, { "auxiliary_loss_clip": 0.01170788, "auxiliary_loss_mlp": 0.01028066, "balance_loss_clip": 1.04962063, "balance_loss_mlp": 1.02107465, "epoch": 0.6380087777310166, "flos": 18697070712960.0, "grad_norm": 2.2032854888717717, "language_loss": 0.81078124, "learning_rate": 1.2240306033023726e-06, "loss": 0.83276975, "num_input_tokens_seen": 114210020, "step": 5306, "time_per_iteration": 2.882136821746826 }, { "auxiliary_loss_clip": 0.01164688, "auxiliary_loss_mlp": 0.01021095, "balance_loss_clip": 1.04645884, "balance_loss_mlp": 1.01362681, "epoch": 0.6381290206216558, "flos": 23331558078720.0, "grad_norm": 1.7669173530734463, "language_loss": 0.72098196, "learning_rate": 1.223312710639611e-06, "loss": 0.74283975, "num_input_tokens_seen": 114228740, "step": 5307, "time_per_iteration": 3.6434824466705322 }, { "auxiliary_loss_clip": 0.01167333, "auxiliary_loss_mlp": 0.01027592, "balance_loss_clip": 1.0512886, "balance_loss_mlp": 1.02046967, "epoch": 0.6382492635122948, "flos": 18880466578560.0, "grad_norm": 2.162134990321437, "language_loss": 0.86761439, "learning_rate": 1.2225949357996928e-06, "loss": 0.88956368, "num_input_tokens_seen": 114246865, "step": 5308, "time_per_iteration": 2.672658681869507 }, { "auxiliary_loss_clip": 0.01162861, "auxiliary_loss_mlp": 0.01024052, "balance_loss_clip": 1.04822505, "balance_loss_mlp": 1.01679254, "epoch": 0.6383695064029339, "flos": 27819134818560.0, "grad_norm": 2.0490870292396486, "language_loss": 0.80333161, "learning_rate": 1.221877278891505e-06, "loss": 0.82520068, "num_input_tokens_seen": 114266120, "step": 5309, "time_per_iteration": 2.783809185028076 }, { "auxiliary_loss_clip": 0.0117333, "auxiliary_loss_mlp": 0.01027124, "balance_loss_clip": 1.04885781, "balance_loss_mlp": 1.01994205, "epoch": 0.638489749293573, "flos": 26395635853440.0, "grad_norm": 2.0161736163851622, "language_loss": 0.7181614, "learning_rate": 1.221159740023915e-06, "loss": 0.74016595, "num_input_tokens_seen": 114285950, "step": 5310, "time_per_iteration": 2.7194252014160156 }, { "auxiliary_loss_clip": 0.01169574, "auxiliary_loss_mlp": 0.01061486, "balance_loss_clip": 1.05060625, "balance_loss_mlp": 1.02444899, "epoch": 0.6386099921842121, "flos": 23988328306560.0, "grad_norm": 6.796015210904469, "language_loss": 0.72379184, "learning_rate": 1.2204423193057735e-06, "loss": 0.74610239, "num_input_tokens_seen": 114304780, "step": 5311, "time_per_iteration": 2.8234188556671143 }, { "auxiliary_loss_clip": 0.01068973, "auxiliary_loss_mlp": 0.01002267, "balance_loss_clip": 1.01381779, "balance_loss_mlp": 1.00133097, "epoch": 0.6387302350748512, "flos": 71731169337600.0, "grad_norm": 0.854243546429228, "language_loss": 0.63358599, "learning_rate": 1.2197250168459122e-06, "loss": 0.65429837, "num_input_tokens_seen": 114361180, "step": 5312, "time_per_iteration": 3.3521368503570557 }, { "auxiliary_loss_clip": 0.01170971, "auxiliary_loss_mlp": 0.0102675, "balance_loss_clip": 1.0500797, "balance_loss_mlp": 1.0199635, "epoch": 0.6388504779654903, "flos": 14535778141440.0, "grad_norm": 2.0989499522938124, "language_loss": 0.74387729, "learning_rate": 1.2190078327531454e-06, "loss": 0.76585454, "num_input_tokens_seen": 114377425, "step": 5313, "time_per_iteration": 2.615304946899414 }, { "auxiliary_loss_clip": 0.01168825, "auxiliary_loss_mlp": 0.01025792, "balance_loss_clip": 1.04756999, "balance_loss_mlp": 1.01857054, "epoch": 0.6389707208561294, "flos": 22346133384960.0, "grad_norm": 1.4525828376694188, "language_loss": 0.72984123, "learning_rate": 1.2182907671362697e-06, "loss": 0.75178742, "num_input_tokens_seen": 114398120, "step": 5314, "time_per_iteration": 2.7362570762634277 }, { "auxiliary_loss_clip": 0.01170923, "auxiliary_loss_mlp": 0.01026851, "balance_loss_clip": 1.05112398, "balance_loss_mlp": 1.01974368, "epoch": 0.6390909637467684, "flos": 19426883247360.0, "grad_norm": 2.3711880108681327, "language_loss": 0.78624344, "learning_rate": 1.2175738201040626e-06, "loss": 0.80822128, "num_input_tokens_seen": 114415160, "step": 5315, "time_per_iteration": 2.6773500442504883 }, { "auxiliary_loss_clip": 0.0116842, "auxiliary_loss_mlp": 0.01024836, "balance_loss_clip": 1.04789639, "balance_loss_mlp": 1.0173738, "epoch": 0.6392112066374076, "flos": 24090852700800.0, "grad_norm": 1.6789036660644945, "language_loss": 0.78558052, "learning_rate": 1.2168569917652855e-06, "loss": 0.80751306, "num_input_tokens_seen": 114435015, "step": 5316, "time_per_iteration": 2.702824592590332 }, { "auxiliary_loss_clip": 0.01167864, "auxiliary_loss_mlp": 0.01027277, "balance_loss_clip": 1.04855108, "balance_loss_mlp": 1.02004981, "epoch": 0.6393314495280467, "flos": 26795141896320.0, "grad_norm": 1.497632925343997, "language_loss": 0.63774824, "learning_rate": 1.2161402822286797e-06, "loss": 0.65969968, "num_input_tokens_seen": 114455700, "step": 5317, "time_per_iteration": 2.689398765563965 }, { "auxiliary_loss_clip": 0.01161731, "auxiliary_loss_mlp": 0.01027854, "balance_loss_clip": 1.04851115, "balance_loss_mlp": 1.02018332, "epoch": 0.6394516924186857, "flos": 20260692633600.0, "grad_norm": 1.995079727963399, "language_loss": 0.7914083, "learning_rate": 1.2154236916029703e-06, "loss": 0.81330419, "num_input_tokens_seen": 114473675, "step": 5318, "time_per_iteration": 2.7205591201782227 }, { "auxiliary_loss_clip": 0.01160062, "auxiliary_loss_mlp": 0.01031281, "balance_loss_clip": 1.04714715, "balance_loss_mlp": 1.02371705, "epoch": 0.6395719353093249, "flos": 18368847210240.0, "grad_norm": 2.320616966117483, "language_loss": 0.73832393, "learning_rate": 1.2147072199968627e-06, "loss": 0.76023734, "num_input_tokens_seen": 114492310, "step": 5319, "time_per_iteration": 2.6950435638427734 }, { "auxiliary_loss_clip": 0.01165456, "auxiliary_loss_mlp": 0.01023733, "balance_loss_clip": 1.04716945, "balance_loss_mlp": 1.01645815, "epoch": 0.6396921781999639, "flos": 17566315591680.0, "grad_norm": 1.7954154718176292, "language_loss": 0.71872258, "learning_rate": 1.2139908675190454e-06, "loss": 0.74061453, "num_input_tokens_seen": 114511520, "step": 5320, "time_per_iteration": 2.7318530082702637 }, { "auxiliary_loss_clip": 0.01151595, "auxiliary_loss_mlp": 0.01023401, "balance_loss_clip": 1.04804564, "balance_loss_mlp": 1.01633453, "epoch": 0.639812421090603, "flos": 21251252972160.0, "grad_norm": 4.234889808929862, "language_loss": 0.75439411, "learning_rate": 1.2132746342781883e-06, "loss": 0.77614415, "num_input_tokens_seen": 114532680, "step": 5321, "time_per_iteration": 2.7593014240264893 }, { "auxiliary_loss_clip": 0.01173261, "auxiliary_loss_mlp": 0.01025549, "balance_loss_clip": 1.04869902, "balance_loss_mlp": 1.01790738, "epoch": 0.6399326639812422, "flos": 11180967684480.0, "grad_norm": 2.565252382303653, "language_loss": 0.79963076, "learning_rate": 1.2125585203829442e-06, "loss": 0.82161885, "num_input_tokens_seen": 114548320, "step": 5322, "time_per_iteration": 2.662017583847046 }, { "auxiliary_loss_clip": 0.0115245, "auxiliary_loss_mlp": 0.01029292, "balance_loss_clip": 1.04875875, "balance_loss_mlp": 1.02162993, "epoch": 0.6400529068718812, "flos": 23911048195200.0, "grad_norm": 1.7144868197347463, "language_loss": 0.74212027, "learning_rate": 1.211842525941946e-06, "loss": 0.76393765, "num_input_tokens_seen": 114568115, "step": 5323, "time_per_iteration": 2.733027458190918 }, { "auxiliary_loss_clip": 0.01152287, "auxiliary_loss_mlp": 0.01027315, "balance_loss_clip": 1.04596496, "balance_loss_mlp": 1.02031136, "epoch": 0.6401731497625203, "flos": 44018724890880.0, "grad_norm": 2.3011862856333445, "language_loss": 0.78989053, "learning_rate": 1.2111266510638105e-06, "loss": 0.81168652, "num_input_tokens_seen": 114591040, "step": 5324, "time_per_iteration": 3.9709677696228027 }, { "auxiliary_loss_clip": 0.01155322, "auxiliary_loss_mlp": 0.01024195, "balance_loss_clip": 1.05158377, "balance_loss_mlp": 1.01669347, "epoch": 0.6402933926531594, "flos": 20662209838080.0, "grad_norm": 1.9410936139737138, "language_loss": 0.80154645, "learning_rate": 1.2104108958571346e-06, "loss": 0.82334161, "num_input_tokens_seen": 114609310, "step": 5325, "time_per_iteration": 4.756753921508789 }, { "auxiliary_loss_clip": 0.01164722, "auxiliary_loss_mlp": 0.01025325, "balance_loss_clip": 1.04760218, "balance_loss_mlp": 1.01780236, "epoch": 0.6404136355437985, "flos": 24863327614080.0, "grad_norm": 1.4508387726765601, "language_loss": 0.75847393, "learning_rate": 1.2096952604304975e-06, "loss": 0.78037435, "num_input_tokens_seen": 114629740, "step": 5326, "time_per_iteration": 2.805820941925049 }, { "auxiliary_loss_clip": 0.01169003, "auxiliary_loss_mlp": 0.01021667, "balance_loss_clip": 1.04773092, "balance_loss_mlp": 1.0139724, "epoch": 0.6405338784344375, "flos": 40479548901120.0, "grad_norm": 2.19196308281604, "language_loss": 0.70157313, "learning_rate": 1.2089797448924616e-06, "loss": 0.72347981, "num_input_tokens_seen": 114653615, "step": 5327, "time_per_iteration": 2.816537380218506 }, { "auxiliary_loss_clip": 0.01167811, "auxiliary_loss_mlp": 0.01028303, "balance_loss_clip": 1.05055237, "balance_loss_mlp": 1.02038169, "epoch": 0.6406541213250767, "flos": 20886041439360.0, "grad_norm": 4.0047893234624565, "language_loss": 0.66316223, "learning_rate": 1.2082643493515692e-06, "loss": 0.68512338, "num_input_tokens_seen": 114671935, "step": 5328, "time_per_iteration": 2.7762136459350586 }, { "auxiliary_loss_clip": 0.01167843, "auxiliary_loss_mlp": 0.01028011, "balance_loss_clip": 1.04884553, "balance_loss_mlp": 1.02063239, "epoch": 0.6407743642157158, "flos": 23295970679040.0, "grad_norm": 1.7424633266758103, "language_loss": 0.81966138, "learning_rate": 1.207549073916346e-06, "loss": 0.84161997, "num_input_tokens_seen": 114692870, "step": 5329, "time_per_iteration": 2.6724531650543213 }, { "auxiliary_loss_clip": 0.01156951, "auxiliary_loss_mlp": 0.01021272, "balance_loss_clip": 1.04906464, "balance_loss_mlp": 1.01465881, "epoch": 0.6408946071063548, "flos": 15012636122880.0, "grad_norm": 2.2442374654891544, "language_loss": 0.77944291, "learning_rate": 1.2068339186952976e-06, "loss": 0.80122513, "num_input_tokens_seen": 114710410, "step": 5330, "time_per_iteration": 2.714629650115967 }, { "auxiliary_loss_clip": 0.01171475, "auxiliary_loss_mlp": 0.01026522, "balance_loss_clip": 1.05018532, "balance_loss_mlp": 1.01917267, "epoch": 0.6410148499969939, "flos": 22528595496960.0, "grad_norm": 1.770911540226196, "language_loss": 0.73065245, "learning_rate": 1.2061188837969136e-06, "loss": 0.75263238, "num_input_tokens_seen": 114730020, "step": 5331, "time_per_iteration": 2.7265212535858154 }, { "auxiliary_loss_clip": 0.01160652, "auxiliary_loss_mlp": 0.0102917, "balance_loss_clip": 1.04912114, "balance_loss_mlp": 1.02146649, "epoch": 0.641135092887633, "flos": 12422004537600.0, "grad_norm": 2.219727142193593, "language_loss": 0.83648431, "learning_rate": 1.2054039693296631e-06, "loss": 0.85838258, "num_input_tokens_seen": 114748015, "step": 5332, "time_per_iteration": 2.6557888984680176 }, { "auxiliary_loss_clip": 0.01158038, "auxiliary_loss_mlp": 0.01027325, "balance_loss_clip": 1.0483098, "balance_loss_mlp": 1.02026749, "epoch": 0.6412553357782721, "flos": 22127329687680.0, "grad_norm": 2.1643551325651322, "language_loss": 0.81342024, "learning_rate": 1.2046891754019992e-06, "loss": 0.83527386, "num_input_tokens_seen": 114768625, "step": 5333, "time_per_iteration": 3.6610219478607178 }, { "auxiliary_loss_clip": 0.01170251, "auxiliary_loss_mlp": 0.01025564, "balance_loss_clip": 1.04832828, "balance_loss_mlp": 1.01841712, "epoch": 0.6413755786689112, "flos": 15888605097600.0, "grad_norm": 1.8056356116774297, "language_loss": 0.82646829, "learning_rate": 1.2039745021223548e-06, "loss": 0.8484264, "num_input_tokens_seen": 114786045, "step": 5334, "time_per_iteration": 2.609273910522461 }, { "auxiliary_loss_clip": 0.01068142, "auxiliary_loss_mlp": 0.01002677, "balance_loss_clip": 1.01850045, "balance_loss_mlp": 1.00174093, "epoch": 0.6414958215595503, "flos": 68039159955840.0, "grad_norm": 0.7931833438715323, "language_loss": 0.57062799, "learning_rate": 1.2032599495991456e-06, "loss": 0.59133619, "num_input_tokens_seen": 114850785, "step": 5335, "time_per_iteration": 3.3656165599823 }, { "auxiliary_loss_clip": 0.01170135, "auxiliary_loss_mlp": 0.01029892, "balance_loss_clip": 1.04994595, "balance_loss_mlp": 1.02233994, "epoch": 0.6416160644501894, "flos": 44091300320640.0, "grad_norm": 1.8653611175062155, "language_loss": 0.69479847, "learning_rate": 1.2025455179407685e-06, "loss": 0.71679878, "num_input_tokens_seen": 114871945, "step": 5336, "time_per_iteration": 2.859004497528076 }, { "auxiliary_loss_clip": 0.01165757, "auxiliary_loss_mlp": 0.01056618, "balance_loss_clip": 1.04801917, "balance_loss_mlp": 1.02028418, "epoch": 0.6417363073408284, "flos": 20959837931520.0, "grad_norm": 3.40649532819359, "language_loss": 0.73525125, "learning_rate": 1.2018312072556022e-06, "loss": 0.75747502, "num_input_tokens_seen": 114890445, "step": 5337, "time_per_iteration": 2.6616361141204834 }, { "auxiliary_loss_clip": 0.01169253, "auxiliary_loss_mlp": 0.01053481, "balance_loss_clip": 1.04707909, "balance_loss_mlp": 1.01767671, "epoch": 0.6418565502314676, "flos": 22455122227200.0, "grad_norm": 1.7703292876648196, "language_loss": 0.74402469, "learning_rate": 1.2011170176520077e-06, "loss": 0.76625204, "num_input_tokens_seen": 114911360, "step": 5338, "time_per_iteration": 2.687811851501465 }, { "auxiliary_loss_clip": 0.01143636, "auxiliary_loss_mlp": 0.01025894, "balance_loss_clip": 1.0488174, "balance_loss_mlp": 1.01900363, "epoch": 0.6419767931221066, "flos": 25045502417280.0, "grad_norm": 1.5531933998374223, "language_loss": 0.81084454, "learning_rate": 1.2004029492383256e-06, "loss": 0.8325398, "num_input_tokens_seen": 114932700, "step": 5339, "time_per_iteration": 2.7652270793914795 }, { "auxiliary_loss_clip": 0.01168795, "auxiliary_loss_mlp": 0.01024911, "balance_loss_clip": 1.05110312, "balance_loss_mlp": 1.0171268, "epoch": 0.6420970360127457, "flos": 19463691709440.0, "grad_norm": 1.8224556410247859, "language_loss": 0.73076832, "learning_rate": 1.1996890021228814e-06, "loss": 0.75270545, "num_input_tokens_seen": 114949475, "step": 5340, "time_per_iteration": 2.69034743309021 }, { "auxiliary_loss_clip": 0.01163519, "auxiliary_loss_mlp": 0.01025295, "balance_loss_clip": 1.04887176, "balance_loss_mlp": 1.01723647, "epoch": 0.6422172789033849, "flos": 40406147458560.0, "grad_norm": 1.5791309901490735, "language_loss": 0.6980983, "learning_rate": 1.1989751764139785e-06, "loss": 0.71998644, "num_input_tokens_seen": 114973125, "step": 5341, "time_per_iteration": 2.866521120071411 }, { "auxiliary_loss_clip": 0.0115854, "auxiliary_loss_mlp": 0.01027671, "balance_loss_clip": 1.04771781, "balance_loss_mlp": 1.02049804, "epoch": 0.6423375217940239, "flos": 27672870637440.0, "grad_norm": 1.553012168997957, "language_loss": 0.82904434, "learning_rate": 1.1982614722199044e-06, "loss": 0.85090649, "num_input_tokens_seen": 114994300, "step": 5342, "time_per_iteration": 2.8016316890716553 }, { "auxiliary_loss_clip": 0.0116816, "auxiliary_loss_mlp": 0.01024993, "balance_loss_clip": 1.04655218, "balance_loss_mlp": 1.01798046, "epoch": 0.642457764684663, "flos": 18369242259840.0, "grad_norm": 2.647492676984808, "language_loss": 0.78463197, "learning_rate": 1.1975478896489276e-06, "loss": 0.8065635, "num_input_tokens_seen": 115012135, "step": 5343, "time_per_iteration": 2.6708714962005615 }, { "auxiliary_loss_clip": 0.01165658, "auxiliary_loss_mlp": 0.01025805, "balance_loss_clip": 1.04536843, "balance_loss_mlp": 1.01869106, "epoch": 0.6425780075753021, "flos": 19750509809280.0, "grad_norm": 1.8098832945001544, "language_loss": 0.76605606, "learning_rate": 1.1968344288092981e-06, "loss": 0.78797066, "num_input_tokens_seen": 115028715, "step": 5344, "time_per_iteration": 2.676361083984375 }, { "auxiliary_loss_clip": 0.01168107, "auxiliary_loss_mlp": 0.01055736, "balance_loss_clip": 1.04942846, "balance_loss_mlp": 1.01906991, "epoch": 0.6426982504659412, "flos": 20558536208640.0, "grad_norm": 1.7276408100851752, "language_loss": 0.64475727, "learning_rate": 1.1961210898092468e-06, "loss": 0.66699564, "num_input_tokens_seen": 115047665, "step": 5345, "time_per_iteration": 2.8579883575439453 }, { "auxiliary_loss_clip": 0.01168721, "auxiliary_loss_mlp": 0.01023585, "balance_loss_clip": 1.04765511, "balance_loss_mlp": 1.01609254, "epoch": 0.6428184933565803, "flos": 17851984456320.0, "grad_norm": 2.3502405993790574, "language_loss": 0.78970426, "learning_rate": 1.1954078727569874e-06, "loss": 0.81162739, "num_input_tokens_seen": 115064965, "step": 5346, "time_per_iteration": 2.710700035095215 }, { "auxiliary_loss_clip": 0.0116876, "auxiliary_loss_mlp": 0.01051277, "balance_loss_clip": 1.05026937, "balance_loss_mlp": 1.01710701, "epoch": 0.6429387362472194, "flos": 22456953820800.0, "grad_norm": 1.6681657340839207, "language_loss": 0.78055322, "learning_rate": 1.1946947777607141e-06, "loss": 0.80275357, "num_input_tokens_seen": 115086100, "step": 5347, "time_per_iteration": 2.7987730503082275 }, { "auxiliary_loss_clip": 0.01153548, "auxiliary_loss_mlp": 0.01029697, "balance_loss_clip": 1.04793644, "balance_loss_mlp": 1.02193046, "epoch": 0.6430589791378585, "flos": 24752579005440.0, "grad_norm": 2.0238302507165273, "language_loss": 0.803635, "learning_rate": 1.1939818049286024e-06, "loss": 0.82546747, "num_input_tokens_seen": 115104260, "step": 5348, "time_per_iteration": 2.783220052719116 }, { "auxiliary_loss_clip": 0.01150206, "auxiliary_loss_mlp": 0.01025684, "balance_loss_clip": 1.05107856, "balance_loss_mlp": 1.01814437, "epoch": 0.6431792220284975, "flos": 24901249397760.0, "grad_norm": 2.152487837511183, "language_loss": 0.75887394, "learning_rate": 1.1932689543688101e-06, "loss": 0.78063279, "num_input_tokens_seen": 115125365, "step": 5349, "time_per_iteration": 2.9242289066314697 }, { "auxiliary_loss_clip": 0.01163046, "auxiliary_loss_mlp": 0.01023221, "balance_loss_clip": 1.0492363, "balance_loss_mlp": 1.01556826, "epoch": 0.6432994649191367, "flos": 21032305620480.0, "grad_norm": 1.8205472748340963, "language_loss": 0.73194194, "learning_rate": 1.1925562261894756e-06, "loss": 0.75380468, "num_input_tokens_seen": 115144445, "step": 5350, "time_per_iteration": 3.6385159492492676 }, { "auxiliary_loss_clip": 0.01161753, "auxiliary_loss_mlp": 0.0102142, "balance_loss_clip": 1.04991746, "balance_loss_mlp": 1.01438665, "epoch": 0.6434197078097758, "flos": 30884433655680.0, "grad_norm": 2.2126391565789185, "language_loss": 0.775226, "learning_rate": 1.1918436204987207e-06, "loss": 0.79705769, "num_input_tokens_seen": 115166305, "step": 5351, "time_per_iteration": 4.715695381164551 }, { "auxiliary_loss_clip": 0.01162258, "auxiliary_loss_mlp": 0.01026381, "balance_loss_clip": 1.04739451, "balance_loss_mlp": 1.01938367, "epoch": 0.6435399507004148, "flos": 15012492468480.0, "grad_norm": 3.0255823595054703, "language_loss": 0.81631982, "learning_rate": 1.191131137404645e-06, "loss": 0.83820623, "num_input_tokens_seen": 115183045, "step": 5352, "time_per_iteration": 2.68365216255188 }, { "auxiliary_loss_clip": 0.01155363, "auxiliary_loss_mlp": 0.0102496, "balance_loss_clip": 1.05102837, "balance_loss_mlp": 1.01770592, "epoch": 0.643660193591054, "flos": 19901981462400.0, "grad_norm": 2.150751894748107, "language_loss": 0.77343893, "learning_rate": 1.190418777015333e-06, "loss": 0.79524213, "num_input_tokens_seen": 115201955, "step": 5353, "time_per_iteration": 2.750770092010498 }, { "auxiliary_loss_clip": 0.01162312, "auxiliary_loss_mlp": 0.01020056, "balance_loss_clip": 1.04663181, "balance_loss_mlp": 1.01303458, "epoch": 0.643780436481693, "flos": 24133622820480.0, "grad_norm": 1.4181363713147248, "language_loss": 0.73518157, "learning_rate": 1.1897065394388487e-06, "loss": 0.75700521, "num_input_tokens_seen": 115222395, "step": 5354, "time_per_iteration": 2.682901620864868 }, { "auxiliary_loss_clip": 0.0116663, "auxiliary_loss_mlp": 0.01023914, "balance_loss_clip": 1.05274844, "balance_loss_mlp": 1.01639199, "epoch": 0.6439006793723321, "flos": 23148808657920.0, "grad_norm": 1.9695001816652264, "language_loss": 0.7653712, "learning_rate": 1.1889944247832385e-06, "loss": 0.78727663, "num_input_tokens_seen": 115242635, "step": 5355, "time_per_iteration": 2.769510507583618 }, { "auxiliary_loss_clip": 0.0117108, "auxiliary_loss_mlp": 0.01026893, "balance_loss_clip": 1.04819643, "balance_loss_mlp": 1.01971376, "epoch": 0.6440209222629713, "flos": 23617909301760.0, "grad_norm": 3.6091669876396084, "language_loss": 0.70873171, "learning_rate": 1.1882824331565283e-06, "loss": 0.73071146, "num_input_tokens_seen": 115262095, "step": 5356, "time_per_iteration": 2.7925658226013184 }, { "auxiliary_loss_clip": 0.01159674, "auxiliary_loss_mlp": 0.01028968, "balance_loss_clip": 1.04843569, "balance_loss_mlp": 1.02167797, "epoch": 0.6441411651536103, "flos": 16544872535040.0, "grad_norm": 2.2177789074760854, "language_loss": 0.88880837, "learning_rate": 1.1875705646667287e-06, "loss": 0.91069484, "num_input_tokens_seen": 115279985, "step": 5357, "time_per_iteration": 2.729279041290283 }, { "auxiliary_loss_clip": 0.01163624, "auxiliary_loss_mlp": 0.0102725, "balance_loss_clip": 1.04588354, "balance_loss_mlp": 1.01937068, "epoch": 0.6442614080442494, "flos": 25410965345280.0, "grad_norm": 7.171049650576626, "language_loss": 0.75609505, "learning_rate": 1.1868588194218282e-06, "loss": 0.77800375, "num_input_tokens_seen": 115300365, "step": 5358, "time_per_iteration": 2.6392786502838135 }, { "auxiliary_loss_clip": 0.0116959, "auxiliary_loss_mlp": 0.01024112, "balance_loss_clip": 1.04757583, "balance_loss_mlp": 1.01667094, "epoch": 0.6443816509348885, "flos": 28294017552000.0, "grad_norm": 2.2473523930291686, "language_loss": 0.74038571, "learning_rate": 1.1861471975297979e-06, "loss": 0.76232272, "num_input_tokens_seen": 115322060, "step": 5359, "time_per_iteration": 3.657382011413574 }, { "auxiliary_loss_clip": 0.01161063, "auxiliary_loss_mlp": 0.0102723, "balance_loss_clip": 1.05052996, "balance_loss_mlp": 1.01954103, "epoch": 0.6445018938255276, "flos": 36690075964800.0, "grad_norm": 1.7069094297725176, "language_loss": 0.71050322, "learning_rate": 1.185435699098591e-06, "loss": 0.73238617, "num_input_tokens_seen": 115348255, "step": 5360, "time_per_iteration": 2.881573438644409 }, { "auxiliary_loss_clip": 0.01171207, "auxiliary_loss_mlp": 0.010312, "balance_loss_clip": 1.0504806, "balance_loss_mlp": 1.02338862, "epoch": 0.6446221367161666, "flos": 14501411804160.0, "grad_norm": 2.2798784924918065, "language_loss": 0.78571284, "learning_rate": 1.1847243242361403e-06, "loss": 0.80773687, "num_input_tokens_seen": 115366845, "step": 5361, "time_per_iteration": 2.7059264183044434 }, { "auxiliary_loss_clip": 0.01164428, "auxiliary_loss_mlp": 0.01024299, "balance_loss_clip": 1.04686224, "balance_loss_mlp": 1.01673472, "epoch": 0.6447423796068057, "flos": 24609367480320.0, "grad_norm": 1.5778335504275522, "language_loss": 0.78296113, "learning_rate": 1.1840130730503624e-06, "loss": 0.80484837, "num_input_tokens_seen": 115388125, "step": 5362, "time_per_iteration": 2.7877745628356934 }, { "auxiliary_loss_clip": 0.01173039, "auxiliary_loss_mlp": 0.01022433, "balance_loss_clip": 1.04893041, "balance_loss_mlp": 1.01492262, "epoch": 0.6448626224974449, "flos": 25047298097280.0, "grad_norm": 1.983631046743409, "language_loss": 0.75174898, "learning_rate": 1.1833019456491518e-06, "loss": 0.77370369, "num_input_tokens_seen": 115409655, "step": 5363, "time_per_iteration": 2.7143282890319824 }, { "auxiliary_loss_clip": 0.01167259, "auxiliary_loss_mlp": 0.01024359, "balance_loss_clip": 1.04791439, "balance_loss_mlp": 1.01713204, "epoch": 0.6449828653880839, "flos": 22530355263360.0, "grad_norm": 1.923889186078833, "language_loss": 0.79105818, "learning_rate": 1.1825909421403871e-06, "loss": 0.81297433, "num_input_tokens_seen": 115428750, "step": 5364, "time_per_iteration": 2.6885030269622803 }, { "auxiliary_loss_clip": 0.01166883, "auxiliary_loss_mlp": 0.01026042, "balance_loss_clip": 1.04741049, "balance_loss_mlp": 1.01858521, "epoch": 0.645103108278723, "flos": 25695736369920.0, "grad_norm": 2.0730788702430503, "language_loss": 0.76372588, "learning_rate": 1.181880062631926e-06, "loss": 0.78565514, "num_input_tokens_seen": 115448085, "step": 5365, "time_per_iteration": 2.710495948791504 }, { "auxiliary_loss_clip": 0.01164482, "auxiliary_loss_mlp": 0.01024943, "balance_loss_clip": 1.05276406, "balance_loss_mlp": 1.01680386, "epoch": 0.6452233511693621, "flos": 27450331925760.0, "grad_norm": 2.214522703830159, "language_loss": 0.84596223, "learning_rate": 1.1811693072316093e-06, "loss": 0.8678565, "num_input_tokens_seen": 115465765, "step": 5366, "time_per_iteration": 2.8020577430725098 }, { "auxiliary_loss_clip": 0.01170539, "auxiliary_loss_mlp": 0.01052723, "balance_loss_clip": 1.04694915, "balance_loss_mlp": 1.01768863, "epoch": 0.6453435940600012, "flos": 19208618254080.0, "grad_norm": 2.324671150431068, "language_loss": 0.84199315, "learning_rate": 1.1804586760472574e-06, "loss": 0.86422575, "num_input_tokens_seen": 115482230, "step": 5367, "time_per_iteration": 2.6851541996002197 }, { "auxiliary_loss_clip": 0.01159869, "auxiliary_loss_mlp": 0.01024149, "balance_loss_clip": 1.04743266, "balance_loss_mlp": 1.01682353, "epoch": 0.6454638369506402, "flos": 25737680476800.0, "grad_norm": 2.3504028981731726, "language_loss": 0.80221367, "learning_rate": 1.1797481691866736e-06, "loss": 0.82405388, "num_input_tokens_seen": 115499455, "step": 5368, "time_per_iteration": 2.76318097114563 }, { "auxiliary_loss_clip": 0.0115583, "auxiliary_loss_mlp": 0.01028799, "balance_loss_clip": 1.04788494, "balance_loss_mlp": 1.02148306, "epoch": 0.6455840798412794, "flos": 20989176364800.0, "grad_norm": 2.8203257377050397, "language_loss": 0.83091617, "learning_rate": 1.1790377867576393e-06, "loss": 0.85276246, "num_input_tokens_seen": 115517205, "step": 5369, "time_per_iteration": 2.69337797164917 }, { "auxiliary_loss_clip": 0.01168213, "auxiliary_loss_mlp": 0.01027535, "balance_loss_clip": 1.04863524, "balance_loss_mlp": 1.02021301, "epoch": 0.6457043227319185, "flos": 26067556005120.0, "grad_norm": 2.026562522761235, "language_loss": 0.76975417, "learning_rate": 1.1783275288679203e-06, "loss": 0.79171157, "num_input_tokens_seen": 115534370, "step": 5370, "time_per_iteration": 2.7410216331481934 }, { "auxiliary_loss_clip": 0.01072206, "auxiliary_loss_mlp": 0.01001744, "balance_loss_clip": 1.01485336, "balance_loss_mlp": 1.00073659, "epoch": 0.6458245656225575, "flos": 60370831088640.0, "grad_norm": 0.841559114481528, "language_loss": 0.57111621, "learning_rate": 1.177617395625262e-06, "loss": 0.59185576, "num_input_tokens_seen": 115592345, "step": 5371, "time_per_iteration": 3.193763256072998 }, { "auxiliary_loss_clip": 0.0116729, "auxiliary_loss_mlp": 0.01025842, "balance_loss_clip": 1.04776549, "balance_loss_mlp": 1.01830196, "epoch": 0.6459448085131967, "flos": 23076771932160.0, "grad_norm": 1.9549377031588406, "language_loss": 0.75683343, "learning_rate": 1.1769073871373908e-06, "loss": 0.77876472, "num_input_tokens_seen": 115612550, "step": 5372, "time_per_iteration": 2.6547250747680664 }, { "auxiliary_loss_clip": 0.01159091, "auxiliary_loss_mlp": 0.01025013, "balance_loss_clip": 1.04647446, "balance_loss_mlp": 1.0179584, "epoch": 0.6460650514038357, "flos": 22598190097920.0, "grad_norm": 1.6126636978615898, "language_loss": 0.83991134, "learning_rate": 1.176197503512015e-06, "loss": 0.86175233, "num_input_tokens_seen": 115632265, "step": 5373, "time_per_iteration": 2.741180181503296 }, { "auxiliary_loss_clip": 0.01163325, "auxiliary_loss_mlp": 0.01027839, "balance_loss_clip": 1.0503428, "balance_loss_mlp": 1.02055192, "epoch": 0.6461852942944748, "flos": 20266726118400.0, "grad_norm": 2.0396534771184722, "language_loss": 0.82220596, "learning_rate": 1.1754877448568223e-06, "loss": 0.84411764, "num_input_tokens_seen": 115651720, "step": 5374, "time_per_iteration": 2.6804022789001465 }, { "auxiliary_loss_clip": 0.01165608, "auxiliary_loss_mlp": 0.01020538, "balance_loss_clip": 1.04801416, "balance_loss_mlp": 1.01364207, "epoch": 0.646305537185114, "flos": 23367109564800.0, "grad_norm": 1.9891990438079763, "language_loss": 0.90005791, "learning_rate": 1.1747781112794837e-06, "loss": 0.92191935, "num_input_tokens_seen": 115668215, "step": 5375, "time_per_iteration": 2.8340160846710205 }, { "auxiliary_loss_clip": 0.01162573, "auxiliary_loss_mlp": 0.01024771, "balance_loss_clip": 1.05288601, "balance_loss_mlp": 1.0179435, "epoch": 0.646425780075753, "flos": 24277480790400.0, "grad_norm": 1.558812982452645, "language_loss": 0.83100361, "learning_rate": 1.1740686028876487e-06, "loss": 0.85287702, "num_input_tokens_seen": 115687080, "step": 5376, "time_per_iteration": 3.712703227996826 }, { "auxiliary_loss_clip": 0.01163009, "auxiliary_loss_mlp": 0.01022038, "balance_loss_clip": 1.04787207, "balance_loss_mlp": 1.01496017, "epoch": 0.6465460229663921, "flos": 20813968800000.0, "grad_norm": 3.9338129376614224, "language_loss": 0.75375199, "learning_rate": 1.1733592197889507e-06, "loss": 0.77560246, "num_input_tokens_seen": 115703990, "step": 5377, "time_per_iteration": 3.5718636512756348 }, { "auxiliary_loss_clip": 0.01162719, "auxiliary_loss_mlp": 0.01022303, "balance_loss_clip": 1.04880607, "balance_loss_mlp": 1.01550841, "epoch": 0.6466662658570312, "flos": 22853299466880.0, "grad_norm": 1.9630829416659676, "language_loss": 0.72737741, "learning_rate": 1.1726499620910014e-06, "loss": 0.7492277, "num_input_tokens_seen": 115724270, "step": 5378, "time_per_iteration": 3.8012290000915527 }, { "auxiliary_loss_clip": 0.01164931, "auxiliary_loss_mlp": 0.01024734, "balance_loss_clip": 1.04790068, "balance_loss_mlp": 1.01708984, "epoch": 0.6467865087476703, "flos": 15304553953920.0, "grad_norm": 3.1165507144552156, "language_loss": 0.77469254, "learning_rate": 1.1719408299013955e-06, "loss": 0.79658914, "num_input_tokens_seen": 115742995, "step": 5379, "time_per_iteration": 2.7311577796936035 }, { "auxiliary_loss_clip": 0.0117106, "auxiliary_loss_mlp": 0.01025784, "balance_loss_clip": 1.05125642, "balance_loss_mlp": 1.01910281, "epoch": 0.6469067516383094, "flos": 19573650218880.0, "grad_norm": 2.78732736535932, "language_loss": 0.75304008, "learning_rate": 1.1712318233277067e-06, "loss": 0.7750085, "num_input_tokens_seen": 115762015, "step": 5380, "time_per_iteration": 2.613391876220703 }, { "auxiliary_loss_clip": 0.01071204, "auxiliary_loss_mlp": 0.01001429, "balance_loss_clip": 1.01490521, "balance_loss_mlp": 1.00043321, "epoch": 0.6470269945289485, "flos": 65098002522240.0, "grad_norm": 0.7561792092352027, "language_loss": 0.57845217, "learning_rate": 1.1705229424774916e-06, "loss": 0.59917849, "num_input_tokens_seen": 115816285, "step": 5381, "time_per_iteration": 3.1657989025115967 }, { "auxiliary_loss_clip": 0.01161832, "auxiliary_loss_mlp": 0.01023434, "balance_loss_clip": 1.04790282, "balance_loss_mlp": 1.01560807, "epoch": 0.6471472374195876, "flos": 30696943639680.0, "grad_norm": 3.1939056991576584, "language_loss": 0.64395273, "learning_rate": 1.1698141874582867e-06, "loss": 0.66580534, "num_input_tokens_seen": 115837330, "step": 5382, "time_per_iteration": 2.696669340133667 }, { "auxiliary_loss_clip": 0.01167743, "auxiliary_loss_mlp": 0.01022809, "balance_loss_clip": 1.04793561, "balance_loss_mlp": 1.01553726, "epoch": 0.6472674803102266, "flos": 20521835487360.0, "grad_norm": 2.0391648393522757, "language_loss": 0.71995592, "learning_rate": 1.169105558377609e-06, "loss": 0.74186146, "num_input_tokens_seen": 115857420, "step": 5383, "time_per_iteration": 2.678466320037842 }, { "auxiliary_loss_clip": 0.01155936, "auxiliary_loss_mlp": 0.010527, "balance_loss_clip": 1.05050659, "balance_loss_mlp": 1.01844716, "epoch": 0.6473877232008658, "flos": 24715447320960.0, "grad_norm": 1.8635555834390942, "language_loss": 0.78652644, "learning_rate": 1.1683970553429587e-06, "loss": 0.80861276, "num_input_tokens_seen": 115878875, "step": 5384, "time_per_iteration": 2.7098305225372314 }, { "auxiliary_loss_clip": 0.01165893, "auxiliary_loss_mlp": 0.01032072, "balance_loss_clip": 1.05084431, "balance_loss_mlp": 1.02448475, "epoch": 0.6475079660915048, "flos": 15885552441600.0, "grad_norm": 1.8487083948754723, "language_loss": 0.82392818, "learning_rate": 1.1676886784618128e-06, "loss": 0.84590787, "num_input_tokens_seen": 115895540, "step": 5385, "time_per_iteration": 3.637943744659424 }, { "auxiliary_loss_clip": 0.0116847, "auxiliary_loss_mlp": 0.01027201, "balance_loss_clip": 1.04957366, "balance_loss_mlp": 1.01974487, "epoch": 0.6476282089821439, "flos": 17381590922880.0, "grad_norm": 2.1905954746734664, "language_loss": 0.84485388, "learning_rate": 1.1669804278416332e-06, "loss": 0.86681062, "num_input_tokens_seen": 115910265, "step": 5386, "time_per_iteration": 2.7902345657348633 }, { "auxiliary_loss_clip": 0.01170187, "auxiliary_loss_mlp": 0.01024015, "balance_loss_clip": 1.05058086, "balance_loss_mlp": 1.01630259, "epoch": 0.6477484518727831, "flos": 20194078861440.0, "grad_norm": 2.455080815470508, "language_loss": 0.71346343, "learning_rate": 1.1662723035898602e-06, "loss": 0.7354055, "num_input_tokens_seen": 115930025, "step": 5387, "time_per_iteration": 2.6650917530059814 }, { "auxiliary_loss_clip": 0.01167475, "auxiliary_loss_mlp": 0.01022041, "balance_loss_clip": 1.04914641, "balance_loss_mlp": 1.01432216, "epoch": 0.6478686947634221, "flos": 25410426641280.0, "grad_norm": 2.0803823737595297, "language_loss": 0.81946516, "learning_rate": 1.165564305813915e-06, "loss": 0.84136033, "num_input_tokens_seen": 115949025, "step": 5388, "time_per_iteration": 2.6894407272338867 }, { "auxiliary_loss_clip": 0.01166393, "auxiliary_loss_mlp": 0.01019865, "balance_loss_clip": 1.04783142, "balance_loss_mlp": 1.01306427, "epoch": 0.6479889376540612, "flos": 20083581648000.0, "grad_norm": 1.778769997539279, "language_loss": 0.81157613, "learning_rate": 1.1648564346212019e-06, "loss": 0.83343875, "num_input_tokens_seen": 115968145, "step": 5389, "time_per_iteration": 2.6872904300689697 }, { "auxiliary_loss_clip": 0.01161282, "auxiliary_loss_mlp": 0.01023059, "balance_loss_clip": 1.04811466, "balance_loss_mlp": 1.01598418, "epoch": 0.6481091805447003, "flos": 26758082039040.0, "grad_norm": 2.034531444627899, "language_loss": 0.76505613, "learning_rate": 1.164148690119104e-06, "loss": 0.78689957, "num_input_tokens_seen": 115989425, "step": 5390, "time_per_iteration": 2.771304130554199 }, { "auxiliary_loss_clip": 0.01168417, "auxiliary_loss_mlp": 0.01023362, "balance_loss_clip": 1.04770565, "balance_loss_mlp": 1.01651371, "epoch": 0.6482294234353394, "flos": 23952094462080.0, "grad_norm": 1.8120540874971625, "language_loss": 0.74348134, "learning_rate": 1.163441072414985e-06, "loss": 0.7653991, "num_input_tokens_seen": 116009630, "step": 5391, "time_per_iteration": 2.643284797668457 }, { "auxiliary_loss_clip": 0.01166198, "auxiliary_loss_mlp": 0.01025436, "balance_loss_clip": 1.04936814, "balance_loss_mlp": 1.01830125, "epoch": 0.6483496663259785, "flos": 26209833776640.0, "grad_norm": 2.0101894311434965, "language_loss": 0.69529849, "learning_rate": 1.16273358161619e-06, "loss": 0.71721482, "num_input_tokens_seen": 116029965, "step": 5392, "time_per_iteration": 2.737327814102173 }, { "auxiliary_loss_clip": 0.01173632, "auxiliary_loss_mlp": 0.01026006, "balance_loss_clip": 1.05010295, "balance_loss_mlp": 1.0183413, "epoch": 0.6484699092166175, "flos": 20922239370240.0, "grad_norm": 2.0228773189493015, "language_loss": 0.83678645, "learning_rate": 1.1620262178300446e-06, "loss": 0.85878277, "num_input_tokens_seen": 116048580, "step": 5393, "time_per_iteration": 2.706571102142334 }, { "auxiliary_loss_clip": 0.01164913, "auxiliary_loss_mlp": 0.01024002, "balance_loss_clip": 1.04906559, "balance_loss_mlp": 1.01723444, "epoch": 0.6485901521072567, "flos": 33072865678080.0, "grad_norm": 1.8276185476186022, "language_loss": 0.75847703, "learning_rate": 1.1613189811638563e-06, "loss": 0.78036618, "num_input_tokens_seen": 116070305, "step": 5394, "time_per_iteration": 2.917562484741211 }, { "auxiliary_loss_clip": 0.01169486, "auxiliary_loss_mlp": 0.01024691, "balance_loss_clip": 1.05130303, "balance_loss_mlp": 1.01779485, "epoch": 0.6487103949978957, "flos": 22274060745600.0, "grad_norm": 1.7482078661373583, "language_loss": 0.78352314, "learning_rate": 1.1606118717249117e-06, "loss": 0.80546492, "num_input_tokens_seen": 116090405, "step": 5395, "time_per_iteration": 2.7747933864593506 }, { "auxiliary_loss_clip": 0.01173373, "auxiliary_loss_mlp": 0.01020995, "balance_loss_clip": 1.04785776, "balance_loss_mlp": 1.01331782, "epoch": 0.6488306378885348, "flos": 22930400010240.0, "grad_norm": 1.7474978042905671, "language_loss": 0.67650473, "learning_rate": 1.1599048896204787e-06, "loss": 0.69844842, "num_input_tokens_seen": 116110285, "step": 5396, "time_per_iteration": 2.663092851638794 }, { "auxiliary_loss_clip": 0.01164057, "auxiliary_loss_mlp": 0.01028646, "balance_loss_clip": 1.04804027, "balance_loss_mlp": 1.02136493, "epoch": 0.648950880779174, "flos": 20376110010240.0, "grad_norm": 1.7167835788275478, "language_loss": 0.8102594, "learning_rate": 1.1591980349578061e-06, "loss": 0.83218646, "num_input_tokens_seen": 116128955, "step": 5397, "time_per_iteration": 2.760098695755005 }, { "auxiliary_loss_clip": 0.01070952, "auxiliary_loss_mlp": 0.01002743, "balance_loss_clip": 1.01554096, "balance_loss_mlp": 1.00181293, "epoch": 0.649071123669813, "flos": 59930889310080.0, "grad_norm": 0.7389112414602036, "language_loss": 0.54257464, "learning_rate": 1.158491307844123e-06, "loss": 0.56331158, "num_input_tokens_seen": 116188875, "step": 5398, "time_per_iteration": 3.2694802284240723 }, { "auxiliary_loss_clip": 0.01165947, "auxiliary_loss_mlp": 0.01021321, "balance_loss_clip": 1.05107391, "balance_loss_mlp": 1.0144161, "epoch": 0.6491913665604521, "flos": 20446566537600.0, "grad_norm": 5.172375990607326, "language_loss": 0.83904874, "learning_rate": 1.1577847083866387e-06, "loss": 0.86092138, "num_input_tokens_seen": 116207910, "step": 5399, "time_per_iteration": 2.727055549621582 }, { "auxiliary_loss_clip": 0.01157568, "auxiliary_loss_mlp": 0.01021657, "balance_loss_clip": 1.04954731, "balance_loss_mlp": 1.01389909, "epoch": 0.6493116094510912, "flos": 16946820702720.0, "grad_norm": 8.117979200663592, "language_loss": 0.72271192, "learning_rate": 1.1570782366925453e-06, "loss": 0.74450409, "num_input_tokens_seen": 116226425, "step": 5400, "time_per_iteration": 2.7436563968658447 }, { "auxiliary_loss_clip": 0.01166917, "auxiliary_loss_mlp": 0.01023885, "balance_loss_clip": 1.04734695, "balance_loss_mlp": 1.0163033, "epoch": 0.6494318523417303, "flos": 18802935072000.0, "grad_norm": 1.6708704255421558, "language_loss": 0.75292552, "learning_rate": 1.1563718928690132e-06, "loss": 0.77483344, "num_input_tokens_seen": 116243860, "step": 5401, "time_per_iteration": 2.7362797260284424 }, { "auxiliary_loss_clip": 0.01160882, "auxiliary_loss_mlp": 0.01031357, "balance_loss_clip": 1.04900205, "balance_loss_mlp": 1.02308428, "epoch": 0.6495520952323693, "flos": 18982847318400.0, "grad_norm": 1.978132888807205, "language_loss": 0.71544886, "learning_rate": 1.1556656770231942e-06, "loss": 0.73737127, "num_input_tokens_seen": 116260055, "step": 5402, "time_per_iteration": 3.637354850769043 }, { "auxiliary_loss_clip": 0.01166611, "auxiliary_loss_mlp": 0.01021816, "balance_loss_clip": 1.04662299, "balance_loss_mlp": 1.01459801, "epoch": 0.6496723381230085, "flos": 22745388032640.0, "grad_norm": 1.4954537286746756, "language_loss": 0.76273483, "learning_rate": 1.1549595892622207e-06, "loss": 0.78461915, "num_input_tokens_seen": 116278825, "step": 5403, "time_per_iteration": 3.608046770095825 }, { "auxiliary_loss_clip": 0.01066729, "auxiliary_loss_mlp": 0.01004261, "balance_loss_clip": 1.02011931, "balance_loss_mlp": 1.00326002, "epoch": 0.6497925810136476, "flos": 62145283887360.0, "grad_norm": 0.818609378433692, "language_loss": 0.58938509, "learning_rate": 1.1542536296932047e-06, "loss": 0.61009496, "num_input_tokens_seen": 116342360, "step": 5404, "time_per_iteration": 4.294219732284546 }, { "auxiliary_loss_clip": 0.01171683, "auxiliary_loss_mlp": 0.01025374, "balance_loss_clip": 1.05040359, "balance_loss_mlp": 1.01776862, "epoch": 0.6499128239042866, "flos": 20156731695360.0, "grad_norm": 1.7968977155425199, "language_loss": 0.7057851, "learning_rate": 1.1535477984232414e-06, "loss": 0.72775567, "num_input_tokens_seen": 116362235, "step": 5405, "time_per_iteration": 2.7979869842529297 }, { "auxiliary_loss_clip": 0.01161369, "auxiliary_loss_mlp": 0.01023449, "balance_loss_clip": 1.04633236, "balance_loss_mlp": 1.01621008, "epoch": 0.6500330667949258, "flos": 24462420940800.0, "grad_norm": 1.9194226723910435, "language_loss": 0.77223283, "learning_rate": 1.152842095559404e-06, "loss": 0.79408097, "num_input_tokens_seen": 116382895, "step": 5406, "time_per_iteration": 2.7071714401245117 }, { "auxiliary_loss_clip": 0.01167491, "auxiliary_loss_mlp": 0.01022767, "balance_loss_clip": 1.04680586, "balance_loss_mlp": 1.01573944, "epoch": 0.6501533096855648, "flos": 25477399549440.0, "grad_norm": 2.0140145326785692, "language_loss": 0.76926398, "learning_rate": 1.1521365212087474e-06, "loss": 0.79116654, "num_input_tokens_seen": 116402880, "step": 5407, "time_per_iteration": 2.7482569217681885 }, { "auxiliary_loss_clip": 0.01166542, "auxiliary_loss_mlp": 0.01023387, "balance_loss_clip": 1.04791486, "balance_loss_mlp": 1.01544762, "epoch": 0.6502735525762039, "flos": 44819245347840.0, "grad_norm": 1.6347576495365024, "language_loss": 0.70649695, "learning_rate": 1.1514310754783062e-06, "loss": 0.72839624, "num_input_tokens_seen": 116425830, "step": 5408, "time_per_iteration": 2.8845057487487793 }, { "auxiliary_loss_clip": 0.01165344, "auxiliary_loss_mlp": 0.01025023, "balance_loss_clip": 1.04891777, "balance_loss_mlp": 1.01784694, "epoch": 0.6503937954668431, "flos": 28658546726400.0, "grad_norm": 1.760607469316896, "language_loss": 0.73616999, "learning_rate": 1.1507257584750964e-06, "loss": 0.75807369, "num_input_tokens_seen": 116446010, "step": 5409, "time_per_iteration": 2.7926952838897705 }, { "auxiliary_loss_clip": 0.01173608, "auxiliary_loss_mlp": 0.0102421, "balance_loss_clip": 1.05154097, "balance_loss_mlp": 1.01725423, "epoch": 0.6505140383574821, "flos": 20922562592640.0, "grad_norm": 2.0145914189725036, "language_loss": 0.77646595, "learning_rate": 1.150020570306113e-06, "loss": 0.79844421, "num_input_tokens_seen": 116465150, "step": 5410, "time_per_iteration": 2.761505365371704 }, { "auxiliary_loss_clip": 0.011626, "auxiliary_loss_mlp": 0.01030635, "balance_loss_clip": 1.04925251, "balance_loss_mlp": 1.02258849, "epoch": 0.6506342812481212, "flos": 20595236929920.0, "grad_norm": 1.6853787497386743, "language_loss": 0.74990565, "learning_rate": 1.1493155110783338e-06, "loss": 0.77183801, "num_input_tokens_seen": 116483675, "step": 5411, "time_per_iteration": 3.667233467102051 }, { "auxiliary_loss_clip": 0.01166709, "auxiliary_loss_mlp": 0.01022135, "balance_loss_clip": 1.04836273, "balance_loss_mlp": 1.01422548, "epoch": 0.6507545241387603, "flos": 30226478279040.0, "grad_norm": 2.087072781912915, "language_loss": 0.70871389, "learning_rate": 1.1486105808987155e-06, "loss": 0.73060238, "num_input_tokens_seen": 116505165, "step": 5412, "time_per_iteration": 2.7682178020477295 }, { "auxiliary_loss_clip": 0.01172039, "auxiliary_loss_mlp": 0.010244, "balance_loss_clip": 1.05158949, "balance_loss_mlp": 1.01754236, "epoch": 0.6508747670293994, "flos": 17128241320320.0, "grad_norm": 1.9859264183940433, "language_loss": 0.81392324, "learning_rate": 1.1479057798741947e-06, "loss": 0.83588761, "num_input_tokens_seen": 116523220, "step": 5413, "time_per_iteration": 2.6549947261810303 }, { "auxiliary_loss_clip": 0.01069112, "auxiliary_loss_mlp": 0.01004155, "balance_loss_clip": 1.0180608, "balance_loss_mlp": 1.00314808, "epoch": 0.6509950099200384, "flos": 68559826573440.0, "grad_norm": 0.7793481255251246, "language_loss": 0.53270936, "learning_rate": 1.14720110811169e-06, "loss": 0.553442, "num_input_tokens_seen": 116580450, "step": 5414, "time_per_iteration": 3.266846179962158 }, { "auxiliary_loss_clip": 0.01169878, "auxiliary_loss_mlp": 0.01022832, "balance_loss_clip": 1.04860687, "balance_loss_mlp": 1.0152384, "epoch": 0.6511152528106776, "flos": 22347462188160.0, "grad_norm": 2.244911208103431, "language_loss": 0.76799208, "learning_rate": 1.146496565718098e-06, "loss": 0.7899192, "num_input_tokens_seen": 116601020, "step": 5415, "time_per_iteration": 2.6854522228240967 }, { "auxiliary_loss_clip": 0.01163199, "auxiliary_loss_mlp": 0.01026178, "balance_loss_clip": 1.04973948, "balance_loss_mlp": 1.01889729, "epoch": 0.6512354957013167, "flos": 20522158709760.0, "grad_norm": 2.011020969897538, "language_loss": 0.75827491, "learning_rate": 1.1457921528002996e-06, "loss": 0.78016865, "num_input_tokens_seen": 116619455, "step": 5416, "time_per_iteration": 2.628563404083252 }, { "auxiliary_loss_clip": 0.01170186, "auxiliary_loss_mlp": 0.01056083, "balance_loss_clip": 1.04834497, "balance_loss_mlp": 1.02188432, "epoch": 0.6513557385919557, "flos": 32337342881280.0, "grad_norm": 2.5044638334382077, "language_loss": 0.72393036, "learning_rate": 1.1450878694651522e-06, "loss": 0.74619305, "num_input_tokens_seen": 116640020, "step": 5417, "time_per_iteration": 2.731898546218872 }, { "auxiliary_loss_clip": 0.01159027, "auxiliary_loss_mlp": 0.01029163, "balance_loss_clip": 1.04733419, "balance_loss_mlp": 1.02183759, "epoch": 0.6514759814825949, "flos": 12093206417280.0, "grad_norm": 2.235243313095637, "language_loss": 0.62994641, "learning_rate": 1.1443837158194954e-06, "loss": 0.65182829, "num_input_tokens_seen": 116655165, "step": 5418, "time_per_iteration": 2.722912073135376 }, { "auxiliary_loss_clip": 0.01157993, "auxiliary_loss_mlp": 0.01021026, "balance_loss_clip": 1.0489732, "balance_loss_mlp": 1.01376569, "epoch": 0.651596224373234, "flos": 22526907557760.0, "grad_norm": 1.5774990180060011, "language_loss": 0.74440491, "learning_rate": 1.1436796919701484e-06, "loss": 0.76619506, "num_input_tokens_seen": 116673880, "step": 5419, "time_per_iteration": 2.752939462661743 }, { "auxiliary_loss_clip": 0.01162526, "auxiliary_loss_mlp": 0.01026361, "balance_loss_clip": 1.04769742, "balance_loss_mlp": 1.01901817, "epoch": 0.651716467263873, "flos": 27818955250560.0, "grad_norm": 2.4782302663434, "language_loss": 0.61918402, "learning_rate": 1.1429757980239115e-06, "loss": 0.64107287, "num_input_tokens_seen": 116694305, "step": 5420, "time_per_iteration": 2.725572347640991 }, { "auxiliary_loss_clip": 0.01173406, "auxiliary_loss_mlp": 0.01025539, "balance_loss_clip": 1.04917419, "balance_loss_mlp": 1.01815414, "epoch": 0.6518367101545122, "flos": 24316300414080.0, "grad_norm": 4.418847134048719, "language_loss": 0.81771374, "learning_rate": 1.1422720340875636e-06, "loss": 0.8397032, "num_input_tokens_seen": 116713055, "step": 5421, "time_per_iteration": 2.673229217529297 }, { "auxiliary_loss_clip": 0.01174083, "auxiliary_loss_mlp": 0.01029597, "balance_loss_clip": 1.04960811, "balance_loss_mlp": 1.0223552, "epoch": 0.6519569530451512, "flos": 20011939971840.0, "grad_norm": 1.9100525225082348, "language_loss": 0.79275614, "learning_rate": 1.1415684002678671e-06, "loss": 0.81479299, "num_input_tokens_seen": 116731815, "step": 5422, "time_per_iteration": 2.7957353591918945 }, { "auxiliary_loss_clip": 0.01170044, "auxiliary_loss_mlp": 0.01026661, "balance_loss_clip": 1.04969835, "balance_loss_mlp": 1.01884711, "epoch": 0.6520771959357903, "flos": 21576064682880.0, "grad_norm": 2.2915314591879237, "language_loss": 0.77965182, "learning_rate": 1.1408648966715617e-06, "loss": 0.80161887, "num_input_tokens_seen": 116749335, "step": 5423, "time_per_iteration": 2.757911443710327 }, { "auxiliary_loss_clip": 0.01167933, "auxiliary_loss_mlp": 0.01028285, "balance_loss_clip": 1.04832566, "balance_loss_mlp": 1.02053618, "epoch": 0.6521974388264293, "flos": 22711021695360.0, "grad_norm": 1.825344473742105, "language_loss": 0.72776657, "learning_rate": 1.1401615234053683e-06, "loss": 0.7497288, "num_input_tokens_seen": 116768155, "step": 5424, "time_per_iteration": 2.69392991065979 }, { "auxiliary_loss_clip": 0.01168798, "auxiliary_loss_mlp": 0.01025086, "balance_loss_clip": 1.04862559, "balance_loss_mlp": 1.01750088, "epoch": 0.6523176817170685, "flos": 23002939526400.0, "grad_norm": 1.9713410490751613, "language_loss": 0.7592808, "learning_rate": 1.1394582805759885e-06, "loss": 0.7812196, "num_input_tokens_seen": 116787435, "step": 5425, "time_per_iteration": 2.7173049449920654 }, { "auxiliary_loss_clip": 0.01167732, "auxiliary_loss_mlp": 0.01023648, "balance_loss_clip": 1.04934657, "balance_loss_mlp": 1.01567912, "epoch": 0.6524379246077076, "flos": 21688249835520.0, "grad_norm": 2.1848167001519716, "language_loss": 0.75917494, "learning_rate": 1.1387551682901022e-06, "loss": 0.78108871, "num_input_tokens_seen": 116808040, "step": 5426, "time_per_iteration": 2.76460862159729 }, { "auxiliary_loss_clip": 0.01157862, "auxiliary_loss_mlp": 0.01025115, "balance_loss_clip": 1.0477705, "balance_loss_mlp": 1.01703858, "epoch": 0.6525581674983466, "flos": 19390936711680.0, "grad_norm": 1.9289870126948254, "language_loss": 0.71036184, "learning_rate": 1.138052186654373e-06, "loss": 0.73219162, "num_input_tokens_seen": 116825510, "step": 5427, "time_per_iteration": 2.6679177284240723 }, { "auxiliary_loss_clip": 0.01171003, "auxiliary_loss_mlp": 0.01028477, "balance_loss_clip": 1.05266142, "balance_loss_mlp": 1.0206213, "epoch": 0.6526784103889858, "flos": 17165444832000.0, "grad_norm": 1.974638915906318, "language_loss": 0.88100833, "learning_rate": 1.1373493357754417e-06, "loss": 0.9030031, "num_input_tokens_seen": 116844415, "step": 5428, "time_per_iteration": 3.54561710357666 }, { "auxiliary_loss_clip": 0.01169126, "auxiliary_loss_mlp": 0.01021644, "balance_loss_clip": 1.04650152, "balance_loss_mlp": 1.01448905, "epoch": 0.6527986532796248, "flos": 18989168112000.0, "grad_norm": 1.5831529490018683, "language_loss": 0.76887184, "learning_rate": 1.1366466157599303e-06, "loss": 0.79077953, "num_input_tokens_seen": 116863690, "step": 5429, "time_per_iteration": 3.6100127696990967 }, { "auxiliary_loss_clip": 0.01160507, "auxiliary_loss_mlp": 0.01061146, "balance_loss_clip": 1.05086017, "balance_loss_mlp": 1.02415991, "epoch": 0.6529188961702639, "flos": 14238581011200.0, "grad_norm": 1.9571865033379754, "language_loss": 0.76609045, "learning_rate": 1.1359440267144412e-06, "loss": 0.78830695, "num_input_tokens_seen": 116881145, "step": 5430, "time_per_iteration": 2.722175359725952 }, { "auxiliary_loss_clip": 0.01168911, "auxiliary_loss_mlp": 0.0102218, "balance_loss_clip": 1.0477612, "balance_loss_mlp": 1.01420236, "epoch": 0.653039139060903, "flos": 36682929158400.0, "grad_norm": 2.0449674729488594, "language_loss": 0.74244684, "learning_rate": 1.1352415687455556e-06, "loss": 0.76435775, "num_input_tokens_seen": 116902405, "step": 5431, "time_per_iteration": 2.8507039546966553 }, { "auxiliary_loss_clip": 0.01171271, "auxiliary_loss_mlp": 0.0102725, "balance_loss_clip": 1.0513984, "balance_loss_mlp": 1.01963198, "epoch": 0.6531593819515421, "flos": 25376275785600.0, "grad_norm": 2.315009539337117, "language_loss": 0.63853967, "learning_rate": 1.1345392419598362e-06, "loss": 0.66052485, "num_input_tokens_seen": 116921285, "step": 5432, "time_per_iteration": 2.7032666206359863 }, { "auxiliary_loss_clip": 0.01161162, "auxiliary_loss_mlp": 0.01025614, "balance_loss_clip": 1.04630613, "balance_loss_mlp": 1.01760292, "epoch": 0.6532796248421812, "flos": 21178533888000.0, "grad_norm": 8.067233845294911, "language_loss": 0.71697801, "learning_rate": 1.1338370464638263e-06, "loss": 0.73884571, "num_input_tokens_seen": 116940685, "step": 5433, "time_per_iteration": 2.7768125534057617 }, { "auxiliary_loss_clip": 0.01169025, "auxiliary_loss_mlp": 0.0102284, "balance_loss_clip": 1.04681611, "balance_loss_mlp": 1.01534212, "epoch": 0.6533998677328203, "flos": 17675950878720.0, "grad_norm": 2.067819086775784, "language_loss": 0.63572133, "learning_rate": 1.1331349823640474e-06, "loss": 0.65763998, "num_input_tokens_seen": 116958115, "step": 5434, "time_per_iteration": 2.6586620807647705 }, { "auxiliary_loss_clip": 0.01168875, "auxiliary_loss_mlp": 0.01049933, "balance_loss_clip": 1.04854524, "balance_loss_mlp": 1.01467752, "epoch": 0.6535201106234594, "flos": 28400384701440.0, "grad_norm": 2.257854560297748, "language_loss": 0.77911001, "learning_rate": 1.132433049767003e-06, "loss": 0.80129814, "num_input_tokens_seen": 116976030, "step": 5435, "time_per_iteration": 2.7447469234466553 }, { "auxiliary_loss_clip": 0.01162765, "auxiliary_loss_mlp": 0.01018815, "balance_loss_clip": 1.04741812, "balance_loss_mlp": 1.01143646, "epoch": 0.6536403535140984, "flos": 23586667447680.0, "grad_norm": 1.5360281212698972, "language_loss": 0.81365508, "learning_rate": 1.1317312487791748e-06, "loss": 0.83547091, "num_input_tokens_seen": 116997680, "step": 5436, "time_per_iteration": 2.702115297317505 }, { "auxiliary_loss_clip": 0.01162603, "auxiliary_loss_mlp": 0.0103182, "balance_loss_clip": 1.04635262, "balance_loss_mlp": 1.02427423, "epoch": 0.6537605964047376, "flos": 21579476474880.0, "grad_norm": 2.451483627394211, "language_loss": 0.73093212, "learning_rate": 1.1310295795070253e-06, "loss": 0.75287634, "num_input_tokens_seen": 117017620, "step": 5437, "time_per_iteration": 3.674663782119751 }, { "auxiliary_loss_clip": 0.01165248, "auxiliary_loss_mlp": 0.01025729, "balance_loss_clip": 1.05030954, "balance_loss_mlp": 1.01796889, "epoch": 0.6538808392953767, "flos": 26833997433600.0, "grad_norm": 2.0526351892630683, "language_loss": 0.8085075, "learning_rate": 1.1303280420569982e-06, "loss": 0.83041728, "num_input_tokens_seen": 117039505, "step": 5438, "time_per_iteration": 2.7863967418670654 }, { "auxiliary_loss_clip": 0.01163561, "auxiliary_loss_mlp": 0.01025792, "balance_loss_clip": 1.04869366, "balance_loss_mlp": 1.01825857, "epoch": 0.6540010821860157, "flos": 30738241301760.0, "grad_norm": 1.7715027676520965, "language_loss": 0.77554417, "learning_rate": 1.1296266365355158e-06, "loss": 0.79743773, "num_input_tokens_seen": 117062890, "step": 5439, "time_per_iteration": 2.76338267326355 }, { "auxiliary_loss_clip": 0.01162062, "auxiliary_loss_mlp": 0.01027295, "balance_loss_clip": 1.04868245, "balance_loss_mlp": 1.01963305, "epoch": 0.6541213250766549, "flos": 26907147480960.0, "grad_norm": 1.9184851552823143, "language_loss": 0.74104923, "learning_rate": 1.1289253630489806e-06, "loss": 0.76294279, "num_input_tokens_seen": 117083940, "step": 5440, "time_per_iteration": 2.750276565551758 }, { "auxiliary_loss_clip": 0.01174567, "auxiliary_loss_mlp": 0.01026098, "balance_loss_clip": 1.05028629, "balance_loss_mlp": 1.01795602, "epoch": 0.6542415679672939, "flos": 19172384409600.0, "grad_norm": 2.0213569053322162, "language_loss": 0.72511876, "learning_rate": 1.1282242217037753e-06, "loss": 0.74712545, "num_input_tokens_seen": 117101440, "step": 5441, "time_per_iteration": 2.678743600845337 }, { "auxiliary_loss_clip": 0.01158585, "auxiliary_loss_mlp": 0.0102874, "balance_loss_clip": 1.04946733, "balance_loss_mlp": 1.02015662, "epoch": 0.654361810857933, "flos": 48173517100800.0, "grad_norm": 3.6166165633397442, "language_loss": 0.62140429, "learning_rate": 1.127523212606262e-06, "loss": 0.64327747, "num_input_tokens_seen": 117124265, "step": 5442, "time_per_iteration": 2.974522829055786 }, { "auxiliary_loss_clip": 0.01164614, "auxiliary_loss_mlp": 0.01026839, "balance_loss_clip": 1.04665077, "balance_loss_mlp": 1.01958466, "epoch": 0.6544820537485722, "flos": 26943165843840.0, "grad_norm": 1.5975188199752894, "language_loss": 0.72905517, "learning_rate": 1.1268223358627835e-06, "loss": 0.75096971, "num_input_tokens_seen": 117146755, "step": 5443, "time_per_iteration": 2.738980293273926 }, { "auxiliary_loss_clip": 0.01171479, "auxiliary_loss_mlp": 0.01021826, "balance_loss_clip": 1.04842579, "balance_loss_mlp": 1.01404786, "epoch": 0.6546022966392112, "flos": 20886328748160.0, "grad_norm": 1.7569952585678383, "language_loss": 0.72062725, "learning_rate": 1.126121591579663e-06, "loss": 0.74256027, "num_input_tokens_seen": 117165960, "step": 5444, "time_per_iteration": 2.7309088706970215 }, { "auxiliary_loss_clip": 0.01159768, "auxiliary_loss_mlp": 0.01027814, "balance_loss_clip": 1.04588056, "balance_loss_mlp": 1.02078044, "epoch": 0.6547225395298503, "flos": 24936693143040.0, "grad_norm": 1.9025360333217518, "language_loss": 0.68928432, "learning_rate": 1.1254209798632018e-06, "loss": 0.71116018, "num_input_tokens_seen": 117186980, "step": 5445, "time_per_iteration": 2.8814291954040527 }, { "auxiliary_loss_clip": 0.01149338, "auxiliary_loss_mlp": 0.01026675, "balance_loss_clip": 1.05006051, "balance_loss_mlp": 1.01952553, "epoch": 0.6548427824204894, "flos": 22565942663040.0, "grad_norm": 1.9573652002928952, "language_loss": 0.84779215, "learning_rate": 1.124720500819683e-06, "loss": 0.86955225, "num_input_tokens_seen": 117205135, "step": 5446, "time_per_iteration": 2.9114339351654053 }, { "auxiliary_loss_clip": 0.01169168, "auxiliary_loss_mlp": 0.01029596, "balance_loss_clip": 1.04733872, "balance_loss_mlp": 1.02169204, "epoch": 0.6549630253111285, "flos": 18442500048000.0, "grad_norm": 1.8961168798645698, "language_loss": 0.82068408, "learning_rate": 1.1240201545553682e-06, "loss": 0.84267175, "num_input_tokens_seen": 117222935, "step": 5447, "time_per_iteration": 2.6931827068328857 }, { "auxiliary_loss_clip": 0.01161087, "auxiliary_loss_mlp": 0.01025374, "balance_loss_clip": 1.0486424, "balance_loss_mlp": 1.01764941, "epoch": 0.6550832682017675, "flos": 25187313312000.0, "grad_norm": 2.2071195108008075, "language_loss": 0.73278987, "learning_rate": 1.1233199411764987e-06, "loss": 0.75465453, "num_input_tokens_seen": 117242370, "step": 5448, "time_per_iteration": 2.6975467205047607 }, { "auxiliary_loss_clip": 0.01152257, "auxiliary_loss_mlp": 0.01024401, "balance_loss_clip": 1.04801881, "balance_loss_mlp": 1.01699829, "epoch": 0.6552035110924067, "flos": 22748153379840.0, "grad_norm": 1.7232556081745656, "language_loss": 0.68650162, "learning_rate": 1.1226198607892978e-06, "loss": 0.70826817, "num_input_tokens_seen": 117262930, "step": 5449, "time_per_iteration": 2.779046058654785 }, { "auxiliary_loss_clip": 0.01160564, "auxiliary_loss_mlp": 0.01022612, "balance_loss_clip": 1.04982805, "balance_loss_mlp": 1.01551914, "epoch": 0.6553237539830458, "flos": 21799178012160.0, "grad_norm": 2.123778415494062, "language_loss": 0.80446672, "learning_rate": 1.1219199134999664e-06, "loss": 0.82629848, "num_input_tokens_seen": 117281430, "step": 5450, "time_per_iteration": 2.7225098609924316 }, { "auxiliary_loss_clip": 0.01167429, "auxiliary_loss_mlp": 0.01032758, "balance_loss_clip": 1.04843497, "balance_loss_mlp": 1.02416348, "epoch": 0.6554439968736848, "flos": 20887226588160.0, "grad_norm": 1.94339905115001, "language_loss": 0.78136885, "learning_rate": 1.1212200994146863e-06, "loss": 0.80337065, "num_input_tokens_seen": 117299185, "step": 5451, "time_per_iteration": 2.6739742755889893 }, { "auxiliary_loss_clip": 0.01158812, "auxiliary_loss_mlp": 0.01023649, "balance_loss_clip": 1.04532003, "balance_loss_mlp": 1.01598382, "epoch": 0.655564239764324, "flos": 16139045698560.0, "grad_norm": 2.786289672338935, "language_loss": 0.75800729, "learning_rate": 1.120520418639618e-06, "loss": 0.77983189, "num_input_tokens_seen": 117317720, "step": 5452, "time_per_iteration": 2.8011205196380615 }, { "auxiliary_loss_clip": 0.01165965, "auxiliary_loss_mlp": 0.01025887, "balance_loss_clip": 1.04779351, "balance_loss_mlp": 1.01807904, "epoch": 0.655684482654963, "flos": 29570354496000.0, "grad_norm": 1.845956253385625, "language_loss": 0.83217812, "learning_rate": 1.119820871280903e-06, "loss": 0.85409665, "num_input_tokens_seen": 117338795, "step": 5453, "time_per_iteration": 2.782006025314331 }, { "auxiliary_loss_clip": 0.01167414, "auxiliary_loss_mlp": 0.01026857, "balance_loss_clip": 1.0497334, "balance_loss_mlp": 1.01895368, "epoch": 0.6558047255456021, "flos": 29789409588480.0, "grad_norm": 1.7634423048472878, "language_loss": 0.73470783, "learning_rate": 1.1191214574446614e-06, "loss": 0.75665057, "num_input_tokens_seen": 117359040, "step": 5454, "time_per_iteration": 3.638479232788086 }, { "auxiliary_loss_clip": 0.01157709, "auxiliary_loss_mlp": 0.01026035, "balance_loss_clip": 1.04533696, "balance_loss_mlp": 1.01859677, "epoch": 0.6559249684362413, "flos": 29059166090880.0, "grad_norm": 1.47394125838444, "language_loss": 0.8014735, "learning_rate": 1.118422177236995e-06, "loss": 0.82331091, "num_input_tokens_seen": 117380865, "step": 5455, "time_per_iteration": 3.9132943153381348 }, { "auxiliary_loss_clip": 0.01166553, "auxiliary_loss_mlp": 0.01024111, "balance_loss_clip": 1.04823816, "balance_loss_mlp": 1.01631784, "epoch": 0.6560452113268803, "flos": 20225464369920.0, "grad_norm": 6.743090101454729, "language_loss": 0.85687637, "learning_rate": 1.1177230307639835e-06, "loss": 0.87878305, "num_input_tokens_seen": 117398405, "step": 5456, "time_per_iteration": 3.6413071155548096 }, { "auxiliary_loss_clip": 0.01159556, "auxiliary_loss_mlp": 0.01022889, "balance_loss_clip": 1.05018151, "balance_loss_mlp": 1.01542032, "epoch": 0.6561654542175194, "flos": 25045538330880.0, "grad_norm": 2.256189420623056, "language_loss": 0.78794348, "learning_rate": 1.1170240181316865e-06, "loss": 0.80976796, "num_input_tokens_seen": 117419850, "step": 5457, "time_per_iteration": 2.7906675338745117 }, { "auxiliary_loss_clip": 0.01158684, "auxiliary_loss_mlp": 0.01023463, "balance_loss_clip": 1.0468967, "balance_loss_mlp": 1.01586306, "epoch": 0.6562856971081584, "flos": 22856711258880.0, "grad_norm": 2.3428584671080492, "language_loss": 0.79180503, "learning_rate": 1.1163251394461442e-06, "loss": 0.81362653, "num_input_tokens_seen": 117438330, "step": 5458, "time_per_iteration": 2.6922693252563477 }, { "auxiliary_loss_clip": 0.01166798, "auxiliary_loss_mlp": 0.01029693, "balance_loss_clip": 1.05014038, "balance_loss_mlp": 1.02234399, "epoch": 0.6564059399987976, "flos": 18872565586560.0, "grad_norm": 2.6968606820607697, "language_loss": 0.82183313, "learning_rate": 1.1156263948133746e-06, "loss": 0.84379804, "num_input_tokens_seen": 117454985, "step": 5459, "time_per_iteration": 2.6721436977386475 }, { "auxiliary_loss_clip": 0.01151108, "auxiliary_loss_mlp": 0.01053001, "balance_loss_clip": 1.04902542, "balance_loss_mlp": 1.01777458, "epoch": 0.6565261828894366, "flos": 25484187219840.0, "grad_norm": 2.711599370915068, "language_loss": 0.77726167, "learning_rate": 1.1149277843393787e-06, "loss": 0.79930276, "num_input_tokens_seen": 117476145, "step": 5460, "time_per_iteration": 2.784860134124756 }, { "auxiliary_loss_clip": 0.01155902, "auxiliary_loss_mlp": 0.01056436, "balance_loss_clip": 1.04849172, "balance_loss_mlp": 1.02040505, "epoch": 0.6566464257800757, "flos": 19683500987520.0, "grad_norm": 3.307904520704167, "language_loss": 0.63723946, "learning_rate": 1.1142293081301342e-06, "loss": 0.65936279, "num_input_tokens_seen": 117494025, "step": 5461, "time_per_iteration": 2.785898208618164 }, { "auxiliary_loss_clip": 0.01160752, "auxiliary_loss_mlp": 0.01026618, "balance_loss_clip": 1.04816246, "balance_loss_mlp": 1.0198288, "epoch": 0.6567666686707149, "flos": 23514127931520.0, "grad_norm": 1.5948695475483503, "language_loss": 0.67646879, "learning_rate": 1.1135309662915995e-06, "loss": 0.69834244, "num_input_tokens_seen": 117514190, "step": 5462, "time_per_iteration": 3.71185564994812 }, { "auxiliary_loss_clip": 0.01161948, "auxiliary_loss_mlp": 0.01025226, "balance_loss_clip": 1.04537368, "balance_loss_mlp": 1.01834488, "epoch": 0.6568869115613539, "flos": 32781342896640.0, "grad_norm": 6.900474856312558, "language_loss": 0.6023308, "learning_rate": 1.112832758929712e-06, "loss": 0.62420249, "num_input_tokens_seen": 117536800, "step": 5463, "time_per_iteration": 2.798088550567627 }, { "auxiliary_loss_clip": 0.01163776, "auxiliary_loss_mlp": 0.01027574, "balance_loss_clip": 1.04641724, "balance_loss_mlp": 1.01959932, "epoch": 0.657007154451993, "flos": 18442428220800.0, "grad_norm": 1.727113281056969, "language_loss": 0.75056112, "learning_rate": 1.11213468615039e-06, "loss": 0.77247465, "num_input_tokens_seen": 117556230, "step": 5464, "time_per_iteration": 2.738975763320923 }, { "auxiliary_loss_clip": 0.01153024, "auxiliary_loss_mlp": 0.01022725, "balance_loss_clip": 1.04820418, "balance_loss_mlp": 1.01562572, "epoch": 0.6571273973426321, "flos": 25156717902720.0, "grad_norm": 1.6349240320539908, "language_loss": 0.7510184, "learning_rate": 1.1114367480595292e-06, "loss": 0.77277589, "num_input_tokens_seen": 117577310, "step": 5465, "time_per_iteration": 2.726947069168091 }, { "auxiliary_loss_clip": 0.01150565, "auxiliary_loss_mlp": 0.01027525, "balance_loss_clip": 1.04796302, "balance_loss_mlp": 1.01980042, "epoch": 0.6572476402332712, "flos": 17529830352000.0, "grad_norm": 2.0654693376195006, "language_loss": 0.81301707, "learning_rate": 1.1107389447630086e-06, "loss": 0.83479798, "num_input_tokens_seen": 117596010, "step": 5466, "time_per_iteration": 2.8212368488311768 }, { "auxiliary_loss_clip": 0.01163013, "auxiliary_loss_mlp": 0.01050389, "balance_loss_clip": 1.04979014, "balance_loss_mlp": 1.01600122, "epoch": 0.6573678831239103, "flos": 17014260487680.0, "grad_norm": 3.600307623318433, "language_loss": 0.78371239, "learning_rate": 1.1100412763666818e-06, "loss": 0.80584645, "num_input_tokens_seen": 117611270, "step": 5467, "time_per_iteration": 2.6486897468566895 }, { "auxiliary_loss_clip": 0.01166058, "auxiliary_loss_mlp": 0.01024627, "balance_loss_clip": 1.0481559, "balance_loss_mlp": 1.01747441, "epoch": 0.6574881260145494, "flos": 23910078528000.0, "grad_norm": 1.6821570083811588, "language_loss": 0.79963994, "learning_rate": 1.1093437429763865e-06, "loss": 0.82154679, "num_input_tokens_seen": 117631535, "step": 5468, "time_per_iteration": 2.7460312843322754 }, { "auxiliary_loss_clip": 0.01164428, "auxiliary_loss_mlp": 0.01020893, "balance_loss_clip": 1.04682827, "balance_loss_mlp": 1.01339459, "epoch": 0.6576083689051885, "flos": 11218458504960.0, "grad_norm": 2.0970716485480088, "language_loss": 0.73390746, "learning_rate": 1.1086463446979361e-06, "loss": 0.75576067, "num_input_tokens_seen": 117649885, "step": 5469, "time_per_iteration": 2.6888163089752197 }, { "auxiliary_loss_clip": 0.01170991, "auxiliary_loss_mlp": 0.01024314, "balance_loss_clip": 1.05176985, "balance_loss_mlp": 1.01671994, "epoch": 0.6577286117958275, "flos": 22455553190400.0, "grad_norm": 1.781676046158152, "language_loss": 0.77346206, "learning_rate": 1.1079490816371277e-06, "loss": 0.7954151, "num_input_tokens_seen": 117669650, "step": 5470, "time_per_iteration": 2.644172430038452 }, { "auxiliary_loss_clip": 0.01165221, "auxiliary_loss_mlp": 0.01050847, "balance_loss_clip": 1.04604018, "balance_loss_mlp": 1.01603532, "epoch": 0.6578488546864667, "flos": 21872184405120.0, "grad_norm": 2.151760535322101, "language_loss": 0.74268407, "learning_rate": 1.1072519538997352e-06, "loss": 0.76484478, "num_input_tokens_seen": 117688790, "step": 5471, "time_per_iteration": 2.7102181911468506 }, { "auxiliary_loss_clip": 0.0116417, "auxiliary_loss_mlp": 0.0102512, "balance_loss_clip": 1.04587865, "balance_loss_mlp": 1.01746702, "epoch": 0.6579690975771058, "flos": 23543753673600.0, "grad_norm": 1.9616023252862627, "language_loss": 0.82336783, "learning_rate": 1.1065549615915095e-06, "loss": 0.84526074, "num_input_tokens_seen": 117708620, "step": 5472, "time_per_iteration": 2.683953046798706 }, { "auxiliary_loss_clip": 0.01165813, "auxiliary_loss_mlp": 0.0103108, "balance_loss_clip": 1.0493654, "balance_loss_mlp": 1.02342665, "epoch": 0.6580893404677448, "flos": 32743995730560.0, "grad_norm": 2.6208223801304205, "language_loss": 0.7909956, "learning_rate": 1.105858104818187e-06, "loss": 0.81296456, "num_input_tokens_seen": 117729775, "step": 5473, "time_per_iteration": 2.8013765811920166 }, { "auxiliary_loss_clip": 0.01170679, "auxiliary_loss_mlp": 0.01022648, "balance_loss_clip": 1.04785228, "balance_loss_mlp": 1.01514411, "epoch": 0.658209583358384, "flos": 15888138220800.0, "grad_norm": 2.5937988137846246, "language_loss": 0.74715698, "learning_rate": 1.105161383685478e-06, "loss": 0.76909029, "num_input_tokens_seen": 117746160, "step": 5474, "time_per_iteration": 2.6407744884490967 }, { "auxiliary_loss_clip": 0.01066658, "auxiliary_loss_mlp": 0.01005965, "balance_loss_clip": 1.01611972, "balance_loss_mlp": 1.00498128, "epoch": 0.658329826249023, "flos": 62695902447360.0, "grad_norm": 0.7734656015477606, "language_loss": 0.56304276, "learning_rate": 1.1044647982990771e-06, "loss": 0.58376896, "num_input_tokens_seen": 117808045, "step": 5475, "time_per_iteration": 3.2438337802886963 }, { "auxiliary_loss_clip": 0.01165353, "auxiliary_loss_mlp": 0.01028765, "balance_loss_clip": 1.04955781, "balance_loss_mlp": 1.02098107, "epoch": 0.6584500691396621, "flos": 31722624501120.0, "grad_norm": 2.686901471119234, "language_loss": 0.64292526, "learning_rate": 1.1037683487646536e-06, "loss": 0.66486645, "num_input_tokens_seen": 117828330, "step": 5476, "time_per_iteration": 2.746480703353882 }, { "auxiliary_loss_clip": 0.01162136, "auxiliary_loss_mlp": 0.01055952, "balance_loss_clip": 1.04913771, "balance_loss_mlp": 1.02083457, "epoch": 0.6585703120303013, "flos": 18406086635520.0, "grad_norm": 1.9450018623629357, "language_loss": 0.77046978, "learning_rate": 1.1030720351878583e-06, "loss": 0.7926507, "num_input_tokens_seen": 117846450, "step": 5477, "time_per_iteration": 2.749037742614746 }, { "auxiliary_loss_clip": 0.01072687, "auxiliary_loss_mlp": 0.01005275, "balance_loss_clip": 1.01644933, "balance_loss_mlp": 1.00419652, "epoch": 0.6586905549209403, "flos": 58309880434560.0, "grad_norm": 0.8071415322432801, "language_loss": 0.57601643, "learning_rate": 1.102375857674323e-06, "loss": 0.59679604, "num_input_tokens_seen": 117908365, "step": 5478, "time_per_iteration": 3.2334625720977783 }, { "auxiliary_loss_clip": 0.0116412, "auxiliary_loss_mlp": 0.01021619, "balance_loss_clip": 1.04766047, "balance_loss_mlp": 1.01448166, "epoch": 0.6588107978115794, "flos": 22782627457920.0, "grad_norm": 1.9939553146460287, "language_loss": 0.90532833, "learning_rate": 1.1016798163296561e-06, "loss": 0.92718577, "num_input_tokens_seen": 117927565, "step": 5479, "time_per_iteration": 2.73398494720459 }, { "auxiliary_loss_clip": 0.01168473, "auxiliary_loss_mlp": 0.01029927, "balance_loss_clip": 1.04870987, "balance_loss_mlp": 1.02248812, "epoch": 0.6589310407022185, "flos": 20667525050880.0, "grad_norm": 1.798706022341502, "language_loss": 0.66061604, "learning_rate": 1.1009839112594471e-06, "loss": 0.68260002, "num_input_tokens_seen": 117945590, "step": 5480, "time_per_iteration": 3.574248790740967 }, { "auxiliary_loss_clip": 0.01168207, "auxiliary_loss_mlp": 0.0102785, "balance_loss_clip": 1.0472188, "balance_loss_mlp": 1.02020907, "epoch": 0.6590512835928576, "flos": 25630595055360.0, "grad_norm": 2.231550979570244, "language_loss": 0.71830988, "learning_rate": 1.1002881425692638e-06, "loss": 0.7402705, "num_input_tokens_seen": 117966020, "step": 5481, "time_per_iteration": 4.60410213470459 }, { "auxiliary_loss_clip": 0.01160946, "auxiliary_loss_mlp": 0.01025721, "balance_loss_clip": 1.04643369, "balance_loss_mlp": 1.01847053, "epoch": 0.6591715264834966, "flos": 23726108044800.0, "grad_norm": 1.8097792225877216, "language_loss": 0.75051451, "learning_rate": 1.0995925103646532e-06, "loss": 0.77238119, "num_input_tokens_seen": 117984620, "step": 5482, "time_per_iteration": 2.6596925258636475 }, { "auxiliary_loss_clip": 0.01155959, "auxiliary_loss_mlp": 0.01024179, "balance_loss_clip": 1.04881716, "balance_loss_mlp": 1.01696062, "epoch": 0.6592917693741358, "flos": 35773850822400.0, "grad_norm": 1.5550905511534747, "language_loss": 0.66988635, "learning_rate": 1.0988970147511437e-06, "loss": 0.69168782, "num_input_tokens_seen": 118006500, "step": 5483, "time_per_iteration": 2.8647942543029785 }, { "auxiliary_loss_clip": 0.01166133, "auxiliary_loss_mlp": 0.01023886, "balance_loss_clip": 1.05079126, "balance_loss_mlp": 1.01588702, "epoch": 0.6594120122647749, "flos": 21396834794880.0, "grad_norm": 2.1818826643370173, "language_loss": 0.81125343, "learning_rate": 1.0982016558342405e-06, "loss": 0.83315355, "num_input_tokens_seen": 118025470, "step": 5484, "time_per_iteration": 2.7777888774871826 }, { "auxiliary_loss_clip": 0.01173179, "auxiliary_loss_mlp": 0.01023827, "balance_loss_clip": 1.05019343, "balance_loss_mlp": 1.01659083, "epoch": 0.6595322551554139, "flos": 19351829779200.0, "grad_norm": 1.894508361118727, "language_loss": 0.71144426, "learning_rate": 1.0975064337194291e-06, "loss": 0.73341429, "num_input_tokens_seen": 118043515, "step": 5485, "time_per_iteration": 2.7826638221740723 }, { "auxiliary_loss_clip": 0.01157614, "auxiliary_loss_mlp": 0.0102804, "balance_loss_clip": 1.04895139, "balance_loss_mlp": 1.02040505, "epoch": 0.6596524980460531, "flos": 16837113588480.0, "grad_norm": 1.7135593853522224, "language_loss": 0.70259285, "learning_rate": 1.0968113485121743e-06, "loss": 0.7244494, "num_input_tokens_seen": 118063105, "step": 5486, "time_per_iteration": 2.8639326095581055 }, { "auxiliary_loss_clip": 0.0117054, "auxiliary_loss_mlp": 0.01055675, "balance_loss_clip": 1.04861057, "balance_loss_mlp": 1.02077925, "epoch": 0.6597727409366921, "flos": 21798567480960.0, "grad_norm": 1.8323599740054781, "language_loss": 0.8041352, "learning_rate": 1.0961164003179185e-06, "loss": 0.82639736, "num_input_tokens_seen": 118081615, "step": 5487, "time_per_iteration": 2.7174570560455322 }, { "auxiliary_loss_clip": 0.01162012, "auxiliary_loss_mlp": 0.01029007, "balance_loss_clip": 1.04967225, "balance_loss_mlp": 1.02226615, "epoch": 0.6598929838273312, "flos": 23730704985600.0, "grad_norm": 1.8464361332246422, "language_loss": 0.84535152, "learning_rate": 1.0954215892420884e-06, "loss": 0.86726177, "num_input_tokens_seen": 118102315, "step": 5488, "time_per_iteration": 3.6779744625091553 }, { "auxiliary_loss_clip": 0.01165873, "auxiliary_loss_mlp": 0.01029748, "balance_loss_clip": 1.05129528, "balance_loss_mlp": 1.02244675, "epoch": 0.6600132267179702, "flos": 19974520978560.0, "grad_norm": 1.8480430561745267, "language_loss": 0.70600021, "learning_rate": 1.094726915390082e-06, "loss": 0.72795641, "num_input_tokens_seen": 118120650, "step": 5489, "time_per_iteration": 2.759326696395874 }, { "auxiliary_loss_clip": 0.01166884, "auxiliary_loss_mlp": 0.01028144, "balance_loss_clip": 1.0489316, "balance_loss_mlp": 1.02029991, "epoch": 0.6601334696086094, "flos": 22342649765760.0, "grad_norm": 1.7117196454715318, "language_loss": 0.69877839, "learning_rate": 1.0940323788672836e-06, "loss": 0.7207287, "num_input_tokens_seen": 118139825, "step": 5490, "time_per_iteration": 2.679898262023926 }, { "auxiliary_loss_clip": 0.01161827, "auxiliary_loss_mlp": 0.01022986, "balance_loss_clip": 1.04801273, "balance_loss_mlp": 1.01594687, "epoch": 0.6602537124992485, "flos": 25703098657920.0, "grad_norm": 1.86775532014228, "language_loss": 0.73668706, "learning_rate": 1.0933379797790522e-06, "loss": 0.75853515, "num_input_tokens_seen": 118159240, "step": 5491, "time_per_iteration": 2.7147433757781982 }, { "auxiliary_loss_clip": 0.01172082, "auxiliary_loss_mlp": 0.01025897, "balance_loss_clip": 1.05032969, "balance_loss_mlp": 1.0186758, "epoch": 0.6603739553898875, "flos": 25848572739840.0, "grad_norm": 3.2036644762157414, "language_loss": 0.7143994, "learning_rate": 1.0926437182307293e-06, "loss": 0.73637915, "num_input_tokens_seen": 118178050, "step": 5492, "time_per_iteration": 2.66282057762146 }, { "auxiliary_loss_clip": 0.01168915, "auxiliary_loss_mlp": 0.01027012, "balance_loss_clip": 1.04755497, "balance_loss_mlp": 1.01952529, "epoch": 0.6604941982805267, "flos": 24570296461440.0, "grad_norm": 2.3507127284854197, "language_loss": 0.777879, "learning_rate": 1.0919495943276338e-06, "loss": 0.79983824, "num_input_tokens_seen": 118199070, "step": 5493, "time_per_iteration": 2.750936508178711 }, { "auxiliary_loss_clip": 0.01165221, "auxiliary_loss_mlp": 0.01027916, "balance_loss_clip": 1.04720712, "balance_loss_mlp": 1.01983976, "epoch": 0.6606144411711657, "flos": 13261775581440.0, "grad_norm": 7.739938128407087, "language_loss": 0.7663669, "learning_rate": 1.0912556081750611e-06, "loss": 0.78829825, "num_input_tokens_seen": 118217000, "step": 5494, "time_per_iteration": 2.722196102142334 }, { "auxiliary_loss_clip": 0.01162133, "auxiliary_loss_mlp": 0.01023194, "balance_loss_clip": 1.04923105, "balance_loss_mlp": 1.01582098, "epoch": 0.6607346840618048, "flos": 25155281358720.0, "grad_norm": 1.760815118869966, "language_loss": 0.76077574, "learning_rate": 1.0905617598782909e-06, "loss": 0.78262901, "num_input_tokens_seen": 118237205, "step": 5495, "time_per_iteration": 2.796193838119507 }, { "auxiliary_loss_clip": 0.01152682, "auxiliary_loss_mlp": 0.01029366, "balance_loss_clip": 1.04963315, "balance_loss_mlp": 1.0216291, "epoch": 0.660854926952444, "flos": 17638029095040.0, "grad_norm": 2.157809873880854, "language_loss": 0.81510341, "learning_rate": 1.0898680495425775e-06, "loss": 0.8369239, "num_input_tokens_seen": 118255495, "step": 5496, "time_per_iteration": 2.7054669857025146 }, { "auxiliary_loss_clip": 0.01167063, "auxiliary_loss_mlp": 0.01024532, "balance_loss_clip": 1.04819536, "balance_loss_mlp": 1.01713538, "epoch": 0.660975169843083, "flos": 16836000266880.0, "grad_norm": 1.9065364984992768, "language_loss": 0.80825168, "learning_rate": 1.0891744772731594e-06, "loss": 0.83016765, "num_input_tokens_seen": 118273310, "step": 5497, "time_per_iteration": 2.706101655960083 }, { "auxiliary_loss_clip": 0.01167942, "auxiliary_loss_mlp": 0.01024397, "balance_loss_clip": 1.04695678, "balance_loss_mlp": 1.01673186, "epoch": 0.6610954127337221, "flos": 26870410846080.0, "grad_norm": 1.665067260273491, "language_loss": 0.6573571, "learning_rate": 1.088481043175248e-06, "loss": 0.67928052, "num_input_tokens_seen": 118293880, "step": 5498, "time_per_iteration": 2.8155324459075928 }, { "auxiliary_loss_clip": 0.01157821, "auxiliary_loss_mlp": 0.01026141, "balance_loss_clip": 1.04771042, "balance_loss_mlp": 1.01837802, "epoch": 0.6612156556243612, "flos": 26465697331200.0, "grad_norm": 1.933710876371045, "language_loss": 0.75862539, "learning_rate": 1.0877877473540368e-06, "loss": 0.78046501, "num_input_tokens_seen": 118314465, "step": 5499, "time_per_iteration": 2.863264322280884 }, { "auxiliary_loss_clip": 0.01172156, "auxiliary_loss_mlp": 0.01024134, "balance_loss_clip": 1.04833543, "balance_loss_mlp": 1.01617122, "epoch": 0.6613358985150003, "flos": 19791915212160.0, "grad_norm": 1.7266912091610982, "language_loss": 0.72757089, "learning_rate": 1.0870945899147002e-06, "loss": 0.74953389, "num_input_tokens_seen": 118331110, "step": 5500, "time_per_iteration": 2.8860669136047363 }, { "auxiliary_loss_clip": 0.01166017, "auxiliary_loss_mlp": 0.01027607, "balance_loss_clip": 1.04878926, "balance_loss_mlp": 1.02019763, "epoch": 0.6614561414056394, "flos": 26831627136000.0, "grad_norm": 2.1796667251738118, "language_loss": 0.75966001, "learning_rate": 1.0864015709623879e-06, "loss": 0.78159624, "num_input_tokens_seen": 118351980, "step": 5501, "time_per_iteration": 2.833425521850586 }, { "auxiliary_loss_clip": 0.01170413, "auxiliary_loss_mlp": 0.01025434, "balance_loss_clip": 1.04792476, "balance_loss_mlp": 1.01806641, "epoch": 0.6615763842962785, "flos": 22894597128960.0, "grad_norm": 2.572014284814256, "language_loss": 0.80261946, "learning_rate": 1.0857086906022313e-06, "loss": 0.82457793, "num_input_tokens_seen": 118370315, "step": 5502, "time_per_iteration": 2.7651660442352295 }, { "auxiliary_loss_clip": 0.01145667, "auxiliary_loss_mlp": 0.01024917, "balance_loss_clip": 1.0488627, "balance_loss_mlp": 1.01721001, "epoch": 0.6616966271869176, "flos": 24790321221120.0, "grad_norm": 2.0543005774310004, "language_loss": 0.72674036, "learning_rate": 1.0850159489393388e-06, "loss": 0.74844623, "num_input_tokens_seen": 118389575, "step": 5503, "time_per_iteration": 2.843967914581299 }, { "auxiliary_loss_clip": 0.0115628, "auxiliary_loss_mlp": 0.01026592, "balance_loss_clip": 1.04627824, "balance_loss_mlp": 1.01876926, "epoch": 0.6618168700775566, "flos": 17202109639680.0, "grad_norm": 1.8438813570670654, "language_loss": 0.82345796, "learning_rate": 1.0843233460787992e-06, "loss": 0.84528661, "num_input_tokens_seen": 118406790, "step": 5504, "time_per_iteration": 2.7735705375671387 }, { "auxiliary_loss_clip": 0.01148671, "auxiliary_loss_mlp": 0.0102126, "balance_loss_clip": 1.04796505, "balance_loss_mlp": 1.01407146, "epoch": 0.6619371129681958, "flos": 25447091448960.0, "grad_norm": 1.9541717679426307, "language_loss": 0.78029472, "learning_rate": 1.0836308821256805e-06, "loss": 0.80199403, "num_input_tokens_seen": 118427590, "step": 5505, "time_per_iteration": 3.6549909114837646 }, { "auxiliary_loss_clip": 0.01166174, "auxiliary_loss_mlp": 0.01026135, "balance_loss_clip": 1.04783535, "balance_loss_mlp": 1.01921237, "epoch": 0.6620573558588349, "flos": 18040444139520.0, "grad_norm": 2.3135751701495417, "language_loss": 0.77792495, "learning_rate": 1.0829385571850282e-06, "loss": 0.79984802, "num_input_tokens_seen": 118444570, "step": 5506, "time_per_iteration": 2.7429754734039307 }, { "auxiliary_loss_clip": 0.01177737, "auxiliary_loss_mlp": 0.01025633, "balance_loss_clip": 1.05013001, "balance_loss_mlp": 1.01746726, "epoch": 0.6621775987494739, "flos": 17785586165760.0, "grad_norm": 2.6082100589523227, "language_loss": 0.83444774, "learning_rate": 1.0822463713618679e-06, "loss": 0.85648149, "num_input_tokens_seen": 118461425, "step": 5507, "time_per_iteration": 5.186325311660767 }, { "auxiliary_loss_clip": 0.01162056, "auxiliary_loss_mlp": 0.01030563, "balance_loss_clip": 1.04854536, "balance_loss_mlp": 1.02309453, "epoch": 0.6622978416401131, "flos": 17492590926720.0, "grad_norm": 4.188295908047817, "language_loss": 0.85329258, "learning_rate": 1.0815543247612034e-06, "loss": 0.87521875, "num_input_tokens_seen": 118478495, "step": 5508, "time_per_iteration": 2.8779783248901367 }, { "auxiliary_loss_clip": 0.01164999, "auxiliary_loss_mlp": 0.01023265, "balance_loss_clip": 1.04671955, "balance_loss_mlp": 1.01602316, "epoch": 0.6624180845307521, "flos": 21648352803840.0, "grad_norm": 1.5895445975976004, "language_loss": 0.82991588, "learning_rate": 1.0808624174880168e-06, "loss": 0.85179853, "num_input_tokens_seen": 118499145, "step": 5509, "time_per_iteration": 2.859076499938965 }, { "auxiliary_loss_clip": 0.01167058, "auxiliary_loss_mlp": 0.01023399, "balance_loss_clip": 1.04727054, "balance_loss_mlp": 1.01655662, "epoch": 0.6625383274213912, "flos": 23805902108160.0, "grad_norm": 1.79131216548201, "language_loss": 0.79600519, "learning_rate": 1.080170649647272e-06, "loss": 0.81790984, "num_input_tokens_seen": 118518950, "step": 5510, "time_per_iteration": 2.8761706352233887 }, { "auxiliary_loss_clip": 0.01167013, "auxiliary_loss_mlp": 0.01030252, "balance_loss_clip": 1.04693222, "balance_loss_mlp": 1.02284944, "epoch": 0.6626585703120303, "flos": 33262941473280.0, "grad_norm": 3.2301406381479945, "language_loss": 0.67372715, "learning_rate": 1.0794790213439068e-06, "loss": 0.69569981, "num_input_tokens_seen": 118545850, "step": 5511, "time_per_iteration": 2.828007698059082 }, { "auxiliary_loss_clip": 0.01161863, "auxiliary_loss_mlp": 0.01031095, "balance_loss_clip": 1.05000639, "balance_loss_mlp": 1.02288103, "epoch": 0.6627788132026694, "flos": 22085780630400.0, "grad_norm": 2.1407946215275473, "language_loss": 0.78452712, "learning_rate": 1.078787532682843e-06, "loss": 0.80645669, "num_input_tokens_seen": 118563325, "step": 5512, "time_per_iteration": 2.7568066120147705 }, { "auxiliary_loss_clip": 0.0116524, "auxiliary_loss_mlp": 0.01023189, "balance_loss_clip": 1.04688931, "balance_loss_mlp": 1.01611376, "epoch": 0.6628990560933085, "flos": 36173608260480.0, "grad_norm": 2.340583951743241, "language_loss": 0.75735527, "learning_rate": 1.0780961837689773e-06, "loss": 0.77923954, "num_input_tokens_seen": 118582835, "step": 5513, "time_per_iteration": 2.8510489463806152 }, { "auxiliary_loss_clip": 0.011584, "auxiliary_loss_mlp": 0.01027103, "balance_loss_clip": 1.04946899, "balance_loss_mlp": 1.01981294, "epoch": 0.6630192989839476, "flos": 18513567106560.0, "grad_norm": 1.5860415601575177, "language_loss": 0.70204234, "learning_rate": 1.0774049747071883e-06, "loss": 0.7238974, "num_input_tokens_seen": 118600715, "step": 5514, "time_per_iteration": 2.7954907417297363 }, { "auxiliary_loss_clip": 0.01158612, "auxiliary_loss_mlp": 0.01025075, "balance_loss_clip": 1.05067682, "balance_loss_mlp": 1.01759171, "epoch": 0.6631395418745867, "flos": 35809510049280.0, "grad_norm": 1.8351548926840555, "language_loss": 0.68370914, "learning_rate": 1.076713905602332e-06, "loss": 0.70554602, "num_input_tokens_seen": 118621290, "step": 5515, "time_per_iteration": 3.867415189743042 }, { "auxiliary_loss_clip": 0.011701, "auxiliary_loss_mlp": 0.01027704, "balance_loss_clip": 1.04924583, "balance_loss_mlp": 1.01977634, "epoch": 0.6632597847652257, "flos": 20047742853120.0, "grad_norm": 1.8947343226351563, "language_loss": 0.81029117, "learning_rate": 1.07602297655924e-06, "loss": 0.83226919, "num_input_tokens_seen": 118639610, "step": 5516, "time_per_iteration": 2.6862823963165283 }, { "auxiliary_loss_clip": 0.01170254, "auxiliary_loss_mlp": 0.01021546, "balance_loss_clip": 1.04876971, "balance_loss_mlp": 1.01380682, "epoch": 0.6633800276558649, "flos": 21214480423680.0, "grad_norm": 1.8214672296659402, "language_loss": 0.81071854, "learning_rate": 1.0753321876827292e-06, "loss": 0.83263659, "num_input_tokens_seen": 118658895, "step": 5517, "time_per_iteration": 2.7702767848968506 }, { "auxiliary_loss_clip": 0.01172391, "auxiliary_loss_mlp": 0.01025687, "balance_loss_clip": 1.04866481, "balance_loss_mlp": 1.01794767, "epoch": 0.663500270546504, "flos": 23987753688960.0, "grad_norm": 2.209616284262993, "language_loss": 0.74313092, "learning_rate": 1.0746415390775893e-06, "loss": 0.76511174, "num_input_tokens_seen": 118677025, "step": 5518, "time_per_iteration": 2.728982925415039 }, { "auxiliary_loss_clip": 0.01171772, "auxiliary_loss_mlp": 0.01030936, "balance_loss_clip": 1.0510242, "balance_loss_mlp": 1.0229733, "epoch": 0.663620513437143, "flos": 17932389050880.0, "grad_norm": 2.114390096542142, "language_loss": 0.76506042, "learning_rate": 1.0739510308485939e-06, "loss": 0.78708744, "num_input_tokens_seen": 118694240, "step": 5519, "time_per_iteration": 2.697153329849243 }, { "auxiliary_loss_clip": 0.01071943, "auxiliary_loss_mlp": 0.01002268, "balance_loss_clip": 1.01528144, "balance_loss_mlp": 1.00112939, "epoch": 0.6637407563277821, "flos": 57840241086720.0, "grad_norm": 0.8198201906067997, "language_loss": 0.62488842, "learning_rate": 1.07326066310049e-06, "loss": 0.64563054, "num_input_tokens_seen": 118758365, "step": 5520, "time_per_iteration": 3.3632142543792725 }, { "auxiliary_loss_clip": 0.01162428, "auxiliary_loss_mlp": 0.01030284, "balance_loss_clip": 1.0510819, "balance_loss_mlp": 1.02224898, "epoch": 0.6638609992184212, "flos": 27306007079040.0, "grad_norm": 2.3149867304506158, "language_loss": 0.79429537, "learning_rate": 1.0725704359380059e-06, "loss": 0.81622243, "num_input_tokens_seen": 118778220, "step": 5521, "time_per_iteration": 2.859208106994629 }, { "auxiliary_loss_clip": 0.01169618, "auxiliary_loss_mlp": 0.01029371, "balance_loss_clip": 1.04682875, "balance_loss_mlp": 1.02199817, "epoch": 0.6639812421090603, "flos": 18624854419200.0, "grad_norm": 1.989872961056216, "language_loss": 0.71914774, "learning_rate": 1.0718803494658497e-06, "loss": 0.74113762, "num_input_tokens_seen": 118797110, "step": 5522, "time_per_iteration": 2.7167460918426514 }, { "auxiliary_loss_clip": 0.01144555, "auxiliary_loss_mlp": 0.01027733, "balance_loss_clip": 1.04989934, "balance_loss_mlp": 1.02000284, "epoch": 0.6641014849996993, "flos": 15924479806080.0, "grad_norm": 2.238398296721623, "language_loss": 0.83812219, "learning_rate": 1.071190403788707e-06, "loss": 0.8598451, "num_input_tokens_seen": 118812415, "step": 5523, "time_per_iteration": 2.8147132396698 }, { "auxiliary_loss_clip": 0.01166936, "auxiliary_loss_mlp": 0.01026453, "balance_loss_clip": 1.04960144, "balance_loss_mlp": 1.01863933, "epoch": 0.6642217278903385, "flos": 26505486622080.0, "grad_norm": 1.779318877834973, "language_loss": 0.75137258, "learning_rate": 1.0705005990112415e-06, "loss": 0.77330649, "num_input_tokens_seen": 118832195, "step": 5524, "time_per_iteration": 2.8203036785125732 }, { "auxiliary_loss_clip": 0.01150963, "auxiliary_loss_mlp": 0.01030548, "balance_loss_clip": 1.05070877, "balance_loss_mlp": 1.02295494, "epoch": 0.6643419707809776, "flos": 15377308951680.0, "grad_norm": 2.204611064777004, "language_loss": 0.752617, "learning_rate": 1.0698109352380957e-06, "loss": 0.77443218, "num_input_tokens_seen": 118849795, "step": 5525, "time_per_iteration": 2.7654623985290527 }, { "auxiliary_loss_clip": 0.01169126, "auxiliary_loss_mlp": 0.01027622, "balance_loss_clip": 1.04771829, "balance_loss_mlp": 1.02000451, "epoch": 0.6644622136716166, "flos": 25117610970240.0, "grad_norm": 2.1637820726516845, "language_loss": 0.77952015, "learning_rate": 1.0691214125738909e-06, "loss": 0.80148757, "num_input_tokens_seen": 118870000, "step": 5526, "time_per_iteration": 2.7524654865264893 }, { "auxiliary_loss_clip": 0.0107123, "auxiliary_loss_mlp": 0.0100222, "balance_loss_clip": 1.01543975, "balance_loss_mlp": 1.0012064, "epoch": 0.6645824565622558, "flos": 66201717680640.0, "grad_norm": 0.7925364444752894, "language_loss": 0.5752939, "learning_rate": 1.0684320311232287e-06, "loss": 0.59602839, "num_input_tokens_seen": 118932905, "step": 5527, "time_per_iteration": 3.3182270526885986 }, { "auxiliary_loss_clip": 0.01163105, "auxiliary_loss_mlp": 0.01025298, "balance_loss_clip": 1.04969728, "balance_loss_mlp": 1.0173409, "epoch": 0.6647026994528948, "flos": 25082131311360.0, "grad_norm": 1.8184535608342678, "language_loss": 0.81595331, "learning_rate": 1.0677427909906865e-06, "loss": 0.83783734, "num_input_tokens_seen": 118953355, "step": 5528, "time_per_iteration": 2.7962119579315186 }, { "auxiliary_loss_clip": 0.01173277, "auxiliary_loss_mlp": 0.01028494, "balance_loss_clip": 1.04903221, "balance_loss_mlp": 1.02071261, "epoch": 0.6648229423435339, "flos": 18222187979520.0, "grad_norm": 2.5863426707121664, "language_loss": 0.72594285, "learning_rate": 1.0670536922808216e-06, "loss": 0.74796057, "num_input_tokens_seen": 118973480, "step": 5529, "time_per_iteration": 2.7610175609588623 }, { "auxiliary_loss_clip": 0.01165061, "auxiliary_loss_mlp": 0.01024634, "balance_loss_clip": 1.04843426, "balance_loss_mlp": 1.01711178, "epoch": 0.6649431852341731, "flos": 18296882311680.0, "grad_norm": 3.2295286706816895, "language_loss": 0.72085238, "learning_rate": 1.06636473509817e-06, "loss": 0.74274927, "num_input_tokens_seen": 118989860, "step": 5530, "time_per_iteration": 2.671372652053833 }, { "auxiliary_loss_clip": 0.01161993, "auxiliary_loss_mlp": 0.01053793, "balance_loss_clip": 1.04896522, "balance_loss_mlp": 1.01715636, "epoch": 0.6650634281248121, "flos": 17019575700480.0, "grad_norm": 2.1583317708431125, "language_loss": 0.80605495, "learning_rate": 1.0656759195472447e-06, "loss": 0.82821274, "num_input_tokens_seen": 119007150, "step": 5531, "time_per_iteration": 2.7080228328704834 }, { "auxiliary_loss_clip": 0.01069225, "auxiliary_loss_mlp": 0.01002415, "balance_loss_clip": 1.01541483, "balance_loss_mlp": 1.00143778, "epoch": 0.6651836710154512, "flos": 69294810666240.0, "grad_norm": 0.7706325482254671, "language_loss": 0.59752715, "learning_rate": 1.0649872457325414e-06, "loss": 0.61824358, "num_input_tokens_seen": 119068435, "step": 5532, "time_per_iteration": 4.234927177429199 }, { "auxiliary_loss_clip": 0.01070691, "auxiliary_loss_mlp": 0.01000338, "balance_loss_clip": 1.01407862, "balance_loss_mlp": 0.99930048, "epoch": 0.6653039139060903, "flos": 66883444882560.0, "grad_norm": 0.8564073195071837, "language_loss": 0.55060124, "learning_rate": 1.0642987137585278e-06, "loss": 0.57131153, "num_input_tokens_seen": 119127960, "step": 5533, "time_per_iteration": 5.218303918838501 }, { "auxiliary_loss_clip": 0.01163441, "auxiliary_loss_mlp": 0.01024986, "balance_loss_clip": 1.04952455, "balance_loss_mlp": 1.01714242, "epoch": 0.6654241567967294, "flos": 21470056669440.0, "grad_norm": 1.7884644374404133, "language_loss": 0.82709068, "learning_rate": 1.0636103237296561e-06, "loss": 0.84897494, "num_input_tokens_seen": 119146885, "step": 5534, "time_per_iteration": 2.6951351165771484 }, { "auxiliary_loss_clip": 0.01165669, "auxiliary_loss_mlp": 0.01026897, "balance_loss_clip": 1.05039144, "balance_loss_mlp": 1.02004242, "epoch": 0.6655443996873684, "flos": 25119514391040.0, "grad_norm": 1.8513489780703987, "language_loss": 0.84156668, "learning_rate": 1.062922075750353e-06, "loss": 0.86349237, "num_input_tokens_seen": 119166900, "step": 5535, "time_per_iteration": 2.712646245956421 }, { "auxiliary_loss_clip": 0.01159204, "auxiliary_loss_mlp": 0.01025933, "balance_loss_clip": 1.0469451, "balance_loss_mlp": 1.01912916, "epoch": 0.6656646425780076, "flos": 17457326749440.0, "grad_norm": 2.610650888396371, "language_loss": 0.71981049, "learning_rate": 1.0622339699250267e-06, "loss": 0.74166191, "num_input_tokens_seen": 119184820, "step": 5536, "time_per_iteration": 2.703193426132202 }, { "auxiliary_loss_clip": 0.01161236, "auxiliary_loss_mlp": 0.01022878, "balance_loss_clip": 1.04706502, "balance_loss_mlp": 1.01567137, "epoch": 0.6657848854686467, "flos": 23434190213760.0, "grad_norm": 1.7184377238215929, "language_loss": 0.79138529, "learning_rate": 1.0615460063580624e-06, "loss": 0.81322646, "num_input_tokens_seen": 119203295, "step": 5537, "time_per_iteration": 2.7716000080108643 }, { "auxiliary_loss_clip": 0.01165654, "auxiliary_loss_mlp": 0.01026904, "balance_loss_clip": 1.0473237, "balance_loss_mlp": 1.01926267, "epoch": 0.6659051283592857, "flos": 11509909459200.0, "grad_norm": 1.8356714718186988, "language_loss": 0.73197675, "learning_rate": 1.060858185153821e-06, "loss": 0.75390238, "num_input_tokens_seen": 119221395, "step": 5538, "time_per_iteration": 2.7156341075897217 }, { "auxiliary_loss_clip": 0.0117025, "auxiliary_loss_mlp": 0.01029456, "balance_loss_clip": 1.04918265, "balance_loss_mlp": 1.02130234, "epoch": 0.6660253712499249, "flos": 20594554571520.0, "grad_norm": 2.8730938716468466, "language_loss": 0.7584787, "learning_rate": 1.0601705064166474e-06, "loss": 0.78047574, "num_input_tokens_seen": 119239790, "step": 5539, "time_per_iteration": 2.7312965393066406 }, { "auxiliary_loss_clip": 0.01161702, "auxiliary_loss_mlp": 0.0102308, "balance_loss_clip": 1.04928267, "balance_loss_mlp": 1.01589739, "epoch": 0.666145614140564, "flos": 21251504367360.0, "grad_norm": 4.010475475769456, "language_loss": 0.73472726, "learning_rate": 1.0594829702508596e-06, "loss": 0.75657511, "num_input_tokens_seen": 119257505, "step": 5540, "time_per_iteration": 3.7627997398376465 }, { "auxiliary_loss_clip": 0.01164692, "auxiliary_loss_mlp": 0.01022919, "balance_loss_clip": 1.04864895, "balance_loss_mlp": 1.01568353, "epoch": 0.666265857031203, "flos": 33726188200320.0, "grad_norm": 1.611177785110815, "language_loss": 0.55032241, "learning_rate": 1.0587955767607592e-06, "loss": 0.57219857, "num_input_tokens_seen": 119279365, "step": 5541, "time_per_iteration": 3.026650905609131 }, { "auxiliary_loss_clip": 0.01171301, "auxiliary_loss_mlp": 0.01027606, "balance_loss_clip": 1.0494591, "balance_loss_mlp": 1.01958907, "epoch": 0.6663860999218422, "flos": 17456644391040.0, "grad_norm": 2.8513172923059344, "language_loss": 0.77089667, "learning_rate": 1.0581083260506206e-06, "loss": 0.79288578, "num_input_tokens_seen": 119296150, "step": 5542, "time_per_iteration": 2.7711148262023926 }, { "auxiliary_loss_clip": 0.01163655, "auxiliary_loss_mlp": 0.01030472, "balance_loss_clip": 1.04766321, "balance_loss_mlp": 1.02320004, "epoch": 0.6665063428124812, "flos": 17676740977920.0, "grad_norm": 2.056598510764609, "language_loss": 0.76382744, "learning_rate": 1.0574212182246993e-06, "loss": 0.78576875, "num_input_tokens_seen": 119314845, "step": 5543, "time_per_iteration": 2.8171074390411377 }, { "auxiliary_loss_clip": 0.01170121, "auxiliary_loss_mlp": 0.01029379, "balance_loss_clip": 1.04860401, "balance_loss_mlp": 1.02117157, "epoch": 0.6666265857031203, "flos": 27673265687040.0, "grad_norm": 2.546540012183284, "language_loss": 0.75755948, "learning_rate": 1.0567342533872303e-06, "loss": 0.77955449, "num_input_tokens_seen": 119334875, "step": 5544, "time_per_iteration": 2.8354127407073975 }, { "auxiliary_loss_clip": 0.01163496, "auxiliary_loss_mlp": 0.0102535, "balance_loss_clip": 1.04764557, "balance_loss_mlp": 1.01822102, "epoch": 0.6667468285937594, "flos": 25046831220480.0, "grad_norm": 1.7219212019401837, "language_loss": 0.80980068, "learning_rate": 1.0560474316424255e-06, "loss": 0.83168912, "num_input_tokens_seen": 119354635, "step": 5545, "time_per_iteration": 2.9659836292266846 }, { "auxiliary_loss_clip": 0.01168638, "auxiliary_loss_mlp": 0.01027279, "balance_loss_clip": 1.05108202, "balance_loss_mlp": 1.0189817, "epoch": 0.6668670714843985, "flos": 22780472641920.0, "grad_norm": 2.658204016662152, "language_loss": 0.73551863, "learning_rate": 1.0553607530944746e-06, "loss": 0.75747776, "num_input_tokens_seen": 119372690, "step": 5546, "time_per_iteration": 2.7101151943206787 }, { "auxiliary_loss_clip": 0.01165522, "auxiliary_loss_mlp": 0.01024957, "balance_loss_clip": 1.04913878, "balance_loss_mlp": 1.01717222, "epoch": 0.6669873143750376, "flos": 22163886754560.0, "grad_norm": 4.370975345661216, "language_loss": 0.89890498, "learning_rate": 1.0546742178475463e-06, "loss": 0.92080975, "num_input_tokens_seen": 119391685, "step": 5547, "time_per_iteration": 2.7782223224639893 }, { "auxiliary_loss_clip": 0.01161809, "auxiliary_loss_mlp": 0.01020528, "balance_loss_clip": 1.04848313, "balance_loss_mlp": 1.01329219, "epoch": 0.6671075572656767, "flos": 20514832335360.0, "grad_norm": 1.7420915300306612, "language_loss": 0.8661263, "learning_rate": 1.0539878260057868e-06, "loss": 0.88794965, "num_input_tokens_seen": 119410725, "step": 5548, "time_per_iteration": 2.7666733264923096 }, { "auxiliary_loss_clip": 0.01169944, "auxiliary_loss_mlp": 0.01026098, "balance_loss_clip": 1.0503118, "balance_loss_mlp": 1.01843894, "epoch": 0.6672278001563158, "flos": 17931203902080.0, "grad_norm": 3.6084760972258167, "language_loss": 0.68320835, "learning_rate": 1.0533015776733226e-06, "loss": 0.70516884, "num_input_tokens_seen": 119426875, "step": 5549, "time_per_iteration": 2.672938346862793 }, { "auxiliary_loss_clip": 0.01163401, "auxiliary_loss_mlp": 0.01028777, "balance_loss_clip": 1.04971516, "balance_loss_mlp": 1.0208199, "epoch": 0.6673480430469548, "flos": 22342146975360.0, "grad_norm": 3.261184390002417, "language_loss": 0.78359509, "learning_rate": 1.0526154729542566e-06, "loss": 0.80551684, "num_input_tokens_seen": 119446935, "step": 5550, "time_per_iteration": 2.8305397033691406 }, { "auxiliary_loss_clip": 0.011621, "auxiliary_loss_mlp": 0.01029416, "balance_loss_clip": 1.04973674, "balance_loss_mlp": 1.02179837, "epoch": 0.6674682859375939, "flos": 20703830722560.0, "grad_norm": 8.331644019772748, "language_loss": 0.79586422, "learning_rate": 1.0519295119526699e-06, "loss": 0.81777936, "num_input_tokens_seen": 119463240, "step": 5551, "time_per_iteration": 2.7859349250793457 }, { "auxiliary_loss_clip": 0.01170254, "auxiliary_loss_mlp": 0.01028903, "balance_loss_clip": 1.05048203, "balance_loss_mlp": 1.02161956, "epoch": 0.667588528828233, "flos": 26206673379840.0, "grad_norm": 1.62707101033477, "language_loss": 0.82851797, "learning_rate": 1.0512436947726227e-06, "loss": 0.85050952, "num_input_tokens_seen": 119484655, "step": 5552, "time_per_iteration": 2.802654266357422 }, { "auxiliary_loss_clip": 0.01164013, "auxiliary_loss_mlp": 0.01034011, "balance_loss_clip": 1.04990029, "balance_loss_mlp": 1.02608943, "epoch": 0.6677087717188721, "flos": 23071025756160.0, "grad_norm": 2.346872258307583, "language_loss": 0.65020919, "learning_rate": 1.0505580215181517e-06, "loss": 0.67218947, "num_input_tokens_seen": 119502895, "step": 5553, "time_per_iteration": 2.858365774154663 }, { "auxiliary_loss_clip": 0.01067173, "auxiliary_loss_mlp": 0.01004303, "balance_loss_clip": 1.02089345, "balance_loss_mlp": 1.0032599, "epoch": 0.6678290146095112, "flos": 70941315219840.0, "grad_norm": 0.7817119956349626, "language_loss": 0.56566, "learning_rate": 1.0498724922932753e-06, "loss": 0.58637476, "num_input_tokens_seen": 119561010, "step": 5554, "time_per_iteration": 3.4549925327301025 }, { "auxiliary_loss_clip": 0.01178122, "auxiliary_loss_mlp": 0.01026963, "balance_loss_clip": 1.05231261, "balance_loss_mlp": 1.01874614, "epoch": 0.6679492575001503, "flos": 18661088263680.0, "grad_norm": 5.65637116844862, "language_loss": 0.86522204, "learning_rate": 1.0491871072019851e-06, "loss": 0.88727283, "num_input_tokens_seen": 119578900, "step": 5555, "time_per_iteration": 2.6849446296691895 }, { "auxiliary_loss_clip": 0.01166413, "auxiliary_loss_mlp": 0.01023222, "balance_loss_clip": 1.0479629, "balance_loss_mlp": 1.01590526, "epoch": 0.6680695003907894, "flos": 29711985822720.0, "grad_norm": 1.997276316454164, "language_loss": 0.63922822, "learning_rate": 1.0485018663482555e-06, "loss": 0.66112459, "num_input_tokens_seen": 119598920, "step": 5556, "time_per_iteration": 2.852299451828003 }, { "auxiliary_loss_clip": 0.01165895, "auxiliary_loss_mlp": 0.0102858, "balance_loss_clip": 1.05044925, "balance_loss_mlp": 1.02033663, "epoch": 0.6681897432814284, "flos": 28218964083840.0, "grad_norm": 2.3101698844498344, "language_loss": 0.70641625, "learning_rate": 1.0478167698360354e-06, "loss": 0.72836101, "num_input_tokens_seen": 119618220, "step": 5557, "time_per_iteration": 3.8569653034210205 }, { "auxiliary_loss_clip": 0.01162176, "auxiliary_loss_mlp": 0.01024554, "balance_loss_clip": 1.04753613, "balance_loss_mlp": 1.0171715, "epoch": 0.6683099861720676, "flos": 25046543911680.0, "grad_norm": 5.831396016859059, "language_loss": 0.7004472, "learning_rate": 1.0471318177692556e-06, "loss": 0.72231448, "num_input_tokens_seen": 119638520, "step": 5558, "time_per_iteration": 2.8002734184265137 }, { "auxiliary_loss_clip": 0.01161871, "auxiliary_loss_mlp": 0.01028068, "balance_loss_clip": 1.0474689, "balance_loss_mlp": 1.02070427, "epoch": 0.6684302290627067, "flos": 22996977868800.0, "grad_norm": 2.528558811825926, "language_loss": 0.75892222, "learning_rate": 1.046447010251821e-06, "loss": 0.78082168, "num_input_tokens_seen": 119655850, "step": 5559, "time_per_iteration": 3.8916983604431152 }, { "auxiliary_loss_clip": 0.01162585, "auxiliary_loss_mlp": 0.01028845, "balance_loss_clip": 1.05094838, "balance_loss_mlp": 1.0215013, "epoch": 0.6685504719533457, "flos": 26573824247040.0, "grad_norm": 2.7479080627083357, "language_loss": 0.75782812, "learning_rate": 1.0457623473876157e-06, "loss": 0.77974242, "num_input_tokens_seen": 119675355, "step": 5560, "time_per_iteration": 2.7222349643707275 }, { "auxiliary_loss_clip": 0.01166783, "auxiliary_loss_mlp": 0.01025559, "balance_loss_clip": 1.04635966, "balance_loss_mlp": 1.0180161, "epoch": 0.6686707148439849, "flos": 28986087870720.0, "grad_norm": 2.052076335705675, "language_loss": 0.71317101, "learning_rate": 1.0450778292805046e-06, "loss": 0.73509443, "num_input_tokens_seen": 119695340, "step": 5561, "time_per_iteration": 2.7295000553131104 }, { "auxiliary_loss_clip": 0.01172141, "auxiliary_loss_mlp": 0.01027178, "balance_loss_clip": 1.04862392, "balance_loss_mlp": 1.01950645, "epoch": 0.6687909577346239, "flos": 23623152687360.0, "grad_norm": 2.3645151463223324, "language_loss": 0.78595173, "learning_rate": 1.0443934560343267e-06, "loss": 0.80794489, "num_input_tokens_seen": 119716750, "step": 5562, "time_per_iteration": 2.792750358581543 }, { "auxiliary_loss_clip": 0.0115187, "auxiliary_loss_mlp": 0.01022099, "balance_loss_clip": 1.04836249, "balance_loss_mlp": 1.01423156, "epoch": 0.668911200625263, "flos": 23148593176320.0, "grad_norm": 5.637692426760522, "language_loss": 0.78153569, "learning_rate": 1.0437092277529034e-06, "loss": 0.80327541, "num_input_tokens_seen": 119736005, "step": 5563, "time_per_iteration": 2.797076463699341 }, { "auxiliary_loss_clip": 0.01161553, "auxiliary_loss_mlp": 0.01027218, "balance_loss_clip": 1.04829991, "balance_loss_mlp": 1.0194521, "epoch": 0.6690314435159022, "flos": 18551919853440.0, "grad_norm": 2.010077836756322, "language_loss": 0.73567492, "learning_rate": 1.0430251445400292e-06, "loss": 0.75756264, "num_input_tokens_seen": 119754050, "step": 5564, "time_per_iteration": 2.794633150100708 }, { "auxiliary_loss_clip": 0.0115026, "auxiliary_loss_mlp": 0.01025459, "balance_loss_clip": 1.05018377, "balance_loss_mlp": 1.01776373, "epoch": 0.6691516864065412, "flos": 31759540704000.0, "grad_norm": 2.5044947903972736, "language_loss": 0.62492836, "learning_rate": 1.0423412064994787e-06, "loss": 0.6466856, "num_input_tokens_seen": 119774820, "step": 5565, "time_per_iteration": 2.962430953979492 }, { "auxiliary_loss_clip": 0.01162634, "auxiliary_loss_mlp": 0.0102439, "balance_loss_clip": 1.04704213, "balance_loss_mlp": 1.01731825, "epoch": 0.6692719292971803, "flos": 34933864296960.0, "grad_norm": 2.1072542351513435, "language_loss": 0.74084169, "learning_rate": 1.0416574137350064e-06, "loss": 0.76271194, "num_input_tokens_seen": 119795525, "step": 5566, "time_per_iteration": 3.7706594467163086 }, { "auxiliary_loss_clip": 0.01163603, "auxiliary_loss_mlp": 0.01024011, "balance_loss_clip": 1.05012035, "balance_loss_mlp": 1.016662, "epoch": 0.6693921721878194, "flos": 20449188230400.0, "grad_norm": 2.2208102671839454, "language_loss": 0.80530775, "learning_rate": 1.0409737663503428e-06, "loss": 0.8271839, "num_input_tokens_seen": 119813905, "step": 5567, "time_per_iteration": 2.7752456665039062 }, { "auxiliary_loss_clip": 0.01165105, "auxiliary_loss_mlp": 0.01023464, "balance_loss_clip": 1.04636836, "balance_loss_mlp": 1.01531577, "epoch": 0.6695124150784585, "flos": 16614538963200.0, "grad_norm": 4.101362579080084, "language_loss": 0.8331306, "learning_rate": 1.040290264449196e-06, "loss": 0.85501629, "num_input_tokens_seen": 119832010, "step": 5568, "time_per_iteration": 2.759854793548584 }, { "auxiliary_loss_clip": 0.01161722, "auxiliary_loss_mlp": 0.01024659, "balance_loss_clip": 1.04745889, "balance_loss_mlp": 1.01743233, "epoch": 0.6696326579690975, "flos": 26652145852800.0, "grad_norm": 1.9393734199684025, "language_loss": 0.63819945, "learning_rate": 1.0396069081352532e-06, "loss": 0.66006327, "num_input_tokens_seen": 119851165, "step": 5569, "time_per_iteration": 2.7844486236572266 }, { "auxiliary_loss_clip": 0.01069582, "auxiliary_loss_mlp": 0.01000092, "balance_loss_clip": 1.0132966, "balance_loss_mlp": 0.99907875, "epoch": 0.6697529008597367, "flos": 66964603662720.0, "grad_norm": 0.8133957900493058, "language_loss": 0.56001031, "learning_rate": 1.0389236975121782e-06, "loss": 0.58070701, "num_input_tokens_seen": 119906015, "step": 5570, "time_per_iteration": 3.156907558441162 }, { "auxiliary_loss_clip": 0.01172942, "auxiliary_loss_mlp": 0.0103453, "balance_loss_clip": 1.04792905, "balance_loss_mlp": 1.02692103, "epoch": 0.6698731437503758, "flos": 20886939279360.0, "grad_norm": 2.0145380383229505, "language_loss": 0.71100122, "learning_rate": 1.0382406326836147e-06, "loss": 0.73307598, "num_input_tokens_seen": 119925160, "step": 5571, "time_per_iteration": 2.73681378364563 }, { "auxiliary_loss_clip": 0.0117303, "auxiliary_loss_mlp": 0.01029668, "balance_loss_clip": 1.04971313, "balance_loss_mlp": 1.02191901, "epoch": 0.6699933866410148, "flos": 20409470766720.0, "grad_norm": 1.9551154078952477, "language_loss": 0.76364493, "learning_rate": 1.0375577137531828e-06, "loss": 0.78567195, "num_input_tokens_seen": 119943720, "step": 5572, "time_per_iteration": 2.7369349002838135 }, { "auxiliary_loss_clip": 0.01164739, "auxiliary_loss_mlp": 0.01027014, "balance_loss_clip": 1.04541206, "balance_loss_mlp": 1.01946783, "epoch": 0.670113629531654, "flos": 29023075900800.0, "grad_norm": 1.6201440823948834, "language_loss": 0.71965325, "learning_rate": 1.0368749408244802e-06, "loss": 0.74157083, "num_input_tokens_seen": 119966640, "step": 5573, "time_per_iteration": 2.878779649734497 }, { "auxiliary_loss_clip": 0.01162163, "auxiliary_loss_mlp": 0.01022965, "balance_loss_clip": 1.04863667, "balance_loss_mlp": 1.01553237, "epoch": 0.670233872422293, "flos": 19791699730560.0, "grad_norm": 1.837883052659921, "language_loss": 0.78630507, "learning_rate": 1.0361923140010836e-06, "loss": 0.80815637, "num_input_tokens_seen": 119985125, "step": 5574, "time_per_iteration": 2.766087293624878 }, { "auxiliary_loss_clip": 0.0117338, "auxiliary_loss_mlp": 0.01023401, "balance_loss_clip": 1.0490123, "balance_loss_mlp": 1.01590908, "epoch": 0.6703541153129321, "flos": 24243689070720.0, "grad_norm": 2.925895194848099, "language_loss": 0.63819027, "learning_rate": 1.0355098333865455e-06, "loss": 0.6601581, "num_input_tokens_seen": 120004355, "step": 5575, "time_per_iteration": 2.7458653450012207 }, { "auxiliary_loss_clip": 0.01164074, "auxiliary_loss_mlp": 0.0102558, "balance_loss_clip": 1.05043483, "balance_loss_mlp": 1.01827812, "epoch": 0.6704743582035713, "flos": 26688523351680.0, "grad_norm": 2.131781648114318, "language_loss": 0.69384491, "learning_rate": 1.0348274990844006e-06, "loss": 0.7157414, "num_input_tokens_seen": 120027115, "step": 5576, "time_per_iteration": 2.7838404178619385 }, { "auxiliary_loss_clip": 0.01166965, "auxiliary_loss_mlp": 0.01026815, "balance_loss_clip": 1.04910016, "balance_loss_mlp": 1.01980877, "epoch": 0.6705946010942103, "flos": 23514379326720.0, "grad_norm": 1.9608202332075881, "language_loss": 0.72809291, "learning_rate": 1.034145311198155e-06, "loss": 0.7500307, "num_input_tokens_seen": 120047130, "step": 5577, "time_per_iteration": 2.767524003982544 }, { "auxiliary_loss_clip": 0.01166202, "auxiliary_loss_mlp": 0.01020861, "balance_loss_clip": 1.04667902, "balance_loss_mlp": 1.01364601, "epoch": 0.6707148439848494, "flos": 24061011477120.0, "grad_norm": 2.3784868635463465, "language_loss": 0.63729817, "learning_rate": 1.0334632698312989e-06, "loss": 0.65916878, "num_input_tokens_seen": 120067925, "step": 5578, "time_per_iteration": 2.764509916305542 }, { "auxiliary_loss_clip": 0.01159498, "auxiliary_loss_mlp": 0.01018734, "balance_loss_clip": 1.04832172, "balance_loss_mlp": 1.01108122, "epoch": 0.6708350868754885, "flos": 22528667324160.0, "grad_norm": 3.605743585546186, "language_loss": 0.75195676, "learning_rate": 1.032781375087295e-06, "loss": 0.7737391, "num_input_tokens_seen": 120087825, "step": 5579, "time_per_iteration": 2.7851693630218506 }, { "auxiliary_loss_clip": 0.01167275, "auxiliary_loss_mlp": 0.01023084, "balance_loss_clip": 1.05044389, "balance_loss_mlp": 1.01619983, "epoch": 0.6709553297661276, "flos": 25227749047680.0, "grad_norm": 1.8023307763086611, "language_loss": 0.67184085, "learning_rate": 1.0320996270695891e-06, "loss": 0.69374442, "num_input_tokens_seen": 120108895, "step": 5580, "time_per_iteration": 2.817233085632324 }, { "auxiliary_loss_clip": 0.01164926, "auxiliary_loss_mlp": 0.01026875, "balance_loss_clip": 1.0505085, "balance_loss_mlp": 1.0192101, "epoch": 0.6710755726567667, "flos": 20448757267200.0, "grad_norm": 2.1690153668736922, "language_loss": 0.73432839, "learning_rate": 1.0314180258815998e-06, "loss": 0.75624645, "num_input_tokens_seen": 120127535, "step": 5581, "time_per_iteration": 2.762052297592163 }, { "auxiliary_loss_clip": 0.01154536, "auxiliary_loss_mlp": 0.01023003, "balance_loss_clip": 1.04740095, "balance_loss_mlp": 1.01591027, "epoch": 0.6711958155474057, "flos": 25995411538560.0, "grad_norm": 3.2499413300794973, "language_loss": 0.7432884, "learning_rate": 1.0307365716267247e-06, "loss": 0.76506382, "num_input_tokens_seen": 120147980, "step": 5582, "time_per_iteration": 2.7818305492401123 }, { "auxiliary_loss_clip": 0.01165829, "auxiliary_loss_mlp": 0.01029607, "balance_loss_clip": 1.04774642, "balance_loss_mlp": 1.02181697, "epoch": 0.6713160584380449, "flos": 19937712516480.0, "grad_norm": 3.1413872417216426, "language_loss": 0.78459603, "learning_rate": 1.0300552644083423e-06, "loss": 0.80655038, "num_input_tokens_seen": 120166905, "step": 5583, "time_per_iteration": 3.7026567459106445 }, { "auxiliary_loss_clip": 0.01165152, "auxiliary_loss_mlp": 0.01024677, "balance_loss_clip": 1.04987967, "balance_loss_mlp": 1.01708937, "epoch": 0.6714363013286839, "flos": 18223373128320.0, "grad_norm": 13.03430725452581, "language_loss": 0.72710466, "learning_rate": 1.0293741043298036e-06, "loss": 0.74900293, "num_input_tokens_seen": 120185255, "step": 5584, "time_per_iteration": 2.735912799835205 }, { "auxiliary_loss_clip": 0.01164958, "auxiliary_loss_mlp": 0.01029012, "balance_loss_clip": 1.0532161, "balance_loss_mlp": 1.02110207, "epoch": 0.671556544219323, "flos": 25812374808960.0, "grad_norm": 1.948533209942969, "language_loss": 0.71529251, "learning_rate": 1.0286930914944436e-06, "loss": 0.73723221, "num_input_tokens_seen": 120205070, "step": 5585, "time_per_iteration": 2.7739920616149902 }, { "auxiliary_loss_clip": 0.01168639, "auxiliary_loss_mlp": 0.0102259, "balance_loss_clip": 1.04553831, "balance_loss_mlp": 1.01513958, "epoch": 0.6716767871099621, "flos": 15850431918720.0, "grad_norm": 2.586620470361086, "language_loss": 0.7687071, "learning_rate": 1.0280122260055684e-06, "loss": 0.79061937, "num_input_tokens_seen": 120220780, "step": 5586, "time_per_iteration": 4.585522890090942 }, { "auxiliary_loss_clip": 0.0117234, "auxiliary_loss_mlp": 0.01027609, "balance_loss_clip": 1.04936337, "balance_loss_mlp": 1.01956236, "epoch": 0.6717970300006012, "flos": 19756112330880.0, "grad_norm": 1.9230258765685342, "language_loss": 0.82429522, "learning_rate": 1.0273315079664652e-06, "loss": 0.8462947, "num_input_tokens_seen": 120238735, "step": 5587, "time_per_iteration": 2.752551794052124 }, { "auxiliary_loss_clip": 0.01169413, "auxiliary_loss_mlp": 0.01023703, "balance_loss_clip": 1.04783773, "balance_loss_mlp": 1.0165801, "epoch": 0.6719172728912403, "flos": 25485049146240.0, "grad_norm": 2.092672518365433, "language_loss": 0.74641562, "learning_rate": 1.0266509374803992e-06, "loss": 0.76834679, "num_input_tokens_seen": 120259895, "step": 5588, "time_per_iteration": 2.755584716796875 }, { "auxiliary_loss_clip": 0.01171653, "auxiliary_loss_mlp": 0.01057735, "balance_loss_clip": 1.04862809, "balance_loss_mlp": 1.02204359, "epoch": 0.6720375157818794, "flos": 15880344969600.0, "grad_norm": 2.6449135911668424, "language_loss": 0.84420305, "learning_rate": 1.0259705146506123e-06, "loss": 0.86649692, "num_input_tokens_seen": 120274790, "step": 5589, "time_per_iteration": 2.652346134185791 }, { "auxiliary_loss_clip": 0.01169995, "auxiliary_loss_mlp": 0.01025886, "balance_loss_clip": 1.04903877, "balance_loss_mlp": 1.0179466, "epoch": 0.6721577586725185, "flos": 32010843231360.0, "grad_norm": 3.3483967007870383, "language_loss": 0.77916837, "learning_rate": 1.025290239580324e-06, "loss": 0.8011272, "num_input_tokens_seen": 120295460, "step": 5590, "time_per_iteration": 2.828267812728882 }, { "auxiliary_loss_clip": 0.01159046, "auxiliary_loss_mlp": 0.01028176, "balance_loss_clip": 1.04789066, "balance_loss_mlp": 1.02062392, "epoch": 0.6722780015631575, "flos": 20737873837440.0, "grad_norm": 1.8583695787077368, "language_loss": 0.75469214, "learning_rate": 1.0246101123727313e-06, "loss": 0.77656436, "num_input_tokens_seen": 120314440, "step": 5591, "time_per_iteration": 2.8738253116607666 }, { "auxiliary_loss_clip": 0.01167337, "auxiliary_loss_mlp": 0.01019273, "balance_loss_clip": 1.04821098, "balance_loss_mlp": 1.01242745, "epoch": 0.6723982444537967, "flos": 16909617191040.0, "grad_norm": 2.213698446471939, "language_loss": 0.78453743, "learning_rate": 1.0239301331310085e-06, "loss": 0.80640352, "num_input_tokens_seen": 120332060, "step": 5592, "time_per_iteration": 2.6737160682678223 }, { "auxiliary_loss_clip": 0.01163077, "auxiliary_loss_mlp": 0.01028796, "balance_loss_clip": 1.04738235, "balance_loss_mlp": 1.02174473, "epoch": 0.6725184873444358, "flos": 20667812359680.0, "grad_norm": 1.8795508014693436, "language_loss": 0.88523531, "learning_rate": 1.0232503019583088e-06, "loss": 0.90715396, "num_input_tokens_seen": 120351670, "step": 5593, "time_per_iteration": 3.561847448348999 }, { "auxiliary_loss_clip": 0.01165538, "auxiliary_loss_mlp": 0.01029651, "balance_loss_clip": 1.0497613, "balance_loss_mlp": 1.02193856, "epoch": 0.6726387302350748, "flos": 23727616416000.0, "grad_norm": 1.7410810761418731, "language_loss": 0.696832, "learning_rate": 1.0225706189577619e-06, "loss": 0.71878397, "num_input_tokens_seen": 120370195, "step": 5594, "time_per_iteration": 2.7583160400390625 }, { "auxiliary_loss_clip": 0.01170631, "auxiliary_loss_mlp": 0.01024845, "balance_loss_clip": 1.04959106, "balance_loss_mlp": 1.01774907, "epoch": 0.672758973125714, "flos": 15188274650880.0, "grad_norm": 2.2824854467613145, "language_loss": 0.74770153, "learning_rate": 1.021891084232475e-06, "loss": 0.76965624, "num_input_tokens_seen": 120388130, "step": 5595, "time_per_iteration": 2.8379039764404297 }, { "auxiliary_loss_clip": 0.01169968, "auxiliary_loss_mlp": 0.01023855, "balance_loss_clip": 1.0501318, "balance_loss_mlp": 1.01661301, "epoch": 0.672879216016353, "flos": 18077252601600.0, "grad_norm": 3.2475056530384934, "language_loss": 0.79878837, "learning_rate": 1.0212116978855325e-06, "loss": 0.82072663, "num_input_tokens_seen": 120406145, "step": 5596, "time_per_iteration": 2.6724181175231934 }, { "auxiliary_loss_clip": 0.01159577, "auxiliary_loss_mlp": 0.01025578, "balance_loss_clip": 1.04767776, "balance_loss_mlp": 1.01795495, "epoch": 0.6729994589069921, "flos": 23476349802240.0, "grad_norm": 3.2260566075671333, "language_loss": 0.7911424, "learning_rate": 1.020532460019997e-06, "loss": 0.81299388, "num_input_tokens_seen": 120425395, "step": 5597, "time_per_iteration": 2.8519742488861084 }, { "auxiliary_loss_clip": 0.01159089, "auxiliary_loss_mlp": 0.01024036, "balance_loss_clip": 1.04882061, "balance_loss_mlp": 1.01607871, "epoch": 0.6731197017976313, "flos": 26322018929280.0, "grad_norm": 2.1811350066445696, "language_loss": 0.71171129, "learning_rate": 1.0198533707389096e-06, "loss": 0.73354256, "num_input_tokens_seen": 120446270, "step": 5598, "time_per_iteration": 3.1276915073394775 }, { "auxiliary_loss_clip": 0.01163614, "auxiliary_loss_mlp": 0.01053393, "balance_loss_clip": 1.04806304, "balance_loss_mlp": 1.01804757, "epoch": 0.6732399446882703, "flos": 21616428591360.0, "grad_norm": 1.7385649919367105, "language_loss": 0.73087955, "learning_rate": 1.0191744301452853e-06, "loss": 0.75304967, "num_input_tokens_seen": 120465570, "step": 5599, "time_per_iteration": 2.8557591438293457 }, { "auxiliary_loss_clip": 0.01168467, "auxiliary_loss_mlp": 0.01025, "balance_loss_clip": 1.04591441, "balance_loss_mlp": 1.01772809, "epoch": 0.6733601875789094, "flos": 25880173729920.0, "grad_norm": 1.8050868102735875, "language_loss": 0.71008027, "learning_rate": 1.0184956383421208e-06, "loss": 0.73201501, "num_input_tokens_seen": 120484220, "step": 5600, "time_per_iteration": 2.7158901691436768 }, { "auxiliary_loss_clip": 0.01170015, "auxiliary_loss_mlp": 0.01028644, "balance_loss_clip": 1.04918361, "balance_loss_mlp": 1.02092218, "epoch": 0.6734804304695485, "flos": 22929573997440.0, "grad_norm": 2.9009243595786796, "language_loss": 0.65691757, "learning_rate": 1.017816995432387e-06, "loss": 0.67890418, "num_input_tokens_seen": 120503320, "step": 5601, "time_per_iteration": 2.798138380050659 }, { "auxiliary_loss_clip": 0.01165933, "auxiliary_loss_mlp": 0.01024739, "balance_loss_clip": 1.04890072, "balance_loss_mlp": 1.01750934, "epoch": 0.6736006733601876, "flos": 18697968552960.0, "grad_norm": 2.022170362676607, "language_loss": 0.74153352, "learning_rate": 1.0171385015190353e-06, "loss": 0.76344025, "num_input_tokens_seen": 120523180, "step": 5602, "time_per_iteration": 2.8546979427337646 }, { "auxiliary_loss_clip": 0.01156518, "auxiliary_loss_mlp": 0.01059254, "balance_loss_clip": 1.04693484, "balance_loss_mlp": 1.02469647, "epoch": 0.6737209162508266, "flos": 19427745173760.0, "grad_norm": 1.8664550567662876, "language_loss": 0.73381209, "learning_rate": 1.0164601567049908e-06, "loss": 0.75596982, "num_input_tokens_seen": 120541710, "step": 5603, "time_per_iteration": 2.868147373199463 }, { "auxiliary_loss_clip": 0.01164722, "auxiliary_loss_mlp": 0.01024268, "balance_loss_clip": 1.04864621, "balance_loss_mlp": 1.01648974, "epoch": 0.6738411591414658, "flos": 20158060498560.0, "grad_norm": 1.7838202965265924, "language_loss": 0.8027308, "learning_rate": 1.015781961093158e-06, "loss": 0.82462072, "num_input_tokens_seen": 120561030, "step": 5604, "time_per_iteration": 2.75665020942688 }, { "auxiliary_loss_clip": 0.0116848, "auxiliary_loss_mlp": 0.01023201, "balance_loss_clip": 1.04599977, "balance_loss_mlp": 1.01592326, "epoch": 0.6739614020321049, "flos": 21653847584640.0, "grad_norm": 1.589473870895415, "language_loss": 0.76874852, "learning_rate": 1.0151039147864197e-06, "loss": 0.79066539, "num_input_tokens_seen": 120581005, "step": 5605, "time_per_iteration": 2.736067295074463 }, { "auxiliary_loss_clip": 0.01147474, "auxiliary_loss_mlp": 0.01029949, "balance_loss_clip": 1.04999852, "balance_loss_mlp": 1.02228391, "epoch": 0.6740816449227439, "flos": 19171702051200.0, "grad_norm": 6.769759866621883, "language_loss": 0.65926075, "learning_rate": 1.0144260178876336e-06, "loss": 0.68103504, "num_input_tokens_seen": 120600350, "step": 5606, "time_per_iteration": 2.842946767807007 }, { "auxiliary_loss_clip": 0.0117243, "auxiliary_loss_mlp": 0.01026594, "balance_loss_clip": 1.04952645, "balance_loss_mlp": 1.01879168, "epoch": 0.6742018878133831, "flos": 21097015971840.0, "grad_norm": 2.142745099025977, "language_loss": 0.67358476, "learning_rate": 1.0137482704996388e-06, "loss": 0.695575, "num_input_tokens_seen": 120614700, "step": 5607, "time_per_iteration": 2.756274700164795 }, { "auxiliary_loss_clip": 0.01165371, "auxiliary_loss_mlp": 0.01025536, "balance_loss_clip": 1.04957819, "balance_loss_mlp": 1.01844263, "epoch": 0.6743221307040221, "flos": 23549966726400.0, "grad_norm": 2.8772385892945294, "language_loss": 0.78973198, "learning_rate": 1.0130706727252461e-06, "loss": 0.81164104, "num_input_tokens_seen": 120631755, "step": 5608, "time_per_iteration": 2.894813060760498 }, { "auxiliary_loss_clip": 0.01166133, "auxiliary_loss_mlp": 0.01026968, "balance_loss_clip": 1.0498805, "balance_loss_mlp": 1.01994109, "epoch": 0.6744423735946612, "flos": 16249542912000.0, "grad_norm": 2.781869414950043, "language_loss": 0.68410969, "learning_rate": 1.0123932246672468e-06, "loss": 0.70604074, "num_input_tokens_seen": 120645900, "step": 5609, "time_per_iteration": 3.7720539569854736 }, { "auxiliary_loss_clip": 0.01069563, "auxiliary_loss_mlp": 0.01032639, "balance_loss_clip": 1.01523018, "balance_loss_mlp": 0.99955904, "epoch": 0.6745626164853004, "flos": 57843257829120.0, "grad_norm": 0.7528466239109453, "language_loss": 0.55866647, "learning_rate": 1.0117159264284114e-06, "loss": 0.57968855, "num_input_tokens_seen": 120709070, "step": 5610, "time_per_iteration": 3.2895562648773193 }, { "auxiliary_loss_clip": 0.01166021, "auxiliary_loss_mlp": 0.01028917, "balance_loss_clip": 1.04903316, "balance_loss_mlp": 1.02129912, "epoch": 0.6746828593759394, "flos": 20485027025280.0, "grad_norm": 1.8108790668826578, "language_loss": 0.77111459, "learning_rate": 1.0110387781114837e-06, "loss": 0.79306394, "num_input_tokens_seen": 120727685, "step": 5611, "time_per_iteration": 3.7463371753692627 }, { "auxiliary_loss_clip": 0.01169339, "auxiliary_loss_mlp": 0.01029875, "balance_loss_clip": 1.04860914, "balance_loss_mlp": 1.02203119, "epoch": 0.6748031022665785, "flos": 19208223204480.0, "grad_norm": 2.2330776148126894, "language_loss": 0.77519298, "learning_rate": 1.0103617798191872e-06, "loss": 0.79718512, "num_input_tokens_seen": 120747160, "step": 5612, "time_per_iteration": 3.7465875148773193 }, { "auxiliary_loss_clip": 0.01158076, "auxiliary_loss_mlp": 0.01028275, "balance_loss_clip": 1.04811001, "balance_loss_mlp": 1.02028847, "epoch": 0.6749233451572175, "flos": 15195026407680.0, "grad_norm": 2.2289399121607927, "language_loss": 0.82700777, "learning_rate": 1.0096849316542217e-06, "loss": 0.84887123, "num_input_tokens_seen": 120763710, "step": 5613, "time_per_iteration": 2.7776992321014404 }, { "auxiliary_loss_clip": 0.01142051, "auxiliary_loss_mlp": 0.01020099, "balance_loss_clip": 1.04732752, "balance_loss_mlp": 1.01235342, "epoch": 0.6750435880478567, "flos": 26499489050880.0, "grad_norm": 2.092078441306899, "language_loss": 0.74942052, "learning_rate": 1.0090082337192643e-06, "loss": 0.77104199, "num_input_tokens_seen": 120783355, "step": 5614, "time_per_iteration": 2.8842389583587646 }, { "auxiliary_loss_clip": 0.0115136, "auxiliary_loss_mlp": 0.01026243, "balance_loss_clip": 1.04728341, "balance_loss_mlp": 1.01849759, "epoch": 0.6751638309384957, "flos": 23404313076480.0, "grad_norm": 2.1179948965488125, "language_loss": 0.78534913, "learning_rate": 1.0083316861169705e-06, "loss": 0.80712515, "num_input_tokens_seen": 120802090, "step": 5615, "time_per_iteration": 2.9832699298858643 }, { "auxiliary_loss_clip": 0.01168934, "auxiliary_loss_mlp": 0.01025203, "balance_loss_clip": 1.04767895, "balance_loss_mlp": 1.01722836, "epoch": 0.6752840738291348, "flos": 23441408847360.0, "grad_norm": 2.041373332153247, "language_loss": 0.71113998, "learning_rate": 1.0076552889499713e-06, "loss": 0.73308134, "num_input_tokens_seen": 120822855, "step": 5616, "time_per_iteration": 2.8654532432556152 }, { "auxiliary_loss_clip": 0.011653, "auxiliary_loss_mlp": 0.01024433, "balance_loss_clip": 1.04925656, "balance_loss_mlp": 1.01762271, "epoch": 0.675404316719774, "flos": 30335826257280.0, "grad_norm": 2.1814825959018807, "language_loss": 0.74077535, "learning_rate": 1.006979042320876e-06, "loss": 0.76267272, "num_input_tokens_seen": 120843070, "step": 5617, "time_per_iteration": 2.9349684715270996 }, { "auxiliary_loss_clip": 0.01161921, "auxiliary_loss_mlp": 0.01028147, "balance_loss_clip": 1.04790831, "balance_loss_mlp": 1.02075899, "epoch": 0.675524559610413, "flos": 23622613983360.0, "grad_norm": 2.0451748331910737, "language_loss": 0.6277374, "learning_rate": 1.0063029463322702e-06, "loss": 0.64963812, "num_input_tokens_seen": 120863345, "step": 5618, "time_per_iteration": 2.950655937194824 }, { "auxiliary_loss_clip": 0.0115731, "auxiliary_loss_mlp": 0.01058127, "balance_loss_clip": 1.0486424, "balance_loss_mlp": 1.02247787, "epoch": 0.6756448025010521, "flos": 21248631279360.0, "grad_norm": 2.3048881550071387, "language_loss": 0.75503558, "learning_rate": 1.0056270010867164e-06, "loss": 0.77718991, "num_input_tokens_seen": 120880915, "step": 5619, "time_per_iteration": 3.8394548892974854 }, { "auxiliary_loss_clip": 0.01168319, "auxiliary_loss_mlp": 0.0102583, "balance_loss_clip": 1.04812527, "balance_loss_mlp": 1.01794446, "epoch": 0.6757650453916912, "flos": 21646521210240.0, "grad_norm": 2.2597119517132813, "language_loss": 0.78178871, "learning_rate": 1.004951206686758e-06, "loss": 0.80373019, "num_input_tokens_seen": 120899190, "step": 5620, "time_per_iteration": 2.7446117401123047 }, { "auxiliary_loss_clip": 0.01164226, "auxiliary_loss_mlp": 0.01030024, "balance_loss_clip": 1.04896176, "balance_loss_mlp": 1.0224365, "epoch": 0.6758852882823303, "flos": 21795658479360.0, "grad_norm": 1.9168105848052903, "language_loss": 0.71407449, "learning_rate": 1.0042755632349087e-06, "loss": 0.73601699, "num_input_tokens_seen": 120916080, "step": 5621, "time_per_iteration": 2.7495369911193848 }, { "auxiliary_loss_clip": 0.01160852, "auxiliary_loss_mlp": 0.0102894, "balance_loss_clip": 1.04893875, "balance_loss_mlp": 1.02146566, "epoch": 0.6760055311729694, "flos": 27088783580160.0, "grad_norm": 2.5484754228557622, "language_loss": 0.63098395, "learning_rate": 1.0036000708336653e-06, "loss": 0.65288186, "num_input_tokens_seen": 120935210, "step": 5622, "time_per_iteration": 2.9669504165649414 }, { "auxiliary_loss_clip": 0.01169062, "auxiliary_loss_mlp": 0.01026945, "balance_loss_clip": 1.05081975, "balance_loss_mlp": 1.01925015, "epoch": 0.6761257740636085, "flos": 17999792922240.0, "grad_norm": 2.1870548262927185, "language_loss": 0.79787135, "learning_rate": 1.0029247295854984e-06, "loss": 0.81983131, "num_input_tokens_seen": 120951830, "step": 5623, "time_per_iteration": 2.7727508544921875 }, { "auxiliary_loss_clip": 0.01166426, "auxiliary_loss_mlp": 0.01024418, "balance_loss_clip": 1.05094838, "balance_loss_mlp": 1.01710451, "epoch": 0.6762460169542476, "flos": 15121912273920.0, "grad_norm": 1.7971207979551516, "language_loss": 0.72236311, "learning_rate": 1.0022495395928588e-06, "loss": 0.74427152, "num_input_tokens_seen": 120970310, "step": 5624, "time_per_iteration": 2.8883984088897705 }, { "auxiliary_loss_clip": 0.01069527, "auxiliary_loss_mlp": 0.01000967, "balance_loss_clip": 1.01355004, "balance_loss_mlp": 0.99995416, "epoch": 0.6763662598448866, "flos": 67886970030720.0, "grad_norm": 0.7796390185850076, "language_loss": 0.62321216, "learning_rate": 1.0015745009581697e-06, "loss": 0.64391708, "num_input_tokens_seen": 121031915, "step": 5625, "time_per_iteration": 3.3536643981933594 }, { "auxiliary_loss_clip": 0.0116733, "auxiliary_loss_mlp": 0.01027784, "balance_loss_clip": 1.05043232, "balance_loss_mlp": 1.02012479, "epoch": 0.6764865027355258, "flos": 20631829910400.0, "grad_norm": 1.8687439131726367, "language_loss": 0.6713143, "learning_rate": 1.0008996137838343e-06, "loss": 0.69326544, "num_input_tokens_seen": 121050890, "step": 5626, "time_per_iteration": 2.7506847381591797 }, { "auxiliary_loss_clip": 0.01177654, "auxiliary_loss_mlp": 0.01025805, "balance_loss_clip": 1.05118358, "balance_loss_mlp": 1.0179075, "epoch": 0.6766067456261649, "flos": 21215809226880.0, "grad_norm": 2.148229626847526, "language_loss": 0.80124438, "learning_rate": 1.000224878172234e-06, "loss": 0.82327902, "num_input_tokens_seen": 121070015, "step": 5627, "time_per_iteration": 2.708456516265869 }, { "auxiliary_loss_clip": 0.01169324, "auxiliary_loss_mlp": 0.01020907, "balance_loss_clip": 1.04817176, "balance_loss_mlp": 1.01381993, "epoch": 0.6767269885168039, "flos": 19938251220480.0, "grad_norm": 2.665067212396226, "language_loss": 0.72471189, "learning_rate": 9.99550294225724e-07, "loss": 0.74661416, "num_input_tokens_seen": 121089170, "step": 5628, "time_per_iteration": 2.8547608852386475 }, { "auxiliary_loss_clip": 0.01162734, "auxiliary_loss_mlp": 0.01024549, "balance_loss_clip": 1.0481205, "balance_loss_mlp": 1.01667571, "epoch": 0.6768472314074431, "flos": 20814076540800.0, "grad_norm": 2.2133358507516907, "language_loss": 0.72550869, "learning_rate": 9.988758620466402e-07, "loss": 0.74738157, "num_input_tokens_seen": 121108040, "step": 5629, "time_per_iteration": 2.805217742919922 }, { "auxiliary_loss_clip": 0.01160771, "auxiliary_loss_mlp": 0.01021553, "balance_loss_clip": 1.04962814, "balance_loss_mlp": 1.01499367, "epoch": 0.6769674742980821, "flos": 23186012169600.0, "grad_norm": 1.5943879545543291, "language_loss": 0.75972092, "learning_rate": 9.982015817372917e-07, "loss": 0.78154421, "num_input_tokens_seen": 121128480, "step": 5630, "time_per_iteration": 2.9439785480499268 }, { "auxiliary_loss_clip": 0.01157866, "auxiliary_loss_mlp": 0.01025009, "balance_loss_clip": 1.04890633, "balance_loss_mlp": 1.01733816, "epoch": 0.6770877171887212, "flos": 24242934885120.0, "grad_norm": 1.7241961968204857, "language_loss": 0.82108355, "learning_rate": 9.975274533999657e-07, "loss": 0.84291232, "num_input_tokens_seen": 121148010, "step": 5631, "time_per_iteration": 2.9515106678009033 }, { "auxiliary_loss_clip": 0.01172913, "auxiliary_loss_mlp": 0.01029362, "balance_loss_clip": 1.04963326, "balance_loss_mlp": 1.02146459, "epoch": 0.6772079600793603, "flos": 18141567903360.0, "grad_norm": 2.634701336027263, "language_loss": 0.84086645, "learning_rate": 9.96853477136929e-07, "loss": 0.86288917, "num_input_tokens_seen": 121162755, "step": 5632, "time_per_iteration": 2.8664238452911377 }, { "auxiliary_loss_clip": 0.01154532, "auxiliary_loss_mlp": 0.0102392, "balance_loss_clip": 1.04632688, "balance_loss_mlp": 1.01673782, "epoch": 0.6773282029699994, "flos": 22452069571200.0, "grad_norm": 2.3002764579421893, "language_loss": 0.75422513, "learning_rate": 9.96179653050422e-07, "loss": 0.77600968, "num_input_tokens_seen": 121182915, "step": 5633, "time_per_iteration": 2.880511522293091 }, { "auxiliary_loss_clip": 0.01156074, "auxiliary_loss_mlp": 0.01027787, "balance_loss_clip": 1.04907131, "balance_loss_mlp": 1.02019882, "epoch": 0.6774484458606385, "flos": 18693730748160.0, "grad_norm": 1.9700540319189541, "language_loss": 0.74170291, "learning_rate": 9.955059812426635e-07, "loss": 0.76354152, "num_input_tokens_seen": 121200445, "step": 5634, "time_per_iteration": 2.7822399139404297 }, { "auxiliary_loss_clip": 0.01172584, "auxiliary_loss_mlp": 0.01031922, "balance_loss_clip": 1.05099034, "balance_loss_mlp": 1.02431607, "epoch": 0.6775686887512776, "flos": 25994046821760.0, "grad_norm": 1.98913173313758, "language_loss": 0.82999718, "learning_rate": 9.948324618158493e-07, "loss": 0.8520422, "num_input_tokens_seen": 121220785, "step": 5635, "time_per_iteration": 3.685182809829712 }, { "auxiliary_loss_clip": 0.01168106, "auxiliary_loss_mlp": 0.01023773, "balance_loss_clip": 1.04782343, "balance_loss_mlp": 1.01614964, "epoch": 0.6776889316419167, "flos": 13587987922560.0, "grad_norm": 2.0709474931645295, "language_loss": 0.77795869, "learning_rate": 9.941590948721502e-07, "loss": 0.79987746, "num_input_tokens_seen": 121237985, "step": 5636, "time_per_iteration": 2.760887384414673 }, { "auxiliary_loss_clip": 0.01160231, "auxiliary_loss_mlp": 0.01025464, "balance_loss_clip": 1.04822814, "balance_loss_mlp": 1.0185113, "epoch": 0.6778091745325557, "flos": 27601121220480.0, "grad_norm": 1.6025314011428182, "language_loss": 0.7638585, "learning_rate": 9.934858805137188e-07, "loss": 0.78571546, "num_input_tokens_seen": 121258635, "step": 5637, "time_per_iteration": 2.8739678859710693 }, { "auxiliary_loss_clip": 0.01162845, "auxiliary_loss_mlp": 0.01026617, "balance_loss_clip": 1.04835868, "balance_loss_mlp": 1.01925588, "epoch": 0.6779294174231949, "flos": 18734058743040.0, "grad_norm": 1.85321825477627, "language_loss": 0.80798948, "learning_rate": 9.92812818842677e-07, "loss": 0.82988417, "num_input_tokens_seen": 121277810, "step": 5638, "time_per_iteration": 4.763732433319092 }, { "auxiliary_loss_clip": 0.01161911, "auxiliary_loss_mlp": 0.01027096, "balance_loss_clip": 1.04616785, "balance_loss_mlp": 1.01968169, "epoch": 0.678049660313834, "flos": 45873797765760.0, "grad_norm": 1.9462061708162448, "language_loss": 0.64007616, "learning_rate": 9.921399099611306e-07, "loss": 0.6619662, "num_input_tokens_seen": 121298975, "step": 5639, "time_per_iteration": 2.902541399002075 }, { "auxiliary_loss_clip": 0.01163931, "auxiliary_loss_mlp": 0.01026492, "balance_loss_clip": 1.04799378, "balance_loss_mlp": 1.01970625, "epoch": 0.678169903204473, "flos": 19974556892160.0, "grad_norm": 1.6897475187771016, "language_loss": 0.68828893, "learning_rate": 9.914671539711588e-07, "loss": 0.71019316, "num_input_tokens_seen": 121318495, "step": 5640, "time_per_iteration": 2.730661630630493 }, { "auxiliary_loss_clip": 0.0114995, "auxiliary_loss_mlp": 0.01055421, "balance_loss_clip": 1.04961038, "balance_loss_mlp": 1.01957011, "epoch": 0.6782901460951122, "flos": 21395613732480.0, "grad_norm": 2.1685114664430807, "language_loss": 0.78516614, "learning_rate": 9.90794550974817e-07, "loss": 0.80721986, "num_input_tokens_seen": 121338890, "step": 5641, "time_per_iteration": 2.9165189266204834 }, { "auxiliary_loss_clip": 0.01156992, "auxiliary_loss_mlp": 0.01025948, "balance_loss_clip": 1.04874754, "balance_loss_mlp": 1.01856887, "epoch": 0.6784103889857512, "flos": 21434002392960.0, "grad_norm": 2.1169952809721737, "language_loss": 0.81642407, "learning_rate": 9.901221010741407e-07, "loss": 0.8382535, "num_input_tokens_seen": 121358210, "step": 5642, "time_per_iteration": 2.8082571029663086 }, { "auxiliary_loss_clip": 0.01171775, "auxiliary_loss_mlp": 0.01027583, "balance_loss_clip": 1.04905772, "balance_loss_mlp": 1.01992702, "epoch": 0.6785306318763903, "flos": 32671923091200.0, "grad_norm": 1.9245017440465226, "language_loss": 0.74819189, "learning_rate": 9.894498043711375e-07, "loss": 0.77018553, "num_input_tokens_seen": 121379955, "step": 5643, "time_per_iteration": 2.876650810241699 }, { "auxiliary_loss_clip": 0.01161224, "auxiliary_loss_mlp": 0.01024115, "balance_loss_clip": 1.04612017, "balance_loss_mlp": 1.01713276, "epoch": 0.6786508747670293, "flos": 25632139340160.0, "grad_norm": 1.960284754279577, "language_loss": 0.69124973, "learning_rate": 9.887776609677962e-07, "loss": 0.71310318, "num_input_tokens_seen": 121401325, "step": 5644, "time_per_iteration": 2.8196499347686768 }, { "auxiliary_loss_clip": 0.01152695, "auxiliary_loss_mlp": 0.0102729, "balance_loss_clip": 1.04665935, "balance_loss_mlp": 1.02019668, "epoch": 0.6787711176576685, "flos": 19171881619200.0, "grad_norm": 1.7619871857114995, "language_loss": 0.72430062, "learning_rate": 9.88105670966079e-07, "loss": 0.74610043, "num_input_tokens_seen": 121419785, "step": 5645, "time_per_iteration": 3.651299238204956 }, { "auxiliary_loss_clip": 0.01148342, "auxiliary_loss_mlp": 0.01023074, "balance_loss_clip": 1.04926348, "balance_loss_mlp": 1.01601958, "epoch": 0.6788913605483076, "flos": 13985159581440.0, "grad_norm": 2.365449975063667, "language_loss": 0.78758073, "learning_rate": 9.874338344679283e-07, "loss": 0.80929482, "num_input_tokens_seen": 121435630, "step": 5646, "time_per_iteration": 2.84489107131958 }, { "auxiliary_loss_clip": 0.01166444, "auxiliary_loss_mlp": 0.01025089, "balance_loss_clip": 1.04799151, "balance_loss_mlp": 1.01842165, "epoch": 0.6790116034389466, "flos": 22017586659840.0, "grad_norm": 2.0085844954784053, "language_loss": 0.73820841, "learning_rate": 9.86762151575259e-07, "loss": 0.76012367, "num_input_tokens_seen": 121455625, "step": 5647, "time_per_iteration": 2.789109706878662 }, { "auxiliary_loss_clip": 0.01157667, "auxiliary_loss_mlp": 0.0105967, "balance_loss_clip": 1.04876256, "balance_loss_mlp": 1.02457416, "epoch": 0.6791318463295858, "flos": 20922454851840.0, "grad_norm": 1.4456256107004357, "language_loss": 0.80266654, "learning_rate": 9.860906223899651e-07, "loss": 0.82483995, "num_input_tokens_seen": 121475020, "step": 5648, "time_per_iteration": 2.7785277366638184 }, { "auxiliary_loss_clip": 0.01170518, "auxiliary_loss_mlp": 0.01026714, "balance_loss_clip": 1.05083871, "balance_loss_mlp": 1.01919794, "epoch": 0.6792520892202248, "flos": 28512749422080.0, "grad_norm": 1.8539065785236628, "language_loss": 0.75720316, "learning_rate": 9.854192470139184e-07, "loss": 0.77917546, "num_input_tokens_seen": 121496500, "step": 5649, "time_per_iteration": 2.759190320968628 }, { "auxiliary_loss_clip": 0.01161724, "auxiliary_loss_mlp": 0.01029444, "balance_loss_clip": 1.04888654, "balance_loss_mlp": 1.02244091, "epoch": 0.6793723321108639, "flos": 20011904058240.0, "grad_norm": 1.929601236368748, "language_loss": 0.71816421, "learning_rate": 9.847480255489645e-07, "loss": 0.74007589, "num_input_tokens_seen": 121515525, "step": 5650, "time_per_iteration": 2.7512028217315674 }, { "auxiliary_loss_clip": 0.01168063, "auxiliary_loss_mlp": 0.01028094, "balance_loss_clip": 1.04828537, "balance_loss_mlp": 1.02039552, "epoch": 0.6794925750015031, "flos": 26649488246400.0, "grad_norm": 1.9521683966786048, "language_loss": 0.69157159, "learning_rate": 9.840769580969295e-07, "loss": 0.71353316, "num_input_tokens_seen": 121535965, "step": 5651, "time_per_iteration": 2.7153570652008057 }, { "auxiliary_loss_clip": 0.01160426, "auxiliary_loss_mlp": 0.01029138, "balance_loss_clip": 1.04830682, "balance_loss_mlp": 1.02150869, "epoch": 0.6796128178921421, "flos": 21580374314880.0, "grad_norm": 2.131674563581054, "language_loss": 0.80263424, "learning_rate": 9.834060447596114e-07, "loss": 0.82452995, "num_input_tokens_seen": 121555235, "step": 5652, "time_per_iteration": 2.6762642860412598 }, { "auxiliary_loss_clip": 0.01169971, "auxiliary_loss_mlp": 0.01025694, "balance_loss_clip": 1.04882765, "balance_loss_mlp": 1.01797247, "epoch": 0.6797330607827812, "flos": 22492002516480.0, "grad_norm": 1.7722054580944138, "language_loss": 0.78298116, "learning_rate": 9.827352856387868e-07, "loss": 0.80493784, "num_input_tokens_seen": 121574945, "step": 5653, "time_per_iteration": 2.7936160564422607 }, { "auxiliary_loss_clip": 0.01068217, "auxiliary_loss_mlp": 0.01009718, "balance_loss_clip": 1.01792085, "balance_loss_mlp": 1.00853205, "epoch": 0.6798533036734203, "flos": 66306648286080.0, "grad_norm": 0.788614690811374, "language_loss": 0.64225107, "learning_rate": 9.820646808362118e-07, "loss": 0.66303045, "num_input_tokens_seen": 121641200, "step": 5654, "time_per_iteration": 3.4384310245513916 }, { "auxiliary_loss_clip": 0.01156243, "auxiliary_loss_mlp": 0.01026289, "balance_loss_clip": 1.04659104, "balance_loss_mlp": 1.01926136, "epoch": 0.6799735465640594, "flos": 16180163792640.0, "grad_norm": 2.1757355834418326, "language_loss": 0.72918779, "learning_rate": 9.813942304536154e-07, "loss": 0.7510131, "num_input_tokens_seen": 121659170, "step": 5655, "time_per_iteration": 2.7540719509124756 }, { "auxiliary_loss_clip": 0.01164417, "auxiliary_loss_mlp": 0.01023958, "balance_loss_clip": 1.04835558, "balance_loss_mlp": 1.01639366, "epoch": 0.6800937894546984, "flos": 22125749489280.0, "grad_norm": 1.7265489528717115, "language_loss": 0.64024484, "learning_rate": 9.807239345927043e-07, "loss": 0.66212857, "num_input_tokens_seen": 121679180, "step": 5656, "time_per_iteration": 2.7713725566864014 }, { "auxiliary_loss_clip": 0.01164128, "auxiliary_loss_mlp": 0.01024831, "balance_loss_clip": 1.04534364, "balance_loss_mlp": 1.01775932, "epoch": 0.6802140323453376, "flos": 31612953300480.0, "grad_norm": 2.1121820005054293, "language_loss": 0.71513546, "learning_rate": 9.80053793355162e-07, "loss": 0.73702502, "num_input_tokens_seen": 121697875, "step": 5657, "time_per_iteration": 2.821974039077759 }, { "auxiliary_loss_clip": 0.01154163, "auxiliary_loss_mlp": 0.01031094, "balance_loss_clip": 1.04787099, "balance_loss_mlp": 1.02351224, "epoch": 0.6803342752359767, "flos": 17712938908800.0, "grad_norm": 2.431506065800112, "language_loss": 0.75019205, "learning_rate": 9.793838068426472e-07, "loss": 0.77204454, "num_input_tokens_seen": 121715570, "step": 5658, "time_per_iteration": 2.724581241607666 }, { "auxiliary_loss_clip": 0.01170053, "auxiliary_loss_mlp": 0.01025473, "balance_loss_clip": 1.04904032, "balance_loss_mlp": 1.01820421, "epoch": 0.6804545181266157, "flos": 11326800902400.0, "grad_norm": 1.9789944380172524, "language_loss": 0.61400217, "learning_rate": 9.78713975156799e-07, "loss": 0.63595742, "num_input_tokens_seen": 121731435, "step": 5659, "time_per_iteration": 2.6608927249908447 }, { "auxiliary_loss_clip": 0.01161623, "auxiliary_loss_mlp": 0.01027944, "balance_loss_clip": 1.05116665, "balance_loss_mlp": 1.02067471, "epoch": 0.6805747610172549, "flos": 29350976181120.0, "grad_norm": 1.7125340259423298, "language_loss": 0.71967763, "learning_rate": 9.780442983992273e-07, "loss": 0.74157321, "num_input_tokens_seen": 121749950, "step": 5660, "time_per_iteration": 2.7771525382995605 }, { "auxiliary_loss_clip": 0.01159033, "auxiliary_loss_mlp": 0.01027008, "balance_loss_clip": 1.05006862, "balance_loss_mlp": 1.01936078, "epoch": 0.680695003907894, "flos": 37631868612480.0, "grad_norm": 1.7190629494758798, "language_loss": 0.71564627, "learning_rate": 9.773747766715238e-07, "loss": 0.73750669, "num_input_tokens_seen": 121770770, "step": 5661, "time_per_iteration": 3.9770724773406982 }, { "auxiliary_loss_clip": 0.01168771, "auxiliary_loss_mlp": 0.01028036, "balance_loss_clip": 1.04971814, "balance_loss_mlp": 1.02021039, "epoch": 0.680815246798533, "flos": 22127365601280.0, "grad_norm": 3.588138302645773, "language_loss": 0.80461705, "learning_rate": 9.767054100752536e-07, "loss": 0.82658505, "num_input_tokens_seen": 121790720, "step": 5662, "time_per_iteration": 2.8511579036712646 }, { "auxiliary_loss_clip": 0.01164295, "auxiliary_loss_mlp": 0.01031181, "balance_loss_clip": 1.04922009, "balance_loss_mlp": 1.02401638, "epoch": 0.6809354896891722, "flos": 17201822330880.0, "grad_norm": 2.219698949445636, "language_loss": 0.81934935, "learning_rate": 9.760361987119584e-07, "loss": 0.84130418, "num_input_tokens_seen": 121808455, "step": 5663, "time_per_iteration": 3.7997536659240723 }, { "auxiliary_loss_clip": 0.01160485, "auxiliary_loss_mlp": 0.01020549, "balance_loss_clip": 1.04603136, "balance_loss_mlp": 1.01275253, "epoch": 0.6810557325798112, "flos": 12458166554880.0, "grad_norm": 1.8723097000498345, "language_loss": 0.67874825, "learning_rate": 9.753671426831592e-07, "loss": 0.70055854, "num_input_tokens_seen": 121824470, "step": 5664, "time_per_iteration": 3.705115795135498 }, { "auxiliary_loss_clip": 0.01162136, "auxiliary_loss_mlp": 0.01026936, "balance_loss_clip": 1.04838479, "balance_loss_mlp": 1.01958108, "epoch": 0.6811759754704503, "flos": 22156165330560.0, "grad_norm": 2.1930194111552956, "language_loss": 0.79977822, "learning_rate": 9.746982420903483e-07, "loss": 0.82166898, "num_input_tokens_seen": 121842665, "step": 5665, "time_per_iteration": 2.9659423828125 }, { "auxiliary_loss_clip": 0.01160061, "auxiliary_loss_mlp": 0.01023234, "balance_loss_clip": 1.04724503, "balance_loss_mlp": 1.01636481, "epoch": 0.6812962183610894, "flos": 17525377065600.0, "grad_norm": 2.562749521115089, "language_loss": 0.74752414, "learning_rate": 9.740294970349993e-07, "loss": 0.76935709, "num_input_tokens_seen": 121859080, "step": 5666, "time_per_iteration": 2.801424264907837 }, { "auxiliary_loss_clip": 0.0106803, "auxiliary_loss_mlp": 0.01000487, "balance_loss_clip": 1.01213837, "balance_loss_mlp": 0.99952108, "epoch": 0.6814164612517285, "flos": 60274480855680.0, "grad_norm": 0.8883173191859423, "language_loss": 0.60846668, "learning_rate": 9.733609076185594e-07, "loss": 0.62915188, "num_input_tokens_seen": 121915485, "step": 5667, "time_per_iteration": 3.2125778198242188 }, { "auxiliary_loss_clip": 0.01163775, "auxiliary_loss_mlp": 0.01021512, "balance_loss_clip": 1.04762769, "balance_loss_mlp": 1.01418614, "epoch": 0.6815367041423676, "flos": 19317750750720.0, "grad_norm": 2.0827458769665017, "language_loss": 0.84010231, "learning_rate": 9.72692473942455e-07, "loss": 0.86195517, "num_input_tokens_seen": 121932710, "step": 5668, "time_per_iteration": 2.856447696685791 }, { "auxiliary_loss_clip": 0.01158133, "auxiliary_loss_mlp": 0.01028459, "balance_loss_clip": 1.047786, "balance_loss_mlp": 1.02099359, "epoch": 0.6816569470330067, "flos": 22161696024960.0, "grad_norm": 1.6397095434079367, "language_loss": 0.77255726, "learning_rate": 9.720241961080849e-07, "loss": 0.79442316, "num_input_tokens_seen": 121952025, "step": 5669, "time_per_iteration": 2.964606761932373 }, { "auxiliary_loss_clip": 0.01166746, "auxiliary_loss_mlp": 0.01023169, "balance_loss_clip": 1.04552245, "balance_loss_mlp": 1.01603746, "epoch": 0.6817771899236458, "flos": 41463501137280.0, "grad_norm": 2.596745426781717, "language_loss": 0.7330879, "learning_rate": 9.713560742168259e-07, "loss": 0.75498706, "num_input_tokens_seen": 121974650, "step": 5670, "time_per_iteration": 2.953871011734009 }, { "auxiliary_loss_clip": 0.01159144, "auxiliary_loss_mlp": 0.01029349, "balance_loss_clip": 1.04737067, "balance_loss_mlp": 1.02191019, "epoch": 0.6818974328142848, "flos": 21106138026240.0, "grad_norm": 2.274926277903538, "language_loss": 0.7125597, "learning_rate": 9.706881083700333e-07, "loss": 0.73444462, "num_input_tokens_seen": 121994335, "step": 5671, "time_per_iteration": 3.587874174118042 }, { "auxiliary_loss_clip": 0.0115047, "auxiliary_loss_mlp": 0.01030162, "balance_loss_clip": 1.04885769, "balance_loss_mlp": 1.02279222, "epoch": 0.682017675704924, "flos": 20441897769600.0, "grad_norm": 3.3360370758346964, "language_loss": 0.82694578, "learning_rate": 9.700202986690357e-07, "loss": 0.84875214, "num_input_tokens_seen": 122012635, "step": 5672, "time_per_iteration": 2.791076898574829 }, { "auxiliary_loss_clip": 0.0116225, "auxiliary_loss_mlp": 0.01055104, "balance_loss_clip": 1.04556227, "balance_loss_mlp": 1.01933551, "epoch": 0.682137918595563, "flos": 20044438801920.0, "grad_norm": 1.7627111447064303, "language_loss": 0.66732991, "learning_rate": 9.693526452151413e-07, "loss": 0.68950343, "num_input_tokens_seen": 122031685, "step": 5673, "time_per_iteration": 2.7435638904571533 }, { "auxiliary_loss_clip": 0.01167826, "auxiliary_loss_mlp": 0.01026251, "balance_loss_clip": 1.04923248, "balance_loss_mlp": 1.01849031, "epoch": 0.6822581614862021, "flos": 31684559063040.0, "grad_norm": 1.5963546477732833, "language_loss": 0.75673687, "learning_rate": 9.686851481096305e-07, "loss": 0.7786777, "num_input_tokens_seen": 122052995, "step": 5674, "time_per_iteration": 2.8416433334350586 }, { "auxiliary_loss_clip": 0.01150012, "auxiliary_loss_mlp": 0.01026134, "balance_loss_clip": 1.04714704, "balance_loss_mlp": 1.01944602, "epoch": 0.6823784043768413, "flos": 23477570864640.0, "grad_norm": 2.189139660124087, "language_loss": 0.71945894, "learning_rate": 9.68017807453762e-07, "loss": 0.74122036, "num_input_tokens_seen": 122071740, "step": 5675, "time_per_iteration": 2.7957804203033447 }, { "auxiliary_loss_clip": 0.01164251, "auxiliary_loss_mlp": 0.01061343, "balance_loss_clip": 1.04881465, "balance_loss_mlp": 1.02372718, "epoch": 0.6824986472674803, "flos": 14137134024960.0, "grad_norm": 2.497634897121319, "language_loss": 0.73470807, "learning_rate": 9.673506233487721e-07, "loss": 0.75696397, "num_input_tokens_seen": 122089705, "step": 5676, "time_per_iteration": 2.8040316104888916 }, { "auxiliary_loss_clip": 0.01162597, "auxiliary_loss_mlp": 0.01053234, "balance_loss_clip": 1.04702997, "balance_loss_mlp": 1.01719856, "epoch": 0.6826188901581194, "flos": 21504997624320.0, "grad_norm": 2.0359394276064737, "language_loss": 0.86119187, "learning_rate": 9.666835958958717e-07, "loss": 0.88335019, "num_input_tokens_seen": 122109025, "step": 5677, "time_per_iteration": 2.8089683055877686 }, { "auxiliary_loss_clip": 0.01167426, "auxiliary_loss_mlp": 0.01026271, "balance_loss_clip": 1.04744387, "balance_loss_mlp": 1.01975071, "epoch": 0.6827391330487584, "flos": 20810126044800.0, "grad_norm": 2.434959720269066, "language_loss": 0.80539274, "learning_rate": 9.660167251962484e-07, "loss": 0.8273297, "num_input_tokens_seen": 122127385, "step": 5678, "time_per_iteration": 2.7528903484344482 }, { "auxiliary_loss_clip": 0.01160046, "auxiliary_loss_mlp": 0.01028612, "balance_loss_clip": 1.04584885, "balance_loss_mlp": 1.02167153, "epoch": 0.6828593759393976, "flos": 21688788539520.0, "grad_norm": 1.5001526133381882, "language_loss": 0.77491486, "learning_rate": 9.653500113510654e-07, "loss": 0.79680139, "num_input_tokens_seen": 122146500, "step": 5679, "time_per_iteration": 2.788297653198242 }, { "auxiliary_loss_clip": 0.01158129, "auxiliary_loss_mlp": 0.01028829, "balance_loss_clip": 1.0473094, "balance_loss_mlp": 1.02102113, "epoch": 0.6829796188300367, "flos": 25337707557120.0, "grad_norm": 2.6645951975506312, "language_loss": 0.67585194, "learning_rate": 9.646834544614627e-07, "loss": 0.6977216, "num_input_tokens_seen": 122167000, "step": 5680, "time_per_iteration": 2.943601608276367 }, { "auxiliary_loss_clip": 0.01157746, "auxiliary_loss_mlp": 0.01028888, "balance_loss_clip": 1.05074692, "balance_loss_mlp": 1.02198911, "epoch": 0.6830998617206757, "flos": 20704800389760.0, "grad_norm": 2.0522608114613625, "language_loss": 0.76415086, "learning_rate": 9.64017054628558e-07, "loss": 0.78601718, "num_input_tokens_seen": 122185825, "step": 5681, "time_per_iteration": 2.8666439056396484 }, { "auxiliary_loss_clip": 0.01158949, "auxiliary_loss_mlp": 0.0102625, "balance_loss_clip": 1.04666317, "balance_loss_mlp": 1.01891911, "epoch": 0.6832201046113149, "flos": 21726638496000.0, "grad_norm": 1.6869489723842581, "language_loss": 0.78942633, "learning_rate": 9.63350811953441e-07, "loss": 0.81127834, "num_input_tokens_seen": 122206200, "step": 5682, "time_per_iteration": 2.983527183532715 }, { "auxiliary_loss_clip": 0.01160449, "auxiliary_loss_mlp": 0.0102579, "balance_loss_clip": 1.04673123, "balance_loss_mlp": 1.01844692, "epoch": 0.6833403475019539, "flos": 19536554448000.0, "grad_norm": 2.1626345147759025, "language_loss": 0.7051388, "learning_rate": 9.626847265371826e-07, "loss": 0.72700119, "num_input_tokens_seen": 122225520, "step": 5683, "time_per_iteration": 2.9562315940856934 }, { "auxiliary_loss_clip": 0.01155171, "auxiliary_loss_mlp": 0.01026255, "balance_loss_clip": 1.04746234, "balance_loss_mlp": 1.01907313, "epoch": 0.683460590392593, "flos": 19352153001600.0, "grad_norm": 2.046588231298548, "language_loss": 0.78355658, "learning_rate": 9.620187984808262e-07, "loss": 0.80537093, "num_input_tokens_seen": 122244320, "step": 5684, "time_per_iteration": 2.86077880859375 }, { "auxiliary_loss_clip": 0.01162237, "auxiliary_loss_mlp": 0.0104939, "balance_loss_clip": 1.04767442, "balance_loss_mlp": 1.0150851, "epoch": 0.6835808332832322, "flos": 23288500650240.0, "grad_norm": 2.5614118990068824, "language_loss": 0.86214805, "learning_rate": 9.613530278853919e-07, "loss": 0.88426435, "num_input_tokens_seen": 122264295, "step": 5685, "time_per_iteration": 2.7081046104431152 }, { "auxiliary_loss_clip": 0.01161182, "auxiliary_loss_mlp": 0.01025418, "balance_loss_clip": 1.04586124, "balance_loss_mlp": 1.01774049, "epoch": 0.6837010761738712, "flos": 21653416621440.0, "grad_norm": 1.886442513438038, "language_loss": 0.74230599, "learning_rate": 9.60687414851879e-07, "loss": 0.76417208, "num_input_tokens_seen": 122285300, "step": 5686, "time_per_iteration": 2.645380735397339 }, { "auxiliary_loss_clip": 0.01164525, "auxiliary_loss_mlp": 0.01028914, "balance_loss_clip": 1.04678822, "balance_loss_mlp": 1.0213201, "epoch": 0.6838213190645103, "flos": 17566387418880.0, "grad_norm": 2.4006731416766467, "language_loss": 0.77496791, "learning_rate": 9.600219594812575e-07, "loss": 0.7969023, "num_input_tokens_seen": 122303240, "step": 5687, "time_per_iteration": 3.6347761154174805 }, { "auxiliary_loss_clip": 0.01164653, "auxiliary_loss_mlp": 0.01026076, "balance_loss_clip": 1.04587352, "balance_loss_mlp": 1.01930463, "epoch": 0.6839415619551494, "flos": 23112538899840.0, "grad_norm": 2.024231515481465, "language_loss": 0.72992587, "learning_rate": 9.593566618744786e-07, "loss": 0.75183314, "num_input_tokens_seen": 122323390, "step": 5688, "time_per_iteration": 2.6350936889648438 }, { "auxiliary_loss_clip": 0.01166158, "auxiliary_loss_mlp": 0.01024843, "balance_loss_clip": 1.04548883, "balance_loss_mlp": 1.01772022, "epoch": 0.6840618048457885, "flos": 22127868391680.0, "grad_norm": 1.8766761356503352, "language_loss": 0.7395879, "learning_rate": 9.58691522132466e-07, "loss": 0.76149786, "num_input_tokens_seen": 122342200, "step": 5689, "time_per_iteration": 2.7165236473083496 }, { "auxiliary_loss_clip": 0.01168744, "auxiliary_loss_mlp": 0.01028975, "balance_loss_clip": 1.05022454, "balance_loss_mlp": 1.02139354, "epoch": 0.6841820477364275, "flos": 22015898720640.0, "grad_norm": 2.0429746859945834, "language_loss": 0.84728253, "learning_rate": 9.58026540356123e-07, "loss": 0.86925972, "num_input_tokens_seen": 122360465, "step": 5690, "time_per_iteration": 4.516355514526367 }, { "auxiliary_loss_clip": 0.01168741, "auxiliary_loss_mlp": 0.01025301, "balance_loss_clip": 1.04823554, "balance_loss_mlp": 1.0178777, "epoch": 0.6843022906270667, "flos": 24900531125760.0, "grad_norm": 1.6574040033714994, "language_loss": 0.86943173, "learning_rate": 9.573617166463246e-07, "loss": 0.8913722, "num_input_tokens_seen": 122381680, "step": 5691, "time_per_iteration": 2.6852450370788574 }, { "auxiliary_loss_clip": 0.01164885, "auxiliary_loss_mlp": 0.01023451, "balance_loss_clip": 1.04641843, "balance_loss_mlp": 1.01672173, "epoch": 0.6844225335177058, "flos": 19969924037760.0, "grad_norm": 2.5728117472896614, "language_loss": 0.60258323, "learning_rate": 9.56697051103924e-07, "loss": 0.6244666, "num_input_tokens_seen": 122399120, "step": 5692, "time_per_iteration": 2.774501323699951 }, { "auxiliary_loss_clip": 0.01159286, "auxiliary_loss_mlp": 0.01022167, "balance_loss_clip": 1.04658079, "balance_loss_mlp": 1.0156374, "epoch": 0.6845427764083448, "flos": 25883334126720.0, "grad_norm": 5.444979664509092, "language_loss": 0.81195199, "learning_rate": 9.560325438297522e-07, "loss": 0.83376646, "num_input_tokens_seen": 122417430, "step": 5693, "time_per_iteration": 2.731431722640991 }, { "auxiliary_loss_clip": 0.01160706, "auxiliary_loss_mlp": 0.01028952, "balance_loss_clip": 1.04797506, "balance_loss_mlp": 1.02167785, "epoch": 0.684663019298984, "flos": 18880143356160.0, "grad_norm": 2.146478282387783, "language_loss": 0.86878896, "learning_rate": 9.553681949246127e-07, "loss": 0.89068544, "num_input_tokens_seen": 122435055, "step": 5694, "time_per_iteration": 2.709674119949341 }, { "auxiliary_loss_clip": 0.01166965, "auxiliary_loss_mlp": 0.01026932, "balance_loss_clip": 1.04982555, "balance_loss_mlp": 1.01891541, "epoch": 0.684783262189623, "flos": 54193725302400.0, "grad_norm": 2.1846785906846313, "language_loss": 0.75516844, "learning_rate": 9.547040044892886e-07, "loss": 0.77710742, "num_input_tokens_seen": 122462570, "step": 5695, "time_per_iteration": 3.0057268142700195 }, { "auxiliary_loss_clip": 0.01068094, "auxiliary_loss_mlp": 0.01001916, "balance_loss_clip": 1.01084828, "balance_loss_mlp": 1.00089097, "epoch": 0.6849035050802621, "flos": 63970264143360.0, "grad_norm": 0.8646570746705474, "language_loss": 0.60036206, "learning_rate": 9.540399726245354e-07, "loss": 0.62106216, "num_input_tokens_seen": 122519275, "step": 5696, "time_per_iteration": 4.025351047515869 }, { "auxiliary_loss_clip": 0.01159704, "auxiliary_loss_mlp": 0.01026785, "balance_loss_clip": 1.04694152, "balance_loss_mlp": 1.018893, "epoch": 0.6850237479709013, "flos": 25224121774080.0, "grad_norm": 2.14826273185383, "language_loss": 0.68922073, "learning_rate": 9.533760994310859e-07, "loss": 0.71108556, "num_input_tokens_seen": 122539675, "step": 5697, "time_per_iteration": 2.7743518352508545 }, { "auxiliary_loss_clip": 0.01170766, "auxiliary_loss_mlp": 0.01024005, "balance_loss_clip": 1.04904282, "balance_loss_mlp": 1.01693916, "epoch": 0.6851439908615403, "flos": 19354128249600.0, "grad_norm": 1.872429420499276, "language_loss": 0.75156289, "learning_rate": 9.527123850096508e-07, "loss": 0.77351058, "num_input_tokens_seen": 122558035, "step": 5698, "time_per_iteration": 2.6172101497650146 }, { "auxiliary_loss_clip": 0.01169668, "auxiliary_loss_mlp": 0.010252, "balance_loss_clip": 1.04767084, "balance_loss_mlp": 1.01768434, "epoch": 0.6852642337521794, "flos": 23182133500800.0, "grad_norm": 1.7461634640483585, "language_loss": 0.72114694, "learning_rate": 9.520488294609142e-07, "loss": 0.74309564, "num_input_tokens_seen": 122576815, "step": 5699, "time_per_iteration": 2.6887319087982178 }, { "auxiliary_loss_clip": 0.01063453, "auxiliary_loss_mlp": 0.01001839, "balance_loss_clip": 1.01235628, "balance_loss_mlp": 1.00083184, "epoch": 0.6853844766428185, "flos": 62647206583680.0, "grad_norm": 0.7450863828184426, "language_loss": 0.53839272, "learning_rate": 9.513854328855368e-07, "loss": 0.55904567, "num_input_tokens_seen": 122634690, "step": 5700, "time_per_iteration": 3.2894484996795654 }, { "auxiliary_loss_clip": 0.01164282, "auxiliary_loss_mlp": 0.01019102, "balance_loss_clip": 1.04535103, "balance_loss_mlp": 1.01230443, "epoch": 0.6855047195334576, "flos": 23437242869760.0, "grad_norm": 5.336133288454865, "language_loss": 0.81539822, "learning_rate": 9.507221953841558e-07, "loss": 0.83723205, "num_input_tokens_seen": 122652320, "step": 5701, "time_per_iteration": 2.6943390369415283 }, { "auxiliary_loss_clip": 0.01166168, "auxiliary_loss_mlp": 0.01023138, "balance_loss_clip": 1.04982078, "balance_loss_mlp": 1.01560974, "epoch": 0.6856249624240967, "flos": 20664831530880.0, "grad_norm": 1.5429961516073785, "language_loss": 0.77797967, "learning_rate": 9.500591170573824e-07, "loss": 0.79987276, "num_input_tokens_seen": 122672340, "step": 5702, "time_per_iteration": 2.755251407623291 }, { "auxiliary_loss_clip": 0.01152446, "auxiliary_loss_mlp": 0.01026535, "balance_loss_clip": 1.04738283, "balance_loss_mlp": 1.01914692, "epoch": 0.6857452053147358, "flos": 17087302794240.0, "grad_norm": 2.006139993403921, "language_loss": 0.7436651, "learning_rate": 9.493961980058078e-07, "loss": 0.76545489, "num_input_tokens_seen": 122689935, "step": 5703, "time_per_iteration": 2.7298779487609863 }, { "auxiliary_loss_clip": 0.01139495, "auxiliary_loss_mlp": 0.01025479, "balance_loss_clip": 1.04689407, "balance_loss_mlp": 1.01891911, "epoch": 0.6858654482053749, "flos": 30847266057600.0, "grad_norm": 2.6431573958799226, "language_loss": 0.67762244, "learning_rate": 9.48733438329993e-07, "loss": 0.69927216, "num_input_tokens_seen": 122710200, "step": 5704, "time_per_iteration": 3.140136480331421 }, { "auxiliary_loss_clip": 0.01165654, "auxiliary_loss_mlp": 0.01053267, "balance_loss_clip": 1.0476501, "balance_loss_mlp": 1.01695514, "epoch": 0.6859856910960139, "flos": 28877314510080.0, "grad_norm": 1.7567509335778175, "language_loss": 0.74555838, "learning_rate": 9.480708381304807e-07, "loss": 0.76774758, "num_input_tokens_seen": 122731495, "step": 5705, "time_per_iteration": 2.9416112899780273 }, { "auxiliary_loss_clip": 0.0115425, "auxiliary_loss_mlp": 0.01027727, "balance_loss_clip": 1.04856217, "balance_loss_mlp": 1.02061355, "epoch": 0.6861059339866531, "flos": 19354523299200.0, "grad_norm": 2.0769167508420234, "language_loss": 0.83524203, "learning_rate": 9.474083975077858e-07, "loss": 0.85706186, "num_input_tokens_seen": 122748620, "step": 5706, "time_per_iteration": 2.7599685192108154 }, { "auxiliary_loss_clip": 0.01162386, "auxiliary_loss_mlp": 0.01023146, "balance_loss_clip": 1.04910123, "balance_loss_mlp": 1.0157069, "epoch": 0.6862261768772921, "flos": 22199976944640.0, "grad_norm": 2.034366094561241, "language_loss": 0.80318362, "learning_rate": 9.467461165623994e-07, "loss": 0.82503891, "num_input_tokens_seen": 122767670, "step": 5707, "time_per_iteration": 2.721010208129883 }, { "auxiliary_loss_clip": 0.01167412, "auxiliary_loss_mlp": 0.0102491, "balance_loss_clip": 1.04745674, "balance_loss_mlp": 1.01756978, "epoch": 0.6863464197679312, "flos": 26285677344000.0, "grad_norm": 2.6923697564397227, "language_loss": 0.79730922, "learning_rate": 9.46083995394791e-07, "loss": 0.81923246, "num_input_tokens_seen": 122785480, "step": 5708, "time_per_iteration": 2.711221933364868 }, { "auxiliary_loss_clip": 0.01165865, "auxiliary_loss_mlp": 0.01054236, "balance_loss_clip": 1.04822433, "balance_loss_mlp": 1.01941371, "epoch": 0.6864666626585703, "flos": 37815228564480.0, "grad_norm": 1.9807191646980038, "language_loss": 0.63634181, "learning_rate": 9.454220341054012e-07, "loss": 0.65854287, "num_input_tokens_seen": 122810265, "step": 5709, "time_per_iteration": 2.9387454986572266 }, { "auxiliary_loss_clip": 0.01160577, "auxiliary_loss_mlp": 0.01026429, "balance_loss_clip": 1.04890132, "balance_loss_mlp": 1.01893651, "epoch": 0.6865869055492094, "flos": 19391152193280.0, "grad_norm": 2.679350073507704, "language_loss": 0.80389583, "learning_rate": 9.447602327946512e-07, "loss": 0.82576585, "num_input_tokens_seen": 122828905, "step": 5710, "time_per_iteration": 2.7788944244384766 }, { "auxiliary_loss_clip": 0.01160859, "auxiliary_loss_mlp": 0.0102576, "balance_loss_clip": 1.0467217, "balance_loss_mlp": 1.01827359, "epoch": 0.6867071484398485, "flos": 20375966355840.0, "grad_norm": 1.7557955069873135, "language_loss": 0.76651251, "learning_rate": 9.440985915629338e-07, "loss": 0.78837872, "num_input_tokens_seen": 122846235, "step": 5711, "time_per_iteration": 2.709195375442505 }, { "auxiliary_loss_clip": 0.01167234, "auxiliary_loss_mlp": 0.01024578, "balance_loss_clip": 1.04864573, "balance_loss_mlp": 1.01701427, "epoch": 0.6868273913304875, "flos": 15889143801600.0, "grad_norm": 3.770066934243994, "language_loss": 0.73171163, "learning_rate": 9.434371105106223e-07, "loss": 0.7536298, "num_input_tokens_seen": 122863835, "step": 5712, "time_per_iteration": 2.614231824874878 }, { "auxiliary_loss_clip": 0.01156649, "auxiliary_loss_mlp": 0.01025034, "balance_loss_clip": 1.04776525, "balance_loss_mlp": 1.0166055, "epoch": 0.6869476342211267, "flos": 24462492768000.0, "grad_norm": 1.8706516316312467, "language_loss": 0.70587242, "learning_rate": 9.427757897380602e-07, "loss": 0.72768921, "num_input_tokens_seen": 122883235, "step": 5713, "time_per_iteration": 3.8397045135498047 }, { "auxiliary_loss_clip": 0.01157939, "auxiliary_loss_mlp": 0.01026747, "balance_loss_clip": 1.04741096, "balance_loss_mlp": 1.01903963, "epoch": 0.6870678771117658, "flos": 18442571875200.0, "grad_norm": 2.875258874594134, "language_loss": 0.84308553, "learning_rate": 9.421146293455695e-07, "loss": 0.86493242, "num_input_tokens_seen": 122898975, "step": 5714, "time_per_iteration": 2.822699546813965 }, { "auxiliary_loss_clip": 0.01163108, "auxiliary_loss_mlp": 0.01025312, "balance_loss_clip": 1.04969501, "balance_loss_mlp": 1.01752222, "epoch": 0.6871881200024048, "flos": 22200371994240.0, "grad_norm": 2.0834521421398513, "language_loss": 0.68579096, "learning_rate": 9.414536294334489e-07, "loss": 0.70767522, "num_input_tokens_seen": 122918995, "step": 5715, "time_per_iteration": 2.7158024311065674 }, { "auxiliary_loss_clip": 0.01165629, "auxiliary_loss_mlp": 0.01021403, "balance_loss_clip": 1.04706597, "balance_loss_mlp": 1.01383281, "epoch": 0.687308362893044, "flos": 22127724737280.0, "grad_norm": 1.8535500173471402, "language_loss": 0.69605601, "learning_rate": 9.407927901019708e-07, "loss": 0.71792638, "num_input_tokens_seen": 122938125, "step": 5716, "time_per_iteration": 3.6635093688964844 }, { "auxiliary_loss_clip": 0.0116674, "auxiliary_loss_mlp": 0.01022596, "balance_loss_clip": 1.04742181, "balance_loss_mlp": 1.0156939, "epoch": 0.687428605783683, "flos": 25040546340480.0, "grad_norm": 1.871071193333533, "language_loss": 0.76566648, "learning_rate": 9.401321114513854e-07, "loss": 0.78755981, "num_input_tokens_seen": 122957020, "step": 5717, "time_per_iteration": 2.728177070617676 }, { "auxiliary_loss_clip": 0.01169305, "auxiliary_loss_mlp": 0.01026615, "balance_loss_clip": 1.04771996, "balance_loss_mlp": 1.0191884, "epoch": 0.6875488486743221, "flos": 23770063313280.0, "grad_norm": 1.6494727471263189, "language_loss": 0.75287294, "learning_rate": 9.394715935819155e-07, "loss": 0.77483213, "num_input_tokens_seen": 122977410, "step": 5718, "time_per_iteration": 2.6873350143432617 }, { "auxiliary_loss_clip": 0.01172103, "auxiliary_loss_mlp": 0.01022326, "balance_loss_clip": 1.0498786, "balance_loss_mlp": 1.01525688, "epoch": 0.6876690915649613, "flos": 25516937445120.0, "grad_norm": 1.812768025611245, "language_loss": 0.62484944, "learning_rate": 9.388112365937608e-07, "loss": 0.64679372, "num_input_tokens_seen": 122996875, "step": 5719, "time_per_iteration": 2.7693099975585938 }, { "auxiliary_loss_clip": 0.01161803, "auxiliary_loss_mlp": 0.01022069, "balance_loss_clip": 1.04989207, "balance_loss_mlp": 1.01477647, "epoch": 0.6877893344556003, "flos": 19427996568960.0, "grad_norm": 2.275316537858686, "language_loss": 0.82445735, "learning_rate": 9.381510405870985e-07, "loss": 0.84629601, "num_input_tokens_seen": 123015890, "step": 5720, "time_per_iteration": 2.664991617202759 }, { "auxiliary_loss_clip": 0.01164478, "auxiliary_loss_mlp": 0.01030278, "balance_loss_clip": 1.04785013, "balance_loss_mlp": 1.02236903, "epoch": 0.6879095773462394, "flos": 18661303745280.0, "grad_norm": 3.220136223364343, "language_loss": 0.77533096, "learning_rate": 9.374910056620791e-07, "loss": 0.79727852, "num_input_tokens_seen": 123034955, "step": 5721, "time_per_iteration": 2.6766884326934814 }, { "auxiliary_loss_clip": 0.01167369, "auxiliary_loss_mlp": 0.01023326, "balance_loss_clip": 1.04849255, "balance_loss_mlp": 1.01514792, "epoch": 0.6880298202368785, "flos": 20883132437760.0, "grad_norm": 10.652982551592059, "language_loss": 0.80951554, "learning_rate": 9.368311319188293e-07, "loss": 0.83142251, "num_input_tokens_seen": 123052770, "step": 5722, "time_per_iteration": 2.658966064453125 }, { "auxiliary_loss_clip": 0.01157646, "auxiliary_loss_mlp": 0.01022015, "balance_loss_clip": 1.04615855, "balance_loss_mlp": 1.01487172, "epoch": 0.6881500631275176, "flos": 30153292318080.0, "grad_norm": 2.028822118101658, "language_loss": 0.79093206, "learning_rate": 9.361714194574515e-07, "loss": 0.81272876, "num_input_tokens_seen": 123075105, "step": 5723, "time_per_iteration": 3.7743539810180664 }, { "auxiliary_loss_clip": 0.01065568, "auxiliary_loss_mlp": 0.01003145, "balance_loss_clip": 1.01010609, "balance_loss_mlp": 1.00220895, "epoch": 0.6882703060181566, "flos": 66181537215360.0, "grad_norm": 0.7425094792717394, "language_loss": 0.58322126, "learning_rate": 9.355118683780228e-07, "loss": 0.60390836, "num_input_tokens_seen": 123145175, "step": 5724, "time_per_iteration": 3.3294906616210938 }, { "auxiliary_loss_clip": 0.01166734, "auxiliary_loss_mlp": 0.01023864, "balance_loss_clip": 1.04668033, "balance_loss_mlp": 1.01648533, "epoch": 0.6883905489087958, "flos": 18214646123520.0, "grad_norm": 2.0471814426312207, "language_loss": 0.78845751, "learning_rate": 9.348524787805987e-07, "loss": 0.81036353, "num_input_tokens_seen": 123160365, "step": 5725, "time_per_iteration": 2.68906569480896 }, { "auxiliary_loss_clip": 0.01162048, "auxiliary_loss_mlp": 0.01025169, "balance_loss_clip": 1.04640412, "balance_loss_mlp": 1.01760483, "epoch": 0.6885107917994349, "flos": 14056262553600.0, "grad_norm": 2.902369657638406, "language_loss": 0.84966189, "learning_rate": 9.341932507652053e-07, "loss": 0.87153411, "num_input_tokens_seen": 123174855, "step": 5726, "time_per_iteration": 2.699774980545044 }, { "auxiliary_loss_clip": 0.01162925, "auxiliary_loss_mlp": 0.01023611, "balance_loss_clip": 1.04615855, "balance_loss_mlp": 1.01634538, "epoch": 0.6886310346900739, "flos": 28690722334080.0, "grad_norm": 2.080743698138573, "language_loss": 0.78405428, "learning_rate": 9.335341844318489e-07, "loss": 0.80591971, "num_input_tokens_seen": 123194995, "step": 5727, "time_per_iteration": 2.7750096321105957 }, { "auxiliary_loss_clip": 0.01160713, "auxiliary_loss_mlp": 0.0102965, "balance_loss_clip": 1.04673803, "balance_loss_mlp": 1.02176976, "epoch": 0.6887512775807131, "flos": 24535319592960.0, "grad_norm": 1.6916338388883718, "language_loss": 0.73394763, "learning_rate": 9.328752798805091e-07, "loss": 0.75585127, "num_input_tokens_seen": 123213465, "step": 5728, "time_per_iteration": 2.7161192893981934 }, { "auxiliary_loss_clip": 0.0116726, "auxiliary_loss_mlp": 0.01026122, "balance_loss_clip": 1.04940343, "balance_loss_mlp": 1.01856375, "epoch": 0.6888715204713521, "flos": 22414363269120.0, "grad_norm": 1.9627975454097393, "language_loss": 0.75922942, "learning_rate": 9.322165372111399e-07, "loss": 0.78116322, "num_input_tokens_seen": 123231610, "step": 5729, "time_per_iteration": 2.6416661739349365 }, { "auxiliary_loss_clip": 0.01155089, "auxiliary_loss_mlp": 0.01024674, "balance_loss_clip": 1.0467627, "balance_loss_mlp": 1.01677608, "epoch": 0.6889917633619912, "flos": 22054323294720.0, "grad_norm": 5.564920337249547, "language_loss": 0.75839627, "learning_rate": 9.315579565236747e-07, "loss": 0.78019392, "num_input_tokens_seen": 123250715, "step": 5730, "time_per_iteration": 2.7564451694488525 }, { "auxiliary_loss_clip": 0.01157863, "auxiliary_loss_mlp": 0.01027017, "balance_loss_clip": 1.04759288, "balance_loss_mlp": 1.01944137, "epoch": 0.6891120062526304, "flos": 23949724164480.0, "grad_norm": 1.899431058651555, "language_loss": 0.74190116, "learning_rate": 9.308995379180162e-07, "loss": 0.7637499, "num_input_tokens_seen": 123270270, "step": 5731, "time_per_iteration": 2.75645112991333 }, { "auxiliary_loss_clip": 0.01066682, "auxiliary_loss_mlp": 0.01002969, "balance_loss_clip": 1.01048744, "balance_loss_mlp": 1.00202751, "epoch": 0.6892322491432694, "flos": 64117354337280.0, "grad_norm": 0.7396570327429228, "language_loss": 0.59452498, "learning_rate": 9.302412814940488e-07, "loss": 0.6152215, "num_input_tokens_seen": 123333045, "step": 5732, "time_per_iteration": 3.2709479331970215 }, { "auxiliary_loss_clip": 0.01164262, "auxiliary_loss_mlp": 0.01027771, "balance_loss_clip": 1.04881561, "balance_loss_mlp": 1.02030241, "epoch": 0.6893524920339085, "flos": 23002436736000.0, "grad_norm": 2.094387715199199, "language_loss": 0.70889699, "learning_rate": 9.295831873516276e-07, "loss": 0.73081732, "num_input_tokens_seen": 123352320, "step": 5733, "time_per_iteration": 2.6348683834075928 }, { "auxiliary_loss_clip": 0.01168045, "auxiliary_loss_mlp": 0.01023766, "balance_loss_clip": 1.0481441, "balance_loss_mlp": 1.01618457, "epoch": 0.6894727349245476, "flos": 21396260177280.0, "grad_norm": 1.7440186244417666, "language_loss": 0.76293445, "learning_rate": 9.289252555905873e-07, "loss": 0.78485256, "num_input_tokens_seen": 123372400, "step": 5734, "time_per_iteration": 2.5228567123413086 }, { "auxiliary_loss_clip": 0.01167418, "auxiliary_loss_mlp": 0.01030732, "balance_loss_clip": 1.05143666, "balance_loss_mlp": 1.02275658, "epoch": 0.6895929778151867, "flos": 19865316654720.0, "grad_norm": 2.085902395788679, "language_loss": 0.76142073, "learning_rate": 9.282674863107334e-07, "loss": 0.7834022, "num_input_tokens_seen": 123390215, "step": 5735, "time_per_iteration": 2.5674707889556885 }, { "auxiliary_loss_clip": 0.01164351, "auxiliary_loss_mlp": 0.01020854, "balance_loss_clip": 1.04862094, "balance_loss_mlp": 1.01339173, "epoch": 0.6897132207058257, "flos": 18179166464640.0, "grad_norm": 2.5503470715353647, "language_loss": 0.76042271, "learning_rate": 9.276098796118488e-07, "loss": 0.78227472, "num_input_tokens_seen": 123406700, "step": 5736, "time_per_iteration": 2.5270631313323975 }, { "auxiliary_loss_clip": 0.01163402, "auxiliary_loss_mlp": 0.01025154, "balance_loss_clip": 1.04846168, "balance_loss_mlp": 1.01782608, "epoch": 0.6898334635964649, "flos": 32561641359360.0, "grad_norm": 2.232013186032798, "language_loss": 0.66130722, "learning_rate": 9.269524355936938e-07, "loss": 0.68319273, "num_input_tokens_seen": 123429880, "step": 5737, "time_per_iteration": 2.6356348991394043 }, { "auxiliary_loss_clip": 0.011566, "auxiliary_loss_mlp": 0.01024746, "balance_loss_clip": 1.04569471, "balance_loss_mlp": 1.01806104, "epoch": 0.689953706487104, "flos": 22819004956800.0, "grad_norm": 1.92142593777282, "language_loss": 0.84922755, "learning_rate": 9.262951543560002e-07, "loss": 0.871041, "num_input_tokens_seen": 123449105, "step": 5738, "time_per_iteration": 2.5417449474334717 }, { "auxiliary_loss_clip": 0.01161468, "auxiliary_loss_mlp": 0.01028918, "balance_loss_clip": 1.04993439, "balance_loss_mlp": 1.02066839, "epoch": 0.690073949377743, "flos": 18515362786560.0, "grad_norm": 2.1569904084853326, "language_loss": 0.86129707, "learning_rate": 9.256380359984795e-07, "loss": 0.883201, "num_input_tokens_seen": 123466215, "step": 5739, "time_per_iteration": 3.403015613555908 }, { "auxiliary_loss_clip": 0.01162597, "auxiliary_loss_mlp": 0.01027351, "balance_loss_clip": 1.04563141, "balance_loss_mlp": 1.01967382, "epoch": 0.6901941922683821, "flos": 34857194716800.0, "grad_norm": 1.8906788150975744, "language_loss": 0.74714476, "learning_rate": 9.249810806208139e-07, "loss": 0.76904416, "num_input_tokens_seen": 123485480, "step": 5740, "time_per_iteration": 2.9168243408203125 }, { "auxiliary_loss_clip": 0.01155249, "auxiliary_loss_mlp": 0.01055244, "balance_loss_clip": 1.04537988, "balance_loss_mlp": 1.01846552, "epoch": 0.6903144351590212, "flos": 16253672976000.0, "grad_norm": 1.9624651976759648, "language_loss": 0.80549085, "learning_rate": 9.243242883226627e-07, "loss": 0.82759583, "num_input_tokens_seen": 123504575, "step": 5741, "time_per_iteration": 2.8053009510040283 }, { "auxiliary_loss_clip": 0.01167449, "auxiliary_loss_mlp": 0.01024872, "balance_loss_clip": 1.04517627, "balance_loss_mlp": 1.01692033, "epoch": 0.6904346780496603, "flos": 28035137255040.0, "grad_norm": 1.8328356268320913, "language_loss": 0.69801283, "learning_rate": 9.236676592036628e-07, "loss": 0.71993607, "num_input_tokens_seen": 123524250, "step": 5742, "time_per_iteration": 4.767265558242798 }, { "auxiliary_loss_clip": 0.01162628, "auxiliary_loss_mlp": 0.01025863, "balance_loss_clip": 1.05183589, "balance_loss_mlp": 1.01817989, "epoch": 0.6905549209402994, "flos": 23624266008960.0, "grad_norm": 1.9230933495574614, "language_loss": 0.73386437, "learning_rate": 9.230111933634228e-07, "loss": 0.75574929, "num_input_tokens_seen": 123545845, "step": 5743, "time_per_iteration": 2.8292083740234375 }, { "auxiliary_loss_clip": 0.01168785, "auxiliary_loss_mlp": 0.0102817, "balance_loss_clip": 1.0507853, "balance_loss_mlp": 1.02000368, "epoch": 0.6906751638309385, "flos": 23114945111040.0, "grad_norm": 1.5849204675328572, "language_loss": 0.8094762, "learning_rate": 9.223548909015288e-07, "loss": 0.83144569, "num_input_tokens_seen": 123567535, "step": 5744, "time_per_iteration": 2.7919793128967285 }, { "auxiliary_loss_clip": 0.01149518, "auxiliary_loss_mlp": 0.01023653, "balance_loss_clip": 1.04641616, "balance_loss_mlp": 1.01700997, "epoch": 0.6907954067215776, "flos": 27305468375040.0, "grad_norm": 1.8879539515932635, "language_loss": 0.71890485, "learning_rate": 9.216987519175407e-07, "loss": 0.74063653, "num_input_tokens_seen": 123587710, "step": 5745, "time_per_iteration": 2.757371425628662 }, { "auxiliary_loss_clip": 0.01162278, "auxiliary_loss_mlp": 0.01024623, "balance_loss_clip": 1.04796529, "balance_loss_mlp": 1.01733601, "epoch": 0.6909156496122166, "flos": 21689399070720.0, "grad_norm": 2.0882213095732403, "language_loss": 0.68502307, "learning_rate": 9.210427765109942e-07, "loss": 0.70689207, "num_input_tokens_seen": 123607385, "step": 5746, "time_per_iteration": 2.58207106590271 }, { "auxiliary_loss_clip": 0.01165346, "auxiliary_loss_mlp": 0.01024258, "balance_loss_clip": 1.04701257, "balance_loss_mlp": 1.01620555, "epoch": 0.6910358925028558, "flos": 22561453463040.0, "grad_norm": 3.247629792628385, "language_loss": 0.81341243, "learning_rate": 9.20386964781402e-07, "loss": 0.83530843, "num_input_tokens_seen": 123625405, "step": 5747, "time_per_iteration": 2.682466506958008 }, { "auxiliary_loss_clip": 0.01161623, "auxiliary_loss_mlp": 0.01024924, "balance_loss_clip": 1.04904723, "balance_loss_mlp": 1.01761627, "epoch": 0.6911561353934949, "flos": 22054107813120.0, "grad_norm": 2.2053118023561975, "language_loss": 0.84332943, "learning_rate": 9.197313168282472e-07, "loss": 0.86519492, "num_input_tokens_seen": 123642850, "step": 5748, "time_per_iteration": 2.6795639991760254 }, { "auxiliary_loss_clip": 0.01162008, "auxiliary_loss_mlp": 0.01025329, "balance_loss_clip": 1.04718328, "balance_loss_mlp": 1.0176636, "epoch": 0.6912763782841339, "flos": 24206557386240.0, "grad_norm": 2.9132826739483764, "language_loss": 0.72496659, "learning_rate": 9.190758327509935e-07, "loss": 0.74683994, "num_input_tokens_seen": 123661595, "step": 5749, "time_per_iteration": 3.696136236190796 }, { "auxiliary_loss_clip": 0.0106713, "auxiliary_loss_mlp": 0.01040593, "balance_loss_clip": 1.01130402, "balance_loss_mlp": 1.00387502, "epoch": 0.6913966211747731, "flos": 52329641091840.0, "grad_norm": 0.9278530872917576, "language_loss": 0.64410973, "learning_rate": 9.184205126490767e-07, "loss": 0.66518688, "num_input_tokens_seen": 123710490, "step": 5750, "time_per_iteration": 3.144028663635254 }, { "auxiliary_loss_clip": 0.01066429, "auxiliary_loss_mlp": 0.01036979, "balance_loss_clip": 1.01106071, "balance_loss_mlp": 0.99987757, "epoch": 0.6915168640654121, "flos": 66741274851840.0, "grad_norm": 1.0947128246159503, "language_loss": 0.59615493, "learning_rate": 9.177653566219075e-07, "loss": 0.61718893, "num_input_tokens_seen": 123765215, "step": 5751, "time_per_iteration": 3.2728219032287598 }, { "auxiliary_loss_clip": 0.01163276, "auxiliary_loss_mlp": 0.01022918, "balance_loss_clip": 1.04729486, "balance_loss_mlp": 1.01578617, "epoch": 0.6916371069560512, "flos": 18296523175680.0, "grad_norm": 2.347980936615004, "language_loss": 0.76113403, "learning_rate": 9.171103647688744e-07, "loss": 0.782996, "num_input_tokens_seen": 123783955, "step": 5752, "time_per_iteration": 2.817487955093384 }, { "auxiliary_loss_clip": 0.01145392, "auxiliary_loss_mlp": 0.0103216, "balance_loss_clip": 1.04834187, "balance_loss_mlp": 1.02501082, "epoch": 0.6917573498466904, "flos": 19645794685440.0, "grad_norm": 2.1829175101591085, "language_loss": 0.68912691, "learning_rate": 9.164555371893367e-07, "loss": 0.71090245, "num_input_tokens_seen": 123803885, "step": 5753, "time_per_iteration": 2.804483413696289 }, { "auxiliary_loss_clip": 0.01166163, "auxiliary_loss_mlp": 0.01057389, "balance_loss_clip": 1.04964972, "balance_loss_mlp": 1.01969254, "epoch": 0.6918775927373294, "flos": 14210319985920.0, "grad_norm": 1.7727614336509934, "language_loss": 0.75529313, "learning_rate": 9.158008739826333e-07, "loss": 0.77752864, "num_input_tokens_seen": 123821485, "step": 5754, "time_per_iteration": 2.7815465927124023 }, { "auxiliary_loss_clip": 0.01160221, "auxiliary_loss_mlp": 0.01025301, "balance_loss_clip": 1.04832518, "balance_loss_mlp": 1.01764774, "epoch": 0.6919978356279685, "flos": 23985455218560.0, "grad_norm": 1.5983717455381241, "language_loss": 0.86743557, "learning_rate": 9.151463752480744e-07, "loss": 0.88929081, "num_input_tokens_seen": 123840215, "step": 5755, "time_per_iteration": 2.731173276901245 }, { "auxiliary_loss_clip": 0.01150743, "auxiliary_loss_mlp": 0.01023922, "balance_loss_clip": 1.04667473, "balance_loss_mlp": 1.01660228, "epoch": 0.6921180785186076, "flos": 23622937205760.0, "grad_norm": 1.428833051275655, "language_loss": 0.8045072, "learning_rate": 9.144920410849493e-07, "loss": 0.82625377, "num_input_tokens_seen": 123861450, "step": 5756, "time_per_iteration": 2.777662515640259 }, { "auxiliary_loss_clip": 0.01166348, "auxiliary_loss_mlp": 0.01026866, "balance_loss_clip": 1.04808021, "balance_loss_mlp": 1.01927805, "epoch": 0.6922383214092467, "flos": 21142623265920.0, "grad_norm": 1.7485098444630642, "language_loss": 0.81040978, "learning_rate": 9.138378715925176e-07, "loss": 0.83234191, "num_input_tokens_seen": 123880545, "step": 5757, "time_per_iteration": 2.692199945449829 }, { "auxiliary_loss_clip": 0.01158643, "auxiliary_loss_mlp": 0.01023065, "balance_loss_clip": 1.04823613, "balance_loss_mlp": 1.01563251, "epoch": 0.6923585642998857, "flos": 21470667200640.0, "grad_norm": 1.6108594806167842, "language_loss": 0.80886507, "learning_rate": 9.131838668700167e-07, "loss": 0.83068216, "num_input_tokens_seen": 123900615, "step": 5758, "time_per_iteration": 2.6871511936187744 }, { "auxiliary_loss_clip": 0.01160178, "auxiliary_loss_mlp": 0.01028193, "balance_loss_clip": 1.04768157, "balance_loss_mlp": 1.02130246, "epoch": 0.6924788071905249, "flos": 21105204272640.0, "grad_norm": 1.726551872331716, "language_loss": 0.86649716, "learning_rate": 9.125300270166598e-07, "loss": 0.88838089, "num_input_tokens_seen": 123921220, "step": 5759, "time_per_iteration": 2.7863852977752686 }, { "auxiliary_loss_clip": 0.01167123, "auxiliary_loss_mlp": 0.01020281, "balance_loss_clip": 1.04670131, "balance_loss_mlp": 1.01353335, "epoch": 0.692599050081164, "flos": 26250018117120.0, "grad_norm": 1.8303143647332156, "language_loss": 0.85731936, "learning_rate": 9.118763521316324e-07, "loss": 0.87919343, "num_input_tokens_seen": 123941795, "step": 5760, "time_per_iteration": 2.80572772026062 }, { "auxiliary_loss_clip": 0.01169226, "auxiliary_loss_mlp": 0.01052707, "balance_loss_clip": 1.04727054, "balance_loss_mlp": 1.01689553, "epoch": 0.692719292971803, "flos": 20885215426560.0, "grad_norm": 2.1055318665510203, "language_loss": 0.7592237, "learning_rate": 9.112228423140987e-07, "loss": 0.78144306, "num_input_tokens_seen": 123960715, "step": 5761, "time_per_iteration": 2.6753790378570557 }, { "auxiliary_loss_clip": 0.01166632, "auxiliary_loss_mlp": 0.01025623, "balance_loss_clip": 1.04746151, "balance_loss_mlp": 1.01785016, "epoch": 0.6928395358624422, "flos": 25921938268800.0, "grad_norm": 2.308365048899119, "language_loss": 0.86489993, "learning_rate": 9.105694976631932e-07, "loss": 0.88682246, "num_input_tokens_seen": 123978625, "step": 5762, "time_per_iteration": 2.7585108280181885 }, { "auxiliary_loss_clip": 0.01168134, "auxiliary_loss_mlp": 0.01031043, "balance_loss_clip": 1.04897285, "balance_loss_mlp": 1.02348483, "epoch": 0.6929597787530812, "flos": 23586559706880.0, "grad_norm": 3.075050025135146, "language_loss": 0.72229356, "learning_rate": 9.099163182780283e-07, "loss": 0.74428535, "num_input_tokens_seen": 123996780, "step": 5763, "time_per_iteration": 2.7917113304138184 }, { "auxiliary_loss_clip": 0.01158704, "auxiliary_loss_mlp": 0.01025971, "balance_loss_clip": 1.0454495, "balance_loss_mlp": 1.01806748, "epoch": 0.6930800216437203, "flos": 18255656476800.0, "grad_norm": 2.719874229920475, "language_loss": 0.49704295, "learning_rate": 9.092633042576916e-07, "loss": 0.51888973, "num_input_tokens_seen": 124014045, "step": 5764, "time_per_iteration": 2.738121271133423 }, { "auxiliary_loss_clip": 0.01159242, "auxiliary_loss_mlp": 0.01023057, "balance_loss_clip": 1.04773283, "balance_loss_mlp": 1.01579475, "epoch": 0.6932002645343595, "flos": 29168621809920.0, "grad_norm": 1.8343299175703098, "language_loss": 0.56468391, "learning_rate": 9.086104557012446e-07, "loss": 0.5865069, "num_input_tokens_seen": 124034615, "step": 5765, "time_per_iteration": 2.7821578979492188 }, { "auxiliary_loss_clip": 0.01157297, "auxiliary_loss_mlp": 0.01022199, "balance_loss_clip": 1.04658628, "balance_loss_mlp": 1.01521671, "epoch": 0.6933205074249985, "flos": 23842746483840.0, "grad_norm": 2.168681790221974, "language_loss": 0.66041017, "learning_rate": 9.079577727077239e-07, "loss": 0.68220508, "num_input_tokens_seen": 124053445, "step": 5766, "time_per_iteration": 3.7058701515197754 }, { "auxiliary_loss_clip": 0.01166607, "auxiliary_loss_mlp": 0.01026836, "balance_loss_clip": 1.0488708, "balance_loss_mlp": 1.01856256, "epoch": 0.6934407503156376, "flos": 24166696268160.0, "grad_norm": 2.4764800614809452, "language_loss": 0.71907032, "learning_rate": 9.073052553761404e-07, "loss": 0.74100477, "num_input_tokens_seen": 124072810, "step": 5767, "time_per_iteration": 2.683364152908325 }, { "auxiliary_loss_clip": 0.011616, "auxiliary_loss_mlp": 0.01029629, "balance_loss_clip": 1.04910517, "balance_loss_mlp": 1.02142167, "epoch": 0.6935609932062767, "flos": 20631327120000.0, "grad_norm": 2.1591743062276847, "language_loss": 0.78131974, "learning_rate": 9.066529038054805e-07, "loss": 0.80323207, "num_input_tokens_seen": 124092875, "step": 5768, "time_per_iteration": 4.6257336139678955 }, { "auxiliary_loss_clip": 0.01161665, "auxiliary_loss_mlp": 0.010247, "balance_loss_clip": 1.04891944, "balance_loss_mlp": 1.01790547, "epoch": 0.6936812360969158, "flos": 18254184019200.0, "grad_norm": 1.7709784422559371, "language_loss": 0.748191, "learning_rate": 9.060007180947071e-07, "loss": 0.77005464, "num_input_tokens_seen": 124110930, "step": 5769, "time_per_iteration": 2.871708393096924 }, { "auxiliary_loss_clip": 0.01160868, "auxiliary_loss_mlp": 0.01027041, "balance_loss_clip": 1.04754114, "balance_loss_mlp": 1.01947761, "epoch": 0.6938014789875548, "flos": 31317336368640.0, "grad_norm": 1.7588368983533949, "language_loss": 0.73302042, "learning_rate": 9.053486983427534e-07, "loss": 0.7548995, "num_input_tokens_seen": 124132180, "step": 5770, "time_per_iteration": 2.8811004161834717 }, { "auxiliary_loss_clip": 0.011636, "auxiliary_loss_mlp": 0.01025394, "balance_loss_clip": 1.04456949, "balance_loss_mlp": 1.01820612, "epoch": 0.6939217218781939, "flos": 17528429721600.0, "grad_norm": 2.263071662676485, "language_loss": 0.70550227, "learning_rate": 9.046968446485326e-07, "loss": 0.7273922, "num_input_tokens_seen": 124150585, "step": 5771, "time_per_iteration": 2.794271469116211 }, { "auxiliary_loss_clip": 0.01168855, "auxiliary_loss_mlp": 0.01025019, "balance_loss_clip": 1.04846299, "balance_loss_mlp": 1.01691318, "epoch": 0.6940419647688331, "flos": 18551776199040.0, "grad_norm": 2.8283560905690512, "language_loss": 0.70540714, "learning_rate": 9.040451571109295e-07, "loss": 0.72734594, "num_input_tokens_seen": 124166205, "step": 5772, "time_per_iteration": 2.677757740020752 }, { "auxiliary_loss_clip": 0.01065766, "auxiliary_loss_mlp": 0.01001318, "balance_loss_clip": 1.01686621, "balance_loss_mlp": 1.00025749, "epoch": 0.6941622076594721, "flos": 66926286829440.0, "grad_norm": 0.8272746480145213, "language_loss": 0.60351139, "learning_rate": 9.033936358288042e-07, "loss": 0.62418222, "num_input_tokens_seen": 124219940, "step": 5773, "time_per_iteration": 3.214081287384033 }, { "auxiliary_loss_clip": 0.01172169, "auxiliary_loss_mlp": 0.01027296, "balance_loss_clip": 1.04886031, "balance_loss_mlp": 1.0199945, "epoch": 0.6942824505501112, "flos": 26578062051840.0, "grad_norm": 2.5741565398399677, "language_loss": 0.82520545, "learning_rate": 9.027422809009937e-07, "loss": 0.84720004, "num_input_tokens_seen": 124239885, "step": 5774, "time_per_iteration": 3.5863101482391357 }, { "auxiliary_loss_clip": 0.01169589, "auxiliary_loss_mlp": 0.01024849, "balance_loss_clip": 1.04730844, "balance_loss_mlp": 1.01705825, "epoch": 0.6944026934407503, "flos": 21248308056960.0, "grad_norm": 1.656327274447449, "language_loss": 0.83156049, "learning_rate": 9.020910924263054e-07, "loss": 0.8535049, "num_input_tokens_seen": 124258410, "step": 5775, "time_per_iteration": 2.655277967453003 }, { "auxiliary_loss_clip": 0.01064729, "auxiliary_loss_mlp": 0.01001772, "balance_loss_clip": 1.01785648, "balance_loss_mlp": 1.00078225, "epoch": 0.6945229363313894, "flos": 70677191537280.0, "grad_norm": 0.8179311063642926, "language_loss": 0.58184373, "learning_rate": 9.014400705035261e-07, "loss": 0.60250866, "num_input_tokens_seen": 124315315, "step": 5776, "time_per_iteration": 3.3351635932922363 }, { "auxiliary_loss_clip": 0.01169117, "auxiliary_loss_mlp": 0.01021544, "balance_loss_clip": 1.04916167, "balance_loss_mlp": 1.01405776, "epoch": 0.6946431792220285, "flos": 18952934267520.0, "grad_norm": 2.2489714300428028, "language_loss": 0.76243293, "learning_rate": 9.00789215231414e-07, "loss": 0.78433955, "num_input_tokens_seen": 124333710, "step": 5777, "time_per_iteration": 2.6472606658935547 }, { "auxiliary_loss_clip": 0.01161695, "auxiliary_loss_mlp": 0.01056445, "balance_loss_clip": 1.04788566, "balance_loss_mlp": 1.01794124, "epoch": 0.6947634221126676, "flos": 20338834671360.0, "grad_norm": 3.1125562345691753, "language_loss": 0.82137895, "learning_rate": 9.001385267087056e-07, "loss": 0.84356034, "num_input_tokens_seen": 124352855, "step": 5778, "time_per_iteration": 2.762968063354492 }, { "auxiliary_loss_clip": 0.01167223, "auxiliary_loss_mlp": 0.01028148, "balance_loss_clip": 1.04818249, "balance_loss_mlp": 1.0209949, "epoch": 0.6948836650033067, "flos": 21833723917440.0, "grad_norm": 1.502899968856973, "language_loss": 0.70235205, "learning_rate": 8.994880050341072e-07, "loss": 0.72430575, "num_input_tokens_seen": 124372960, "step": 5779, "time_per_iteration": 2.7635109424591064 }, { "auxiliary_loss_clip": 0.0116214, "auxiliary_loss_mlp": 0.01031426, "balance_loss_clip": 1.0520916, "balance_loss_mlp": 1.02391851, "epoch": 0.6950039078939457, "flos": 23657519024640.0, "grad_norm": 1.8003985778733784, "language_loss": 0.77681643, "learning_rate": 8.988376503063026e-07, "loss": 0.79875207, "num_input_tokens_seen": 124394220, "step": 5780, "time_per_iteration": 2.8928537368774414 }, { "auxiliary_loss_clip": 0.01165115, "auxiliary_loss_mlp": 0.01022577, "balance_loss_clip": 1.0484941, "balance_loss_mlp": 1.01479864, "epoch": 0.6951241507845849, "flos": 21792462168960.0, "grad_norm": 1.8087065903232078, "language_loss": 0.81237721, "learning_rate": 8.981874626239521e-07, "loss": 0.83425415, "num_input_tokens_seen": 124412795, "step": 5781, "time_per_iteration": 2.8314223289489746 }, { "auxiliary_loss_clip": 0.01166359, "auxiliary_loss_mlp": 0.01023399, "balance_loss_clip": 1.04894972, "balance_loss_mlp": 1.01604974, "epoch": 0.695244393675224, "flos": 14647568244480.0, "grad_norm": 2.1189207450020016, "language_loss": 0.88296318, "learning_rate": 8.975374420856872e-07, "loss": 0.90486079, "num_input_tokens_seen": 124429690, "step": 5782, "time_per_iteration": 2.6660444736480713 }, { "auxiliary_loss_clip": 0.0115074, "auxiliary_loss_mlp": 0.0102403, "balance_loss_clip": 1.0461843, "balance_loss_mlp": 1.01715803, "epoch": 0.695364636565863, "flos": 16873203778560.0, "grad_norm": 2.4547538063583385, "language_loss": 0.7258805, "learning_rate": 8.968875887901157e-07, "loss": 0.74762821, "num_input_tokens_seen": 124447070, "step": 5783, "time_per_iteration": 2.732976198196411 }, { "auxiliary_loss_clip": 0.01167054, "auxiliary_loss_mlp": 0.01027891, "balance_loss_clip": 1.0501833, "balance_loss_mlp": 1.02061343, "epoch": 0.6954848794565022, "flos": 19354523299200.0, "grad_norm": 3.4305361655394964, "language_loss": 0.62715995, "learning_rate": 8.9623790283582e-07, "loss": 0.64910948, "num_input_tokens_seen": 124464950, "step": 5784, "time_per_iteration": 2.7298898696899414 }, { "auxiliary_loss_clip": 0.01163864, "auxiliary_loss_mlp": 0.01026747, "balance_loss_clip": 1.05008817, "balance_loss_mlp": 1.01874828, "epoch": 0.6956051223471412, "flos": 18990209606400.0, "grad_norm": 2.1082927754012255, "language_loss": 0.76428366, "learning_rate": 8.955883843213561e-07, "loss": 0.78618979, "num_input_tokens_seen": 124483965, "step": 5785, "time_per_iteration": 2.787806510925293 }, { "auxiliary_loss_clip": 0.0117146, "auxiliary_loss_mlp": 0.01034131, "balance_loss_clip": 1.0469079, "balance_loss_mlp": 1.02642965, "epoch": 0.6957253652377803, "flos": 16107229226880.0, "grad_norm": 2.128168057026924, "language_loss": 0.87049329, "learning_rate": 8.949390333452569e-07, "loss": 0.89254922, "num_input_tokens_seen": 124501910, "step": 5786, "time_per_iteration": 2.6858482360839844 }, { "auxiliary_loss_clip": 0.01167688, "auxiliary_loss_mlp": 0.0101992, "balance_loss_clip": 1.04859805, "balance_loss_mlp": 1.01293421, "epoch": 0.6958456081284194, "flos": 29388646569600.0, "grad_norm": 1.895178730791949, "language_loss": 0.67889911, "learning_rate": 8.942898500060279e-07, "loss": 0.70077527, "num_input_tokens_seen": 124521625, "step": 5787, "time_per_iteration": 2.692856550216675 }, { "auxiliary_loss_clip": 0.01164652, "auxiliary_loss_mlp": 0.01024438, "balance_loss_clip": 1.04925203, "balance_loss_mlp": 1.01657057, "epoch": 0.6959658510190585, "flos": 25154850395520.0, "grad_norm": 3.052918126690615, "language_loss": 0.71372193, "learning_rate": 8.936408344021493e-07, "loss": 0.73561287, "num_input_tokens_seen": 124538540, "step": 5788, "time_per_iteration": 2.734135150909424 }, { "auxiliary_loss_clip": 0.01171521, "auxiliary_loss_mlp": 0.01025908, "balance_loss_clip": 1.05120301, "balance_loss_mlp": 1.01782608, "epoch": 0.6960860939096976, "flos": 42814388759040.0, "grad_norm": 1.9473403075376168, "language_loss": 0.70988739, "learning_rate": 8.929919866320765e-07, "loss": 0.73186171, "num_input_tokens_seen": 124559355, "step": 5789, "time_per_iteration": 2.857548475265503 }, { "auxiliary_loss_clip": 0.01170169, "auxiliary_loss_mlp": 0.01061614, "balance_loss_clip": 1.05182886, "balance_loss_mlp": 1.02395129, "epoch": 0.6962063368003367, "flos": 17566566986880.0, "grad_norm": 1.934868102952417, "language_loss": 0.81489837, "learning_rate": 8.923433067942385e-07, "loss": 0.8372162, "num_input_tokens_seen": 124577920, "step": 5790, "time_per_iteration": 2.7987847328186035 }, { "auxiliary_loss_clip": 0.0116836, "auxiliary_loss_mlp": 0.0102291, "balance_loss_clip": 1.0504452, "balance_loss_mlp": 1.0157696, "epoch": 0.6963265796909758, "flos": 21251648021760.0, "grad_norm": 2.100767481043307, "language_loss": 0.68743408, "learning_rate": 8.916947949870417e-07, "loss": 0.70934677, "num_input_tokens_seen": 124597585, "step": 5791, "time_per_iteration": 3.765824317932129 }, { "auxiliary_loss_clip": 0.01065488, "auxiliary_loss_mlp": 0.01001271, "balance_loss_clip": 1.01026368, "balance_loss_mlp": 1.00019264, "epoch": 0.6964468225816148, "flos": 68828295801600.0, "grad_norm": 0.7493017721520174, "language_loss": 0.58130926, "learning_rate": 8.910464513088615e-07, "loss": 0.60197681, "num_input_tokens_seen": 124661625, "step": 5792, "time_per_iteration": 3.3671796321868896 }, { "auxiliary_loss_clip": 0.01156584, "auxiliary_loss_mlp": 0.01027318, "balance_loss_clip": 1.04729033, "balance_loss_mlp": 1.01951551, "epoch": 0.696567065472254, "flos": 18950887192320.0, "grad_norm": 1.8828552471050843, "language_loss": 0.78421688, "learning_rate": 8.903982758580542e-07, "loss": 0.80605596, "num_input_tokens_seen": 124680565, "step": 5793, "time_per_iteration": 3.688758373260498 }, { "auxiliary_loss_clip": 0.01161226, "auxiliary_loss_mlp": 0.01020945, "balance_loss_clip": 1.04907084, "balance_loss_mlp": 1.01324463, "epoch": 0.696687308362893, "flos": 22856675345280.0, "grad_norm": 2.06389833060289, "language_loss": 0.80581677, "learning_rate": 8.897502687329457e-07, "loss": 0.82763857, "num_input_tokens_seen": 124700365, "step": 5794, "time_per_iteration": 3.6574454307556152 }, { "auxiliary_loss_clip": 0.01154689, "auxiliary_loss_mlp": 0.01026868, "balance_loss_clip": 1.04702878, "balance_loss_mlp": 1.01968861, "epoch": 0.6968075512535321, "flos": 24972926987520.0, "grad_norm": 1.8381802318854021, "language_loss": 0.79948306, "learning_rate": 8.891024300318382e-07, "loss": 0.82129872, "num_input_tokens_seen": 124718935, "step": 5795, "time_per_iteration": 2.851529836654663 }, { "auxiliary_loss_clip": 0.0115287, "auxiliary_loss_mlp": 0.01025459, "balance_loss_clip": 1.04850781, "balance_loss_mlp": 1.01806498, "epoch": 0.6969277941441713, "flos": 21030438113280.0, "grad_norm": 1.474560257626796, "language_loss": 0.75858253, "learning_rate": 8.884547598530103e-07, "loss": 0.78036582, "num_input_tokens_seen": 124739505, "step": 5796, "time_per_iteration": 2.775341033935547 }, { "auxiliary_loss_clip": 0.01142696, "auxiliary_loss_mlp": 0.01030106, "balance_loss_clip": 1.04882646, "balance_loss_mlp": 1.02286148, "epoch": 0.6970480370348103, "flos": 21579404647680.0, "grad_norm": 1.9037537226814214, "language_loss": 0.75006235, "learning_rate": 8.8780725829471e-07, "loss": 0.77179039, "num_input_tokens_seen": 124757410, "step": 5797, "time_per_iteration": 2.768317699432373 }, { "auxiliary_loss_clip": 0.01169086, "auxiliary_loss_mlp": 0.01024563, "balance_loss_clip": 1.04693222, "balance_loss_mlp": 1.01683247, "epoch": 0.6971682799254494, "flos": 22419175691520.0, "grad_norm": 1.990681351563173, "language_loss": 0.77641249, "learning_rate": 8.87159925455165e-07, "loss": 0.79834896, "num_input_tokens_seen": 124777240, "step": 5798, "time_per_iteration": 2.6840527057647705 }, { "auxiliary_loss_clip": 0.0115692, "auxiliary_loss_mlp": 0.01028794, "balance_loss_clip": 1.04810286, "balance_loss_mlp": 1.0212599, "epoch": 0.6972885228160886, "flos": 20005834659840.0, "grad_norm": 1.7933318473163788, "language_loss": 0.73706746, "learning_rate": 8.865127614325738e-07, "loss": 0.7589246, "num_input_tokens_seen": 124795670, "step": 5799, "time_per_iteration": 2.7130587100982666 }, { "auxiliary_loss_clip": 0.01162289, "auxiliary_loss_mlp": 0.0103137, "balance_loss_clip": 1.05193138, "balance_loss_mlp": 1.02270317, "epoch": 0.6974087657067276, "flos": 37853437656960.0, "grad_norm": 2.015676102683446, "language_loss": 0.66702896, "learning_rate": 8.85865766325113e-07, "loss": 0.68896556, "num_input_tokens_seen": 124819600, "step": 5800, "time_per_iteration": 2.853811025619507 }, { "auxiliary_loss_clip": 0.01161413, "auxiliary_loss_mlp": 0.01023871, "balance_loss_clip": 1.04896069, "balance_loss_mlp": 1.01625347, "epoch": 0.6975290085973667, "flos": 29489267543040.0, "grad_norm": 2.4368788897496843, "language_loss": 0.72733104, "learning_rate": 8.852189402309287e-07, "loss": 0.74918389, "num_input_tokens_seen": 124838785, "step": 5801, "time_per_iteration": 3.6846799850463867 }, { "auxiliary_loss_clip": 0.01167636, "auxiliary_loss_mlp": 0.01023795, "balance_loss_clip": 1.04959607, "balance_loss_mlp": 1.01605237, "epoch": 0.6976492514880057, "flos": 12895630295040.0, "grad_norm": 4.922666668993647, "language_loss": 0.74478924, "learning_rate": 8.845722832481441e-07, "loss": 0.76670355, "num_input_tokens_seen": 124854215, "step": 5802, "time_per_iteration": 2.8725926876068115 }, { "auxiliary_loss_clip": 0.01163623, "auxiliary_loss_mlp": 0.01028932, "balance_loss_clip": 1.04862666, "balance_loss_mlp": 1.02117133, "epoch": 0.6977694943786449, "flos": 24352929308160.0, "grad_norm": 1.8126332343011882, "language_loss": 0.77616477, "learning_rate": 8.83925795474858e-07, "loss": 0.79809034, "num_input_tokens_seen": 124874340, "step": 5803, "time_per_iteration": 2.7445664405822754 }, { "auxiliary_loss_clip": 0.01158195, "auxiliary_loss_mlp": 0.01027039, "balance_loss_clip": 1.04892254, "balance_loss_mlp": 1.01881957, "epoch": 0.6978897372692839, "flos": 29898470257920.0, "grad_norm": 2.126977112815076, "language_loss": 0.6000576, "learning_rate": 8.832794770091414e-07, "loss": 0.62190998, "num_input_tokens_seen": 124895175, "step": 5804, "time_per_iteration": 2.807117223739624 }, { "auxiliary_loss_clip": 0.01166643, "auxiliary_loss_mlp": 0.01027407, "balance_loss_clip": 1.04594254, "balance_loss_mlp": 1.0201143, "epoch": 0.698009980159923, "flos": 21761579450880.0, "grad_norm": 2.019837717688988, "language_loss": 0.82395077, "learning_rate": 8.826333279490401e-07, "loss": 0.84589124, "num_input_tokens_seen": 124915810, "step": 5805, "time_per_iteration": 2.7455286979675293 }, { "auxiliary_loss_clip": 0.01166843, "auxiliary_loss_mlp": 0.01023187, "balance_loss_clip": 1.04940343, "balance_loss_mlp": 1.01627851, "epoch": 0.6981302230505622, "flos": 19857164267520.0, "grad_norm": 2.3224517781301355, "language_loss": 0.68593746, "learning_rate": 8.819873483925748e-07, "loss": 0.70783776, "num_input_tokens_seen": 124932930, "step": 5806, "time_per_iteration": 2.660676956176758 }, { "auxiliary_loss_clip": 0.01162825, "auxiliary_loss_mlp": 0.01050983, "balance_loss_clip": 1.04854119, "balance_loss_mlp": 1.01559472, "epoch": 0.6982504659412012, "flos": 22198648141440.0, "grad_norm": 2.463259603519767, "language_loss": 0.74408627, "learning_rate": 8.81341538437739e-07, "loss": 0.76622427, "num_input_tokens_seen": 124951220, "step": 5807, "time_per_iteration": 2.798990488052368 }, { "auxiliary_loss_clip": 0.01165248, "auxiliary_loss_mlp": 0.01023485, "balance_loss_clip": 1.04642892, "balance_loss_mlp": 1.01564097, "epoch": 0.6983707088318403, "flos": 35588479708800.0, "grad_norm": 2.2278787936838995, "language_loss": 0.67828, "learning_rate": 8.80695898182503e-07, "loss": 0.7001673, "num_input_tokens_seen": 124972200, "step": 5808, "time_per_iteration": 2.8553690910339355 }, { "auxiliary_loss_clip": 0.01062808, "auxiliary_loss_mlp": 0.01000432, "balance_loss_clip": 1.01420212, "balance_loss_mlp": 0.99937707, "epoch": 0.6984909517224794, "flos": 65440052760960.0, "grad_norm": 0.8174507730707455, "language_loss": 0.65119654, "learning_rate": 8.800504277248093e-07, "loss": 0.67182899, "num_input_tokens_seen": 125036950, "step": 5809, "time_per_iteration": 3.2894511222839355 }, { "auxiliary_loss_clip": 0.01156533, "auxiliary_loss_mlp": 0.01061113, "balance_loss_clip": 1.05134463, "balance_loss_mlp": 1.02303934, "epoch": 0.6986111946131185, "flos": 18546927863040.0, "grad_norm": 1.8613916987583699, "language_loss": 0.75256437, "learning_rate": 8.794051271625753e-07, "loss": 0.77474082, "num_input_tokens_seen": 125054585, "step": 5810, "time_per_iteration": 2.744835138320923 }, { "auxiliary_loss_clip": 0.01157626, "auxiliary_loss_mlp": 0.01024133, "balance_loss_clip": 1.04443002, "balance_loss_mlp": 1.01727533, "epoch": 0.6987314375037575, "flos": 23039173370880.0, "grad_norm": 1.5884664413076044, "language_loss": 0.83252001, "learning_rate": 8.787599965936925e-07, "loss": 0.85433757, "num_input_tokens_seen": 125075515, "step": 5811, "time_per_iteration": 2.67539119720459 }, { "auxiliary_loss_clip": 0.01156022, "auxiliary_loss_mlp": 0.01029104, "balance_loss_clip": 1.05000055, "balance_loss_mlp": 1.02176332, "epoch": 0.6988516803943967, "flos": 38400393029760.0, "grad_norm": 1.6773205068692674, "language_loss": 0.716905, "learning_rate": 8.781150361160261e-07, "loss": 0.73875624, "num_input_tokens_seen": 125097425, "step": 5812, "time_per_iteration": 2.8514134883880615 }, { "auxiliary_loss_clip": 0.01166177, "auxiliary_loss_mlp": 0.01027214, "balance_loss_clip": 1.05088842, "balance_loss_mlp": 1.01952481, "epoch": 0.6989719232850358, "flos": 24096993926400.0, "grad_norm": 1.6435993068798989, "language_loss": 0.735277, "learning_rate": 8.774702458274181e-07, "loss": 0.75721097, "num_input_tokens_seen": 125117830, "step": 5813, "time_per_iteration": 2.723724126815796 }, { "auxiliary_loss_clip": 0.01164544, "auxiliary_loss_mlp": 0.01032037, "balance_loss_clip": 1.0473423, "balance_loss_mlp": 1.02351975, "epoch": 0.6990921661756748, "flos": 14866838818560.0, "grad_norm": 4.5797255969343125, "language_loss": 0.70988142, "learning_rate": 8.768256258256799e-07, "loss": 0.73184729, "num_input_tokens_seen": 125134455, "step": 5814, "time_per_iteration": 2.6793735027313232 }, { "auxiliary_loss_clip": 0.01170618, "auxiliary_loss_mlp": 0.01026332, "balance_loss_clip": 1.05038989, "balance_loss_mlp": 1.0188601, "epoch": 0.699212409066314, "flos": 20193719725440.0, "grad_norm": 1.9920469474563733, "language_loss": 0.73761743, "learning_rate": 8.76181176208602e-07, "loss": 0.75958699, "num_input_tokens_seen": 125152555, "step": 5815, "time_per_iteration": 2.6935153007507324 }, { "auxiliary_loss_clip": 0.01149558, "auxiliary_loss_mlp": 0.01025772, "balance_loss_clip": 1.04882479, "balance_loss_mlp": 1.01791072, "epoch": 0.699332651956953, "flos": 19427888828160.0, "grad_norm": 1.8384311380832559, "language_loss": 0.73938227, "learning_rate": 8.755368970739461e-07, "loss": 0.76113558, "num_input_tokens_seen": 125171915, "step": 5816, "time_per_iteration": 2.766953468322754 }, { "auxiliary_loss_clip": 0.01166454, "auxiliary_loss_mlp": 0.01024935, "balance_loss_clip": 1.04771936, "balance_loss_mlp": 1.01683497, "epoch": 0.6994528948475921, "flos": 16143714466560.0, "grad_norm": 2.1219867483288732, "language_loss": 0.61250293, "learning_rate": 8.748927885194479e-07, "loss": 0.63441688, "num_input_tokens_seen": 125190220, "step": 5817, "time_per_iteration": 3.636608600616455 }, { "auxiliary_loss_clip": 0.01060468, "auxiliary_loss_mlp": 0.01005302, "balance_loss_clip": 1.01402724, "balance_loss_mlp": 1.00421166, "epoch": 0.6995731377382313, "flos": 64952420699520.0, "grad_norm": 0.7909431006373966, "language_loss": 0.57366347, "learning_rate": 8.742488506428209e-07, "loss": 0.59432119, "num_input_tokens_seen": 125249310, "step": 5818, "time_per_iteration": 3.2605502605438232 }, { "auxiliary_loss_clip": 0.01166079, "auxiliary_loss_mlp": 0.01050468, "balance_loss_clip": 1.04811585, "balance_loss_mlp": 1.01429462, "epoch": 0.6996933806288703, "flos": 24900136076160.0, "grad_norm": 2.678729571058325, "language_loss": 0.78188163, "learning_rate": 8.736050835417466e-07, "loss": 0.80404705, "num_input_tokens_seen": 125269350, "step": 5819, "time_per_iteration": 3.7337615489959717 }, { "auxiliary_loss_clip": 0.01168722, "auxiliary_loss_mlp": 0.01023473, "balance_loss_clip": 1.04835081, "balance_loss_mlp": 1.01586413, "epoch": 0.6998136235195094, "flos": 20777806782720.0, "grad_norm": 2.3766043358638043, "language_loss": 0.61092764, "learning_rate": 8.729614873138862e-07, "loss": 0.63284957, "num_input_tokens_seen": 125286985, "step": 5820, "time_per_iteration": 3.5744495391845703 }, { "auxiliary_loss_clip": 0.01163833, "auxiliary_loss_mlp": 0.01023661, "balance_loss_clip": 1.05069304, "balance_loss_mlp": 1.01614165, "epoch": 0.6999338664101485, "flos": 23733470332800.0, "grad_norm": 2.1209222099027363, "language_loss": 0.77806813, "learning_rate": 8.723180620568716e-07, "loss": 0.79994303, "num_input_tokens_seen": 125306240, "step": 5821, "time_per_iteration": 2.76668381690979 }, { "auxiliary_loss_clip": 0.0116626, "auxiliary_loss_mlp": 0.01031531, "balance_loss_clip": 1.04753399, "balance_loss_mlp": 1.02426863, "epoch": 0.7000541093007876, "flos": 19864598382720.0, "grad_norm": 1.7476269688666495, "language_loss": 0.84690428, "learning_rate": 8.716748078683116e-07, "loss": 0.86888218, "num_input_tokens_seen": 125323015, "step": 5822, "time_per_iteration": 2.7335617542266846 }, { "auxiliary_loss_clip": 0.01146844, "auxiliary_loss_mlp": 0.01024888, "balance_loss_clip": 1.05060351, "balance_loss_mlp": 1.01682377, "epoch": 0.7001743521914267, "flos": 29679056029440.0, "grad_norm": 2.056861373604475, "language_loss": 0.6857425, "learning_rate": 8.710317248457855e-07, "loss": 0.70745981, "num_input_tokens_seen": 125342630, "step": 5823, "time_per_iteration": 2.8627853393554688 }, { "auxiliary_loss_clip": 0.01159461, "auxiliary_loss_mlp": 0.0102198, "balance_loss_clip": 1.04873455, "balance_loss_mlp": 1.01452899, "epoch": 0.7002945950820658, "flos": 27489762080640.0, "grad_norm": 1.8139742856069283, "language_loss": 0.72048771, "learning_rate": 8.703888130868482e-07, "loss": 0.74230218, "num_input_tokens_seen": 125364480, "step": 5824, "time_per_iteration": 2.8522136211395264 }, { "auxiliary_loss_clip": 0.01159637, "auxiliary_loss_mlp": 0.01023615, "balance_loss_clip": 1.04935014, "balance_loss_mlp": 1.01686168, "epoch": 0.7004148379727049, "flos": 22158463800960.0, "grad_norm": 2.1293805605049685, "language_loss": 0.82077777, "learning_rate": 8.697460726890307e-07, "loss": 0.84261036, "num_input_tokens_seen": 125381625, "step": 5825, "time_per_iteration": 2.8412084579467773 }, { "auxiliary_loss_clip": 0.01160799, "auxiliary_loss_mlp": 0.0105473, "balance_loss_clip": 1.04840302, "balance_loss_mlp": 1.01730669, "epoch": 0.7005350808633439, "flos": 19423758764160.0, "grad_norm": 1.9425737197901194, "language_loss": 0.90638602, "learning_rate": 8.691035037498354e-07, "loss": 0.9285413, "num_input_tokens_seen": 125397615, "step": 5826, "time_per_iteration": 3.6110739707946777 }, { "auxiliary_loss_clip": 0.01161811, "auxiliary_loss_mlp": 0.01025888, "balance_loss_clip": 1.04807162, "balance_loss_mlp": 1.01922107, "epoch": 0.7006553237539831, "flos": 23476708938240.0, "grad_norm": 1.626845829464665, "language_loss": 0.72186184, "learning_rate": 8.684611063667391e-07, "loss": 0.74373877, "num_input_tokens_seen": 125418080, "step": 5827, "time_per_iteration": 2.818324565887451 }, { "auxiliary_loss_clip": 0.0116496, "auxiliary_loss_mlp": 0.01023922, "balance_loss_clip": 1.04748726, "balance_loss_mlp": 1.01674271, "epoch": 0.7007755666446221, "flos": 31212872640000.0, "grad_norm": 2.1999311076893022, "language_loss": 0.76857507, "learning_rate": 8.678188806371935e-07, "loss": 0.79046386, "num_input_tokens_seen": 125440115, "step": 5828, "time_per_iteration": 2.8404757976531982 }, { "auxiliary_loss_clip": 0.01165694, "auxiliary_loss_mlp": 0.01023917, "balance_loss_clip": 1.04775739, "balance_loss_mlp": 1.01653838, "epoch": 0.7008958095352612, "flos": 18149899858560.0, "grad_norm": 2.00465964348474, "language_loss": 0.85610098, "learning_rate": 8.671768266586228e-07, "loss": 0.87799716, "num_input_tokens_seen": 125458240, "step": 5829, "time_per_iteration": 2.7092087268829346 }, { "auxiliary_loss_clip": 0.01159076, "auxiliary_loss_mlp": 0.01029881, "balance_loss_clip": 1.04944348, "balance_loss_mlp": 1.02242136, "epoch": 0.7010160524259004, "flos": 27452307173760.0, "grad_norm": 1.7336083321866864, "language_loss": 0.78383517, "learning_rate": 8.665349445284275e-07, "loss": 0.80572474, "num_input_tokens_seen": 125477980, "step": 5830, "time_per_iteration": 2.826519012451172 }, { "auxiliary_loss_clip": 0.01156858, "auxiliary_loss_mlp": 0.01024368, "balance_loss_clip": 1.04636478, "balance_loss_mlp": 1.01619017, "epoch": 0.7011362953165394, "flos": 23842064125440.0, "grad_norm": 1.5880880283425827, "language_loss": 0.81142831, "learning_rate": 8.658932343439799e-07, "loss": 0.83324057, "num_input_tokens_seen": 125497765, "step": 5831, "time_per_iteration": 2.8098645210266113 }, { "auxiliary_loss_clip": 0.0117198, "auxiliary_loss_mlp": 0.0102512, "balance_loss_clip": 1.05076909, "balance_loss_mlp": 1.01727033, "epoch": 0.7012565382071785, "flos": 24823430582400.0, "grad_norm": 2.005724554268181, "language_loss": 0.776232, "learning_rate": 8.65251696202627e-07, "loss": 0.79820305, "num_input_tokens_seen": 125514145, "step": 5832, "time_per_iteration": 2.691028594970703 }, { "auxiliary_loss_clip": 0.01161602, "auxiliary_loss_mlp": 0.0102953, "balance_loss_clip": 1.04889464, "balance_loss_mlp": 1.02168858, "epoch": 0.7013767810978175, "flos": 21397445326080.0, "grad_norm": 6.177984052187922, "language_loss": 0.87887776, "learning_rate": 8.646103302016896e-07, "loss": 0.90078902, "num_input_tokens_seen": 125533115, "step": 5833, "time_per_iteration": 2.732081651687622 }, { "auxiliary_loss_clip": 0.01169773, "auxiliary_loss_mlp": 0.01029344, "balance_loss_clip": 1.0506444, "balance_loss_mlp": 1.02133012, "epoch": 0.7014970239884567, "flos": 16687150306560.0, "grad_norm": 1.9919233693388267, "language_loss": 0.88732433, "learning_rate": 8.639691364384614e-07, "loss": 0.90931547, "num_input_tokens_seen": 125550740, "step": 5834, "time_per_iteration": 2.739361524581909 }, { "auxiliary_loss_clip": 0.01166441, "auxiliary_loss_mlp": 0.01029503, "balance_loss_clip": 1.04987478, "balance_loss_mlp": 1.02190304, "epoch": 0.7016172668790958, "flos": 12568268718720.0, "grad_norm": 1.9448748301477872, "language_loss": 0.73214149, "learning_rate": 8.633281150102136e-07, "loss": 0.75410098, "num_input_tokens_seen": 125567590, "step": 5835, "time_per_iteration": 2.6814327239990234 }, { "auxiliary_loss_clip": 0.01161309, "auxiliary_loss_mlp": 0.01025067, "balance_loss_clip": 1.04880691, "balance_loss_mlp": 1.01776552, "epoch": 0.7017375097697348, "flos": 17452729808640.0, "grad_norm": 2.1206162190547695, "language_loss": 0.67716193, "learning_rate": 8.626872660141855e-07, "loss": 0.69902569, "num_input_tokens_seen": 125585500, "step": 5836, "time_per_iteration": 2.686873435974121 }, { "auxiliary_loss_clip": 0.01157069, "auxiliary_loss_mlp": 0.01031061, "balance_loss_clip": 1.04896522, "balance_loss_mlp": 1.02379549, "epoch": 0.701857752660374, "flos": 18513028402560.0, "grad_norm": 1.8340502752339978, "language_loss": 0.74967539, "learning_rate": 8.620465895475957e-07, "loss": 0.77155674, "num_input_tokens_seen": 125603720, "step": 5837, "time_per_iteration": 2.7553346157073975 }, { "auxiliary_loss_clip": 0.01153688, "auxiliary_loss_mlp": 0.01024016, "balance_loss_clip": 1.04913712, "balance_loss_mlp": 1.01667845, "epoch": 0.701977995551013, "flos": 24425971614720.0, "grad_norm": 4.696489990020018, "language_loss": 0.75365412, "learning_rate": 8.614060857076333e-07, "loss": 0.77543116, "num_input_tokens_seen": 125624390, "step": 5838, "time_per_iteration": 2.80704665184021 }, { "auxiliary_loss_clip": 0.01159678, "auxiliary_loss_mlp": 0.01027756, "balance_loss_clip": 1.04808354, "balance_loss_mlp": 1.02022183, "epoch": 0.7020982384416521, "flos": 23002759958400.0, "grad_norm": 1.9466544399916013, "language_loss": 0.75238347, "learning_rate": 8.60765754591462e-07, "loss": 0.77425778, "num_input_tokens_seen": 125644085, "step": 5839, "time_per_iteration": 2.8171567916870117 }, { "auxiliary_loss_clip": 0.01166418, "auxiliary_loss_mlp": 0.01024231, "balance_loss_clip": 1.04662812, "balance_loss_mlp": 1.01679873, "epoch": 0.7022184813322913, "flos": 20449080489600.0, "grad_norm": 1.9249506788868391, "language_loss": 0.72946066, "learning_rate": 8.601255962962211e-07, "loss": 0.75136709, "num_input_tokens_seen": 125663095, "step": 5840, "time_per_iteration": 2.7624590396881104 }, { "auxiliary_loss_clip": 0.01176497, "auxiliary_loss_mlp": 0.01028776, "balance_loss_clip": 1.05104744, "balance_loss_mlp": 1.02072346, "epoch": 0.7023387242229303, "flos": 19790514581760.0, "grad_norm": 2.320055502360726, "language_loss": 0.72484386, "learning_rate": 8.594856109190194e-07, "loss": 0.74689656, "num_input_tokens_seen": 125680125, "step": 5841, "time_per_iteration": 2.735257387161255 }, { "auxiliary_loss_clip": 0.01168453, "auxiliary_loss_mlp": 0.01024823, "balance_loss_clip": 1.04755664, "balance_loss_mlp": 1.01715207, "epoch": 0.7024589671135694, "flos": 33259278286080.0, "grad_norm": 2.4187797591565827, "language_loss": 0.69337875, "learning_rate": 8.588457985569446e-07, "loss": 0.71531153, "num_input_tokens_seen": 125703035, "step": 5842, "time_per_iteration": 2.817478656768799 }, { "auxiliary_loss_clip": 0.01170954, "auxiliary_loss_mlp": 0.01027155, "balance_loss_clip": 1.04933357, "balance_loss_mlp": 1.02014208, "epoch": 0.7025792100042085, "flos": 19098982967040.0, "grad_norm": 2.1719299442906213, "language_loss": 0.71873844, "learning_rate": 8.582061593070542e-07, "loss": 0.74071956, "num_input_tokens_seen": 125723765, "step": 5843, "time_per_iteration": 4.20978569984436 }, { "auxiliary_loss_clip": 0.01168434, "auxiliary_loss_mlp": 0.01055421, "balance_loss_clip": 1.04717469, "balance_loss_mlp": 1.01701498, "epoch": 0.7026994528948476, "flos": 18952611045120.0, "grad_norm": 2.1539109137893977, "language_loss": 0.77034938, "learning_rate": 8.57566693266383e-07, "loss": 0.79258794, "num_input_tokens_seen": 125741455, "step": 5844, "time_per_iteration": 2.7200589179992676 }, { "auxiliary_loss_clip": 0.01166688, "auxiliary_loss_mlp": 0.0105994, "balance_loss_clip": 1.04769492, "balance_loss_mlp": 1.0219754, "epoch": 0.7028196957854866, "flos": 19536662188800.0, "grad_norm": 2.2314233864370063, "language_loss": 0.69598866, "learning_rate": 8.569274005319354e-07, "loss": 0.71825498, "num_input_tokens_seen": 125759855, "step": 5845, "time_per_iteration": 3.669212818145752 }, { "auxiliary_loss_clip": 0.01166057, "auxiliary_loss_mlp": 0.0102657, "balance_loss_clip": 1.05056095, "balance_loss_mlp": 1.01910782, "epoch": 0.7029399386761258, "flos": 20845318394880.0, "grad_norm": 1.8123940799379825, "language_loss": 0.79611742, "learning_rate": 8.562882812006913e-07, "loss": 0.81804371, "num_input_tokens_seen": 125777345, "step": 5846, "time_per_iteration": 3.6603095531463623 }, { "auxiliary_loss_clip": 0.01166878, "auxiliary_loss_mlp": 0.01028942, "balance_loss_clip": 1.04664481, "balance_loss_mlp": 1.02169991, "epoch": 0.7030601815667649, "flos": 22055005653120.0, "grad_norm": 1.5937493607850945, "language_loss": 0.77451479, "learning_rate": 8.556493353696066e-07, "loss": 0.79647303, "num_input_tokens_seen": 125796345, "step": 5847, "time_per_iteration": 2.7509031295776367 }, { "auxiliary_loss_clip": 0.01171686, "auxiliary_loss_mlp": 0.01058495, "balance_loss_clip": 1.05091143, "balance_loss_mlp": 1.02073812, "epoch": 0.7031804244574039, "flos": 27198742089600.0, "grad_norm": 2.202347649305705, "language_loss": 0.68044752, "learning_rate": 8.550105631356077e-07, "loss": 0.70274931, "num_input_tokens_seen": 125816070, "step": 5848, "time_per_iteration": 2.792825222015381 }, { "auxiliary_loss_clip": 0.01160031, "auxiliary_loss_mlp": 0.01028057, "balance_loss_clip": 1.04966235, "balance_loss_mlp": 1.0205586, "epoch": 0.7033006673480431, "flos": 22379853277440.0, "grad_norm": 2.804279865274953, "language_loss": 0.77128202, "learning_rate": 8.543719645955961e-07, "loss": 0.79316288, "num_input_tokens_seen": 125834400, "step": 5849, "time_per_iteration": 2.866098642349243 }, { "auxiliary_loss_clip": 0.01162289, "auxiliary_loss_mlp": 0.01026118, "balance_loss_clip": 1.04571354, "balance_loss_mlp": 1.0183847, "epoch": 0.7034209102386821, "flos": 24715986024960.0, "grad_norm": 1.56550555488193, "language_loss": 0.74385011, "learning_rate": 8.537335398464467e-07, "loss": 0.7657342, "num_input_tokens_seen": 125854720, "step": 5850, "time_per_iteration": 2.803532361984253 }, { "auxiliary_loss_clip": 0.01166607, "auxiliary_loss_mlp": 0.01026166, "balance_loss_clip": 1.04979229, "balance_loss_mlp": 1.0183934, "epoch": 0.7035411531293212, "flos": 22556174163840.0, "grad_norm": 7.425397563199957, "language_loss": 0.85160708, "learning_rate": 8.53095288985007e-07, "loss": 0.8735348, "num_input_tokens_seen": 125868455, "step": 5851, "time_per_iteration": 2.7787888050079346 }, { "auxiliary_loss_clip": 0.01165491, "auxiliary_loss_mlp": 0.01026488, "balance_loss_clip": 1.0472337, "balance_loss_mlp": 1.01963377, "epoch": 0.7036613960199604, "flos": 22674967418880.0, "grad_norm": 1.6841369733852836, "language_loss": 0.82386935, "learning_rate": 8.524572121081009e-07, "loss": 0.84578919, "num_input_tokens_seen": 125888555, "step": 5852, "time_per_iteration": 3.6837732791900635 }, { "auxiliary_loss_clip": 0.0117124, "auxiliary_loss_mlp": 0.01025358, "balance_loss_clip": 1.0495038, "balance_loss_mlp": 1.01758826, "epoch": 0.7037816389105994, "flos": 22492146170880.0, "grad_norm": 2.7471404988721555, "language_loss": 0.62583226, "learning_rate": 8.518193093125232e-07, "loss": 0.64779824, "num_input_tokens_seen": 125907610, "step": 5853, "time_per_iteration": 2.7666454315185547 }, { "auxiliary_loss_clip": 0.01166697, "auxiliary_loss_mlp": 0.01022487, "balance_loss_clip": 1.04835403, "balance_loss_mlp": 1.01515603, "epoch": 0.7039018818012385, "flos": 27087490690560.0, "grad_norm": 6.479867680615154, "language_loss": 0.80895579, "learning_rate": 8.511815806950436e-07, "loss": 0.83084762, "num_input_tokens_seen": 125928640, "step": 5854, "time_per_iteration": 2.882614850997925 }, { "auxiliary_loss_clip": 0.01159555, "auxiliary_loss_mlp": 0.01024677, "balance_loss_clip": 1.04327905, "balance_loss_mlp": 1.01685691, "epoch": 0.7040221246918776, "flos": 17749819198080.0, "grad_norm": 1.6165539006610299, "language_loss": 0.781371, "learning_rate": 8.505440263524044e-07, "loss": 0.80321336, "num_input_tokens_seen": 125947485, "step": 5855, "time_per_iteration": 2.758883476257324 }, { "auxiliary_loss_clip": 0.01171769, "auxiliary_loss_mlp": 0.01022408, "balance_loss_clip": 1.05029368, "balance_loss_mlp": 1.01456451, "epoch": 0.7041423675825167, "flos": 16279851012480.0, "grad_norm": 4.322599692847229, "language_loss": 0.88270503, "learning_rate": 8.49906646381322e-07, "loss": 0.90464681, "num_input_tokens_seen": 125960320, "step": 5856, "time_per_iteration": 2.792210578918457 }, { "auxiliary_loss_clip": 0.01163024, "auxiliary_loss_mlp": 0.01023938, "balance_loss_clip": 1.04919958, "balance_loss_mlp": 1.01685059, "epoch": 0.7042626104731557, "flos": 25483181639040.0, "grad_norm": 1.945521163548466, "language_loss": 0.72130525, "learning_rate": 8.492694408784884e-07, "loss": 0.74317485, "num_input_tokens_seen": 125980575, "step": 5857, "time_per_iteration": 2.754153251647949 }, { "auxiliary_loss_clip": 0.01168766, "auxiliary_loss_mlp": 0.01027062, "balance_loss_clip": 1.04716039, "balance_loss_mlp": 1.01983213, "epoch": 0.7043828533637949, "flos": 17857622891520.0, "grad_norm": 2.6231640004783703, "language_loss": 0.62233698, "learning_rate": 8.486324099405642e-07, "loss": 0.64429533, "num_input_tokens_seen": 125997420, "step": 5858, "time_per_iteration": 2.7372069358825684 }, { "auxiliary_loss_clip": 0.01162634, "auxiliary_loss_mlp": 0.01024659, "balance_loss_clip": 1.04588556, "balance_loss_mlp": 1.01681495, "epoch": 0.704503096254434, "flos": 29494259533440.0, "grad_norm": 2.013341114799932, "language_loss": 0.74652988, "learning_rate": 8.479955536641887e-07, "loss": 0.76840281, "num_input_tokens_seen": 126018915, "step": 5859, "time_per_iteration": 2.785813808441162 }, { "auxiliary_loss_clip": 0.01157373, "auxiliary_loss_mlp": 0.01024575, "balance_loss_clip": 1.04566979, "balance_loss_mlp": 1.01691544, "epoch": 0.704623339145073, "flos": 30920739327360.0, "grad_norm": 1.8934350610401824, "language_loss": 0.66337085, "learning_rate": 8.473588721459716e-07, "loss": 0.68519032, "num_input_tokens_seen": 126038825, "step": 5860, "time_per_iteration": 2.8560383319854736 }, { "auxiliary_loss_clip": 0.0116999, "auxiliary_loss_mlp": 0.01038644, "balance_loss_clip": 1.05242133, "balance_loss_mlp": 1.03030515, "epoch": 0.7047435820357122, "flos": 23914747296000.0, "grad_norm": 2.084454674301582, "language_loss": 0.70890504, "learning_rate": 8.467223654824967e-07, "loss": 0.73099136, "num_input_tokens_seen": 126058280, "step": 5861, "time_per_iteration": 2.9324986934661865 }, { "auxiliary_loss_clip": 0.01159683, "auxiliary_loss_mlp": 0.01030389, "balance_loss_clip": 1.04726267, "balance_loss_mlp": 1.02315569, "epoch": 0.7048638249263512, "flos": 46494010926720.0, "grad_norm": 1.9065739438716243, "language_loss": 0.62846941, "learning_rate": 8.460860337703233e-07, "loss": 0.65037012, "num_input_tokens_seen": 126078885, "step": 5862, "time_per_iteration": 2.9452555179595947 }, { "auxiliary_loss_clip": 0.01149214, "auxiliary_loss_mlp": 0.01025685, "balance_loss_clip": 1.0474062, "balance_loss_mlp": 1.01745927, "epoch": 0.7049840678169903, "flos": 21689219502720.0, "grad_norm": 1.8114087101306828, "language_loss": 0.70722276, "learning_rate": 8.454498771059797e-07, "loss": 0.72897172, "num_input_tokens_seen": 126098260, "step": 5863, "time_per_iteration": 2.7910690307617188 }, { "auxiliary_loss_clip": 0.0115322, "auxiliary_loss_mlp": 0.01027818, "balance_loss_clip": 1.04898143, "balance_loss_mlp": 1.02055192, "epoch": 0.7051043107076294, "flos": 18405081054720.0, "grad_norm": 2.93194312073155, "language_loss": 0.83041334, "learning_rate": 8.448138955859725e-07, "loss": 0.85222375, "num_input_tokens_seen": 126114845, "step": 5864, "time_per_iteration": 2.7874467372894287 }, { "auxiliary_loss_clip": 0.01164339, "auxiliary_loss_mlp": 0.01026896, "balance_loss_clip": 1.04929709, "balance_loss_mlp": 1.01889658, "epoch": 0.7052245535982685, "flos": 19319043640320.0, "grad_norm": 1.8435493852309592, "language_loss": 0.90204835, "learning_rate": 8.44178089306778e-07, "loss": 0.92396069, "num_input_tokens_seen": 126132780, "step": 5865, "time_per_iteration": 2.812556028366089 }, { "auxiliary_loss_clip": 0.01166736, "auxiliary_loss_mlp": 0.0102445, "balance_loss_clip": 1.04686928, "balance_loss_mlp": 1.01727951, "epoch": 0.7053447964889076, "flos": 19062138591360.0, "grad_norm": 2.0952882706656, "language_loss": 0.77267116, "learning_rate": 8.4354245836485e-07, "loss": 0.79458296, "num_input_tokens_seen": 126151225, "step": 5866, "time_per_iteration": 2.8156070709228516 }, { "auxiliary_loss_clip": 0.01161294, "auxiliary_loss_mlp": 0.01027249, "balance_loss_clip": 1.04844463, "balance_loss_mlp": 1.01934552, "epoch": 0.7054650393795466, "flos": 27379228953600.0, "grad_norm": 1.461844502886357, "language_loss": 0.72911727, "learning_rate": 8.429070028566108e-07, "loss": 0.75100267, "num_input_tokens_seen": 126172535, "step": 5867, "time_per_iteration": 3.038558006286621 }, { "auxiliary_loss_clip": 0.01165632, "auxiliary_loss_mlp": 0.0102389, "balance_loss_clip": 1.04859579, "balance_loss_mlp": 1.01625454, "epoch": 0.7055852822701858, "flos": 16102201322880.0, "grad_norm": 1.816775336969243, "language_loss": 0.74859214, "learning_rate": 8.422717228784586e-07, "loss": 0.77048737, "num_input_tokens_seen": 126189410, "step": 5868, "time_per_iteration": 2.8719289302825928 }, { "auxiliary_loss_clip": 0.01156991, "auxiliary_loss_mlp": 0.0102508, "balance_loss_clip": 1.04872358, "balance_loss_mlp": 1.01768935, "epoch": 0.7057055251608249, "flos": 11692299744000.0, "grad_norm": 2.1130999671429547, "language_loss": 0.69377106, "learning_rate": 8.416366185267663e-07, "loss": 0.71559179, "num_input_tokens_seen": 126206910, "step": 5869, "time_per_iteration": 3.9563727378845215 }, { "auxiliary_loss_clip": 0.01164737, "auxiliary_loss_mlp": 0.01024173, "balance_loss_clip": 1.0461247, "balance_loss_mlp": 1.01674604, "epoch": 0.7058257680514639, "flos": 22711560399360.0, "grad_norm": 4.008116426447498, "language_loss": 0.77685475, "learning_rate": 8.410016898978778e-07, "loss": 0.79874378, "num_input_tokens_seen": 126224385, "step": 5870, "time_per_iteration": 2.7700512409210205 }, { "auxiliary_loss_clip": 0.01156084, "auxiliary_loss_mlp": 0.01027536, "balance_loss_clip": 1.04910946, "balance_loss_mlp": 1.02042246, "epoch": 0.7059460109421031, "flos": 17529543043200.0, "grad_norm": 2.1203386710454977, "language_loss": 0.790133, "learning_rate": 8.403669370881115e-07, "loss": 0.81196922, "num_input_tokens_seen": 126243120, "step": 5871, "time_per_iteration": 3.7120261192321777 }, { "auxiliary_loss_clip": 0.01168449, "auxiliary_loss_mlp": 0.01025735, "balance_loss_clip": 1.0482415, "balance_loss_mlp": 1.01917553, "epoch": 0.7060662538327421, "flos": 23544687427200.0, "grad_norm": 1.6421504153450095, "language_loss": 0.78621161, "learning_rate": 8.397323601937587e-07, "loss": 0.80815345, "num_input_tokens_seen": 126263020, "step": 5872, "time_per_iteration": 3.6105918884277344 }, { "auxiliary_loss_clip": 0.01155535, "auxiliary_loss_mlp": 0.01025976, "balance_loss_clip": 1.04788184, "balance_loss_mlp": 1.01875222, "epoch": 0.7061864967233812, "flos": 30260736875520.0, "grad_norm": 1.7681582955934232, "language_loss": 0.7721647, "learning_rate": 8.390979593110838e-07, "loss": 0.79397976, "num_input_tokens_seen": 126285150, "step": 5873, "time_per_iteration": 2.913205623626709 }, { "auxiliary_loss_clip": 0.0116863, "auxiliary_loss_mlp": 0.0102515, "balance_loss_clip": 1.0504514, "balance_loss_mlp": 1.01758575, "epoch": 0.7063067396140204, "flos": 20701460424960.0, "grad_norm": 1.919794211103449, "language_loss": 0.81364304, "learning_rate": 8.384637345363262e-07, "loss": 0.83558083, "num_input_tokens_seen": 126304340, "step": 5874, "time_per_iteration": 2.896430015563965 }, { "auxiliary_loss_clip": 0.01156932, "auxiliary_loss_mlp": 0.01027839, "balance_loss_clip": 1.04570961, "balance_loss_mlp": 1.02012002, "epoch": 0.7064269825046594, "flos": 32266168081920.0, "grad_norm": 3.1515558719741597, "language_loss": 0.76898783, "learning_rate": 8.378296859656964e-07, "loss": 0.7908355, "num_input_tokens_seen": 126325495, "step": 5875, "time_per_iteration": 2.8277411460876465 }, { "auxiliary_loss_clip": 0.01164949, "auxiliary_loss_mlp": 0.01026592, "balance_loss_clip": 1.04936433, "balance_loss_mlp": 1.01975822, "epoch": 0.7065472253952985, "flos": 30227124723840.0, "grad_norm": 2.429675986165807, "language_loss": 0.68023407, "learning_rate": 8.371958136953792e-07, "loss": 0.70214951, "num_input_tokens_seen": 126345525, "step": 5876, "time_per_iteration": 2.9427969455718994 }, { "auxiliary_loss_clip": 0.01164519, "auxiliary_loss_mlp": 0.01023581, "balance_loss_clip": 1.04613543, "balance_loss_mlp": 1.01549828, "epoch": 0.7066674682859376, "flos": 16216720859520.0, "grad_norm": 2.410077573275738, "language_loss": 0.66432381, "learning_rate": 8.365621178215326e-07, "loss": 0.68620479, "num_input_tokens_seen": 126361995, "step": 5877, "time_per_iteration": 3.7426767349243164 }, { "auxiliary_loss_clip": 0.01161491, "auxiliary_loss_mlp": 0.01024765, "balance_loss_clip": 1.04786539, "balance_loss_mlp": 1.01718068, "epoch": 0.7067877111765767, "flos": 14830461319680.0, "grad_norm": 2.56859739187355, "language_loss": 0.74934757, "learning_rate": 8.359285984402871e-07, "loss": 0.77121007, "num_input_tokens_seen": 126379260, "step": 5878, "time_per_iteration": 2.761134147644043 }, { "auxiliary_loss_clip": 0.01161786, "auxiliary_loss_mlp": 0.01025707, "balance_loss_clip": 1.04831529, "balance_loss_mlp": 1.01858425, "epoch": 0.7069079540672157, "flos": 25440196037760.0, "grad_norm": 2.295182947598582, "language_loss": 0.73931545, "learning_rate": 8.352952556477489e-07, "loss": 0.76119041, "num_input_tokens_seen": 126397170, "step": 5879, "time_per_iteration": 2.8655810356140137 }, { "auxiliary_loss_clip": 0.01164965, "auxiliary_loss_mlp": 0.01025555, "balance_loss_clip": 1.04813313, "balance_loss_mlp": 1.01827705, "epoch": 0.7070281969578549, "flos": 24607751368320.0, "grad_norm": 1.7396405718686754, "language_loss": 0.76511228, "learning_rate": 8.34662089539993e-07, "loss": 0.78701746, "num_input_tokens_seen": 126416680, "step": 5880, "time_per_iteration": 2.6867761611938477 }, { "auxiliary_loss_clip": 0.01165982, "auxiliary_loss_mlp": 0.01025608, "balance_loss_clip": 1.04700661, "balance_loss_mlp": 1.01843202, "epoch": 0.707148439848494, "flos": 26724469887360.0, "grad_norm": 1.9492143751223014, "language_loss": 0.79392421, "learning_rate": 8.340291002130722e-07, "loss": 0.81584018, "num_input_tokens_seen": 126435870, "step": 5881, "time_per_iteration": 2.713341474533081 }, { "auxiliary_loss_clip": 0.01173362, "auxiliary_loss_mlp": 0.01025194, "balance_loss_clip": 1.05058241, "balance_loss_mlp": 1.01755261, "epoch": 0.707268682739133, "flos": 15085750256640.0, "grad_norm": 2.4006171751477936, "language_loss": 0.79523063, "learning_rate": 8.3339628776301e-07, "loss": 0.81721616, "num_input_tokens_seen": 126454010, "step": 5882, "time_per_iteration": 2.6763908863067627 }, { "auxiliary_loss_clip": 0.01169015, "auxiliary_loss_mlp": 0.01022075, "balance_loss_clip": 1.04821682, "balance_loss_mlp": 1.01395094, "epoch": 0.7073889256297722, "flos": 34313148345600.0, "grad_norm": 1.9243684177730227, "language_loss": 0.56899726, "learning_rate": 8.327636522858033e-07, "loss": 0.59090817, "num_input_tokens_seen": 126473615, "step": 5883, "time_per_iteration": 2.8639698028564453 }, { "auxiliary_loss_clip": 0.01157063, "auxiliary_loss_mlp": 0.01027606, "balance_loss_clip": 1.04959464, "balance_loss_mlp": 1.02022648, "epoch": 0.7075091685204112, "flos": 20083940784000.0, "grad_norm": 1.8974945465955253, "language_loss": 0.77113032, "learning_rate": 8.321311938774225e-07, "loss": 0.79297698, "num_input_tokens_seen": 126492705, "step": 5884, "time_per_iteration": 2.8209550380706787 }, { "auxiliary_loss_clip": 0.01171395, "auxiliary_loss_mlp": 0.0102831, "balance_loss_clip": 1.04799128, "balance_loss_mlp": 1.02079976, "epoch": 0.7076294114110503, "flos": 20777124424320.0, "grad_norm": 1.8767280834599682, "language_loss": 0.79151058, "learning_rate": 8.314989126338104e-07, "loss": 0.81350756, "num_input_tokens_seen": 126512715, "step": 5885, "time_per_iteration": 2.794969081878662 }, { "auxiliary_loss_clip": 0.01166862, "auxiliary_loss_mlp": 0.01025476, "balance_loss_clip": 1.0470103, "balance_loss_mlp": 1.01863647, "epoch": 0.7077496543016895, "flos": 17967689141760.0, "grad_norm": 2.1007829240527567, "language_loss": 0.84582865, "learning_rate": 8.308668086508847e-07, "loss": 0.86775196, "num_input_tokens_seen": 126530795, "step": 5886, "time_per_iteration": 2.755155324935913 }, { "auxiliary_loss_clip": 0.01164523, "auxiliary_loss_mlp": 0.01022989, "balance_loss_clip": 1.04811978, "balance_loss_mlp": 1.01524639, "epoch": 0.7078698971923285, "flos": 45478098564480.0, "grad_norm": 6.400156208609326, "language_loss": 0.73885155, "learning_rate": 8.302348820245342e-07, "loss": 0.76072663, "num_input_tokens_seen": 126553360, "step": 5887, "time_per_iteration": 3.0091586112976074 }, { "auxiliary_loss_clip": 0.01162075, "auxiliary_loss_mlp": 0.01022824, "balance_loss_clip": 1.0496316, "balance_loss_mlp": 1.01521873, "epoch": 0.7079901400829676, "flos": 26943704547840.0, "grad_norm": 2.5873424779422485, "language_loss": 0.70065522, "learning_rate": 8.296031328506232e-07, "loss": 0.72250426, "num_input_tokens_seen": 126573110, "step": 5888, "time_per_iteration": 2.9823834896087646 }, { "auxiliary_loss_clip": 0.01165153, "auxiliary_loss_mlp": 0.01024633, "balance_loss_clip": 1.04901052, "balance_loss_mlp": 1.01702785, "epoch": 0.7081103829736067, "flos": 24423206267520.0, "grad_norm": 1.841416788710186, "language_loss": 0.75341207, "learning_rate": 8.289715612249857e-07, "loss": 0.77530992, "num_input_tokens_seen": 126593725, "step": 5889, "time_per_iteration": 2.8943164348602295 }, { "auxiliary_loss_clip": 0.01162025, "auxiliary_loss_mlp": 0.01032631, "balance_loss_clip": 1.05059683, "balance_loss_mlp": 1.02471614, "epoch": 0.7082306258642458, "flos": 18543300589440.0, "grad_norm": 2.840420041159579, "language_loss": 0.77662897, "learning_rate": 8.283401672434305e-07, "loss": 0.79857552, "num_input_tokens_seen": 126608950, "step": 5890, "time_per_iteration": 2.776336431503296 }, { "auxiliary_loss_clip": 0.01158742, "auxiliary_loss_mlp": 0.01019531, "balance_loss_clip": 1.04958928, "balance_loss_mlp": 1.01200294, "epoch": 0.7083508687548848, "flos": 23477534951040.0, "grad_norm": 1.868348880403315, "language_loss": 0.70360267, "learning_rate": 8.277089510017412e-07, "loss": 0.72538543, "num_input_tokens_seen": 126629755, "step": 5891, "time_per_iteration": 2.739785671234131 }, { "auxiliary_loss_clip": 0.0116113, "auxiliary_loss_mlp": 0.01021832, "balance_loss_clip": 1.04919338, "balance_loss_mlp": 1.01491785, "epoch": 0.708471111645524, "flos": 22419463000320.0, "grad_norm": 1.6924190375562596, "language_loss": 0.82141924, "learning_rate": 8.270779125956719e-07, "loss": 0.84324884, "num_input_tokens_seen": 126650135, "step": 5892, "time_per_iteration": 2.77852201461792 }, { "auxiliary_loss_clip": 0.01151453, "auxiliary_loss_mlp": 0.01027534, "balance_loss_clip": 1.04848182, "balance_loss_mlp": 1.01988673, "epoch": 0.7085913545361631, "flos": 20922885815040.0, "grad_norm": 1.9396538745718517, "language_loss": 0.80077493, "learning_rate": 8.264470521209505e-07, "loss": 0.82256478, "num_input_tokens_seen": 126668500, "step": 5893, "time_per_iteration": 2.8052194118499756 }, { "auxiliary_loss_clip": 0.0116137, "auxiliary_loss_mlp": 0.01025914, "balance_loss_clip": 1.04858255, "balance_loss_mlp": 1.01767671, "epoch": 0.7087115974268021, "flos": 15012384727680.0, "grad_norm": 3.0336353714150315, "language_loss": 0.76083672, "learning_rate": 8.258163696732785e-07, "loss": 0.7827096, "num_input_tokens_seen": 126686090, "step": 5894, "time_per_iteration": 2.7607905864715576 }, { "auxiliary_loss_clip": 0.01161725, "auxiliary_loss_mlp": 0.01028292, "balance_loss_clip": 1.04828572, "balance_loss_mlp": 1.02081156, "epoch": 0.7088318403174413, "flos": 21539040739200.0, "grad_norm": 1.9125173684136991, "language_loss": 0.77270317, "learning_rate": 8.251858653483288e-07, "loss": 0.79460335, "num_input_tokens_seen": 126704255, "step": 5895, "time_per_iteration": 3.694974184036255 }, { "auxiliary_loss_clip": 0.01168243, "auxiliary_loss_mlp": 0.01024886, "balance_loss_clip": 1.04998088, "balance_loss_mlp": 1.01680088, "epoch": 0.7089520832080803, "flos": 15516785462400.0, "grad_norm": 2.2671773297586246, "language_loss": 0.8601687, "learning_rate": 8.245555392417501e-07, "loss": 0.88209999, "num_input_tokens_seen": 126718910, "step": 5896, "time_per_iteration": 2.691807508468628 }, { "auxiliary_loss_clip": 0.01148339, "auxiliary_loss_mlp": 0.01024833, "balance_loss_clip": 1.04886186, "balance_loss_mlp": 1.01757026, "epoch": 0.7090723260987194, "flos": 20412667077120.0, "grad_norm": 1.7347090577714699, "language_loss": 0.78626043, "learning_rate": 8.239253914491613e-07, "loss": 0.8079921, "num_input_tokens_seen": 126737235, "step": 5897, "time_per_iteration": 3.909846544265747 }, { "auxiliary_loss_clip": 0.01156591, "auxiliary_loss_mlp": 0.01031893, "balance_loss_clip": 1.05014467, "balance_loss_mlp": 1.02369165, "epoch": 0.7091925689893585, "flos": 25668337271040.0, "grad_norm": 1.8431812886543542, "language_loss": 0.75298727, "learning_rate": 8.232954220661556e-07, "loss": 0.77487212, "num_input_tokens_seen": 126759970, "step": 5898, "time_per_iteration": 3.8359029293060303 }, { "auxiliary_loss_clip": 0.0116969, "auxiliary_loss_mlp": 0.01028588, "balance_loss_clip": 1.05159473, "balance_loss_mlp": 1.02121508, "epoch": 0.7093128118799976, "flos": 24206629213440.0, "grad_norm": 2.700992845360952, "language_loss": 0.70058554, "learning_rate": 8.226656311882989e-07, "loss": 0.72256833, "num_input_tokens_seen": 126779280, "step": 5899, "time_per_iteration": 2.7261083126068115 }, { "auxiliary_loss_clip": 0.01162607, "auxiliary_loss_mlp": 0.01025405, "balance_loss_clip": 1.04850698, "balance_loss_mlp": 1.01867521, "epoch": 0.7094330547706367, "flos": 16646786398080.0, "grad_norm": 2.2517894806296557, "language_loss": 0.77128541, "learning_rate": 8.22036018911129e-07, "loss": 0.79316556, "num_input_tokens_seen": 126797310, "step": 5900, "time_per_iteration": 2.726245641708374 }, { "auxiliary_loss_clip": 0.01172282, "auxiliary_loss_mlp": 0.01026371, "balance_loss_clip": 1.04802203, "balance_loss_mlp": 1.01859272, "epoch": 0.7095532976612757, "flos": 16283370545280.0, "grad_norm": 2.150514478328937, "language_loss": 0.80393267, "learning_rate": 8.214065853301599e-07, "loss": 0.82591915, "num_input_tokens_seen": 126812840, "step": 5901, "time_per_iteration": 2.841980218887329 }, { "auxiliary_loss_clip": 0.01066238, "auxiliary_loss_mlp": 0.01001161, "balance_loss_clip": 1.00974011, "balance_loss_mlp": 1.00024343, "epoch": 0.7096735405519149, "flos": 70722080559360.0, "grad_norm": 0.8386442701195934, "language_loss": 0.58199167, "learning_rate": 8.207773305408734e-07, "loss": 0.60266566, "num_input_tokens_seen": 126880060, "step": 5902, "time_per_iteration": 3.441879987716675 }, { "auxiliary_loss_clip": 0.01165785, "auxiliary_loss_mlp": 0.01027806, "balance_loss_clip": 1.04965949, "balance_loss_mlp": 1.01992583, "epoch": 0.709793783442554, "flos": 23621500661760.0, "grad_norm": 2.875939034630702, "language_loss": 0.80082715, "learning_rate": 8.201482546387288e-07, "loss": 0.82276309, "num_input_tokens_seen": 126899535, "step": 5903, "time_per_iteration": 3.9660251140594482 }, { "auxiliary_loss_clip": 0.01165691, "auxiliary_loss_mlp": 0.01028299, "balance_loss_clip": 1.04912972, "balance_loss_mlp": 1.02036583, "epoch": 0.709914026333193, "flos": 25993472204160.0, "grad_norm": 1.9397312065381298, "language_loss": 0.92017341, "learning_rate": 8.195193577191553e-07, "loss": 0.94211328, "num_input_tokens_seen": 126921365, "step": 5904, "time_per_iteration": 2.8168511390686035 }, { "auxiliary_loss_clip": 0.01170528, "auxiliary_loss_mlp": 0.01057608, "balance_loss_clip": 1.04850578, "balance_loss_mlp": 1.02121031, "epoch": 0.7100342692238322, "flos": 24861531934080.0, "grad_norm": 1.662850963754487, "language_loss": 0.84673631, "learning_rate": 8.188906398775579e-07, "loss": 0.8690176, "num_input_tokens_seen": 126941910, "step": 5905, "time_per_iteration": 2.90312123298645 }, { "auxiliary_loss_clip": 0.01169905, "auxiliary_loss_mlp": 0.01059867, "balance_loss_clip": 1.04819059, "balance_loss_mlp": 1.02305245, "epoch": 0.7101545121144712, "flos": 24932203943040.0, "grad_norm": 2.74975080148772, "language_loss": 0.68861699, "learning_rate": 8.18262101209311e-07, "loss": 0.71091473, "num_input_tokens_seen": 126961120, "step": 5906, "time_per_iteration": 2.7769436836242676 }, { "auxiliary_loss_clip": 0.01168527, "auxiliary_loss_mlp": 0.01022567, "balance_loss_clip": 1.04681242, "balance_loss_mlp": 1.01537859, "epoch": 0.7102747550051103, "flos": 23768842250880.0, "grad_norm": 1.9773347064807896, "language_loss": 0.70172477, "learning_rate": 8.176337418097626e-07, "loss": 0.72363567, "num_input_tokens_seen": 126981590, "step": 5907, "time_per_iteration": 2.7295525074005127 }, { "auxiliary_loss_clip": 0.0116465, "auxiliary_loss_mlp": 0.01062966, "balance_loss_clip": 1.04928684, "balance_loss_mlp": 1.02677894, "epoch": 0.7103949978957494, "flos": 15303907509120.0, "grad_norm": 2.1747301482097856, "language_loss": 0.79865384, "learning_rate": 8.170055617742364e-07, "loss": 0.82093, "num_input_tokens_seen": 126998870, "step": 5908, "time_per_iteration": 2.7825353145599365 }, { "auxiliary_loss_clip": 0.01161342, "auxiliary_loss_mlp": 0.01033634, "balance_loss_clip": 1.0498029, "balance_loss_mlp": 1.02597451, "epoch": 0.7105152407863885, "flos": 22638805401600.0, "grad_norm": 1.7803063827685088, "language_loss": 0.70737827, "learning_rate": 8.163775611980252e-07, "loss": 0.72932798, "num_input_tokens_seen": 127017980, "step": 5909, "time_per_iteration": 2.7112233638763428 }, { "auxiliary_loss_clip": 0.01161303, "auxiliary_loss_mlp": 0.01025908, "balance_loss_clip": 1.04691756, "balance_loss_mlp": 1.01836157, "epoch": 0.7106354836770276, "flos": 17238594879360.0, "grad_norm": 1.7321183056814657, "language_loss": 0.7864126, "learning_rate": 8.157497401763982e-07, "loss": 0.80828476, "num_input_tokens_seen": 127035645, "step": 5910, "time_per_iteration": 2.7579333782196045 }, { "auxiliary_loss_clip": 0.01165165, "auxiliary_loss_mlp": 0.01028022, "balance_loss_clip": 1.04853976, "balance_loss_mlp": 1.02056849, "epoch": 0.7107557265676667, "flos": 20193647898240.0, "grad_norm": 2.8120434328182196, "language_loss": 0.77859199, "learning_rate": 8.151220988045935e-07, "loss": 0.80052388, "num_input_tokens_seen": 127054900, "step": 5911, "time_per_iteration": 2.6863651275634766 }, { "auxiliary_loss_clip": 0.01161491, "auxiliary_loss_mlp": 0.01022332, "balance_loss_clip": 1.04571569, "balance_loss_mlp": 1.01484275, "epoch": 0.7108759694583058, "flos": 21507080613120.0, "grad_norm": 1.7659631889668959, "language_loss": 0.83095247, "learning_rate": 8.144946371778234e-07, "loss": 0.85279065, "num_input_tokens_seen": 127075010, "step": 5912, "time_per_iteration": 2.8808088302612305 }, { "auxiliary_loss_clip": 0.01162386, "auxiliary_loss_mlp": 0.01056862, "balance_loss_clip": 1.04950595, "balance_loss_mlp": 1.0204407, "epoch": 0.7109962123489448, "flos": 24061909317120.0, "grad_norm": 1.7537353559410283, "language_loss": 0.78501797, "learning_rate": 8.138673553912751e-07, "loss": 0.80721045, "num_input_tokens_seen": 127095570, "step": 5913, "time_per_iteration": 2.90409779548645 }, { "auxiliary_loss_clip": 0.01156002, "auxiliary_loss_mlp": 0.01025124, "balance_loss_clip": 1.04849792, "balance_loss_mlp": 1.01708913, "epoch": 0.711116455239584, "flos": 30480474326400.0, "grad_norm": 2.327972077620339, "language_loss": 0.56792355, "learning_rate": 8.132402535401059e-07, "loss": 0.58973479, "num_input_tokens_seen": 127116825, "step": 5914, "time_per_iteration": 2.9394216537475586 }, { "auxiliary_loss_clip": 0.01161903, "auxiliary_loss_mlp": 0.01026469, "balance_loss_clip": 1.04817939, "balance_loss_mlp": 1.01907814, "epoch": 0.711236698130223, "flos": 25045610158080.0, "grad_norm": 1.8139175358523787, "language_loss": 0.73943341, "learning_rate": 8.126133317194465e-07, "loss": 0.76131719, "num_input_tokens_seen": 127137015, "step": 5915, "time_per_iteration": 2.9304699897766113 }, { "auxiliary_loss_clip": 0.01164659, "auxiliary_loss_mlp": 0.01029877, "balance_loss_clip": 1.04940343, "balance_loss_mlp": 1.0214963, "epoch": 0.7113569410208621, "flos": 24206701040640.0, "grad_norm": 2.0799244603087366, "language_loss": 0.74388367, "learning_rate": 8.11986590024401e-07, "loss": 0.76582909, "num_input_tokens_seen": 127156755, "step": 5916, "time_per_iteration": 3.0477941036224365 }, { "auxiliary_loss_clip": 0.01166557, "auxiliary_loss_mlp": 0.01028441, "balance_loss_clip": 1.05017304, "balance_loss_mlp": 1.02043605, "epoch": 0.7114771839115013, "flos": 35439306526080.0, "grad_norm": 2.568494328756352, "language_loss": 0.69110203, "learning_rate": 8.113600285500442e-07, "loss": 0.71305197, "num_input_tokens_seen": 127176965, "step": 5917, "time_per_iteration": 3.0190398693084717 }, { "auxiliary_loss_clip": 0.01166959, "auxiliary_loss_mlp": 0.01019398, "balance_loss_clip": 1.04675961, "balance_loss_mlp": 1.01240659, "epoch": 0.7115974268021403, "flos": 21099458096640.0, "grad_norm": 1.8876896529723972, "language_loss": 0.74476731, "learning_rate": 8.107336473914268e-07, "loss": 0.76663089, "num_input_tokens_seen": 127195595, "step": 5918, "time_per_iteration": 2.7553296089172363 }, { "auxiliary_loss_clip": 0.01061275, "auxiliary_loss_mlp": 0.01002942, "balance_loss_clip": 1.00948787, "balance_loss_mlp": 1.00194693, "epoch": 0.7117176696927794, "flos": 56752866616320.0, "grad_norm": 0.7634551739174348, "language_loss": 0.55697203, "learning_rate": 8.101074466435694e-07, "loss": 0.57761419, "num_input_tokens_seen": 127255070, "step": 5919, "time_per_iteration": 3.263782262802124 }, { "auxiliary_loss_clip": 0.01159846, "auxiliary_loss_mlp": 0.01026183, "balance_loss_clip": 1.0474025, "balance_loss_mlp": 1.0186553, "epoch": 0.7118379125834186, "flos": 15925269905280.0, "grad_norm": 1.9118517354106968, "language_loss": 0.67764622, "learning_rate": 8.094814264014662e-07, "loss": 0.69950652, "num_input_tokens_seen": 127273825, "step": 5920, "time_per_iteration": 2.898068904876709 }, { "auxiliary_loss_clip": 0.01172735, "auxiliary_loss_mlp": 0.01026322, "balance_loss_clip": 1.04890311, "balance_loss_mlp": 1.01820421, "epoch": 0.7119581554740576, "flos": 20193360589440.0, "grad_norm": 2.679298391318821, "language_loss": 0.81079245, "learning_rate": 8.088555867600844e-07, "loss": 0.83278304, "num_input_tokens_seen": 127289990, "step": 5921, "time_per_iteration": 3.80076265335083 }, { "auxiliary_loss_clip": 0.01159079, "auxiliary_loss_mlp": 0.010307, "balance_loss_clip": 1.04973507, "balance_loss_mlp": 1.02360106, "epoch": 0.7120783983646967, "flos": 34715383822080.0, "grad_norm": 1.7523482417401448, "language_loss": 0.6055361, "learning_rate": 8.08229927814362e-07, "loss": 0.62743384, "num_input_tokens_seen": 127312880, "step": 5922, "time_per_iteration": 2.896002769470215 }, { "auxiliary_loss_clip": 0.01157105, "auxiliary_loss_mlp": 0.01022348, "balance_loss_clip": 1.04592013, "balance_loss_mlp": 1.01548791, "epoch": 0.7121986412553358, "flos": 26359114700160.0, "grad_norm": 1.7836535151256203, "language_loss": 0.64439774, "learning_rate": 8.076044496592134e-07, "loss": 0.66619229, "num_input_tokens_seen": 127334730, "step": 5923, "time_per_iteration": 3.8680293560028076 }, { "auxiliary_loss_clip": 0.01161759, "auxiliary_loss_mlp": 0.01026718, "balance_loss_clip": 1.04727244, "balance_loss_mlp": 1.01917827, "epoch": 0.7123188841459749, "flos": 11145344371200.0, "grad_norm": 2.2831199970556155, "language_loss": 0.78189516, "learning_rate": 8.069791523895204e-07, "loss": 0.80377996, "num_input_tokens_seen": 127351180, "step": 5924, "time_per_iteration": 3.8698723316192627 }, { "auxiliary_loss_clip": 0.0115228, "auxiliary_loss_mlp": 0.01028493, "balance_loss_clip": 1.04837084, "balance_loss_mlp": 1.02128363, "epoch": 0.7124391270366139, "flos": 20811670329600.0, "grad_norm": 2.2036918442190303, "language_loss": 0.7730459, "learning_rate": 8.063540361001422e-07, "loss": 0.79485357, "num_input_tokens_seen": 127369750, "step": 5925, "time_per_iteration": 2.7893309593200684 }, { "auxiliary_loss_clip": 0.01153005, "auxiliary_loss_mlp": 0.01026906, "balance_loss_clip": 1.04804671, "balance_loss_mlp": 1.01909518, "epoch": 0.7125593699272531, "flos": 17603734584960.0, "grad_norm": 3.4372050802329, "language_loss": 0.79672754, "learning_rate": 8.057291008859069e-07, "loss": 0.81852669, "num_input_tokens_seen": 127387910, "step": 5926, "time_per_iteration": 2.8487019538879395 }, { "auxiliary_loss_clip": 0.01160961, "auxiliary_loss_mlp": 0.01029486, "balance_loss_clip": 1.04615331, "balance_loss_mlp": 1.02211022, "epoch": 0.7126796128178922, "flos": 28654057526400.0, "grad_norm": 1.7799879745528007, "language_loss": 0.68550837, "learning_rate": 8.051043468416187e-07, "loss": 0.70741284, "num_input_tokens_seen": 127409160, "step": 5927, "time_per_iteration": 2.9603970050811768 }, { "auxiliary_loss_clip": 0.01167755, "auxiliary_loss_mlp": 0.01025239, "balance_loss_clip": 1.04971039, "balance_loss_mlp": 1.01853967, "epoch": 0.7127998557085312, "flos": 16034438315520.0, "grad_norm": 1.981778518542194, "language_loss": 0.82495123, "learning_rate": 8.044797740620506e-07, "loss": 0.84688115, "num_input_tokens_seen": 127427765, "step": 5928, "time_per_iteration": 2.83123779296875 }, { "auxiliary_loss_clip": 0.01152468, "auxiliary_loss_mlp": 0.01027393, "balance_loss_clip": 1.04981041, "balance_loss_mlp": 1.02069342, "epoch": 0.7129200985991703, "flos": 23403271582080.0, "grad_norm": 2.8163603073757417, "language_loss": 0.78500462, "learning_rate": 8.038553826419494e-07, "loss": 0.80680323, "num_input_tokens_seen": 127446475, "step": 5929, "time_per_iteration": 2.867017984390259 }, { "auxiliary_loss_clip": 0.0116661, "auxiliary_loss_mlp": 0.01025099, "balance_loss_clip": 1.04603755, "balance_loss_mlp": 1.01781559, "epoch": 0.7130403414898094, "flos": 21397445326080.0, "grad_norm": 1.5889329260211669, "language_loss": 0.80706477, "learning_rate": 8.032311726760364e-07, "loss": 0.82898188, "num_input_tokens_seen": 127467695, "step": 5930, "time_per_iteration": 3.6977832317352295 }, { "auxiliary_loss_clip": 0.01151259, "auxiliary_loss_mlp": 0.01027301, "balance_loss_clip": 1.04747999, "balance_loss_mlp": 1.01877785, "epoch": 0.7131605843804485, "flos": 74739045306240.0, "grad_norm": 1.769872766998885, "language_loss": 0.68893647, "learning_rate": 8.026071442590022e-07, "loss": 0.71072203, "num_input_tokens_seen": 127494590, "step": 5931, "time_per_iteration": 3.2083370685577393 }, { "auxiliary_loss_clip": 0.01164065, "auxiliary_loss_mlp": 0.01029378, "balance_loss_clip": 1.04899597, "balance_loss_mlp": 1.02215719, "epoch": 0.7132808272710875, "flos": 18368739469440.0, "grad_norm": 1.9682811661132271, "language_loss": 0.80956733, "learning_rate": 8.019832974855134e-07, "loss": 0.83150184, "num_input_tokens_seen": 127512550, "step": 5932, "time_per_iteration": 2.768498420715332 }, { "auxiliary_loss_clip": 0.01159956, "auxiliary_loss_mlp": 0.01025352, "balance_loss_clip": 1.0512625, "balance_loss_mlp": 1.01805341, "epoch": 0.7134010701617267, "flos": 23253380127360.0, "grad_norm": 2.136426324445605, "language_loss": 0.82717156, "learning_rate": 8.013596324502052e-07, "loss": 0.84902465, "num_input_tokens_seen": 127531015, "step": 5933, "time_per_iteration": 2.826047420501709 }, { "auxiliary_loss_clip": 0.01159037, "auxiliary_loss_mlp": 0.01025081, "balance_loss_clip": 1.04703951, "balance_loss_mlp": 1.01801825, "epoch": 0.7135213130523658, "flos": 23653137565440.0, "grad_norm": 1.855544266281025, "language_loss": 0.7879616, "learning_rate": 8.007361492476872e-07, "loss": 0.80980277, "num_input_tokens_seen": 127550340, "step": 5934, "time_per_iteration": 2.803326368331909 }, { "auxiliary_loss_clip": 0.01164401, "auxiliary_loss_mlp": 0.01024639, "balance_loss_clip": 1.04528058, "balance_loss_mlp": 1.01746583, "epoch": 0.7136415559430048, "flos": 24790644443520.0, "grad_norm": 1.6560837414819682, "language_loss": 0.79039037, "learning_rate": 8.001128479725426e-07, "loss": 0.81228071, "num_input_tokens_seen": 127572245, "step": 5935, "time_per_iteration": 2.8802778720855713 }, { "auxiliary_loss_clip": 0.0115239, "auxiliary_loss_mlp": 0.01029461, "balance_loss_clip": 1.04894257, "balance_loss_mlp": 1.0222075, "epoch": 0.713761798833644, "flos": 18296954138880.0, "grad_norm": 1.6243637620278832, "language_loss": 0.80929494, "learning_rate": 7.994897287193248e-07, "loss": 0.83111346, "num_input_tokens_seen": 127591625, "step": 5936, "time_per_iteration": 2.8929078578948975 }, { "auxiliary_loss_clip": 0.0116763, "auxiliary_loss_mlp": 0.01022866, "balance_loss_clip": 1.04634452, "balance_loss_mlp": 1.01590109, "epoch": 0.713882041724283, "flos": 15558262692480.0, "grad_norm": 2.589627399435198, "language_loss": 0.83825946, "learning_rate": 7.988667915825605e-07, "loss": 0.86016452, "num_input_tokens_seen": 127608690, "step": 5937, "time_per_iteration": 2.7577245235443115 }, { "auxiliary_loss_clip": 0.01165642, "auxiliary_loss_mlp": 0.01025052, "balance_loss_clip": 1.05109394, "balance_loss_mlp": 1.01763082, "epoch": 0.7140022846149221, "flos": 24061011477120.0, "grad_norm": 3.6972189057774805, "language_loss": 0.75570297, "learning_rate": 7.982440366567491e-07, "loss": 0.77760988, "num_input_tokens_seen": 127627180, "step": 5938, "time_per_iteration": 2.8478896617889404 }, { "auxiliary_loss_clip": 0.01159107, "auxiliary_loss_mlp": 0.01023397, "balance_loss_clip": 1.04637742, "balance_loss_mlp": 1.01628029, "epoch": 0.7141225275055613, "flos": 27891710248320.0, "grad_norm": 2.9603008089695817, "language_loss": 0.75242358, "learning_rate": 7.97621464036361e-07, "loss": 0.7742486, "num_input_tokens_seen": 127648940, "step": 5939, "time_per_iteration": 2.900897741317749 }, { "auxiliary_loss_clip": 0.01165632, "auxiliary_loss_mlp": 0.01033381, "balance_loss_clip": 1.0459162, "balance_loss_mlp": 1.02600837, "epoch": 0.7142427703962003, "flos": 19682603147520.0, "grad_norm": 1.6426687977111547, "language_loss": 0.67638731, "learning_rate": 7.969990738158417e-07, "loss": 0.69837749, "num_input_tokens_seen": 127667350, "step": 5940, "time_per_iteration": 2.776427984237671 }, { "auxiliary_loss_clip": 0.01165289, "auxiliary_loss_mlp": 0.01022172, "balance_loss_clip": 1.04692662, "balance_loss_mlp": 1.01455164, "epoch": 0.7143630132868394, "flos": 21032377447680.0, "grad_norm": 2.46136339846221, "language_loss": 0.85128313, "learning_rate": 7.963768660896062e-07, "loss": 0.87315774, "num_input_tokens_seen": 127685760, "step": 5941, "time_per_iteration": 2.8351664543151855 }, { "auxiliary_loss_clip": 0.01168265, "auxiliary_loss_mlp": 0.01026534, "balance_loss_clip": 1.04853094, "balance_loss_mlp": 1.01970339, "epoch": 0.7144832561774785, "flos": 24129923719680.0, "grad_norm": 1.8507254181100263, "language_loss": 0.82350028, "learning_rate": 7.957548409520432e-07, "loss": 0.84544826, "num_input_tokens_seen": 127704985, "step": 5942, "time_per_iteration": 2.7578330039978027 }, { "auxiliary_loss_clip": 0.01159375, "auxiliary_loss_mlp": 0.01026521, "balance_loss_clip": 1.04674685, "balance_loss_mlp": 1.01903188, "epoch": 0.7146034990681176, "flos": 16325817442560.0, "grad_norm": 1.8948463602392922, "language_loss": 0.83907896, "learning_rate": 7.951329984975135e-07, "loss": 0.86093795, "num_input_tokens_seen": 127721925, "step": 5943, "time_per_iteration": 2.815852165222168 }, { "auxiliary_loss_clip": 0.01066445, "auxiliary_loss_mlp": 0.01004239, "balance_loss_clip": 1.00910163, "balance_loss_mlp": 1.00326145, "epoch": 0.7147237419587567, "flos": 69627164232960.0, "grad_norm": 0.7136444800665033, "language_loss": 0.54285765, "learning_rate": 7.94511338820349e-07, "loss": 0.56356454, "num_input_tokens_seen": 127784230, "step": 5944, "time_per_iteration": 3.289297580718994 }, { "auxiliary_loss_clip": 0.01161236, "auxiliary_loss_mlp": 0.01053582, "balance_loss_clip": 1.04731762, "balance_loss_mlp": 1.0184443, "epoch": 0.7148439848493958, "flos": 22266806198400.0, "grad_norm": 2.2658419056196193, "language_loss": 0.78532892, "learning_rate": 7.938898620148575e-07, "loss": 0.80747712, "num_input_tokens_seen": 127801990, "step": 5945, "time_per_iteration": 2.7816050052642822 }, { "auxiliary_loss_clip": 0.01158023, "auxiliary_loss_mlp": 0.01022201, "balance_loss_clip": 1.04716444, "balance_loss_mlp": 1.01519787, "epoch": 0.7149642277400349, "flos": 17931383470080.0, "grad_norm": 1.8626323295314975, "language_loss": 0.70905399, "learning_rate": 7.932685681753135e-07, "loss": 0.7308563, "num_input_tokens_seen": 127819270, "step": 5946, "time_per_iteration": 3.8186652660369873 }, { "auxiliary_loss_clip": 0.01162867, "auxiliary_loss_mlp": 0.01024023, "balance_loss_clip": 1.04575419, "balance_loss_mlp": 1.01641703, "epoch": 0.7150844706306739, "flos": 31681937370240.0, "grad_norm": 2.3423764620657517, "language_loss": 0.62671268, "learning_rate": 7.92647457395969e-07, "loss": 0.6485815, "num_input_tokens_seen": 127841095, "step": 5947, "time_per_iteration": 2.863433837890625 }, { "auxiliary_loss_clip": 0.01164391, "auxiliary_loss_mlp": 0.0102914, "balance_loss_clip": 1.05002129, "balance_loss_mlp": 1.021999, "epoch": 0.7152047135213131, "flos": 10926217451520.0, "grad_norm": 2.1384588665261903, "language_loss": 0.74092638, "learning_rate": 7.920265297710444e-07, "loss": 0.76286173, "num_input_tokens_seen": 127858485, "step": 5948, "time_per_iteration": 2.9225246906280518 }, { "auxiliary_loss_clip": 0.01165294, "auxiliary_loss_mlp": 0.01025782, "balance_loss_clip": 1.0474937, "balance_loss_mlp": 1.01890337, "epoch": 0.7153249564119522, "flos": 20995640812800.0, "grad_norm": 3.1109974497855095, "language_loss": 0.73387486, "learning_rate": 7.914057853947363e-07, "loss": 0.75578558, "num_input_tokens_seen": 127877665, "step": 5949, "time_per_iteration": 2.73861026763916 }, { "auxiliary_loss_clip": 0.01157637, "auxiliary_loss_mlp": 0.01026093, "balance_loss_clip": 1.04796195, "balance_loss_mlp": 1.01862478, "epoch": 0.7154451993025912, "flos": 24243114453120.0, "grad_norm": 1.9988017315409319, "language_loss": 0.62896442, "learning_rate": 7.907852243612089e-07, "loss": 0.65080172, "num_input_tokens_seen": 127898070, "step": 5950, "time_per_iteration": 4.872831106185913 }, { "auxiliary_loss_clip": 0.01158193, "auxiliary_loss_mlp": 0.01023462, "balance_loss_clip": 1.04719627, "balance_loss_mlp": 1.01582026, "epoch": 0.7155654421932304, "flos": 23330947547520.0, "grad_norm": 1.8959248271241453, "language_loss": 0.72473252, "learning_rate": 7.901648467646009e-07, "loss": 0.74654907, "num_input_tokens_seen": 127917010, "step": 5951, "time_per_iteration": 2.9557836055755615 }, { "auxiliary_loss_clip": 0.01169951, "auxiliary_loss_mlp": 0.01028251, "balance_loss_clip": 1.04857802, "balance_loss_mlp": 1.0208714, "epoch": 0.7156856850838694, "flos": 22711883621760.0, "grad_norm": 3.6759056646798514, "language_loss": 0.72538626, "learning_rate": 7.895446526990244e-07, "loss": 0.74736822, "num_input_tokens_seen": 127937025, "step": 5952, "time_per_iteration": 2.7723426818847656 }, { "auxiliary_loss_clip": 0.01166075, "auxiliary_loss_mlp": 0.01024606, "balance_loss_clip": 1.04847336, "balance_loss_mlp": 1.01715517, "epoch": 0.7158059279745085, "flos": 19865424395520.0, "grad_norm": 2.22289359288139, "language_loss": 0.75971091, "learning_rate": 7.889246422585609e-07, "loss": 0.78161776, "num_input_tokens_seen": 127956410, "step": 5953, "time_per_iteration": 2.8446531295776367 }, { "auxiliary_loss_clip": 0.01167001, "auxiliary_loss_mlp": 0.01025482, "balance_loss_clip": 1.04588056, "balance_loss_mlp": 1.0174408, "epoch": 0.7159261708651476, "flos": 24134772055680.0, "grad_norm": 6.614407533257325, "language_loss": 0.73786873, "learning_rate": 7.883048155372675e-07, "loss": 0.75979352, "num_input_tokens_seen": 127974925, "step": 5954, "time_per_iteration": 2.729402780532837 }, { "auxiliary_loss_clip": 0.01165158, "auxiliary_loss_mlp": 0.01027734, "balance_loss_clip": 1.04777145, "balance_loss_mlp": 1.02076602, "epoch": 0.7160464137557867, "flos": 16983198201600.0, "grad_norm": 2.4295934314014827, "language_loss": 0.71382177, "learning_rate": 7.876851726291698e-07, "loss": 0.73575073, "num_input_tokens_seen": 127993225, "step": 5955, "time_per_iteration": 2.735908031463623 }, { "auxiliary_loss_clip": 0.01163302, "auxiliary_loss_mlp": 0.01030277, "balance_loss_clip": 1.04817069, "balance_loss_mlp": 1.02290416, "epoch": 0.7161666566464258, "flos": 25228251838080.0, "grad_norm": 3.844179601907665, "language_loss": 0.78411072, "learning_rate": 7.870657136282666e-07, "loss": 0.80604649, "num_input_tokens_seen": 128012085, "step": 5956, "time_per_iteration": 3.7617292404174805 }, { "auxiliary_loss_clip": 0.01162673, "auxiliary_loss_mlp": 0.0103028, "balance_loss_clip": 1.04875958, "balance_loss_mlp": 1.02279997, "epoch": 0.7162868995370649, "flos": 26468390851200.0, "grad_norm": 1.6178831688687778, "language_loss": 0.82011062, "learning_rate": 7.86446438628531e-07, "loss": 0.84204012, "num_input_tokens_seen": 128033155, "step": 5957, "time_per_iteration": 2.80790114402771 }, { "auxiliary_loss_clip": 0.01063715, "auxiliary_loss_mlp": 0.01001441, "balance_loss_clip": 1.00879002, "balance_loss_mlp": 1.00050545, "epoch": 0.716407142427704, "flos": 69998912040960.0, "grad_norm": 0.7683735350747641, "language_loss": 0.56792164, "learning_rate": 7.858273477239059e-07, "loss": 0.58857322, "num_input_tokens_seen": 128101575, "step": 5958, "time_per_iteration": 3.3869454860687256 }, { "auxiliary_loss_clip": 0.01146272, "auxiliary_loss_mlp": 0.01025185, "balance_loss_clip": 1.0476253, "balance_loss_mlp": 1.01770771, "epoch": 0.716527385318343, "flos": 20740459616640.0, "grad_norm": 3.6068698976791636, "language_loss": 0.70910013, "learning_rate": 7.852084410083067e-07, "loss": 0.7308147, "num_input_tokens_seen": 128120395, "step": 5959, "time_per_iteration": 2.9551608562469482 }, { "auxiliary_loss_clip": 0.01156212, "auxiliary_loss_mlp": 0.01024417, "balance_loss_clip": 1.04646254, "balance_loss_mlp": 1.01726162, "epoch": 0.7166476282089821, "flos": 25371966153600.0, "grad_norm": 2.0623202442871236, "language_loss": 0.63785893, "learning_rate": 7.84589718575621e-07, "loss": 0.65966523, "num_input_tokens_seen": 128140840, "step": 5960, "time_per_iteration": 2.8233721256256104 }, { "auxiliary_loss_clip": 0.0116381, "auxiliary_loss_mlp": 0.01024282, "balance_loss_clip": 1.0456748, "balance_loss_mlp": 1.01675415, "epoch": 0.7167678710996213, "flos": 24133730561280.0, "grad_norm": 2.1290537223729076, "language_loss": 0.68846929, "learning_rate": 7.83971180519708e-07, "loss": 0.71035016, "num_input_tokens_seen": 128159695, "step": 5961, "time_per_iteration": 2.868252754211426 }, { "auxiliary_loss_clip": 0.01170239, "auxiliary_loss_mlp": 0.01024357, "balance_loss_clip": 1.04902601, "balance_loss_mlp": 1.01673341, "epoch": 0.7168881139902603, "flos": 30226586019840.0, "grad_norm": 2.065265477014484, "language_loss": 0.75740528, "learning_rate": 7.833528269344008e-07, "loss": 0.77935123, "num_input_tokens_seen": 128179600, "step": 5962, "time_per_iteration": 2.8968191146850586 }, { "auxiliary_loss_clip": 0.01157589, "auxiliary_loss_mlp": 0.0102662, "balance_loss_clip": 1.04920352, "balance_loss_mlp": 1.01919627, "epoch": 0.7170083568808994, "flos": 14606414236800.0, "grad_norm": 2.3541177616664233, "language_loss": 0.77411449, "learning_rate": 7.827346579135023e-07, "loss": 0.79595661, "num_input_tokens_seen": 128196940, "step": 5963, "time_per_iteration": 2.781446695327759 }, { "auxiliary_loss_clip": 0.01161317, "auxiliary_loss_mlp": 0.01024129, "balance_loss_clip": 1.04764724, "balance_loss_mlp": 1.01669347, "epoch": 0.7171285997715385, "flos": 23331091201920.0, "grad_norm": 2.0636537235421906, "language_loss": 0.83261144, "learning_rate": 7.821166735507885e-07, "loss": 0.85446596, "num_input_tokens_seen": 128215970, "step": 5964, "time_per_iteration": 2.865549325942993 }, { "auxiliary_loss_clip": 0.01167397, "auxiliary_loss_mlp": 0.01026219, "balance_loss_clip": 1.04803729, "balance_loss_mlp": 1.0189178, "epoch": 0.7172488426621776, "flos": 16543543731840.0, "grad_norm": 1.7368510659277998, "language_loss": 0.68566763, "learning_rate": 7.81498873940007e-07, "loss": 0.70760381, "num_input_tokens_seen": 128233185, "step": 5965, "time_per_iteration": 2.7162537574768066 }, { "auxiliary_loss_clip": 0.01170129, "auxiliary_loss_mlp": 0.01025563, "balance_loss_clip": 1.04645658, "balance_loss_mlp": 1.01802266, "epoch": 0.7173690855528166, "flos": 26541612725760.0, "grad_norm": 1.9042998116973573, "language_loss": 0.77043742, "learning_rate": 7.808812591748768e-07, "loss": 0.7923944, "num_input_tokens_seen": 128253565, "step": 5966, "time_per_iteration": 2.87121844291687 }, { "auxiliary_loss_clip": 0.01156026, "auxiliary_loss_mlp": 0.01027152, "balance_loss_clip": 1.04735231, "balance_loss_mlp": 1.01980281, "epoch": 0.7174893284434558, "flos": 22784099915520.0, "grad_norm": 2.37657087354416, "language_loss": 0.65495455, "learning_rate": 7.802638293490915e-07, "loss": 0.67678624, "num_input_tokens_seen": 128273210, "step": 5967, "time_per_iteration": 2.8074142932891846 }, { "auxiliary_loss_clip": 0.01169926, "auxiliary_loss_mlp": 0.01023557, "balance_loss_clip": 1.0509243, "balance_loss_mlp": 1.01603794, "epoch": 0.7176095713340949, "flos": 23293564467840.0, "grad_norm": 1.8326225307939246, "language_loss": 0.76922178, "learning_rate": 7.796465845563123e-07, "loss": 0.79115665, "num_input_tokens_seen": 128292085, "step": 5968, "time_per_iteration": 2.778183937072754 }, { "auxiliary_loss_clip": 0.01159778, "auxiliary_loss_mlp": 0.01051427, "balance_loss_clip": 1.04978824, "balance_loss_mlp": 1.01439893, "epoch": 0.7177298142247339, "flos": 25591631777280.0, "grad_norm": 1.8870766900003462, "language_loss": 0.79646897, "learning_rate": 7.790295248901766e-07, "loss": 0.81858099, "num_input_tokens_seen": 128313215, "step": 5969, "time_per_iteration": 2.7708146572113037 }, { "auxiliary_loss_clip": 0.01165147, "auxiliary_loss_mlp": 0.01030598, "balance_loss_clip": 1.04897022, "balance_loss_mlp": 1.02338839, "epoch": 0.7178500571153731, "flos": 31652778504960.0, "grad_norm": 1.6291996359040999, "language_loss": 0.62068474, "learning_rate": 7.784126504442902e-07, "loss": 0.6426422, "num_input_tokens_seen": 128336445, "step": 5970, "time_per_iteration": 2.821007490158081 }, { "auxiliary_loss_clip": 0.01151523, "auxiliary_loss_mlp": 0.01024684, "balance_loss_clip": 1.0470438, "balance_loss_mlp": 1.01701629, "epoch": 0.7179703000060121, "flos": 19427242383360.0, "grad_norm": 1.3888722204376711, "language_loss": 0.67882645, "learning_rate": 7.777959613122351e-07, "loss": 0.70058852, "num_input_tokens_seen": 128356270, "step": 5971, "time_per_iteration": 2.7427501678466797 }, { "auxiliary_loss_clip": 0.0115282, "auxiliary_loss_mlp": 0.01028335, "balance_loss_clip": 1.04654205, "balance_loss_mlp": 1.02106357, "epoch": 0.7180905428966512, "flos": 28839249072000.0, "grad_norm": 1.7818037830287328, "language_loss": 0.7791909, "learning_rate": 7.771794575875604e-07, "loss": 0.8010025, "num_input_tokens_seen": 128378140, "step": 5972, "time_per_iteration": 3.915187358856201 }, { "auxiliary_loss_clip": 0.01163499, "auxiliary_loss_mlp": 0.01029671, "balance_loss_clip": 1.04799271, "balance_loss_mlp": 1.02165151, "epoch": 0.7182107857872904, "flos": 20047563285120.0, "grad_norm": 2.27860495289929, "language_loss": 0.77464145, "learning_rate": 7.765631393637888e-07, "loss": 0.79657316, "num_input_tokens_seen": 128396335, "step": 5973, "time_per_iteration": 2.8475351333618164 }, { "auxiliary_loss_clip": 0.0116244, "auxiliary_loss_mlp": 0.01026658, "balance_loss_clip": 1.04728246, "balance_loss_mlp": 1.01900434, "epoch": 0.7183310286779294, "flos": 22747686503040.0, "grad_norm": 5.086676494258922, "language_loss": 0.49601719, "learning_rate": 7.75947006734417e-07, "loss": 0.51790822, "num_input_tokens_seen": 128414115, "step": 5974, "time_per_iteration": 2.7154476642608643 }, { "auxiliary_loss_clip": 0.01166529, "auxiliary_loss_mlp": 0.01023129, "balance_loss_clip": 1.04542732, "balance_loss_mlp": 1.01563966, "epoch": 0.7184512715685685, "flos": 17158262112000.0, "grad_norm": 2.1331622386273703, "language_loss": 0.82907695, "learning_rate": 7.753310597929101e-07, "loss": 0.85097349, "num_input_tokens_seen": 128430755, "step": 5975, "time_per_iteration": 2.6997265815734863 }, { "auxiliary_loss_clip": 0.01063334, "auxiliary_loss_mlp": 0.01001902, "balance_loss_clip": 1.00886512, "balance_loss_mlp": 1.00093079, "epoch": 0.7185715144592076, "flos": 65509611448320.0, "grad_norm": 0.7567171461397487, "language_loss": 0.55058122, "learning_rate": 7.747152986327095e-07, "loss": 0.57123357, "num_input_tokens_seen": 128491300, "step": 5976, "time_per_iteration": 4.160888671875 }, { "auxiliary_loss_clip": 0.01161998, "auxiliary_loss_mlp": 0.01024916, "balance_loss_clip": 1.04959059, "balance_loss_mlp": 1.01789451, "epoch": 0.7186917573498467, "flos": 16180522928640.0, "grad_norm": 1.827157321609325, "language_loss": 0.68124676, "learning_rate": 7.740997233472228e-07, "loss": 0.70311588, "num_input_tokens_seen": 128508920, "step": 5977, "time_per_iteration": 3.8863909244537354 }, { "auxiliary_loss_clip": 0.01159201, "auxiliary_loss_mlp": 0.0101933, "balance_loss_clip": 1.04534006, "balance_loss_mlp": 1.01218343, "epoch": 0.7188120002404857, "flos": 29242274647680.0, "grad_norm": 2.122185919957893, "language_loss": 0.70941186, "learning_rate": 7.734843340298329e-07, "loss": 0.73119718, "num_input_tokens_seen": 128528745, "step": 5978, "time_per_iteration": 2.8083693981170654 }, { "auxiliary_loss_clip": 0.01167343, "auxiliary_loss_mlp": 0.0102652, "balance_loss_clip": 1.04565907, "balance_loss_mlp": 1.01859272, "epoch": 0.7189322431311249, "flos": 33401161008000.0, "grad_norm": 2.0764267268127834, "language_loss": 0.74931526, "learning_rate": 7.72869130773895e-07, "loss": 0.77125388, "num_input_tokens_seen": 128549345, "step": 5979, "time_per_iteration": 2.9367806911468506 }, { "auxiliary_loss_clip": 0.01064346, "auxiliary_loss_mlp": 0.01002656, "balance_loss_clip": 1.00877309, "balance_loss_mlp": 1.00176167, "epoch": 0.719052486021764, "flos": 61351263792000.0, "grad_norm": 0.8047773653041673, "language_loss": 0.59361011, "learning_rate": 7.722541136727343e-07, "loss": 0.6142801, "num_input_tokens_seen": 128605360, "step": 5980, "time_per_iteration": 3.138794183731079 }, { "auxiliary_loss_clip": 0.0116533, "auxiliary_loss_mlp": 0.01026271, "balance_loss_clip": 1.04711211, "balance_loss_mlp": 1.01908898, "epoch": 0.719172728912403, "flos": 15596795007360.0, "grad_norm": 1.9104654104593246, "language_loss": 0.80995309, "learning_rate": 7.716392828196483e-07, "loss": 0.83186913, "num_input_tokens_seen": 128623160, "step": 5981, "time_per_iteration": 2.750525951385498 }, { "auxiliary_loss_clip": 0.01164182, "auxiliary_loss_mlp": 0.01029379, "balance_loss_clip": 1.0472784, "balance_loss_mlp": 1.0220511, "epoch": 0.7192929718030422, "flos": 15553162961280.0, "grad_norm": 4.414464337437229, "language_loss": 0.77569997, "learning_rate": 7.710246383079064e-07, "loss": 0.79763556, "num_input_tokens_seen": 128638545, "step": 5982, "time_per_iteration": 2.7050490379333496 }, { "auxiliary_loss_clip": 0.01163242, "auxiliary_loss_mlp": 0.01029597, "balance_loss_clip": 1.04609096, "balance_loss_mlp": 1.02204561, "epoch": 0.7194132146936812, "flos": 21862487733120.0, "grad_norm": 2.1898486757392184, "language_loss": 0.91937518, "learning_rate": 7.704101802307492e-07, "loss": 0.94130361, "num_input_tokens_seen": 128650845, "step": 5983, "time_per_iteration": 3.7663683891296387 }, { "auxiliary_loss_clip": 0.01157685, "auxiliary_loss_mlp": 0.01028113, "balance_loss_clip": 1.05117679, "balance_loss_mlp": 1.02085936, "epoch": 0.7195334575843203, "flos": 27338900958720.0, "grad_norm": 5.0418245720842965, "language_loss": 0.86884171, "learning_rate": 7.697959086813912e-07, "loss": 0.89069974, "num_input_tokens_seen": 128667010, "step": 5984, "time_per_iteration": 2.864689588546753 }, { "auxiliary_loss_clip": 0.01154069, "auxiliary_loss_mlp": 0.0102227, "balance_loss_clip": 1.0475173, "balance_loss_mlp": 1.01472998, "epoch": 0.7196537004749595, "flos": 18770615809920.0, "grad_norm": 1.8776056968245598, "language_loss": 0.79936761, "learning_rate": 7.691818237530145e-07, "loss": 0.82113099, "num_input_tokens_seen": 128685870, "step": 5985, "time_per_iteration": 2.8205034732818604 }, { "auxiliary_loss_clip": 0.01170311, "auxiliary_loss_mlp": 0.01025341, "balance_loss_clip": 1.05054832, "balance_loss_mlp": 1.0179441, "epoch": 0.7197739433655985, "flos": 24531009960960.0, "grad_norm": 1.9281727206071297, "language_loss": 0.7747243, "learning_rate": 7.685679255387774e-07, "loss": 0.79668081, "num_input_tokens_seen": 128704185, "step": 5986, "time_per_iteration": 2.980041265487671 }, { "auxiliary_loss_clip": 0.01157239, "auxiliary_loss_mlp": 0.01021773, "balance_loss_clip": 1.04672217, "balance_loss_mlp": 1.01457882, "epoch": 0.7198941862562376, "flos": 18040587793920.0, "grad_norm": 1.8337366563425326, "language_loss": 0.77145469, "learning_rate": 7.679542141318065e-07, "loss": 0.79324484, "num_input_tokens_seen": 128721290, "step": 5987, "time_per_iteration": 2.815119981765747 }, { "auxiliary_loss_clip": 0.01151132, "auxiliary_loss_mlp": 0.01029579, "balance_loss_clip": 1.04625487, "balance_loss_mlp": 1.02256048, "epoch": 0.7200144291468767, "flos": 29022393542400.0, "grad_norm": 1.8864425001718894, "language_loss": 0.75504804, "learning_rate": 7.673406896252013e-07, "loss": 0.77685517, "num_input_tokens_seen": 128742665, "step": 5988, "time_per_iteration": 2.8835034370422363 }, { "auxiliary_loss_clip": 0.01160208, "auxiliary_loss_mlp": 0.01034109, "balance_loss_clip": 1.04853058, "balance_loss_mlp": 1.0261395, "epoch": 0.7201346720375158, "flos": 25374264624000.0, "grad_norm": 1.7102842734602304, "language_loss": 0.78144729, "learning_rate": 7.667273521120347e-07, "loss": 0.80339038, "num_input_tokens_seen": 128762225, "step": 5989, "time_per_iteration": 2.830000638961792 }, { "auxiliary_loss_clip": 0.01161187, "auxiliary_loss_mlp": 0.01027135, "balance_loss_clip": 1.04723358, "balance_loss_mlp": 1.01964843, "epoch": 0.7202549149281549, "flos": 14355614499840.0, "grad_norm": 1.891883604717989, "language_loss": 0.79458392, "learning_rate": 7.661142016853468e-07, "loss": 0.81646717, "num_input_tokens_seen": 128779585, "step": 5990, "time_per_iteration": 2.7443957328796387 }, { "auxiliary_loss_clip": 0.01153104, "auxiliary_loss_mlp": 0.01023647, "balance_loss_clip": 1.04751539, "balance_loss_mlp": 1.01604474, "epoch": 0.7203751578187939, "flos": 23001682550400.0, "grad_norm": 2.06668777843873, "language_loss": 0.75142789, "learning_rate": 7.655012384381543e-07, "loss": 0.77319539, "num_input_tokens_seen": 128799070, "step": 5991, "time_per_iteration": 2.8856558799743652 }, { "auxiliary_loss_clip": 0.01157354, "auxiliary_loss_mlp": 0.01024679, "balance_loss_clip": 1.04864001, "balance_loss_mlp": 1.0176065, "epoch": 0.7204954007094331, "flos": 23692424065920.0, "grad_norm": 2.183044149338877, "language_loss": 0.82165265, "learning_rate": 7.648884624634415e-07, "loss": 0.84347302, "num_input_tokens_seen": 128817620, "step": 5992, "time_per_iteration": 2.8651363849639893 }, { "auxiliary_loss_clip": 0.01161164, "auxiliary_loss_mlp": 0.01021741, "balance_loss_clip": 1.04661477, "balance_loss_mlp": 1.01443911, "epoch": 0.7206156436000721, "flos": 16253026531200.0, "grad_norm": 2.359321676058426, "language_loss": 0.89065087, "learning_rate": 7.642758738541683e-07, "loss": 0.91247988, "num_input_tokens_seen": 128834200, "step": 5993, "time_per_iteration": 2.8753433227539062 }, { "auxiliary_loss_clip": 0.01063561, "auxiliary_loss_mlp": 0.01001062, "balance_loss_clip": 1.00917959, "balance_loss_mlp": 1.00001323, "epoch": 0.7207358864907112, "flos": 54377806504320.0, "grad_norm": 0.7765372974820913, "language_loss": 0.60759842, "learning_rate": 7.636634727032621e-07, "loss": 0.62824464, "num_input_tokens_seen": 128891305, "step": 5994, "time_per_iteration": 3.1725738048553467 }, { "auxiliary_loss_clip": 0.01166802, "auxiliary_loss_mlp": 0.01027103, "balance_loss_clip": 1.04997611, "balance_loss_mlp": 1.01907396, "epoch": 0.7208561293813504, "flos": 19135540033920.0, "grad_norm": 2.0217303483869387, "language_loss": 0.7849263, "learning_rate": 7.630512591036231e-07, "loss": 0.80686533, "num_input_tokens_seen": 128910615, "step": 5995, "time_per_iteration": 2.8842456340789795 }, { "auxiliary_loss_clip": 0.01166436, "auxiliary_loss_mlp": 0.01028656, "balance_loss_clip": 1.04847264, "balance_loss_mlp": 1.02000141, "epoch": 0.7209763722719894, "flos": 17748526308480.0, "grad_norm": 2.866137112393758, "language_loss": 0.65021443, "learning_rate": 7.624392331481255e-07, "loss": 0.67216539, "num_input_tokens_seen": 128928270, "step": 5996, "time_per_iteration": 2.820897102355957 }, { "auxiliary_loss_clip": 0.01062034, "auxiliary_loss_mlp": 0.0100471, "balance_loss_clip": 1.00981688, "balance_loss_mlp": 1.00364339, "epoch": 0.7210966151626285, "flos": 66819488716800.0, "grad_norm": 0.7550813241671385, "language_loss": 0.51831532, "learning_rate": 7.618273949296115e-07, "loss": 0.53898275, "num_input_tokens_seen": 128987780, "step": 5997, "time_per_iteration": 3.264667510986328 }, { "auxiliary_loss_clip": 0.01159163, "auxiliary_loss_mlp": 0.01021721, "balance_loss_clip": 1.04795969, "balance_loss_mlp": 1.01427042, "epoch": 0.7212168580532676, "flos": 21141869080320.0, "grad_norm": 4.076666640327178, "language_loss": 0.6859535, "learning_rate": 7.612157445408987e-07, "loss": 0.70776236, "num_input_tokens_seen": 129005590, "step": 5998, "time_per_iteration": 2.8672702312469482 }, { "auxiliary_loss_clip": 0.01166913, "auxiliary_loss_mlp": 0.01025599, "balance_loss_clip": 1.05062521, "balance_loss_mlp": 1.01756477, "epoch": 0.7213371009439067, "flos": 22345738335360.0, "grad_norm": 2.1242201006605512, "language_loss": 0.74065137, "learning_rate": 7.606042820747716e-07, "loss": 0.76257652, "num_input_tokens_seen": 129021995, "step": 5999, "time_per_iteration": 3.8735995292663574 }, { "auxiliary_loss_clip": 0.01165961, "auxiliary_loss_mlp": 0.0102435, "balance_loss_clip": 1.04867458, "balance_loss_mlp": 1.0172689, "epoch": 0.7214573438345457, "flos": 18515901490560.0, "grad_norm": 1.9407699188564906, "language_loss": 0.85337245, "learning_rate": 7.599930076239889e-07, "loss": 0.87527555, "num_input_tokens_seen": 129039280, "step": 6000, "time_per_iteration": 2.7108876705169678 }, { "auxiliary_loss_clip": 0.01161855, "auxiliary_loss_mlp": 0.0105231, "balance_loss_clip": 1.05107272, "balance_loss_mlp": 1.01516676, "epoch": 0.7215775867251849, "flos": 35736108606720.0, "grad_norm": 1.8613668143102022, "language_loss": 0.70860565, "learning_rate": 7.593819212812818e-07, "loss": 0.73074722, "num_input_tokens_seen": 129060860, "step": 6001, "time_per_iteration": 2.9491634368896484 }, { "auxiliary_loss_clip": 0.01164205, "auxiliary_loss_mlp": 0.01027168, "balance_loss_clip": 1.04822183, "balance_loss_mlp": 1.01960385, "epoch": 0.721697829615824, "flos": 20372410909440.0, "grad_norm": 13.749506898465768, "language_loss": 0.71541941, "learning_rate": 7.587710231393508e-07, "loss": 0.73733306, "num_input_tokens_seen": 129079215, "step": 6002, "time_per_iteration": 4.7280519008636475 }, { "auxiliary_loss_clip": 0.01139819, "auxiliary_loss_mlp": 0.01020477, "balance_loss_clip": 1.04756713, "balance_loss_mlp": 1.01353621, "epoch": 0.721818072506463, "flos": 20229809915520.0, "grad_norm": 1.9067227164861493, "language_loss": 0.83657002, "learning_rate": 7.581603132908685e-07, "loss": 0.85817295, "num_input_tokens_seen": 129097185, "step": 6003, "time_per_iteration": 2.8819358348846436 }, { "auxiliary_loss_clip": 0.01155704, "auxiliary_loss_mlp": 0.01028937, "balance_loss_clip": 1.04889345, "balance_loss_mlp": 1.02036548, "epoch": 0.7219383153971022, "flos": 18186887888640.0, "grad_norm": 1.992434137985826, "language_loss": 0.78318322, "learning_rate": 7.575497918284795e-07, "loss": 0.80502963, "num_input_tokens_seen": 129114730, "step": 6004, "time_per_iteration": 2.8862555027008057 }, { "auxiliary_loss_clip": 0.01172641, "auxiliary_loss_mlp": 0.01027701, "balance_loss_clip": 1.04851842, "balance_loss_mlp": 1.01949942, "epoch": 0.7220585582877412, "flos": 17342124854400.0, "grad_norm": 1.9468901088155615, "language_loss": 0.74592352, "learning_rate": 7.569394588447984e-07, "loss": 0.76792693, "num_input_tokens_seen": 129131745, "step": 6005, "time_per_iteration": 2.854177236557007 }, { "auxiliary_loss_clip": 0.01157278, "auxiliary_loss_mlp": 0.01021542, "balance_loss_clip": 1.04633713, "balance_loss_mlp": 1.01420498, "epoch": 0.7221788011783803, "flos": 16976338704000.0, "grad_norm": 2.3912732894488493, "language_loss": 0.7807194, "learning_rate": 7.563293144324146e-07, "loss": 0.80250764, "num_input_tokens_seen": 129147295, "step": 6006, "time_per_iteration": 2.7824416160583496 }, { "auxiliary_loss_clip": 0.01168141, "auxiliary_loss_mlp": 0.0102857, "balance_loss_clip": 1.04936099, "balance_loss_mlp": 1.02162623, "epoch": 0.7222990440690195, "flos": 26286359702400.0, "grad_norm": 2.1768064577920914, "language_loss": 0.80249363, "learning_rate": 7.557193586838834e-07, "loss": 0.82446074, "num_input_tokens_seen": 129162660, "step": 6007, "time_per_iteration": 2.7804620265960693 }, { "auxiliary_loss_clip": 0.01161772, "auxiliary_loss_mlp": 0.01026479, "balance_loss_clip": 1.04461825, "balance_loss_mlp": 1.0189743, "epoch": 0.7224192869596585, "flos": 17601687509760.0, "grad_norm": 2.7034067414102156, "language_loss": 0.69865346, "learning_rate": 7.551095916917371e-07, "loss": 0.72053599, "num_input_tokens_seen": 129179990, "step": 6008, "time_per_iteration": 2.727736234664917 }, { "auxiliary_loss_clip": 0.01172286, "auxiliary_loss_mlp": 0.01026019, "balance_loss_clip": 1.05069232, "balance_loss_mlp": 1.01824617, "epoch": 0.7225395298502976, "flos": 12932331016320.0, "grad_norm": 2.3298822864133393, "language_loss": 0.66584051, "learning_rate": 7.545000135484758e-07, "loss": 0.68782353, "num_input_tokens_seen": 129197425, "step": 6009, "time_per_iteration": 3.7818708419799805 }, { "auxiliary_loss_clip": 0.01168506, "auxiliary_loss_mlp": 0.01056316, "balance_loss_clip": 1.04799414, "balance_loss_mlp": 1.01863146, "epoch": 0.7226597727409367, "flos": 29643899592960.0, "grad_norm": 2.2079363598213386, "language_loss": 0.62706959, "learning_rate": 7.538906243465714e-07, "loss": 0.64931774, "num_input_tokens_seen": 129217560, "step": 6010, "time_per_iteration": 2.7928144931793213 }, { "auxiliary_loss_clip": 0.01169988, "auxiliary_loss_mlp": 0.01025532, "balance_loss_clip": 1.04942369, "balance_loss_mlp": 1.01802135, "epoch": 0.7227800156315758, "flos": 13771635183360.0, "grad_norm": 2.1006058775141274, "language_loss": 0.78777003, "learning_rate": 7.5328142417847e-07, "loss": 0.80972522, "num_input_tokens_seen": 129234325, "step": 6011, "time_per_iteration": 2.735741376876831 }, { "auxiliary_loss_clip": 0.01163937, "auxiliary_loss_mlp": 0.01024049, "balance_loss_clip": 1.04675376, "balance_loss_mlp": 1.01702726, "epoch": 0.7229002585222148, "flos": 20301882554880.0, "grad_norm": 1.8554370299700842, "language_loss": 0.69323826, "learning_rate": 7.526724131365838e-07, "loss": 0.71511811, "num_input_tokens_seen": 129255280, "step": 6012, "time_per_iteration": 2.7812516689300537 }, { "auxiliary_loss_clip": 0.01164853, "auxiliary_loss_mlp": 0.01026152, "balance_loss_clip": 1.05151498, "balance_loss_mlp": 1.01868927, "epoch": 0.723020501412854, "flos": 16581250033920.0, "grad_norm": 1.7219457304545855, "language_loss": 0.70466387, "learning_rate": 7.520635913133017e-07, "loss": 0.72657394, "num_input_tokens_seen": 129273910, "step": 6013, "time_per_iteration": 2.739651679992676 }, { "auxiliary_loss_clip": 0.01169046, "auxiliary_loss_mlp": 0.01028601, "balance_loss_clip": 1.04680061, "balance_loss_mlp": 1.0199883, "epoch": 0.7231407443034931, "flos": 28548300908160.0, "grad_norm": 1.8328583386829993, "language_loss": 0.82369268, "learning_rate": 7.514549588009798e-07, "loss": 0.84566921, "num_input_tokens_seen": 129294785, "step": 6014, "time_per_iteration": 2.884300947189331 }, { "auxiliary_loss_clip": 0.01165164, "auxiliary_loss_mlp": 0.01025734, "balance_loss_clip": 1.04850554, "balance_loss_mlp": 1.01823831, "epoch": 0.7232609871941321, "flos": 30008536508160.0, "grad_norm": 2.664399220313889, "language_loss": 0.70510954, "learning_rate": 7.508465156919492e-07, "loss": 0.72701854, "num_input_tokens_seen": 129318295, "step": 6015, "time_per_iteration": 2.8702681064605713 }, { "auxiliary_loss_clip": 0.01162446, "auxiliary_loss_mlp": 0.01027374, "balance_loss_clip": 1.04845309, "balance_loss_mlp": 1.01988173, "epoch": 0.7233812300847713, "flos": 16654005031680.0, "grad_norm": 2.732704897737856, "language_loss": 0.613837, "learning_rate": 7.502382620785083e-07, "loss": 0.63573527, "num_input_tokens_seen": 129334845, "step": 6016, "time_per_iteration": 2.8903861045837402 }, { "auxiliary_loss_clip": 0.01061027, "auxiliary_loss_mlp": 0.01001253, "balance_loss_clip": 1.014624, "balance_loss_mlp": 1.00041807, "epoch": 0.7235014729754103, "flos": 67258784050560.0, "grad_norm": 0.809738659947463, "language_loss": 0.62461925, "learning_rate": 7.496301980529289e-07, "loss": 0.64524204, "num_input_tokens_seen": 129398055, "step": 6017, "time_per_iteration": 3.4681313037872314 }, { "auxiliary_loss_clip": 0.01169467, "auxiliary_loss_mlp": 0.01022999, "balance_loss_clip": 1.04790092, "balance_loss_mlp": 1.01559591, "epoch": 0.7236217158660494, "flos": 26943237671040.0, "grad_norm": 2.234193103133683, "language_loss": 0.74558085, "learning_rate": 7.490223237074547e-07, "loss": 0.76750553, "num_input_tokens_seen": 129417765, "step": 6018, "time_per_iteration": 2.723780870437622 }, { "auxiliary_loss_clip": 0.01159156, "auxiliary_loss_mlp": 0.01025764, "balance_loss_clip": 1.04754329, "balance_loss_mlp": 1.01813507, "epoch": 0.7237419587566886, "flos": 29423372042880.0, "grad_norm": 2.2108103922996016, "language_loss": 0.66365772, "learning_rate": 7.484146391342989e-07, "loss": 0.68550694, "num_input_tokens_seen": 129437560, "step": 6019, "time_per_iteration": 2.874180555343628 }, { "auxiliary_loss_clip": 0.01156248, "auxiliary_loss_mlp": 0.01023329, "balance_loss_clip": 1.04657423, "balance_loss_mlp": 1.0160718, "epoch": 0.7238622016473276, "flos": 17821496787840.0, "grad_norm": 6.26569628072551, "language_loss": 0.56847596, "learning_rate": 7.478071444256484e-07, "loss": 0.59027177, "num_input_tokens_seen": 129455320, "step": 6020, "time_per_iteration": 2.704909563064575 }, { "auxiliary_loss_clip": 0.01170205, "auxiliary_loss_mlp": 0.01024228, "balance_loss_clip": 1.04902339, "balance_loss_mlp": 1.01648259, "epoch": 0.7239824445379667, "flos": 25739117020800.0, "grad_norm": 2.1846689054521575, "language_loss": 0.79556751, "learning_rate": 7.471998396736579e-07, "loss": 0.81751192, "num_input_tokens_seen": 129475700, "step": 6021, "time_per_iteration": 2.84381103515625 }, { "auxiliary_loss_clip": 0.01160549, "auxiliary_loss_mlp": 0.01024658, "balance_loss_clip": 1.04743731, "balance_loss_mlp": 1.01689744, "epoch": 0.7241026874286057, "flos": 23148916398720.0, "grad_norm": 1.7200905370190998, "language_loss": 0.76161182, "learning_rate": 7.465927249704549e-07, "loss": 0.78346395, "num_input_tokens_seen": 129493585, "step": 6022, "time_per_iteration": 2.8372550010681152 }, { "auxiliary_loss_clip": 0.01163648, "auxiliary_loss_mlp": 0.01019184, "balance_loss_clip": 1.04600441, "balance_loss_mlp": 1.01194191, "epoch": 0.7242229303192449, "flos": 20266905686400.0, "grad_norm": 2.846664345156511, "language_loss": 0.77576041, "learning_rate": 7.459858004081398e-07, "loss": 0.79758877, "num_input_tokens_seen": 129511555, "step": 6023, "time_per_iteration": 2.7611570358276367 }, { "auxiliary_loss_clip": 0.01057263, "auxiliary_loss_mlp": 0.01005137, "balance_loss_clip": 1.01198268, "balance_loss_mlp": 1.00426066, "epoch": 0.724343173209884, "flos": 62311659684480.0, "grad_norm": 0.6593829111241446, "language_loss": 0.58006537, "learning_rate": 7.453790660787815e-07, "loss": 0.60068935, "num_input_tokens_seen": 129579650, "step": 6024, "time_per_iteration": 3.4963624477386475 }, { "auxiliary_loss_clip": 0.0116605, "auxiliary_loss_mlp": 0.01024372, "balance_loss_clip": 1.0502038, "balance_loss_mlp": 1.01683784, "epoch": 0.724463416100523, "flos": 35006403813120.0, "grad_norm": 2.121617655502003, "language_loss": 0.6352911, "learning_rate": 7.447725220744214e-07, "loss": 0.65719533, "num_input_tokens_seen": 129601895, "step": 6025, "time_per_iteration": 3.8012242317199707 }, { "auxiliary_loss_clip": 0.01169455, "auxiliary_loss_mlp": 0.01024499, "balance_loss_clip": 1.04773211, "balance_loss_mlp": 1.016801, "epoch": 0.7245836589911622, "flos": 21871968923520.0, "grad_norm": 2.7912754742959796, "language_loss": 0.77650148, "learning_rate": 7.441661684870717e-07, "loss": 0.79844105, "num_input_tokens_seen": 129622150, "step": 6026, "time_per_iteration": 2.7191760540008545 }, { "auxiliary_loss_clip": 0.0116825, "auxiliary_loss_mlp": 0.01021423, "balance_loss_clip": 1.04728007, "balance_loss_mlp": 1.01413345, "epoch": 0.7247039018818012, "flos": 23006494972800.0, "grad_norm": 1.7906397717673204, "language_loss": 0.82104278, "learning_rate": 7.435600054087152e-07, "loss": 0.84293956, "num_input_tokens_seen": 129644315, "step": 6027, "time_per_iteration": 2.769376277923584 }, { "auxiliary_loss_clip": 0.01170111, "auxiliary_loss_mlp": 0.01026981, "balance_loss_clip": 1.04905272, "balance_loss_mlp": 1.01955438, "epoch": 0.7248241447724403, "flos": 31722588587520.0, "grad_norm": 2.057561317807921, "language_loss": 0.74322921, "learning_rate": 7.42954032931308e-07, "loss": 0.76520014, "num_input_tokens_seen": 129665355, "step": 6028, "time_per_iteration": 4.654143333435059 }, { "auxiliary_loss_clip": 0.01164245, "auxiliary_loss_mlp": 0.01027475, "balance_loss_clip": 1.04773808, "balance_loss_mlp": 1.01978648, "epoch": 0.7249443876630794, "flos": 34896984007680.0, "grad_norm": 2.480415579617318, "language_loss": 0.74773514, "learning_rate": 7.423482511467733e-07, "loss": 0.76965237, "num_input_tokens_seen": 129686125, "step": 6029, "time_per_iteration": 2.8684589862823486 }, { "auxiliary_loss_clip": 0.01146951, "auxiliary_loss_mlp": 0.01025733, "balance_loss_clip": 1.0500133, "balance_loss_mlp": 1.01825535, "epoch": 0.7250646305537185, "flos": 26359294268160.0, "grad_norm": 3.819170602520221, "language_loss": 0.64921856, "learning_rate": 7.417426601470099e-07, "loss": 0.67094541, "num_input_tokens_seen": 129706485, "step": 6030, "time_per_iteration": 2.821038246154785 }, { "auxiliary_loss_clip": 0.01166338, "auxiliary_loss_mlp": 0.0102712, "balance_loss_clip": 1.04743958, "balance_loss_mlp": 1.01944888, "epoch": 0.7251848734443576, "flos": 30081614728320.0, "grad_norm": 2.08062792769424, "language_loss": 0.78704035, "learning_rate": 7.411372600238841e-07, "loss": 0.80897492, "num_input_tokens_seen": 129727100, "step": 6031, "time_per_iteration": 2.7476999759674072 }, { "auxiliary_loss_clip": 0.01168361, "auxiliary_loss_mlp": 0.01023395, "balance_loss_clip": 1.04734194, "balance_loss_mlp": 1.01612937, "epoch": 0.7253051163349967, "flos": 17785262943360.0, "grad_norm": 3.3985701658677137, "language_loss": 0.73878646, "learning_rate": 7.405320508692346e-07, "loss": 0.7607041, "num_input_tokens_seen": 129745840, "step": 6032, "time_per_iteration": 2.669234037399292 }, { "auxiliary_loss_clip": 0.0116238, "auxiliary_loss_mlp": 0.01031978, "balance_loss_clip": 1.04495001, "balance_loss_mlp": 1.02475393, "epoch": 0.7254253592256358, "flos": 12641346938880.0, "grad_norm": 1.8228332553466435, "language_loss": 0.75255895, "learning_rate": 7.399270327748727e-07, "loss": 0.77450252, "num_input_tokens_seen": 129763500, "step": 6033, "time_per_iteration": 2.682520627975464 }, { "auxiliary_loss_clip": 0.01161627, "auxiliary_loss_mlp": 0.01055789, "balance_loss_clip": 1.04809248, "balance_loss_mlp": 1.01999712, "epoch": 0.7255456021162748, "flos": 27199208966400.0, "grad_norm": 1.9304846602131776, "language_loss": 0.74401498, "learning_rate": 7.39322205832577e-07, "loss": 0.7661891, "num_input_tokens_seen": 129784390, "step": 6034, "time_per_iteration": 2.825418710708618 }, { "auxiliary_loss_clip": 0.01159597, "auxiliary_loss_mlp": 0.01025703, "balance_loss_clip": 1.0489862, "balance_loss_mlp": 1.01800811, "epoch": 0.725665845006914, "flos": 21288205088640.0, "grad_norm": 1.8858411339941878, "language_loss": 0.80928588, "learning_rate": 7.387175701341009e-07, "loss": 0.83113885, "num_input_tokens_seen": 129803060, "step": 6035, "time_per_iteration": 3.6585731506347656 }, { "auxiliary_loss_clip": 0.01164283, "auxiliary_loss_mlp": 0.01025855, "balance_loss_clip": 1.04572082, "balance_loss_mlp": 1.01869035, "epoch": 0.7257860878975531, "flos": 16033684129920.0, "grad_norm": 2.5477559423143457, "language_loss": 0.72075713, "learning_rate": 7.381131257711659e-07, "loss": 0.74265844, "num_input_tokens_seen": 129820165, "step": 6036, "time_per_iteration": 2.772444725036621 }, { "auxiliary_loss_clip": 0.01156252, "auxiliary_loss_mlp": 0.0102711, "balance_loss_clip": 1.04775929, "balance_loss_mlp": 1.02012408, "epoch": 0.7259063307881921, "flos": 12129943052160.0, "grad_norm": 1.8587496034105286, "language_loss": 0.83895528, "learning_rate": 7.375088728354677e-07, "loss": 0.86078888, "num_input_tokens_seen": 129835195, "step": 6037, "time_per_iteration": 2.756169080734253 }, { "auxiliary_loss_clip": 0.01161511, "auxiliary_loss_mlp": 0.01029738, "balance_loss_clip": 1.04682302, "balance_loss_mlp": 1.02257323, "epoch": 0.7260265736788313, "flos": 30443845432320.0, "grad_norm": 1.630659577753891, "language_loss": 0.67481387, "learning_rate": 7.369048114186691e-07, "loss": 0.69672632, "num_input_tokens_seen": 129856240, "step": 6038, "time_per_iteration": 2.913715362548828 }, { "auxiliary_loss_clip": 0.01167266, "auxiliary_loss_mlp": 0.01049616, "balance_loss_clip": 1.04987466, "balance_loss_mlp": 1.01341939, "epoch": 0.7261468165694703, "flos": 21142264129920.0, "grad_norm": 1.9393919281942653, "language_loss": 0.83073288, "learning_rate": 7.363009416124055e-07, "loss": 0.8529017, "num_input_tokens_seen": 129875565, "step": 6039, "time_per_iteration": 2.7324252128601074 }, { "auxiliary_loss_clip": 0.01165784, "auxiliary_loss_mlp": 0.01021225, "balance_loss_clip": 1.05136979, "balance_loss_mlp": 1.01392674, "epoch": 0.7262670594601094, "flos": 22306308180480.0, "grad_norm": 2.6651130216155545, "language_loss": 0.62993497, "learning_rate": 7.356972635082852e-07, "loss": 0.65180504, "num_input_tokens_seen": 129894420, "step": 6040, "time_per_iteration": 2.790292739868164 }, { "auxiliary_loss_clip": 0.01152966, "auxiliary_loss_mlp": 0.01024711, "balance_loss_clip": 1.04929924, "balance_loss_mlp": 1.01735008, "epoch": 0.7263873023507486, "flos": 25335049950720.0, "grad_norm": 1.7653247055713588, "language_loss": 0.75451815, "learning_rate": 7.35093777197884e-07, "loss": 0.77629495, "num_input_tokens_seen": 129914490, "step": 6041, "time_per_iteration": 2.8879220485687256 }, { "auxiliary_loss_clip": 0.01160685, "auxiliary_loss_mlp": 0.01023871, "balance_loss_clip": 1.04846632, "balance_loss_mlp": 1.01664722, "epoch": 0.7265075452413876, "flos": 23878621192320.0, "grad_norm": 2.9075861698770162, "language_loss": 0.8583194, "learning_rate": 7.344904827727525e-07, "loss": 0.88016498, "num_input_tokens_seen": 129931670, "step": 6042, "time_per_iteration": 2.788015365600586 }, { "auxiliary_loss_clip": 0.01160267, "auxiliary_loss_mlp": 0.01025184, "balance_loss_clip": 1.04752696, "balance_loss_mlp": 1.01760805, "epoch": 0.7266277881320267, "flos": 28724549967360.0, "grad_norm": 2.4764257863756085, "language_loss": 0.73821008, "learning_rate": 7.338873803244076e-07, "loss": 0.7600646, "num_input_tokens_seen": 129946905, "step": 6043, "time_per_iteration": 2.8090271949768066 }, { "auxiliary_loss_clip": 0.01157392, "auxiliary_loss_mlp": 0.01024064, "balance_loss_clip": 1.04685473, "balance_loss_mlp": 1.01688802, "epoch": 0.7267480310226658, "flos": 24863507182080.0, "grad_norm": 1.793303503287256, "language_loss": 0.8088243, "learning_rate": 7.332844699443401e-07, "loss": 0.83063889, "num_input_tokens_seen": 129965505, "step": 6044, "time_per_iteration": 2.8030831813812256 }, { "auxiliary_loss_clip": 0.01150613, "auxiliary_loss_mlp": 0.01029999, "balance_loss_clip": 1.04818988, "balance_loss_mlp": 1.02278948, "epoch": 0.7268682739133049, "flos": 27198490694400.0, "grad_norm": 1.911957902466156, "language_loss": 0.75357497, "learning_rate": 7.326817517240121e-07, "loss": 0.77538109, "num_input_tokens_seen": 129987210, "step": 6045, "time_per_iteration": 2.8877177238464355 }, { "auxiliary_loss_clip": 0.01165465, "auxiliary_loss_mlp": 0.01055088, "balance_loss_clip": 1.04721189, "balance_loss_mlp": 1.02117383, "epoch": 0.7269885168039439, "flos": 33508138688640.0, "grad_norm": 1.9193701536038337, "language_loss": 0.83195287, "learning_rate": 7.320792257548545e-07, "loss": 0.8541584, "num_input_tokens_seen": 130008385, "step": 6046, "time_per_iteration": 2.8123855590820312 }, { "auxiliary_loss_clip": 0.01166141, "auxiliary_loss_mlp": 0.0102304, "balance_loss_clip": 1.04830837, "balance_loss_mlp": 1.01551199, "epoch": 0.7271087596945831, "flos": 24313750548480.0, "grad_norm": 3.5681247645091845, "language_loss": 0.76536769, "learning_rate": 7.314768921282704e-07, "loss": 0.78725946, "num_input_tokens_seen": 130029040, "step": 6047, "time_per_iteration": 2.726571798324585 }, { "auxiliary_loss_clip": 0.01168591, "auxiliary_loss_mlp": 0.0102542, "balance_loss_clip": 1.04789448, "balance_loss_mlp": 1.01771331, "epoch": 0.7272290025852222, "flos": 23805147922560.0, "grad_norm": 3.921486337385666, "language_loss": 0.71773076, "learning_rate": 7.30874750935633e-07, "loss": 0.73967081, "num_input_tokens_seen": 130048725, "step": 6048, "time_per_iteration": 2.761801242828369 }, { "auxiliary_loss_clip": 0.01158861, "auxiliary_loss_mlp": 0.01024989, "balance_loss_clip": 1.04876709, "balance_loss_mlp": 1.01733613, "epoch": 0.7273492454758612, "flos": 16720367408640.0, "grad_norm": 1.7807998478838438, "language_loss": 0.79094088, "learning_rate": 7.30272802268286e-07, "loss": 0.81277943, "num_input_tokens_seen": 130065720, "step": 6049, "time_per_iteration": 2.782447338104248 }, { "auxiliary_loss_clip": 0.01134757, "auxiliary_loss_mlp": 0.01025146, "balance_loss_clip": 1.04887617, "balance_loss_mlp": 1.01816618, "epoch": 0.7274694883665004, "flos": 28031330413440.0, "grad_norm": 1.762845154886662, "language_loss": 0.76119208, "learning_rate": 7.29671046217547e-07, "loss": 0.78279114, "num_input_tokens_seen": 130084830, "step": 6050, "time_per_iteration": 3.0085997581481934 }, { "auxiliary_loss_clip": 0.01161523, "auxiliary_loss_mlp": 0.01026762, "balance_loss_clip": 1.04865599, "balance_loss_mlp": 1.01958549, "epoch": 0.7275897312571394, "flos": 30372706546560.0, "grad_norm": 1.777516763595972, "language_loss": 0.81582326, "learning_rate": 7.290694828746988e-07, "loss": 0.83770615, "num_input_tokens_seen": 130104495, "step": 6051, "time_per_iteration": 3.774783134460449 }, { "auxiliary_loss_clip": 0.01163709, "auxiliary_loss_mlp": 0.01025113, "balance_loss_clip": 1.04714417, "balance_loss_mlp": 1.01761425, "epoch": 0.7277099741477785, "flos": 19204775498880.0, "grad_norm": 1.8760960304841166, "language_loss": 0.85752225, "learning_rate": 7.284681123310004e-07, "loss": 0.87941051, "num_input_tokens_seen": 130123210, "step": 6052, "time_per_iteration": 2.768179416656494 }, { "auxiliary_loss_clip": 0.0116473, "auxiliary_loss_mlp": 0.010256, "balance_loss_clip": 1.04838467, "balance_loss_mlp": 1.01721394, "epoch": 0.7278302170384175, "flos": 20667884186880.0, "grad_norm": 1.6845401569274963, "language_loss": 0.79707754, "learning_rate": 7.27866934677678e-07, "loss": 0.81898081, "num_input_tokens_seen": 130142880, "step": 6053, "time_per_iteration": 2.790057420730591 }, { "auxiliary_loss_clip": 0.01154634, "auxiliary_loss_mlp": 0.01026684, "balance_loss_clip": 1.04844713, "balance_loss_mlp": 1.01914382, "epoch": 0.7279504599290567, "flos": 19093200877440.0, "grad_norm": 1.6705576903361772, "language_loss": 0.78642321, "learning_rate": 7.272659500059297e-07, "loss": 0.80823642, "num_input_tokens_seen": 130160220, "step": 6054, "time_per_iteration": 4.796281576156616 }, { "auxiliary_loss_clip": 0.01161712, "auxiliary_loss_mlp": 0.01028459, "balance_loss_clip": 1.04877543, "balance_loss_mlp": 1.0201081, "epoch": 0.7280707028196958, "flos": 19062174504960.0, "grad_norm": 1.9120322274774364, "language_loss": 0.80563784, "learning_rate": 7.266651584069264e-07, "loss": 0.82753956, "num_input_tokens_seen": 130177885, "step": 6055, "time_per_iteration": 2.7728962898254395 }, { "auxiliary_loss_clip": 0.01167662, "auxiliary_loss_mlp": 0.01024412, "balance_loss_clip": 1.04967308, "balance_loss_mlp": 1.01738429, "epoch": 0.7281909457103348, "flos": 37196308293120.0, "grad_norm": 1.6541473059563416, "language_loss": 0.56686819, "learning_rate": 7.260645599718045e-07, "loss": 0.58878899, "num_input_tokens_seen": 130204240, "step": 6056, "time_per_iteration": 2.9034712314605713 }, { "auxiliary_loss_clip": 0.01165603, "auxiliary_loss_mlp": 0.01032057, "balance_loss_clip": 1.04991317, "balance_loss_mlp": 1.02379608, "epoch": 0.728311188600974, "flos": 20667094087680.0, "grad_norm": 3.978754944628698, "language_loss": 0.67610872, "learning_rate": 7.254641547916767e-07, "loss": 0.69808531, "num_input_tokens_seen": 130221735, "step": 6057, "time_per_iteration": 2.644930601119995 }, { "auxiliary_loss_clip": 0.01169074, "auxiliary_loss_mlp": 0.01022196, "balance_loss_clip": 1.05032635, "balance_loss_mlp": 1.01430154, "epoch": 0.728431431491613, "flos": 28840685616000.0, "grad_norm": 2.006164647390645, "language_loss": 0.69455445, "learning_rate": 7.248639429576226e-07, "loss": 0.71646714, "num_input_tokens_seen": 130241190, "step": 6058, "time_per_iteration": 2.796621799468994 }, { "auxiliary_loss_clip": 0.01166207, "auxiliary_loss_mlp": 0.01021965, "balance_loss_clip": 1.04653394, "balance_loss_mlp": 1.01462436, "epoch": 0.7285516743822521, "flos": 25991856092160.0, "grad_norm": 1.5817644180116226, "language_loss": 0.71865487, "learning_rate": 7.242639245606959e-07, "loss": 0.74053657, "num_input_tokens_seen": 130260980, "step": 6059, "time_per_iteration": 2.8274435997009277 }, { "auxiliary_loss_clip": 0.01169758, "auxiliary_loss_mlp": 0.0102739, "balance_loss_clip": 1.04956698, "balance_loss_mlp": 1.02021098, "epoch": 0.7286719172728913, "flos": 16399721675520.0, "grad_norm": 2.4635977202451085, "language_loss": 0.82568336, "learning_rate": 7.236640996919168e-07, "loss": 0.84765482, "num_input_tokens_seen": 130280025, "step": 6060, "time_per_iteration": 2.756265163421631 }, { "auxiliary_loss_clip": 0.01166166, "auxiliary_loss_mlp": 0.01022106, "balance_loss_clip": 1.04753125, "balance_loss_mlp": 1.01512003, "epoch": 0.7287921601635303, "flos": 22018161277440.0, "grad_norm": 1.9009921920498825, "language_loss": 0.70689154, "learning_rate": 7.230644684422782e-07, "loss": 0.72877419, "num_input_tokens_seen": 130300255, "step": 6061, "time_per_iteration": 3.6513071060180664 }, { "auxiliary_loss_clip": 0.01157945, "auxiliary_loss_mlp": 0.01021994, "balance_loss_clip": 1.0472604, "balance_loss_mlp": 1.01406074, "epoch": 0.7289124030541694, "flos": 24600927784320.0, "grad_norm": 3.08898692903837, "language_loss": 0.81663018, "learning_rate": 7.224650309027451e-07, "loss": 0.83842957, "num_input_tokens_seen": 130320005, "step": 6062, "time_per_iteration": 2.805575370788574 }, { "auxiliary_loss_clip": 0.01169706, "auxiliary_loss_mlp": 0.01023162, "balance_loss_clip": 1.04976988, "balance_loss_mlp": 1.01576471, "epoch": 0.7290326459448085, "flos": 21393638484480.0, "grad_norm": 1.8683730951956465, "language_loss": 0.68851846, "learning_rate": 7.218657871642506e-07, "loss": 0.71044713, "num_input_tokens_seen": 130338810, "step": 6063, "time_per_iteration": 2.8306996822357178 }, { "auxiliary_loss_clip": 0.01171569, "auxiliary_loss_mlp": 0.01026421, "balance_loss_clip": 1.04945076, "balance_loss_mlp": 1.01897669, "epoch": 0.7291528888354476, "flos": 18587686821120.0, "grad_norm": 1.9009430018827271, "language_loss": 0.62511653, "learning_rate": 7.212667373177012e-07, "loss": 0.6470964, "num_input_tokens_seen": 130353805, "step": 6064, "time_per_iteration": 2.6661624908447266 }, { "auxiliary_loss_clip": 0.01157999, "auxiliary_loss_mlp": 0.01025448, "balance_loss_clip": 1.04675126, "balance_loss_mlp": 1.01837873, "epoch": 0.7292731317260867, "flos": 18951066760320.0, "grad_norm": 2.184088855194685, "language_loss": 0.75349331, "learning_rate": 7.206678814539704e-07, "loss": 0.77532774, "num_input_tokens_seen": 130372105, "step": 6065, "time_per_iteration": 2.831149101257324 }, { "auxiliary_loss_clip": 0.01163707, "auxiliary_loss_mlp": 0.01025419, "balance_loss_clip": 1.04750395, "balance_loss_mlp": 1.01891923, "epoch": 0.7293933746167258, "flos": 21067569797760.0, "grad_norm": 1.51328004366092, "language_loss": 0.72728968, "learning_rate": 7.20069219663904e-07, "loss": 0.74918091, "num_input_tokens_seen": 130391990, "step": 6066, "time_per_iteration": 2.7762765884399414 }, { "auxiliary_loss_clip": 0.01167654, "auxiliary_loss_mlp": 0.0102413, "balance_loss_clip": 1.04661536, "balance_loss_mlp": 1.01699853, "epoch": 0.7295136175073649, "flos": 22453326547200.0, "grad_norm": 1.6742651720957242, "language_loss": 0.79768044, "learning_rate": 7.1947075203832e-07, "loss": 0.81959832, "num_input_tokens_seen": 130411970, "step": 6067, "time_per_iteration": 2.7251498699188232 }, { "auxiliary_loss_clip": 0.01063957, "auxiliary_loss_mlp": 0.01000506, "balance_loss_clip": 1.0093869, "balance_loss_mlp": 0.9996298, "epoch": 0.7296338603980039, "flos": 56125506648960.0, "grad_norm": 0.8597029843458559, "language_loss": 0.60105455, "learning_rate": 7.188724786680049e-07, "loss": 0.62169921, "num_input_tokens_seen": 130472440, "step": 6068, "time_per_iteration": 3.2294955253601074 }, { "auxiliary_loss_clip": 0.01160728, "auxiliary_loss_mlp": 0.01020444, "balance_loss_clip": 1.04618704, "balance_loss_mlp": 1.01325321, "epoch": 0.7297541032886431, "flos": 25228287751680.0, "grad_norm": 1.5722186150930675, "language_loss": 0.75818014, "learning_rate": 7.182743996437162e-07, "loss": 0.77999187, "num_input_tokens_seen": 130491975, "step": 6069, "time_per_iteration": 2.820378303527832 }, { "auxiliary_loss_clip": 0.01168983, "auxiliary_loss_mlp": 0.01023212, "balance_loss_clip": 1.04828763, "balance_loss_mlp": 1.01589227, "epoch": 0.7298743461792822, "flos": 26467600752000.0, "grad_norm": 1.866835679791622, "language_loss": 0.68911439, "learning_rate": 7.176765150561819e-07, "loss": 0.71103632, "num_input_tokens_seen": 130510580, "step": 6070, "time_per_iteration": 2.9042632579803467 }, { "auxiliary_loss_clip": 0.011704, "auxiliary_loss_mlp": 0.01030623, "balance_loss_clip": 1.04752934, "balance_loss_mlp": 1.02295804, "epoch": 0.7299945890699212, "flos": 19569053278080.0, "grad_norm": 2.0857855512102477, "language_loss": 0.79665005, "learning_rate": 7.170788249961002e-07, "loss": 0.81866026, "num_input_tokens_seen": 130529090, "step": 6071, "time_per_iteration": 2.703364849090576 }, { "auxiliary_loss_clip": 0.0116506, "auxiliary_loss_mlp": 0.01021064, "balance_loss_clip": 1.04627872, "balance_loss_mlp": 1.01384604, "epoch": 0.7301148319605604, "flos": 22928963466240.0, "grad_norm": 2.365505725078205, "language_loss": 0.88285065, "learning_rate": 7.164813295541418e-07, "loss": 0.90471184, "num_input_tokens_seen": 130548655, "step": 6072, "time_per_iteration": 2.6751842498779297 }, { "auxiliary_loss_clip": 0.01163599, "auxiliary_loss_mlp": 0.01027364, "balance_loss_clip": 1.04803717, "balance_loss_mlp": 1.01988387, "epoch": 0.7302350748511994, "flos": 25369703596800.0, "grad_norm": 1.7609812612711835, "language_loss": 0.70385468, "learning_rate": 7.15884028820944e-07, "loss": 0.72576433, "num_input_tokens_seen": 130567710, "step": 6073, "time_per_iteration": 2.7819526195526123 }, { "auxiliary_loss_clip": 0.01154881, "auxiliary_loss_mlp": 0.01022301, "balance_loss_clip": 1.04573345, "balance_loss_mlp": 1.01479685, "epoch": 0.7303553177418385, "flos": 27819170732160.0, "grad_norm": 2.6512017700848958, "language_loss": 0.60121578, "learning_rate": 7.152869228871185e-07, "loss": 0.62298763, "num_input_tokens_seen": 130590195, "step": 6074, "time_per_iteration": 2.7877492904663086 }, { "auxiliary_loss_clip": 0.01161643, "auxiliary_loss_mlp": 0.01029456, "balance_loss_clip": 1.05054486, "balance_loss_mlp": 1.02151656, "epoch": 0.7304755606324776, "flos": 24426510318720.0, "grad_norm": 1.8424533389567461, "language_loss": 0.72531247, "learning_rate": 7.146900118432457e-07, "loss": 0.7472235, "num_input_tokens_seen": 130609940, "step": 6075, "time_per_iteration": 2.8063220977783203 }, { "auxiliary_loss_clip": 0.01148905, "auxiliary_loss_mlp": 0.01025561, "balance_loss_clip": 1.04591322, "balance_loss_mlp": 1.01841462, "epoch": 0.7305958035231167, "flos": 23840483927040.0, "grad_norm": 1.9133010813283993, "language_loss": 0.85876346, "learning_rate": 7.140932957798753e-07, "loss": 0.88050812, "num_input_tokens_seen": 130628380, "step": 6076, "time_per_iteration": 2.815408229827881 }, { "auxiliary_loss_clip": 0.01169086, "auxiliary_loss_mlp": 0.01024716, "balance_loss_clip": 1.04904079, "balance_loss_mlp": 1.0172776, "epoch": 0.7307160464137558, "flos": 16726939597440.0, "grad_norm": 2.013580449455544, "language_loss": 0.71334922, "learning_rate": 7.134967747875309e-07, "loss": 0.73528731, "num_input_tokens_seen": 130646590, "step": 6077, "time_per_iteration": 3.6919686794281006 }, { "auxiliary_loss_clip": 0.01158985, "auxiliary_loss_mlp": 0.01030377, "balance_loss_clip": 1.04524851, "balance_loss_mlp": 1.02266443, "epoch": 0.7308362893043949, "flos": 21798280172160.0, "grad_norm": 1.8725907594624296, "language_loss": 0.8172667, "learning_rate": 7.129004489567014e-07, "loss": 0.83916032, "num_input_tokens_seen": 130664070, "step": 6078, "time_per_iteration": 2.6737256050109863 }, { "auxiliary_loss_clip": 0.01160579, "auxiliary_loss_mlp": 0.01022921, "balance_loss_clip": 1.04621422, "balance_loss_mlp": 1.01558661, "epoch": 0.730956532195034, "flos": 10707377840640.0, "grad_norm": 2.9640885271397677, "language_loss": 0.77947366, "learning_rate": 7.123043183778512e-07, "loss": 0.80130869, "num_input_tokens_seen": 130681400, "step": 6079, "time_per_iteration": 2.676926374435425 }, { "auxiliary_loss_clip": 0.01164356, "auxiliary_loss_mlp": 0.01025834, "balance_loss_clip": 1.0487504, "balance_loss_mlp": 1.017838, "epoch": 0.731076775085673, "flos": 19791987039360.0, "grad_norm": 2.6854341559538817, "language_loss": 0.65404987, "learning_rate": 7.117083831414114e-07, "loss": 0.67595172, "num_input_tokens_seen": 130700675, "step": 6080, "time_per_iteration": 3.6596362590789795 }, { "auxiliary_loss_clip": 0.01163232, "auxiliary_loss_mlp": 0.01026492, "balance_loss_clip": 1.04567838, "balance_loss_mlp": 1.01910114, "epoch": 0.7311970179763122, "flos": 20447033414400.0, "grad_norm": 2.0006344659606285, "language_loss": 0.69418919, "learning_rate": 7.11112643337787e-07, "loss": 0.71608639, "num_input_tokens_seen": 130719720, "step": 6081, "time_per_iteration": 3.707585096359253 }, { "auxiliary_loss_clip": 0.01161379, "auxiliary_loss_mlp": 0.01022725, "balance_loss_clip": 1.04799581, "balance_loss_mlp": 1.01528633, "epoch": 0.7313172608669513, "flos": 18513818501760.0, "grad_norm": 2.843042147896644, "language_loss": 0.76331675, "learning_rate": 7.10517099057349e-07, "loss": 0.7851578, "num_input_tokens_seen": 130736670, "step": 6082, "time_per_iteration": 2.7242462635040283 }, { "auxiliary_loss_clip": 0.01164374, "auxiliary_loss_mlp": 0.01026574, "balance_loss_clip": 1.04715729, "balance_loss_mlp": 1.01918912, "epoch": 0.7314375037575903, "flos": 16180738410240.0, "grad_norm": 2.3397969945429273, "language_loss": 0.61236095, "learning_rate": 7.099217503904411e-07, "loss": 0.63427049, "num_input_tokens_seen": 130754525, "step": 6083, "time_per_iteration": 2.6780545711517334 }, { "auxiliary_loss_clip": 0.01164349, "auxiliary_loss_mlp": 0.01022779, "balance_loss_clip": 1.04713118, "balance_loss_mlp": 1.01615727, "epoch": 0.7315577466482295, "flos": 17967940536960.0, "grad_norm": 1.9387342440080464, "language_loss": 0.90055811, "learning_rate": 7.093265974273788e-07, "loss": 0.92242938, "num_input_tokens_seen": 130772420, "step": 6084, "time_per_iteration": 2.741997718811035 }, { "auxiliary_loss_clip": 0.01164905, "auxiliary_loss_mlp": 0.01024927, "balance_loss_clip": 1.04652858, "balance_loss_mlp": 1.01792943, "epoch": 0.7316779895388685, "flos": 18405440190720.0, "grad_norm": 1.911738029720038, "language_loss": 0.71802229, "learning_rate": 7.087316402584447e-07, "loss": 0.73992062, "num_input_tokens_seen": 130791245, "step": 6085, "time_per_iteration": 2.7066938877105713 }, { "auxiliary_loss_clip": 0.01166362, "auxiliary_loss_mlp": 0.01030524, "balance_loss_clip": 1.04653001, "balance_loss_mlp": 1.02326095, "epoch": 0.7317982324295076, "flos": 17928294900480.0, "grad_norm": 1.8134720631421415, "language_loss": 0.86609626, "learning_rate": 7.081368789738953e-07, "loss": 0.8880651, "num_input_tokens_seen": 130808445, "step": 6086, "time_per_iteration": 2.641075849533081 }, { "auxiliary_loss_clip": 0.01159845, "auxiliary_loss_mlp": 0.01023504, "balance_loss_clip": 1.04779744, "balance_loss_mlp": 1.0165062, "epoch": 0.7319184753201466, "flos": 27229840289280.0, "grad_norm": 1.8887640622880577, "language_loss": 0.77397722, "learning_rate": 7.075423136639537e-07, "loss": 0.7958107, "num_input_tokens_seen": 130827700, "step": 6087, "time_per_iteration": 3.683335781097412 }, { "auxiliary_loss_clip": 0.01159102, "auxiliary_loss_mlp": 0.01025676, "balance_loss_clip": 1.05001795, "balance_loss_mlp": 1.01843977, "epoch": 0.7320387182107858, "flos": 37448544574080.0, "grad_norm": 2.0081473932578344, "language_loss": 0.74551272, "learning_rate": 7.069479444188149e-07, "loss": 0.76736045, "num_input_tokens_seen": 130848290, "step": 6088, "time_per_iteration": 2.884915351867676 }, { "auxiliary_loss_clip": 0.01156077, "auxiliary_loss_mlp": 0.01024471, "balance_loss_clip": 1.04892397, "balance_loss_mlp": 1.0172441, "epoch": 0.7321589611014249, "flos": 17859023521920.0, "grad_norm": 1.853218690783789, "language_loss": 0.82309359, "learning_rate": 7.063537713286453e-07, "loss": 0.84489912, "num_input_tokens_seen": 130865970, "step": 6089, "time_per_iteration": 2.726310968399048 }, { "auxiliary_loss_clip": 0.01167183, "auxiliary_loss_mlp": 0.01022531, "balance_loss_clip": 1.04824412, "balance_loss_mlp": 1.01507688, "epoch": 0.7322792039920639, "flos": 26100593539200.0, "grad_norm": 2.080659791151928, "language_loss": 0.8087852, "learning_rate": 7.057597944835803e-07, "loss": 0.83068234, "num_input_tokens_seen": 130885245, "step": 6090, "time_per_iteration": 2.765061616897583 }, { "auxiliary_loss_clip": 0.01163437, "auxiliary_loss_mlp": 0.01028399, "balance_loss_clip": 1.04623365, "balance_loss_mlp": 1.02093649, "epoch": 0.7323994468827031, "flos": 25369093065600.0, "grad_norm": 1.6004698152851726, "language_loss": 0.74713409, "learning_rate": 7.051660139737253e-07, "loss": 0.76905245, "num_input_tokens_seen": 130903465, "step": 6091, "time_per_iteration": 2.747957706451416 }, { "auxiliary_loss_clip": 0.01166558, "auxiliary_loss_mlp": 0.01055857, "balance_loss_clip": 1.05132556, "balance_loss_mlp": 1.02118325, "epoch": 0.7325196897733421, "flos": 26907075653760.0, "grad_norm": 3.37880698684609, "language_loss": 0.76173329, "learning_rate": 7.045724298891565e-07, "loss": 0.78395742, "num_input_tokens_seen": 130922935, "step": 6092, "time_per_iteration": 2.78619122505188 }, { "auxiliary_loss_clip": 0.01165094, "auxiliary_loss_mlp": 0.01024081, "balance_loss_clip": 1.0491612, "balance_loss_mlp": 1.01669884, "epoch": 0.7326399326639812, "flos": 25775781828480.0, "grad_norm": 2.1612290429506724, "language_loss": 0.69252646, "learning_rate": 7.039790423199192e-07, "loss": 0.71441817, "num_input_tokens_seen": 130942575, "step": 6093, "time_per_iteration": 2.822880744934082 }, { "auxiliary_loss_clip": 0.01164581, "auxiliary_loss_mlp": 0.01025282, "balance_loss_clip": 1.04798603, "balance_loss_mlp": 1.01758671, "epoch": 0.7327601755546204, "flos": 21032269706880.0, "grad_norm": 2.0577461965641395, "language_loss": 0.78126729, "learning_rate": 7.033858513560322e-07, "loss": 0.80316591, "num_input_tokens_seen": 130958870, "step": 6094, "time_per_iteration": 2.809150457382202 }, { "auxiliary_loss_clip": 0.01165981, "auxiliary_loss_mlp": 0.01026542, "balance_loss_clip": 1.04989648, "balance_loss_mlp": 1.01970184, "epoch": 0.7328804184452594, "flos": 16289224462080.0, "grad_norm": 2.6487227666238446, "language_loss": 0.76324403, "learning_rate": 7.027928570874794e-07, "loss": 0.78516924, "num_input_tokens_seen": 130977060, "step": 6095, "time_per_iteration": 2.735790729522705 }, { "auxiliary_loss_clip": 0.01166871, "auxiliary_loss_mlp": 0.01026279, "balance_loss_clip": 1.04746342, "balance_loss_mlp": 1.01875079, "epoch": 0.7330006613358985, "flos": 17858233422720.0, "grad_norm": 1.8079950499328157, "language_loss": 0.85722828, "learning_rate": 7.022000596042194e-07, "loss": 0.87915981, "num_input_tokens_seen": 130994160, "step": 6096, "time_per_iteration": 2.6913931369781494 }, { "auxiliary_loss_clip": 0.01159515, "auxiliary_loss_mlp": 0.01025479, "balance_loss_clip": 1.04443324, "balance_loss_mlp": 1.01813269, "epoch": 0.7331209042265376, "flos": 22492074343680.0, "grad_norm": 3.380385620574391, "language_loss": 0.8210026, "learning_rate": 7.016074589961784e-07, "loss": 0.84285253, "num_input_tokens_seen": 131012725, "step": 6097, "time_per_iteration": 2.7378718852996826 }, { "auxiliary_loss_clip": 0.01156412, "auxiliary_loss_mlp": 0.01028791, "balance_loss_clip": 1.04728615, "balance_loss_mlp": 1.02174294, "epoch": 0.7332411471171767, "flos": 33072757937280.0, "grad_norm": 1.8115646919833388, "language_loss": 0.67152011, "learning_rate": 7.01015055353253e-07, "loss": 0.69337213, "num_input_tokens_seen": 131035150, "step": 6098, "time_per_iteration": 2.84844708442688 }, { "auxiliary_loss_clip": 0.01147159, "auxiliary_loss_mlp": 0.01024552, "balance_loss_clip": 1.04858971, "balance_loss_mlp": 1.01711905, "epoch": 0.7333613900078157, "flos": 22743017735040.0, "grad_norm": 2.153069584632185, "language_loss": 0.77849507, "learning_rate": 7.004228487653123e-07, "loss": 0.8002122, "num_input_tokens_seen": 131055955, "step": 6099, "time_per_iteration": 2.83467960357666 }, { "auxiliary_loss_clip": 0.0115952, "auxiliary_loss_mlp": 0.01031889, "balance_loss_clip": 1.04747057, "balance_loss_mlp": 1.02453339, "epoch": 0.7334816328984549, "flos": 22346133384960.0, "grad_norm": 1.8915420286204498, "language_loss": 0.78677493, "learning_rate": 6.998308393221906e-07, "loss": 0.808689, "num_input_tokens_seen": 131074360, "step": 6100, "time_per_iteration": 2.7501227855682373 }, { "auxiliary_loss_clip": 0.01160844, "auxiliary_loss_mlp": 0.01024902, "balance_loss_clip": 1.04939044, "balance_loss_mlp": 1.01839328, "epoch": 0.733601875789094, "flos": 20736149984640.0, "grad_norm": 2.311802644886223, "language_loss": 0.71348351, "learning_rate": 6.992390271136977e-07, "loss": 0.73534095, "num_input_tokens_seen": 131090070, "step": 6101, "time_per_iteration": 2.7940409183502197 }, { "auxiliary_loss_clip": 0.01161229, "auxiliary_loss_mlp": 0.01026903, "balance_loss_clip": 1.04879081, "balance_loss_mlp": 1.01955366, "epoch": 0.733722118679733, "flos": 22564362464640.0, "grad_norm": 1.885171196534972, "language_loss": 0.85504156, "learning_rate": 6.986474122296094e-07, "loss": 0.87692291, "num_input_tokens_seen": 131109185, "step": 6102, "time_per_iteration": 2.752206563949585 }, { "auxiliary_loss_clip": 0.01171891, "auxiliary_loss_mlp": 0.01022295, "balance_loss_clip": 1.04964125, "balance_loss_mlp": 1.01481497, "epoch": 0.7338423615703722, "flos": 20084192179200.0, "grad_norm": 2.7289708956032332, "language_loss": 0.72299272, "learning_rate": 6.980559947596751e-07, "loss": 0.74493456, "num_input_tokens_seen": 131127725, "step": 6103, "time_per_iteration": 3.5722928047180176 }, { "auxiliary_loss_clip": 0.01156735, "auxiliary_loss_mlp": 0.01029517, "balance_loss_clip": 1.04909325, "balance_loss_mlp": 1.02231383, "epoch": 0.7339626044610112, "flos": 21687675217920.0, "grad_norm": 1.9364703809798864, "language_loss": 0.76024574, "learning_rate": 6.974647747936109e-07, "loss": 0.78210831, "num_input_tokens_seen": 131146110, "step": 6104, "time_per_iteration": 2.8081414699554443 }, { "auxiliary_loss_clip": 0.01169301, "auxiliary_loss_mlp": 0.01055549, "balance_loss_clip": 1.04890108, "balance_loss_mlp": 1.01799726, "epoch": 0.7340828473516503, "flos": 15268248282240.0, "grad_norm": 1.8653902422107136, "language_loss": 0.82223547, "learning_rate": 6.968737524211039e-07, "loss": 0.84448397, "num_input_tokens_seen": 131162920, "step": 6105, "time_per_iteration": 2.7391293048858643 }, { "auxiliary_loss_clip": 0.01163672, "auxiliary_loss_mlp": 0.01029051, "balance_loss_clip": 1.04949558, "balance_loss_mlp": 1.0214932, "epoch": 0.7342030902422895, "flos": 22930112701440.0, "grad_norm": 2.6447913134994923, "language_loss": 0.8013109, "learning_rate": 6.962829277318132e-07, "loss": 0.82323813, "num_input_tokens_seen": 131182515, "step": 6106, "time_per_iteration": 3.674095392227173 }, { "auxiliary_loss_clip": 0.01167736, "auxiliary_loss_mlp": 0.01026433, "balance_loss_clip": 1.04982281, "balance_loss_mlp": 1.01951897, "epoch": 0.7343233331329285, "flos": 25847890381440.0, "grad_norm": 1.81675442079574, "language_loss": 0.83760399, "learning_rate": 6.956923008153652e-07, "loss": 0.85954565, "num_input_tokens_seen": 131202280, "step": 6107, "time_per_iteration": 3.678468704223633 }, { "auxiliary_loss_clip": 0.01164869, "auxiliary_loss_mlp": 0.01021486, "balance_loss_clip": 1.04582143, "balance_loss_mlp": 1.01442301, "epoch": 0.7344435760235676, "flos": 18478985287680.0, "grad_norm": 5.428482240769719, "language_loss": 0.84079897, "learning_rate": 6.951018717613593e-07, "loss": 0.86266255, "num_input_tokens_seen": 131221295, "step": 6108, "time_per_iteration": 2.703254461288452 }, { "auxiliary_loss_clip": 0.01163799, "auxiliary_loss_mlp": 0.01028294, "balance_loss_clip": 1.0471822, "balance_loss_mlp": 1.02058113, "epoch": 0.7345638189142067, "flos": 17640040256640.0, "grad_norm": 2.211098807237807, "language_loss": 0.78390181, "learning_rate": 6.945116406593614e-07, "loss": 0.80582279, "num_input_tokens_seen": 131240150, "step": 6109, "time_per_iteration": 2.754948616027832 }, { "auxiliary_loss_clip": 0.01158179, "auxiliary_loss_mlp": 0.01027571, "balance_loss_clip": 1.04814434, "balance_loss_mlp": 1.02034998, "epoch": 0.7346840618048458, "flos": 20260225756800.0, "grad_norm": 2.2148667259689394, "language_loss": 0.74084818, "learning_rate": 6.939216075989089e-07, "loss": 0.76270568, "num_input_tokens_seen": 131258080, "step": 6110, "time_per_iteration": 2.908177375793457 }, { "auxiliary_loss_clip": 0.01160602, "auxiliary_loss_mlp": 0.01026384, "balance_loss_clip": 1.04705811, "balance_loss_mlp": 1.01853454, "epoch": 0.7348043046954849, "flos": 29023183641600.0, "grad_norm": 1.7560084916688545, "language_loss": 0.66061693, "learning_rate": 6.933317726695109e-07, "loss": 0.68248677, "num_input_tokens_seen": 131279310, "step": 6111, "time_per_iteration": 3.0510342121124268 }, { "auxiliary_loss_clip": 0.01154132, "auxiliary_loss_mlp": 0.01025293, "balance_loss_clip": 1.04833412, "balance_loss_mlp": 1.0181644, "epoch": 0.734924547586124, "flos": 17931203902080.0, "grad_norm": 3.40581392904593, "language_loss": 0.80116057, "learning_rate": 6.92742135960644e-07, "loss": 0.82295477, "num_input_tokens_seen": 131297010, "step": 6112, "time_per_iteration": 2.7463173866271973 }, { "auxiliary_loss_clip": 0.01064154, "auxiliary_loss_mlp": 0.01000916, "balance_loss_clip": 1.00944805, "balance_loss_mlp": 0.99986678, "epoch": 0.7350447904767631, "flos": 63588319850880.0, "grad_norm": 0.8087541078978504, "language_loss": 0.5558297, "learning_rate": 6.921526975617556e-07, "loss": 0.57648039, "num_input_tokens_seen": 131356470, "step": 6113, "time_per_iteration": 4.166110992431641 }, { "auxiliary_loss_clip": 0.01166869, "auxiliary_loss_mlp": 0.01024239, "balance_loss_clip": 1.04815781, "balance_loss_mlp": 1.0164876, "epoch": 0.7351650333674021, "flos": 21580015178880.0, "grad_norm": 1.8033476603567364, "language_loss": 0.75773251, "learning_rate": 6.915634575622631e-07, "loss": 0.77964365, "num_input_tokens_seen": 131374985, "step": 6114, "time_per_iteration": 2.7541260719299316 }, { "auxiliary_loss_clip": 0.01165359, "auxiliary_loss_mlp": 0.01026998, "balance_loss_clip": 1.04502332, "balance_loss_mlp": 1.01953578, "epoch": 0.7352852762580413, "flos": 18186349184640.0, "grad_norm": 2.09244830142376, "language_loss": 0.70811474, "learning_rate": 6.909744160515532e-07, "loss": 0.73003834, "num_input_tokens_seen": 131393125, "step": 6115, "time_per_iteration": 2.7583935260772705 }, { "auxiliary_loss_clip": 0.01161094, "auxiliary_loss_mlp": 0.0102956, "balance_loss_clip": 1.04874027, "balance_loss_mlp": 1.0222708, "epoch": 0.7354055191486804, "flos": 38910073063680.0, "grad_norm": 5.559512364666078, "language_loss": 0.69203931, "learning_rate": 6.903855731189849e-07, "loss": 0.71394587, "num_input_tokens_seen": 131415760, "step": 6116, "time_per_iteration": 2.9060661792755127 }, { "auxiliary_loss_clip": 0.01167356, "auxiliary_loss_mlp": 0.01025977, "balance_loss_clip": 1.04727113, "balance_loss_mlp": 1.01856804, "epoch": 0.7355257620393194, "flos": 16289978647680.0, "grad_norm": 2.5567425124775047, "language_loss": 0.820306, "learning_rate": 6.897969288538825e-07, "loss": 0.84223938, "num_input_tokens_seen": 131433705, "step": 6117, "time_per_iteration": 2.7613799571990967 }, { "auxiliary_loss_clip": 0.01157359, "auxiliary_loss_mlp": 0.01022939, "balance_loss_clip": 1.04523289, "balance_loss_mlp": 1.01567936, "epoch": 0.7356460049299585, "flos": 18114240631680.0, "grad_norm": 1.8351213810778149, "language_loss": 0.81103837, "learning_rate": 6.892084833455452e-07, "loss": 0.8328414, "num_input_tokens_seen": 131453275, "step": 6118, "time_per_iteration": 2.8152217864990234 }, { "auxiliary_loss_clip": 0.01162053, "auxiliary_loss_mlp": 0.01021639, "balance_loss_clip": 1.04808688, "balance_loss_mlp": 1.0146172, "epoch": 0.7357662478205976, "flos": 21325193118720.0, "grad_norm": 1.4111189661321428, "language_loss": 0.8400532, "learning_rate": 6.886202366832384e-07, "loss": 0.86189008, "num_input_tokens_seen": 131474960, "step": 6119, "time_per_iteration": 2.7862796783447266 }, { "auxiliary_loss_clip": 0.01152425, "auxiliary_loss_mlp": 0.01024344, "balance_loss_clip": 1.04806781, "balance_loss_mlp": 1.01676869, "epoch": 0.7358864907112367, "flos": 14246841139200.0, "grad_norm": 2.3815588403331374, "language_loss": 0.73595893, "learning_rate": 6.880321889561987e-07, "loss": 0.75772673, "num_input_tokens_seen": 131492935, "step": 6120, "time_per_iteration": 2.804508924484253 }, { "auxiliary_loss_clip": 0.0115209, "auxiliary_loss_mlp": 0.01031298, "balance_loss_clip": 1.04567027, "balance_loss_mlp": 1.02289355, "epoch": 0.7360067336018757, "flos": 22309684058880.0, "grad_norm": 1.9772351533397314, "language_loss": 0.65291262, "learning_rate": 6.874443402536338e-07, "loss": 0.67474639, "num_input_tokens_seen": 131512025, "step": 6121, "time_per_iteration": 2.8198280334472656 }, { "auxiliary_loss_clip": 0.01162668, "auxiliary_loss_mlp": 0.01026919, "balance_loss_clip": 1.04654229, "balance_loss_mlp": 1.01943851, "epoch": 0.7361269764925149, "flos": 25554607833600.0, "grad_norm": 1.6493925907065803, "language_loss": 0.80482113, "learning_rate": 6.868566906647177e-07, "loss": 0.82671696, "num_input_tokens_seen": 131532975, "step": 6122, "time_per_iteration": 2.7808749675750732 }, { "auxiliary_loss_clip": 0.01168499, "auxiliary_loss_mlp": 0.01023964, "balance_loss_clip": 1.05022383, "balance_loss_mlp": 1.01659417, "epoch": 0.736247219383154, "flos": 20376505059840.0, "grad_norm": 1.6839071115603974, "language_loss": 0.83512676, "learning_rate": 6.862692402785984e-07, "loss": 0.85705137, "num_input_tokens_seen": 131553225, "step": 6123, "time_per_iteration": 2.8393607139587402 }, { "auxiliary_loss_clip": 0.01061974, "auxiliary_loss_mlp": 0.0099813, "balance_loss_clip": 1.01593041, "balance_loss_mlp": 0.99713415, "epoch": 0.736367462273793, "flos": 70339525735680.0, "grad_norm": 0.6806651350351972, "language_loss": 0.49590635, "learning_rate": 6.856819891843899e-07, "loss": 0.51650739, "num_input_tokens_seen": 131617930, "step": 6124, "time_per_iteration": 3.4852898120880127 }, { "auxiliary_loss_clip": 0.01153068, "auxiliary_loss_mlp": 0.01030853, "balance_loss_clip": 1.04742146, "balance_loss_mlp": 1.02359939, "epoch": 0.7364877051644322, "flos": 22412711243520.0, "grad_norm": 2.236914839894126, "language_loss": 0.7186619, "learning_rate": 6.8509493747118e-07, "loss": 0.74050117, "num_input_tokens_seen": 131636740, "step": 6125, "time_per_iteration": 2.9240684509277344 }, { "auxiliary_loss_clip": 0.01168578, "auxiliary_loss_mlp": 0.01029696, "balance_loss_clip": 1.04921579, "balance_loss_mlp": 1.02207303, "epoch": 0.7366079480550712, "flos": 12130266274560.0, "grad_norm": 2.1814022827083863, "language_loss": 0.88479656, "learning_rate": 6.845080852280221e-07, "loss": 0.90677929, "num_input_tokens_seen": 131653810, "step": 6126, "time_per_iteration": 2.7061092853546143 }, { "auxiliary_loss_clip": 0.01158035, "auxiliary_loss_mlp": 0.01025708, "balance_loss_clip": 1.04581201, "balance_loss_mlp": 1.01888299, "epoch": 0.7367281909457103, "flos": 15049336844160.0, "grad_norm": 1.7170956753480233, "language_loss": 0.74528033, "learning_rate": 6.839214325439409e-07, "loss": 0.76711774, "num_input_tokens_seen": 131671505, "step": 6127, "time_per_iteration": 2.815279483795166 }, { "auxiliary_loss_clip": 0.01156189, "auxiliary_loss_mlp": 0.01025015, "balance_loss_clip": 1.04622674, "balance_loss_mlp": 1.01844645, "epoch": 0.7368484338363495, "flos": 23510752053120.0, "grad_norm": 1.7410323169925896, "language_loss": 0.71932167, "learning_rate": 6.833349795079327e-07, "loss": 0.74113369, "num_input_tokens_seen": 131690615, "step": 6128, "time_per_iteration": 2.881916046142578 }, { "auxiliary_loss_clip": 0.01155957, "auxiliary_loss_mlp": 0.01027663, "balance_loss_clip": 1.04762912, "balance_loss_mlp": 1.02057624, "epoch": 0.7369686767269885, "flos": 27417833095680.0, "grad_norm": 1.5773941378397827, "language_loss": 0.68785226, "learning_rate": 6.827487262089613e-07, "loss": 0.70968843, "num_input_tokens_seen": 131711120, "step": 6129, "time_per_iteration": 3.807229995727539 }, { "auxiliary_loss_clip": 0.010611, "auxiliary_loss_mlp": 0.00998889, "balance_loss_clip": 1.01129007, "balance_loss_mlp": 0.99796563, "epoch": 0.7370889196176276, "flos": 70293343824000.0, "grad_norm": 0.9712814749696288, "language_loss": 0.56781751, "learning_rate": 6.821626727359606e-07, "loss": 0.58841741, "num_input_tokens_seen": 131776680, "step": 6130, "time_per_iteration": 3.4442570209503174 }, { "auxiliary_loss_clip": 0.01159136, "auxiliary_loss_mlp": 0.01029882, "balance_loss_clip": 1.04885149, "balance_loss_mlp": 1.02248502, "epoch": 0.7372091625082667, "flos": 18040839189120.0, "grad_norm": 2.188752938006127, "language_loss": 0.77100408, "learning_rate": 6.815768191778348e-07, "loss": 0.7928943, "num_input_tokens_seen": 131794760, "step": 6131, "time_per_iteration": 2.778101921081543 }, { "auxiliary_loss_clip": 0.01164263, "auxiliary_loss_mlp": 0.01030484, "balance_loss_clip": 1.04992723, "balance_loss_mlp": 1.02236629, "epoch": 0.7373294053989058, "flos": 33726331854720.0, "grad_norm": 2.1836300692100092, "language_loss": 0.73164916, "learning_rate": 6.809911656234569e-07, "loss": 0.75359666, "num_input_tokens_seen": 131816735, "step": 6132, "time_per_iteration": 3.825424909591675 }, { "auxiliary_loss_clip": 0.01159928, "auxiliary_loss_mlp": 0.01025531, "balance_loss_clip": 1.04596674, "balance_loss_mlp": 1.01809192, "epoch": 0.7374496482895448, "flos": 21506326427520.0, "grad_norm": 2.370055724741622, "language_loss": 0.78061986, "learning_rate": 6.804057121616707e-07, "loss": 0.80247444, "num_input_tokens_seen": 131834940, "step": 6133, "time_per_iteration": 3.7908945083618164 }, { "auxiliary_loss_clip": 0.01164833, "auxiliary_loss_mlp": 0.01021861, "balance_loss_clip": 1.04706514, "balance_loss_mlp": 1.01441908, "epoch": 0.737569891180184, "flos": 24936908624640.0, "grad_norm": 2.5808884943307375, "language_loss": 0.72122997, "learning_rate": 6.798204588812888e-07, "loss": 0.74309695, "num_input_tokens_seen": 131854355, "step": 6134, "time_per_iteration": 2.80356764793396 }, { "auxiliary_loss_clip": 0.01144379, "auxiliary_loss_mlp": 0.01054641, "balance_loss_clip": 1.04634893, "balance_loss_mlp": 1.01880836, "epoch": 0.7376901340708231, "flos": 20664544222080.0, "grad_norm": 1.7088047550323577, "language_loss": 0.75486135, "learning_rate": 6.792354058710937e-07, "loss": 0.77685159, "num_input_tokens_seen": 131871825, "step": 6135, "time_per_iteration": 2.805035352706909 }, { "auxiliary_loss_clip": 0.01162072, "auxiliary_loss_mlp": 0.01022758, "balance_loss_clip": 1.04578948, "balance_loss_mlp": 1.01631486, "epoch": 0.7378103769614621, "flos": 23805794367360.0, "grad_norm": 2.0732602534715467, "language_loss": 0.65279794, "learning_rate": 6.786505532198374e-07, "loss": 0.67464626, "num_input_tokens_seen": 131890770, "step": 6136, "time_per_iteration": 2.789294719696045 }, { "auxiliary_loss_clip": 0.01167786, "auxiliary_loss_mlp": 0.01022996, "balance_loss_clip": 1.04716575, "balance_loss_mlp": 1.01564074, "epoch": 0.7379306198521013, "flos": 22237216369920.0, "grad_norm": 1.7694650559905725, "language_loss": 0.85315424, "learning_rate": 6.780659010162411e-07, "loss": 0.87506205, "num_input_tokens_seen": 131909720, "step": 6137, "time_per_iteration": 2.77960467338562 }, { "auxiliary_loss_clip": 0.01162366, "auxiliary_loss_mlp": 0.01025871, "balance_loss_clip": 1.04726577, "balance_loss_mlp": 1.01925802, "epoch": 0.7380508627427403, "flos": 14903108576640.0, "grad_norm": 1.6657585132248158, "language_loss": 0.8326596, "learning_rate": 6.774814493489975e-07, "loss": 0.8545419, "num_input_tokens_seen": 131927395, "step": 6138, "time_per_iteration": 2.8050999641418457 }, { "auxiliary_loss_clip": 0.01160864, "auxiliary_loss_mlp": 0.01021917, "balance_loss_clip": 1.04726577, "balance_loss_mlp": 1.01441824, "epoch": 0.7381711056333794, "flos": 21685843624320.0, "grad_norm": 1.7361542525245377, "language_loss": 0.66339636, "learning_rate": 6.768971983067655e-07, "loss": 0.68522418, "num_input_tokens_seen": 131947725, "step": 6139, "time_per_iteration": 3.6943628787994385 }, { "auxiliary_loss_clip": 0.01063386, "auxiliary_loss_mlp": 0.01001069, "balance_loss_clip": 1.00923705, "balance_loss_mlp": 1.0001328, "epoch": 0.7382913485240186, "flos": 52404263596800.0, "grad_norm": 1.0071171106373018, "language_loss": 0.67719114, "learning_rate": 6.763131479781772e-07, "loss": 0.69783568, "num_input_tokens_seen": 131997485, "step": 6140, "time_per_iteration": 3.0614094734191895 }, { "auxiliary_loss_clip": 0.01156275, "auxiliary_loss_mlp": 0.01025968, "balance_loss_clip": 1.049088, "balance_loss_mlp": 1.01882994, "epoch": 0.7384115914146576, "flos": 21798818876160.0, "grad_norm": 1.7712821608881764, "language_loss": 0.76049304, "learning_rate": 6.757292984518316e-07, "loss": 0.78231549, "num_input_tokens_seen": 132016885, "step": 6141, "time_per_iteration": 2.8546204566955566 }, { "auxiliary_loss_clip": 0.01065136, "auxiliary_loss_mlp": 0.01000093, "balance_loss_clip": 1.01079988, "balance_loss_mlp": 0.9992525, "epoch": 0.7385318343052967, "flos": 61494331662720.0, "grad_norm": 1.413735097217299, "language_loss": 0.56356859, "learning_rate": 6.751456498162981e-07, "loss": 0.58422089, "num_input_tokens_seen": 132075920, "step": 6142, "time_per_iteration": 3.2564165592193604 }, { "auxiliary_loss_clip": 0.01163948, "auxiliary_loss_mlp": 0.01025751, "balance_loss_clip": 1.04688907, "balance_loss_mlp": 1.01862848, "epoch": 0.7386520771959358, "flos": 17013757697280.0, "grad_norm": 3.5287622012510322, "language_loss": 0.85698003, "learning_rate": 6.745622021601174e-07, "loss": 0.87887698, "num_input_tokens_seen": 132092945, "step": 6143, "time_per_iteration": 2.758391857147217 }, { "auxiliary_loss_clip": 0.0116142, "auxiliary_loss_mlp": 0.01022528, "balance_loss_clip": 1.04736447, "balance_loss_mlp": 1.01561403, "epoch": 0.7387723200865749, "flos": 18770759464320.0, "grad_norm": 1.7570155236086393, "language_loss": 0.6957826, "learning_rate": 6.739789555717954e-07, "loss": 0.71762204, "num_input_tokens_seen": 132109920, "step": 6144, "time_per_iteration": 2.8257484436035156 }, { "auxiliary_loss_clip": 0.01164273, "auxiliary_loss_mlp": 0.01023577, "balance_loss_clip": 1.04646683, "balance_loss_mlp": 1.01623654, "epoch": 0.738892562977214, "flos": 22525542840960.0, "grad_norm": 2.058420762427282, "language_loss": 0.77744734, "learning_rate": 6.733959101398124e-07, "loss": 0.79932582, "num_input_tokens_seen": 132128050, "step": 6145, "time_per_iteration": 2.7160274982452393 }, { "auxiliary_loss_clip": 0.01159919, "auxiliary_loss_mlp": 0.01020421, "balance_loss_clip": 1.0480727, "balance_loss_mlp": 1.01323533, "epoch": 0.7390128058678531, "flos": 21501478091520.0, "grad_norm": 1.6386763137199687, "language_loss": 0.81674957, "learning_rate": 6.728130659526143e-07, "loss": 0.83855295, "num_input_tokens_seen": 132145860, "step": 6146, "time_per_iteration": 2.751481294631958 }, { "auxiliary_loss_clip": 0.01162421, "auxiliary_loss_mlp": 0.01027826, "balance_loss_clip": 1.04744387, "balance_loss_mlp": 1.02007747, "epoch": 0.7391330487584922, "flos": 25776176878080.0, "grad_norm": 2.1379657229427522, "language_loss": 0.71162128, "learning_rate": 6.7223042309862e-07, "loss": 0.73352373, "num_input_tokens_seen": 132166060, "step": 6147, "time_per_iteration": 2.772169351577759 }, { "auxiliary_loss_clip": 0.01163357, "auxiliary_loss_mlp": 0.01024541, "balance_loss_clip": 1.04846406, "balance_loss_mlp": 1.01765323, "epoch": 0.7392532916491312, "flos": 28366736636160.0, "grad_norm": 1.7746892309815827, "language_loss": 0.73595566, "learning_rate": 6.716479816662144e-07, "loss": 0.75783467, "num_input_tokens_seen": 132187790, "step": 6148, "time_per_iteration": 2.789741277694702 }, { "auxiliary_loss_clip": 0.01166349, "auxiliary_loss_mlp": 0.01022329, "balance_loss_clip": 1.04733562, "balance_loss_mlp": 1.01523006, "epoch": 0.7393735345397703, "flos": 23585877348480.0, "grad_norm": 1.9442260381826415, "language_loss": 0.7316975, "learning_rate": 6.710657417437531e-07, "loss": 0.75358427, "num_input_tokens_seen": 132207495, "step": 6149, "time_per_iteration": 2.7748217582702637 }, { "auxiliary_loss_clip": 0.01159508, "auxiliary_loss_mlp": 0.01029056, "balance_loss_clip": 1.04650187, "balance_loss_mlp": 1.02160573, "epoch": 0.7394937774304094, "flos": 19974772373760.0, "grad_norm": 2.1629783536870177, "language_loss": 0.79967654, "learning_rate": 6.704837034195628e-07, "loss": 0.82156217, "num_input_tokens_seen": 132225960, "step": 6150, "time_per_iteration": 2.738201856613159 }, { "auxiliary_loss_clip": 0.01160997, "auxiliary_loss_mlp": 0.01030995, "balance_loss_clip": 1.04996204, "balance_loss_mlp": 1.02335393, "epoch": 0.7396140203210485, "flos": 23478037741440.0, "grad_norm": 2.7502584435242805, "language_loss": 0.8509692, "learning_rate": 6.699018667819376e-07, "loss": 0.8728891, "num_input_tokens_seen": 132245360, "step": 6151, "time_per_iteration": 2.768930673599243 }, { "auxiliary_loss_clip": 0.01164398, "auxiliary_loss_mlp": 0.01023706, "balance_loss_clip": 1.04867005, "balance_loss_mlp": 1.01560605, "epoch": 0.7397342632116876, "flos": 25555433846400.0, "grad_norm": 1.5316907471596328, "language_loss": 0.72771811, "learning_rate": 6.693202319191415e-07, "loss": 0.74959916, "num_input_tokens_seen": 132267095, "step": 6152, "time_per_iteration": 2.7888290882110596 }, { "auxiliary_loss_clip": 0.0116676, "auxiliary_loss_mlp": 0.0102834, "balance_loss_clip": 1.04939759, "balance_loss_mlp": 1.02025795, "epoch": 0.7398545061023267, "flos": 24755021130240.0, "grad_norm": 1.7118598646946208, "language_loss": 0.74755692, "learning_rate": 6.687387989194084e-07, "loss": 0.769508, "num_input_tokens_seen": 132286610, "step": 6153, "time_per_iteration": 2.7473249435424805 }, { "auxiliary_loss_clip": 0.01153254, "auxiliary_loss_mlp": 0.01022498, "balance_loss_clip": 1.04601979, "balance_loss_mlp": 1.01554561, "epoch": 0.7399747489929658, "flos": 16508602776960.0, "grad_norm": 2.2811323592016657, "language_loss": 0.7946986, "learning_rate": 6.681575678709404e-07, "loss": 0.81645614, "num_input_tokens_seen": 132305300, "step": 6154, "time_per_iteration": 2.705626964569092 }, { "auxiliary_loss_clip": 0.01162091, "auxiliary_loss_mlp": 0.01023892, "balance_loss_clip": 1.04658806, "balance_loss_mlp": 1.01668835, "epoch": 0.7400949918836048, "flos": 24097065753600.0, "grad_norm": 1.859101542010576, "language_loss": 0.71211392, "learning_rate": 6.67576538861911e-07, "loss": 0.73397374, "num_input_tokens_seen": 132323875, "step": 6155, "time_per_iteration": 3.6472442150115967 }, { "auxiliary_loss_clip": 0.0115796, "auxiliary_loss_mlp": 0.0102226, "balance_loss_clip": 1.04622841, "balance_loss_mlp": 1.01534295, "epoch": 0.740215234774244, "flos": 21802517976960.0, "grad_norm": 3.9189665904346374, "language_loss": 0.82175851, "learning_rate": 6.669957119804612e-07, "loss": 0.8435607, "num_input_tokens_seen": 132345510, "step": 6156, "time_per_iteration": 2.84316349029541 }, { "auxiliary_loss_clip": 0.01168424, "auxiliary_loss_mlp": 0.01029068, "balance_loss_clip": 1.04804528, "balance_loss_mlp": 1.02131355, "epoch": 0.7403354776648831, "flos": 18733196816640.0, "grad_norm": 5.219010379198493, "language_loss": 0.73144877, "learning_rate": 6.66415087314702e-07, "loss": 0.75342369, "num_input_tokens_seen": 132360465, "step": 6157, "time_per_iteration": 2.7850797176361084 }, { "auxiliary_loss_clip": 0.01162327, "auxiliary_loss_mlp": 0.0102618, "balance_loss_clip": 1.04724407, "balance_loss_mlp": 1.01886904, "epoch": 0.7404557205555221, "flos": 16909581277440.0, "grad_norm": 2.0965350510044596, "language_loss": 0.73811078, "learning_rate": 6.65834664952714e-07, "loss": 0.75999582, "num_input_tokens_seen": 132377915, "step": 6158, "time_per_iteration": 3.647876024246216 }, { "auxiliary_loss_clip": 0.01158181, "auxiliary_loss_mlp": 0.0102085, "balance_loss_clip": 1.04659653, "balance_loss_mlp": 1.01410866, "epoch": 0.7405759634461613, "flos": 21214408596480.0, "grad_norm": 1.8408374329005874, "language_loss": 0.75976145, "learning_rate": 6.652544449825457e-07, "loss": 0.78155178, "num_input_tokens_seen": 132398170, "step": 6159, "time_per_iteration": 3.7275753021240234 }, { "auxiliary_loss_clip": 0.01171089, "auxiliary_loss_mlp": 0.01032487, "balance_loss_clip": 1.04920876, "balance_loss_mlp": 1.02451825, "epoch": 0.7406962063368003, "flos": 20480106862080.0, "grad_norm": 1.6831975619179353, "language_loss": 0.76672196, "learning_rate": 6.646744274922182e-07, "loss": 0.7887578, "num_input_tokens_seen": 132416615, "step": 6160, "time_per_iteration": 2.7382330894470215 }, { "auxiliary_loss_clip": 0.01160207, "auxiliary_loss_mlp": 0.01024036, "balance_loss_clip": 1.04559088, "balance_loss_mlp": 1.0167042, "epoch": 0.7408164492274394, "flos": 19791915212160.0, "grad_norm": 2.886133811204775, "language_loss": 0.75190961, "learning_rate": 6.640946125697171e-07, "loss": 0.77375197, "num_input_tokens_seen": 132434145, "step": 6161, "time_per_iteration": 2.791293144226074 }, { "auxiliary_loss_clip": 0.01163447, "auxiliary_loss_mlp": 0.01025985, "balance_loss_clip": 1.0454371, "balance_loss_mlp": 1.01816189, "epoch": 0.7409366921180786, "flos": 29204855654400.0, "grad_norm": 2.1069944979451627, "language_loss": 0.75936347, "learning_rate": 6.635150003030017e-07, "loss": 0.78125787, "num_input_tokens_seen": 132452670, "step": 6162, "time_per_iteration": 2.7624619007110596 }, { "auxiliary_loss_clip": 0.01156739, "auxiliary_loss_mlp": 0.01025309, "balance_loss_clip": 1.04600501, "balance_loss_mlp": 1.01779604, "epoch": 0.7410569350087176, "flos": 22930004960640.0, "grad_norm": 2.288974202659857, "language_loss": 0.85951507, "learning_rate": 6.629355907799981e-07, "loss": 0.8813355, "num_input_tokens_seen": 132472475, "step": 6163, "time_per_iteration": 2.766726016998291 }, { "auxiliary_loss_clip": 0.01165104, "auxiliary_loss_mlp": 0.01022833, "balance_loss_clip": 1.04583788, "balance_loss_mlp": 1.01506042, "epoch": 0.7411771778993567, "flos": 30440397726720.0, "grad_norm": 1.7736021441125351, "language_loss": 0.6914379, "learning_rate": 6.623563840886015e-07, "loss": 0.71331728, "num_input_tokens_seen": 132493400, "step": 6164, "time_per_iteration": 2.773808479309082 }, { "auxiliary_loss_clip": 0.01159309, "auxiliary_loss_mlp": 0.01024447, "balance_loss_clip": 1.04536295, "balance_loss_mlp": 1.017151, "epoch": 0.7412974207899958, "flos": 20522050968960.0, "grad_norm": 1.9477664938200474, "language_loss": 0.69678527, "learning_rate": 6.617773803166795e-07, "loss": 0.7186228, "num_input_tokens_seen": 132511725, "step": 6165, "time_per_iteration": 3.5237109661102295 }, { "auxiliary_loss_clip": 0.01163771, "auxiliary_loss_mlp": 0.01060303, "balance_loss_clip": 1.04701042, "balance_loss_mlp": 1.0246489, "epoch": 0.7414176636806349, "flos": 22090700793600.0, "grad_norm": 2.2806883741547006, "language_loss": 0.8199926, "learning_rate": 6.611985795520634e-07, "loss": 0.84223336, "num_input_tokens_seen": 132530270, "step": 6166, "time_per_iteration": 2.7791013717651367 }, { "auxiliary_loss_clip": 0.01167955, "auxiliary_loss_mlp": 0.0102427, "balance_loss_clip": 1.05029428, "balance_loss_mlp": 1.0161761, "epoch": 0.7415379065712739, "flos": 25155245445120.0, "grad_norm": 1.9541056405265962, "language_loss": 0.77403504, "learning_rate": 6.606199818825588e-07, "loss": 0.79595727, "num_input_tokens_seen": 132550725, "step": 6167, "time_per_iteration": 2.7391257286071777 }, { "auxiliary_loss_clip": 0.01166332, "auxiliary_loss_mlp": 0.01025435, "balance_loss_clip": 1.04573631, "balance_loss_mlp": 1.01821685, "epoch": 0.7416581494619131, "flos": 16871731320960.0, "grad_norm": 2.707837389792877, "language_loss": 0.81837714, "learning_rate": 6.600415873959377e-07, "loss": 0.84029484, "num_input_tokens_seen": 132568600, "step": 6168, "time_per_iteration": 2.809795379638672 }, { "auxiliary_loss_clip": 0.01151447, "auxiliary_loss_mlp": 0.01052797, "balance_loss_clip": 1.04621279, "balance_loss_mlp": 1.01772928, "epoch": 0.7417783923525522, "flos": 28438881102720.0, "grad_norm": 2.4749320789609164, "language_loss": 0.64860088, "learning_rate": 6.594633961799437e-07, "loss": 0.67064333, "num_input_tokens_seen": 132587640, "step": 6169, "time_per_iteration": 2.803457736968994 }, { "auxiliary_loss_clip": 0.01164861, "auxiliary_loss_mlp": 0.01026335, "balance_loss_clip": 1.04646158, "balance_loss_mlp": 1.01887524, "epoch": 0.7418986352431912, "flos": 20084299920000.0, "grad_norm": 1.6298434801896686, "language_loss": 0.81775022, "learning_rate": 6.588854083222857e-07, "loss": 0.83966219, "num_input_tokens_seen": 132607075, "step": 6170, "time_per_iteration": 2.8558952808380127 }, { "auxiliary_loss_clip": 0.01166118, "auxiliary_loss_mlp": 0.0102369, "balance_loss_clip": 1.04822052, "balance_loss_mlp": 1.01615024, "epoch": 0.7420188781338304, "flos": 18259571059200.0, "grad_norm": 2.145069054460442, "language_loss": 0.81013459, "learning_rate": 6.583076239106444e-07, "loss": 0.83203268, "num_input_tokens_seen": 132625580, "step": 6171, "time_per_iteration": 2.663710117340088 }, { "auxiliary_loss_clip": 0.0116628, "auxiliary_loss_mlp": 0.01025897, "balance_loss_clip": 1.04717767, "balance_loss_mlp": 1.01821411, "epoch": 0.7421391210244694, "flos": 13771994319360.0, "grad_norm": 2.2869172958770387, "language_loss": 0.75963891, "learning_rate": 6.577300430326707e-07, "loss": 0.78156072, "num_input_tokens_seen": 132640525, "step": 6172, "time_per_iteration": 2.7657713890075684 }, { "auxiliary_loss_clip": 0.01154442, "auxiliary_loss_mlp": 0.01026904, "balance_loss_clip": 1.04756546, "balance_loss_mlp": 1.01980805, "epoch": 0.7422593639151085, "flos": 15961683317760.0, "grad_norm": 2.8820561639513556, "language_loss": 0.72043967, "learning_rate": 6.571526657759821e-07, "loss": 0.74225318, "num_input_tokens_seen": 132656265, "step": 6173, "time_per_iteration": 2.7995059490203857 }, { "auxiliary_loss_clip": 0.01160403, "auxiliary_loss_mlp": 0.01026972, "balance_loss_clip": 1.04640031, "balance_loss_mlp": 1.01953912, "epoch": 0.7423796068057477, "flos": 30114400867200.0, "grad_norm": 1.6476684125428902, "language_loss": 0.70963222, "learning_rate": 6.565754922281663e-07, "loss": 0.73150599, "num_input_tokens_seen": 132678510, "step": 6174, "time_per_iteration": 2.8550877571105957 }, { "auxiliary_loss_clip": 0.01160132, "auxiliary_loss_mlp": 0.01024346, "balance_loss_clip": 1.04652667, "balance_loss_mlp": 1.01697612, "epoch": 0.7424998496963867, "flos": 20521907314560.0, "grad_norm": 1.9258244633286792, "language_loss": 0.78351438, "learning_rate": 6.559985224767801e-07, "loss": 0.80535913, "num_input_tokens_seen": 132696385, "step": 6175, "time_per_iteration": 2.723909854888916 }, { "auxiliary_loss_clip": 0.01163496, "auxiliary_loss_mlp": 0.01028285, "balance_loss_clip": 1.04629529, "balance_loss_mlp": 1.02047729, "epoch": 0.7426200925870258, "flos": 21871573873920.0, "grad_norm": 2.6031962158633584, "language_loss": 0.75452209, "learning_rate": 6.55421756609349e-07, "loss": 0.77643991, "num_input_tokens_seen": 132714640, "step": 6176, "time_per_iteration": 2.6713430881500244 }, { "auxiliary_loss_clip": 0.01160801, "auxiliary_loss_mlp": 0.01024258, "balance_loss_clip": 1.04812074, "balance_loss_mlp": 1.01656306, "epoch": 0.7427403354776649, "flos": 26432049265920.0, "grad_norm": 1.798770452938704, "language_loss": 0.78938425, "learning_rate": 6.54845194713369e-07, "loss": 0.81123483, "num_input_tokens_seen": 132735590, "step": 6177, "time_per_iteration": 2.7627322673797607 }, { "auxiliary_loss_clip": 0.01163776, "auxiliary_loss_mlp": 0.0103209, "balance_loss_clip": 1.04988861, "balance_loss_mlp": 1.02481842, "epoch": 0.742860578368304, "flos": 19898390102400.0, "grad_norm": 1.9546034968715982, "language_loss": 0.80035293, "learning_rate": 6.542688368763034e-07, "loss": 0.82231158, "num_input_tokens_seen": 132753995, "step": 6178, "time_per_iteration": 2.7962520122528076 }, { "auxiliary_loss_clip": 0.01160992, "auxiliary_loss_mlp": 0.01026523, "balance_loss_clip": 1.0474658, "balance_loss_mlp": 1.01964426, "epoch": 0.742980821258943, "flos": 24827201510400.0, "grad_norm": 1.5425936844084283, "language_loss": 0.76909572, "learning_rate": 6.536926831855854e-07, "loss": 0.7909708, "num_input_tokens_seen": 132773160, "step": 6179, "time_per_iteration": 2.743504047393799 }, { "auxiliary_loss_clip": 0.01157919, "auxiliary_loss_mlp": 0.01024558, "balance_loss_clip": 1.04638505, "balance_loss_mlp": 1.01724482, "epoch": 0.7431010641495821, "flos": 25228646887680.0, "grad_norm": 20.425974570145485, "language_loss": 0.73165023, "learning_rate": 6.531167337286165e-07, "loss": 0.75347507, "num_input_tokens_seen": 132793180, "step": 6180, "time_per_iteration": 2.818382501602173 }, { "auxiliary_loss_clip": 0.01158828, "auxiliary_loss_mlp": 0.01024562, "balance_loss_clip": 1.04883313, "balance_loss_mlp": 1.01721001, "epoch": 0.7432213070402213, "flos": 21762369550080.0, "grad_norm": 1.4890076578018838, "language_loss": 0.79804993, "learning_rate": 6.52540988592768e-07, "loss": 0.81988382, "num_input_tokens_seen": 132814200, "step": 6181, "time_per_iteration": 3.633657932281494 }, { "auxiliary_loss_clip": 0.01163261, "auxiliary_loss_mlp": 0.01027314, "balance_loss_clip": 1.04679787, "balance_loss_mlp": 1.02005672, "epoch": 0.7433415499308603, "flos": 14793832425600.0, "grad_norm": 6.959561585818351, "language_loss": 0.83360797, "learning_rate": 6.519654478653814e-07, "loss": 0.85551369, "num_input_tokens_seen": 132832565, "step": 6182, "time_per_iteration": 2.7952470779418945 }, { "auxiliary_loss_clip": 0.0106405, "auxiliary_loss_mlp": 0.01001147, "balance_loss_clip": 1.00909591, "balance_loss_mlp": 1.00015211, "epoch": 0.7434617928214994, "flos": 67155577297920.0, "grad_norm": 0.7499772353327054, "language_loss": 0.56070948, "learning_rate": 6.51390111633763e-07, "loss": 0.58136141, "num_input_tokens_seen": 132897840, "step": 6183, "time_per_iteration": 3.3500020503997803 }, { "auxiliary_loss_clip": 0.01156329, "auxiliary_loss_mlp": 0.0102704, "balance_loss_clip": 1.05017447, "balance_loss_mlp": 1.02017927, "epoch": 0.7435820357121385, "flos": 27377576928000.0, "grad_norm": 1.6314468081776765, "language_loss": 0.76480401, "learning_rate": 6.508149799851932e-07, "loss": 0.78663766, "num_input_tokens_seen": 132919505, "step": 6184, "time_per_iteration": 3.847440719604492 }, { "auxiliary_loss_clip": 0.01154195, "auxiliary_loss_mlp": 0.01024243, "balance_loss_clip": 1.04486525, "balance_loss_mlp": 1.01753163, "epoch": 0.7437022786027776, "flos": 23987645948160.0, "grad_norm": 1.8408075338644643, "language_loss": 0.61128622, "learning_rate": 6.502400530069183e-07, "loss": 0.63307059, "num_input_tokens_seen": 132939390, "step": 6185, "time_per_iteration": 3.8671326637268066 }, { "auxiliary_loss_clip": 0.01162089, "auxiliary_loss_mlp": 0.01032194, "balance_loss_clip": 1.05187869, "balance_loss_mlp": 1.02430856, "epoch": 0.7438225214934167, "flos": 21866761451520.0, "grad_norm": 2.102815702540646, "language_loss": 0.68708622, "learning_rate": 6.496653307861535e-07, "loss": 0.70902908, "num_input_tokens_seen": 132960060, "step": 6186, "time_per_iteration": 2.7647485733032227 }, { "auxiliary_loss_clip": 0.01168491, "auxiliary_loss_mlp": 0.01028612, "balance_loss_clip": 1.0475812, "balance_loss_mlp": 1.02154315, "epoch": 0.7439427643840558, "flos": 20230097224320.0, "grad_norm": 1.8344383475255228, "language_loss": 0.65563309, "learning_rate": 6.490908134100857e-07, "loss": 0.67760414, "num_input_tokens_seen": 132978525, "step": 6187, "time_per_iteration": 2.795315980911255 }, { "auxiliary_loss_clip": 0.01168734, "auxiliary_loss_mlp": 0.01030014, "balance_loss_clip": 1.04835701, "balance_loss_mlp": 1.02206278, "epoch": 0.7440630072746949, "flos": 20849915335680.0, "grad_norm": 2.055303873809372, "language_loss": 0.69443452, "learning_rate": 6.48516500965866e-07, "loss": 0.71642202, "num_input_tokens_seen": 132998460, "step": 6188, "time_per_iteration": 2.7707126140594482 }, { "auxiliary_loss_clip": 0.01165451, "auxiliary_loss_mlp": 0.01024427, "balance_loss_clip": 1.04356194, "balance_loss_mlp": 1.01673198, "epoch": 0.7441832501653339, "flos": 26503762769280.0, "grad_norm": 1.6971570503621323, "language_loss": 0.8190974, "learning_rate": 6.479423935406192e-07, "loss": 0.84099621, "num_input_tokens_seen": 133018445, "step": 6189, "time_per_iteration": 2.8460705280303955 }, { "auxiliary_loss_clip": 0.01062742, "auxiliary_loss_mlp": 0.01001713, "balance_loss_clip": 1.0144242, "balance_loss_mlp": 1.00068188, "epoch": 0.7443034930559731, "flos": 68602848088320.0, "grad_norm": 0.8017434533649956, "language_loss": 0.61999571, "learning_rate": 6.473684912214357e-07, "loss": 0.64064032, "num_input_tokens_seen": 133082005, "step": 6190, "time_per_iteration": 3.4208598136901855 }, { "auxiliary_loss_clip": 0.01164118, "auxiliary_loss_mlp": 0.0102641, "balance_loss_clip": 1.04725647, "balance_loss_mlp": 1.01859546, "epoch": 0.7444237359466122, "flos": 18654982951680.0, "grad_norm": 1.8622549048707626, "language_loss": 0.69490719, "learning_rate": 6.467947940953778e-07, "loss": 0.71681243, "num_input_tokens_seen": 133100530, "step": 6191, "time_per_iteration": 3.6249659061431885 }, { "auxiliary_loss_clip": 0.0115893, "auxiliary_loss_mlp": 0.01020531, "balance_loss_clip": 1.04497755, "balance_loss_mlp": 1.01374459, "epoch": 0.7445439788372512, "flos": 22817604326400.0, "grad_norm": 1.8909003201504746, "language_loss": 0.72624516, "learning_rate": 6.462213022494732e-07, "loss": 0.74803972, "num_input_tokens_seen": 133119775, "step": 6192, "time_per_iteration": 2.8410048484802246 }, { "auxiliary_loss_clip": 0.01063582, "auxiliary_loss_mlp": 0.01001434, "balance_loss_clip": 1.00932384, "balance_loss_mlp": 1.00042653, "epoch": 0.7446642217278904, "flos": 67045690615680.0, "grad_norm": 0.7705690956700141, "language_loss": 0.61027974, "learning_rate": 6.456480157707201e-07, "loss": 0.63092989, "num_input_tokens_seen": 133184550, "step": 6193, "time_per_iteration": 3.2449071407318115 }, { "auxiliary_loss_clip": 0.0115525, "auxiliary_loss_mlp": 0.01029341, "balance_loss_clip": 1.04970241, "balance_loss_mlp": 1.0221405, "epoch": 0.7447844646185294, "flos": 17417465631360.0, "grad_norm": 2.068920004702362, "language_loss": 0.85242963, "learning_rate": 6.450749347460866e-07, "loss": 0.87427551, "num_input_tokens_seen": 133201525, "step": 6194, "time_per_iteration": 2.722022294998169 }, { "auxiliary_loss_clip": 0.0116729, "auxiliary_loss_mlp": 0.01024525, "balance_loss_clip": 1.04641223, "balance_loss_mlp": 1.01636529, "epoch": 0.7449047075091685, "flos": 26615876094720.0, "grad_norm": 1.8233960908887223, "language_loss": 0.78633422, "learning_rate": 6.445020592625083e-07, "loss": 0.80825233, "num_input_tokens_seen": 133222175, "step": 6195, "time_per_iteration": 2.75051212310791 }, { "auxiliary_loss_clip": 0.01165467, "auxiliary_loss_mlp": 0.01027929, "balance_loss_clip": 1.0459578, "balance_loss_mlp": 1.02087474, "epoch": 0.7450249503998077, "flos": 14170458867840.0, "grad_norm": 2.270784619043749, "language_loss": 0.79728901, "learning_rate": 6.4392938940689e-07, "loss": 0.81922293, "num_input_tokens_seen": 133237590, "step": 6196, "time_per_iteration": 2.658940553665161 }, { "auxiliary_loss_clip": 0.01151415, "auxiliary_loss_mlp": 0.01060457, "balance_loss_clip": 1.04860616, "balance_loss_mlp": 1.02367699, "epoch": 0.7451451932904467, "flos": 19606687752960.0, "grad_norm": 2.3966338662067983, "language_loss": 0.71238351, "learning_rate": 6.433569252661049e-07, "loss": 0.7345022, "num_input_tokens_seen": 133255590, "step": 6197, "time_per_iteration": 2.871645450592041 }, { "auxiliary_loss_clip": 0.01152279, "auxiliary_loss_mlp": 0.0102674, "balance_loss_clip": 1.04580045, "balance_loss_mlp": 1.02012146, "epoch": 0.7452654361810858, "flos": 12495405980160.0, "grad_norm": 2.804562653392996, "language_loss": 0.71345276, "learning_rate": 6.427846669269952e-07, "loss": 0.73524296, "num_input_tokens_seen": 133273210, "step": 6198, "time_per_iteration": 2.779567241668701 }, { "auxiliary_loss_clip": 0.01171152, "auxiliary_loss_mlp": 0.0102977, "balance_loss_clip": 1.05146122, "balance_loss_mlp": 1.02273655, "epoch": 0.7453856790717249, "flos": 22127329687680.0, "grad_norm": 3.6042027314626774, "language_loss": 0.82359314, "learning_rate": 6.422126144763729e-07, "loss": 0.84560239, "num_input_tokens_seen": 133292600, "step": 6199, "time_per_iteration": 2.695106267929077 }, { "auxiliary_loss_clip": 0.01159166, "auxiliary_loss_mlp": 0.01054512, "balance_loss_clip": 1.04624844, "balance_loss_mlp": 1.01962543, "epoch": 0.745505921962364, "flos": 20010682995840.0, "grad_norm": 3.2852808435426866, "language_loss": 0.76751471, "learning_rate": 6.416407680010174e-07, "loss": 0.78965139, "num_input_tokens_seen": 133306960, "step": 6200, "time_per_iteration": 2.8638081550598145 }, { "auxiliary_loss_clip": 0.01167558, "auxiliary_loss_mlp": 0.01024173, "balance_loss_clip": 1.04825521, "balance_loss_mlp": 1.01672852, "epoch": 0.745626164853003, "flos": 24677884673280.0, "grad_norm": 2.5728605935377553, "language_loss": 0.81166625, "learning_rate": 6.410691275876774e-07, "loss": 0.83358359, "num_input_tokens_seen": 133326380, "step": 6201, "time_per_iteration": 2.901262044906616 }, { "auxiliary_loss_clip": 0.01167137, "auxiliary_loss_mlp": 0.01024318, "balance_loss_clip": 1.04690826, "balance_loss_mlp": 1.016891, "epoch": 0.7457464077436422, "flos": 14538830797440.0, "grad_norm": 2.257974698798398, "language_loss": 0.76743132, "learning_rate": 6.404976933230704e-07, "loss": 0.78934586, "num_input_tokens_seen": 133342900, "step": 6202, "time_per_iteration": 2.736121416091919 }, { "auxiliary_loss_clip": 0.01168876, "auxiliary_loss_mlp": 0.01022789, "balance_loss_clip": 1.04986, "balance_loss_mlp": 1.01502228, "epoch": 0.7458666506342813, "flos": 34021194600960.0, "grad_norm": 1.9974163064733468, "language_loss": 0.72436816, "learning_rate": 6.399264652938813e-07, "loss": 0.74628478, "num_input_tokens_seen": 133363805, "step": 6203, "time_per_iteration": 2.8497135639190674 }, { "auxiliary_loss_clip": 0.01159916, "auxiliary_loss_mlp": 0.01027801, "balance_loss_clip": 1.0466882, "balance_loss_mlp": 1.01999283, "epoch": 0.7459868935249203, "flos": 24279025075200.0, "grad_norm": 1.909525871842739, "language_loss": 0.74350685, "learning_rate": 6.393554435867679e-07, "loss": 0.76538408, "num_input_tokens_seen": 133384655, "step": 6204, "time_per_iteration": 2.797963857650757 }, { "auxiliary_loss_clip": 0.01157025, "auxiliary_loss_mlp": 0.01032656, "balance_loss_clip": 1.04822624, "balance_loss_mlp": 1.02478838, "epoch": 0.7461071364155595, "flos": 21908777385600.0, "grad_norm": 2.45981359116472, "language_loss": 0.83799046, "learning_rate": 6.387846282883502e-07, "loss": 0.8598873, "num_input_tokens_seen": 133401185, "step": 6205, "time_per_iteration": 2.7791953086853027 }, { "auxiliary_loss_clip": 0.01165128, "auxiliary_loss_mlp": 0.0102222, "balance_loss_clip": 1.0470171, "balance_loss_mlp": 1.014382, "epoch": 0.7462273793061985, "flos": 22889712879360.0, "grad_norm": 2.0585869077233565, "language_loss": 0.76597917, "learning_rate": 6.38214019485223e-07, "loss": 0.78785264, "num_input_tokens_seen": 133420010, "step": 6206, "time_per_iteration": 2.6998088359832764 }, { "auxiliary_loss_clip": 0.01148752, "auxiliary_loss_mlp": 0.01023149, "balance_loss_clip": 1.04603243, "balance_loss_mlp": 1.01593661, "epoch": 0.7463476221968376, "flos": 19968451580160.0, "grad_norm": 1.6795738241927527, "language_loss": 0.7142086, "learning_rate": 6.376436172639461e-07, "loss": 0.73592758, "num_input_tokens_seen": 133437855, "step": 6207, "time_per_iteration": 3.774352550506592 }, { "auxiliary_loss_clip": 0.0115104, "auxiliary_loss_mlp": 0.01026898, "balance_loss_clip": 1.04736185, "balance_loss_mlp": 1.01883972, "epoch": 0.7464678650874768, "flos": 16836610798080.0, "grad_norm": 2.478874615054495, "language_loss": 0.65667892, "learning_rate": 6.370734217110487e-07, "loss": 0.67845827, "num_input_tokens_seen": 133456600, "step": 6208, "time_per_iteration": 2.801469087600708 }, { "auxiliary_loss_clip": 0.01165229, "auxiliary_loss_mlp": 0.01024404, "balance_loss_clip": 1.04890954, "balance_loss_mlp": 1.01668549, "epoch": 0.7465881079781158, "flos": 48100869843840.0, "grad_norm": 1.5936587625253578, "language_loss": 0.64350927, "learning_rate": 6.36503432913031e-07, "loss": 0.66540563, "num_input_tokens_seen": 133479745, "step": 6209, "time_per_iteration": 3.015563726425171 }, { "auxiliary_loss_clip": 0.01165081, "auxiliary_loss_mlp": 0.01022203, "balance_loss_clip": 1.0484581, "balance_loss_mlp": 1.01424026, "epoch": 0.7467083508687549, "flos": 19677359761920.0, "grad_norm": 1.8737254454873096, "language_loss": 0.69109428, "learning_rate": 6.359336509563569e-07, "loss": 0.71296716, "num_input_tokens_seen": 133495765, "step": 6210, "time_per_iteration": 3.7502987384796143 }, { "auxiliary_loss_clip": 0.01148905, "auxiliary_loss_mlp": 0.01024595, "balance_loss_clip": 1.04847407, "balance_loss_mlp": 1.01718009, "epoch": 0.7468285937593939, "flos": 17895436934400.0, "grad_norm": 2.1148767838003075, "language_loss": 0.80783689, "learning_rate": 6.353640759274641e-07, "loss": 0.8295719, "num_input_tokens_seen": 133514655, "step": 6211, "time_per_iteration": 3.713407039642334 }, { "auxiliary_loss_clip": 0.01163958, "auxiliary_loss_mlp": 0.01024839, "balance_loss_clip": 1.04590833, "balance_loss_mlp": 1.01707602, "epoch": 0.7469488366500331, "flos": 23141446369920.0, "grad_norm": 3.3786702831663367, "language_loss": 0.75270736, "learning_rate": 6.347947079127556e-07, "loss": 0.77459538, "num_input_tokens_seen": 133532555, "step": 6212, "time_per_iteration": 2.714052438735962 }, { "auxiliary_loss_clip": 0.01158376, "auxiliary_loss_mlp": 0.01025876, "balance_loss_clip": 1.0481894, "balance_loss_mlp": 1.0185802, "epoch": 0.7470690795406721, "flos": 16690849407360.0, "grad_norm": 2.142074695993658, "language_loss": 0.76898658, "learning_rate": 6.342255469986053e-07, "loss": 0.79082918, "num_input_tokens_seen": 133551300, "step": 6213, "time_per_iteration": 2.711339235305786 }, { "auxiliary_loss_clip": 0.01165405, "auxiliary_loss_mlp": 0.0102371, "balance_loss_clip": 1.04690576, "balance_loss_mlp": 1.01655769, "epoch": 0.7471893224313112, "flos": 25192700352000.0, "grad_norm": 1.9693427372217522, "language_loss": 0.76105464, "learning_rate": 6.336565932713533e-07, "loss": 0.78294581, "num_input_tokens_seen": 133570725, "step": 6214, "time_per_iteration": 2.734541654586792 }, { "auxiliary_loss_clip": 0.01154165, "auxiliary_loss_mlp": 0.01026581, "balance_loss_clip": 1.0460794, "balance_loss_mlp": 1.01908922, "epoch": 0.7473095653219504, "flos": 22526225199360.0, "grad_norm": 2.2583917146899024, "language_loss": 0.77638388, "learning_rate": 6.330878468173088e-07, "loss": 0.79819131, "num_input_tokens_seen": 133590790, "step": 6215, "time_per_iteration": 2.827580690383911 }, { "auxiliary_loss_clip": 0.01157172, "auxiliary_loss_mlp": 0.01025175, "balance_loss_clip": 1.04638553, "balance_loss_mlp": 1.01730442, "epoch": 0.7474298082125894, "flos": 18113989236480.0, "grad_norm": 2.152267665070532, "language_loss": 0.73174393, "learning_rate": 6.32519307722752e-07, "loss": 0.7535674, "num_input_tokens_seen": 133608685, "step": 6216, "time_per_iteration": 2.7701003551483154 }, { "auxiliary_loss_clip": 0.01061315, "auxiliary_loss_mlp": 0.01003458, "balance_loss_clip": 1.01566887, "balance_loss_mlp": 1.00243855, "epoch": 0.7475500511032285, "flos": 62086535193600.0, "grad_norm": 0.8423479011541726, "language_loss": 0.5498035, "learning_rate": 6.31950976073929e-07, "loss": 0.5704512, "num_input_tokens_seen": 133662775, "step": 6217, "time_per_iteration": 4.169734954833984 }, { "auxiliary_loss_clip": 0.01151963, "auxiliary_loss_mlp": 0.01023873, "balance_loss_clip": 1.0477252, "balance_loss_mlp": 1.01591015, "epoch": 0.7476702939938676, "flos": 17785586165760.0, "grad_norm": 2.2610085725335787, "language_loss": 0.80631053, "learning_rate": 6.31382851957055e-07, "loss": 0.82806891, "num_input_tokens_seen": 133679595, "step": 6218, "time_per_iteration": 2.739373207092285 }, { "auxiliary_loss_clip": 0.01154397, "auxiliary_loss_mlp": 0.01047796, "balance_loss_clip": 1.04804754, "balance_loss_mlp": 1.01385236, "epoch": 0.7477905368845067, "flos": 27927944092800.0, "grad_norm": 2.055418159852696, "language_loss": 0.71806848, "learning_rate": 6.308149354583143e-07, "loss": 0.74009037, "num_input_tokens_seen": 133699000, "step": 6219, "time_per_iteration": 2.729595422744751 }, { "auxiliary_loss_clip": 0.0117013, "auxiliary_loss_mlp": 0.01031984, "balance_loss_clip": 1.04928386, "balance_loss_mlp": 1.02377069, "epoch": 0.7479107797751458, "flos": 26870374932480.0, "grad_norm": 6.23203361856775, "language_loss": 0.81334716, "learning_rate": 6.302472266638586e-07, "loss": 0.83536828, "num_input_tokens_seen": 133719540, "step": 6220, "time_per_iteration": 2.7770936489105225 }, { "auxiliary_loss_clip": 0.01174485, "auxiliary_loss_mlp": 0.01025569, "balance_loss_clip": 1.04976177, "balance_loss_mlp": 1.01718879, "epoch": 0.7480310226657849, "flos": 33943375785600.0, "grad_norm": 2.0896433481492824, "language_loss": 0.70217931, "learning_rate": 6.296797256598101e-07, "loss": 0.72417992, "num_input_tokens_seen": 133741020, "step": 6221, "time_per_iteration": 2.825075626373291 }, { "auxiliary_loss_clip": 0.01149007, "auxiliary_loss_mlp": 0.01022165, "balance_loss_clip": 1.04609656, "balance_loss_mlp": 1.01444328, "epoch": 0.748151265556424, "flos": 24826555065600.0, "grad_norm": 2.0150863578321503, "language_loss": 0.81239069, "learning_rate": 6.291124325322576e-07, "loss": 0.83410239, "num_input_tokens_seen": 133761145, "step": 6222, "time_per_iteration": 2.7984824180603027 }, { "auxiliary_loss_clip": 0.01168038, "auxiliary_loss_mlp": 0.0102894, "balance_loss_clip": 1.05013692, "balance_loss_mlp": 1.02160859, "epoch": 0.748271508447063, "flos": 38399351535360.0, "grad_norm": 1.8071809979290467, "language_loss": 0.62490869, "learning_rate": 6.285453473672595e-07, "loss": 0.64687854, "num_input_tokens_seen": 133783715, "step": 6223, "time_per_iteration": 2.821329355239868 }, { "auxiliary_loss_clip": 0.01164377, "auxiliary_loss_mlp": 0.01022917, "balance_loss_clip": 1.04439878, "balance_loss_mlp": 1.01559162, "epoch": 0.7483917513377022, "flos": 21541842000000.0, "grad_norm": 1.9602686627968184, "language_loss": 0.75685859, "learning_rate": 6.279784702508415e-07, "loss": 0.77873147, "num_input_tokens_seen": 133804465, "step": 6224, "time_per_iteration": 2.678004503250122 }, { "auxiliary_loss_clip": 0.01064531, "auxiliary_loss_mlp": 0.00999755, "balance_loss_clip": 1.00836587, "balance_loss_mlp": 0.99874783, "epoch": 0.7485119942283412, "flos": 62314532772480.0, "grad_norm": 0.7840594149968485, "language_loss": 0.58591479, "learning_rate": 6.274118012689979e-07, "loss": 0.60655767, "num_input_tokens_seen": 133866365, "step": 6225, "time_per_iteration": 3.380171060562134 }, { "auxiliary_loss_clip": 0.01156402, "auxiliary_loss_mlp": 0.01025851, "balance_loss_clip": 1.04823112, "balance_loss_mlp": 1.01832271, "epoch": 0.7486322371189803, "flos": 29937613104000.0, "grad_norm": 1.4699750756515637, "language_loss": 0.67874897, "learning_rate": 6.268453405076943e-07, "loss": 0.70057154, "num_input_tokens_seen": 133888760, "step": 6226, "time_per_iteration": 2.8390133380889893 }, { "auxiliary_loss_clip": 0.01160154, "auxiliary_loss_mlp": 0.01025983, "balance_loss_clip": 1.04545486, "balance_loss_mlp": 1.01906633, "epoch": 0.7487524800096195, "flos": 18949414734720.0, "grad_norm": 2.0858535360188575, "language_loss": 0.82630014, "learning_rate": 6.262790880528592e-07, "loss": 0.84816152, "num_input_tokens_seen": 133906380, "step": 6227, "time_per_iteration": 2.7714412212371826 }, { "auxiliary_loss_clip": 0.01172076, "auxiliary_loss_mlp": 0.01030212, "balance_loss_clip": 1.04902816, "balance_loss_mlp": 1.02242184, "epoch": 0.7488727229002585, "flos": 18697393935360.0, "grad_norm": 2.7373724520322034, "language_loss": 0.79611266, "learning_rate": 6.257130439903951e-07, "loss": 0.8181355, "num_input_tokens_seen": 133922875, "step": 6228, "time_per_iteration": 2.685173273086548 }, { "auxiliary_loss_clip": 0.01170478, "auxiliary_loss_mlp": 0.0102466, "balance_loss_clip": 1.04929233, "balance_loss_mlp": 1.01737058, "epoch": 0.7489929657908976, "flos": 23623368168960.0, "grad_norm": 1.913521747674225, "language_loss": 0.81103116, "learning_rate": 6.251472084061695e-07, "loss": 0.8329826, "num_input_tokens_seen": 133941795, "step": 6229, "time_per_iteration": 2.756714344024658 }, { "auxiliary_loss_clip": 0.01160198, "auxiliary_loss_mlp": 0.01023153, "balance_loss_clip": 1.04667389, "balance_loss_mlp": 1.01624465, "epoch": 0.7491132086815367, "flos": 20551533056640.0, "grad_norm": 2.6021181092223302, "language_loss": 0.89004892, "learning_rate": 6.245815813860191e-07, "loss": 0.9118824, "num_input_tokens_seen": 133957305, "step": 6230, "time_per_iteration": 2.6773767471313477 }, { "auxiliary_loss_clip": 0.0117053, "auxiliary_loss_mlp": 0.01024713, "balance_loss_clip": 1.0471549, "balance_loss_mlp": 1.01647592, "epoch": 0.7492334515721758, "flos": 23003011353600.0, "grad_norm": 2.049282206116317, "language_loss": 0.70093244, "learning_rate": 6.240161630157495e-07, "loss": 0.72288489, "num_input_tokens_seen": 133976660, "step": 6231, "time_per_iteration": 2.7108824253082275 }, { "auxiliary_loss_clip": 0.01169923, "auxiliary_loss_mlp": 0.01026195, "balance_loss_clip": 1.04823864, "balance_loss_mlp": 1.01875687, "epoch": 0.7493536944628149, "flos": 16398823835520.0, "grad_norm": 2.0808947294349704, "language_loss": 0.70663595, "learning_rate": 6.23450953381133e-07, "loss": 0.72859716, "num_input_tokens_seen": 133994750, "step": 6232, "time_per_iteration": 2.7096920013427734 }, { "auxiliary_loss_clip": 0.01156664, "auxiliary_loss_mlp": 0.01026839, "balance_loss_clip": 1.04847121, "balance_loss_mlp": 1.01985657, "epoch": 0.749473937353454, "flos": 15338561155200.0, "grad_norm": 1.9122724798111268, "language_loss": 0.67748427, "learning_rate": 6.228859525679131e-07, "loss": 0.69931936, "num_input_tokens_seen": 134009165, "step": 6233, "time_per_iteration": 3.626298666000366 }, { "auxiliary_loss_clip": 0.01163241, "auxiliary_loss_mlp": 0.01023545, "balance_loss_clip": 1.04718935, "balance_loss_mlp": 1.01611876, "epoch": 0.7495941802440931, "flos": 18951138587520.0, "grad_norm": 2.21423011379981, "language_loss": 0.79650229, "learning_rate": 6.223211606617986e-07, "loss": 0.81837022, "num_input_tokens_seen": 134027585, "step": 6234, "time_per_iteration": 2.828517436981201 }, { "auxiliary_loss_clip": 0.01161096, "auxiliary_loss_mlp": 0.01028723, "balance_loss_clip": 1.04919779, "balance_loss_mlp": 1.02232397, "epoch": 0.7497144231347321, "flos": 22492469393280.0, "grad_norm": 1.8352558521460771, "language_loss": 0.84151369, "learning_rate": 6.217565777484701e-07, "loss": 0.8634119, "num_input_tokens_seen": 134046680, "step": 6235, "time_per_iteration": 3.6215548515319824 }, { "auxiliary_loss_clip": 0.01155149, "auxiliary_loss_mlp": 0.01053004, "balance_loss_clip": 1.04611635, "balance_loss_mlp": 1.01744509, "epoch": 0.7498346660253713, "flos": 24243509502720.0, "grad_norm": 1.9812242089860417, "language_loss": 0.80218768, "learning_rate": 6.211922039135722e-07, "loss": 0.82426918, "num_input_tokens_seen": 134066825, "step": 6236, "time_per_iteration": 2.782515048980713 }, { "auxiliary_loss_clip": 0.01168733, "auxiliary_loss_mlp": 0.01025001, "balance_loss_clip": 1.04840767, "balance_loss_mlp": 1.01750827, "epoch": 0.7499549089160104, "flos": 24387080163840.0, "grad_norm": 1.8599487608990661, "language_loss": 0.81095195, "learning_rate": 6.206280392427201e-07, "loss": 0.8328892, "num_input_tokens_seen": 134086410, "step": 6237, "time_per_iteration": 3.643091917037964 }, { "auxiliary_loss_clip": 0.01158844, "auxiliary_loss_mlp": 0.01026164, "balance_loss_clip": 1.04625881, "balance_loss_mlp": 1.01880562, "epoch": 0.7500751518066494, "flos": 34057320704640.0, "grad_norm": 3.259949499570241, "language_loss": 0.73816663, "learning_rate": 6.200640838214983e-07, "loss": 0.76001668, "num_input_tokens_seen": 134109185, "step": 6238, "time_per_iteration": 2.7979865074157715 }, { "auxiliary_loss_clip": 0.01164252, "auxiliary_loss_mlp": 0.0102169, "balance_loss_clip": 1.04580522, "balance_loss_mlp": 1.01448405, "epoch": 0.7501953946972886, "flos": 18843586289280.0, "grad_norm": 1.8566724994166157, "language_loss": 0.67230552, "learning_rate": 6.195003377354578e-07, "loss": 0.69416493, "num_input_tokens_seen": 134128455, "step": 6239, "time_per_iteration": 2.6260290145874023 }, { "auxiliary_loss_clip": 0.01160343, "auxiliary_loss_mlp": 0.01027661, "balance_loss_clip": 1.04555798, "balance_loss_mlp": 1.02009094, "epoch": 0.7503156375879276, "flos": 20257675891200.0, "grad_norm": 2.4707983115021457, "language_loss": 0.73284882, "learning_rate": 6.189368010701183e-07, "loss": 0.75472879, "num_input_tokens_seen": 134145515, "step": 6240, "time_per_iteration": 2.816488742828369 }, { "auxiliary_loss_clip": 0.01168288, "auxiliary_loss_mlp": 0.01026673, "balance_loss_clip": 1.04629183, "balance_loss_mlp": 1.01908565, "epoch": 0.7504358804785667, "flos": 13480040574720.0, "grad_norm": 2.702448377452814, "language_loss": 0.76541638, "learning_rate": 6.183734739109683e-07, "loss": 0.78736597, "num_input_tokens_seen": 134163335, "step": 6241, "time_per_iteration": 2.787424325942993 }, { "auxiliary_loss_clip": 0.01173343, "auxiliary_loss_mlp": 0.01028913, "balance_loss_clip": 1.05036092, "balance_loss_mlp": 1.0211581, "epoch": 0.7505561233692057, "flos": 29461042431360.0, "grad_norm": 2.0006665759941744, "language_loss": 0.68286526, "learning_rate": 6.178103563434629e-07, "loss": 0.70488787, "num_input_tokens_seen": 134182335, "step": 6242, "time_per_iteration": 2.7879092693328857 }, { "auxiliary_loss_clip": 0.01166353, "auxiliary_loss_mlp": 0.01021865, "balance_loss_clip": 1.04707444, "balance_loss_mlp": 1.01399767, "epoch": 0.7506763662598449, "flos": 20302457172480.0, "grad_norm": 1.802071087358531, "language_loss": 0.83870041, "learning_rate": 6.172474484530283e-07, "loss": 0.86058259, "num_input_tokens_seen": 134201070, "step": 6243, "time_per_iteration": 3.651604175567627 }, { "auxiliary_loss_clip": 0.01155403, "auxiliary_loss_mlp": 0.01026647, "balance_loss_clip": 1.04841518, "balance_loss_mlp": 1.01895773, "epoch": 0.750796609150484, "flos": 37230961939200.0, "grad_norm": 2.02765252701155, "language_loss": 0.76151824, "learning_rate": 6.166847503250563e-07, "loss": 0.78333879, "num_input_tokens_seen": 134223310, "step": 6244, "time_per_iteration": 2.927239418029785 }, { "auxiliary_loss_clip": 0.01161552, "auxiliary_loss_mlp": 0.01019709, "balance_loss_clip": 1.04733372, "balance_loss_mlp": 1.01246381, "epoch": 0.750916852041123, "flos": 19609417186560.0, "grad_norm": 2.2715718132604716, "language_loss": 0.7888509, "learning_rate": 6.161222620449078e-07, "loss": 0.81066352, "num_input_tokens_seen": 134242085, "step": 6245, "time_per_iteration": 2.747659921646118 }, { "auxiliary_loss_clip": 0.01161657, "auxiliary_loss_mlp": 0.01025749, "balance_loss_clip": 1.04731584, "balance_loss_mlp": 1.01804173, "epoch": 0.7510370949317622, "flos": 25112690807040.0, "grad_norm": 1.9126401866626712, "language_loss": 0.80077338, "learning_rate": 6.155599836979117e-07, "loss": 0.82264745, "num_input_tokens_seen": 134260770, "step": 6246, "time_per_iteration": 2.808743715286255 }, { "auxiliary_loss_clip": 0.01155742, "auxiliary_loss_mlp": 0.01024614, "balance_loss_clip": 1.04726052, "balance_loss_mlp": 1.01735139, "epoch": 0.7511573378224012, "flos": 19062282245760.0, "grad_norm": 2.4432908544850958, "language_loss": 0.81740028, "learning_rate": 6.149979153693649e-07, "loss": 0.83920383, "num_input_tokens_seen": 134278025, "step": 6247, "time_per_iteration": 2.8076419830322266 }, { "auxiliary_loss_clip": 0.01163815, "auxiliary_loss_mlp": 0.01026823, "balance_loss_clip": 1.04900563, "balance_loss_mlp": 1.0193162, "epoch": 0.7512775807130403, "flos": 19937676602880.0, "grad_norm": 2.3814492256052726, "language_loss": 0.76992965, "learning_rate": 6.144360571445343e-07, "loss": 0.79183602, "num_input_tokens_seen": 134297170, "step": 6248, "time_per_iteration": 2.7680017948150635 }, { "auxiliary_loss_clip": 0.01159696, "auxiliary_loss_mlp": 0.0102704, "balance_loss_clip": 1.04653621, "balance_loss_mlp": 1.01960707, "epoch": 0.7513978236036795, "flos": 20739920912640.0, "grad_norm": 1.8623517779070695, "language_loss": 0.80000186, "learning_rate": 6.138744091086509e-07, "loss": 0.82186925, "num_input_tokens_seen": 134316755, "step": 6249, "time_per_iteration": 2.7571256160736084 }, { "auxiliary_loss_clip": 0.01160334, "auxiliary_loss_mlp": 0.01023429, "balance_loss_clip": 1.04909229, "balance_loss_mlp": 1.01562989, "epoch": 0.7515180664943185, "flos": 27563163523200.0, "grad_norm": 2.908116428179841, "language_loss": 0.7280522, "learning_rate": 6.133129713469183e-07, "loss": 0.74988985, "num_input_tokens_seen": 134335960, "step": 6250, "time_per_iteration": 2.8290252685546875 }, { "auxiliary_loss_clip": 0.01165037, "auxiliary_loss_mlp": 0.01026741, "balance_loss_clip": 1.04609096, "balance_loss_mlp": 1.01957035, "epoch": 0.7516383093849576, "flos": 33803181002880.0, "grad_norm": 1.6194034247077338, "language_loss": 0.64129794, "learning_rate": 6.127517439445053e-07, "loss": 0.66321576, "num_input_tokens_seen": 134356805, "step": 6251, "time_per_iteration": 2.879857063293457 }, { "auxiliary_loss_clip": 0.01149817, "auxiliary_loss_mlp": 0.01027416, "balance_loss_clip": 1.04905176, "balance_loss_mlp": 1.02067733, "epoch": 0.7517585522755967, "flos": 29746172592000.0, "grad_norm": 1.9729791197388438, "language_loss": 0.82013166, "learning_rate": 6.121907269865498e-07, "loss": 0.84190398, "num_input_tokens_seen": 134376295, "step": 6252, "time_per_iteration": 2.7714507579803467 }, { "auxiliary_loss_clip": 0.0105691, "auxiliary_loss_mlp": 0.01000808, "balance_loss_clip": 1.01025188, "balance_loss_mlp": 0.99969941, "epoch": 0.7518787951662358, "flos": 69807974319360.0, "grad_norm": 0.930885120241913, "language_loss": 0.67261827, "learning_rate": 6.116299205581577e-07, "loss": 0.69319546, "num_input_tokens_seen": 134431125, "step": 6253, "time_per_iteration": 3.2633509635925293 }, { "auxiliary_loss_clip": 0.01173831, "auxiliary_loss_mlp": 0.01030076, "balance_loss_clip": 1.05053639, "balance_loss_mlp": 1.02225327, "epoch": 0.7519990380568748, "flos": 34203225749760.0, "grad_norm": 2.2352215786456, "language_loss": 0.6843493, "learning_rate": 6.110693247444018e-07, "loss": 0.70638835, "num_input_tokens_seen": 134452960, "step": 6254, "time_per_iteration": 2.771402359008789 }, { "auxiliary_loss_clip": 0.01148341, "auxiliary_loss_mlp": 0.01027592, "balance_loss_clip": 1.04674435, "balance_loss_mlp": 1.02082419, "epoch": 0.752119280947514, "flos": 21725704742400.0, "grad_norm": 1.6905725608084863, "language_loss": 0.82337564, "learning_rate": 6.105089396303258e-07, "loss": 0.84513497, "num_input_tokens_seen": 134471350, "step": 6255, "time_per_iteration": 2.8095767498016357 }, { "auxiliary_loss_clip": 0.01165764, "auxiliary_loss_mlp": 0.0102498, "balance_loss_clip": 1.04921508, "balance_loss_mlp": 1.01736903, "epoch": 0.7522395238381531, "flos": 32742774668160.0, "grad_norm": 2.137679374500218, "language_loss": 0.75732207, "learning_rate": 6.099487653009383e-07, "loss": 0.77922952, "num_input_tokens_seen": 134490695, "step": 6256, "time_per_iteration": 2.8706185817718506 }, { "auxiliary_loss_clip": 0.0116127, "auxiliary_loss_mlp": 0.01024913, "balance_loss_clip": 1.04567099, "balance_loss_mlp": 1.01815403, "epoch": 0.7523597667287921, "flos": 23476026579840.0, "grad_norm": 2.287009594613085, "language_loss": 0.83082861, "learning_rate": 6.093888018412192e-07, "loss": 0.85269046, "num_input_tokens_seen": 134506885, "step": 6257, "time_per_iteration": 2.782515048980713 }, { "auxiliary_loss_clip": 0.01064426, "auxiliary_loss_mlp": 0.01001609, "balance_loss_clip": 1.00919831, "balance_loss_mlp": 1.00064969, "epoch": 0.7524800096194313, "flos": 67346730501120.0, "grad_norm": 0.7075935978223552, "language_loss": 0.54643941, "learning_rate": 6.088290493361125e-07, "loss": 0.56709981, "num_input_tokens_seen": 134571770, "step": 6258, "time_per_iteration": 3.443648099899292 }, { "auxiliary_loss_clip": 0.01148648, "auxiliary_loss_mlp": 0.0103565, "balance_loss_clip": 1.04900277, "balance_loss_mlp": 1.02865529, "epoch": 0.7526002525100703, "flos": 13006055681280.0, "grad_norm": 2.3710789911783476, "language_loss": 0.71642882, "learning_rate": 6.082695078705322e-07, "loss": 0.73827183, "num_input_tokens_seen": 134589250, "step": 6259, "time_per_iteration": 3.78346586227417 }, { "auxiliary_loss_clip": 0.01159355, "auxiliary_loss_mlp": 0.01026248, "balance_loss_clip": 1.04825306, "balance_loss_mlp": 1.01869345, "epoch": 0.7527204954007094, "flos": 21397229844480.0, "grad_norm": 2.091580660567158, "language_loss": 0.691006, "learning_rate": 6.077101775293618e-07, "loss": 0.71286201, "num_input_tokens_seen": 134608075, "step": 6260, "time_per_iteration": 2.9848148822784424 }, { "auxiliary_loss_clip": 0.01165395, "auxiliary_loss_mlp": 0.010217, "balance_loss_clip": 1.0462091, "balance_loss_mlp": 1.0142374, "epoch": 0.7528407382913486, "flos": 18947188091520.0, "grad_norm": 3.7950552812403013, "language_loss": 0.82317841, "learning_rate": 6.071510583974504e-07, "loss": 0.84504938, "num_input_tokens_seen": 134623260, "step": 6261, "time_per_iteration": 3.614394426345825 }, { "auxiliary_loss_clip": 0.01168808, "auxiliary_loss_mlp": 0.01026497, "balance_loss_clip": 1.0485518, "balance_loss_mlp": 1.01936841, "epoch": 0.7529609811819876, "flos": 15231798956160.0, "grad_norm": 1.8387791577343704, "language_loss": 0.72216249, "learning_rate": 6.065921505596161e-07, "loss": 0.74411553, "num_input_tokens_seen": 134641540, "step": 6262, "time_per_iteration": 2.6249117851257324 }, { "auxiliary_loss_clip": 0.01158819, "auxiliary_loss_mlp": 0.01020353, "balance_loss_clip": 1.04903507, "balance_loss_mlp": 1.01311076, "epoch": 0.7530812240726267, "flos": 19354487385600.0, "grad_norm": 1.662489042530907, "language_loss": 0.76851285, "learning_rate": 6.060334541006445e-07, "loss": 0.79030466, "num_input_tokens_seen": 134660035, "step": 6263, "time_per_iteration": 3.71795916557312 }, { "auxiliary_loss_clip": 0.01161116, "auxiliary_loss_mlp": 0.01026925, "balance_loss_clip": 1.04696071, "balance_loss_mlp": 1.01947713, "epoch": 0.7532014669632658, "flos": 27748247328000.0, "grad_norm": 2.2822307890894673, "language_loss": 0.69108695, "learning_rate": 6.05474969105289e-07, "loss": 0.7129674, "num_input_tokens_seen": 134683025, "step": 6264, "time_per_iteration": 2.743288516998291 }, { "auxiliary_loss_clip": 0.01166308, "auxiliary_loss_mlp": 0.01027634, "balance_loss_clip": 1.05009139, "balance_loss_mlp": 1.01959038, "epoch": 0.7533217098539049, "flos": 14137421333760.0, "grad_norm": 2.9615478893205736, "language_loss": 0.73495352, "learning_rate": 6.049166956582725e-07, "loss": 0.75689292, "num_input_tokens_seen": 134701290, "step": 6265, "time_per_iteration": 2.5647246837615967 }, { "auxiliary_loss_clip": 0.01161273, "auxiliary_loss_mlp": 0.01029249, "balance_loss_clip": 1.04763937, "balance_loss_mlp": 1.02208173, "epoch": 0.753441952744544, "flos": 26429068437120.0, "grad_norm": 2.0107620693302697, "language_loss": 0.87448317, "learning_rate": 6.043586338442841e-07, "loss": 0.89638847, "num_input_tokens_seen": 134720345, "step": 6266, "time_per_iteration": 2.616488218307495 }, { "auxiliary_loss_clip": 0.01164639, "auxiliary_loss_mlp": 0.01026188, "balance_loss_clip": 1.04791915, "balance_loss_mlp": 1.01943183, "epoch": 0.7535621956351831, "flos": 23878621192320.0, "grad_norm": 2.7976296328295818, "language_loss": 0.73119003, "learning_rate": 6.038007837479815e-07, "loss": 0.75309825, "num_input_tokens_seen": 134741450, "step": 6267, "time_per_iteration": 2.515047550201416 }, { "auxiliary_loss_clip": 0.01161297, "auxiliary_loss_mlp": 0.01017128, "balance_loss_clip": 1.04669857, "balance_loss_mlp": 1.01004708, "epoch": 0.7536824385258222, "flos": 21795873960960.0, "grad_norm": 1.996188179056862, "language_loss": 0.6400733, "learning_rate": 6.032431454539897e-07, "loss": 0.6618576, "num_input_tokens_seen": 134760295, "step": 6268, "time_per_iteration": 2.53708553314209 }, { "auxiliary_loss_clip": 0.01161567, "auxiliary_loss_mlp": 0.01022077, "balance_loss_clip": 1.04925966, "balance_loss_mlp": 1.01481414, "epoch": 0.7538026814164612, "flos": 28911644933760.0, "grad_norm": 1.8048084761118457, "language_loss": 0.81642282, "learning_rate": 6.026857190469014e-07, "loss": 0.83825934, "num_input_tokens_seen": 134782050, "step": 6269, "time_per_iteration": 3.3944263458251953 }, { "auxiliary_loss_clip": 0.01164388, "auxiliary_loss_mlp": 0.01029611, "balance_loss_clip": 1.04892516, "balance_loss_mlp": 1.02285755, "epoch": 0.7539229243071004, "flos": 21104701482240.0, "grad_norm": 2.581474664869148, "language_loss": 0.74180591, "learning_rate": 6.0212850461128e-07, "loss": 0.7637459, "num_input_tokens_seen": 134801170, "step": 6270, "time_per_iteration": 2.6630008220672607 }, { "auxiliary_loss_clip": 0.01164365, "auxiliary_loss_mlp": 0.01024888, "balance_loss_clip": 1.04599476, "balance_loss_mlp": 1.01733637, "epoch": 0.7540431671977395, "flos": 15158469340800.0, "grad_norm": 2.604875321923819, "language_loss": 0.74958396, "learning_rate": 6.015715022316516e-07, "loss": 0.77147645, "num_input_tokens_seen": 134819150, "step": 6271, "time_per_iteration": 2.7424731254577637 }, { "auxiliary_loss_clip": 0.01159242, "auxiliary_loss_mlp": 0.01023498, "balance_loss_clip": 1.04808784, "balance_loss_mlp": 1.01646423, "epoch": 0.7541634100883785, "flos": 18770579896320.0, "grad_norm": 4.609615390997092, "language_loss": 0.77760369, "learning_rate": 6.010147119925154e-07, "loss": 0.79943109, "num_input_tokens_seen": 134836905, "step": 6272, "time_per_iteration": 2.771667003631592 }, { "auxiliary_loss_clip": 0.01151258, "auxiliary_loss_mlp": 0.0101972, "balance_loss_clip": 1.04863405, "balance_loss_mlp": 1.01215017, "epoch": 0.7542836529790176, "flos": 20594770053120.0, "grad_norm": 2.572160305547101, "language_loss": 0.66699892, "learning_rate": 6.004581339783348e-07, "loss": 0.68870872, "num_input_tokens_seen": 134855225, "step": 6273, "time_per_iteration": 2.8900387287139893 }, { "auxiliary_loss_clip": 0.01166326, "auxiliary_loss_mlp": 0.01028659, "balance_loss_clip": 1.04763722, "balance_loss_mlp": 1.02082133, "epoch": 0.7544038958696567, "flos": 19095104298240.0, "grad_norm": 2.3938950085402153, "language_loss": 0.6837424, "learning_rate": 5.999017682735425e-07, "loss": 0.70569229, "num_input_tokens_seen": 134871615, "step": 6274, "time_per_iteration": 2.8124196529388428 }, { "auxiliary_loss_clip": 0.01160095, "auxiliary_loss_mlp": 0.0102638, "balance_loss_clip": 1.04901373, "balance_loss_mlp": 1.01824427, "epoch": 0.7545241387602958, "flos": 31723306859520.0, "grad_norm": 1.8105110985683042, "language_loss": 0.66422677, "learning_rate": 5.993456149625387e-07, "loss": 0.68609154, "num_input_tokens_seen": 134892765, "step": 6275, "time_per_iteration": 2.9880964756011963 }, { "auxiliary_loss_clip": 0.01149924, "auxiliary_loss_mlp": 0.01027422, "balance_loss_clip": 1.04752064, "balance_loss_mlp": 1.02006686, "epoch": 0.7546443816509348, "flos": 20296495514880.0, "grad_norm": 1.8268375246812727, "language_loss": 0.82369232, "learning_rate": 5.987896741296909e-07, "loss": 0.84546578, "num_input_tokens_seen": 134910505, "step": 6276, "time_per_iteration": 2.705127000808716 }, { "auxiliary_loss_clip": 0.01159873, "auxiliary_loss_mlp": 0.0102848, "balance_loss_clip": 1.04842353, "balance_loss_mlp": 1.02158988, "epoch": 0.754764624541574, "flos": 23696159080320.0, "grad_norm": 1.8909834718781036, "language_loss": 0.78427887, "learning_rate": 5.982339458593361e-07, "loss": 0.80616236, "num_input_tokens_seen": 134930445, "step": 6277, "time_per_iteration": 2.7991209030151367 }, { "auxiliary_loss_clip": 0.01161729, "auxiliary_loss_mlp": 0.01049706, "balance_loss_clip": 1.04883242, "balance_loss_mlp": 1.01417112, "epoch": 0.7548848674322131, "flos": 25337204766720.0, "grad_norm": 1.6799160838939882, "language_loss": 0.84252036, "learning_rate": 5.976784302357767e-07, "loss": 0.86463469, "num_input_tokens_seen": 134951010, "step": 6278, "time_per_iteration": 2.758087635040283 }, { "auxiliary_loss_clip": 0.01169142, "auxiliary_loss_mlp": 0.01023949, "balance_loss_clip": 1.04962683, "balance_loss_mlp": 1.01729369, "epoch": 0.7550051103228521, "flos": 19573147428480.0, "grad_norm": 15.719253227903133, "language_loss": 0.73528343, "learning_rate": 5.971231273432855e-07, "loss": 0.75721431, "num_input_tokens_seen": 134970495, "step": 6279, "time_per_iteration": 2.750976085662842 }, { "auxiliary_loss_clip": 0.01063419, "auxiliary_loss_mlp": 0.01002704, "balance_loss_clip": 1.00948167, "balance_loss_mlp": 1.00163066, "epoch": 0.7551253532134913, "flos": 64150068648960.0, "grad_norm": 0.8121980003670745, "language_loss": 0.5451138, "learning_rate": 5.965680372661e-07, "loss": 0.56577498, "num_input_tokens_seen": 135028060, "step": 6280, "time_per_iteration": 3.1746230125427246 }, { "auxiliary_loss_clip": 0.01160371, "auxiliary_loss_mlp": 0.01020716, "balance_loss_clip": 1.04844296, "balance_loss_mlp": 1.01398635, "epoch": 0.7552455961041303, "flos": 26067986968320.0, "grad_norm": 1.9288317402969068, "language_loss": 0.56419295, "learning_rate": 5.960131600884266e-07, "loss": 0.58600378, "num_input_tokens_seen": 135047330, "step": 6281, "time_per_iteration": 2.8873586654663086 }, { "auxiliary_loss_clip": 0.01159345, "auxiliary_loss_mlp": 0.01024152, "balance_loss_clip": 1.04718518, "balance_loss_mlp": 1.01737201, "epoch": 0.7553658389947694, "flos": 24498223822080.0, "grad_norm": 1.6721585853654768, "language_loss": 0.75953352, "learning_rate": 5.954584958944413e-07, "loss": 0.78136855, "num_input_tokens_seen": 135065995, "step": 6282, "time_per_iteration": 2.8464365005493164 }, { "auxiliary_loss_clip": 0.01159672, "auxiliary_loss_mlp": 0.01053783, "balance_loss_clip": 1.04646921, "balance_loss_mlp": 1.01782131, "epoch": 0.7554860818854086, "flos": 21799465320960.0, "grad_norm": 4.413619209316448, "language_loss": 0.81940967, "learning_rate": 5.949040447682854e-07, "loss": 0.84154427, "num_input_tokens_seen": 135085820, "step": 6283, "time_per_iteration": 2.772592544555664 }, { "auxiliary_loss_clip": 0.01167377, "auxiliary_loss_mlp": 0.01029596, "balance_loss_clip": 1.04919982, "balance_loss_mlp": 1.0222168, "epoch": 0.7556063247760476, "flos": 16362123114240.0, "grad_norm": 2.803410372505499, "language_loss": 0.6836949, "learning_rate": 5.943498067940686e-07, "loss": 0.70566463, "num_input_tokens_seen": 135102845, "step": 6284, "time_per_iteration": 2.712877035140991 }, { "auxiliary_loss_clip": 0.01155617, "auxiliary_loss_mlp": 0.01022703, "balance_loss_clip": 1.04975069, "balance_loss_mlp": 1.01598525, "epoch": 0.7557265676666867, "flos": 27235155502080.0, "grad_norm": 2.110120319851048, "language_loss": 0.8159802, "learning_rate": 5.937957820558686e-07, "loss": 0.83776349, "num_input_tokens_seen": 135122190, "step": 6285, "time_per_iteration": 3.733534336090088 }, { "auxiliary_loss_clip": 0.01064968, "auxiliary_loss_mlp": 0.01001677, "balance_loss_clip": 1.00913501, "balance_loss_mlp": 1.00069988, "epoch": 0.7558468105573258, "flos": 62189131415040.0, "grad_norm": 0.8491353459346922, "language_loss": 0.65469146, "learning_rate": 5.932419706377296e-07, "loss": 0.67535794, "num_input_tokens_seen": 135180495, "step": 6286, "time_per_iteration": 3.220402240753174 }, { "auxiliary_loss_clip": 0.01153895, "auxiliary_loss_mlp": 0.01026017, "balance_loss_clip": 1.04857147, "balance_loss_mlp": 1.01896, "epoch": 0.7559670534479649, "flos": 33249078823680.0, "grad_norm": 1.9333629822721379, "language_loss": 0.73926103, "learning_rate": 5.92688372623666e-07, "loss": 0.76106012, "num_input_tokens_seen": 135199200, "step": 6287, "time_per_iteration": 3.8487212657928467 }, { "auxiliary_loss_clip": 0.01163675, "auxiliary_loss_mlp": 0.01025285, "balance_loss_clip": 1.04457366, "balance_loss_mlp": 1.01744103, "epoch": 0.7560872963386039, "flos": 14064379027200.0, "grad_norm": 3.29078494694257, "language_loss": 0.73691618, "learning_rate": 5.921349880976574e-07, "loss": 0.75880575, "num_input_tokens_seen": 135217035, "step": 6288, "time_per_iteration": 2.8030333518981934 }, { "auxiliary_loss_clip": 0.01165174, "auxiliary_loss_mlp": 0.01056571, "balance_loss_clip": 1.04821849, "balance_loss_mlp": 1.02021754, "epoch": 0.7562075392292431, "flos": 20412307941120.0, "grad_norm": 1.6799796690944764, "language_loss": 0.82045424, "learning_rate": 5.915818171436515e-07, "loss": 0.84267175, "num_input_tokens_seen": 135236370, "step": 6289, "time_per_iteration": 3.6250674724578857 }, { "auxiliary_loss_clip": 0.01162988, "auxiliary_loss_mlp": 0.0102255, "balance_loss_clip": 1.04722881, "balance_loss_mlp": 1.01502752, "epoch": 0.7563277821198822, "flos": 20376792368640.0, "grad_norm": 1.7628319625927948, "language_loss": 0.74446571, "learning_rate": 5.910288598455642e-07, "loss": 0.76632106, "num_input_tokens_seen": 135255720, "step": 6290, "time_per_iteration": 2.770432233810425 }, { "auxiliary_loss_clip": 0.0117283, "auxiliary_loss_mlp": 0.0102875, "balance_loss_clip": 1.0497694, "balance_loss_mlp": 1.02031636, "epoch": 0.7564480250105212, "flos": 18588261438720.0, "grad_norm": 2.198020996497561, "language_loss": 0.74526459, "learning_rate": 5.90476116287278e-07, "loss": 0.7672804, "num_input_tokens_seen": 135273320, "step": 6291, "time_per_iteration": 2.685483932495117 }, { "auxiliary_loss_clip": 0.01159881, "auxiliary_loss_mlp": 0.01025966, "balance_loss_clip": 1.04869211, "balance_loss_mlp": 1.01909637, "epoch": 0.7565682679011604, "flos": 21215521918080.0, "grad_norm": 2.792055783663112, "language_loss": 0.68038648, "learning_rate": 5.899235865526456e-07, "loss": 0.70224494, "num_input_tokens_seen": 135292615, "step": 6292, "time_per_iteration": 2.8274452686309814 }, { "auxiliary_loss_clip": 0.01151806, "auxiliary_loss_mlp": 0.01021526, "balance_loss_clip": 1.04823625, "balance_loss_mlp": 1.01496625, "epoch": 0.7566885107917994, "flos": 20449008662400.0, "grad_norm": 2.1566791816281783, "language_loss": 0.82461643, "learning_rate": 5.893712707254825e-07, "loss": 0.84634978, "num_input_tokens_seen": 135310075, "step": 6293, "time_per_iteration": 2.719554901123047 }, { "auxiliary_loss_clip": 0.01153012, "auxiliary_loss_mlp": 0.01025143, "balance_loss_clip": 1.04808784, "balance_loss_mlp": 1.01722169, "epoch": 0.7568087536824385, "flos": 19025832919680.0, "grad_norm": 2.4542912791391482, "language_loss": 0.65952563, "learning_rate": 5.888191688895769e-07, "loss": 0.6813072, "num_input_tokens_seen": 135327335, "step": 6294, "time_per_iteration": 2.7251696586608887 }, { "auxiliary_loss_clip": 0.01166773, "auxiliary_loss_mlp": 0.01021942, "balance_loss_clip": 1.04516256, "balance_loss_mlp": 1.01443148, "epoch": 0.7569289965730777, "flos": 15225442248960.0, "grad_norm": 2.1979670076574247, "language_loss": 0.62179089, "learning_rate": 5.882672811286813e-07, "loss": 0.64367801, "num_input_tokens_seen": 135343615, "step": 6295, "time_per_iteration": 3.505580186843872 }, { "auxiliary_loss_clip": 0.01169617, "auxiliary_loss_mlp": 0.0102721, "balance_loss_clip": 1.04795718, "balance_loss_mlp": 1.01894319, "epoch": 0.7570492394637167, "flos": 20769367086720.0, "grad_norm": 2.0556554362647406, "language_loss": 0.69726562, "learning_rate": 5.877156075265166e-07, "loss": 0.71923381, "num_input_tokens_seen": 135359880, "step": 6296, "time_per_iteration": 2.650747060775757 }, { "auxiliary_loss_clip": 0.01160251, "auxiliary_loss_mlp": 0.01026197, "balance_loss_clip": 1.04651475, "balance_loss_mlp": 1.01862752, "epoch": 0.7571694823543558, "flos": 15664091137920.0, "grad_norm": 3.527526813749716, "language_loss": 0.69620585, "learning_rate": 5.871641481667715e-07, "loss": 0.71807027, "num_input_tokens_seen": 135374325, "step": 6297, "time_per_iteration": 2.707435369491577 }, { "auxiliary_loss_clip": 0.0115903, "auxiliary_loss_mlp": 0.0102825, "balance_loss_clip": 1.04701579, "balance_loss_mlp": 1.02097511, "epoch": 0.7572897252449949, "flos": 25409241492480.0, "grad_norm": 1.6689992182140487, "language_loss": 0.84633344, "learning_rate": 5.866129031331011e-07, "loss": 0.86820626, "num_input_tokens_seen": 135393980, "step": 6298, "time_per_iteration": 2.7912161350250244 }, { "auxiliary_loss_clip": 0.01164321, "auxiliary_loss_mlp": 0.01025582, "balance_loss_clip": 1.04688466, "balance_loss_mlp": 1.01845336, "epoch": 0.757409968135634, "flos": 24279348297600.0, "grad_norm": 2.2611403720904235, "language_loss": 0.83606201, "learning_rate": 5.8606187250913e-07, "loss": 0.85796106, "num_input_tokens_seen": 135412030, "step": 6299, "time_per_iteration": 2.767845869064331 }, { "auxiliary_loss_clip": 0.01164768, "auxiliary_loss_mlp": 0.01055135, "balance_loss_clip": 1.05056643, "balance_loss_mlp": 1.01914585, "epoch": 0.757530211026273, "flos": 24133766474880.0, "grad_norm": 1.8167916689987371, "language_loss": 0.8421042, "learning_rate": 5.855110563784482e-07, "loss": 0.86430329, "num_input_tokens_seen": 135430565, "step": 6300, "time_per_iteration": 2.7099976539611816 }, { "auxiliary_loss_clip": 0.01157845, "auxiliary_loss_mlp": 0.01049479, "balance_loss_clip": 1.04580188, "balance_loss_mlp": 1.01457953, "epoch": 0.7576504539169122, "flos": 23951807153280.0, "grad_norm": 2.921869876533182, "language_loss": 0.64233232, "learning_rate": 5.849604548246156e-07, "loss": 0.66440558, "num_input_tokens_seen": 135451675, "step": 6301, "time_per_iteration": 2.770084857940674 }, { "auxiliary_loss_clip": 0.01165199, "auxiliary_loss_mlp": 0.01052964, "balance_loss_clip": 1.04791641, "balance_loss_mlp": 1.01638699, "epoch": 0.7577706968075513, "flos": 21251360712960.0, "grad_norm": 2.093182247756709, "language_loss": 0.80500245, "learning_rate": 5.844100679311565e-07, "loss": 0.82718414, "num_input_tokens_seen": 135470635, "step": 6302, "time_per_iteration": 2.751493453979492 }, { "auxiliary_loss_clip": 0.01162268, "auxiliary_loss_mlp": 0.01019975, "balance_loss_clip": 1.04961848, "balance_loss_mlp": 1.01211298, "epoch": 0.7578909396981903, "flos": 18296595002880.0, "grad_norm": 1.9617463125904324, "language_loss": 0.76148736, "learning_rate": 5.838598957815637e-07, "loss": 0.78330976, "num_input_tokens_seen": 135487865, "step": 6303, "time_per_iteration": 2.696208953857422 }, { "auxiliary_loss_clip": 0.01155913, "auxiliary_loss_mlp": 0.01021741, "balance_loss_clip": 1.04588819, "balance_loss_mlp": 1.01430202, "epoch": 0.7580111825888295, "flos": 25373869574400.0, "grad_norm": 1.6191769681517967, "language_loss": 0.85384244, "learning_rate": 5.833099384592996e-07, "loss": 0.87561893, "num_input_tokens_seen": 135508440, "step": 6304, "time_per_iteration": 2.7998478412628174 }, { "auxiliary_loss_clip": 0.01157, "auxiliary_loss_mlp": 0.01023869, "balance_loss_clip": 1.04644227, "balance_loss_mlp": 1.01650238, "epoch": 0.7581314254794685, "flos": 23768662682880.0, "grad_norm": 2.3764342908504625, "language_loss": 0.71713322, "learning_rate": 5.827601960477913e-07, "loss": 0.73894191, "num_input_tokens_seen": 135526365, "step": 6305, "time_per_iteration": 2.7400853633880615 }, { "auxiliary_loss_clip": 0.01160333, "auxiliary_loss_mlp": 0.01020763, "balance_loss_clip": 1.04585397, "balance_loss_mlp": 1.01389968, "epoch": 0.7582516683701076, "flos": 22054610603520.0, "grad_norm": 2.254650306944281, "language_loss": 0.70420277, "learning_rate": 5.822106686304344e-07, "loss": 0.72601378, "num_input_tokens_seen": 135545655, "step": 6306, "time_per_iteration": 2.69934344291687 }, { "auxiliary_loss_clip": 0.01162106, "auxiliary_loss_mlp": 0.01021257, "balance_loss_clip": 1.04580164, "balance_loss_mlp": 1.01433706, "epoch": 0.7583719112607467, "flos": 31649725848960.0, "grad_norm": 3.1143220523252495, "language_loss": 0.57618248, "learning_rate": 5.816613562905919e-07, "loss": 0.59801614, "num_input_tokens_seen": 135566840, "step": 6307, "time_per_iteration": 2.8502211570739746 }, { "auxiliary_loss_clip": 0.01149982, "auxiliary_loss_mlp": 0.01029675, "balance_loss_clip": 1.04643798, "balance_loss_mlp": 1.02237904, "epoch": 0.7584921541513858, "flos": 33068376478080.0, "grad_norm": 1.7977271406348605, "language_loss": 0.69823837, "learning_rate": 5.811122591115933e-07, "loss": 0.72003496, "num_input_tokens_seen": 135587825, "step": 6308, "time_per_iteration": 2.8397021293640137 }, { "auxiliary_loss_clip": 0.01154279, "auxiliary_loss_mlp": 0.01025898, "balance_loss_clip": 1.04902411, "balance_loss_mlp": 1.01842642, "epoch": 0.7586123970420249, "flos": 23326350606720.0, "grad_norm": 2.6185946305822463, "language_loss": 0.71410561, "learning_rate": 5.805633771767376e-07, "loss": 0.73590732, "num_input_tokens_seen": 135605220, "step": 6309, "time_per_iteration": 2.786397933959961 }, { "auxiliary_loss_clip": 0.01163185, "auxiliary_loss_mlp": 0.01027288, "balance_loss_clip": 1.05141103, "balance_loss_mlp": 1.01979017, "epoch": 0.7587326399326639, "flos": 18334229477760.0, "grad_norm": 1.72369860146769, "language_loss": 0.78143716, "learning_rate": 5.800147105692888e-07, "loss": 0.80334187, "num_input_tokens_seen": 135624795, "step": 6310, "time_per_iteration": 2.749295949935913 }, { "auxiliary_loss_clip": 0.0116689, "auxiliary_loss_mlp": 0.01021785, "balance_loss_clip": 1.04651284, "balance_loss_mlp": 1.01500201, "epoch": 0.7588528828233031, "flos": 17275080119040.0, "grad_norm": 2.673825745821952, "language_loss": 0.79003894, "learning_rate": 5.794662593724795e-07, "loss": 0.81192565, "num_input_tokens_seen": 135643800, "step": 6311, "time_per_iteration": 3.619614839553833 }, { "auxiliary_loss_clip": 0.01170908, "auxiliary_loss_mlp": 0.0102469, "balance_loss_clip": 1.0505991, "balance_loss_mlp": 1.01744461, "epoch": 0.7589731257139422, "flos": 17713621267200.0, "grad_norm": 2.000367118195033, "language_loss": 0.75322092, "learning_rate": 5.789180236695091e-07, "loss": 0.77517688, "num_input_tokens_seen": 135660655, "step": 6312, "time_per_iteration": 2.6577210426330566 }, { "auxiliary_loss_clip": 0.01157515, "auxiliary_loss_mlp": 0.01019541, "balance_loss_clip": 1.04605412, "balance_loss_mlp": 1.01316309, "epoch": 0.7590933686045812, "flos": 15961072786560.0, "grad_norm": 2.7091575178699423, "language_loss": 0.85449475, "learning_rate": 5.78370003543544e-07, "loss": 0.87626529, "num_input_tokens_seen": 135679410, "step": 6313, "time_per_iteration": 3.6136178970336914 }, { "auxiliary_loss_clip": 0.01164483, "auxiliary_loss_mlp": 0.01054005, "balance_loss_clip": 1.04739773, "balance_loss_mlp": 1.01633239, "epoch": 0.7592136114952204, "flos": 21068072588160.0, "grad_norm": 2.225287116250703, "language_loss": 0.83702105, "learning_rate": 5.778221990777203e-07, "loss": 0.8592059, "num_input_tokens_seen": 135697150, "step": 6314, "time_per_iteration": 2.6647706031799316 }, { "auxiliary_loss_clip": 0.01162399, "auxiliary_loss_mlp": 0.01026338, "balance_loss_clip": 1.04804325, "balance_loss_mlp": 1.01893544, "epoch": 0.7593338543858594, "flos": 25297666871040.0, "grad_norm": 2.2796774684414105, "language_loss": 0.82708234, "learning_rate": 5.772746103551372e-07, "loss": 0.8489697, "num_input_tokens_seen": 135712545, "step": 6315, "time_per_iteration": 2.737928867340088 }, { "auxiliary_loss_clip": 0.01158875, "auxiliary_loss_mlp": 0.01023181, "balance_loss_clip": 1.04835856, "balance_loss_mlp": 1.01646304, "epoch": 0.7594540972764985, "flos": 31832367528960.0, "grad_norm": 2.097765897119807, "language_loss": 0.71772802, "learning_rate": 5.767272374588648e-07, "loss": 0.73954856, "num_input_tokens_seen": 135733950, "step": 6316, "time_per_iteration": 3.911797046661377 }, { "auxiliary_loss_clip": 0.01163587, "auxiliary_loss_mlp": 0.01028153, "balance_loss_clip": 1.04889107, "balance_loss_mlp": 1.02084506, "epoch": 0.7595743401671377, "flos": 37597250880000.0, "grad_norm": 1.6575971442109918, "language_loss": 0.77809441, "learning_rate": 5.76180080471939e-07, "loss": 0.80001181, "num_input_tokens_seen": 135757120, "step": 6317, "time_per_iteration": 2.8238461017608643 }, { "auxiliary_loss_clip": 0.01174737, "auxiliary_loss_mlp": 0.01027891, "balance_loss_clip": 1.05074835, "balance_loss_mlp": 1.02002597, "epoch": 0.7596945830577767, "flos": 18287724343680.0, "grad_norm": 2.6073366735070156, "language_loss": 0.72081476, "learning_rate": 5.756331394773631e-07, "loss": 0.74284106, "num_input_tokens_seen": 135773335, "step": 6318, "time_per_iteration": 2.7137107849121094 }, { "auxiliary_loss_clip": 0.01150011, "auxiliary_loss_mlp": 0.01056394, "balance_loss_clip": 1.04524469, "balance_loss_mlp": 1.02142274, "epoch": 0.7598148259484158, "flos": 22233122219520.0, "grad_norm": 2.054272629989283, "language_loss": 0.76144779, "learning_rate": 5.750864145581071e-07, "loss": 0.78351188, "num_input_tokens_seen": 135792555, "step": 6319, "time_per_iteration": 2.845207452774048 }, { "auxiliary_loss_clip": 0.01166986, "auxiliary_loss_mlp": 0.01028598, "balance_loss_clip": 1.04739594, "balance_loss_mlp": 1.02160335, "epoch": 0.7599350688390549, "flos": 27161718145920.0, "grad_norm": 2.845077843719794, "language_loss": 0.86490411, "learning_rate": 5.745399057971085e-07, "loss": 0.88685989, "num_input_tokens_seen": 135813690, "step": 6320, "time_per_iteration": 2.7315244674682617 }, { "auxiliary_loss_clip": 0.01167792, "auxiliary_loss_mlp": 0.01025462, "balance_loss_clip": 1.04769945, "balance_loss_mlp": 1.01794291, "epoch": 0.760055311729694, "flos": 15560704817280.0, "grad_norm": 2.4590440804778124, "language_loss": 0.75605512, "learning_rate": 5.739936132772738e-07, "loss": 0.7779876, "num_input_tokens_seen": 135832255, "step": 6321, "time_per_iteration": 3.7242958545684814 }, { "auxiliary_loss_clip": 0.01166207, "auxiliary_loss_mlp": 0.01019695, "balance_loss_clip": 1.04691195, "balance_loss_mlp": 1.01230991, "epoch": 0.760175554620333, "flos": 25155496840320.0, "grad_norm": 3.0662367217671034, "language_loss": 0.743572, "learning_rate": 5.734475370814733e-07, "loss": 0.76543099, "num_input_tokens_seen": 135851935, "step": 6322, "time_per_iteration": 2.7489125728607178 }, { "auxiliary_loss_clip": 0.01166069, "auxiliary_loss_mlp": 0.01022312, "balance_loss_clip": 1.04649758, "balance_loss_mlp": 1.01493859, "epoch": 0.7602957975109722, "flos": 24353791234560.0, "grad_norm": 1.527808999580532, "language_loss": 0.78649431, "learning_rate": 5.729016772925483e-07, "loss": 0.80837816, "num_input_tokens_seen": 135873510, "step": 6323, "time_per_iteration": 2.7735133171081543 }, { "auxiliary_loss_clip": 0.01151218, "auxiliary_loss_mlp": 0.01022979, "balance_loss_clip": 1.04828811, "balance_loss_mlp": 1.01542735, "epoch": 0.7604160404016113, "flos": 25192664438400.0, "grad_norm": 1.7900320658943587, "language_loss": 0.70842963, "learning_rate": 5.723560339933038e-07, "loss": 0.73017156, "num_input_tokens_seen": 135893845, "step": 6324, "time_per_iteration": 2.8207204341888428 }, { "auxiliary_loss_clip": 0.01165419, "auxiliary_loss_mlp": 0.0105735, "balance_loss_clip": 1.04914069, "balance_loss_mlp": 1.02139437, "epoch": 0.7605362832922503, "flos": 29861841363840.0, "grad_norm": 2.2110732184682136, "language_loss": 0.65316206, "learning_rate": 5.71810607266513e-07, "loss": 0.67538977, "num_input_tokens_seen": 135912430, "step": 6325, "time_per_iteration": 2.8445215225219727 }, { "auxiliary_loss_clip": 0.01161811, "auxiliary_loss_mlp": 0.0101944, "balance_loss_clip": 1.04431915, "balance_loss_mlp": 1.01206136, "epoch": 0.7606565261828895, "flos": 13917935278080.0, "grad_norm": 2.2715351236070576, "language_loss": 0.61043769, "learning_rate": 5.712653971949184e-07, "loss": 0.63225019, "num_input_tokens_seen": 135930550, "step": 6326, "time_per_iteration": 2.948054790496826 }, { "auxiliary_loss_clip": 0.01161069, "auxiliary_loss_mlp": 0.01031061, "balance_loss_clip": 1.04732871, "balance_loss_mlp": 1.02352726, "epoch": 0.7607767690735285, "flos": 18551273408640.0, "grad_norm": 2.959109403043596, "language_loss": 0.75673604, "learning_rate": 5.707204038612268e-07, "loss": 0.77865732, "num_input_tokens_seen": 135947980, "step": 6327, "time_per_iteration": 2.6936442852020264 }, { "auxiliary_loss_clip": 0.01171285, "auxiliary_loss_mlp": 0.01023626, "balance_loss_clip": 1.05413175, "balance_loss_mlp": 1.01585388, "epoch": 0.7608970119641676, "flos": 20922993555840.0, "grad_norm": 2.6791222977786604, "language_loss": 0.73710859, "learning_rate": 5.701756273481138e-07, "loss": 0.75905776, "num_input_tokens_seen": 135965400, "step": 6328, "time_per_iteration": 2.693265914916992 }, { "auxiliary_loss_clip": 0.01165731, "auxiliary_loss_mlp": 0.01024689, "balance_loss_clip": 1.04806554, "balance_loss_mlp": 1.01703537, "epoch": 0.7610172548548068, "flos": 23807302738560.0, "grad_norm": 1.5108039002735962, "language_loss": 0.74057752, "learning_rate": 5.696310677382212e-07, "loss": 0.76248169, "num_input_tokens_seen": 135986795, "step": 6329, "time_per_iteration": 2.705035924911499 }, { "auxiliary_loss_clip": 0.01064467, "auxiliary_loss_mlp": 0.0100269, "balance_loss_clip": 1.01414752, "balance_loss_mlp": 1.00173664, "epoch": 0.7611374977454458, "flos": 66496580426880.0, "grad_norm": 0.8859004504925283, "language_loss": 0.61703223, "learning_rate": 5.690867251141576e-07, "loss": 0.63770384, "num_input_tokens_seen": 136053450, "step": 6330, "time_per_iteration": 3.4325950145721436 }, { "auxiliary_loss_clip": 0.0116802, "auxiliary_loss_mlp": 0.01025897, "balance_loss_clip": 1.04490149, "balance_loss_mlp": 1.01800537, "epoch": 0.7612577406360849, "flos": 15633136592640.0, "grad_norm": 2.8210626544567785, "language_loss": 0.91828632, "learning_rate": 5.685425995585013e-07, "loss": 0.94022548, "num_input_tokens_seen": 136071375, "step": 6331, "time_per_iteration": 2.6505038738250732 }, { "auxiliary_loss_clip": 0.01064904, "auxiliary_loss_mlp": 0.0100025, "balance_loss_clip": 1.01067924, "balance_loss_mlp": 0.99929661, "epoch": 0.761377983526724, "flos": 60526253237760.0, "grad_norm": 0.7521194192168975, "language_loss": 0.58978081, "learning_rate": 5.679986911537935e-07, "loss": 0.61043239, "num_input_tokens_seen": 136138905, "step": 6332, "time_per_iteration": 3.546353578567505 }, { "auxiliary_loss_clip": 0.01143954, "auxiliary_loss_mlp": 0.01020973, "balance_loss_clip": 1.04592085, "balance_loss_mlp": 1.01327252, "epoch": 0.7614982264173631, "flos": 35772522019200.0, "grad_norm": 1.939627011550698, "language_loss": 0.66805708, "learning_rate": 5.674549999825462e-07, "loss": 0.68970633, "num_input_tokens_seen": 136161720, "step": 6333, "time_per_iteration": 2.9507288932800293 }, { "auxiliary_loss_clip": 0.01063277, "auxiliary_loss_mlp": 0.010032, "balance_loss_clip": 1.00904942, "balance_loss_mlp": 1.00222254, "epoch": 0.7616184693080021, "flos": 67925502345600.0, "grad_norm": 0.9320212755938365, "language_loss": 0.71434456, "learning_rate": 5.669115261272363e-07, "loss": 0.73500931, "num_input_tokens_seen": 136222040, "step": 6334, "time_per_iteration": 3.2650394439697266 }, { "auxiliary_loss_clip": 0.01163937, "auxiliary_loss_mlp": 0.0102676, "balance_loss_clip": 1.046561, "balance_loss_mlp": 1.01935744, "epoch": 0.7617387121986413, "flos": 20521979141760.0, "grad_norm": 4.180790588126199, "language_loss": 0.72999626, "learning_rate": 5.663682696703081e-07, "loss": 0.75190324, "num_input_tokens_seen": 136240305, "step": 6335, "time_per_iteration": 2.7323243618011475 }, { "auxiliary_loss_clip": 0.01164105, "auxiliary_loss_mlp": 0.01022451, "balance_loss_clip": 1.04665303, "balance_loss_mlp": 1.0156616, "epoch": 0.7618589550892804, "flos": 18624495283200.0, "grad_norm": 2.678175321553181, "language_loss": 0.82361966, "learning_rate": 5.658252306941746e-07, "loss": 0.84548521, "num_input_tokens_seen": 136259625, "step": 6336, "time_per_iteration": 2.680894374847412 }, { "auxiliary_loss_clip": 0.01157466, "auxiliary_loss_mlp": 0.01025442, "balance_loss_clip": 1.04867828, "balance_loss_mlp": 1.01732969, "epoch": 0.7619791979799194, "flos": 17453735389440.0, "grad_norm": 2.099933746419589, "language_loss": 0.75485706, "learning_rate": 5.65282409281212e-07, "loss": 0.77668613, "num_input_tokens_seen": 136277090, "step": 6337, "time_per_iteration": 3.7219903469085693 }, { "auxiliary_loss_clip": 0.01157386, "auxiliary_loss_mlp": 0.01029857, "balance_loss_clip": 1.04679561, "balance_loss_mlp": 1.02235329, "epoch": 0.7620994408705585, "flos": 14137421333760.0, "grad_norm": 6.735413857242469, "language_loss": 0.69769776, "learning_rate": 5.64739805513768e-07, "loss": 0.71957016, "num_input_tokens_seen": 136294635, "step": 6338, "time_per_iteration": 3.754920721054077 }, { "auxiliary_loss_clip": 0.01058427, "auxiliary_loss_mlp": 0.01035338, "balance_loss_clip": 1.00935197, "balance_loss_mlp": 1.00266767, "epoch": 0.7622196837611976, "flos": 70708792527360.0, "grad_norm": 0.7851525261723817, "language_loss": 0.55673885, "learning_rate": 5.641974194741541e-07, "loss": 0.57767653, "num_input_tokens_seen": 136350320, "step": 6339, "time_per_iteration": 3.190908432006836 }, { "auxiliary_loss_clip": 0.01060891, "auxiliary_loss_mlp": 0.01001946, "balance_loss_clip": 1.01896775, "balance_loss_mlp": 1.00095618, "epoch": 0.7623399266518367, "flos": 60684150447360.0, "grad_norm": 0.7734322481272669, "language_loss": 0.6370635, "learning_rate": 5.636552512446502e-07, "loss": 0.65769196, "num_input_tokens_seen": 136411375, "step": 6340, "time_per_iteration": 3.206833600997925 }, { "auxiliary_loss_clip": 0.0116034, "auxiliary_loss_mlp": 0.01025148, "balance_loss_clip": 1.04611075, "balance_loss_mlp": 1.01775384, "epoch": 0.7624601695424758, "flos": 26468893641600.0, "grad_norm": 1.762464757677717, "language_loss": 0.78097171, "learning_rate": 5.631133009075027e-07, "loss": 0.80282658, "num_input_tokens_seen": 136430560, "step": 6341, "time_per_iteration": 2.8010127544403076 }, { "auxiliary_loss_clip": 0.01160641, "auxiliary_loss_mlp": 0.01054755, "balance_loss_clip": 1.04481399, "balance_loss_mlp": 1.01884496, "epoch": 0.7625804124331149, "flos": 19135755515520.0, "grad_norm": 1.9307385440629852, "language_loss": 0.68789136, "learning_rate": 5.625715685449242e-07, "loss": 0.71004534, "num_input_tokens_seen": 136448665, "step": 6342, "time_per_iteration": 3.782590389251709 }, { "auxiliary_loss_clip": 0.0115392, "auxiliary_loss_mlp": 0.01022012, "balance_loss_clip": 1.04973483, "balance_loss_mlp": 1.01512146, "epoch": 0.762700655323754, "flos": 26213101914240.0, "grad_norm": 1.8023674077660534, "language_loss": 0.71556747, "learning_rate": 5.620300542390966e-07, "loss": 0.7373268, "num_input_tokens_seen": 136469710, "step": 6343, "time_per_iteration": 2.776642322540283 }, { "auxiliary_loss_clip": 0.01156957, "auxiliary_loss_mlp": 0.01031088, "balance_loss_clip": 1.04489899, "balance_loss_mlp": 1.02416182, "epoch": 0.762820898214393, "flos": 22382582711040.0, "grad_norm": 2.528819664041751, "language_loss": 0.85294729, "learning_rate": 5.614887580721659e-07, "loss": 0.87482774, "num_input_tokens_seen": 136489855, "step": 6344, "time_per_iteration": 2.7397518157958984 }, { "auxiliary_loss_clip": 0.01152444, "auxiliary_loss_mlp": 0.01022616, "balance_loss_clip": 1.04838145, "balance_loss_mlp": 1.01536822, "epoch": 0.7629411411050322, "flos": 15700504550400.0, "grad_norm": 2.378599825271198, "language_loss": 0.74373126, "learning_rate": 5.609476801262481e-07, "loss": 0.76548189, "num_input_tokens_seen": 136504715, "step": 6345, "time_per_iteration": 2.6958436965942383 }, { "auxiliary_loss_clip": 0.01154929, "auxiliary_loss_mlp": 0.01026068, "balance_loss_clip": 1.04948187, "balance_loss_mlp": 1.01871276, "epoch": 0.7630613839956712, "flos": 13770342293760.0, "grad_norm": 2.1572936615044664, "language_loss": 0.64287299, "learning_rate": 5.604068204834223e-07, "loss": 0.66468298, "num_input_tokens_seen": 136521610, "step": 6346, "time_per_iteration": 2.742237091064453 }, { "auxiliary_loss_clip": 0.01152039, "auxiliary_loss_mlp": 0.010535, "balance_loss_clip": 1.04710865, "balance_loss_mlp": 1.01719308, "epoch": 0.7631816268863103, "flos": 14569569861120.0, "grad_norm": 2.285279090595265, "language_loss": 0.77067417, "learning_rate": 5.598661792257367e-07, "loss": 0.7927295, "num_input_tokens_seen": 136538655, "step": 6347, "time_per_iteration": 3.6943182945251465 }, { "auxiliary_loss_clip": 0.01161563, "auxiliary_loss_mlp": 0.01024003, "balance_loss_clip": 1.04541314, "balance_loss_mlp": 1.01693416, "epoch": 0.7633018697769495, "flos": 19062210418560.0, "grad_norm": 2.717366977366493, "language_loss": 0.76204902, "learning_rate": 5.593257564352071e-07, "loss": 0.78390473, "num_input_tokens_seen": 136557095, "step": 6348, "time_per_iteration": 2.7454841136932373 }, { "auxiliary_loss_clip": 0.01160636, "auxiliary_loss_mlp": 0.01020139, "balance_loss_clip": 1.04607964, "balance_loss_mlp": 1.01312041, "epoch": 0.7634221126675885, "flos": 22052958577920.0, "grad_norm": 1.457660781699079, "language_loss": 0.75265872, "learning_rate": 5.58785552193815e-07, "loss": 0.77446645, "num_input_tokens_seen": 136577340, "step": 6349, "time_per_iteration": 2.7591867446899414 }, { "auxiliary_loss_clip": 0.01166632, "auxiliary_loss_mlp": 0.01022927, "balance_loss_clip": 1.04713798, "balance_loss_mlp": 1.0161202, "epoch": 0.7635423555582276, "flos": 29382720825600.0, "grad_norm": 4.220117163047306, "language_loss": 0.75446868, "learning_rate": 5.582455665835086e-07, "loss": 0.77636433, "num_input_tokens_seen": 136597635, "step": 6350, "time_per_iteration": 2.731152057647705 }, { "auxiliary_loss_clip": 0.01172218, "auxiliary_loss_mlp": 0.01027785, "balance_loss_clip": 1.04834771, "balance_loss_mlp": 1.01914847, "epoch": 0.7636625984488667, "flos": 17784903807360.0, "grad_norm": 3.408255696436362, "language_loss": 0.7276544, "learning_rate": 5.577057996862036e-07, "loss": 0.74965441, "num_input_tokens_seen": 136615260, "step": 6351, "time_per_iteration": 2.7115678787231445 }, { "auxiliary_loss_clip": 0.01162682, "auxiliary_loss_mlp": 0.01019655, "balance_loss_clip": 1.04600573, "balance_loss_mlp": 1.01283646, "epoch": 0.7637828413395058, "flos": 23734583654400.0, "grad_norm": 1.5813256202670962, "language_loss": 0.76265621, "learning_rate": 5.571662515837814e-07, "loss": 0.78447962, "num_input_tokens_seen": 136637220, "step": 6352, "time_per_iteration": 2.76991605758667 }, { "auxiliary_loss_clip": 0.01161104, "auxiliary_loss_mlp": 0.01024877, "balance_loss_clip": 1.04769397, "balance_loss_mlp": 1.0175662, "epoch": 0.7639030842301449, "flos": 36283279461120.0, "grad_norm": 1.661647290834348, "language_loss": 0.83475441, "learning_rate": 5.566269223580926e-07, "loss": 0.85661423, "num_input_tokens_seen": 136658930, "step": 6353, "time_per_iteration": 2.92525053024292 }, { "auxiliary_loss_clip": 0.01164309, "auxiliary_loss_mlp": 0.01024569, "balance_loss_clip": 1.04657865, "balance_loss_mlp": 1.01805425, "epoch": 0.764023327120784, "flos": 28878104609280.0, "grad_norm": 1.9909836030904569, "language_loss": 0.75258338, "learning_rate": 5.560878120909511e-07, "loss": 0.77447212, "num_input_tokens_seen": 136681530, "step": 6354, "time_per_iteration": 2.7824978828430176 }, { "auxiliary_loss_clip": 0.01062401, "auxiliary_loss_mlp": 0.01003052, "balance_loss_clip": 1.00839424, "balance_loss_mlp": 1.0020808, "epoch": 0.7641435700114231, "flos": 64789711067520.0, "grad_norm": 0.8446551044460421, "language_loss": 0.58536136, "learning_rate": 5.55548920864141e-07, "loss": 0.60601592, "num_input_tokens_seen": 136742185, "step": 6355, "time_per_iteration": 3.273970127105713 }, { "auxiliary_loss_clip": 0.01163835, "auxiliary_loss_mlp": 0.01025854, "balance_loss_clip": 1.0493027, "balance_loss_mlp": 1.01854634, "epoch": 0.7642638129020621, "flos": 16835784785280.0, "grad_norm": 1.5664438753725103, "language_loss": 0.78035849, "learning_rate": 5.550102487594113e-07, "loss": 0.80225539, "num_input_tokens_seen": 136760855, "step": 6356, "time_per_iteration": 2.681530714035034 }, { "auxiliary_loss_clip": 0.01157875, "auxiliary_loss_mlp": 0.01054307, "balance_loss_clip": 1.04614794, "balance_loss_mlp": 1.01670337, "epoch": 0.7643840557927013, "flos": 30408940391040.0, "grad_norm": 1.5010680026108902, "language_loss": 0.71590739, "learning_rate": 5.54471795858477e-07, "loss": 0.73802918, "num_input_tokens_seen": 136780925, "step": 6357, "time_per_iteration": 2.8559768199920654 }, { "auxiliary_loss_clip": 0.01161479, "auxiliary_loss_mlp": 0.01027893, "balance_loss_clip": 1.04747641, "balance_loss_mlp": 1.02037048, "epoch": 0.7645042986833404, "flos": 16983234115200.0, "grad_norm": 2.133842919621732, "language_loss": 0.82948643, "learning_rate": 5.539335622430235e-07, "loss": 0.85138011, "num_input_tokens_seen": 136799545, "step": 6358, "time_per_iteration": 2.7812323570251465 }, { "auxiliary_loss_clip": 0.01159754, "auxiliary_loss_mlp": 0.01021196, "balance_loss_clip": 1.04660416, "balance_loss_mlp": 1.01390672, "epoch": 0.7646245415739794, "flos": 17311493531520.0, "grad_norm": 2.2884992926357093, "language_loss": 0.74997228, "learning_rate": 5.533955479946975e-07, "loss": 0.7717818, "num_input_tokens_seen": 136818325, "step": 6359, "time_per_iteration": 2.653247356414795 }, { "auxiliary_loss_clip": 0.01061092, "auxiliary_loss_mlp": 0.01032814, "balance_loss_clip": 1.01915979, "balance_loss_mlp": 0.99947166, "epoch": 0.7647447844646186, "flos": 70402332666240.0, "grad_norm": 0.8676361523372571, "language_loss": 0.65797019, "learning_rate": 5.528577531951173e-07, "loss": 0.67890918, "num_input_tokens_seen": 136878730, "step": 6360, "time_per_iteration": 3.294922113418579 }, { "auxiliary_loss_clip": 0.01162731, "auxiliary_loss_mlp": 0.01028656, "balance_loss_clip": 1.04647577, "balance_loss_mlp": 1.02134836, "epoch": 0.7648650273552576, "flos": 17675914965120.0, "grad_norm": 2.422667132153833, "language_loss": 0.73745954, "learning_rate": 5.523201779258653e-07, "loss": 0.75937343, "num_input_tokens_seen": 136897705, "step": 6361, "time_per_iteration": 2.699138879776001 }, { "auxiliary_loss_clip": 0.01165128, "auxiliary_loss_mlp": 0.0102335, "balance_loss_clip": 1.04440427, "balance_loss_mlp": 1.01628137, "epoch": 0.7649852702458967, "flos": 22162019247360.0, "grad_norm": 1.9054684336358256, "language_loss": 0.83986735, "learning_rate": 5.517828222684912e-07, "loss": 0.86175215, "num_input_tokens_seen": 136918360, "step": 6362, "time_per_iteration": 2.6803224086761475 }, { "auxiliary_loss_clip": 0.01059702, "auxiliary_loss_mlp": 0.01004205, "balance_loss_clip": 1.00997329, "balance_loss_mlp": 1.00317383, "epoch": 0.7651055131365359, "flos": 69848338227840.0, "grad_norm": 0.7731616958641448, "language_loss": 0.59054768, "learning_rate": 5.512456863045117e-07, "loss": 0.61118674, "num_input_tokens_seen": 136979050, "step": 6363, "time_per_iteration": 4.220109939575195 }, { "auxiliary_loss_clip": 0.01166702, "auxiliary_loss_mlp": 0.01023794, "balance_loss_clip": 1.04583573, "balance_loss_mlp": 1.01628971, "epoch": 0.7652257560271749, "flos": 19464014931840.0, "grad_norm": 1.779712351434282, "language_loss": 0.74016011, "learning_rate": 5.507087701154089e-07, "loss": 0.76206505, "num_input_tokens_seen": 136998970, "step": 6364, "time_per_iteration": 2.694350242614746 }, { "auxiliary_loss_clip": 0.01158548, "auxiliary_loss_mlp": 0.01023372, "balance_loss_clip": 1.04781842, "balance_loss_mlp": 1.01633286, "epoch": 0.765345998917814, "flos": 15961108700160.0, "grad_norm": 2.7041700081899553, "language_loss": 0.75238669, "learning_rate": 5.50172073782634e-07, "loss": 0.77420592, "num_input_tokens_seen": 137016950, "step": 6365, "time_per_iteration": 3.711301326751709 }, { "auxiliary_loss_clip": 0.01157742, "auxiliary_loss_mlp": 0.01026948, "balance_loss_clip": 1.05015171, "balance_loss_mlp": 1.0198698, "epoch": 0.7654662418084531, "flos": 23659853408640.0, "grad_norm": 4.263646108189059, "language_loss": 0.87598884, "learning_rate": 5.496355973876023e-07, "loss": 0.89783573, "num_input_tokens_seen": 137036205, "step": 6366, "time_per_iteration": 2.8809189796447754 }, { "auxiliary_loss_clip": 0.01160185, "auxiliary_loss_mlp": 0.01051801, "balance_loss_clip": 1.05108428, "balance_loss_mlp": 1.01606965, "epoch": 0.7655864846990922, "flos": 41463608878080.0, "grad_norm": 1.878991190923849, "language_loss": 0.70949888, "learning_rate": 5.490993410116984e-07, "loss": 0.73161876, "num_input_tokens_seen": 137059195, "step": 6367, "time_per_iteration": 2.9207653999328613 }, { "auxiliary_loss_clip": 0.01150678, "auxiliary_loss_mlp": 0.0102583, "balance_loss_clip": 1.04662657, "balance_loss_mlp": 1.0185132, "epoch": 0.7657067275897312, "flos": 43142684088960.0, "grad_norm": 1.9764167328288136, "language_loss": 0.6942786, "learning_rate": 5.485633047362704e-07, "loss": 0.71604359, "num_input_tokens_seen": 137081200, "step": 6368, "time_per_iteration": 3.8292999267578125 }, { "auxiliary_loss_clip": 0.01172275, "auxiliary_loss_mlp": 0.01028571, "balance_loss_clip": 1.05112004, "balance_loss_mlp": 1.02125418, "epoch": 0.7658269704803703, "flos": 17311780840320.0, "grad_norm": 2.494022200433159, "language_loss": 0.78755438, "learning_rate": 5.480274886426341e-07, "loss": 0.8095628, "num_input_tokens_seen": 137097840, "step": 6369, "time_per_iteration": 2.7066526412963867 }, { "auxiliary_loss_clip": 0.0115957, "auxiliary_loss_mlp": 0.01026112, "balance_loss_clip": 1.04781878, "balance_loss_mlp": 1.0187211, "epoch": 0.7659472133710095, "flos": 12568160977920.0, "grad_norm": 1.9423715426697563, "language_loss": 0.77743798, "learning_rate": 5.474918928120744e-07, "loss": 0.79929477, "num_input_tokens_seen": 137114335, "step": 6370, "time_per_iteration": 2.652416944503784 }, { "auxiliary_loss_clip": 0.01158449, "auxiliary_loss_mlp": 0.01026255, "balance_loss_clip": 1.04513073, "balance_loss_mlp": 1.0195111, "epoch": 0.7660674562616485, "flos": 22707430335360.0, "grad_norm": 1.6775179170624483, "language_loss": 0.87415093, "learning_rate": 5.469565173258392e-07, "loss": 0.895998, "num_input_tokens_seen": 137132850, "step": 6371, "time_per_iteration": 2.713097333908081 }, { "auxiliary_loss_clip": 0.01171266, "auxiliary_loss_mlp": 0.01019952, "balance_loss_clip": 1.04799914, "balance_loss_mlp": 1.01174712, "epoch": 0.7661876991522876, "flos": 17056455989760.0, "grad_norm": 1.6850876384847184, "language_loss": 0.63947719, "learning_rate": 5.464213622651454e-07, "loss": 0.66138935, "num_input_tokens_seen": 137150665, "step": 6372, "time_per_iteration": 2.6291584968566895 }, { "auxiliary_loss_clip": 0.01161464, "auxiliary_loss_mlp": 0.01024758, "balance_loss_clip": 1.04627609, "balance_loss_mlp": 1.0170753, "epoch": 0.7663079420429267, "flos": 20084228092800.0, "grad_norm": 1.9896737319257316, "language_loss": 0.84296358, "learning_rate": 5.458864277111753e-07, "loss": 0.86482584, "num_input_tokens_seen": 137168500, "step": 6373, "time_per_iteration": 3.6514408588409424 }, { "auxiliary_loss_clip": 0.01156184, "auxiliary_loss_mlp": 0.01057304, "balance_loss_clip": 1.04694295, "balance_loss_mlp": 1.02247226, "epoch": 0.7664281849335658, "flos": 12677473042560.0, "grad_norm": 2.722440393510407, "language_loss": 0.68356657, "learning_rate": 5.453517137450769e-07, "loss": 0.70570147, "num_input_tokens_seen": 137185075, "step": 6374, "time_per_iteration": 2.699720621109009 }, { "auxiliary_loss_clip": 0.01165314, "auxiliary_loss_mlp": 0.01021755, "balance_loss_clip": 1.0492506, "balance_loss_mlp": 1.01380944, "epoch": 0.7665484278242048, "flos": 22345271458560.0, "grad_norm": 1.8982387720070106, "language_loss": 0.75998843, "learning_rate": 5.448172204479684e-07, "loss": 0.78185916, "num_input_tokens_seen": 137204355, "step": 6375, "time_per_iteration": 2.77133846282959 }, { "auxiliary_loss_clip": 0.01160992, "auxiliary_loss_mlp": 0.01022067, "balance_loss_clip": 1.04413915, "balance_loss_mlp": 1.01474762, "epoch": 0.766668670714844, "flos": 23617909301760.0, "grad_norm": 1.799372666570993, "language_loss": 0.74644899, "learning_rate": 5.442829479009294e-07, "loss": 0.76827967, "num_input_tokens_seen": 137223135, "step": 6376, "time_per_iteration": 2.6663949489593506 }, { "auxiliary_loss_clip": 0.01170889, "auxiliary_loss_mlp": 0.01025278, "balance_loss_clip": 1.0474062, "balance_loss_mlp": 1.01724362, "epoch": 0.7667889136054831, "flos": 19427134642560.0, "grad_norm": 1.8797450185644828, "language_loss": 0.71894556, "learning_rate": 5.437488961850103e-07, "loss": 0.74090725, "num_input_tokens_seen": 137242935, "step": 6377, "time_per_iteration": 2.614450216293335 }, { "auxiliary_loss_clip": 0.01150918, "auxiliary_loss_mlp": 0.01024601, "balance_loss_clip": 1.04558086, "balance_loss_mlp": 1.01820505, "epoch": 0.7669091564961221, "flos": 26866352609280.0, "grad_norm": 1.66502819542313, "language_loss": 0.75605762, "learning_rate": 5.432150653812258e-07, "loss": 0.77781284, "num_input_tokens_seen": 137262970, "step": 6378, "time_per_iteration": 2.794276714324951 }, { "auxiliary_loss_clip": 0.01158924, "auxiliary_loss_mlp": 0.01024021, "balance_loss_clip": 1.04588723, "balance_loss_mlp": 1.01646352, "epoch": 0.7670293993867613, "flos": 12385303816320.0, "grad_norm": 7.637732073751746, "language_loss": 0.82649767, "learning_rate": 5.42681455570557e-07, "loss": 0.84832716, "num_input_tokens_seen": 137279500, "step": 6379, "time_per_iteration": 2.654872179031372 }, { "auxiliary_loss_clip": 0.01164832, "auxiliary_loss_mlp": 0.01022531, "balance_loss_clip": 1.04592741, "balance_loss_mlp": 1.01537573, "epoch": 0.7671496422774003, "flos": 21762944167680.0, "grad_norm": 2.113608458167278, "language_loss": 0.64699876, "learning_rate": 5.42148066833954e-07, "loss": 0.66887236, "num_input_tokens_seen": 137298745, "step": 6380, "time_per_iteration": 2.8136141300201416 }, { "auxiliary_loss_clip": 0.01166176, "auxiliary_loss_mlp": 0.01024401, "balance_loss_clip": 1.0474472, "balance_loss_mlp": 1.01692629, "epoch": 0.7672698851680394, "flos": 21069221823360.0, "grad_norm": 1.9873165696612145, "language_loss": 0.75517184, "learning_rate": 5.416148992523289e-07, "loss": 0.77707762, "num_input_tokens_seen": 137317320, "step": 6381, "time_per_iteration": 2.6568644046783447 }, { "auxiliary_loss_clip": 0.01148662, "auxiliary_loss_mlp": 0.01031455, "balance_loss_clip": 1.04904938, "balance_loss_mlp": 1.02413225, "epoch": 0.7673901280586786, "flos": 16976697840000.0, "grad_norm": 1.9567936359211269, "language_loss": 0.78269589, "learning_rate": 5.410819529065644e-07, "loss": 0.80449706, "num_input_tokens_seen": 137335275, "step": 6382, "time_per_iteration": 2.9075088500976562 }, { "auxiliary_loss_clip": 0.01155008, "auxiliary_loss_mlp": 0.01022927, "balance_loss_clip": 1.04611146, "balance_loss_mlp": 1.01533651, "epoch": 0.7675103709493176, "flos": 29242669697280.0, "grad_norm": 3.712766799657867, "language_loss": 0.65525299, "learning_rate": 5.405492278775079e-07, "loss": 0.67703235, "num_input_tokens_seen": 137355055, "step": 6383, "time_per_iteration": 2.857149839401245 }, { "auxiliary_loss_clip": 0.01164448, "auxiliary_loss_mlp": 0.01026057, "balance_loss_clip": 1.0487659, "balance_loss_mlp": 1.01794493, "epoch": 0.7676306138399567, "flos": 29023004073600.0, "grad_norm": 2.2332797221157574, "language_loss": 0.79709548, "learning_rate": 5.400167242459732e-07, "loss": 0.81900048, "num_input_tokens_seen": 137374015, "step": 6384, "time_per_iteration": 2.7902560234069824 }, { "auxiliary_loss_clip": 0.01159934, "auxiliary_loss_mlp": 0.01030673, "balance_loss_clip": 1.04537797, "balance_loss_mlp": 1.02322197, "epoch": 0.7677508567305958, "flos": 22565116650240.0, "grad_norm": 7.503279162228198, "language_loss": 0.8050909, "learning_rate": 5.394844420927405e-07, "loss": 0.82699698, "num_input_tokens_seen": 137393625, "step": 6385, "time_per_iteration": 2.904510974884033 }, { "auxiliary_loss_clip": 0.01164108, "auxiliary_loss_mlp": 0.01027855, "balance_loss_clip": 1.0464344, "balance_loss_mlp": 1.01999962, "epoch": 0.7678710996212349, "flos": 25411432222080.0, "grad_norm": 2.4200199031168075, "language_loss": 0.73222405, "learning_rate": 5.389523814985562e-07, "loss": 0.75414371, "num_input_tokens_seen": 137413045, "step": 6386, "time_per_iteration": 2.789219617843628 }, { "auxiliary_loss_clip": 0.01154833, "auxiliary_loss_mlp": 0.01022624, "balance_loss_clip": 1.04811168, "balance_loss_mlp": 1.01495266, "epoch": 0.767991342511874, "flos": 26756825063040.0, "grad_norm": 2.135617350827945, "language_loss": 0.76396203, "learning_rate": 5.384205425441344e-07, "loss": 0.78573662, "num_input_tokens_seen": 137433955, "step": 6387, "time_per_iteration": 2.9293782711029053 }, { "auxiliary_loss_clip": 0.01161354, "auxiliary_loss_mlp": 0.01019547, "balance_loss_clip": 1.04691887, "balance_loss_mlp": 1.01244843, "epoch": 0.7681115854025131, "flos": 26359509749760.0, "grad_norm": 1.969298334231228, "language_loss": 0.84293085, "learning_rate": 5.378889253101537e-07, "loss": 0.86473989, "num_input_tokens_seen": 137454510, "step": 6388, "time_per_iteration": 2.6950583457946777 }, { "auxiliary_loss_clip": 0.01164568, "auxiliary_loss_mlp": 0.01030233, "balance_loss_clip": 1.04666495, "balance_loss_mlp": 1.02334869, "epoch": 0.7682318282931522, "flos": 23257043314560.0, "grad_norm": 1.85782711140796, "language_loss": 0.80853373, "learning_rate": 5.373575298772617e-07, "loss": 0.83048177, "num_input_tokens_seen": 137473630, "step": 6389, "time_per_iteration": 3.6312267780303955 }, { "auxiliary_loss_clip": 0.0106297, "auxiliary_loss_mlp": 0.01004385, "balance_loss_clip": 1.00845659, "balance_loss_mlp": 1.00333583, "epoch": 0.7683520711837912, "flos": 70072457137920.0, "grad_norm": 0.7799432121907792, "language_loss": 0.61346126, "learning_rate": 5.368263563260689e-07, "loss": 0.63413489, "num_input_tokens_seen": 137538765, "step": 6390, "time_per_iteration": 3.3651394844055176 }, { "auxiliary_loss_clip": 0.01163351, "auxiliary_loss_mlp": 0.01026848, "balance_loss_clip": 1.04651785, "balance_loss_mlp": 1.01954603, "epoch": 0.7684723140744304, "flos": 18624890332800.0, "grad_norm": 1.5573617462121137, "language_loss": 0.64562345, "learning_rate": 5.362954047371537e-07, "loss": 0.66752541, "num_input_tokens_seen": 137557875, "step": 6391, "time_per_iteration": 3.650660276412964 }, { "auxiliary_loss_clip": 0.01156875, "auxiliary_loss_mlp": 0.01026642, "balance_loss_clip": 1.05124068, "balance_loss_mlp": 1.01925683, "epoch": 0.7685925569650695, "flos": 27452989532160.0, "grad_norm": 1.7095690186547714, "language_loss": 0.72062677, "learning_rate": 5.357646751910627e-07, "loss": 0.74246198, "num_input_tokens_seen": 137579055, "step": 6392, "time_per_iteration": 2.821122884750366 }, { "auxiliary_loss_clip": 0.01161746, "auxiliary_loss_mlp": 0.01026992, "balance_loss_clip": 1.04933786, "balance_loss_mlp": 1.01890993, "epoch": 0.7687127998557085, "flos": 24535714642560.0, "grad_norm": 2.27416461926229, "language_loss": 0.80040193, "learning_rate": 5.352341677683061e-07, "loss": 0.82228935, "num_input_tokens_seen": 137600355, "step": 6393, "time_per_iteration": 2.813080310821533 }, { "auxiliary_loss_clip": 0.01162314, "auxiliary_loss_mlp": 0.01026026, "balance_loss_clip": 1.04614019, "balance_loss_mlp": 1.01888585, "epoch": 0.7688330427463477, "flos": 25155963717120.0, "grad_norm": 2.322340566132204, "language_loss": 0.79159451, "learning_rate": 5.347038825493617e-07, "loss": 0.81347787, "num_input_tokens_seen": 137621885, "step": 6394, "time_per_iteration": 3.866847515106201 }, { "auxiliary_loss_clip": 0.01159861, "auxiliary_loss_mlp": 0.01027294, "balance_loss_clip": 1.04999113, "balance_loss_mlp": 1.02034152, "epoch": 0.7689532856369867, "flos": 21211284113280.0, "grad_norm": 2.088457300998634, "language_loss": 0.68673134, "learning_rate": 5.341738196146732e-07, "loss": 0.70860285, "num_input_tokens_seen": 137640230, "step": 6395, "time_per_iteration": 2.7823760509490967 }, { "auxiliary_loss_clip": 0.01160888, "auxiliary_loss_mlp": 0.01024568, "balance_loss_clip": 1.0459559, "balance_loss_mlp": 1.01705766, "epoch": 0.7690735285276258, "flos": 25119083427840.0, "grad_norm": 2.195435950703558, "language_loss": 0.73817587, "learning_rate": 5.336439790446503e-07, "loss": 0.76003039, "num_input_tokens_seen": 137659330, "step": 6396, "time_per_iteration": 2.8033511638641357 }, { "auxiliary_loss_clip": 0.01156952, "auxiliary_loss_mlp": 0.01026296, "balance_loss_clip": 1.0480473, "balance_loss_mlp": 1.0191077, "epoch": 0.769193771418265, "flos": 54744020640000.0, "grad_norm": 1.9408611220906435, "language_loss": 0.62423986, "learning_rate": 5.331143609196711e-07, "loss": 0.64607239, "num_input_tokens_seen": 137683145, "step": 6397, "time_per_iteration": 3.0824084281921387 }, { "auxiliary_loss_clip": 0.01161738, "auxiliary_loss_mlp": 0.01023902, "balance_loss_clip": 1.04709637, "balance_loss_mlp": 1.0161711, "epoch": 0.769314014308904, "flos": 37341890115840.0, "grad_norm": 1.7179898480642564, "language_loss": 0.76831001, "learning_rate": 5.325849653200758e-07, "loss": 0.79016644, "num_input_tokens_seen": 137707095, "step": 6398, "time_per_iteration": 2.842130661010742 }, { "auxiliary_loss_clip": 0.01167937, "auxiliary_loss_mlp": 0.01023056, "balance_loss_clip": 1.04876697, "balance_loss_mlp": 1.01570046, "epoch": 0.7694342571995431, "flos": 20631686256000.0, "grad_norm": 1.695282236272673, "language_loss": 0.76747608, "learning_rate": 5.32055792326175e-07, "loss": 0.78938603, "num_input_tokens_seen": 137725520, "step": 6399, "time_per_iteration": 3.5259785652160645 }, { "auxiliary_loss_clip": 0.01164685, "auxiliary_loss_mlp": 0.01027304, "balance_loss_clip": 1.04933929, "balance_loss_mlp": 1.02008653, "epoch": 0.7695545000901821, "flos": 24207706621440.0, "grad_norm": 1.908089174477491, "language_loss": 0.73060369, "learning_rate": 5.315268420182437e-07, "loss": 0.7525236, "num_input_tokens_seen": 137744195, "step": 6400, "time_per_iteration": 2.776543617248535 }, { "auxiliary_loss_clip": 0.01164569, "auxiliary_loss_mlp": 0.0105115, "balance_loss_clip": 1.04832351, "balance_loss_mlp": 1.01613247, "epoch": 0.7696747429808213, "flos": 28001273708160.0, "grad_norm": 3.8453071111208432, "language_loss": 0.76411283, "learning_rate": 5.309981144765221e-07, "loss": 0.78627002, "num_input_tokens_seen": 137764340, "step": 6401, "time_per_iteration": 2.858778715133667 }, { "auxiliary_loss_clip": 0.0116184, "auxiliary_loss_mlp": 0.01025033, "balance_loss_clip": 1.04855299, "balance_loss_mlp": 1.0179255, "epoch": 0.7697949858714603, "flos": 11509550323200.0, "grad_norm": 2.6955615400266035, "language_loss": 0.76019108, "learning_rate": 5.304696097812196e-07, "loss": 0.78205985, "num_input_tokens_seen": 137780940, "step": 6402, "time_per_iteration": 2.726209878921509 }, { "auxiliary_loss_clip": 0.01158824, "auxiliary_loss_mlp": 0.01027584, "balance_loss_clip": 1.04702592, "balance_loss_mlp": 1.01951957, "epoch": 0.7699152287620994, "flos": 26688271956480.0, "grad_norm": 4.351469570895153, "language_loss": 0.60127062, "learning_rate": 5.299413280125078e-07, "loss": 0.62313473, "num_input_tokens_seen": 137799250, "step": 6403, "time_per_iteration": 2.7939293384552 }, { "auxiliary_loss_clip": 0.01162172, "auxiliary_loss_mlp": 0.01028862, "balance_loss_clip": 1.04791713, "balance_loss_mlp": 1.02153385, "epoch": 0.7700354716527386, "flos": 16544944362240.0, "grad_norm": 2.567553204733703, "language_loss": 0.72756124, "learning_rate": 5.294132692505284e-07, "loss": 0.7494716, "num_input_tokens_seen": 137817660, "step": 6404, "time_per_iteration": 2.7225894927978516 }, { "auxiliary_loss_clip": 0.01148486, "auxiliary_loss_mlp": 0.01025699, "balance_loss_clip": 1.04809725, "balance_loss_mlp": 1.01817989, "epoch": 0.7701557145433776, "flos": 19242733196160.0, "grad_norm": 2.280154341606291, "language_loss": 0.79127091, "learning_rate": 5.288854335753861e-07, "loss": 0.81301284, "num_input_tokens_seen": 137835920, "step": 6405, "time_per_iteration": 2.761502981185913 }, { "auxiliary_loss_clip": 0.01165603, "auxiliary_loss_mlp": 0.01022295, "balance_loss_clip": 1.04676867, "balance_loss_mlp": 1.01486802, "epoch": 0.7702759574340167, "flos": 31685744211840.0, "grad_norm": 2.0508425420415777, "language_loss": 0.75832987, "learning_rate": 5.283578210671551e-07, "loss": 0.78020883, "num_input_tokens_seen": 137858160, "step": 6406, "time_per_iteration": 2.798860549926758 }, { "auxiliary_loss_clip": 0.01164383, "auxiliary_loss_mlp": 0.01029306, "balance_loss_clip": 1.04875517, "balance_loss_mlp": 1.02186155, "epoch": 0.7703962003246558, "flos": 16800089644800.0, "grad_norm": 2.7108917089006104, "language_loss": 0.77184319, "learning_rate": 5.278304318058719e-07, "loss": 0.79378009, "num_input_tokens_seen": 137876015, "step": 6407, "time_per_iteration": 2.725407600402832 }, { "auxiliary_loss_clip": 0.0115089, "auxiliary_loss_mlp": 0.01026588, "balance_loss_clip": 1.04709744, "balance_loss_mlp": 1.01819015, "epoch": 0.7705164432152949, "flos": 35736072693120.0, "grad_norm": 1.8113445378270876, "language_loss": 0.79482836, "learning_rate": 5.273032658715411e-07, "loss": 0.81660312, "num_input_tokens_seen": 137898825, "step": 6408, "time_per_iteration": 2.917478561401367 }, { "auxiliary_loss_clip": 0.01151745, "auxiliary_loss_mlp": 0.01025266, "balance_loss_clip": 1.04639459, "balance_loss_mlp": 1.01834917, "epoch": 0.7706366861059339, "flos": 23365960329600.0, "grad_norm": 1.8238137833350894, "language_loss": 0.76715994, "learning_rate": 5.267763233441347e-07, "loss": 0.78893006, "num_input_tokens_seen": 137919455, "step": 6409, "time_per_iteration": 2.8441145420074463 }, { "auxiliary_loss_clip": 0.01167374, "auxiliary_loss_mlp": 0.0102524, "balance_loss_clip": 1.04889441, "balance_loss_mlp": 1.01722372, "epoch": 0.7707569289965731, "flos": 22929897219840.0, "grad_norm": 2.0559054157744945, "language_loss": 0.69715607, "learning_rate": 5.26249604303588e-07, "loss": 0.71908224, "num_input_tokens_seen": 137937960, "step": 6410, "time_per_iteration": 2.685668468475342 }, { "auxiliary_loss_clip": 0.0116635, "auxiliary_loss_mlp": 0.01027038, "balance_loss_clip": 1.04738641, "balance_loss_mlp": 1.019274, "epoch": 0.7708771718872122, "flos": 17420661941760.0, "grad_norm": 5.129748464256748, "language_loss": 0.78984642, "learning_rate": 5.257231088298057e-07, "loss": 0.81178033, "num_input_tokens_seen": 137956370, "step": 6411, "time_per_iteration": 2.676093816757202 }, { "auxiliary_loss_clip": 0.01059541, "auxiliary_loss_mlp": 0.01001963, "balance_loss_clip": 1.0105021, "balance_loss_mlp": 1.00098574, "epoch": 0.7709974147778512, "flos": 72241316248320.0, "grad_norm": 0.7928348448196039, "language_loss": 0.53896725, "learning_rate": 5.25196837002655e-07, "loss": 0.55958235, "num_input_tokens_seen": 138016080, "step": 6412, "time_per_iteration": 3.3274447917938232 }, { "auxiliary_loss_clip": 0.01159785, "auxiliary_loss_mlp": 0.01027909, "balance_loss_clip": 1.04974174, "balance_loss_mlp": 1.0207442, "epoch": 0.7711176576684904, "flos": 39859694876160.0, "grad_norm": 8.103508795010326, "language_loss": 0.68272215, "learning_rate": 5.24670788901971e-07, "loss": 0.70459908, "num_input_tokens_seen": 138039170, "step": 6413, "time_per_iteration": 2.8102242946624756 }, { "auxiliary_loss_clip": 0.01163022, "auxiliary_loss_mlp": 0.01029303, "balance_loss_clip": 1.05134666, "balance_loss_mlp": 1.02097654, "epoch": 0.7712379005591294, "flos": 36976391274240.0, "grad_norm": 2.2650529443980667, "language_loss": 0.68400574, "learning_rate": 5.241449646075557e-07, "loss": 0.70592898, "num_input_tokens_seen": 138062395, "step": 6414, "time_per_iteration": 2.861774206161499 }, { "auxiliary_loss_clip": 0.01170791, "auxiliary_loss_mlp": 0.01023968, "balance_loss_clip": 1.0478183, "balance_loss_mlp": 1.01727986, "epoch": 0.7713581434497685, "flos": 22776773541120.0, "grad_norm": 2.3309765570917147, "language_loss": 0.72893715, "learning_rate": 5.236193641991762e-07, "loss": 0.75088465, "num_input_tokens_seen": 138080325, "step": 6415, "time_per_iteration": 3.5971524715423584 }, { "auxiliary_loss_clip": 0.0115878, "auxiliary_loss_mlp": 0.01026893, "balance_loss_clip": 1.04643917, "balance_loss_mlp": 1.01995766, "epoch": 0.7714783863404077, "flos": 24097460803200.0, "grad_norm": 3.4337566626450475, "language_loss": 0.69688863, "learning_rate": 5.23093987756565e-07, "loss": 0.71874535, "num_input_tokens_seen": 138099020, "step": 6416, "time_per_iteration": 2.7141292095184326 }, { "auxiliary_loss_clip": 0.01165744, "auxiliary_loss_mlp": 0.01026622, "balance_loss_clip": 1.04742289, "balance_loss_mlp": 1.0179019, "epoch": 0.7715986292310467, "flos": 21063655215360.0, "grad_norm": 2.4193112462299364, "language_loss": 0.75548613, "learning_rate": 5.225688353594217e-07, "loss": 0.77740979, "num_input_tokens_seen": 138118650, "step": 6417, "time_per_iteration": 3.6990694999694824 }, { "auxiliary_loss_clip": 0.01164717, "auxiliary_loss_mlp": 0.01053164, "balance_loss_clip": 1.04803705, "balance_loss_mlp": 1.01799774, "epoch": 0.7717188721216858, "flos": 20594877793920.0, "grad_norm": 2.7430802784040935, "language_loss": 0.77699363, "learning_rate": 5.220439070874108e-07, "loss": 0.79917252, "num_input_tokens_seen": 138137890, "step": 6418, "time_per_iteration": 2.6193830966949463 }, { "auxiliary_loss_clip": 0.01165847, "auxiliary_loss_mlp": 0.01024178, "balance_loss_clip": 1.04973912, "balance_loss_mlp": 1.01637602, "epoch": 0.7718391150123249, "flos": 26250951870720.0, "grad_norm": 1.8906356144161625, "language_loss": 0.71183819, "learning_rate": 5.215192030201652e-07, "loss": 0.73373848, "num_input_tokens_seen": 138158880, "step": 6419, "time_per_iteration": 2.7506675720214844 }, { "auxiliary_loss_clip": 0.01148084, "auxiliary_loss_mlp": 0.0102736, "balance_loss_clip": 1.04594922, "balance_loss_mlp": 1.02040458, "epoch": 0.771959357902964, "flos": 22049762267520.0, "grad_norm": 2.058147555735376, "language_loss": 0.86265206, "learning_rate": 5.209947232372798e-07, "loss": 0.88440657, "num_input_tokens_seen": 138176370, "step": 6420, "time_per_iteration": 3.6088123321533203 }, { "auxiliary_loss_clip": 0.01167992, "auxiliary_loss_mlp": 0.01055655, "balance_loss_clip": 1.0485568, "balance_loss_mlp": 1.01899123, "epoch": 0.772079600793603, "flos": 30446000248320.0, "grad_norm": 1.6722910124662171, "language_loss": 0.8113302, "learning_rate": 5.204704678183196e-07, "loss": 0.83356667, "num_input_tokens_seen": 138195105, "step": 6421, "time_per_iteration": 2.792999744415283 }, { "auxiliary_loss_clip": 0.01168579, "auxiliary_loss_mlp": 0.01022597, "balance_loss_clip": 1.04888463, "balance_loss_mlp": 1.01506281, "epoch": 0.7721998436842422, "flos": 12969857750400.0, "grad_norm": 2.0686499289573255, "language_loss": 0.85041815, "learning_rate": 5.19946436842813e-07, "loss": 0.87232989, "num_input_tokens_seen": 138212235, "step": 6422, "time_per_iteration": 2.673027276992798 }, { "auxiliary_loss_clip": 0.01156542, "auxiliary_loss_mlp": 0.01023755, "balance_loss_clip": 1.04679847, "balance_loss_mlp": 1.01646554, "epoch": 0.7723200865748813, "flos": 32635509678720.0, "grad_norm": 1.962559743210939, "language_loss": 0.68421119, "learning_rate": 5.194226303902546e-07, "loss": 0.70601416, "num_input_tokens_seen": 138231970, "step": 6423, "time_per_iteration": 2.9041755199432373 }, { "auxiliary_loss_clip": 0.01159693, "auxiliary_loss_mlp": 0.01023104, "balance_loss_clip": 1.04759192, "balance_loss_mlp": 1.01593947, "epoch": 0.7724403294655203, "flos": 21105707063040.0, "grad_norm": 1.7372279706877667, "language_loss": 0.70789075, "learning_rate": 5.188990485401072e-07, "loss": 0.72971869, "num_input_tokens_seen": 138251175, "step": 6424, "time_per_iteration": 2.885814666748047 }, { "auxiliary_loss_clip": 0.01163221, "auxiliary_loss_mlp": 0.01026181, "balance_loss_clip": 1.04672885, "balance_loss_mlp": 1.0182538, "epoch": 0.7725605723561595, "flos": 22090736707200.0, "grad_norm": 1.825402619330329, "language_loss": 0.86186135, "learning_rate": 5.183756913717954e-07, "loss": 0.88375539, "num_input_tokens_seen": 138270950, "step": 6425, "time_per_iteration": 3.7627828121185303 }, { "auxiliary_loss_clip": 0.0115838, "auxiliary_loss_mlp": 0.01024948, "balance_loss_clip": 1.04789877, "balance_loss_mlp": 1.01752138, "epoch": 0.7726808152467985, "flos": 34495610457600.0, "grad_norm": 1.8923753488635304, "language_loss": 0.73067838, "learning_rate": 5.178525589647136e-07, "loss": 0.75251174, "num_input_tokens_seen": 138292590, "step": 6426, "time_per_iteration": 2.8842015266418457 }, { "auxiliary_loss_clip": 0.01166206, "auxiliary_loss_mlp": 0.01023135, "balance_loss_clip": 1.04745269, "balance_loss_mlp": 1.01597917, "epoch": 0.7728010581374376, "flos": 22306344094080.0, "grad_norm": 1.888833280143806, "language_loss": 0.78620982, "learning_rate": 5.173296513982197e-07, "loss": 0.80810326, "num_input_tokens_seen": 138311115, "step": 6427, "time_per_iteration": 2.7148733139038086 }, { "auxiliary_loss_clip": 0.011698, "auxiliary_loss_mlp": 0.01025918, "balance_loss_clip": 1.0507834, "balance_loss_mlp": 1.01801145, "epoch": 0.7729213010280768, "flos": 27126453968640.0, "grad_norm": 2.400790926346453, "language_loss": 0.64807439, "learning_rate": 5.168069687516398e-07, "loss": 0.67003155, "num_input_tokens_seen": 138330885, "step": 6428, "time_per_iteration": 2.8610281944274902 }, { "auxiliary_loss_clip": 0.01164041, "auxiliary_loss_mlp": 0.01026335, "balance_loss_clip": 1.0510633, "balance_loss_mlp": 1.01901865, "epoch": 0.7730415439187158, "flos": 18150223080960.0, "grad_norm": 2.780805126728642, "language_loss": 0.71858275, "learning_rate": 5.16284511104263e-07, "loss": 0.7404865, "num_input_tokens_seen": 138350020, "step": 6429, "time_per_iteration": 2.8012683391571045 }, { "auxiliary_loss_clip": 0.01161524, "auxiliary_loss_mlp": 0.01025155, "balance_loss_clip": 1.04784751, "balance_loss_mlp": 1.01746047, "epoch": 0.7731617868093549, "flos": 11947480940160.0, "grad_norm": 2.164304420086445, "language_loss": 0.8072176, "learning_rate": 5.157622785353457e-07, "loss": 0.8290844, "num_input_tokens_seen": 138368135, "step": 6430, "time_per_iteration": 2.690694808959961 }, { "auxiliary_loss_clip": 0.01063084, "auxiliary_loss_mlp": 0.01000339, "balance_loss_clip": 1.00878859, "balance_loss_mlp": 0.99940884, "epoch": 0.7732820296999939, "flos": 64201027069440.0, "grad_norm": 0.6441353060204857, "language_loss": 0.60344362, "learning_rate": 5.152402711241113e-07, "loss": 0.62407792, "num_input_tokens_seen": 138436040, "step": 6431, "time_per_iteration": 3.4309322834014893 }, { "auxiliary_loss_clip": 0.01157561, "auxiliary_loss_mlp": 0.01031712, "balance_loss_clip": 1.04794812, "balance_loss_mlp": 1.02479815, "epoch": 0.7734022725906331, "flos": 25302191984640.0, "grad_norm": 1.925741969539902, "language_loss": 0.83163214, "learning_rate": 5.147184889497465e-07, "loss": 0.85352486, "num_input_tokens_seen": 138455510, "step": 6432, "time_per_iteration": 2.783514976501465 }, { "auxiliary_loss_clip": 0.0115129, "auxiliary_loss_mlp": 0.01026527, "balance_loss_clip": 1.04740214, "balance_loss_mlp": 1.01848614, "epoch": 0.7735225154812722, "flos": 17347440067200.0, "grad_norm": 2.960381181957025, "language_loss": 0.79830694, "learning_rate": 5.141969320914072e-07, "loss": 0.82008517, "num_input_tokens_seen": 138473015, "step": 6433, "time_per_iteration": 2.7282114028930664 }, { "auxiliary_loss_clip": 0.01171103, "auxiliary_loss_mlp": 0.01028556, "balance_loss_clip": 1.04876554, "balance_loss_mlp": 1.02056861, "epoch": 0.7736427583719112, "flos": 32630086725120.0, "grad_norm": 5.035179987677156, "language_loss": 0.62808073, "learning_rate": 5.136756006282113e-07, "loss": 0.65007734, "num_input_tokens_seen": 138491680, "step": 6434, "time_per_iteration": 2.7039711475372314 }, { "auxiliary_loss_clip": 0.01169518, "auxiliary_loss_mlp": 0.01023408, "balance_loss_clip": 1.04854751, "balance_loss_mlp": 1.01627898, "epoch": 0.7737630012625504, "flos": 19860073269120.0, "grad_norm": 2.182140176361045, "language_loss": 0.8486076, "learning_rate": 5.131544946392446e-07, "loss": 0.87053686, "num_input_tokens_seen": 138506960, "step": 6435, "time_per_iteration": 2.5911591053009033 }, { "auxiliary_loss_clip": 0.0116193, "auxiliary_loss_mlp": 0.01030283, "balance_loss_clip": 1.0494504, "balance_loss_mlp": 1.02246618, "epoch": 0.7738832441531894, "flos": 36022639397760.0, "grad_norm": 4.552624549073312, "language_loss": 0.63821292, "learning_rate": 5.126336142035592e-07, "loss": 0.66013515, "num_input_tokens_seen": 138526995, "step": 6436, "time_per_iteration": 2.7720532417297363 }, { "auxiliary_loss_clip": 0.01160691, "auxiliary_loss_mlp": 0.01022232, "balance_loss_clip": 1.04519928, "balance_loss_mlp": 1.01477528, "epoch": 0.7740034870438285, "flos": 13405274415360.0, "grad_norm": 2.2480470331785947, "language_loss": 0.72259295, "learning_rate": 5.121129594001721e-07, "loss": 0.7444222, "num_input_tokens_seen": 138541260, "step": 6437, "time_per_iteration": 2.6449990272521973 }, { "auxiliary_loss_clip": 0.01163764, "auxiliary_loss_mlp": 0.01028205, "balance_loss_clip": 1.04828238, "balance_loss_mlp": 1.02073097, "epoch": 0.7741237299344677, "flos": 22086714384000.0, "grad_norm": 1.5907934515741586, "language_loss": 0.81613314, "learning_rate": 5.115925303080661e-07, "loss": 0.83805287, "num_input_tokens_seen": 138560970, "step": 6438, "time_per_iteration": 2.675968885421753 }, { "auxiliary_loss_clip": 0.01160049, "auxiliary_loss_mlp": 0.01026072, "balance_loss_clip": 1.04547274, "balance_loss_mlp": 1.01870775, "epoch": 0.7742439728251067, "flos": 19864777950720.0, "grad_norm": 2.503696424702073, "language_loss": 0.79078257, "learning_rate": 5.110723270061899e-07, "loss": 0.81264377, "num_input_tokens_seen": 138577460, "step": 6439, "time_per_iteration": 2.6313364505767822 }, { "auxiliary_loss_clip": 0.01165621, "auxiliary_loss_mlp": 0.01023724, "balance_loss_clip": 1.04686069, "balance_loss_mlp": 1.01671743, "epoch": 0.7743642157157458, "flos": 16690167048960.0, "grad_norm": 1.7052839194516989, "language_loss": 0.79434526, "learning_rate": 5.105523495734572e-07, "loss": 0.8162387, "num_input_tokens_seen": 138594860, "step": 6440, "time_per_iteration": 2.718658208847046 }, { "auxiliary_loss_clip": 0.01166032, "auxiliary_loss_mlp": 0.01025624, "balance_loss_clip": 1.04548359, "balance_loss_mlp": 1.01831627, "epoch": 0.7744844586063849, "flos": 20304360593280.0, "grad_norm": 1.7067136228970325, "language_loss": 0.75070095, "learning_rate": 5.100325980887499e-07, "loss": 0.77261746, "num_input_tokens_seen": 138614785, "step": 6441, "time_per_iteration": 3.5807321071624756 }, { "auxiliary_loss_clip": 0.01168528, "auxiliary_loss_mlp": 0.01031021, "balance_loss_clip": 1.0480777, "balance_loss_mlp": 1.02371955, "epoch": 0.774604701497024, "flos": 22966705681920.0, "grad_norm": 1.893744225016967, "language_loss": 0.83539683, "learning_rate": 5.095130726309116e-07, "loss": 0.85739231, "num_input_tokens_seen": 138634960, "step": 6442, "time_per_iteration": 2.6965808868408203 }, { "auxiliary_loss_clip": 0.01062285, "auxiliary_loss_mlp": 0.01001911, "balance_loss_clip": 1.00885749, "balance_loss_mlp": 1.00100505, "epoch": 0.774724944387663, "flos": 60288523073280.0, "grad_norm": 0.789255927683735, "language_loss": 0.58950758, "learning_rate": 5.089937732787559e-07, "loss": 0.61014956, "num_input_tokens_seen": 138699520, "step": 6443, "time_per_iteration": 4.186717748641968 }, { "auxiliary_loss_clip": 0.01161201, "auxiliary_loss_mlp": 0.01026016, "balance_loss_clip": 1.0489347, "balance_loss_mlp": 1.01861584, "epoch": 0.7748451872783022, "flos": 26761026954240.0, "grad_norm": 2.1558196157901337, "language_loss": 0.66723812, "learning_rate": 5.084747001110592e-07, "loss": 0.68911028, "num_input_tokens_seen": 138719145, "step": 6444, "time_per_iteration": 2.813976764678955 }, { "auxiliary_loss_clip": 0.01156989, "auxiliary_loss_mlp": 0.01057032, "balance_loss_clip": 1.04702902, "balance_loss_mlp": 1.02079523, "epoch": 0.7749654301689413, "flos": 30338627518080.0, "grad_norm": 1.6926246713577235, "language_loss": 0.70206308, "learning_rate": 5.07955853206564e-07, "loss": 0.72420329, "num_input_tokens_seen": 138743850, "step": 6445, "time_per_iteration": 2.772770643234253 }, { "auxiliary_loss_clip": 0.0116799, "auxiliary_loss_mlp": 0.01028606, "balance_loss_clip": 1.04804039, "balance_loss_mlp": 1.02160883, "epoch": 0.7750856730595803, "flos": 43179851687040.0, "grad_norm": 1.5313818211190173, "language_loss": 0.71010339, "learning_rate": 5.074372326439807e-07, "loss": 0.73206937, "num_input_tokens_seen": 138766860, "step": 6446, "time_per_iteration": 3.8610033988952637 }, { "auxiliary_loss_clip": 0.01160512, "auxiliary_loss_mlp": 0.01027656, "balance_loss_clip": 1.0469414, "balance_loss_mlp": 1.01997852, "epoch": 0.7752059159502195, "flos": 17640040256640.0, "grad_norm": 2.076030189405211, "language_loss": 0.73332018, "learning_rate": 5.069188385019814e-07, "loss": 0.75520182, "num_input_tokens_seen": 138784560, "step": 6447, "time_per_iteration": 2.69685959815979 }, { "auxiliary_loss_clip": 0.01163278, "auxiliary_loss_mlp": 0.01021949, "balance_loss_clip": 1.04843795, "balance_loss_mlp": 1.01453471, "epoch": 0.7753261588408585, "flos": 12677688524160.0, "grad_norm": 3.5982264155899815, "language_loss": 0.61296999, "learning_rate": 5.064006708592077e-07, "loss": 0.63482231, "num_input_tokens_seen": 138800805, "step": 6448, "time_per_iteration": 2.852912425994873 }, { "auxiliary_loss_clip": 0.01153385, "auxiliary_loss_mlp": 0.01025387, "balance_loss_clip": 1.04572678, "balance_loss_mlp": 1.01826084, "epoch": 0.7754464017314976, "flos": 16690741666560.0, "grad_norm": 3.766655273502718, "language_loss": 0.75732321, "learning_rate": 5.058827297942641e-07, "loss": 0.77911085, "num_input_tokens_seen": 138815910, "step": 6449, "time_per_iteration": 2.7178304195404053 }, { "auxiliary_loss_clip": 0.01166042, "auxiliary_loss_mlp": 0.01027009, "balance_loss_clip": 1.04700148, "balance_loss_mlp": 1.01985073, "epoch": 0.7755666446221368, "flos": 19718944732800.0, "grad_norm": 2.2138602965640315, "language_loss": 0.75157106, "learning_rate": 5.053650153857237e-07, "loss": 0.77350157, "num_input_tokens_seen": 138834920, "step": 6450, "time_per_iteration": 2.7710676193237305 }, { "auxiliary_loss_clip": 0.01165645, "auxiliary_loss_mlp": 0.01025724, "balance_loss_clip": 1.04926276, "balance_loss_mlp": 1.01825583, "epoch": 0.7756868875127758, "flos": 18693623007360.0, "grad_norm": 1.6234797345920633, "language_loss": 0.70072269, "learning_rate": 5.048475277121214e-07, "loss": 0.7226364, "num_input_tokens_seen": 138852135, "step": 6451, "time_per_iteration": 3.59500789642334 }, { "auxiliary_loss_clip": 0.01164105, "auxiliary_loss_mlp": 0.01025452, "balance_loss_clip": 1.04699564, "balance_loss_mlp": 1.01740527, "epoch": 0.7758071304034149, "flos": 28404191543040.0, "grad_norm": 2.053519829273823, "language_loss": 0.77085388, "learning_rate": 5.043302668519598e-07, "loss": 0.7927494, "num_input_tokens_seen": 138871470, "step": 6452, "time_per_iteration": 2.7652926445007324 }, { "auxiliary_loss_clip": 0.01166531, "auxiliary_loss_mlp": 0.01024764, "balance_loss_clip": 1.04650593, "balance_loss_mlp": 1.01717949, "epoch": 0.775927373294054, "flos": 20595344670720.0, "grad_norm": 1.8946567002432062, "language_loss": 0.72163725, "learning_rate": 5.038132328837079e-07, "loss": 0.74355018, "num_input_tokens_seen": 138889860, "step": 6453, "time_per_iteration": 2.6953554153442383 }, { "auxiliary_loss_clip": 0.01168251, "auxiliary_loss_mlp": 0.01026554, "balance_loss_clip": 1.0498991, "balance_loss_mlp": 1.01878738, "epoch": 0.7760476161846931, "flos": 22526368853760.0, "grad_norm": 2.115304232888229, "language_loss": 0.73546934, "learning_rate": 5.032964258857993e-07, "loss": 0.75741738, "num_input_tokens_seen": 138909955, "step": 6454, "time_per_iteration": 2.6774582862854004 }, { "auxiliary_loss_clip": 0.01162473, "auxiliary_loss_mlp": 0.01023764, "balance_loss_clip": 1.04482913, "balance_loss_mlp": 1.01642978, "epoch": 0.7761678590753321, "flos": 48651488403840.0, "grad_norm": 1.5708098516236593, "language_loss": 0.68211579, "learning_rate": 5.027798459366329e-07, "loss": 0.70397812, "num_input_tokens_seen": 138935320, "step": 6455, "time_per_iteration": 2.9083077907562256 }, { "auxiliary_loss_clip": 0.01168745, "auxiliary_loss_mlp": 0.01022803, "balance_loss_clip": 1.04770434, "balance_loss_mlp": 1.01555479, "epoch": 0.7762881019659713, "flos": 26177047637760.0, "grad_norm": 1.6357622257839879, "language_loss": 0.6357007, "learning_rate": 5.02263493114573e-07, "loss": 0.6576162, "num_input_tokens_seen": 138957115, "step": 6456, "time_per_iteration": 2.7471463680267334 }, { "auxiliary_loss_clip": 0.01166028, "auxiliary_loss_mlp": 0.01022275, "balance_loss_clip": 1.04660082, "balance_loss_mlp": 1.01464844, "epoch": 0.7764083448566104, "flos": 20588341518720.0, "grad_norm": 3.0136253249734004, "language_loss": 0.77156186, "learning_rate": 5.017473674979502e-07, "loss": 0.79344487, "num_input_tokens_seen": 138973140, "step": 6457, "time_per_iteration": 2.7953073978424072 }, { "auxiliary_loss_clip": 0.01065491, "auxiliary_loss_mlp": 0.0100285, "balance_loss_clip": 1.02075922, "balance_loss_mlp": 1.00180101, "epoch": 0.7765285877472494, "flos": 67293078560640.0, "grad_norm": 0.7465164141011388, "language_loss": 0.58361375, "learning_rate": 5.01231469165061e-07, "loss": 0.60429716, "num_input_tokens_seen": 139028965, "step": 6458, "time_per_iteration": 3.261414051055908 }, { "auxiliary_loss_clip": 0.01064023, "auxiliary_loss_mlp": 0.01000912, "balance_loss_clip": 1.01019096, "balance_loss_mlp": 1.00008988, "epoch": 0.7766488306378886, "flos": 61344476121600.0, "grad_norm": 0.8257157382205674, "language_loss": 0.56881291, "learning_rate": 5.007157981941663e-07, "loss": 0.58946222, "num_input_tokens_seen": 139094325, "step": 6459, "time_per_iteration": 3.453321695327759 }, { "auxiliary_loss_clip": 0.01062201, "auxiliary_loss_mlp": 0.01000749, "balance_loss_clip": 1.00898111, "balance_loss_mlp": 0.99972975, "epoch": 0.7767690735285276, "flos": 62946199393920.0, "grad_norm": 0.8906765448226823, "language_loss": 0.67420936, "learning_rate": 5.002003546634928e-07, "loss": 0.69483888, "num_input_tokens_seen": 139150425, "step": 6460, "time_per_iteration": 3.244539260864258 }, { "auxiliary_loss_clip": 0.01154042, "auxiliary_loss_mlp": 0.01019476, "balance_loss_clip": 1.04988313, "balance_loss_mlp": 1.0123651, "epoch": 0.7768893164191667, "flos": 20886400575360.0, "grad_norm": 1.6001040050348239, "language_loss": 0.76047337, "learning_rate": 4.996851386512331e-07, "loss": 0.78220856, "num_input_tokens_seen": 139169130, "step": 6461, "time_per_iteration": 2.7619426250457764 }, { "auxiliary_loss_clip": 0.01161428, "auxiliary_loss_mlp": 0.01025788, "balance_loss_clip": 1.04717398, "balance_loss_mlp": 1.01759291, "epoch": 0.7770095593098058, "flos": 20704584908160.0, "grad_norm": 2.226082510164922, "language_loss": 0.83128977, "learning_rate": 4.991701502355444e-07, "loss": 0.85316199, "num_input_tokens_seen": 139189595, "step": 6462, "time_per_iteration": 2.689467430114746 }, { "auxiliary_loss_clip": 0.0116642, "auxiliary_loss_mlp": 0.01027582, "balance_loss_clip": 1.04643297, "balance_loss_mlp": 1.02082539, "epoch": 0.7771298022004449, "flos": 24717709877760.0, "grad_norm": 1.4820671215055192, "language_loss": 0.75956738, "learning_rate": 4.986553894945518e-07, "loss": 0.78150737, "num_input_tokens_seen": 139210805, "step": 6463, "time_per_iteration": 2.8304803371429443 }, { "auxiliary_loss_clip": 0.0115721, "auxiliary_loss_mlp": 0.01023721, "balance_loss_clip": 1.04803538, "balance_loss_mlp": 1.01674414, "epoch": 0.777250045091084, "flos": 25009232659200.0, "grad_norm": 2.3358065247394677, "language_loss": 0.8563658, "learning_rate": 4.981408565063416e-07, "loss": 0.87817514, "num_input_tokens_seen": 139230750, "step": 6464, "time_per_iteration": 2.810967445373535 }, { "auxiliary_loss_clip": 0.01168943, "auxiliary_loss_mlp": 0.01026391, "balance_loss_clip": 1.0475297, "balance_loss_mlp": 1.01872027, "epoch": 0.777370287981723, "flos": 20119887319680.0, "grad_norm": 1.8749378895493165, "language_loss": 0.75730824, "learning_rate": 4.976265513489701e-07, "loss": 0.77926165, "num_input_tokens_seen": 139250720, "step": 6465, "time_per_iteration": 2.6872074604034424 }, { "auxiliary_loss_clip": 0.01167065, "auxiliary_loss_mlp": 0.01022951, "balance_loss_clip": 1.04854321, "balance_loss_mlp": 1.01489806, "epoch": 0.7774905308723622, "flos": 21718809331200.0, "grad_norm": 2.0829005194866754, "language_loss": 0.80325794, "learning_rate": 4.971124741004562e-07, "loss": 0.82515812, "num_input_tokens_seen": 139269720, "step": 6466, "time_per_iteration": 2.685229539871216 }, { "auxiliary_loss_clip": 0.01159787, "auxiliary_loss_mlp": 0.01025227, "balance_loss_clip": 1.04387224, "balance_loss_mlp": 1.01776159, "epoch": 0.7776107737630013, "flos": 16034115093120.0, "grad_norm": 1.6851747178077845, "language_loss": 0.77058744, "learning_rate": 4.965986248387846e-07, "loss": 0.79243767, "num_input_tokens_seen": 139288035, "step": 6467, "time_per_iteration": 3.655518054962158 }, { "auxiliary_loss_clip": 0.01164602, "auxiliary_loss_mlp": 0.01021176, "balance_loss_clip": 1.04707623, "balance_loss_mlp": 1.01393127, "epoch": 0.7777310166536403, "flos": 24790895838720.0, "grad_norm": 1.6830305256141302, "language_loss": 0.77123797, "learning_rate": 4.960850036419073e-07, "loss": 0.79309571, "num_input_tokens_seen": 139307135, "step": 6468, "time_per_iteration": 2.712656259536743 }, { "auxiliary_loss_clip": 0.01158017, "auxiliary_loss_mlp": 0.01025712, "balance_loss_clip": 1.04709411, "balance_loss_mlp": 1.01884913, "epoch": 0.7778512595442795, "flos": 17272530253440.0, "grad_norm": 1.7934295394730586, "language_loss": 0.7857976, "learning_rate": 4.955716105877378e-07, "loss": 0.80763489, "num_input_tokens_seen": 139325905, "step": 6469, "time_per_iteration": 3.6734042167663574 }, { "auxiliary_loss_clip": 0.01166356, "auxiliary_loss_mlp": 0.01053343, "balance_loss_clip": 1.04668105, "balance_loss_mlp": 1.01690948, "epoch": 0.7779715024349185, "flos": 17748418567680.0, "grad_norm": 1.8785070486689526, "language_loss": 0.82826346, "learning_rate": 4.950584457541598e-07, "loss": 0.85046041, "num_input_tokens_seen": 139344370, "step": 6470, "time_per_iteration": 2.675076723098755 }, { "auxiliary_loss_clip": 0.01168249, "auxiliary_loss_mlp": 0.01026762, "balance_loss_clip": 1.04817271, "balance_loss_mlp": 1.0193826, "epoch": 0.7780917453255576, "flos": 24316875031680.0, "grad_norm": 1.4084289362124642, "language_loss": 0.82098025, "learning_rate": 4.945455092190183e-07, "loss": 0.84293038, "num_input_tokens_seen": 139365625, "step": 6471, "time_per_iteration": 2.7237963676452637 }, { "auxiliary_loss_clip": 0.01062072, "auxiliary_loss_mlp": 0.01000531, "balance_loss_clip": 1.00844646, "balance_loss_mlp": 0.99961346, "epoch": 0.7782119882161967, "flos": 56364601530240.0, "grad_norm": 0.6798075157935887, "language_loss": 0.55989242, "learning_rate": 4.940328010601271e-07, "loss": 0.58051842, "num_input_tokens_seen": 139430540, "step": 6472, "time_per_iteration": 3.2377796173095703 }, { "auxiliary_loss_clip": 0.0117197, "auxiliary_loss_mlp": 0.01027874, "balance_loss_clip": 1.05193615, "balance_loss_mlp": 1.01987529, "epoch": 0.7783322311068358, "flos": 46789986994560.0, "grad_norm": 1.834397413269565, "language_loss": 0.7700634, "learning_rate": 4.935203213552621e-07, "loss": 0.79206192, "num_input_tokens_seen": 139454280, "step": 6473, "time_per_iteration": 3.962590456008911 }, { "auxiliary_loss_clip": 0.01162351, "auxiliary_loss_mlp": 0.01024468, "balance_loss_clip": 1.04750109, "balance_loss_mlp": 1.01732755, "epoch": 0.7784524739974749, "flos": 19057864872960.0, "grad_norm": 2.225120218383918, "language_loss": 0.67228085, "learning_rate": 4.930080701821662e-07, "loss": 0.69414914, "num_input_tokens_seen": 139471745, "step": 6474, "time_per_iteration": 2.7502949237823486 }, { "auxiliary_loss_clip": 0.01161975, "auxiliary_loss_mlp": 0.01021948, "balance_loss_clip": 1.04649544, "balance_loss_mlp": 1.01494765, "epoch": 0.778572716888114, "flos": 24791111320320.0, "grad_norm": 2.147047379959897, "language_loss": 0.77061224, "learning_rate": 4.92496047618548e-07, "loss": 0.7924515, "num_input_tokens_seen": 139491505, "step": 6475, "time_per_iteration": 2.805772304534912 }, { "auxiliary_loss_clip": 0.01166597, "auxiliary_loss_mlp": 0.01024144, "balance_loss_clip": 1.04957664, "balance_loss_mlp": 1.01616311, "epoch": 0.7786929597787531, "flos": 20078086867200.0, "grad_norm": 2.265914999248979, "language_loss": 0.77636045, "learning_rate": 4.919842537420811e-07, "loss": 0.79826784, "num_input_tokens_seen": 139508620, "step": 6476, "time_per_iteration": 2.6734113693237305 }, { "auxiliary_loss_clip": 0.01162203, "auxiliary_loss_mlp": 0.01025445, "balance_loss_clip": 1.04967582, "balance_loss_mlp": 1.01818252, "epoch": 0.7788132026693921, "flos": 21872220318720.0, "grad_norm": 1.5327003380089301, "language_loss": 0.79076993, "learning_rate": 4.91472688630404e-07, "loss": 0.81264639, "num_input_tokens_seen": 139529360, "step": 6477, "time_per_iteration": 3.689896583557129 }, { "auxiliary_loss_clip": 0.01163286, "auxiliary_loss_mlp": 0.01022655, "balance_loss_clip": 1.04585886, "balance_loss_mlp": 1.01555598, "epoch": 0.7789334455600313, "flos": 11181937351680.0, "grad_norm": 2.9537034161726483, "language_loss": 0.73964375, "learning_rate": 4.909613523611202e-07, "loss": 0.76150316, "num_input_tokens_seen": 139546240, "step": 6478, "time_per_iteration": 2.726322889328003 }, { "auxiliary_loss_clip": 0.01153413, "auxiliary_loss_mlp": 0.01056223, "balance_loss_clip": 1.04716456, "balance_loss_mlp": 1.01900983, "epoch": 0.7790536884506704, "flos": 28695427015680.0, "grad_norm": 1.7336841702561987, "language_loss": 0.74381697, "learning_rate": 4.904502450117991e-07, "loss": 0.76591337, "num_input_tokens_seen": 139567200, "step": 6479, "time_per_iteration": 2.77976393699646 }, { "auxiliary_loss_clip": 0.01157832, "auxiliary_loss_mlp": 0.01028303, "balance_loss_clip": 1.04990923, "balance_loss_mlp": 1.01999128, "epoch": 0.7791739313413094, "flos": 11072302064640.0, "grad_norm": 2.464483932901106, "language_loss": 0.72613049, "learning_rate": 4.899393666599762e-07, "loss": 0.7479918, "num_input_tokens_seen": 139583775, "step": 6480, "time_per_iteration": 2.7243144512176514 }, { "auxiliary_loss_clip": 0.01165673, "auxiliary_loss_mlp": 0.01024737, "balance_loss_clip": 1.04527378, "balance_loss_mlp": 1.01767421, "epoch": 0.7792941742319486, "flos": 14679276975360.0, "grad_norm": 2.2831186252679614, "language_loss": 0.72609699, "learning_rate": 4.894287173831506e-07, "loss": 0.7480011, "num_input_tokens_seen": 139599735, "step": 6481, "time_per_iteration": 2.691195011138916 }, { "auxiliary_loss_clip": 0.01162224, "auxiliary_loss_mlp": 0.01021203, "balance_loss_clip": 1.04640794, "balance_loss_mlp": 1.01390159, "epoch": 0.7794144171225876, "flos": 23258874908160.0, "grad_norm": 2.3710266890163068, "language_loss": 0.84601533, "learning_rate": 4.889182972587877e-07, "loss": 0.86784965, "num_input_tokens_seen": 139619030, "step": 6482, "time_per_iteration": 2.845139741897583 }, { "auxiliary_loss_clip": 0.01168823, "auxiliary_loss_mlp": 0.01023056, "balance_loss_clip": 1.05174899, "balance_loss_mlp": 1.01620746, "epoch": 0.7795346600132267, "flos": 21507080613120.0, "grad_norm": 1.7067473852078803, "language_loss": 0.66188276, "learning_rate": 4.884081063643177e-07, "loss": 0.68380153, "num_input_tokens_seen": 139637690, "step": 6483, "time_per_iteration": 2.8598744869232178 }, { "auxiliary_loss_clip": 0.01061175, "auxiliary_loss_mlp": 0.01003864, "balance_loss_clip": 1.01466024, "balance_loss_mlp": 1.00283289, "epoch": 0.7796549029038659, "flos": 70052273694720.0, "grad_norm": 0.838467840211791, "language_loss": 0.52450538, "learning_rate": 4.878981447771353e-07, "loss": 0.54515576, "num_input_tokens_seen": 139692070, "step": 6484, "time_per_iteration": 3.315763235092163 }, { "auxiliary_loss_clip": 0.01154444, "auxiliary_loss_mlp": 0.01032072, "balance_loss_clip": 1.04894769, "balance_loss_mlp": 1.02449632, "epoch": 0.7797751457945049, "flos": 23989405714560.0, "grad_norm": 1.5564900972731472, "language_loss": 0.7300939, "learning_rate": 4.873884125746035e-07, "loss": 0.75195903, "num_input_tokens_seen": 139713745, "step": 6485, "time_per_iteration": 2.892174005508423 }, { "auxiliary_loss_clip": 0.01154729, "auxiliary_loss_mlp": 0.01025764, "balance_loss_clip": 1.04545653, "balance_loss_mlp": 1.01843274, "epoch": 0.779895388685144, "flos": 22674751937280.0, "grad_norm": 3.0310849012948546, "language_loss": 0.72500122, "learning_rate": 4.868789098340456e-07, "loss": 0.74680614, "num_input_tokens_seen": 139731650, "step": 6486, "time_per_iteration": 2.730799436569214 }, { "auxiliary_loss_clip": 0.01158578, "auxiliary_loss_mlp": 0.01027162, "balance_loss_clip": 1.04721701, "balance_loss_mlp": 1.01994944, "epoch": 0.7800156315757831, "flos": 23768698596480.0, "grad_norm": 2.116113296555897, "language_loss": 0.73238212, "learning_rate": 4.863696366327543e-07, "loss": 0.75423956, "num_input_tokens_seen": 139750820, "step": 6487, "time_per_iteration": 2.8106536865234375 }, { "auxiliary_loss_clip": 0.01166937, "auxiliary_loss_mlp": 0.01019259, "balance_loss_clip": 1.04709518, "balance_loss_mlp": 1.01208246, "epoch": 0.7801358744664222, "flos": 26429714881920.0, "grad_norm": 1.8117908129488138, "language_loss": 0.77775896, "learning_rate": 4.85860593047986e-07, "loss": 0.79962087, "num_input_tokens_seen": 139770885, "step": 6488, "time_per_iteration": 2.7244772911071777 }, { "auxiliary_loss_clip": 0.01153322, "auxiliary_loss_mlp": 0.01024427, "balance_loss_clip": 1.04615474, "balance_loss_mlp": 1.01707172, "epoch": 0.7802561173570612, "flos": 26322162583680.0, "grad_norm": 1.6558647445947645, "language_loss": 0.74922413, "learning_rate": 4.853517791569613e-07, "loss": 0.77100158, "num_input_tokens_seen": 139793065, "step": 6489, "time_per_iteration": 2.880324125289917 }, { "auxiliary_loss_clip": 0.01166033, "auxiliary_loss_mlp": 0.0105058, "balance_loss_clip": 1.046561, "balance_loss_mlp": 1.01533389, "epoch": 0.7803763602477004, "flos": 40333751596800.0, "grad_norm": 1.9381682740178512, "language_loss": 0.66352105, "learning_rate": 4.848431950368684e-07, "loss": 0.68568718, "num_input_tokens_seen": 139815625, "step": 6490, "time_per_iteration": 2.9916372299194336 }, { "auxiliary_loss_clip": 0.01062109, "auxiliary_loss_mlp": 0.01032288, "balance_loss_clip": 1.00855923, "balance_loss_mlp": 0.99903852, "epoch": 0.7804966031383395, "flos": 67001448038400.0, "grad_norm": 0.7004523711011721, "language_loss": 0.55740052, "learning_rate": 4.843348407648569e-07, "loss": 0.5783444, "num_input_tokens_seen": 139876905, "step": 6491, "time_per_iteration": 3.2421255111694336 }, { "auxiliary_loss_clip": 0.01167632, "auxiliary_loss_mlp": 0.01028378, "balance_loss_clip": 1.04585087, "balance_loss_mlp": 1.02075458, "epoch": 0.7806168460289785, "flos": 17740733057280.0, "grad_norm": 4.039159002129423, "language_loss": 0.83165139, "learning_rate": 4.838267164180457e-07, "loss": 0.85361147, "num_input_tokens_seen": 139892575, "step": 6492, "time_per_iteration": 2.844778537750244 }, { "auxiliary_loss_clip": 0.0116897, "auxiliary_loss_mlp": 0.01024766, "balance_loss_clip": 1.04642439, "balance_loss_mlp": 1.01764381, "epoch": 0.7807370889196176, "flos": 23946240545280.0, "grad_norm": 1.8854222642664509, "language_loss": 0.83926463, "learning_rate": 4.833188220735156e-07, "loss": 0.861202, "num_input_tokens_seen": 139912245, "step": 6493, "time_per_iteration": 3.7649831771850586 }, { "auxiliary_loss_clip": 0.0115931, "auxiliary_loss_mlp": 0.01023176, "balance_loss_clip": 1.04471099, "balance_loss_mlp": 1.01608002, "epoch": 0.7808573318102567, "flos": 18989024457600.0, "grad_norm": 2.1261913959942116, "language_loss": 0.74877566, "learning_rate": 4.828111578083152e-07, "loss": 0.77060056, "num_input_tokens_seen": 139929150, "step": 6494, "time_per_iteration": 2.8770558834075928 }, { "auxiliary_loss_clip": 0.01158691, "auxiliary_loss_mlp": 0.01023136, "balance_loss_clip": 1.04840982, "balance_loss_mlp": 1.01607895, "epoch": 0.7809775747008958, "flos": 23980750536960.0, "grad_norm": 2.0291454282468413, "language_loss": 0.81210446, "learning_rate": 4.823037236994556e-07, "loss": 0.83392274, "num_input_tokens_seen": 139947315, "step": 6495, "time_per_iteration": 3.9535837173461914 }, { "auxiliary_loss_clip": 0.01062305, "auxiliary_loss_mlp": 0.01001663, "balance_loss_clip": 1.00816154, "balance_loss_mlp": 1.00067961, "epoch": 0.7810978175915348, "flos": 68535875180160.0, "grad_norm": 0.7169620391798551, "language_loss": 0.56313038, "learning_rate": 4.817965198239136e-07, "loss": 0.58377004, "num_input_tokens_seen": 140013775, "step": 6496, "time_per_iteration": 3.3051252365112305 }, { "auxiliary_loss_clip": 0.0115608, "auxiliary_loss_mlp": 0.01026741, "balance_loss_clip": 1.04461777, "balance_loss_mlp": 1.01900399, "epoch": 0.781218060482174, "flos": 19642131498240.0, "grad_norm": 2.2133556308256033, "language_loss": 0.74261475, "learning_rate": 4.812895462586331e-07, "loss": 0.76444304, "num_input_tokens_seen": 140031600, "step": 6497, "time_per_iteration": 2.8497211933135986 }, { "auxiliary_loss_clip": 0.0115893, "auxiliary_loss_mlp": 0.01024938, "balance_loss_clip": 1.04562473, "balance_loss_mlp": 1.0178957, "epoch": 0.7813383033728131, "flos": 25627865621760.0, "grad_norm": 2.1718380821205723, "language_loss": 0.81728464, "learning_rate": 4.807828030805207e-07, "loss": 0.83912331, "num_input_tokens_seen": 140050590, "step": 6498, "time_per_iteration": 3.8231990337371826 }, { "auxiliary_loss_clip": 0.01158668, "auxiliary_loss_mlp": 0.01018053, "balance_loss_clip": 1.04490566, "balance_loss_mlp": 1.01039982, "epoch": 0.7814585462634521, "flos": 20485924865280.0, "grad_norm": 1.8438199483206728, "language_loss": 0.68022943, "learning_rate": 4.802762903664495e-07, "loss": 0.70199662, "num_input_tokens_seen": 140069770, "step": 6499, "time_per_iteration": 2.786348342895508 }, { "auxiliary_loss_clip": 0.01167317, "auxiliary_loss_mlp": 0.01026827, "balance_loss_clip": 1.04943037, "balance_loss_mlp": 1.01925993, "epoch": 0.7815787891540913, "flos": 22304297018880.0, "grad_norm": 2.4629685665847685, "language_loss": 0.741054, "learning_rate": 4.797700081932565e-07, "loss": 0.76299548, "num_input_tokens_seen": 140087635, "step": 6500, "time_per_iteration": 2.7704811096191406 }, { "auxiliary_loss_clip": 0.01150875, "auxiliary_loss_mlp": 0.01022655, "balance_loss_clip": 1.04878807, "balance_loss_mlp": 1.01562774, "epoch": 0.7816990320447303, "flos": 22600668136320.0, "grad_norm": 2.8203728685844567, "language_loss": 0.81842387, "learning_rate": 4.792639566377442e-07, "loss": 0.84015924, "num_input_tokens_seen": 140105045, "step": 6501, "time_per_iteration": 2.861663579940796 }, { "auxiliary_loss_clip": 0.01159487, "auxiliary_loss_mlp": 0.01027659, "balance_loss_clip": 1.04620552, "balance_loss_mlp": 1.0199759, "epoch": 0.7818192749353694, "flos": 24935974871040.0, "grad_norm": 1.8786670800459695, "language_loss": 0.7772463, "learning_rate": 4.78758135776681e-07, "loss": 0.7991178, "num_input_tokens_seen": 140124900, "step": 6502, "time_per_iteration": 2.7891876697540283 }, { "auxiliary_loss_clip": 0.01162608, "auxiliary_loss_mlp": 0.01028406, "balance_loss_clip": 1.04751325, "balance_loss_mlp": 1.0211277, "epoch": 0.7819395178260086, "flos": 23733039369600.0, "grad_norm": 2.355505728313874, "language_loss": 0.79113567, "learning_rate": 4.782525456867989e-07, "loss": 0.8130458, "num_input_tokens_seen": 140143755, "step": 6503, "time_per_iteration": 3.755754232406616 }, { "auxiliary_loss_clip": 0.01159561, "auxiliary_loss_mlp": 0.01025204, "balance_loss_clip": 1.04733622, "balance_loss_mlp": 1.01745534, "epoch": 0.7820597607166476, "flos": 23221671396480.0, "grad_norm": 1.6515475487125675, "language_loss": 0.83141398, "learning_rate": 4.777471864447959e-07, "loss": 0.85326171, "num_input_tokens_seen": 140164495, "step": 6504, "time_per_iteration": 2.734855890274048 }, { "auxiliary_loss_clip": 0.01164138, "auxiliary_loss_mlp": 0.01022643, "balance_loss_clip": 1.04845846, "balance_loss_mlp": 1.01503754, "epoch": 0.7821800036072867, "flos": 22309540404480.0, "grad_norm": 2.338053581955887, "language_loss": 0.80675334, "learning_rate": 4.772420581273344e-07, "loss": 0.82862115, "num_input_tokens_seen": 140181980, "step": 6505, "time_per_iteration": 2.7294931411743164 }, { "auxiliary_loss_clip": 0.01157384, "auxiliary_loss_mlp": 0.01024398, "balance_loss_clip": 1.04483414, "balance_loss_mlp": 1.01726902, "epoch": 0.7823002464979258, "flos": 21544176384000.0, "grad_norm": 2.062047315479726, "language_loss": 0.76033962, "learning_rate": 4.7673716081104134e-07, "loss": 0.78215742, "num_input_tokens_seen": 140202155, "step": 6506, "time_per_iteration": 2.8143744468688965 }, { "auxiliary_loss_clip": 0.01163168, "auxiliary_loss_mlp": 0.01023993, "balance_loss_clip": 1.04848325, "balance_loss_mlp": 1.01709056, "epoch": 0.7824204893885649, "flos": 24535642815360.0, "grad_norm": 2.0960069807993653, "language_loss": 0.84300768, "learning_rate": 4.762324945725109e-07, "loss": 0.86487937, "num_input_tokens_seen": 140221600, "step": 6507, "time_per_iteration": 2.718777656555176 }, { "auxiliary_loss_clip": 0.01160055, "auxiliary_loss_mlp": 0.01028294, "balance_loss_clip": 1.04957068, "balance_loss_mlp": 1.02163649, "epoch": 0.782540732279204, "flos": 27415211402880.0, "grad_norm": 1.6770131100055863, "language_loss": 0.75937033, "learning_rate": 4.7572805948829844e-07, "loss": 0.78125381, "num_input_tokens_seen": 140241860, "step": 6508, "time_per_iteration": 2.7822952270507812 }, { "auxiliary_loss_clip": 0.01158194, "auxiliary_loss_mlp": 0.0102175, "balance_loss_clip": 1.04614592, "balance_loss_mlp": 1.014925, "epoch": 0.7826609751698431, "flos": 24353216616960.0, "grad_norm": 1.8633734135737254, "language_loss": 0.70671391, "learning_rate": 4.7522385563492795e-07, "loss": 0.7285133, "num_input_tokens_seen": 140262160, "step": 6509, "time_per_iteration": 2.812110424041748 }, { "auxiliary_loss_clip": 0.01162492, "auxiliary_loss_mlp": 0.01024081, "balance_loss_clip": 1.04872799, "balance_loss_mlp": 1.0167762, "epoch": 0.7827812180604822, "flos": 23988543788160.0, "grad_norm": 2.054948939476398, "language_loss": 0.6993311, "learning_rate": 4.747198830888863e-07, "loss": 0.72119689, "num_input_tokens_seen": 140282030, "step": 6510, "time_per_iteration": 2.8381571769714355 }, { "auxiliary_loss_clip": 0.01154919, "auxiliary_loss_mlp": 0.01023548, "balance_loss_clip": 1.04547, "balance_loss_mlp": 1.01654792, "epoch": 0.7829014609511212, "flos": 27454318335360.0, "grad_norm": 1.8559798445428572, "language_loss": 0.68764818, "learning_rate": 4.742161419266251e-07, "loss": 0.70943284, "num_input_tokens_seen": 140301190, "step": 6511, "time_per_iteration": 2.7234575748443604 }, { "auxiliary_loss_clip": 0.01171135, "auxiliary_loss_mlp": 0.01027633, "balance_loss_clip": 1.05000341, "balance_loss_mlp": 1.02026021, "epoch": 0.7830217038417604, "flos": 29204532432000.0, "grad_norm": 2.6128646923786616, "language_loss": 0.64830524, "learning_rate": 4.7371263222456304e-07, "loss": 0.67029285, "num_input_tokens_seen": 140318510, "step": 6512, "time_per_iteration": 2.7342684268951416 }, { "auxiliary_loss_clip": 0.0105857, "auxiliary_loss_mlp": 0.01001434, "balance_loss_clip": 1.00919271, "balance_loss_mlp": 1.00047481, "epoch": 0.7831419467323995, "flos": 60950895822720.0, "grad_norm": 0.8060574544603848, "language_loss": 0.61344111, "learning_rate": 4.7320935405908004e-07, "loss": 0.63404119, "num_input_tokens_seen": 140379380, "step": 6513, "time_per_iteration": 3.243960380554199 }, { "auxiliary_loss_clip": 0.01170394, "auxiliary_loss_mlp": 0.01026385, "balance_loss_clip": 1.04777431, "balance_loss_mlp": 1.01861262, "epoch": 0.7832621896230385, "flos": 19682531320320.0, "grad_norm": 2.28594242257255, "language_loss": 0.84175837, "learning_rate": 4.7270630750652475e-07, "loss": 0.8637262, "num_input_tokens_seen": 140395335, "step": 6514, "time_per_iteration": 2.6519010066986084 }, { "auxiliary_loss_clip": 0.01159772, "auxiliary_loss_mlp": 0.01026334, "balance_loss_clip": 1.04479933, "balance_loss_mlp": 1.01871705, "epoch": 0.7833824325136777, "flos": 25009232659200.0, "grad_norm": 1.7186439901474306, "language_loss": 0.80622733, "learning_rate": 4.7220349264320746e-07, "loss": 0.8280884, "num_input_tokens_seen": 140414420, "step": 6515, "time_per_iteration": 2.8266119956970215 }, { "auxiliary_loss_clip": 0.01060953, "auxiliary_loss_mlp": 0.01002996, "balance_loss_clip": 1.00866818, "balance_loss_mlp": 1.00201273, "epoch": 0.7835026754043167, "flos": 68800142517120.0, "grad_norm": 0.7368095171996333, "language_loss": 0.54868972, "learning_rate": 4.71700909545407e-07, "loss": 0.56932926, "num_input_tokens_seen": 140477365, "step": 6516, "time_per_iteration": 3.2543623447418213 }, { "auxiliary_loss_clip": 0.01163876, "auxiliary_loss_mlp": 0.01026078, "balance_loss_clip": 1.04538727, "balance_loss_mlp": 1.01877654, "epoch": 0.7836229182949558, "flos": 19864598382720.0, "grad_norm": 2.0073655530123165, "language_loss": 0.76801312, "learning_rate": 4.711985582893627e-07, "loss": 0.78991264, "num_input_tokens_seen": 140495885, "step": 6517, "time_per_iteration": 2.7455086708068848 }, { "auxiliary_loss_clip": 0.01156933, "auxiliary_loss_mlp": 0.0102242, "balance_loss_clip": 1.04592872, "balance_loss_mlp": 1.01478457, "epoch": 0.783743161185595, "flos": 22965843755520.0, "grad_norm": 1.7611540426203942, "language_loss": 0.71960491, "learning_rate": 4.706964389512811e-07, "loss": 0.74139845, "num_input_tokens_seen": 140515920, "step": 6518, "time_per_iteration": 2.859858989715576 }, { "auxiliary_loss_clip": 0.01166259, "auxiliary_loss_mlp": 0.01017527, "balance_loss_clip": 1.04895353, "balance_loss_mlp": 1.00992405, "epoch": 0.783863404076234, "flos": 12458489777280.0, "grad_norm": 2.0411131175039783, "language_loss": 0.8780334, "learning_rate": 4.701945516073345e-07, "loss": 0.89987123, "num_input_tokens_seen": 140533395, "step": 6519, "time_per_iteration": 3.6530983448028564 }, { "auxiliary_loss_clip": 0.01154542, "auxiliary_loss_mlp": 0.01027592, "balance_loss_clip": 1.04637229, "balance_loss_mlp": 1.02007031, "epoch": 0.7839836469668731, "flos": 24243940465920.0, "grad_norm": 1.9285764440497053, "language_loss": 0.75564092, "learning_rate": 4.696928963336577e-07, "loss": 0.77746224, "num_input_tokens_seen": 140552825, "step": 6520, "time_per_iteration": 2.7857532501220703 }, { "auxiliary_loss_clip": 0.01058294, "auxiliary_loss_mlp": 0.01002467, "balance_loss_clip": 1.00895047, "balance_loss_mlp": 1.00135827, "epoch": 0.7841038898575122, "flos": 62121978938880.0, "grad_norm": 0.8580955802801088, "language_loss": 0.6105237, "learning_rate": 4.6919147320635224e-07, "loss": 0.63113129, "num_input_tokens_seen": 140615535, "step": 6521, "time_per_iteration": 4.1628358364105225 }, { "auxiliary_loss_clip": 0.01167517, "auxiliary_loss_mlp": 0.01022907, "balance_loss_clip": 1.04882336, "balance_loss_mlp": 1.01614237, "epoch": 0.7842241327481513, "flos": 20193899293440.0, "grad_norm": 3.478608896690843, "language_loss": 0.73178732, "learning_rate": 4.6869028230148286e-07, "loss": 0.75369155, "num_input_tokens_seen": 140633330, "step": 6522, "time_per_iteration": 2.694605588912964 }, { "auxiliary_loss_clip": 0.01155323, "auxiliary_loss_mlp": 0.01026857, "balance_loss_clip": 1.04612124, "balance_loss_mlp": 1.01931667, "epoch": 0.7843443756387903, "flos": 28074531496320.0, "grad_norm": 2.7049839818690127, "language_loss": 0.59769744, "learning_rate": 4.6818932369507957e-07, "loss": 0.61951917, "num_input_tokens_seen": 140652830, "step": 6523, "time_per_iteration": 2.7794806957244873 }, { "auxiliary_loss_clip": 0.01164935, "auxiliary_loss_mlp": 0.01026576, "balance_loss_clip": 1.04887414, "balance_loss_mlp": 1.01882172, "epoch": 0.7844646185294295, "flos": 21323397438720.0, "grad_norm": 2.0549328292947884, "language_loss": 0.89355528, "learning_rate": 4.676885974631386e-07, "loss": 0.91547036, "num_input_tokens_seen": 140671190, "step": 6524, "time_per_iteration": 3.5936591625213623 }, { "auxiliary_loss_clip": 0.01166088, "auxiliary_loss_mlp": 0.01025029, "balance_loss_clip": 1.04876053, "balance_loss_mlp": 1.01736414, "epoch": 0.7845848614200686, "flos": 23656585271040.0, "grad_norm": 2.028120055937718, "language_loss": 0.80984581, "learning_rate": 4.67188103681619e-07, "loss": 0.83175695, "num_input_tokens_seen": 140690975, "step": 6525, "time_per_iteration": 2.787421226501465 }, { "auxiliary_loss_clip": 0.01156372, "auxiliary_loss_mlp": 0.01060378, "balance_loss_clip": 1.04545617, "balance_loss_mlp": 1.02457166, "epoch": 0.7847051043107076, "flos": 23402194174080.0, "grad_norm": 2.042302655675883, "language_loss": 0.69445932, "learning_rate": 4.666878424264453e-07, "loss": 0.71662682, "num_input_tokens_seen": 140710930, "step": 6526, "time_per_iteration": 2.703920364379883 }, { "auxiliary_loss_clip": 0.01154033, "auxiliary_loss_mlp": 0.01024737, "balance_loss_clip": 1.04801691, "balance_loss_mlp": 1.01797152, "epoch": 0.7848253472013467, "flos": 19022277473280.0, "grad_norm": 1.57168952622339, "language_loss": 0.73797131, "learning_rate": 4.661878137735069e-07, "loss": 0.75975895, "num_input_tokens_seen": 140729120, "step": 6527, "time_per_iteration": 2.7781615257263184 }, { "auxiliary_loss_clip": 0.01158905, "auxiliary_loss_mlp": 0.01021624, "balance_loss_clip": 1.04817128, "balance_loss_mlp": 1.01469779, "epoch": 0.7849455900919858, "flos": 21179180332800.0, "grad_norm": 1.8088918941559207, "language_loss": 0.74901164, "learning_rate": 4.656880177986571e-07, "loss": 0.77081686, "num_input_tokens_seen": 140747665, "step": 6528, "time_per_iteration": 2.782731294631958 }, { "auxiliary_loss_clip": 0.01166377, "auxiliary_loss_mlp": 0.0102657, "balance_loss_clip": 1.04815078, "balance_loss_mlp": 1.01889896, "epoch": 0.7850658329826249, "flos": 19536482620800.0, "grad_norm": 2.038536585491541, "language_loss": 0.81629485, "learning_rate": 4.6518845457771607e-07, "loss": 0.83822429, "num_input_tokens_seen": 140766525, "step": 6529, "time_per_iteration": 3.5677132606506348 }, { "auxiliary_loss_clip": 0.01160764, "auxiliary_loss_mlp": 0.0105455, "balance_loss_clip": 1.0492357, "balance_loss_mlp": 1.01691937, "epoch": 0.7851860758732639, "flos": 12495334152960.0, "grad_norm": 1.8947348605973504, "language_loss": 0.79769683, "learning_rate": 4.646891241864652e-07, "loss": 0.81984997, "num_input_tokens_seen": 140785090, "step": 6530, "time_per_iteration": 2.7108547687530518 }, { "auxiliary_loss_clip": 0.01163348, "auxiliary_loss_mlp": 0.01024176, "balance_loss_clip": 1.04720736, "balance_loss_mlp": 1.01696432, "epoch": 0.7853063187639031, "flos": 22960959505920.0, "grad_norm": 1.8740724942830473, "language_loss": 0.7301625, "learning_rate": 4.6419002670065397e-07, "loss": 0.75203776, "num_input_tokens_seen": 140804670, "step": 6531, "time_per_iteration": 2.6678903102874756 }, { "auxiliary_loss_clip": 0.01160837, "auxiliary_loss_mlp": 0.01023984, "balance_loss_clip": 1.04812074, "balance_loss_mlp": 1.01684928, "epoch": 0.7854265616545422, "flos": 17347260499200.0, "grad_norm": 2.0709088093213603, "language_loss": 0.86582965, "learning_rate": 4.6369116219599445e-07, "loss": 0.88767791, "num_input_tokens_seen": 140820655, "step": 6532, "time_per_iteration": 2.7540054321289062 }, { "auxiliary_loss_clip": 0.01155604, "auxiliary_loss_mlp": 0.01026625, "balance_loss_clip": 1.04535913, "balance_loss_mlp": 1.01968098, "epoch": 0.7855468045451812, "flos": 23838293197440.0, "grad_norm": 1.744004191693069, "language_loss": 0.79106605, "learning_rate": 4.631925307481637e-07, "loss": 0.81288826, "num_input_tokens_seen": 140840470, "step": 6533, "time_per_iteration": 2.745063066482544 }, { "auxiliary_loss_clip": 0.01158976, "auxiliary_loss_mlp": 0.01024499, "balance_loss_clip": 1.048316, "balance_loss_mlp": 1.01777875, "epoch": 0.7856670474358204, "flos": 25666792986240.0, "grad_norm": 2.533259593847322, "language_loss": 0.75511724, "learning_rate": 4.6269413243280533e-07, "loss": 0.77695203, "num_input_tokens_seen": 140859890, "step": 6534, "time_per_iteration": 2.822415828704834 }, { "auxiliary_loss_clip": 0.01165015, "auxiliary_loss_mlp": 0.01025833, "balance_loss_clip": 1.04947305, "balance_loss_mlp": 1.017766, "epoch": 0.7857872903264594, "flos": 18144656472960.0, "grad_norm": 2.480699230629625, "language_loss": 0.74313623, "learning_rate": 4.621959673255236e-07, "loss": 0.76504469, "num_input_tokens_seen": 140876190, "step": 6535, "time_per_iteration": 2.75105619430542 }, { "auxiliary_loss_clip": 0.01156551, "auxiliary_loss_mlp": 0.01026523, "balance_loss_clip": 1.04870319, "balance_loss_mlp": 1.01870883, "epoch": 0.7859075332170985, "flos": 14386138081920.0, "grad_norm": 2.7491618060843996, "language_loss": 0.90567601, "learning_rate": 4.6169803550189135e-07, "loss": 0.92750674, "num_input_tokens_seen": 140891885, "step": 6536, "time_per_iteration": 2.815164566040039 }, { "auxiliary_loss_clip": 0.01149051, "auxiliary_loss_mlp": 0.01023651, "balance_loss_clip": 1.04849303, "balance_loss_mlp": 1.01609337, "epoch": 0.7860277761077377, "flos": 19864059678720.0, "grad_norm": 2.1174859890128976, "language_loss": 0.77494949, "learning_rate": 4.6120033703744355e-07, "loss": 0.79667652, "num_input_tokens_seen": 140910780, "step": 6537, "time_per_iteration": 2.8869266510009766 }, { "auxiliary_loss_clip": 0.01150773, "auxiliary_loss_mlp": 0.01029659, "balance_loss_clip": 1.04519629, "balance_loss_mlp": 1.02198744, "epoch": 0.7861480189983767, "flos": 26396174557440.0, "grad_norm": 1.8080959667229164, "language_loss": 0.7847532, "learning_rate": 4.607028720076822e-07, "loss": 0.80655748, "num_input_tokens_seen": 140927460, "step": 6538, "time_per_iteration": 2.7870635986328125 }, { "auxiliary_loss_clip": 0.01162409, "auxiliary_loss_mlp": 0.01026736, "balance_loss_clip": 1.04733133, "balance_loss_mlp": 1.01961875, "epoch": 0.7862682618890158, "flos": 24236578177920.0, "grad_norm": 2.059608285772728, "language_loss": 0.7364589, "learning_rate": 4.6020564048807074e-07, "loss": 0.75835031, "num_input_tokens_seen": 140945135, "step": 6539, "time_per_iteration": 2.8305349349975586 }, { "auxiliary_loss_clip": 0.01166109, "auxiliary_loss_mlp": 0.01025329, "balance_loss_clip": 1.04779482, "balance_loss_mlp": 1.01791453, "epoch": 0.7863885047796549, "flos": 47551508259840.0, "grad_norm": 19.33054556629981, "language_loss": 0.72043729, "learning_rate": 4.5970864255403883e-07, "loss": 0.74235165, "num_input_tokens_seen": 140966660, "step": 6540, "time_per_iteration": 2.960907220840454 }, { "auxiliary_loss_clip": 0.0115542, "auxiliary_loss_mlp": 0.0102279, "balance_loss_clip": 1.04711163, "balance_loss_mlp": 1.01564336, "epoch": 0.786508747670294, "flos": 24389234979840.0, "grad_norm": 1.895413495168468, "language_loss": 0.81886971, "learning_rate": 4.59211878280982e-07, "loss": 0.84065181, "num_input_tokens_seen": 140986175, "step": 6541, "time_per_iteration": 2.7668399810791016 }, { "auxiliary_loss_clip": 0.01162377, "auxiliary_loss_mlp": 0.01023988, "balance_loss_clip": 1.04718435, "balance_loss_mlp": 1.01652837, "epoch": 0.786628990560933, "flos": 18041234238720.0, "grad_norm": 2.527789059725602, "language_loss": 0.70105588, "learning_rate": 4.587153477442578e-07, "loss": 0.72291952, "num_input_tokens_seen": 141002490, "step": 6542, "time_per_iteration": 2.8408896923065186 }, { "auxiliary_loss_clip": 0.01171653, "auxiliary_loss_mlp": 0.01025328, "balance_loss_clip": 1.04938483, "balance_loss_mlp": 1.01723945, "epoch": 0.7867492334515722, "flos": 25848860048640.0, "grad_norm": 3.7497403102701514, "language_loss": 0.81284052, "learning_rate": 4.582190510191899e-07, "loss": 0.83481032, "num_input_tokens_seen": 141021150, "step": 6543, "time_per_iteration": 2.8366384506225586 }, { "auxiliary_loss_clip": 0.01153602, "auxiliary_loss_mlp": 0.01026651, "balance_loss_clip": 1.04768741, "balance_loss_mlp": 1.01970673, "epoch": 0.7868694763422113, "flos": 16580819070720.0, "grad_norm": 2.123043784207104, "language_loss": 0.8713479, "learning_rate": 4.5772298818106625e-07, "loss": 0.89315045, "num_input_tokens_seen": 141036940, "step": 6544, "time_per_iteration": 2.7308766841888428 }, { "auxiliary_loss_clip": 0.0116462, "auxiliary_loss_mlp": 0.010283, "balance_loss_clip": 1.04954958, "balance_loss_mlp": 1.0205518, "epoch": 0.7869897192328503, "flos": 29386276272000.0, "grad_norm": 6.503724012142287, "language_loss": 0.72035098, "learning_rate": 4.572271593051384e-07, "loss": 0.74228013, "num_input_tokens_seen": 141054295, "step": 6545, "time_per_iteration": 3.720149040222168 }, { "auxiliary_loss_clip": 0.011486, "auxiliary_loss_mlp": 0.01018985, "balance_loss_clip": 1.04849172, "balance_loss_mlp": 1.0113852, "epoch": 0.7871099621234895, "flos": 17128923678720.0, "grad_norm": 1.815247925167966, "language_loss": 0.78178328, "learning_rate": 4.567315644666245e-07, "loss": 0.80345917, "num_input_tokens_seen": 141073090, "step": 6546, "time_per_iteration": 2.8613390922546387 }, { "auxiliary_loss_clip": 0.01150794, "auxiliary_loss_mlp": 0.01025637, "balance_loss_clip": 1.04705811, "balance_loss_mlp": 1.01868451, "epoch": 0.7872302050141285, "flos": 23440187784960.0, "grad_norm": 2.066203605942187, "language_loss": 0.84556401, "learning_rate": 4.5623620374070507e-07, "loss": 0.86732829, "num_input_tokens_seen": 141092405, "step": 6547, "time_per_iteration": 3.8087527751922607 }, { "auxiliary_loss_clip": 0.01062431, "auxiliary_loss_mlp": 0.01002318, "balance_loss_clip": 1.01174402, "balance_loss_mlp": 1.00136423, "epoch": 0.7873504479047676, "flos": 65959752689280.0, "grad_norm": 0.7658954607321421, "language_loss": 0.58428037, "learning_rate": 4.557410772025263e-07, "loss": 0.60492778, "num_input_tokens_seen": 141154355, "step": 6548, "time_per_iteration": 3.4756617546081543 }, { "auxiliary_loss_clip": 0.01158772, "auxiliary_loss_mlp": 0.01023336, "balance_loss_clip": 1.04759765, "balance_loss_mlp": 1.01571274, "epoch": 0.7874706907954068, "flos": 23258336204160.0, "grad_norm": 2.280214871046874, "language_loss": 0.66399032, "learning_rate": 4.5524618492719803e-07, "loss": 0.6858114, "num_input_tokens_seen": 141173575, "step": 6549, "time_per_iteration": 2.7652037143707275 }, { "auxiliary_loss_clip": 0.01163093, "auxiliary_loss_mlp": 0.0102178, "balance_loss_clip": 1.04520273, "balance_loss_mlp": 1.01491404, "epoch": 0.7875909336860458, "flos": 28767786963840.0, "grad_norm": 1.6687438462727608, "language_loss": 0.78934306, "learning_rate": 4.54751526989795e-07, "loss": 0.8111918, "num_input_tokens_seen": 141195415, "step": 6550, "time_per_iteration": 3.669980764389038 }, { "auxiliary_loss_clip": 0.0116671, "auxiliary_loss_mlp": 0.01022515, "balance_loss_clip": 1.04687214, "balance_loss_mlp": 1.01506472, "epoch": 0.7877111765766849, "flos": 18697286194560.0, "grad_norm": 2.0637476876232053, "language_loss": 0.78855515, "learning_rate": 4.5425710346535775e-07, "loss": 0.81044739, "num_input_tokens_seen": 141213360, "step": 6551, "time_per_iteration": 2.662713050842285 }, { "auxiliary_loss_clip": 0.01165205, "auxiliary_loss_mlp": 0.01028093, "balance_loss_clip": 1.04604542, "balance_loss_mlp": 1.02080941, "epoch": 0.787831419467324, "flos": 27592968833280.0, "grad_norm": 2.198008872850402, "language_loss": 0.82069582, "learning_rate": 4.537629144288877e-07, "loss": 0.84262878, "num_input_tokens_seen": 141230815, "step": 6552, "time_per_iteration": 2.756843328475952 }, { "auxiliary_loss_clip": 0.01160741, "auxiliary_loss_mlp": 0.01026711, "balance_loss_clip": 1.04675484, "balance_loss_mlp": 1.01957631, "epoch": 0.7879516623579631, "flos": 18150187167360.0, "grad_norm": 3.250494201164931, "language_loss": 0.74803841, "learning_rate": 4.5326895995535477e-07, "loss": 0.76991284, "num_input_tokens_seen": 141249715, "step": 6553, "time_per_iteration": 2.714068651199341 }, { "auxiliary_loss_clip": 0.01162209, "auxiliary_loss_mlp": 0.01025467, "balance_loss_clip": 1.04659152, "balance_loss_mlp": 1.01820087, "epoch": 0.7880719052486022, "flos": 20339193807360.0, "grad_norm": 2.401455558602021, "language_loss": 0.83965862, "learning_rate": 4.527752401196907e-07, "loss": 0.86153537, "num_input_tokens_seen": 141267730, "step": 6554, "time_per_iteration": 2.7265732288360596 }, { "auxiliary_loss_clip": 0.0115831, "auxiliary_loss_mlp": 0.0102101, "balance_loss_clip": 1.04928803, "balance_loss_mlp": 1.01308239, "epoch": 0.7881921481392413, "flos": 21653237053440.0, "grad_norm": 1.8276112981009818, "language_loss": 0.67106307, "learning_rate": 4.5228175499679254e-07, "loss": 0.69285631, "num_input_tokens_seen": 141287315, "step": 6555, "time_per_iteration": 3.600846767425537 }, { "auxiliary_loss_clip": 0.01061097, "auxiliary_loss_mlp": 0.01002032, "balance_loss_clip": 1.00798559, "balance_loss_mlp": 1.00104237, "epoch": 0.7883123910298804, "flos": 68565860058240.0, "grad_norm": 0.810344827898888, "language_loss": 0.54473609, "learning_rate": 4.5178850466152174e-07, "loss": 0.5653674, "num_input_tokens_seen": 141346145, "step": 6556, "time_per_iteration": 3.2433831691741943 }, { "auxiliary_loss_clip": 0.01157513, "auxiliary_loss_mlp": 0.0102378, "balance_loss_clip": 1.04638076, "balance_loss_mlp": 1.01650798, "epoch": 0.7884326339205194, "flos": 19318217627520.0, "grad_norm": 1.949141836724702, "language_loss": 0.81977797, "learning_rate": 4.512954891887031e-07, "loss": 0.84159094, "num_input_tokens_seen": 141364445, "step": 6557, "time_per_iteration": 2.685138702392578 }, { "auxiliary_loss_clip": 0.01153744, "auxiliary_loss_mlp": 0.01026575, "balance_loss_clip": 1.04600394, "balance_loss_mlp": 1.01904726, "epoch": 0.7885528768111585, "flos": 17784903807360.0, "grad_norm": 2.4363576181176447, "language_loss": 0.8368752, "learning_rate": 4.5080270865312806e-07, "loss": 0.8586784, "num_input_tokens_seen": 141381640, "step": 6558, "time_per_iteration": 2.684016466140747 }, { "auxiliary_loss_clip": 0.01163338, "auxiliary_loss_mlp": 0.01025508, "balance_loss_clip": 1.04627967, "balance_loss_mlp": 1.01811743, "epoch": 0.7886731197017977, "flos": 18807639753600.0, "grad_norm": 2.325863955515972, "language_loss": 0.71261072, "learning_rate": 4.5031016312954985e-07, "loss": 0.73449922, "num_input_tokens_seen": 141399955, "step": 6559, "time_per_iteration": 2.835887908935547 }, { "auxiliary_loss_clip": 0.01170933, "auxiliary_loss_mlp": 0.01022301, "balance_loss_clip": 1.04926908, "balance_loss_mlp": 1.01489258, "epoch": 0.7887933625924367, "flos": 33365358126720.0, "grad_norm": 1.8976987820290845, "language_loss": 0.74521744, "learning_rate": 4.498178526926886e-07, "loss": 0.76714981, "num_input_tokens_seen": 141420820, "step": 6560, "time_per_iteration": 2.84462833404541 }, { "auxiliary_loss_clip": 0.01168166, "auxiliary_loss_mlp": 0.01028183, "balance_loss_clip": 1.04890323, "balance_loss_mlp": 1.02093852, "epoch": 0.7889136054830758, "flos": 17019360218880.0, "grad_norm": 2.751681749870934, "language_loss": 0.71993518, "learning_rate": 4.4932577741722635e-07, "loss": 0.74189866, "num_input_tokens_seen": 141439350, "step": 6561, "time_per_iteration": 2.604199171066284 }, { "auxiliary_loss_clip": 0.01159225, "auxiliary_loss_mlp": 0.01021616, "balance_loss_clip": 1.04797173, "balance_loss_mlp": 1.01409996, "epoch": 0.7890338483737149, "flos": 29424629018880.0, "grad_norm": 1.7529875923834564, "language_loss": 0.74223876, "learning_rate": 4.4883393737780985e-07, "loss": 0.76404721, "num_input_tokens_seen": 141460300, "step": 6562, "time_per_iteration": 2.7714929580688477 }, { "auxiliary_loss_clip": 0.0115685, "auxiliary_loss_mlp": 0.01026053, "balance_loss_clip": 1.04516244, "balance_loss_mlp": 1.01859021, "epoch": 0.789154091264354, "flos": 19971576063360.0, "grad_norm": 2.017881476208678, "language_loss": 0.78109932, "learning_rate": 4.4834233264905254e-07, "loss": 0.80292833, "num_input_tokens_seen": 141477315, "step": 6563, "time_per_iteration": 2.658582925796509 }, { "auxiliary_loss_clip": 0.01154829, "auxiliary_loss_mlp": 0.0103018, "balance_loss_clip": 1.04919624, "balance_loss_mlp": 1.02289915, "epoch": 0.789274334154993, "flos": 14537825216640.0, "grad_norm": 2.433901237256121, "language_loss": 0.71617687, "learning_rate": 4.478509633055294e-07, "loss": 0.73802692, "num_input_tokens_seen": 141495025, "step": 6564, "time_per_iteration": 2.753970146179199 }, { "auxiliary_loss_clip": 0.01164285, "auxiliary_loss_mlp": 0.01024582, "balance_loss_clip": 1.04818225, "balance_loss_mlp": 1.01701844, "epoch": 0.7893945770456322, "flos": 21827403123840.0, "grad_norm": 2.19365505650379, "language_loss": 0.80637288, "learning_rate": 4.473598294217813e-07, "loss": 0.82826161, "num_input_tokens_seen": 141510450, "step": 6565, "time_per_iteration": 2.626716375350952 }, { "auxiliary_loss_clip": 0.01159024, "auxiliary_loss_mlp": 0.01025518, "balance_loss_clip": 1.04492211, "balance_loss_mlp": 1.01863337, "epoch": 0.7895148199362713, "flos": 20740639184640.0, "grad_norm": 2.0269069988874966, "language_loss": 0.71495378, "learning_rate": 4.468689310723124e-07, "loss": 0.73679924, "num_input_tokens_seen": 141528265, "step": 6566, "time_per_iteration": 2.6907272338867188 }, { "auxiliary_loss_clip": 0.01160935, "auxiliary_loss_mlp": 0.01022255, "balance_loss_clip": 1.04726648, "balance_loss_mlp": 1.01511443, "epoch": 0.7896350628269103, "flos": 16690669839360.0, "grad_norm": 1.7052295312433243, "language_loss": 0.78587282, "learning_rate": 4.463782683315913e-07, "loss": 0.80770469, "num_input_tokens_seen": 141547270, "step": 6567, "time_per_iteration": 2.7038512229919434 }, { "auxiliary_loss_clip": 0.01164801, "auxiliary_loss_mlp": 0.01024157, "balance_loss_clip": 1.04634106, "balance_loss_mlp": 1.01740408, "epoch": 0.7897553057175495, "flos": 22638374438400.0, "grad_norm": 2.0800961058370935, "language_loss": 0.73327601, "learning_rate": 4.458878412740523e-07, "loss": 0.75516558, "num_input_tokens_seen": 141566050, "step": 6568, "time_per_iteration": 2.7444660663604736 }, { "auxiliary_loss_clip": 0.01160551, "auxiliary_loss_mlp": 0.01023913, "balance_loss_clip": 1.04667139, "balance_loss_mlp": 1.01689434, "epoch": 0.7898755486081885, "flos": 14537573821440.0, "grad_norm": 2.347130923713473, "language_loss": 0.77808821, "learning_rate": 4.453976499740919e-07, "loss": 0.79993284, "num_input_tokens_seen": 141583695, "step": 6569, "time_per_iteration": 2.638322114944458 }, { "auxiliary_loss_clip": 0.01161402, "auxiliary_loss_mlp": 0.01024131, "balance_loss_clip": 1.04797602, "balance_loss_mlp": 1.01713979, "epoch": 0.7899957914988276, "flos": 17238487138560.0, "grad_norm": 1.906802058625441, "language_loss": 0.77651709, "learning_rate": 4.4490769450607215e-07, "loss": 0.79837239, "num_input_tokens_seen": 141601320, "step": 6570, "time_per_iteration": 2.6893765926361084 }, { "auxiliary_loss_clip": 0.01158631, "auxiliary_loss_mlp": 0.01026771, "balance_loss_clip": 1.0477165, "balance_loss_mlp": 1.01929712, "epoch": 0.7901160343894668, "flos": 41279351086080.0, "grad_norm": 1.867506993332099, "language_loss": 0.72763997, "learning_rate": 4.4441797494431845e-07, "loss": 0.74949402, "num_input_tokens_seen": 141623125, "step": 6571, "time_per_iteration": 3.840202808380127 }, { "auxiliary_loss_clip": 0.01159477, "auxiliary_loss_mlp": 0.01022272, "balance_loss_clip": 1.04618013, "balance_loss_mlp": 1.01485705, "epoch": 0.7902362772801058, "flos": 16837005847680.0, "grad_norm": 4.936892107771669, "language_loss": 0.77748913, "learning_rate": 4.439284913631207e-07, "loss": 0.79930663, "num_input_tokens_seen": 141640335, "step": 6572, "time_per_iteration": 2.8235299587249756 }, { "auxiliary_loss_clip": 0.01160996, "auxiliary_loss_mlp": 0.01028179, "balance_loss_clip": 1.04937506, "balance_loss_mlp": 1.02071691, "epoch": 0.7903565201707449, "flos": 27125987091840.0, "grad_norm": 2.5298128544765244, "language_loss": 0.83914125, "learning_rate": 4.434392438367347e-07, "loss": 0.86103308, "num_input_tokens_seen": 141659760, "step": 6573, "time_per_iteration": 3.822848081588745 }, { "auxiliary_loss_clip": 0.0116753, "auxiliary_loss_mlp": 0.0102377, "balance_loss_clip": 1.04577518, "balance_loss_mlp": 1.016379, "epoch": 0.790476763061384, "flos": 31025167142400.0, "grad_norm": 2.230203415887652, "language_loss": 0.74168736, "learning_rate": 4.4295023243937677e-07, "loss": 0.76360035, "num_input_tokens_seen": 141679965, "step": 6574, "time_per_iteration": 2.811840295791626 }, { "auxiliary_loss_clip": 0.01167448, "auxiliary_loss_mlp": 0.01026856, "balance_loss_clip": 1.05029285, "balance_loss_mlp": 1.0183568, "epoch": 0.7905970059520231, "flos": 22089084681600.0, "grad_norm": 1.727006524278389, "language_loss": 0.8011502, "learning_rate": 4.4246145724523123e-07, "loss": 0.82309318, "num_input_tokens_seen": 141697710, "step": 6575, "time_per_iteration": 2.634036064147949 }, { "auxiliary_loss_clip": 0.01158335, "auxiliary_loss_mlp": 0.01025792, "balance_loss_clip": 1.04817653, "balance_loss_mlp": 1.01869273, "epoch": 0.7907172488426621, "flos": 20558141159040.0, "grad_norm": 2.4787675469808637, "language_loss": 0.77479875, "learning_rate": 4.41972918328444e-07, "loss": 0.79663998, "num_input_tokens_seen": 141715145, "step": 6576, "time_per_iteration": 3.629948139190674 }, { "auxiliary_loss_clip": 0.01164529, "auxiliary_loss_mlp": 0.01025436, "balance_loss_clip": 1.04972529, "balance_loss_mlp": 1.01828074, "epoch": 0.7908374917333013, "flos": 30081542901120.0, "grad_norm": 2.48900792259135, "language_loss": 0.77904022, "learning_rate": 4.4148461576312646e-07, "loss": 0.80093992, "num_input_tokens_seen": 141734810, "step": 6577, "time_per_iteration": 2.826218843460083 }, { "auxiliary_loss_clip": 0.01164612, "auxiliary_loss_mlp": 0.01027119, "balance_loss_clip": 1.0487597, "balance_loss_mlp": 1.02033639, "epoch": 0.7909577346239404, "flos": 20996359084800.0, "grad_norm": 1.428738506418785, "language_loss": 0.74571502, "learning_rate": 4.4099654962335343e-07, "loss": 0.76763225, "num_input_tokens_seen": 141755260, "step": 6578, "time_per_iteration": 2.786860704421997 }, { "auxiliary_loss_clip": 0.01168307, "auxiliary_loss_mlp": 0.01026526, "balance_loss_clip": 1.04920208, "balance_loss_mlp": 1.01911688, "epoch": 0.7910779775145794, "flos": 26247935128320.0, "grad_norm": 1.9974257463805296, "language_loss": 0.7496562, "learning_rate": 4.405087199831636e-07, "loss": 0.77160454, "num_input_tokens_seen": 141775500, "step": 6579, "time_per_iteration": 2.806447744369507 }, { "auxiliary_loss_clip": 0.01162845, "auxiliary_loss_mlp": 0.01055853, "balance_loss_clip": 1.04618597, "balance_loss_mlp": 1.02079356, "epoch": 0.7911982204052186, "flos": 22564434291840.0, "grad_norm": 1.8528126010581312, "language_loss": 0.67141235, "learning_rate": 4.400211269165619e-07, "loss": 0.69359922, "num_input_tokens_seen": 141791955, "step": 6580, "time_per_iteration": 2.692073106765747 }, { "auxiliary_loss_clip": 0.01167832, "auxiliary_loss_mlp": 0.01025798, "balance_loss_clip": 1.05018044, "balance_loss_mlp": 1.01899064, "epoch": 0.7913184632958576, "flos": 23112538899840.0, "grad_norm": 1.5127206572811278, "language_loss": 0.7683621, "learning_rate": 4.3953377049751416e-07, "loss": 0.79029846, "num_input_tokens_seen": 141812380, "step": 6581, "time_per_iteration": 2.7946529388427734 }, { "auxiliary_loss_clip": 0.01165299, "auxiliary_loss_mlp": 0.0102648, "balance_loss_clip": 1.04876208, "balance_loss_mlp": 1.01935458, "epoch": 0.7914387061864967, "flos": 12311758719360.0, "grad_norm": 3.8061846867245945, "language_loss": 0.78329206, "learning_rate": 4.390466507999537e-07, "loss": 0.80520988, "num_input_tokens_seen": 141828130, "step": 6582, "time_per_iteration": 3.7657968997955322 }, { "auxiliary_loss_clip": 0.01157612, "auxiliary_loss_mlp": 0.01023116, "balance_loss_clip": 1.04801607, "balance_loss_mlp": 1.01557589, "epoch": 0.7915589490771359, "flos": 17603267708160.0, "grad_norm": 2.170412392143667, "language_loss": 0.75427377, "learning_rate": 4.385597678977748e-07, "loss": 0.77608109, "num_input_tokens_seen": 141846965, "step": 6583, "time_per_iteration": 3.083364725112915 }, { "auxiliary_loss_clip": 0.0115929, "auxiliary_loss_mlp": 0.01025344, "balance_loss_clip": 1.04685009, "balance_loss_mlp": 1.01778603, "epoch": 0.7916791919677749, "flos": 25591272641280.0, "grad_norm": 1.711930823144205, "language_loss": 0.75623351, "learning_rate": 4.3807312186483726e-07, "loss": 0.77807987, "num_input_tokens_seen": 141867685, "step": 6584, "time_per_iteration": 2.757859468460083 }, { "auxiliary_loss_clip": 0.01163561, "auxiliary_loss_mlp": 0.01024739, "balance_loss_clip": 1.05072558, "balance_loss_mlp": 1.01765239, "epoch": 0.791799434858414, "flos": 18844340474880.0, "grad_norm": 1.7430652384910874, "language_loss": 0.78626168, "learning_rate": 4.375867127749655e-07, "loss": 0.80814469, "num_input_tokens_seen": 141885960, "step": 6585, "time_per_iteration": 2.8851654529571533 }, { "auxiliary_loss_clip": 0.01159247, "auxiliary_loss_mlp": 0.0102675, "balance_loss_clip": 1.04887903, "balance_loss_mlp": 1.01958239, "epoch": 0.7919196777490531, "flos": 25812015672960.0, "grad_norm": 1.9074705033282917, "language_loss": 0.67104059, "learning_rate": 4.3710054070194744e-07, "loss": 0.69290054, "num_input_tokens_seen": 141905655, "step": 6586, "time_per_iteration": 2.9122276306152344 }, { "auxiliary_loss_clip": 0.0116676, "auxiliary_loss_mlp": 0.01056494, "balance_loss_clip": 1.04660225, "balance_loss_mlp": 1.02055454, "epoch": 0.7920399206396922, "flos": 11947624594560.0, "grad_norm": 2.7460120583147565, "language_loss": 0.65862191, "learning_rate": 4.3661460571953455e-07, "loss": 0.6808545, "num_input_tokens_seen": 141922390, "step": 6587, "time_per_iteration": 2.7248475551605225 }, { "auxiliary_loss_clip": 0.0116502, "auxiliary_loss_mlp": 0.0102481, "balance_loss_clip": 1.04609287, "balance_loss_mlp": 1.01742768, "epoch": 0.7921601635303313, "flos": 21579907438080.0, "grad_norm": 1.6360134254071557, "language_loss": 0.68646669, "learning_rate": 4.36128907901443e-07, "loss": 0.70836496, "num_input_tokens_seen": 141941985, "step": 6588, "time_per_iteration": 2.7788970470428467 }, { "auxiliary_loss_clip": 0.01159987, "auxiliary_loss_mlp": 0.01023179, "balance_loss_clip": 1.04733634, "balance_loss_mlp": 1.0156951, "epoch": 0.7922804064209703, "flos": 18113989236480.0, "grad_norm": 9.52464615229414, "language_loss": 0.72875702, "learning_rate": 4.356434473213519e-07, "loss": 0.75058866, "num_input_tokens_seen": 141959435, "step": 6589, "time_per_iteration": 2.7985823154449463 }, { "auxiliary_loss_clip": 0.01161883, "auxiliary_loss_mlp": 0.01028646, "balance_loss_clip": 1.05036783, "balance_loss_mlp": 1.02052224, "epoch": 0.7924006493116095, "flos": 21652806090240.0, "grad_norm": 1.6064152737674655, "language_loss": 0.79653627, "learning_rate": 4.351582240529068e-07, "loss": 0.81844157, "num_input_tokens_seen": 141980265, "step": 6590, "time_per_iteration": 2.8293139934539795 }, { "auxiliary_loss_clip": 0.01064209, "auxiliary_loss_mlp": 0.01003376, "balance_loss_clip": 1.00865865, "balance_loss_mlp": 1.00235677, "epoch": 0.7925208922022485, "flos": 64242755694720.0, "grad_norm": 0.6881547794912919, "language_loss": 0.58142114, "learning_rate": 4.346732381697149e-07, "loss": 0.60209703, "num_input_tokens_seen": 142044395, "step": 6591, "time_per_iteration": 3.3456077575683594 }, { "auxiliary_loss_clip": 0.01155927, "auxiliary_loss_mlp": 0.01023871, "balance_loss_clip": 1.04750013, "balance_loss_mlp": 1.01653695, "epoch": 0.7926411350928876, "flos": 16941541403520.0, "grad_norm": 2.082290748847842, "language_loss": 0.81184423, "learning_rate": 4.3418848974534825e-07, "loss": 0.83364218, "num_input_tokens_seen": 142061335, "step": 6592, "time_per_iteration": 2.736387252807617 }, { "auxiliary_loss_clip": 0.01162052, "auxiliary_loss_mlp": 0.01022773, "balance_loss_clip": 1.0479877, "balance_loss_mlp": 1.01609111, "epoch": 0.7927613779835267, "flos": 34459987144320.0, "grad_norm": 1.6767515375423527, "language_loss": 0.68694592, "learning_rate": 4.3370397885334276e-07, "loss": 0.70879424, "num_input_tokens_seen": 142081965, "step": 6593, "time_per_iteration": 2.8645129203796387 }, { "auxiliary_loss_clip": 0.01159007, "auxiliary_loss_mlp": 0.01028768, "balance_loss_clip": 1.04851007, "balance_loss_mlp": 1.02160323, "epoch": 0.7928816208741658, "flos": 18951174501120.0, "grad_norm": 1.7985149494601456, "language_loss": 0.75507689, "learning_rate": 4.3321970556719777e-07, "loss": 0.77695465, "num_input_tokens_seen": 142100260, "step": 6594, "time_per_iteration": 2.649656295776367 }, { "auxiliary_loss_clip": 0.01166096, "auxiliary_loss_mlp": 0.01025294, "balance_loss_clip": 1.04674208, "balance_loss_mlp": 1.01757514, "epoch": 0.7930018637648049, "flos": 18623022825600.0, "grad_norm": 2.432762964140492, "language_loss": 0.71977699, "learning_rate": 4.3273566996037856e-07, "loss": 0.74169093, "num_input_tokens_seen": 142116955, "step": 6595, "time_per_iteration": 2.7762064933776855 }, { "auxiliary_loss_clip": 0.01158757, "auxiliary_loss_mlp": 0.01023703, "balance_loss_clip": 1.04596388, "balance_loss_mlp": 1.01669347, "epoch": 0.793122106655444, "flos": 24530650824960.0, "grad_norm": 2.029204201287458, "language_loss": 0.80026674, "learning_rate": 4.322518721063113e-07, "loss": 0.82209134, "num_input_tokens_seen": 142135505, "step": 6596, "time_per_iteration": 2.7664682865142822 }, { "auxiliary_loss_clip": 0.01164853, "auxiliary_loss_mlp": 0.01027237, "balance_loss_clip": 1.04844177, "balance_loss_mlp": 1.01946807, "epoch": 0.7932423495460831, "flos": 34421203434240.0, "grad_norm": 1.9326999363343087, "language_loss": 0.7036289, "learning_rate": 4.3176831207838906e-07, "loss": 0.72554982, "num_input_tokens_seen": 142158915, "step": 6597, "time_per_iteration": 3.746563673019409 }, { "auxiliary_loss_clip": 0.01162538, "auxiliary_loss_mlp": 0.01024375, "balance_loss_clip": 1.04966736, "balance_loss_mlp": 1.01749039, "epoch": 0.7933625924367221, "flos": 26980333441920.0, "grad_norm": 2.8569461359072474, "language_loss": 0.74814045, "learning_rate": 4.3128498994996685e-07, "loss": 0.77000958, "num_input_tokens_seen": 142178390, "step": 6598, "time_per_iteration": 2.745107650756836 }, { "auxiliary_loss_clip": 0.01168721, "auxiliary_loss_mlp": 0.01025574, "balance_loss_clip": 1.04803061, "balance_loss_mlp": 1.01745629, "epoch": 0.7934828353273613, "flos": 29568630643200.0, "grad_norm": 1.9112488383463155, "language_loss": 0.71634436, "learning_rate": 4.308019057943646e-07, "loss": 0.73828733, "num_input_tokens_seen": 142200115, "step": 6599, "time_per_iteration": 3.7336065769195557 }, { "auxiliary_loss_clip": 0.01163526, "auxiliary_loss_mlp": 0.01025751, "balance_loss_clip": 1.04813457, "balance_loss_mlp": 1.01847315, "epoch": 0.7936030782180004, "flos": 28615381557120.0, "grad_norm": 11.372274933824455, "language_loss": 0.74794972, "learning_rate": 4.3031905968486535e-07, "loss": 0.76984251, "num_input_tokens_seen": 142220945, "step": 6600, "time_per_iteration": 2.846816062927246 }, { "auxiliary_loss_clip": 0.01154022, "auxiliary_loss_mlp": 0.01023545, "balance_loss_clip": 1.04887223, "balance_loss_mlp": 1.01626182, "epoch": 0.7937233211086394, "flos": 16392574869120.0, "grad_norm": 2.3328911393278813, "language_loss": 0.68383825, "learning_rate": 4.298364516947162e-07, "loss": 0.70561397, "num_input_tokens_seen": 142238175, "step": 6601, "time_per_iteration": 2.902909517288208 }, { "auxiliary_loss_clip": 0.01152171, "auxiliary_loss_mlp": 0.0102514, "balance_loss_clip": 1.04724181, "balance_loss_mlp": 1.01781476, "epoch": 0.7938435639992786, "flos": 22013420682240.0, "grad_norm": 2.3259701606882244, "language_loss": 0.65837693, "learning_rate": 4.293540818971295e-07, "loss": 0.68015003, "num_input_tokens_seen": 142255980, "step": 6602, "time_per_iteration": 2.8291940689086914 }, { "auxiliary_loss_clip": 0.01167166, "auxiliary_loss_mlp": 0.01026679, "balance_loss_clip": 1.04705954, "balance_loss_mlp": 1.01920509, "epoch": 0.7939638068899176, "flos": 22197032029440.0, "grad_norm": 3.1382082229661474, "language_loss": 0.76758361, "learning_rate": 4.2887195036527934e-07, "loss": 0.78952205, "num_input_tokens_seen": 142274785, "step": 6603, "time_per_iteration": 3.746629476547241 }, { "auxiliary_loss_clip": 0.0115675, "auxiliary_loss_mlp": 0.0103071, "balance_loss_clip": 1.04457355, "balance_loss_mlp": 1.023435, "epoch": 0.7940840497805567, "flos": 17745186343680.0, "grad_norm": 2.296039012486191, "language_loss": 0.72908461, "learning_rate": 4.28390057172306e-07, "loss": 0.75095916, "num_input_tokens_seen": 142291290, "step": 6604, "time_per_iteration": 2.7149200439453125 }, { "auxiliary_loss_clip": 0.01156716, "auxiliary_loss_mlp": 0.01027798, "balance_loss_clip": 1.04847252, "balance_loss_mlp": 1.01947761, "epoch": 0.7942042926711959, "flos": 23805435231360.0, "grad_norm": 6.623085113551442, "language_loss": 0.72185588, "learning_rate": 4.279084023913111e-07, "loss": 0.74370098, "num_input_tokens_seen": 142309165, "step": 6605, "time_per_iteration": 2.8135833740234375 }, { "auxiliary_loss_clip": 0.01163288, "auxiliary_loss_mlp": 0.01027211, "balance_loss_clip": 1.04829478, "balance_loss_mlp": 1.01999021, "epoch": 0.7943245355618349, "flos": 19244959839360.0, "grad_norm": 1.8622575928755094, "language_loss": 0.69375229, "learning_rate": 4.2742698609536096e-07, "loss": 0.71565735, "num_input_tokens_seen": 142327475, "step": 6606, "time_per_iteration": 2.7897861003875732 }, { "auxiliary_loss_clip": 0.01161551, "auxiliary_loss_mlp": 0.01027702, "balance_loss_clip": 1.04661083, "balance_loss_mlp": 1.02079356, "epoch": 0.794444778452474, "flos": 25007616547200.0, "grad_norm": 14.905531412985379, "language_loss": 0.78714073, "learning_rate": 4.2694580835748706e-07, "loss": 0.80903327, "num_input_tokens_seen": 142347335, "step": 6607, "time_per_iteration": 2.7910659313201904 }, { "auxiliary_loss_clip": 0.01160496, "auxiliary_loss_mlp": 0.01022068, "balance_loss_clip": 1.04733539, "balance_loss_mlp": 1.01541579, "epoch": 0.7945650213431131, "flos": 23221491828480.0, "grad_norm": 2.216712216125791, "language_loss": 0.74130338, "learning_rate": 4.264648692506836e-07, "loss": 0.763129, "num_input_tokens_seen": 142366125, "step": 6608, "time_per_iteration": 3.7246487140655518 }, { "auxiliary_loss_clip": 0.0115817, "auxiliary_loss_mlp": 0.01026914, "balance_loss_clip": 1.04788065, "balance_loss_mlp": 1.01957679, "epoch": 0.7946852642337522, "flos": 26062887237120.0, "grad_norm": 1.7982048694509776, "language_loss": 0.72193509, "learning_rate": 4.2598416884790824e-07, "loss": 0.74378586, "num_input_tokens_seen": 142385175, "step": 6609, "time_per_iteration": 2.7952282428741455 }, { "auxiliary_loss_clip": 0.01170151, "auxiliary_loss_mlp": 0.01031796, "balance_loss_clip": 1.04688752, "balance_loss_mlp": 1.02384472, "epoch": 0.7948055071243912, "flos": 23769704177280.0, "grad_norm": 3.378347932801689, "language_loss": 0.80716223, "learning_rate": 4.255037072220828e-07, "loss": 0.82918167, "num_input_tokens_seen": 142406545, "step": 6610, "time_per_iteration": 2.877311944961548 }, { "auxiliary_loss_clip": 0.01164219, "auxiliary_loss_mlp": 0.01024847, "balance_loss_clip": 1.04598117, "balance_loss_mlp": 1.01759601, "epoch": 0.7949257500150304, "flos": 21980814111360.0, "grad_norm": 2.528441360524376, "language_loss": 0.720267, "learning_rate": 4.2502348444609293e-07, "loss": 0.7421577, "num_input_tokens_seen": 142426165, "step": 6611, "time_per_iteration": 2.737138271331787 }, { "auxiliary_loss_clip": 0.01156014, "auxiliary_loss_mlp": 0.01023592, "balance_loss_clip": 1.04855978, "balance_loss_mlp": 1.01668382, "epoch": 0.7950459929056695, "flos": 25774129802880.0, "grad_norm": 2.0246561139376427, "language_loss": 0.69569874, "learning_rate": 4.2454350059278844e-07, "loss": 0.71749485, "num_input_tokens_seen": 142447225, "step": 6612, "time_per_iteration": 2.7841591835021973 }, { "auxiliary_loss_clip": 0.01154683, "auxiliary_loss_mlp": 0.01027381, "balance_loss_clip": 1.04336286, "balance_loss_mlp": 1.01989448, "epoch": 0.7951662357963085, "flos": 22158068751360.0, "grad_norm": 1.9391082830813469, "language_loss": 0.84482074, "learning_rate": 4.240637557349824e-07, "loss": 0.8666414, "num_input_tokens_seen": 142464440, "step": 6613, "time_per_iteration": 2.731226921081543 }, { "auxiliary_loss_clip": 0.01149199, "auxiliary_loss_mlp": 0.01023935, "balance_loss_clip": 1.04640436, "balance_loss_mlp": 1.01685691, "epoch": 0.7952864786869477, "flos": 24641938137600.0, "grad_norm": 2.2459262845905794, "language_loss": 0.66810948, "learning_rate": 4.235842499454516e-07, "loss": 0.68984079, "num_input_tokens_seen": 142484355, "step": 6614, "time_per_iteration": 2.7364370822906494 }, { "auxiliary_loss_clip": 0.0116373, "auxiliary_loss_mlp": 0.01022107, "balance_loss_clip": 1.04854655, "balance_loss_mlp": 1.01524019, "epoch": 0.7954067215775867, "flos": 21830922656640.0, "grad_norm": 1.6540391277959563, "language_loss": 0.8276577, "learning_rate": 4.2310498329693687e-07, "loss": 0.84951609, "num_input_tokens_seen": 142505255, "step": 6615, "time_per_iteration": 2.8482182025909424 }, { "auxiliary_loss_clip": 0.01165193, "auxiliary_loss_mlp": 0.01028065, "balance_loss_clip": 1.04710531, "balance_loss_mlp": 1.02067995, "epoch": 0.7955269644682258, "flos": 24060652341120.0, "grad_norm": 2.028561869270285, "language_loss": 0.8102442, "learning_rate": 4.2262595586214164e-07, "loss": 0.8321768, "num_input_tokens_seen": 142526350, "step": 6616, "time_per_iteration": 2.7273623943328857 }, { "auxiliary_loss_clip": 0.01167878, "auxiliary_loss_mlp": 0.01022703, "balance_loss_clip": 1.04827452, "balance_loss_mlp": 1.01497865, "epoch": 0.795647207358865, "flos": 25010741030400.0, "grad_norm": 1.968068954439917, "language_loss": 0.77042234, "learning_rate": 4.221471677137358e-07, "loss": 0.79232812, "num_input_tokens_seen": 142547165, "step": 6617, "time_per_iteration": 2.7577736377716064 }, { "auxiliary_loss_clip": 0.01151182, "auxiliary_loss_mlp": 0.01023119, "balance_loss_clip": 1.04706669, "balance_loss_mlp": 1.01611233, "epoch": 0.795767450249504, "flos": 14648358343680.0, "grad_norm": 1.8895226863662902, "language_loss": 0.70158762, "learning_rate": 4.216686189243492e-07, "loss": 0.72333062, "num_input_tokens_seen": 142565955, "step": 6618, "time_per_iteration": 2.7130014896392822 }, { "auxiliary_loss_clip": 0.01153284, "auxiliary_loss_mlp": 0.01022961, "balance_loss_clip": 1.04679763, "balance_loss_mlp": 1.01577568, "epoch": 0.7958876931401431, "flos": 18547897530240.0, "grad_norm": 2.2170789334988776, "language_loss": 0.73003995, "learning_rate": 4.211903095665785e-07, "loss": 0.75180244, "num_input_tokens_seen": 142585340, "step": 6619, "time_per_iteration": 2.773052453994751 }, { "auxiliary_loss_clip": 0.01156876, "auxiliary_loss_mlp": 0.01022207, "balance_loss_clip": 1.04448199, "balance_loss_mlp": 1.01516771, "epoch": 0.7960079360307821, "flos": 21543960902400.0, "grad_norm": 1.9051240871508306, "language_loss": 0.75197387, "learning_rate": 4.2071223971298277e-07, "loss": 0.77376467, "num_input_tokens_seen": 142602525, "step": 6620, "time_per_iteration": 2.6736738681793213 }, { "auxiliary_loss_clip": 0.01165903, "auxiliary_loss_mlp": 0.01025202, "balance_loss_clip": 1.04739976, "balance_loss_mlp": 1.01778138, "epoch": 0.7961281789214213, "flos": 25481745095040.0, "grad_norm": 2.9300115114822898, "language_loss": 0.61136293, "learning_rate": 4.2023440943608433e-07, "loss": 0.63327402, "num_input_tokens_seen": 142622490, "step": 6621, "time_per_iteration": 2.8370449542999268 }, { "auxiliary_loss_clip": 0.01163693, "auxiliary_loss_mlp": 0.01024947, "balance_loss_clip": 1.04711723, "balance_loss_mlp": 1.01766372, "epoch": 0.7962484218120603, "flos": 21944436612480.0, "grad_norm": 1.8625089307148106, "language_loss": 0.78332508, "learning_rate": 4.1975681880837023e-07, "loss": 0.80521154, "num_input_tokens_seen": 142642495, "step": 6622, "time_per_iteration": 2.7446677684783936 }, { "auxiliary_loss_clip": 0.01155284, "auxiliary_loss_mlp": 0.01026247, "balance_loss_clip": 1.04673648, "balance_loss_mlp": 1.0190413, "epoch": 0.7963686647026994, "flos": 18876264687360.0, "grad_norm": 1.629866425132338, "language_loss": 0.82312977, "learning_rate": 4.192794679022895e-07, "loss": 0.84494507, "num_input_tokens_seen": 142660820, "step": 6623, "time_per_iteration": 3.6480677127838135 }, { "auxiliary_loss_clip": 0.01168183, "auxiliary_loss_mlp": 0.01019673, "balance_loss_clip": 1.04836082, "balance_loss_mlp": 1.01261878, "epoch": 0.7964889075933386, "flos": 29716582763520.0, "grad_norm": 1.8807266587741343, "language_loss": 0.71866667, "learning_rate": 4.1880235679025743e-07, "loss": 0.74054527, "num_input_tokens_seen": 142680915, "step": 6624, "time_per_iteration": 2.7620151042938232 }, { "auxiliary_loss_clip": 0.0115389, "auxiliary_loss_mlp": 0.01027793, "balance_loss_clip": 1.04963493, "balance_loss_mlp": 1.02027082, "epoch": 0.7966091504839776, "flos": 29491458272640.0, "grad_norm": 2.2389209970415784, "language_loss": 0.63828474, "learning_rate": 4.1832548554464986e-07, "loss": 0.66010153, "num_input_tokens_seen": 142699210, "step": 6625, "time_per_iteration": 3.7539429664611816 }, { "auxiliary_loss_clip": 0.01059166, "auxiliary_loss_mlp": 0.00998897, "balance_loss_clip": 1.00952125, "balance_loss_mlp": 0.99794894, "epoch": 0.7967293933746167, "flos": 67288697101440.0, "grad_norm": 0.7422025548981535, "language_loss": 0.5868783, "learning_rate": 4.178488542378098e-07, "loss": 0.60745883, "num_input_tokens_seen": 142756790, "step": 6626, "time_per_iteration": 3.252305030822754 }, { "auxiliary_loss_clip": 0.01169408, "auxiliary_loss_mlp": 0.01026334, "balance_loss_clip": 1.04752123, "balance_loss_mlp": 1.01841891, "epoch": 0.7968496362652558, "flos": 25554679660800.0, "grad_norm": 1.835155528530035, "language_loss": 0.88867843, "learning_rate": 4.173724629420401e-07, "loss": 0.91063595, "num_input_tokens_seen": 142778150, "step": 6627, "time_per_iteration": 2.644880771636963 }, { "auxiliary_loss_clip": 0.01167427, "auxiliary_loss_mlp": 0.0102642, "balance_loss_clip": 1.04903197, "balance_loss_mlp": 1.01861143, "epoch": 0.7969698791558949, "flos": 14501088581760.0, "grad_norm": 2.458044893650689, "language_loss": 0.68266928, "learning_rate": 4.168963117296087e-07, "loss": 0.70460778, "num_input_tokens_seen": 142795485, "step": 6628, "time_per_iteration": 2.64067006111145 }, { "auxiliary_loss_clip": 0.01167149, "auxiliary_loss_mlp": 0.01030187, "balance_loss_clip": 1.04783952, "balance_loss_mlp": 1.0225693, "epoch": 0.797090122046534, "flos": 22127545169280.0, "grad_norm": 2.318896268075853, "language_loss": 0.76086974, "learning_rate": 4.1642040067274876e-07, "loss": 0.78284311, "num_input_tokens_seen": 142815155, "step": 6629, "time_per_iteration": 3.580980062484741 }, { "auxiliary_loss_clip": 0.01164104, "auxiliary_loss_mlp": 0.01027803, "balance_loss_clip": 1.04676938, "balance_loss_mlp": 1.02060854, "epoch": 0.7972103649371731, "flos": 19897671830400.0, "grad_norm": 1.5734496381952188, "language_loss": 0.72899532, "learning_rate": 4.1594472984365493e-07, "loss": 0.75091439, "num_input_tokens_seen": 142833840, "step": 6630, "time_per_iteration": 2.76090145111084 }, { "auxiliary_loss_clip": 0.01158672, "auxiliary_loss_mlp": 0.01024907, "balance_loss_clip": 1.04515409, "balance_loss_mlp": 1.01797819, "epoch": 0.7973306078278122, "flos": 36058621847040.0, "grad_norm": 2.0788856007463146, "language_loss": 0.77427095, "learning_rate": 4.154692993144862e-07, "loss": 0.7961067, "num_input_tokens_seen": 142853610, "step": 6631, "time_per_iteration": 2.8492350578308105 }, { "auxiliary_loss_clip": 0.01166769, "auxiliary_loss_mlp": 0.01052759, "balance_loss_clip": 1.04698038, "balance_loss_mlp": 1.01689899, "epoch": 0.7974508507184512, "flos": 21360600950400.0, "grad_norm": 2.2569679579801245, "language_loss": 0.71669501, "learning_rate": 4.1499410915736476e-07, "loss": 0.73889029, "num_input_tokens_seen": 142872540, "step": 6632, "time_per_iteration": 2.719536542892456 }, { "auxiliary_loss_clip": 0.01061735, "auxiliary_loss_mlp": 0.01001474, "balance_loss_clip": 1.00850117, "balance_loss_mlp": 1.00040102, "epoch": 0.7975710936090904, "flos": 68253115317120.0, "grad_norm": 0.8054667478365772, "language_loss": 0.64265668, "learning_rate": 4.145191594443762e-07, "loss": 0.66328877, "num_input_tokens_seen": 142936895, "step": 6633, "time_per_iteration": 3.442237138748169 }, { "auxiliary_loss_clip": 0.01153478, "auxiliary_loss_mlp": 0.01027561, "balance_loss_clip": 1.04880416, "balance_loss_mlp": 1.01983666, "epoch": 0.7976913364997295, "flos": 22492433479680.0, "grad_norm": 1.947171566244213, "language_loss": 0.70713806, "learning_rate": 4.140444502475713e-07, "loss": 0.72894841, "num_input_tokens_seen": 142956445, "step": 6634, "time_per_iteration": 3.714144229888916 }, { "auxiliary_loss_clip": 0.01162387, "auxiliary_loss_mlp": 0.01023927, "balance_loss_clip": 1.04784179, "balance_loss_mlp": 1.01646435, "epoch": 0.7978115793903685, "flos": 15263220378240.0, "grad_norm": 2.4642747245566325, "language_loss": 0.70590973, "learning_rate": 4.1356998163896216e-07, "loss": 0.72777295, "num_input_tokens_seen": 142973495, "step": 6635, "time_per_iteration": 2.7216403484344482 }, { "auxiliary_loss_clip": 0.01162518, "auxiliary_loss_mlp": 0.01026542, "balance_loss_clip": 1.04749799, "balance_loss_mlp": 1.018924, "epoch": 0.7979318222810077, "flos": 19719232041600.0, "grad_norm": 2.2769945591206318, "language_loss": 0.7511785, "learning_rate": 4.130957536905255e-07, "loss": 0.77306902, "num_input_tokens_seen": 142991510, "step": 6636, "time_per_iteration": 2.75406813621521 }, { "auxiliary_loss_clip": 0.01170004, "auxiliary_loss_mlp": 0.01022293, "balance_loss_clip": 1.04923046, "balance_loss_mlp": 1.01421928, "epoch": 0.7980520651716467, "flos": 15560273854080.0, "grad_norm": 2.751760910380621, "language_loss": 0.71506518, "learning_rate": 4.1262176647420134e-07, "loss": 0.73698813, "num_input_tokens_seen": 143009675, "step": 6637, "time_per_iteration": 2.7019035816192627 }, { "auxiliary_loss_clip": 0.01167319, "auxiliary_loss_mlp": 0.01027021, "balance_loss_clip": 1.04939508, "balance_loss_mlp": 1.02031863, "epoch": 0.7981723080622858, "flos": 22309432663680.0, "grad_norm": 1.7327173656531443, "language_loss": 0.80020839, "learning_rate": 4.121480200618923e-07, "loss": 0.82215178, "num_input_tokens_seen": 143029330, "step": 6638, "time_per_iteration": 2.642277479171753 }, { "auxiliary_loss_clip": 0.01156278, "auxiliary_loss_mlp": 0.01030587, "balance_loss_clip": 1.0483377, "balance_loss_mlp": 1.02304685, "epoch": 0.798292550952925, "flos": 22929573997440.0, "grad_norm": 1.7660804988554006, "language_loss": 0.80375564, "learning_rate": 4.116745145254674e-07, "loss": 0.82562435, "num_input_tokens_seen": 143048865, "step": 6639, "time_per_iteration": 2.697493553161621 }, { "auxiliary_loss_clip": 0.01054649, "auxiliary_loss_mlp": 0.01000801, "balance_loss_clip": 1.00860834, "balance_loss_mlp": 0.99979383, "epoch": 0.798412793843564, "flos": 64497936890880.0, "grad_norm": 0.7707235088388955, "language_loss": 0.58018082, "learning_rate": 4.1120124993675476e-07, "loss": 0.60073537, "num_input_tokens_seen": 143113295, "step": 6640, "time_per_iteration": 3.2287957668304443 }, { "auxiliary_loss_clip": 0.01167543, "auxiliary_loss_mlp": 0.01023238, "balance_loss_clip": 1.04741919, "balance_loss_mlp": 1.01570058, "epoch": 0.7985330367342031, "flos": 13586910514560.0, "grad_norm": 2.2261062362314448, "language_loss": 0.62175083, "learning_rate": 4.107282263675498e-07, "loss": 0.64365864, "num_input_tokens_seen": 143130965, "step": 6641, "time_per_iteration": 2.7379138469696045 }, { "auxiliary_loss_clip": 0.01060509, "auxiliary_loss_mlp": 0.01037123, "balance_loss_clip": 1.01235056, "balance_loss_mlp": 1.00295877, "epoch": 0.7986532796248422, "flos": 67698797656320.0, "grad_norm": 0.772870949796782, "language_loss": 0.52494586, "learning_rate": 4.1025544388960907e-07, "loss": 0.54592216, "num_input_tokens_seen": 143192005, "step": 6642, "time_per_iteration": 3.2512457370758057 }, { "auxiliary_loss_clip": 0.01165718, "auxiliary_loss_mlp": 0.01026151, "balance_loss_clip": 1.04983878, "balance_loss_mlp": 1.01902258, "epoch": 0.7987735225154813, "flos": 22455373622400.0, "grad_norm": 2.006201837276253, "language_loss": 0.71355724, "learning_rate": 4.097829025746538e-07, "loss": 0.7354759, "num_input_tokens_seen": 143213550, "step": 6643, "time_per_iteration": 2.6784262657165527 }, { "auxiliary_loss_clip": 0.01059524, "auxiliary_loss_mlp": 0.00999944, "balance_loss_clip": 1.00841641, "balance_loss_mlp": 0.999008, "epoch": 0.7988937654061203, "flos": 68864098682880.0, "grad_norm": 0.6922730083920189, "language_loss": 0.60948431, "learning_rate": 4.0931060249436757e-07, "loss": 0.63007903, "num_input_tokens_seen": 143277390, "step": 6644, "time_per_iteration": 3.300069808959961 }, { "auxiliary_loss_clip": 0.01162733, "auxiliary_loss_mlp": 0.01026243, "balance_loss_clip": 1.04946113, "balance_loss_mlp": 1.01904273, "epoch": 0.7990140082967595, "flos": 20806893820800.0, "grad_norm": 3.906957764985952, "language_loss": 0.69630742, "learning_rate": 4.088385437203978e-07, "loss": 0.71819711, "num_input_tokens_seen": 143294400, "step": 6645, "time_per_iteration": 2.683241128921509 }, { "auxiliary_loss_clip": 0.01166931, "auxiliary_loss_mlp": 0.010241, "balance_loss_clip": 1.04633355, "balance_loss_mlp": 1.01654851, "epoch": 0.7991342511873986, "flos": 18985289443200.0, "grad_norm": 2.5974793347836753, "language_loss": 0.77530336, "learning_rate": 4.083667263243564e-07, "loss": 0.79721367, "num_input_tokens_seen": 143312745, "step": 6646, "time_per_iteration": 2.6836206912994385 }, { "auxiliary_loss_clip": 0.0115699, "auxiliary_loss_mlp": 0.01021931, "balance_loss_clip": 1.04639673, "balance_loss_mlp": 1.01501632, "epoch": 0.7992544940780376, "flos": 20816805974400.0, "grad_norm": 1.9117127191919756, "language_loss": 0.71977818, "learning_rate": 4.0789515037781653e-07, "loss": 0.74156737, "num_input_tokens_seen": 143333470, "step": 6647, "time_per_iteration": 2.6933882236480713 }, { "auxiliary_loss_clip": 0.01165178, "auxiliary_loss_mlp": 0.01027799, "balance_loss_clip": 1.0462594, "balance_loss_mlp": 1.02028847, "epoch": 0.7993747369686768, "flos": 12640772321280.0, "grad_norm": 15.062627680043704, "language_loss": 0.82746494, "learning_rate": 4.0742381595231755e-07, "loss": 0.84939468, "num_input_tokens_seen": 143350195, "step": 6648, "time_per_iteration": 2.7719950675964355 }, { "auxiliary_loss_clip": 0.01159588, "auxiliary_loss_mlp": 0.0102121, "balance_loss_clip": 1.04635453, "balance_loss_mlp": 1.01388717, "epoch": 0.7994949798593158, "flos": 20078769225600.0, "grad_norm": 2.026206861936551, "language_loss": 0.78256768, "learning_rate": 4.06952723119359e-07, "loss": 0.80437565, "num_input_tokens_seen": 143370070, "step": 6649, "time_per_iteration": 3.740002393722534 }, { "auxiliary_loss_clip": 0.0115391, "auxiliary_loss_mlp": 0.01029213, "balance_loss_clip": 1.04765427, "balance_loss_mlp": 1.02087402, "epoch": 0.7996152227499549, "flos": 38654209509120.0, "grad_norm": 2.368084981928538, "language_loss": 0.67434037, "learning_rate": 4.0648187195040504e-07, "loss": 0.69617158, "num_input_tokens_seen": 143392275, "step": 6650, "time_per_iteration": 2.8224542140960693 }, { "auxiliary_loss_clip": 0.01058175, "auxiliary_loss_mlp": 0.00999774, "balance_loss_clip": 1.00910449, "balance_loss_mlp": 0.99882048, "epoch": 0.799735465640594, "flos": 70243821947520.0, "grad_norm": 0.8100292200012065, "language_loss": 0.6758396, "learning_rate": 4.060112625168848e-07, "loss": 0.69641906, "num_input_tokens_seen": 143457385, "step": 6651, "time_per_iteration": 4.193526029586792 }, { "auxiliary_loss_clip": 0.01168712, "auxiliary_loss_mlp": 0.01029699, "balance_loss_clip": 1.04974329, "balance_loss_mlp": 1.02235556, "epoch": 0.7998557085312331, "flos": 24240995550720.0, "grad_norm": 2.023843591786883, "language_loss": 0.73980319, "learning_rate": 4.055408948901886e-07, "loss": 0.7617873, "num_input_tokens_seen": 143478785, "step": 6652, "time_per_iteration": 2.7155532836914062 }, { "auxiliary_loss_clip": 0.01169199, "auxiliary_loss_mlp": 0.01025519, "balance_loss_clip": 1.04811358, "balance_loss_mlp": 1.01741874, "epoch": 0.7999759514218722, "flos": 27564025449600.0, "grad_norm": 1.6915934927628704, "language_loss": 0.7149213, "learning_rate": 4.050707691416708e-07, "loss": 0.7368685, "num_input_tokens_seen": 143500095, "step": 6653, "time_per_iteration": 2.6643478870391846 }, { "auxiliary_loss_clip": 0.01058577, "auxiliary_loss_mlp": 0.0100065, "balance_loss_clip": 1.00929642, "balance_loss_mlp": 0.99970835, "epoch": 0.8000961943125112, "flos": 67337428878720.0, "grad_norm": 0.6691666170263721, "language_loss": 0.59748447, "learning_rate": 4.046008853426495e-07, "loss": 0.61807674, "num_input_tokens_seen": 143563410, "step": 6654, "time_per_iteration": 3.374305248260498 }, { "auxiliary_loss_clip": 0.01155935, "auxiliary_loss_mlp": 0.01027151, "balance_loss_clip": 1.04884398, "balance_loss_mlp": 1.01910472, "epoch": 0.8002164372031504, "flos": 28733815676160.0, "grad_norm": 1.767474026146121, "language_loss": 0.6282593, "learning_rate": 4.0413124356440464e-07, "loss": 0.65009016, "num_input_tokens_seen": 143587455, "step": 6655, "time_per_iteration": 3.776292562484741 }, { "auxiliary_loss_clip": 0.01159518, "auxiliary_loss_mlp": 0.01020946, "balance_loss_clip": 1.04689074, "balance_loss_mlp": 1.01346517, "epoch": 0.8003366800937894, "flos": 17639429725440.0, "grad_norm": 2.481251583424402, "language_loss": 0.81982011, "learning_rate": 4.0366184387818223e-07, "loss": 0.84162474, "num_input_tokens_seen": 143605915, "step": 6656, "time_per_iteration": 2.7506000995635986 }, { "auxiliary_loss_clip": 0.01171383, "auxiliary_loss_mlp": 0.01022169, "balance_loss_clip": 1.04749465, "balance_loss_mlp": 1.01456594, "epoch": 0.8004569229844285, "flos": 25995303797760.0, "grad_norm": 2.68396920053846, "language_loss": 0.85357511, "learning_rate": 4.0319268635518797e-07, "loss": 0.87551057, "num_input_tokens_seen": 143626490, "step": 6657, "time_per_iteration": 265.5929298400879 }, { "auxiliary_loss_clip": 0.01165871, "auxiliary_loss_mlp": 0.01028362, "balance_loss_clip": 1.04747176, "balance_loss_mlp": 1.0209862, "epoch": 0.8005771658750677, "flos": 20812352688000.0, "grad_norm": 1.6775022794587866, "language_loss": 0.75593662, "learning_rate": 4.027237710665943e-07, "loss": 0.77787894, "num_input_tokens_seen": 143644955, "step": 6658, "time_per_iteration": 2.6946825981140137 }, { "auxiliary_loss_clip": 0.01162037, "auxiliary_loss_mlp": 0.01024149, "balance_loss_clip": 1.04622173, "balance_loss_mlp": 1.01619208, "epoch": 0.8006974087657067, "flos": 25812626204160.0, "grad_norm": 1.9698151787510414, "language_loss": 0.69635236, "learning_rate": 4.022550980835344e-07, "loss": 0.71821427, "num_input_tokens_seen": 143667200, "step": 6659, "time_per_iteration": 2.8045945167541504 }, { "auxiliary_loss_clip": 0.01159552, "auxiliary_loss_mlp": 0.01023438, "balance_loss_clip": 1.04598856, "balance_loss_mlp": 1.01587975, "epoch": 0.8008176516563458, "flos": 17164690646400.0, "grad_norm": 1.9605449018988763, "language_loss": 0.79341686, "learning_rate": 4.017866674771051e-07, "loss": 0.8152467, "num_input_tokens_seen": 143684685, "step": 6660, "time_per_iteration": 3.807443618774414 }, { "auxiliary_loss_clip": 0.01151619, "auxiliary_loss_mlp": 0.01020261, "balance_loss_clip": 1.04756784, "balance_loss_mlp": 1.01290023, "epoch": 0.8009378945469849, "flos": 24207311571840.0, "grad_norm": 3.843124515392235, "language_loss": 0.74559498, "learning_rate": 4.013184793183688e-07, "loss": 0.76731372, "num_input_tokens_seen": 143706780, "step": 6661, "time_per_iteration": 2.8527286052703857 }, { "auxiliary_loss_clip": 0.01162711, "auxiliary_loss_mlp": 0.0102252, "balance_loss_clip": 1.04559779, "balance_loss_mlp": 1.01570976, "epoch": 0.801058137437624, "flos": 19787318271360.0, "grad_norm": 3.5124834477809417, "language_loss": 0.72717679, "learning_rate": 4.008505336783472e-07, "loss": 0.74902916, "num_input_tokens_seen": 143724505, "step": 6662, "time_per_iteration": 2.66050124168396 }, { "auxiliary_loss_clip": 0.01153928, "auxiliary_loss_mlp": 0.01026728, "balance_loss_clip": 1.04492426, "balance_loss_mlp": 1.01984036, "epoch": 0.801178380328263, "flos": 18659400324480.0, "grad_norm": 2.052292919435777, "language_loss": 0.80557823, "learning_rate": 4.003828306280284e-07, "loss": 0.82738483, "num_input_tokens_seen": 143742180, "step": 6663, "time_per_iteration": 2.744732618331909 }, { "auxiliary_loss_clip": 0.01164751, "auxiliary_loss_mlp": 0.01027673, "balance_loss_clip": 1.04713154, "balance_loss_mlp": 1.02077341, "epoch": 0.8012986232189022, "flos": 15706573948800.0, "grad_norm": 2.2002608987621977, "language_loss": 0.78247797, "learning_rate": 3.999153702383626e-07, "loss": 0.80440223, "num_input_tokens_seen": 143760070, "step": 6664, "time_per_iteration": 2.6222691535949707 }, { "auxiliary_loss_clip": 0.01166893, "auxiliary_loss_mlp": 0.01025568, "balance_loss_clip": 1.04679394, "balance_loss_mlp": 1.01802158, "epoch": 0.8014188661095413, "flos": 28584139703040.0, "grad_norm": 1.952480686588722, "language_loss": 0.73550248, "learning_rate": 3.9944815258026263e-07, "loss": 0.7574271, "num_input_tokens_seen": 143781890, "step": 6665, "time_per_iteration": 2.758359909057617 }, { "auxiliary_loss_clip": 0.01167447, "auxiliary_loss_mlp": 0.01026977, "balance_loss_clip": 1.04768181, "balance_loss_mlp": 1.01959467, "epoch": 0.8015391090001803, "flos": 29310360877440.0, "grad_norm": 1.7805608441998124, "language_loss": 0.83121073, "learning_rate": 3.989811777246057e-07, "loss": 0.85315502, "num_input_tokens_seen": 143802060, "step": 6666, "time_per_iteration": 2.7388863563537598 }, { "auxiliary_loss_clip": 0.01060522, "auxiliary_loss_mlp": 0.01002397, "balance_loss_clip": 1.00751483, "balance_loss_mlp": 1.00149107, "epoch": 0.8016593518908195, "flos": 70397340675840.0, "grad_norm": 0.8488449725157559, "language_loss": 0.66153032, "learning_rate": 3.985144457422305e-07, "loss": 0.68215948, "num_input_tokens_seen": 143856345, "step": 6667, "time_per_iteration": 3.2873733043670654 }, { "auxiliary_loss_clip": 0.01168336, "auxiliary_loss_mlp": 0.01026508, "balance_loss_clip": 1.04857385, "balance_loss_mlp": 1.01919174, "epoch": 0.8017795947814585, "flos": 26026114688640.0, "grad_norm": 1.9621447480557985, "language_loss": 0.76918709, "learning_rate": 3.9804795670394096e-07, "loss": 0.79113555, "num_input_tokens_seen": 143876470, "step": 6668, "time_per_iteration": 2.6704635620117188 }, { "auxiliary_loss_clip": 0.01153929, "auxiliary_loss_mlp": 0.01022662, "balance_loss_clip": 1.04635882, "balance_loss_mlp": 1.01584065, "epoch": 0.8018998376720976, "flos": 22087181260800.0, "grad_norm": 1.7506595864836394, "language_loss": 0.70660919, "learning_rate": 3.975817106805022e-07, "loss": 0.72837508, "num_input_tokens_seen": 143895170, "step": 6669, "time_per_iteration": 2.735128879547119 }, { "auxiliary_loss_clip": 0.01159134, "auxiliary_loss_mlp": 0.01027482, "balance_loss_clip": 1.04861856, "balance_loss_mlp": 1.01982307, "epoch": 0.8020200805627368, "flos": 34568545023360.0, "grad_norm": 1.9081145149067138, "language_loss": 0.64850444, "learning_rate": 3.97115707742645e-07, "loss": 0.67037058, "num_input_tokens_seen": 143915845, "step": 6670, "time_per_iteration": 2.901158332824707 }, { "auxiliary_loss_clip": 0.01163173, "auxiliary_loss_mlp": 0.01028109, "balance_loss_clip": 1.04894257, "balance_loss_mlp": 1.02104008, "epoch": 0.8021403234533758, "flos": 20120354196480.0, "grad_norm": 3.444710887394256, "language_loss": 0.64896387, "learning_rate": 3.966499479610599e-07, "loss": 0.67087668, "num_input_tokens_seen": 143933940, "step": 6671, "time_per_iteration": 2.798060655593872 }, { "auxiliary_loss_clip": 0.01156206, "auxiliary_loss_mlp": 0.01024434, "balance_loss_clip": 1.04782283, "balance_loss_mlp": 1.01771641, "epoch": 0.8022605663440149, "flos": 27746200252800.0, "grad_norm": 2.4550566920274655, "language_loss": 0.64931577, "learning_rate": 3.9618443140640225e-07, "loss": 0.67112219, "num_input_tokens_seen": 143952850, "step": 6672, "time_per_iteration": 2.8906450271606445 }, { "auxiliary_loss_clip": 0.0105933, "auxiliary_loss_mlp": 0.01004859, "balance_loss_clip": 1.00956607, "balance_loss_mlp": 1.00401294, "epoch": 0.802380809234654, "flos": 60244998768000.0, "grad_norm": 0.6845432556889589, "language_loss": 0.51265401, "learning_rate": 3.957191581492918e-07, "loss": 0.53329587, "num_input_tokens_seen": 144013610, "step": 6673, "time_per_iteration": 3.3769237995147705 }, { "auxiliary_loss_clip": 0.01159253, "auxiliary_loss_mlp": 0.01027665, "balance_loss_clip": 1.04880691, "balance_loss_mlp": 1.01957679, "epoch": 0.8025010521252931, "flos": 15080722352640.0, "grad_norm": 3.6746848624530952, "language_loss": 0.72029746, "learning_rate": 3.952541282603097e-07, "loss": 0.74216664, "num_input_tokens_seen": 144028715, "step": 6674, "time_per_iteration": 2.7491400241851807 }, { "auxiliary_loss_clip": 0.01162108, "auxiliary_loss_mlp": 0.01026594, "balance_loss_clip": 1.04699028, "balance_loss_mlp": 1.01933694, "epoch": 0.8026212950159322, "flos": 22163527618560.0, "grad_norm": 2.8721352405289884, "language_loss": 0.83554924, "learning_rate": 3.9478934181000013e-07, "loss": 0.8574363, "num_input_tokens_seen": 144048740, "step": 6675, "time_per_iteration": 3.7336010932922363 }, { "auxiliary_loss_clip": 0.01168289, "auxiliary_loss_mlp": 0.01027125, "balance_loss_clip": 1.0464921, "balance_loss_mlp": 1.01941252, "epoch": 0.8027415379065713, "flos": 17675986792320.0, "grad_norm": 2.237590245795204, "language_loss": 0.8455348, "learning_rate": 3.943247988688714e-07, "loss": 0.86748892, "num_input_tokens_seen": 144067435, "step": 6676, "time_per_iteration": 2.6832783222198486 }, { "auxiliary_loss_clip": 0.01165187, "auxiliary_loss_mlp": 0.01023102, "balance_loss_clip": 1.04743552, "balance_loss_mlp": 1.01599407, "epoch": 0.8028617807972104, "flos": 21979593048960.0, "grad_norm": 1.6420397116673622, "language_loss": 0.72082055, "learning_rate": 3.938604995073933e-07, "loss": 0.74270338, "num_input_tokens_seen": 144085905, "step": 6677, "time_per_iteration": 3.963372230529785 }, { "auxiliary_loss_clip": 0.01163033, "auxiliary_loss_mlp": 0.01031986, "balance_loss_clip": 1.04754436, "balance_loss_mlp": 1.02490199, "epoch": 0.8029820236878494, "flos": 26428457905920.0, "grad_norm": 1.6960876135836291, "language_loss": 0.6533227, "learning_rate": 3.9339644379600157e-07, "loss": 0.67527288, "num_input_tokens_seen": 144105735, "step": 6678, "time_per_iteration": 2.8251891136169434 }, { "auxiliary_loss_clip": 0.01168806, "auxiliary_loss_mlp": 0.01024046, "balance_loss_clip": 1.05029368, "balance_loss_mlp": 1.01698327, "epoch": 0.8031022665784886, "flos": 17676489582720.0, "grad_norm": 2.9954284954857644, "language_loss": 0.71247697, "learning_rate": 3.929326318050907e-07, "loss": 0.73440552, "num_input_tokens_seen": 144123405, "step": 6679, "time_per_iteration": 2.6775104999542236 }, { "auxiliary_loss_clip": 0.01162521, "auxiliary_loss_mlp": 0.01021694, "balance_loss_clip": 1.04439139, "balance_loss_mlp": 1.01401424, "epoch": 0.8032225094691277, "flos": 15450279431040.0, "grad_norm": 1.828903464933899, "language_loss": 0.78919959, "learning_rate": 3.924690636050225e-07, "loss": 0.81104171, "num_input_tokens_seen": 144140815, "step": 6680, "time_per_iteration": 2.657878875732422 }, { "auxiliary_loss_clip": 0.01164316, "auxiliary_loss_mlp": 0.01030605, "balance_loss_clip": 1.04637659, "balance_loss_mlp": 1.02285099, "epoch": 0.8033427523597667, "flos": 26179202453760.0, "grad_norm": 1.9660686902489497, "language_loss": 0.73070371, "learning_rate": 3.9200573926611915e-07, "loss": 0.75265288, "num_input_tokens_seen": 144162230, "step": 6681, "time_per_iteration": 3.709620475769043 }, { "auxiliary_loss_clip": 0.01161032, "auxiliary_loss_mlp": 0.01024413, "balance_loss_clip": 1.04701281, "balance_loss_mlp": 1.01677728, "epoch": 0.8034629952504058, "flos": 21324905809920.0, "grad_norm": 1.8466880300398785, "language_loss": 0.73095685, "learning_rate": 3.9154265885866613e-07, "loss": 0.75281131, "num_input_tokens_seen": 144181540, "step": 6682, "time_per_iteration": 2.705348253250122 }, { "auxiliary_loss_clip": 0.01165184, "auxiliary_loss_mlp": 0.01025465, "balance_loss_clip": 1.04964817, "balance_loss_mlp": 1.01755857, "epoch": 0.8035832381410449, "flos": 21651585027840.0, "grad_norm": 20.653127805987236, "language_loss": 0.74579668, "learning_rate": 3.9107982245291394e-07, "loss": 0.76770318, "num_input_tokens_seen": 144199665, "step": 6683, "time_per_iteration": 2.785090684890747 }, { "auxiliary_loss_clip": 0.01157879, "auxiliary_loss_mlp": 0.01028995, "balance_loss_clip": 1.04736888, "balance_loss_mlp": 1.02092457, "epoch": 0.803703481031684, "flos": 20518818744960.0, "grad_norm": 2.0736903892984953, "language_loss": 0.77643955, "learning_rate": 3.9061723011907245e-07, "loss": 0.79830825, "num_input_tokens_seen": 144219020, "step": 6684, "time_per_iteration": 2.8355889320373535 }, { "auxiliary_loss_clip": 0.01163103, "auxiliary_loss_mlp": 0.01029431, "balance_loss_clip": 1.04944158, "balance_loss_mlp": 1.02149761, "epoch": 0.803823723922323, "flos": 22854807838080.0, "grad_norm": 1.8408560429413499, "language_loss": 0.79572105, "learning_rate": 3.901548819273179e-07, "loss": 0.81764644, "num_input_tokens_seen": 144239035, "step": 6685, "time_per_iteration": 2.8185677528381348 }, { "auxiliary_loss_clip": 0.01168408, "auxiliary_loss_mlp": 0.01030464, "balance_loss_clip": 1.05120432, "balance_loss_mlp": 1.02297759, "epoch": 0.8039439668129622, "flos": 21362145235200.0, "grad_norm": 1.8331791859698208, "language_loss": 0.69267201, "learning_rate": 3.896927779477881e-07, "loss": 0.7146607, "num_input_tokens_seen": 144258295, "step": 6686, "time_per_iteration": 3.7643702030181885 }, { "auxiliary_loss_clip": 0.01158861, "auxiliary_loss_mlp": 0.01031267, "balance_loss_clip": 1.04677439, "balance_loss_mlp": 1.02336359, "epoch": 0.8040642097036013, "flos": 23802382575360.0, "grad_norm": 2.127673621762209, "language_loss": 0.66885608, "learning_rate": 3.892309182505833e-07, "loss": 0.69075733, "num_input_tokens_seen": 144276110, "step": 6687, "time_per_iteration": 2.72623348236084 }, { "auxiliary_loss_clip": 0.01164846, "auxiliary_loss_mlp": 0.01025437, "balance_loss_clip": 1.04436576, "balance_loss_mlp": 1.0180552, "epoch": 0.8041844525942403, "flos": 25922046009600.0, "grad_norm": 3.595990644495934, "language_loss": 0.86007571, "learning_rate": 3.887693029057675e-07, "loss": 0.88197857, "num_input_tokens_seen": 144295620, "step": 6688, "time_per_iteration": 2.8109378814697266 }, { "auxiliary_loss_clip": 0.01159797, "auxiliary_loss_mlp": 0.01026158, "balance_loss_clip": 1.04633594, "balance_loss_mlp": 1.01919079, "epoch": 0.8043046954848795, "flos": 25191120153600.0, "grad_norm": 1.6500364959744802, "language_loss": 0.81383663, "learning_rate": 3.8830793198336684e-07, "loss": 0.83569616, "num_input_tokens_seen": 144315210, "step": 6689, "time_per_iteration": 2.7970306873321533 }, { "auxiliary_loss_clip": 0.01168478, "auxiliary_loss_mlp": 0.01023821, "balance_loss_clip": 1.04685199, "balance_loss_mlp": 1.01652193, "epoch": 0.8044249383755185, "flos": 41719185123840.0, "grad_norm": 1.9493122816640505, "language_loss": 0.70032346, "learning_rate": 3.878468055533721e-07, "loss": 0.72224641, "num_input_tokens_seen": 144337750, "step": 6690, "time_per_iteration": 2.944941282272339 }, { "auxiliary_loss_clip": 0.01164598, "auxiliary_loss_mlp": 0.01025272, "balance_loss_clip": 1.04893255, "balance_loss_mlp": 1.01729655, "epoch": 0.8045451812661576, "flos": 20631434860800.0, "grad_norm": 2.474803807471229, "language_loss": 0.84931898, "learning_rate": 3.8738592368573464e-07, "loss": 0.87121767, "num_input_tokens_seen": 144355305, "step": 6691, "time_per_iteration": 2.801750421524048 }, { "auxiliary_loss_clip": 0.01152198, "auxiliary_loss_mlp": 0.01024775, "balance_loss_clip": 1.04762983, "balance_loss_mlp": 1.01660943, "epoch": 0.8046654241567968, "flos": 29711806254720.0, "grad_norm": 2.011338247714454, "language_loss": 0.88138258, "learning_rate": 3.8692528645037137e-07, "loss": 0.90315235, "num_input_tokens_seen": 144374485, "step": 6692, "time_per_iteration": 2.862536907196045 }, { "auxiliary_loss_clip": 0.01169197, "auxiliary_loss_mlp": 0.01024689, "balance_loss_clip": 1.04858208, "balance_loss_mlp": 1.01730347, "epoch": 0.8047856670474358, "flos": 17671389851520.0, "grad_norm": 2.1374175971404794, "language_loss": 0.77990007, "learning_rate": 3.8646489391715907e-07, "loss": 0.80183893, "num_input_tokens_seen": 144388780, "step": 6693, "time_per_iteration": 2.718163013458252 }, { "auxiliary_loss_clip": 0.01163805, "auxiliary_loss_mlp": 0.01024947, "balance_loss_clip": 1.04821372, "balance_loss_mlp": 1.01735663, "epoch": 0.8049059099380749, "flos": 17120699464320.0, "grad_norm": 2.9460707828501573, "language_loss": 0.87968725, "learning_rate": 3.8600474615593903e-07, "loss": 0.90157485, "num_input_tokens_seen": 144403395, "step": 6694, "time_per_iteration": 2.7838191986083984 }, { "auxiliary_loss_clip": 0.010607, "auxiliary_loss_mlp": 0.00999913, "balance_loss_clip": 1.00794911, "balance_loss_mlp": 0.99890614, "epoch": 0.805026152828714, "flos": 62212903240320.0, "grad_norm": 0.8247542461350282, "language_loss": 0.59651804, "learning_rate": 3.8554484323651605e-07, "loss": 0.6171242, "num_input_tokens_seen": 144465265, "step": 6695, "time_per_iteration": 3.4568843841552734 }, { "auxiliary_loss_clip": 0.01161277, "auxiliary_loss_mlp": 0.01053233, "balance_loss_clip": 1.046345, "balance_loss_mlp": 1.01808047, "epoch": 0.8051463957193531, "flos": 21688608971520.0, "grad_norm": 1.5540948096442486, "language_loss": 0.78850883, "learning_rate": 3.85085185228657e-07, "loss": 0.81065392, "num_input_tokens_seen": 144484235, "step": 6696, "time_per_iteration": 2.6835529804229736 }, { "auxiliary_loss_clip": 0.01159361, "auxiliary_loss_mlp": 0.01023583, "balance_loss_clip": 1.04964614, "balance_loss_mlp": 1.01578104, "epoch": 0.8052666386099921, "flos": 32051458535040.0, "grad_norm": 2.408341592881524, "language_loss": 0.73225373, "learning_rate": 3.8462577220209114e-07, "loss": 0.75408316, "num_input_tokens_seen": 144504610, "step": 6697, "time_per_iteration": 2.826460123062134 }, { "auxiliary_loss_clip": 0.01060556, "auxiliary_loss_mlp": 0.01002824, "balance_loss_clip": 1.00736201, "balance_loss_mlp": 1.00194168, "epoch": 0.8053868815006313, "flos": 67157875768320.0, "grad_norm": 0.7090577093465693, "language_loss": 0.58948898, "learning_rate": 3.8416660422651127e-07, "loss": 0.6101228, "num_input_tokens_seen": 144574260, "step": 6698, "time_per_iteration": 3.416288375854492 }, { "auxiliary_loss_clip": 0.01163498, "auxiliary_loss_mlp": 0.0102173, "balance_loss_clip": 1.04769695, "balance_loss_mlp": 1.01452398, "epoch": 0.8055071243912704, "flos": 23837000307840.0, "grad_norm": 1.7497956857266581, "language_loss": 0.67971361, "learning_rate": 3.837076813715723e-07, "loss": 0.70156598, "num_input_tokens_seen": 144594145, "step": 6699, "time_per_iteration": 2.851097345352173 }, { "auxiliary_loss_clip": 0.01160734, "auxiliary_loss_mlp": 0.01029418, "balance_loss_clip": 1.05040753, "balance_loss_mlp": 1.02100158, "epoch": 0.8056273672819094, "flos": 21324510760320.0, "grad_norm": 1.8675371731753023, "language_loss": 0.75119513, "learning_rate": 3.832490037068941e-07, "loss": 0.77309668, "num_input_tokens_seen": 144612935, "step": 6700, "time_per_iteration": 2.8048253059387207 }, { "auxiliary_loss_clip": 0.0114862, "auxiliary_loss_mlp": 0.01020617, "balance_loss_clip": 1.04723561, "balance_loss_mlp": 1.01327109, "epoch": 0.8057476101725486, "flos": 25768383626880.0, "grad_norm": 2.6088475822689183, "language_loss": 0.7608971, "learning_rate": 3.827905713020554e-07, "loss": 0.7825895, "num_input_tokens_seen": 144630580, "step": 6701, "time_per_iteration": 3.7771222591400146 }, { "auxiliary_loss_clip": 0.01169275, "auxiliary_loss_mlp": 0.01027966, "balance_loss_clip": 1.05367434, "balance_loss_mlp": 1.0194428, "epoch": 0.8058678530631876, "flos": 24535283679360.0, "grad_norm": 2.225879401139023, "language_loss": 0.68846679, "learning_rate": 3.823323842266017e-07, "loss": 0.71043921, "num_input_tokens_seen": 144649975, "step": 6702, "time_per_iteration": 3.8418145179748535 }, { "auxiliary_loss_clip": 0.01165162, "auxiliary_loss_mlp": 0.01028205, "balance_loss_clip": 1.04513574, "balance_loss_mlp": 1.02098656, "epoch": 0.8059880959538267, "flos": 24753728240640.0, "grad_norm": 4.864294607508518, "language_loss": 0.72910655, "learning_rate": 3.818744425500393e-07, "loss": 0.75104022, "num_input_tokens_seen": 144667990, "step": 6703, "time_per_iteration": 2.7706942558288574 }, { "auxiliary_loss_clip": 0.0115311, "auxiliary_loss_mlp": 0.01023467, "balance_loss_clip": 1.04619074, "balance_loss_mlp": 1.01642132, "epoch": 0.8061083388444659, "flos": 22196349671040.0, "grad_norm": 1.7870507762714494, "language_loss": 0.80567026, "learning_rate": 3.8141674634183675e-07, "loss": 0.82743603, "num_input_tokens_seen": 144687020, "step": 6704, "time_per_iteration": 2.683228015899658 }, { "auxiliary_loss_clip": 0.01153636, "auxiliary_loss_mlp": 0.0102135, "balance_loss_clip": 1.04758048, "balance_loss_mlp": 1.0144031, "epoch": 0.8062285817351049, "flos": 30044195735040.0, "grad_norm": 1.6299711981243232, "language_loss": 0.66277462, "learning_rate": 3.809592956714278e-07, "loss": 0.68452448, "num_input_tokens_seen": 144710255, "step": 6705, "time_per_iteration": 2.9151477813720703 }, { "auxiliary_loss_clip": 0.01174109, "auxiliary_loss_mlp": 0.01025499, "balance_loss_clip": 1.05321491, "balance_loss_mlp": 1.018049, "epoch": 0.806348824625744, "flos": 22782591544320.0, "grad_norm": 2.1246394184498887, "language_loss": 0.74746841, "learning_rate": 3.805020906082057e-07, "loss": 0.76946449, "num_input_tokens_seen": 144728830, "step": 6706, "time_per_iteration": 2.8108034133911133 }, { "auxiliary_loss_clip": 0.01165494, "auxiliary_loss_mlp": 0.01029574, "balance_loss_clip": 1.04820395, "balance_loss_mlp": 1.02228451, "epoch": 0.8064690675163831, "flos": 23404600385280.0, "grad_norm": 2.13713440811846, "language_loss": 0.80850029, "learning_rate": 3.8004513122152917e-07, "loss": 0.83045095, "num_input_tokens_seen": 144747140, "step": 6707, "time_per_iteration": 3.7789595127105713 }, { "auxiliary_loss_clip": 0.01152269, "auxiliary_loss_mlp": 0.01024131, "balance_loss_clip": 1.04899275, "balance_loss_mlp": 1.01693082, "epoch": 0.8065893104070222, "flos": 24060903736320.0, "grad_norm": 1.7852862344154612, "language_loss": 0.67298865, "learning_rate": 3.79588417580718e-07, "loss": 0.69475269, "num_input_tokens_seen": 144765250, "step": 6708, "time_per_iteration": 2.766097068786621 }, { "auxiliary_loss_clip": 0.01163445, "auxiliary_loss_mlp": 0.01025268, "balance_loss_clip": 1.04725289, "balance_loss_mlp": 1.01796079, "epoch": 0.8067095532976613, "flos": 22305410340480.0, "grad_norm": 1.8689382823905307, "language_loss": 0.76376271, "learning_rate": 3.791319497550558e-07, "loss": 0.78564984, "num_input_tokens_seen": 144783080, "step": 6709, "time_per_iteration": 2.7601473331451416 }, { "auxiliary_loss_clip": 0.01162315, "auxiliary_loss_mlp": 0.01050772, "balance_loss_clip": 1.0476613, "balance_loss_mlp": 1.01460493, "epoch": 0.8068297961883004, "flos": 17129498296320.0, "grad_norm": 2.018256825949698, "language_loss": 0.70852888, "learning_rate": 3.78675727813788e-07, "loss": 0.73065972, "num_input_tokens_seen": 144800645, "step": 6710, "time_per_iteration": 2.772017002105713 }, { "auxiliary_loss_clip": 0.01162184, "auxiliary_loss_mlp": 0.01026101, "balance_loss_clip": 1.04854882, "balance_loss_mlp": 1.01841784, "epoch": 0.8069500390789395, "flos": 22018843635840.0, "grad_norm": 1.6814325351265684, "language_loss": 0.73585081, "learning_rate": 3.782197518261225e-07, "loss": 0.75773364, "num_input_tokens_seen": 144820085, "step": 6711, "time_per_iteration": 2.7701354026794434 }, { "auxiliary_loss_clip": 0.01166418, "auxiliary_loss_mlp": 0.01023386, "balance_loss_clip": 1.04852414, "balance_loss_mlp": 1.01588166, "epoch": 0.8070702819695785, "flos": 19244241567360.0, "grad_norm": 1.9631239294421001, "language_loss": 0.95561755, "learning_rate": 3.777640218612319e-07, "loss": 0.97751558, "num_input_tokens_seen": 144838070, "step": 6712, "time_per_iteration": 3.7257771492004395 }, { "auxiliary_loss_clip": 0.01159673, "auxiliary_loss_mlp": 0.01024914, "balance_loss_clip": 1.04672813, "balance_loss_mlp": 1.01758897, "epoch": 0.8071905248602176, "flos": 21544320038400.0, "grad_norm": 2.682555227321322, "language_loss": 0.71602911, "learning_rate": 3.773085379882488e-07, "loss": 0.73787498, "num_input_tokens_seen": 144857125, "step": 6713, "time_per_iteration": 2.874424934387207 }, { "auxiliary_loss_clip": 0.01166845, "auxiliary_loss_mlp": 0.01051948, "balance_loss_clip": 1.04753637, "balance_loss_mlp": 1.01393032, "epoch": 0.8073107677508568, "flos": 37268309105280.0, "grad_norm": 4.30840865720721, "language_loss": 0.7576521, "learning_rate": 3.768533002762715e-07, "loss": 0.77984005, "num_input_tokens_seen": 144880660, "step": 6714, "time_per_iteration": 2.9189536571502686 }, { "auxiliary_loss_clip": 0.01161592, "auxiliary_loss_mlp": 0.01024435, "balance_loss_clip": 1.046911, "balance_loss_mlp": 1.01716328, "epoch": 0.8074310106414958, "flos": 28366269759360.0, "grad_norm": 1.6804122284940772, "language_loss": 0.76912725, "learning_rate": 3.763983087943572e-07, "loss": 0.79098755, "num_input_tokens_seen": 144900050, "step": 6715, "time_per_iteration": 2.913296937942505 }, { "auxiliary_loss_clip": 0.011562, "auxiliary_loss_mlp": 0.01050998, "balance_loss_clip": 1.04569101, "balance_loss_mlp": 1.01586056, "epoch": 0.8075512535321349, "flos": 24281646768000.0, "grad_norm": 1.6458237035351104, "language_loss": 0.81014335, "learning_rate": 3.759435636115282e-07, "loss": 0.83221537, "num_input_tokens_seen": 144920835, "step": 6716, "time_per_iteration": 2.788113832473755 }, { "auxiliary_loss_clip": 0.01148019, "auxiliary_loss_mlp": 0.01055045, "balance_loss_clip": 1.05012202, "balance_loss_mlp": 1.01831007, "epoch": 0.807671496422774, "flos": 26030855283840.0, "grad_norm": 2.021622241777889, "language_loss": 0.73097217, "learning_rate": 3.7548906479676967e-07, "loss": 0.75300276, "num_input_tokens_seen": 144940430, "step": 6717, "time_per_iteration": 2.9119760990142822 }, { "auxiliary_loss_clip": 0.01165527, "auxiliary_loss_mlp": 0.01024191, "balance_loss_clip": 1.04514599, "balance_loss_mlp": 1.01672816, "epoch": 0.8077917393134131, "flos": 23730740899200.0, "grad_norm": 2.3662142429409, "language_loss": 0.71270001, "learning_rate": 3.7503481241902855e-07, "loss": 0.73459721, "num_input_tokens_seen": 144960405, "step": 6718, "time_per_iteration": 2.861334800720215 }, { "auxiliary_loss_clip": 0.01160661, "auxiliary_loss_mlp": 0.01050275, "balance_loss_clip": 1.0471983, "balance_loss_mlp": 1.01571369, "epoch": 0.8079119822040521, "flos": 18402028398720.0, "grad_norm": 1.68240332947459, "language_loss": 0.80280817, "learning_rate": 3.745808065472145e-07, "loss": 0.82491755, "num_input_tokens_seen": 144977700, "step": 6719, "time_per_iteration": 2.7244064807891846 }, { "auxiliary_loss_clip": 0.01159223, "auxiliary_loss_mlp": 0.01030342, "balance_loss_clip": 1.05018258, "balance_loss_mlp": 1.02336216, "epoch": 0.8080322250946913, "flos": 23621787970560.0, "grad_norm": 2.1347854230797507, "language_loss": 0.76216209, "learning_rate": 3.741270472501994e-07, "loss": 0.78405774, "num_input_tokens_seen": 144998340, "step": 6720, "time_per_iteration": 2.722931146621704 }, { "auxiliary_loss_clip": 0.01158648, "auxiliary_loss_mlp": 0.01026677, "balance_loss_clip": 1.04800498, "balance_loss_mlp": 1.01893985, "epoch": 0.8081524679853304, "flos": 22820692896000.0, "grad_norm": 4.240886918673602, "language_loss": 0.72582674, "learning_rate": 3.736735345968183e-07, "loss": 0.74768001, "num_input_tokens_seen": 145017950, "step": 6721, "time_per_iteration": 2.676797389984131 }, { "auxiliary_loss_clip": 0.01166662, "auxiliary_loss_mlp": 0.01021593, "balance_loss_clip": 1.04878879, "balance_loss_mlp": 1.01437533, "epoch": 0.8082727108759694, "flos": 17640004343040.0, "grad_norm": 1.6926732401417453, "language_loss": 0.78630579, "learning_rate": 3.7322026865586986e-07, "loss": 0.80818838, "num_input_tokens_seen": 145036985, "step": 6722, "time_per_iteration": 2.635462760925293 }, { "auxiliary_loss_clip": 0.01173468, "auxiliary_loss_mlp": 0.01027635, "balance_loss_clip": 1.05127037, "balance_loss_mlp": 1.01917708, "epoch": 0.8083929537666086, "flos": 25958172113280.0, "grad_norm": 6.6970106068159545, "language_loss": 0.73512363, "learning_rate": 3.7276724949611206e-07, "loss": 0.75713468, "num_input_tokens_seen": 145057095, "step": 6723, "time_per_iteration": 2.5795559883117676 }, { "auxiliary_loss_clip": 0.01163332, "auxiliary_loss_mlp": 0.01023485, "balance_loss_clip": 1.04835808, "balance_loss_mlp": 1.01602876, "epoch": 0.8085131966572476, "flos": 27089178629760.0, "grad_norm": 1.939943005827365, "language_loss": 0.75281459, "learning_rate": 3.723144771862694e-07, "loss": 0.77468276, "num_input_tokens_seen": 145077735, "step": 6724, "time_per_iteration": 2.7094101905822754 }, { "auxiliary_loss_clip": 0.01161586, "auxiliary_loss_mlp": 0.0102268, "balance_loss_clip": 1.04736209, "balance_loss_mlp": 1.0148834, "epoch": 0.8086334395478867, "flos": 23988543788160.0, "grad_norm": 1.8323570996534446, "language_loss": 0.7678659, "learning_rate": 3.718619517950263e-07, "loss": 0.78970861, "num_input_tokens_seen": 145098330, "step": 6725, "time_per_iteration": 2.6282520294189453 }, { "auxiliary_loss_clip": 0.01168081, "auxiliary_loss_mlp": 0.01025579, "balance_loss_clip": 1.04889727, "balance_loss_mlp": 1.01812851, "epoch": 0.8087536824385259, "flos": 20405879406720.0, "grad_norm": 1.996838706846849, "language_loss": 0.77070713, "learning_rate": 3.714096733910301e-07, "loss": 0.79264367, "num_input_tokens_seen": 145115855, "step": 6726, "time_per_iteration": 3.4135091304779053 }, { "auxiliary_loss_clip": 0.01174919, "auxiliary_loss_mlp": 0.01024594, "balance_loss_clip": 1.05120516, "balance_loss_mlp": 1.01631522, "epoch": 0.8088739253291649, "flos": 25919639798400.0, "grad_norm": 2.8009220126575447, "language_loss": 0.70236981, "learning_rate": 3.709576420428926e-07, "loss": 0.72436494, "num_input_tokens_seen": 145136655, "step": 6727, "time_per_iteration": 2.6080524921417236 }, { "auxiliary_loss_clip": 0.0116209, "auxiliary_loss_mlp": 0.01024297, "balance_loss_clip": 1.04558253, "balance_loss_mlp": 1.01703775, "epoch": 0.808994168219804, "flos": 28402072640640.0, "grad_norm": 5.137277857129151, "language_loss": 0.7337817, "learning_rate": 3.7050585781918463e-07, "loss": 0.75564557, "num_input_tokens_seen": 145156955, "step": 6728, "time_per_iteration": 3.7453856468200684 }, { "auxiliary_loss_clip": 0.01169109, "auxiliary_loss_mlp": 0.01027269, "balance_loss_clip": 1.04850876, "balance_loss_mlp": 1.0195086, "epoch": 0.8091144111104431, "flos": 17421056991360.0, "grad_norm": 2.573212176732892, "language_loss": 0.6871382, "learning_rate": 3.700543207884428e-07, "loss": 0.70910197, "num_input_tokens_seen": 145173865, "step": 6729, "time_per_iteration": 2.8720498085021973 }, { "auxiliary_loss_clip": 0.01163377, "auxiliary_loss_mlp": 0.01029274, "balance_loss_clip": 1.04831326, "balance_loss_mlp": 1.02175212, "epoch": 0.8092346540010822, "flos": 32153803361280.0, "grad_norm": 2.8075103262130896, "language_loss": 0.71059871, "learning_rate": 3.6960303101916466e-07, "loss": 0.73252523, "num_input_tokens_seen": 145193780, "step": 6730, "time_per_iteration": 3.0040159225463867 }, { "auxiliary_loss_clip": 0.01060415, "auxiliary_loss_mlp": 0.01035787, "balance_loss_clip": 1.00764823, "balance_loss_mlp": 1.0003314, "epoch": 0.8093548968917212, "flos": 58035093390720.0, "grad_norm": 0.7347377212130202, "language_loss": 0.55483663, "learning_rate": 3.6915198857981047e-07, "loss": 0.57579863, "num_input_tokens_seen": 145258980, "step": 6731, "time_per_iteration": 3.438173294067383 }, { "auxiliary_loss_clip": 0.01157451, "auxiliary_loss_mlp": 0.01028662, "balance_loss_clip": 1.04957318, "balance_loss_mlp": 1.02060962, "epoch": 0.8094751397823604, "flos": 27381599251200.0, "grad_norm": 1.7704612139883078, "language_loss": 0.68339264, "learning_rate": 3.687011935388027e-07, "loss": 0.70525384, "num_input_tokens_seen": 145281875, "step": 6732, "time_per_iteration": 2.8534183502197266 }, { "auxiliary_loss_clip": 0.01163143, "auxiliary_loss_mlp": 0.01023004, "balance_loss_clip": 1.04643047, "balance_loss_mlp": 1.01612902, "epoch": 0.8095953826729995, "flos": 24061083304320.0, "grad_norm": 1.9567763749822094, "language_loss": 0.72961533, "learning_rate": 3.6825064596452646e-07, "loss": 0.75147676, "num_input_tokens_seen": 145302220, "step": 6733, "time_per_iteration": 3.66640043258667 }, { "auxiliary_loss_clip": 0.01164016, "auxiliary_loss_mlp": 0.01026842, "balance_loss_clip": 1.04606044, "balance_loss_mlp": 1.01950502, "epoch": 0.8097156255636385, "flos": 23951412103680.0, "grad_norm": 1.6862256759891419, "language_loss": 0.70416248, "learning_rate": 3.678003459253305e-07, "loss": 0.72607106, "num_input_tokens_seen": 145323070, "step": 6734, "time_per_iteration": 2.8219358921051025 }, { "auxiliary_loss_clip": 0.01158964, "auxiliary_loss_mlp": 0.01021588, "balance_loss_clip": 1.0476594, "balance_loss_mlp": 1.01370227, "epoch": 0.8098358684542777, "flos": 21799142098560.0, "grad_norm": 2.2171700789579707, "language_loss": 0.73870587, "learning_rate": 3.673502934895236e-07, "loss": 0.76051146, "num_input_tokens_seen": 145342575, "step": 6735, "time_per_iteration": 2.8240301609039307 }, { "auxiliary_loss_clip": 0.01060514, "auxiliary_loss_mlp": 0.01002179, "balance_loss_clip": 1.00760555, "balance_loss_mlp": 1.001261, "epoch": 0.8099561113449167, "flos": 68809515966720.0, "grad_norm": 0.7699207124190267, "language_loss": 0.57984519, "learning_rate": 3.669004887253802e-07, "loss": 0.60047221, "num_input_tokens_seen": 145408865, "step": 6736, "time_per_iteration": 3.358430862426758 }, { "auxiliary_loss_clip": 0.01165368, "auxiliary_loss_mlp": 0.010316, "balance_loss_clip": 1.04890537, "balance_loss_mlp": 1.02435529, "epoch": 0.8100763542355558, "flos": 23586056916480.0, "grad_norm": 1.4976145978989894, "language_loss": 0.78960657, "learning_rate": 3.664509317011335e-07, "loss": 0.81157625, "num_input_tokens_seen": 145429200, "step": 6737, "time_per_iteration": 2.776052474975586 }, { "auxiliary_loss_clip": 0.01165529, "auxiliary_loss_mlp": 0.01028902, "balance_loss_clip": 1.04954565, "balance_loss_mlp": 1.02115965, "epoch": 0.810196597126195, "flos": 31650408207360.0, "grad_norm": 2.0809307606447502, "language_loss": 0.73888683, "learning_rate": 3.6600162248498134e-07, "loss": 0.76083118, "num_input_tokens_seen": 145452830, "step": 6738, "time_per_iteration": 3.8155105113983154 }, { "auxiliary_loss_clip": 0.01138599, "auxiliary_loss_mlp": 0.01025425, "balance_loss_clip": 1.04573989, "balance_loss_mlp": 1.01794457, "epoch": 0.810316840016834, "flos": 24900459298560.0, "grad_norm": 2.3297921392567225, "language_loss": 0.76312196, "learning_rate": 3.6555256114508426e-07, "loss": 0.78476214, "num_input_tokens_seen": 145472625, "step": 6739, "time_per_iteration": 2.843048334121704 }, { "auxiliary_loss_clip": 0.01164805, "auxiliary_loss_mlp": 0.01031315, "balance_loss_clip": 1.0475024, "balance_loss_mlp": 1.02307177, "epoch": 0.8104370829074731, "flos": 27965003950080.0, "grad_norm": 1.923899046043354, "language_loss": 0.72856623, "learning_rate": 3.651037477495642e-07, "loss": 0.75052744, "num_input_tokens_seen": 145494075, "step": 6740, "time_per_iteration": 2.78775691986084 }, { "auxiliary_loss_clip": 0.01167161, "auxiliary_loss_mlp": 0.01024052, "balance_loss_clip": 1.04670835, "balance_loss_mlp": 1.01644635, "epoch": 0.8105573257981122, "flos": 24640752988800.0, "grad_norm": 2.032210756518652, "language_loss": 0.68307209, "learning_rate": 3.6465518236650584e-07, "loss": 0.70498425, "num_input_tokens_seen": 145514220, "step": 6741, "time_per_iteration": 2.8196098804473877 }, { "auxiliary_loss_clip": 0.01156626, "auxiliary_loss_mlp": 0.01025181, "balance_loss_clip": 1.04643309, "balance_loss_mlp": 1.01785219, "epoch": 0.8106775686887513, "flos": 26358935132160.0, "grad_norm": 1.7012178142497973, "language_loss": 0.78307867, "learning_rate": 3.642068650639558e-07, "loss": 0.80489671, "num_input_tokens_seen": 145533965, "step": 6742, "time_per_iteration": 2.789506673812866 }, { "auxiliary_loss_clip": 0.01157667, "auxiliary_loss_mlp": 0.0102753, "balance_loss_clip": 1.04563999, "balance_loss_mlp": 1.01944423, "epoch": 0.8107978115793903, "flos": 27271892136960.0, "grad_norm": 7.461076774754253, "language_loss": 0.64492249, "learning_rate": 3.6375879590992334e-07, "loss": 0.66677439, "num_input_tokens_seen": 145554310, "step": 6743, "time_per_iteration": 2.7896060943603516 }, { "auxiliary_loss_clip": 0.01158043, "auxiliary_loss_mlp": 0.0102412, "balance_loss_clip": 1.04668725, "balance_loss_mlp": 1.01664019, "epoch": 0.8109180544700295, "flos": 24934322845440.0, "grad_norm": 2.1774244492191763, "language_loss": 0.81022847, "learning_rate": 3.6331097497238173e-07, "loss": 0.8320502, "num_input_tokens_seen": 145573755, "step": 6744, "time_per_iteration": 2.8197853565216064 }, { "auxiliary_loss_clip": 0.01153398, "auxiliary_loss_mlp": 0.01025254, "balance_loss_clip": 1.04533184, "balance_loss_mlp": 1.01845336, "epoch": 0.8110382973606686, "flos": 21105383840640.0, "grad_norm": 2.003034928548081, "language_loss": 0.80240345, "learning_rate": 3.628634023192627e-07, "loss": 0.82418996, "num_input_tokens_seen": 145594000, "step": 6745, "time_per_iteration": 2.8375771045684814 }, { "auxiliary_loss_clip": 0.011674, "auxiliary_loss_mlp": 0.01025567, "balance_loss_clip": 1.04701078, "balance_loss_mlp": 1.0185008, "epoch": 0.8111585402513076, "flos": 15414081500160.0, "grad_norm": 2.1964513032047654, "language_loss": 0.75310385, "learning_rate": 3.624160780184644e-07, "loss": 0.77503353, "num_input_tokens_seen": 145611215, "step": 6746, "time_per_iteration": 2.784963369369507 }, { "auxiliary_loss_clip": 0.0115896, "auxiliary_loss_mlp": 0.01026312, "balance_loss_clip": 1.04881811, "balance_loss_mlp": 1.01831937, "epoch": 0.8112787831419467, "flos": 24095736950400.0, "grad_norm": 1.8294611601213306, "language_loss": 0.74506772, "learning_rate": 3.6196900213784496e-07, "loss": 0.76692045, "num_input_tokens_seen": 145630530, "step": 6747, "time_per_iteration": 2.798924446105957 }, { "auxiliary_loss_clip": 0.01163607, "auxiliary_loss_mlp": 0.01022041, "balance_loss_clip": 1.0458951, "balance_loss_mlp": 1.01493347, "epoch": 0.8113990260325858, "flos": 20483374999680.0, "grad_norm": 2.1100231742101867, "language_loss": 0.86352789, "learning_rate": 3.6152217474522527e-07, "loss": 0.88538438, "num_input_tokens_seen": 145647345, "step": 6748, "time_per_iteration": 2.6635448932647705 }, { "auxiliary_loss_clip": 0.01164625, "auxiliary_loss_mlp": 0.01029314, "balance_loss_clip": 1.05013621, "balance_loss_mlp": 1.02216423, "epoch": 0.8115192689232249, "flos": 24901141656960.0, "grad_norm": 1.6096554993269623, "language_loss": 0.72614187, "learning_rate": 3.6107559590838975e-07, "loss": 0.74808127, "num_input_tokens_seen": 145666330, "step": 6749, "time_per_iteration": 2.7574448585510254 }, { "auxiliary_loss_clip": 0.01146735, "auxiliary_loss_mlp": 0.01021819, "balance_loss_clip": 1.04692268, "balance_loss_mlp": 1.01446962, "epoch": 0.811639511813864, "flos": 24057204635520.0, "grad_norm": 2.4449116765856824, "language_loss": 0.6635921, "learning_rate": 3.606292656950822e-07, "loss": 0.6852777, "num_input_tokens_seen": 145684740, "step": 6750, "time_per_iteration": 2.8163862228393555 }, { "auxiliary_loss_clip": 0.01156894, "auxiliary_loss_mlp": 0.01024302, "balance_loss_clip": 1.04580688, "balance_loss_mlp": 1.01642561, "epoch": 0.8117597547045031, "flos": 23185150243200.0, "grad_norm": 2.308408454667451, "language_loss": 0.8665899, "learning_rate": 3.601831841730121e-07, "loss": 0.88840187, "num_input_tokens_seen": 145702660, "step": 6751, "time_per_iteration": 2.8134560585021973 }, { "auxiliary_loss_clip": 0.01161693, "auxiliary_loss_mlp": 0.01025705, "balance_loss_clip": 1.04727089, "balance_loss_mlp": 1.0185467, "epoch": 0.8118799975951422, "flos": 23040250778880.0, "grad_norm": 1.6990264499947214, "language_loss": 0.73132563, "learning_rate": 3.5973735140984916e-07, "loss": 0.75319958, "num_input_tokens_seen": 145722830, "step": 6752, "time_per_iteration": 2.7119178771972656 }, { "auxiliary_loss_clip": 0.01151584, "auxiliary_loss_mlp": 0.01054875, "balance_loss_clip": 1.0453887, "balance_loss_mlp": 1.01986647, "epoch": 0.8120002404857812, "flos": 24639962889600.0, "grad_norm": 1.9584745903513678, "language_loss": 0.79652816, "learning_rate": 3.5929176747322607e-07, "loss": 0.81859273, "num_input_tokens_seen": 145741935, "step": 6753, "time_per_iteration": 3.706186056137085 }, { "auxiliary_loss_clip": 0.01062173, "auxiliary_loss_mlp": 0.01004388, "balance_loss_clip": 1.00807667, "balance_loss_mlp": 1.00333261, "epoch": 0.8121204833764204, "flos": 57415742156160.0, "grad_norm": 0.8124185930361587, "language_loss": 0.56159484, "learning_rate": 3.588464324307372e-07, "loss": 0.58226049, "num_input_tokens_seen": 145805560, "step": 6754, "time_per_iteration": 3.3328769207000732 }, { "auxiliary_loss_clip": 0.01166767, "auxiliary_loss_mlp": 0.01031447, "balance_loss_clip": 1.04641986, "balance_loss_mlp": 1.02366292, "epoch": 0.8122407262670595, "flos": 19464589549440.0, "grad_norm": 1.6898172148165287, "language_loss": 0.7543577, "learning_rate": 3.584013463499391e-07, "loss": 0.77633989, "num_input_tokens_seen": 145824180, "step": 6755, "time_per_iteration": 3.6403729915618896 }, { "auxiliary_loss_clip": 0.01059102, "auxiliary_loss_mlp": 0.01000314, "balance_loss_clip": 1.00832105, "balance_loss_mlp": 0.99924749, "epoch": 0.8123609691576985, "flos": 56425325472000.0, "grad_norm": 0.8132956432109334, "language_loss": 0.6445936, "learning_rate": 3.579565092983521e-07, "loss": 0.66518784, "num_input_tokens_seen": 145885300, "step": 6756, "time_per_iteration": 3.1403796672821045 }, { "auxiliary_loss_clip": 0.01167463, "auxiliary_loss_mlp": 0.01025399, "balance_loss_clip": 1.04811406, "balance_loss_mlp": 1.01833332, "epoch": 0.8124812120483377, "flos": 20631973564800.0, "grad_norm": 2.316862836825762, "language_loss": 0.83996439, "learning_rate": 3.575119213434565e-07, "loss": 0.861893, "num_input_tokens_seen": 145903815, "step": 6757, "time_per_iteration": 2.722541332244873 }, { "auxiliary_loss_clip": 0.01161625, "auxiliary_loss_mlp": 0.01023251, "balance_loss_clip": 1.04697466, "balance_loss_mlp": 1.0160358, "epoch": 0.8126014549389767, "flos": 22492397566080.0, "grad_norm": 1.89583867625654, "language_loss": 0.81820905, "learning_rate": 3.5706758255269765e-07, "loss": 0.84005779, "num_input_tokens_seen": 145922270, "step": 6758, "time_per_iteration": 2.6611506938934326 }, { "auxiliary_loss_clip": 0.01164032, "auxiliary_loss_mlp": 0.01029453, "balance_loss_clip": 1.04636908, "balance_loss_mlp": 1.0214119, "epoch": 0.8127216978296158, "flos": 23287961946240.0, "grad_norm": 1.6225501718504494, "language_loss": 0.69658315, "learning_rate": 3.566234929934795e-07, "loss": 0.71851802, "num_input_tokens_seen": 145941470, "step": 6759, "time_per_iteration": 3.7014896869659424 }, { "auxiliary_loss_clip": 0.01159347, "auxiliary_loss_mlp": 0.01025887, "balance_loss_clip": 1.04668546, "balance_loss_mlp": 1.01818621, "epoch": 0.812841940720255, "flos": 25154994049920.0, "grad_norm": 1.4255611110122097, "language_loss": 0.71879125, "learning_rate": 3.561796527331706e-07, "loss": 0.7406435, "num_input_tokens_seen": 145963145, "step": 6760, "time_per_iteration": 2.7081844806671143 }, { "auxiliary_loss_clip": 0.0116177, "auxiliary_loss_mlp": 0.0102409, "balance_loss_clip": 1.04868507, "balance_loss_mlp": 1.01710153, "epoch": 0.812962183610894, "flos": 26648446752000.0, "grad_norm": 1.8900938252778994, "language_loss": 0.77553827, "learning_rate": 3.5573606183910163e-07, "loss": 0.79739684, "num_input_tokens_seen": 145983150, "step": 6761, "time_per_iteration": 2.9109301567077637 }, { "auxiliary_loss_clip": 0.01169804, "auxiliary_loss_mlp": 0.01022592, "balance_loss_clip": 1.04594886, "balance_loss_mlp": 1.0148201, "epoch": 0.8130824265015331, "flos": 24966965329920.0, "grad_norm": 1.8666041131796787, "language_loss": 0.78826022, "learning_rate": 3.5529272037856493e-07, "loss": 0.81018418, "num_input_tokens_seen": 146001365, "step": 6762, "time_per_iteration": 2.821932554244995 }, { "auxiliary_loss_clip": 0.01063447, "auxiliary_loss_mlp": 0.01003564, "balance_loss_clip": 1.00802112, "balance_loss_mlp": 1.00250292, "epoch": 0.8132026693921722, "flos": 67622918175360.0, "grad_norm": 0.7210544032698317, "language_loss": 0.53840721, "learning_rate": 3.548496284188149e-07, "loss": 0.55907732, "num_input_tokens_seen": 146061570, "step": 6763, "time_per_iteration": 3.3520586490631104 }, { "auxiliary_loss_clip": 0.0114957, "auxiliary_loss_mlp": 0.01026443, "balance_loss_clip": 1.04671896, "balance_loss_mlp": 1.01891756, "epoch": 0.8133229122828113, "flos": 19495149045120.0, "grad_norm": 1.7737293243029648, "language_loss": 0.79363787, "learning_rate": 3.544067860270681e-07, "loss": 0.8153981, "num_input_tokens_seen": 146079145, "step": 6764, "time_per_iteration": 3.634443759918213 }, { "auxiliary_loss_clip": 0.0116175, "auxiliary_loss_mlp": 0.01030644, "balance_loss_clip": 1.04753053, "balance_loss_mlp": 1.02272248, "epoch": 0.8134431551734503, "flos": 20668135582080.0, "grad_norm": 1.808860387820226, "language_loss": 0.71127945, "learning_rate": 3.539641932705029e-07, "loss": 0.73320341, "num_input_tokens_seen": 146097625, "step": 6765, "time_per_iteration": 2.744879961013794 }, { "auxiliary_loss_clip": 0.01170835, "auxiliary_loss_mlp": 0.01021947, "balance_loss_clip": 1.0481416, "balance_loss_mlp": 1.01421022, "epoch": 0.8135633980640895, "flos": 21507332008320.0, "grad_norm": 2.889364442579274, "language_loss": 0.77288103, "learning_rate": 3.53521850216262e-07, "loss": 0.7948088, "num_input_tokens_seen": 146117195, "step": 6766, "time_per_iteration": 2.670319080352783 }, { "auxiliary_loss_clip": 0.01168277, "auxiliary_loss_mlp": 0.0102463, "balance_loss_clip": 1.04795063, "balance_loss_mlp": 1.01686954, "epoch": 0.8136836409547286, "flos": 20554442058240.0, "grad_norm": 1.7295281829163889, "language_loss": 0.76780671, "learning_rate": 3.530797569314461e-07, "loss": 0.78973579, "num_input_tokens_seen": 146136220, "step": 6767, "time_per_iteration": 2.7747116088867188 }, { "auxiliary_loss_clip": 0.01170645, "auxiliary_loss_mlp": 0.01028407, "balance_loss_clip": 1.04975271, "balance_loss_mlp": 1.02036059, "epoch": 0.8138038838453676, "flos": 20299045380480.0, "grad_norm": 2.6802411923168177, "language_loss": 0.77791667, "learning_rate": 3.5263791348312235e-07, "loss": 0.79990715, "num_input_tokens_seen": 146155415, "step": 6768, "time_per_iteration": 2.6512513160705566 }, { "auxiliary_loss_clip": 0.01158453, "auxiliary_loss_mlp": 0.01023998, "balance_loss_clip": 1.04420924, "balance_loss_mlp": 1.01641679, "epoch": 0.8139241267360068, "flos": 29789840551680.0, "grad_norm": 1.7283215473935423, "language_loss": 0.70710397, "learning_rate": 3.521963199383171e-07, "loss": 0.72892851, "num_input_tokens_seen": 146178370, "step": 6769, "time_per_iteration": 2.782559871673584 }, { "auxiliary_loss_clip": 0.01157557, "auxiliary_loss_mlp": 0.01026043, "balance_loss_clip": 1.04976177, "balance_loss_mlp": 1.01788592, "epoch": 0.8140443696266458, "flos": 19713270384000.0, "grad_norm": 3.9914442424397665, "language_loss": 0.7679199, "learning_rate": 3.517549763640197e-07, "loss": 0.78975594, "num_input_tokens_seen": 146196010, "step": 6770, "time_per_iteration": 2.70386004447937 }, { "auxiliary_loss_clip": 0.01164722, "auxiliary_loss_mlp": 0.01055465, "balance_loss_clip": 1.04988921, "balance_loss_mlp": 1.02015829, "epoch": 0.8141646125172849, "flos": 27160568910720.0, "grad_norm": 2.5090455823918707, "language_loss": 0.7107954, "learning_rate": 3.513138828271829e-07, "loss": 0.73299724, "num_input_tokens_seen": 146215880, "step": 6771, "time_per_iteration": 2.7499639987945557 }, { "auxiliary_loss_clip": 0.01152841, "auxiliary_loss_mlp": 0.01024823, "balance_loss_clip": 1.04628932, "balance_loss_mlp": 1.01745272, "epoch": 0.8142848554079241, "flos": 39673102700160.0, "grad_norm": 1.750776507573079, "language_loss": 0.6996122, "learning_rate": 3.508730393947179e-07, "loss": 0.72138882, "num_input_tokens_seen": 146239135, "step": 6772, "time_per_iteration": 2.876061201095581 }, { "auxiliary_loss_clip": 0.01158909, "auxiliary_loss_mlp": 0.01028319, "balance_loss_clip": 1.0489223, "balance_loss_mlp": 1.02097583, "epoch": 0.8144050982985631, "flos": 22237288197120.0, "grad_norm": 6.567435577967118, "language_loss": 0.72099352, "learning_rate": 3.504324461335024e-07, "loss": 0.7428658, "num_input_tokens_seen": 146259245, "step": 6773, "time_per_iteration": 2.7357048988342285 }, { "auxiliary_loss_clip": 0.01152679, "auxiliary_loss_mlp": 0.01032324, "balance_loss_clip": 1.05027199, "balance_loss_mlp": 1.02373505, "epoch": 0.8145253411892022, "flos": 23038239617280.0, "grad_norm": 2.163270733173385, "language_loss": 0.88432705, "learning_rate": 3.499921031103732e-07, "loss": 0.90617716, "num_input_tokens_seen": 146280015, "step": 6774, "time_per_iteration": 2.8785560131073 }, { "auxiliary_loss_clip": 0.01170178, "auxiliary_loss_mlp": 0.01026002, "balance_loss_clip": 1.0479598, "balance_loss_mlp": 1.0187782, "epoch": 0.8146455840798413, "flos": 24827668387200.0, "grad_norm": 1.625526030692439, "language_loss": 0.7826342, "learning_rate": 3.4955201039212987e-07, "loss": 0.80459595, "num_input_tokens_seen": 146300935, "step": 6775, "time_per_iteration": 2.726027011871338 }, { "auxiliary_loss_clip": 0.01170003, "auxiliary_loss_mlp": 0.0102568, "balance_loss_clip": 1.04851377, "balance_loss_mlp": 1.01736498, "epoch": 0.8147658269704804, "flos": 19974520978560.0, "grad_norm": 2.0414514414903904, "language_loss": 0.65448773, "learning_rate": 3.4911216804553465e-07, "loss": 0.67644453, "num_input_tokens_seen": 146319835, "step": 6776, "time_per_iteration": 2.789618730545044 }, { "auxiliary_loss_clip": 0.01162549, "auxiliary_loss_mlp": 0.01029981, "balance_loss_clip": 1.04987168, "balance_loss_mlp": 1.02230978, "epoch": 0.8148860698611194, "flos": 21178031097600.0, "grad_norm": 2.00904125517627, "language_loss": 0.70428675, "learning_rate": 3.4867257613731017e-07, "loss": 0.72621202, "num_input_tokens_seen": 146339030, "step": 6777, "time_per_iteration": 2.7947769165039062 }, { "auxiliary_loss_clip": 0.01162192, "auxiliary_loss_mlp": 0.01023637, "balance_loss_clip": 1.04633772, "balance_loss_mlp": 1.01600718, "epoch": 0.8150063127517585, "flos": 19606903234560.0, "grad_norm": 1.7335612786810504, "language_loss": 0.85539049, "learning_rate": 3.4823323473414343e-07, "loss": 0.87724876, "num_input_tokens_seen": 146358550, "step": 6778, "time_per_iteration": 3.5702064037323 }, { "auxiliary_loss_clip": 0.01164509, "auxiliary_loss_mlp": 0.01023448, "balance_loss_clip": 1.04666471, "balance_loss_mlp": 1.01547849, "epoch": 0.8151265556423977, "flos": 22638374438400.0, "grad_norm": 2.1188784565057186, "language_loss": 0.76255012, "learning_rate": 3.477941439026812e-07, "loss": 0.78442973, "num_input_tokens_seen": 146376770, "step": 6779, "time_per_iteration": 2.7328200340270996 }, { "auxiliary_loss_clip": 0.01163646, "auxiliary_loss_mlp": 0.01027913, "balance_loss_clip": 1.04952967, "balance_loss_mlp": 1.01997375, "epoch": 0.8152467985330367, "flos": 17968048277760.0, "grad_norm": 1.7461672993922035, "language_loss": 0.729186, "learning_rate": 3.473553037095349e-07, "loss": 0.75110161, "num_input_tokens_seen": 146395795, "step": 6780, "time_per_iteration": 3.6612954139709473 }, { "auxiliary_loss_clip": 0.01156626, "auxiliary_loss_mlp": 0.01025453, "balance_loss_clip": 1.04770684, "balance_loss_mlp": 1.01843119, "epoch": 0.8153670414236758, "flos": 24969012405120.0, "grad_norm": 1.7892907236575444, "language_loss": 0.83414447, "learning_rate": 3.469167142212743e-07, "loss": 0.85596526, "num_input_tokens_seen": 146417640, "step": 6781, "time_per_iteration": 2.767021417617798 }, { "auxiliary_loss_clip": 0.01166041, "auxiliary_loss_mlp": 0.01025654, "balance_loss_clip": 1.04908478, "balance_loss_mlp": 1.01742864, "epoch": 0.8154872843143149, "flos": 31066069754880.0, "grad_norm": 2.271288693556586, "language_loss": 0.63287044, "learning_rate": 3.4647837550443337e-07, "loss": 0.65478742, "num_input_tokens_seen": 146436205, "step": 6782, "time_per_iteration": 2.7395949363708496 }, { "auxiliary_loss_clip": 0.01157046, "auxiliary_loss_mlp": 0.01025269, "balance_loss_clip": 1.0473032, "balance_loss_mlp": 1.01782107, "epoch": 0.815607527204954, "flos": 19391654983680.0, "grad_norm": 1.728183002127947, "language_loss": 0.74382639, "learning_rate": 3.460402876255086e-07, "loss": 0.76564956, "num_input_tokens_seen": 146453595, "step": 6783, "time_per_iteration": 2.7505667209625244 }, { "auxiliary_loss_clip": 0.0116823, "auxiliary_loss_mlp": 0.01027599, "balance_loss_clip": 1.04851794, "balance_loss_mlp": 1.02053022, "epoch": 0.815727770095593, "flos": 26140418743680.0, "grad_norm": 3.173363076610353, "language_loss": 0.72266603, "learning_rate": 3.456024506509574e-07, "loss": 0.74462426, "num_input_tokens_seen": 146474515, "step": 6784, "time_per_iteration": 2.6875953674316406 }, { "auxiliary_loss_clip": 0.0116527, "auxiliary_loss_mlp": 0.01057795, "balance_loss_clip": 1.04915905, "balance_loss_mlp": 1.02154899, "epoch": 0.8158480129862322, "flos": 25337527989120.0, "grad_norm": 1.6016829545996627, "language_loss": 0.74123037, "learning_rate": 3.4516486464719873e-07, "loss": 0.76346099, "num_input_tokens_seen": 146493905, "step": 6785, "time_per_iteration": 3.785623550415039 }, { "auxiliary_loss_clip": 0.01151155, "auxiliary_loss_mlp": 0.01030084, "balance_loss_clip": 1.0465852, "balance_loss_mlp": 1.02187657, "epoch": 0.8159682558768713, "flos": 34423645559040.0, "grad_norm": 1.645352832564427, "language_loss": 0.62070441, "learning_rate": 3.4472752968061445e-07, "loss": 0.64251679, "num_input_tokens_seen": 146518335, "step": 6786, "time_per_iteration": 2.9010212421417236 }, { "auxiliary_loss_clip": 0.01163874, "auxiliary_loss_mlp": 0.01021665, "balance_loss_clip": 1.04620945, "balance_loss_mlp": 1.01419055, "epoch": 0.8160884987675103, "flos": 18653223185280.0, "grad_norm": 1.7584758944645142, "language_loss": 0.73693204, "learning_rate": 3.442904458175475e-07, "loss": 0.75878739, "num_input_tokens_seen": 146535655, "step": 6787, "time_per_iteration": 2.6327576637268066 }, { "auxiliary_loss_clip": 0.01163936, "auxiliary_loss_mlp": 0.01029877, "balance_loss_clip": 1.04731083, "balance_loss_mlp": 1.02197623, "epoch": 0.8162087416581495, "flos": 31430527102080.0, "grad_norm": 1.5057010383772538, "language_loss": 0.7642113, "learning_rate": 3.438536131243044e-07, "loss": 0.78614938, "num_input_tokens_seen": 146556815, "step": 6788, "time_per_iteration": 2.7855594158172607 }, { "auxiliary_loss_clip": 0.01164868, "auxiliary_loss_mlp": 0.01025319, "balance_loss_clip": 1.04697764, "balance_loss_mlp": 1.01730847, "epoch": 0.8163289845487885, "flos": 37593910915200.0, "grad_norm": 2.6620186963976815, "language_loss": 0.6195721, "learning_rate": 3.434170316671503e-07, "loss": 0.64147401, "num_input_tokens_seen": 146581845, "step": 6789, "time_per_iteration": 2.923234224319458 }, { "auxiliary_loss_clip": 0.01152734, "auxiliary_loss_mlp": 0.01024969, "balance_loss_clip": 1.04823947, "balance_loss_mlp": 1.01802182, "epoch": 0.8164492274394276, "flos": 13953989554560.0, "grad_norm": 3.3690060307712844, "language_loss": 0.89510727, "learning_rate": 3.4298070151231583e-07, "loss": 0.91688436, "num_input_tokens_seen": 146597245, "step": 6790, "time_per_iteration": 3.7570972442626953 }, { "auxiliary_loss_clip": 0.01165635, "auxiliary_loss_mlp": 0.01024481, "balance_loss_clip": 1.04728425, "balance_loss_mlp": 1.01722145, "epoch": 0.8165694703300668, "flos": 28986554747520.0, "grad_norm": 1.8512466034669617, "language_loss": 0.59912705, "learning_rate": 3.425446227259916e-07, "loss": 0.62102824, "num_input_tokens_seen": 146618210, "step": 6791, "time_per_iteration": 2.854560136795044 }, { "auxiliary_loss_clip": 0.01162041, "auxiliary_loss_mlp": 0.01022773, "balance_loss_clip": 1.04701376, "balance_loss_mlp": 1.01567101, "epoch": 0.8166897132207058, "flos": 25118365155840.0, "grad_norm": 1.8350770254611621, "language_loss": 0.82715327, "learning_rate": 3.421087953743296e-07, "loss": 0.84900141, "num_input_tokens_seen": 146637975, "step": 6792, "time_per_iteration": 2.7967772483825684 }, { "auxiliary_loss_clip": 0.0116365, "auxiliary_loss_mlp": 0.01026644, "balance_loss_clip": 1.04650521, "balance_loss_mlp": 1.01911592, "epoch": 0.8168099561113449, "flos": 23148593176320.0, "grad_norm": 3.0930632901494866, "language_loss": 0.80183411, "learning_rate": 3.416732195234464e-07, "loss": 0.82373703, "num_input_tokens_seen": 146658030, "step": 6793, "time_per_iteration": 2.7707090377807617 }, { "auxiliary_loss_clip": 0.01166684, "auxiliary_loss_mlp": 0.01026341, "balance_loss_clip": 1.04695189, "balance_loss_mlp": 1.01916742, "epoch": 0.816930199001984, "flos": 18407666833920.0, "grad_norm": 1.5420210350247898, "language_loss": 0.79264367, "learning_rate": 3.4123789523941613e-07, "loss": 0.81457388, "num_input_tokens_seen": 146677855, "step": 6794, "time_per_iteration": 2.7428903579711914 }, { "auxiliary_loss_clip": 0.01158874, "auxiliary_loss_mlp": 0.01027573, "balance_loss_clip": 1.04542041, "balance_loss_mlp": 1.02038431, "epoch": 0.8170504418926231, "flos": 21251324799360.0, "grad_norm": 1.6225128277596867, "language_loss": 0.63561392, "learning_rate": 3.4080282258827884e-07, "loss": 0.65747839, "num_input_tokens_seen": 146696230, "step": 6795, "time_per_iteration": 2.779201030731201 }, { "auxiliary_loss_clip": 0.01166716, "auxiliary_loss_mlp": 0.01021253, "balance_loss_clip": 1.04790533, "balance_loss_mlp": 1.01409459, "epoch": 0.8171706847832622, "flos": 19099234362240.0, "grad_norm": 3.0775655304230387, "language_loss": 0.72101355, "learning_rate": 3.403680016360342e-07, "loss": 0.74289316, "num_input_tokens_seen": 146714835, "step": 6796, "time_per_iteration": 2.7894139289855957 }, { "auxiliary_loss_clip": 0.01161613, "auxiliary_loss_mlp": 0.01027255, "balance_loss_clip": 1.05003119, "balance_loss_mlp": 1.01953876, "epoch": 0.8172909276739013, "flos": 21470128496640.0, "grad_norm": 1.535465565787393, "language_loss": 0.67499328, "learning_rate": 3.3993343244864403e-07, "loss": 0.69688195, "num_input_tokens_seen": 146734425, "step": 6797, "time_per_iteration": 2.7522268295288086 }, { "auxiliary_loss_clip": 0.01162325, "auxiliary_loss_mlp": 0.01022408, "balance_loss_clip": 1.04684639, "balance_loss_mlp": 1.01512718, "epoch": 0.8174111705645404, "flos": 27599792417280.0, "grad_norm": 1.6107709391789948, "language_loss": 0.72943223, "learning_rate": 3.394991150920323e-07, "loss": 0.75127959, "num_input_tokens_seen": 146757545, "step": 6798, "time_per_iteration": 2.807835578918457 }, { "auxiliary_loss_clip": 0.01159936, "auxiliary_loss_mlp": 0.01056431, "balance_loss_clip": 1.04889798, "balance_loss_mlp": 1.01983905, "epoch": 0.8175314134551794, "flos": 14064594508800.0, "grad_norm": 2.1962106225219244, "language_loss": 0.74304914, "learning_rate": 3.3906504963208396e-07, "loss": 0.76521283, "num_input_tokens_seen": 146774240, "step": 6799, "time_per_iteration": 2.6876587867736816 }, { "auxiliary_loss_clip": 0.01152995, "auxiliary_loss_mlp": 0.01023366, "balance_loss_clip": 1.04860103, "balance_loss_mlp": 1.01518798, "epoch": 0.8176516563458186, "flos": 22708076780160.0, "grad_norm": 1.895225847701364, "language_loss": 0.66177183, "learning_rate": 3.3863123613464774e-07, "loss": 0.6835354, "num_input_tokens_seen": 146793140, "step": 6800, "time_per_iteration": 2.8074193000793457 }, { "auxiliary_loss_clip": 0.01163167, "auxiliary_loss_mlp": 0.01027465, "balance_loss_clip": 1.04512691, "balance_loss_mlp": 1.01973462, "epoch": 0.8177718992364577, "flos": 21945406279680.0, "grad_norm": 1.6697612446199432, "language_loss": 0.74999684, "learning_rate": 3.381976746655317e-07, "loss": 0.77190316, "num_input_tokens_seen": 146812895, "step": 6801, "time_per_iteration": 2.7256481647491455 }, { "auxiliary_loss_clip": 0.01148291, "auxiliary_loss_mlp": 0.01028157, "balance_loss_clip": 1.04824066, "balance_loss_mlp": 1.02109337, "epoch": 0.8178921421270967, "flos": 22017443005440.0, "grad_norm": 2.119878440055512, "language_loss": 0.67521882, "learning_rate": 3.3776436529050756e-07, "loss": 0.69698334, "num_input_tokens_seen": 146832445, "step": 6802, "time_per_iteration": 2.8614931106567383 }, { "auxiliary_loss_clip": 0.01165393, "auxiliary_loss_mlp": 0.01023838, "balance_loss_clip": 1.04670143, "balance_loss_mlp": 1.01617837, "epoch": 0.8180123850177359, "flos": 33183111496320.0, "grad_norm": 1.7198253692248004, "language_loss": 0.72445917, "learning_rate": 3.373313080753073e-07, "loss": 0.74635142, "num_input_tokens_seen": 146856505, "step": 6803, "time_per_iteration": 2.8255667686462402 }, { "auxiliary_loss_clip": 0.01160641, "auxiliary_loss_mlp": 0.0102634, "balance_loss_clip": 1.04718852, "balance_loss_mlp": 1.01878548, "epoch": 0.8181326279083749, "flos": 22091167670400.0, "grad_norm": 1.512183327777207, "language_loss": 0.77501881, "learning_rate": 3.3689850308562527e-07, "loss": 0.79688859, "num_input_tokens_seen": 146876950, "step": 6804, "time_per_iteration": 4.306336402893066 }, { "auxiliary_loss_clip": 0.01150022, "auxiliary_loss_mlp": 0.01025024, "balance_loss_clip": 1.05035734, "balance_loss_mlp": 1.01804137, "epoch": 0.818252870799014, "flos": 15705747936000.0, "grad_norm": 1.9074078727315245, "language_loss": 0.77613199, "learning_rate": 3.364659503871183e-07, "loss": 0.79788244, "num_input_tokens_seen": 146894885, "step": 6805, "time_per_iteration": 2.834688901901245 }, { "auxiliary_loss_clip": 0.01154628, "auxiliary_loss_mlp": 0.01021563, "balance_loss_clip": 1.04528904, "balance_loss_mlp": 1.01490784, "epoch": 0.8183731136896532, "flos": 18770687637120.0, "grad_norm": 1.9950713778097733, "language_loss": 0.84133911, "learning_rate": 3.3603365004540417e-07, "loss": 0.86310101, "num_input_tokens_seen": 146913180, "step": 6806, "time_per_iteration": 2.940563917160034 }, { "auxiliary_loss_clip": 0.01167165, "auxiliary_loss_mlp": 0.01026101, "balance_loss_clip": 1.04820728, "balance_loss_mlp": 1.01876116, "epoch": 0.8184933565802922, "flos": 26541792293760.0, "grad_norm": 2.1436119488680054, "language_loss": 0.7757622, "learning_rate": 3.356016021260624e-07, "loss": 0.79769486, "num_input_tokens_seen": 146933510, "step": 6807, "time_per_iteration": 3.907113790512085 }, { "auxiliary_loss_clip": 0.01166421, "auxiliary_loss_mlp": 0.01024824, "balance_loss_clip": 1.04831016, "balance_loss_mlp": 1.0173527, "epoch": 0.8186135994709313, "flos": 17530117660800.0, "grad_norm": 3.062453018885885, "language_loss": 0.65353012, "learning_rate": 3.35169806694634e-07, "loss": 0.67544258, "num_input_tokens_seen": 146951760, "step": 6808, "time_per_iteration": 2.7835593223571777 }, { "auxiliary_loss_clip": 0.01062772, "auxiliary_loss_mlp": 0.01000518, "balance_loss_clip": 1.01245856, "balance_loss_mlp": 0.9994812, "epoch": 0.8187338423615703, "flos": 63480300675840.0, "grad_norm": 0.7186000966216227, "language_loss": 0.60612476, "learning_rate": 3.3473826381662186e-07, "loss": 0.62675762, "num_input_tokens_seen": 147022900, "step": 6809, "time_per_iteration": 3.4706649780273438 }, { "auxiliary_loss_clip": 0.01160356, "auxiliary_loss_mlp": 0.01025815, "balance_loss_clip": 1.04822302, "balance_loss_mlp": 1.01797366, "epoch": 0.8188540852522095, "flos": 17529974006400.0, "grad_norm": 2.060410597082642, "language_loss": 0.81817818, "learning_rate": 3.3430697355749216e-07, "loss": 0.84003997, "num_input_tokens_seen": 147040590, "step": 6810, "time_per_iteration": 2.67683482170105 }, { "auxiliary_loss_clip": 0.01152307, "auxiliary_loss_mlp": 0.0102592, "balance_loss_clip": 1.04806376, "balance_loss_mlp": 1.01904178, "epoch": 0.8189743281428485, "flos": 14392530702720.0, "grad_norm": 2.017230292357262, "language_loss": 0.75369036, "learning_rate": 3.3387593598266907e-07, "loss": 0.77547264, "num_input_tokens_seen": 147057200, "step": 6811, "time_per_iteration": 2.7660746574401855 }, { "auxiliary_loss_clip": 0.01152124, "auxiliary_loss_mlp": 0.01025676, "balance_loss_clip": 1.0469799, "balance_loss_mlp": 1.01848781, "epoch": 0.8190945710334876, "flos": 25080479285760.0, "grad_norm": 1.7565874772724535, "language_loss": 0.78316855, "learning_rate": 3.3344515115754225e-07, "loss": 0.80494654, "num_input_tokens_seen": 147076180, "step": 6812, "time_per_iteration": 3.7211906909942627 }, { "auxiliary_loss_clip": 0.01162193, "auxiliary_loss_mlp": 0.01028847, "balance_loss_clip": 1.04725361, "balance_loss_mlp": 1.02189076, "epoch": 0.8192148139241268, "flos": 21507152440320.0, "grad_norm": 3.492711227658252, "language_loss": 0.79641509, "learning_rate": 3.33014619147461e-07, "loss": 0.81832552, "num_input_tokens_seen": 147094205, "step": 6813, "time_per_iteration": 2.864051580429077 }, { "auxiliary_loss_clip": 0.01160999, "auxiliary_loss_mlp": 0.01030051, "balance_loss_clip": 1.04892468, "balance_loss_mlp": 1.02237988, "epoch": 0.8193350568147658, "flos": 23952166289280.0, "grad_norm": 2.023209103271885, "language_loss": 0.71795118, "learning_rate": 3.325843400177362e-07, "loss": 0.73986173, "num_input_tokens_seen": 147115545, "step": 6814, "time_per_iteration": 2.8909285068511963 }, { "auxiliary_loss_clip": 0.0116762, "auxiliary_loss_mlp": 0.01056764, "balance_loss_clip": 1.04855418, "balance_loss_mlp": 1.02158487, "epoch": 0.8194552997054049, "flos": 20559469962240.0, "grad_norm": 1.8286019863489296, "language_loss": 0.73494476, "learning_rate": 3.32154313833642e-07, "loss": 0.75718868, "num_input_tokens_seen": 147135700, "step": 6815, "time_per_iteration": 2.8269498348236084 }, { "auxiliary_loss_clip": 0.01168687, "auxiliary_loss_mlp": 0.01028263, "balance_loss_clip": 1.04712653, "balance_loss_mlp": 1.0201807, "epoch": 0.819575542596044, "flos": 26031753123840.0, "grad_norm": 2.1799303059472495, "language_loss": 0.59492493, "learning_rate": 3.3172454066041164e-07, "loss": 0.61689442, "num_input_tokens_seen": 147155205, "step": 6816, "time_per_iteration": 3.636141538619995 }, { "auxiliary_loss_clip": 0.01147225, "auxiliary_loss_mlp": 0.01051903, "balance_loss_clip": 1.04661977, "balance_loss_mlp": 1.01743102, "epoch": 0.8196957854866831, "flos": 29096944220160.0, "grad_norm": 2.058799514223959, "language_loss": 0.76109987, "learning_rate": 3.3129502056324234e-07, "loss": 0.78309113, "num_input_tokens_seen": 147176570, "step": 6817, "time_per_iteration": 2.9345595836639404 }, { "auxiliary_loss_clip": 0.01063594, "auxiliary_loss_mlp": 0.01004171, "balance_loss_clip": 1.01424825, "balance_loss_mlp": 1.00304413, "epoch": 0.8198160283773221, "flos": 69033631898880.0, "grad_norm": 0.8125418110705774, "language_loss": 0.59773415, "learning_rate": 3.3086575360729165e-07, "loss": 0.61841178, "num_input_tokens_seen": 147234105, "step": 6818, "time_per_iteration": 3.2948131561279297 }, { "auxiliary_loss_clip": 0.01162384, "auxiliary_loss_mlp": 0.01026308, "balance_loss_clip": 1.05049455, "balance_loss_mlp": 1.01870286, "epoch": 0.8199362712679613, "flos": 16618058496000.0, "grad_norm": 4.281085129103178, "language_loss": 0.71283686, "learning_rate": 3.3043673985767906e-07, "loss": 0.73472381, "num_input_tokens_seen": 147253170, "step": 6819, "time_per_iteration": 2.7959630489349365 }, { "auxiliary_loss_clip": 0.01149103, "auxiliary_loss_mlp": 0.01028834, "balance_loss_clip": 1.04632974, "balance_loss_mlp": 1.02072179, "epoch": 0.8200565141586004, "flos": 21757664868480.0, "grad_norm": 1.6129641177225995, "language_loss": 0.77504849, "learning_rate": 3.3000797937948564e-07, "loss": 0.79682785, "num_input_tokens_seen": 147271465, "step": 6820, "time_per_iteration": 2.7740941047668457 }, { "auxiliary_loss_clip": 0.0106011, "auxiliary_loss_mlp": 0.01002836, "balance_loss_clip": 1.00810933, "balance_loss_mlp": 1.00189424, "epoch": 0.8201767570492394, "flos": 69807112392960.0, "grad_norm": 0.9384131964280529, "language_loss": 0.64962137, "learning_rate": 3.295794722377534e-07, "loss": 0.67025083, "num_input_tokens_seen": 147335070, "step": 6821, "time_per_iteration": 3.366995334625244 }, { "auxiliary_loss_clip": 0.0116267, "auxiliary_loss_mlp": 0.01022278, "balance_loss_clip": 1.04477978, "balance_loss_mlp": 1.01511371, "epoch": 0.8202969999398786, "flos": 23111892455040.0, "grad_norm": 1.598778241316034, "language_loss": 0.80036914, "learning_rate": 3.291512184974876e-07, "loss": 0.8222186, "num_input_tokens_seen": 147355460, "step": 6822, "time_per_iteration": 2.806774854660034 }, { "auxiliary_loss_clip": 0.0116299, "auxiliary_loss_mlp": 0.01025026, "balance_loss_clip": 1.04840255, "balance_loss_mlp": 1.0178256, "epoch": 0.8204172428305176, "flos": 28220616109440.0, "grad_norm": 1.6096820844179889, "language_loss": 0.66621679, "learning_rate": 3.2872321822365346e-07, "loss": 0.68809694, "num_input_tokens_seen": 147375675, "step": 6823, "time_per_iteration": 2.8579578399658203 }, { "auxiliary_loss_clip": 0.01164425, "auxiliary_loss_mlp": 0.01024119, "balance_loss_clip": 1.04807866, "balance_loss_mlp": 1.01620388, "epoch": 0.8205374857211567, "flos": 20887011106560.0, "grad_norm": 1.7580716382678077, "language_loss": 0.73450422, "learning_rate": 3.282954714811783e-07, "loss": 0.75638962, "num_input_tokens_seen": 147394580, "step": 6824, "time_per_iteration": 2.8163681030273438 }, { "auxiliary_loss_clip": 0.0115424, "auxiliary_loss_mlp": 0.01022927, "balance_loss_clip": 1.04904985, "balance_loss_mlp": 1.01568508, "epoch": 0.8206577286117959, "flos": 13152140294400.0, "grad_norm": 2.6783869885285623, "language_loss": 0.70675206, "learning_rate": 3.2786797833495093e-07, "loss": 0.72852373, "num_input_tokens_seen": 147409935, "step": 6825, "time_per_iteration": 2.8160369396209717 }, { "auxiliary_loss_clip": 0.01164386, "auxiliary_loss_mlp": 0.0103093, "balance_loss_clip": 1.04573941, "balance_loss_mlp": 1.02419758, "epoch": 0.8207779715024349, "flos": 25265634917760.0, "grad_norm": 1.9574528625993008, "language_loss": 0.72569054, "learning_rate": 3.274407388498213e-07, "loss": 0.74764371, "num_input_tokens_seen": 147428065, "step": 6826, "time_per_iteration": 2.8386621475219727 }, { "auxiliary_loss_clip": 0.01155278, "auxiliary_loss_mlp": 0.01023892, "balance_loss_clip": 1.04878068, "balance_loss_mlp": 1.01660252, "epoch": 0.820898214393074, "flos": 19610243199360.0, "grad_norm": 2.3015192637734607, "language_loss": 0.74410391, "learning_rate": 3.270137530906021e-07, "loss": 0.76589561, "num_input_tokens_seen": 147447300, "step": 6827, "time_per_iteration": 2.759791851043701 }, { "auxiliary_loss_clip": 0.01143177, "auxiliary_loss_mlp": 0.01027024, "balance_loss_clip": 1.0442971, "balance_loss_mlp": 1.02013993, "epoch": 0.8210184572837131, "flos": 15596615439360.0, "grad_norm": 1.7353926801611375, "language_loss": 0.83306468, "learning_rate": 3.265870211220665e-07, "loss": 0.85476673, "num_input_tokens_seen": 147465135, "step": 6828, "time_per_iteration": 2.8591177463531494 }, { "auxiliary_loss_clip": 0.01155074, "auxiliary_loss_mlp": 0.01030256, "balance_loss_clip": 1.04814827, "balance_loss_mlp": 1.02181864, "epoch": 0.8211387001743522, "flos": 20813932886400.0, "grad_norm": 2.1509637559740367, "language_loss": 0.81812525, "learning_rate": 3.2616054300894934e-07, "loss": 0.83997858, "num_input_tokens_seen": 147484585, "step": 6829, "time_per_iteration": 2.8990328311920166 }, { "auxiliary_loss_clip": 0.01149039, "auxiliary_loss_mlp": 0.01034823, "balance_loss_clip": 1.04577756, "balance_loss_mlp": 1.02685392, "epoch": 0.8212589430649913, "flos": 27704579368320.0, "grad_norm": 2.2076108643500163, "language_loss": 0.84530616, "learning_rate": 3.2573431881594693e-07, "loss": 0.86714476, "num_input_tokens_seen": 147504130, "step": 6830, "time_per_iteration": 4.1554951667785645 }, { "auxiliary_loss_clip": 0.01152236, "auxiliary_loss_mlp": 0.01025918, "balance_loss_clip": 1.04566789, "balance_loss_mlp": 1.01806164, "epoch": 0.8213791859556304, "flos": 22455625017600.0, "grad_norm": 2.6579068253371982, "language_loss": 0.65776342, "learning_rate": 3.2530834860771663e-07, "loss": 0.67954493, "num_input_tokens_seen": 147523510, "step": 6831, "time_per_iteration": 2.7569077014923096 }, { "auxiliary_loss_clip": 0.01166454, "auxiliary_loss_mlp": 0.01025216, "balance_loss_clip": 1.04927731, "balance_loss_mlp": 1.01758647, "epoch": 0.8214994288462695, "flos": 16654471908480.0, "grad_norm": 2.1680696739972354, "language_loss": 0.74543047, "learning_rate": 3.248826324488794e-07, "loss": 0.7673471, "num_input_tokens_seen": 147540805, "step": 6832, "time_per_iteration": 2.7135696411132812 }, { "auxiliary_loss_clip": 0.0116652, "auxiliary_loss_mlp": 0.01027332, "balance_loss_clip": 1.04914105, "balance_loss_mlp": 1.02020335, "epoch": 0.8216196717369085, "flos": 25221787390080.0, "grad_norm": 1.8190510650557736, "language_loss": 0.87907898, "learning_rate": 3.244571704040138e-07, "loss": 0.90101755, "num_input_tokens_seen": 147560965, "step": 6833, "time_per_iteration": 3.739464521408081 }, { "auxiliary_loss_clip": 0.01161189, "auxiliary_loss_mlp": 0.01024446, "balance_loss_clip": 1.04618931, "balance_loss_mlp": 1.01634598, "epoch": 0.8217399146275477, "flos": 25371930240000.0, "grad_norm": 1.8860366748649835, "language_loss": 0.73657334, "learning_rate": 3.2403196253766374e-07, "loss": 0.75842971, "num_input_tokens_seen": 147580045, "step": 6834, "time_per_iteration": 2.7399845123291016 }, { "auxiliary_loss_clip": 0.01161951, "auxiliary_loss_mlp": 0.0102456, "balance_loss_clip": 1.04838383, "balance_loss_mlp": 1.01690078, "epoch": 0.8218601575181868, "flos": 25629625388160.0, "grad_norm": 2.6653147837763043, "language_loss": 0.79148912, "learning_rate": 3.2360700891433254e-07, "loss": 0.81335425, "num_input_tokens_seen": 147599070, "step": 6835, "time_per_iteration": 2.737436532974243 }, { "auxiliary_loss_clip": 0.01060438, "auxiliary_loss_mlp": 0.01002245, "balance_loss_clip": 1.00916457, "balance_loss_mlp": 1.00119579, "epoch": 0.8219804004088258, "flos": 67660229427840.0, "grad_norm": 0.812555883121194, "language_loss": 0.57204187, "learning_rate": 3.231823095984847e-07, "loss": 0.59266865, "num_input_tokens_seen": 147653710, "step": 6836, "time_per_iteration": 3.1940884590148926 }, { "auxiliary_loss_clip": 0.01158132, "auxiliary_loss_mlp": 0.01024599, "balance_loss_clip": 1.04559255, "balance_loss_mlp": 1.01783705, "epoch": 0.822100643299465, "flos": 19464266327040.0, "grad_norm": 2.1702558183134135, "language_loss": 0.76354343, "learning_rate": 3.2275786465454814e-07, "loss": 0.78537071, "num_input_tokens_seen": 147670360, "step": 6837, "time_per_iteration": 2.6675052642822266 }, { "auxiliary_loss_clip": 0.01156533, "auxiliary_loss_mlp": 0.01024635, "balance_loss_clip": 1.0473032, "balance_loss_mlp": 1.01755953, "epoch": 0.822220886190104, "flos": 24681368292480.0, "grad_norm": 1.8877364340161473, "language_loss": 0.75562888, "learning_rate": 3.2233367414690917e-07, "loss": 0.77744055, "num_input_tokens_seen": 147692550, "step": 6838, "time_per_iteration": 3.703174591064453 }, { "auxiliary_loss_clip": 0.01153913, "auxiliary_loss_mlp": 0.01020223, "balance_loss_clip": 1.04537714, "balance_loss_mlp": 1.01289475, "epoch": 0.8223411290807431, "flos": 27819062991360.0, "grad_norm": 2.1316973382620676, "language_loss": 0.84822786, "learning_rate": 3.219097381399183e-07, "loss": 0.86996919, "num_input_tokens_seen": 147709725, "step": 6839, "time_per_iteration": 2.7581863403320312 }, { "auxiliary_loss_clip": 0.01168575, "auxiliary_loss_mlp": 0.01025713, "balance_loss_clip": 1.04836559, "balance_loss_mlp": 1.0186621, "epoch": 0.8224613719713821, "flos": 23218546913280.0, "grad_norm": 1.823550112378206, "language_loss": 0.81326801, "learning_rate": 3.2148605669788584e-07, "loss": 0.83521092, "num_input_tokens_seen": 147729615, "step": 6840, "time_per_iteration": 2.7239694595336914 }, { "auxiliary_loss_clip": 0.01160575, "auxiliary_loss_mlp": 0.0102381, "balance_loss_clip": 1.04699874, "balance_loss_mlp": 1.01668739, "epoch": 0.8225816148620213, "flos": 15706250726400.0, "grad_norm": 2.861598716932773, "language_loss": 0.78035367, "learning_rate": 3.2106262988508405e-07, "loss": 0.80219752, "num_input_tokens_seen": 147747665, "step": 6841, "time_per_iteration": 2.7090141773223877 }, { "auxiliary_loss_clip": 0.01162479, "auxiliary_loss_mlp": 0.01021886, "balance_loss_clip": 1.04770088, "balance_loss_mlp": 1.01433432, "epoch": 0.8227018577526604, "flos": 18515111391360.0, "grad_norm": 2.592605057853645, "language_loss": 0.74563473, "learning_rate": 3.206394577657465e-07, "loss": 0.76747847, "num_input_tokens_seen": 147765445, "step": 6842, "time_per_iteration": 3.7566986083984375 }, { "auxiliary_loss_clip": 0.01167487, "auxiliary_loss_mlp": 0.01029288, "balance_loss_clip": 1.04673731, "balance_loss_mlp": 1.02185822, "epoch": 0.8228221006432994, "flos": 22236785406720.0, "grad_norm": 2.4515333847118863, "language_loss": 0.73119771, "learning_rate": 3.202165404040675e-07, "loss": 0.75316554, "num_input_tokens_seen": 147783365, "step": 6843, "time_per_iteration": 2.810070276260376 }, { "auxiliary_loss_clip": 0.01154802, "auxiliary_loss_mlp": 0.01032577, "balance_loss_clip": 1.05203629, "balance_loss_mlp": 1.02491832, "epoch": 0.8229423435339386, "flos": 24097532630400.0, "grad_norm": 2.3826598314957637, "language_loss": 0.7488606, "learning_rate": 3.1979387786420396e-07, "loss": 0.77073443, "num_input_tokens_seen": 147803605, "step": 6844, "time_per_iteration": 2.8822309970855713 }, { "auxiliary_loss_clip": 0.01163693, "auxiliary_loss_mlp": 0.01022697, "balance_loss_clip": 1.04691243, "balance_loss_mlp": 1.01494253, "epoch": 0.8230625864245776, "flos": 23878549365120.0, "grad_norm": 2.0674098829320022, "language_loss": 0.82194257, "learning_rate": 3.1937147021027346e-07, "loss": 0.84380651, "num_input_tokens_seen": 147822060, "step": 6845, "time_per_iteration": 2.77858567237854 }, { "auxiliary_loss_clip": 0.01160899, "auxiliary_loss_mlp": 0.01025956, "balance_loss_clip": 1.04635572, "balance_loss_mlp": 1.01891696, "epoch": 0.8231828293152167, "flos": 16581106379520.0, "grad_norm": 3.4162963200722865, "language_loss": 0.76903033, "learning_rate": 3.189493175063547e-07, "loss": 0.79089886, "num_input_tokens_seen": 147839295, "step": 6846, "time_per_iteration": 2.721107006072998 }, { "auxiliary_loss_clip": 0.01160581, "auxiliary_loss_mlp": 0.01023954, "balance_loss_clip": 1.04805076, "balance_loss_mlp": 1.01653326, "epoch": 0.8233030722058559, "flos": 18880071528960.0, "grad_norm": 2.3348548161678013, "language_loss": 0.67486358, "learning_rate": 3.1852741981648776e-07, "loss": 0.69670892, "num_input_tokens_seen": 147857945, "step": 6847, "time_per_iteration": 2.7465829849243164 }, { "auxiliary_loss_clip": 0.01147795, "auxiliary_loss_mlp": 0.010265, "balance_loss_clip": 1.04841661, "balance_loss_mlp": 1.01937413, "epoch": 0.8234233150964949, "flos": 28439024757120.0, "grad_norm": 1.815303350124872, "language_loss": 0.7003051, "learning_rate": 3.1810577720467404e-07, "loss": 0.72204804, "num_input_tokens_seen": 147879675, "step": 6848, "time_per_iteration": 2.864132881164551 }, { "auxiliary_loss_clip": 0.01165784, "auxiliary_loss_mlp": 0.01025816, "balance_loss_clip": 1.0488776, "balance_loss_mlp": 1.01846421, "epoch": 0.823543557987134, "flos": 33765941577600.0, "grad_norm": 1.6372588355716797, "language_loss": 0.56567931, "learning_rate": 3.176843897348769e-07, "loss": 0.58759528, "num_input_tokens_seen": 147902870, "step": 6849, "time_per_iteration": 2.8484153747558594 }, { "auxiliary_loss_clip": 0.01157509, "auxiliary_loss_mlp": 0.01027271, "balance_loss_clip": 1.0476495, "balance_loss_mlp": 1.01920676, "epoch": 0.8236638008777731, "flos": 17092366611840.0, "grad_norm": 3.113094028378381, "language_loss": 0.756917, "learning_rate": 3.1726325747102034e-07, "loss": 0.77876484, "num_input_tokens_seen": 147921245, "step": 6850, "time_per_iteration": 2.8064029216766357 }, { "auxiliary_loss_clip": 0.01150954, "auxiliary_loss_mlp": 0.01025221, "balance_loss_clip": 1.04629529, "balance_loss_mlp": 1.01741314, "epoch": 0.8237840437684122, "flos": 61639982334720.0, "grad_norm": 1.646872683744666, "language_loss": 0.64316809, "learning_rate": 3.1684238047698974e-07, "loss": 0.66492993, "num_input_tokens_seen": 147949515, "step": 6851, "time_per_iteration": 3.078573703765869 }, { "auxiliary_loss_clip": 0.01162662, "auxiliary_loss_mlp": 0.01028064, "balance_loss_clip": 1.04744697, "balance_loss_mlp": 1.01999319, "epoch": 0.8239042866590512, "flos": 27309023821440.0, "grad_norm": 2.390963674652913, "language_loss": 0.53026932, "learning_rate": 3.1642175881663155e-07, "loss": 0.55217659, "num_input_tokens_seen": 147969245, "step": 6852, "time_per_iteration": 2.8019638061523438 }, { "auxiliary_loss_clip": 0.01163806, "auxiliary_loss_mlp": 0.01021735, "balance_loss_clip": 1.04614329, "balance_loss_mlp": 1.01420653, "epoch": 0.8240245295496904, "flos": 21726351187200.0, "grad_norm": 3.4939147390343814, "language_loss": 0.83706087, "learning_rate": 3.160013925537537e-07, "loss": 0.85891628, "num_input_tokens_seen": 147990080, "step": 6853, "time_per_iteration": 2.6966171264648438 }, { "auxiliary_loss_clip": 0.01161576, "auxiliary_loss_mlp": 0.01023552, "balance_loss_clip": 1.04732037, "balance_loss_mlp": 1.01626825, "epoch": 0.8241447724403295, "flos": 20009318279040.0, "grad_norm": 1.8171322910938195, "language_loss": 0.75731468, "learning_rate": 3.155812817521266e-07, "loss": 0.77916592, "num_input_tokens_seen": 148010455, "step": 6854, "time_per_iteration": 2.7627928256988525 }, { "auxiliary_loss_clip": 0.01162066, "auxiliary_loss_mlp": 0.0102748, "balance_loss_clip": 1.04791808, "balance_loss_mlp": 1.02002978, "epoch": 0.8242650153309685, "flos": 22272983337600.0, "grad_norm": 2.177165314748128, "language_loss": 0.78186536, "learning_rate": 3.151614264754787e-07, "loss": 0.80376077, "num_input_tokens_seen": 148028400, "step": 6855, "time_per_iteration": 2.8026394844055176 }, { "auxiliary_loss_clip": 0.01168355, "auxiliary_loss_mlp": 0.0102635, "balance_loss_clip": 1.04707086, "balance_loss_mlp": 1.01872706, "epoch": 0.8243852582216077, "flos": 22309971367680.0, "grad_norm": 3.081235388168661, "language_loss": 0.7909857, "learning_rate": 3.147418267875035e-07, "loss": 0.81293273, "num_input_tokens_seen": 148046530, "step": 6856, "time_per_iteration": 3.95851993560791 }, { "auxiliary_loss_clip": 0.01147822, "auxiliary_loss_mlp": 0.01052478, "balance_loss_clip": 1.04638577, "balance_loss_mlp": 1.01695895, "epoch": 0.8245055011122467, "flos": 24645421756800.0, "grad_norm": 1.9726625079393258, "language_loss": 0.65696788, "learning_rate": 3.1432248275185315e-07, "loss": 0.67897093, "num_input_tokens_seen": 148067040, "step": 6857, "time_per_iteration": 2.872619390487671 }, { "auxiliary_loss_clip": 0.01161225, "auxiliary_loss_mlp": 0.01025527, "balance_loss_clip": 1.04764938, "balance_loss_mlp": 1.01857114, "epoch": 0.8246257440028858, "flos": 17487275713920.0, "grad_norm": 2.121208476162396, "language_loss": 0.77419752, "learning_rate": 3.139033944321412e-07, "loss": 0.79606503, "num_input_tokens_seen": 148084400, "step": 6858, "time_per_iteration": 2.742431879043579 }, { "auxiliary_loss_clip": 0.01165869, "auxiliary_loss_mlp": 0.01023687, "balance_loss_clip": 1.0470264, "balance_loss_mlp": 1.01608777, "epoch": 0.824745986893525, "flos": 25010130499200.0, "grad_norm": 1.8509201242773672, "language_loss": 0.78798366, "learning_rate": 3.1348456189194507e-07, "loss": 0.80987924, "num_input_tokens_seen": 148104860, "step": 6859, "time_per_iteration": 3.679845094680786 }, { "auxiliary_loss_clip": 0.01151268, "auxiliary_loss_mlp": 0.0102527, "balance_loss_clip": 1.0475955, "balance_loss_mlp": 1.0177207, "epoch": 0.824866229784164, "flos": 18772698798720.0, "grad_norm": 1.5920865070613517, "language_loss": 0.82843602, "learning_rate": 3.1306598519479876e-07, "loss": 0.85020137, "num_input_tokens_seen": 148124680, "step": 6860, "time_per_iteration": 2.7914748191833496 }, { "auxiliary_loss_clip": 0.01161695, "auxiliary_loss_mlp": 0.01026206, "balance_loss_clip": 1.04910707, "balance_loss_mlp": 1.01850486, "epoch": 0.8249864726748031, "flos": 23842171866240.0, "grad_norm": 1.7068536921721709, "language_loss": 0.78503239, "learning_rate": 3.1264766440420177e-07, "loss": 0.80691141, "num_input_tokens_seen": 148147150, "step": 6861, "time_per_iteration": 2.746361494064331 }, { "auxiliary_loss_clip": 0.01161036, "auxiliary_loss_mlp": 0.01027088, "balance_loss_clip": 1.04738307, "balance_loss_mlp": 1.01980472, "epoch": 0.8251067155654422, "flos": 20303103617280.0, "grad_norm": 1.9146821740814937, "language_loss": 0.69241655, "learning_rate": 3.122295995836124e-07, "loss": 0.71429783, "num_input_tokens_seen": 148167020, "step": 6862, "time_per_iteration": 2.7436575889587402 }, { "auxiliary_loss_clip": 0.01168509, "auxiliary_loss_mlp": 0.01021646, "balance_loss_clip": 1.04784179, "balance_loss_mlp": 1.01416516, "epoch": 0.8252269584560813, "flos": 25009699536000.0, "grad_norm": 3.2111398130104094, "language_loss": 0.77545857, "learning_rate": 3.118117907964508e-07, "loss": 0.79736018, "num_input_tokens_seen": 148188965, "step": 6863, "time_per_iteration": 2.8159821033477783 }, { "auxiliary_loss_clip": 0.01163716, "auxiliary_loss_mlp": 0.01021313, "balance_loss_clip": 1.04763818, "balance_loss_mlp": 1.01458931, "epoch": 0.8253472013467203, "flos": 17128564542720.0, "grad_norm": 1.8849573558508341, "language_loss": 0.80990195, "learning_rate": 3.1139423810609856e-07, "loss": 0.83175224, "num_input_tokens_seen": 148205660, "step": 6864, "time_per_iteration": 3.581550359725952 }, { "auxiliary_loss_clip": 0.01165524, "auxiliary_loss_mlp": 0.0101881, "balance_loss_clip": 1.0457232, "balance_loss_mlp": 1.01225615, "epoch": 0.8254674442373595, "flos": 22414794232320.0, "grad_norm": 2.321195418220863, "language_loss": 0.75377148, "learning_rate": 3.1097694157589714e-07, "loss": 0.77561486, "num_input_tokens_seen": 148225545, "step": 6865, "time_per_iteration": 2.7602388858795166 }, { "auxiliary_loss_clip": 0.01159774, "auxiliary_loss_mlp": 0.01024181, "balance_loss_clip": 1.04737508, "balance_loss_mlp": 1.01679337, "epoch": 0.8255876871279986, "flos": 24786765774720.0, "grad_norm": 3.6505214924548413, "language_loss": 0.75654757, "learning_rate": 3.105599012691511e-07, "loss": 0.77838719, "num_input_tokens_seen": 148243975, "step": 6866, "time_per_iteration": 2.770228862762451 }, { "auxiliary_loss_clip": 0.0115842, "auxiliary_loss_mlp": 0.01023555, "balance_loss_clip": 1.04522157, "balance_loss_mlp": 1.0156219, "epoch": 0.8257079300186376, "flos": 27455431656960.0, "grad_norm": 1.6826019493747921, "language_loss": 0.82288218, "learning_rate": 3.101431172491249e-07, "loss": 0.84470189, "num_input_tokens_seen": 148265520, "step": 6867, "time_per_iteration": 2.780853748321533 }, { "auxiliary_loss_clip": 0.01162673, "auxiliary_loss_mlp": 0.01053616, "balance_loss_clip": 1.04702926, "balance_loss_mlp": 1.01842141, "epoch": 0.8258281729092768, "flos": 16471866142080.0, "grad_norm": 3.471658805751021, "language_loss": 0.71935356, "learning_rate": 3.097265895790444e-07, "loss": 0.74151647, "num_input_tokens_seen": 148283730, "step": 6868, "time_per_iteration": 3.704697608947754 }, { "auxiliary_loss_clip": 0.0116136, "auxiliary_loss_mlp": 0.01025703, "balance_loss_clip": 1.04991817, "balance_loss_mlp": 1.0178653, "epoch": 0.8259484157999158, "flos": 21433822824960.0, "grad_norm": 2.1735905862297913, "language_loss": 0.83574003, "learning_rate": 3.093103183220962e-07, "loss": 0.85761064, "num_input_tokens_seen": 148303775, "step": 6869, "time_per_iteration": 2.744321823120117 }, { "auxiliary_loss_clip": 0.01061171, "auxiliary_loss_mlp": 0.01000698, "balance_loss_clip": 1.00838172, "balance_loss_mlp": 0.99974996, "epoch": 0.8260686586905549, "flos": 58322342453760.0, "grad_norm": 0.8219046126827332, "language_loss": 0.59382457, "learning_rate": 3.0889430354142796e-07, "loss": 0.61444324, "num_input_tokens_seen": 148365285, "step": 6870, "time_per_iteration": 3.342479705810547 }, { "auxiliary_loss_clip": 0.01159071, "auxiliary_loss_mlp": 0.01024617, "balance_loss_clip": 1.04491949, "balance_loss_mlp": 1.01723838, "epoch": 0.826188901581194, "flos": 27527288814720.0, "grad_norm": 2.137006648074961, "language_loss": 0.69988716, "learning_rate": 3.084785453001497e-07, "loss": 0.72172403, "num_input_tokens_seen": 148386200, "step": 6871, "time_per_iteration": 2.7972469329833984 }, { "auxiliary_loss_clip": 0.01159307, "auxiliary_loss_mlp": 0.01053657, "balance_loss_clip": 1.04693031, "balance_loss_mlp": 1.01757574, "epoch": 0.8263091444718331, "flos": 23696051339520.0, "grad_norm": 2.065464531775564, "language_loss": 0.82000041, "learning_rate": 3.080630436613314e-07, "loss": 0.84213006, "num_input_tokens_seen": 148403970, "step": 6872, "time_per_iteration": 2.80259370803833 }, { "auxiliary_loss_clip": 0.01155677, "auxiliary_loss_mlp": 0.01026908, "balance_loss_clip": 1.04609346, "balance_loss_mlp": 1.01920712, "epoch": 0.8264293873624722, "flos": 17165157523200.0, "grad_norm": 1.986118551148813, "language_loss": 0.86137319, "learning_rate": 3.076477986880039e-07, "loss": 0.88319904, "num_input_tokens_seen": 148421765, "step": 6873, "time_per_iteration": 2.70597505569458 }, { "auxiliary_loss_clip": 0.0116093, "auxiliary_loss_mlp": 0.0102146, "balance_loss_clip": 1.04797316, "balance_loss_mlp": 1.01444125, "epoch": 0.8265496302531112, "flos": 24098645952000.0, "grad_norm": 2.1180479562832524, "language_loss": 0.69351566, "learning_rate": 3.0723281044315986e-07, "loss": 0.71533954, "num_input_tokens_seen": 148443720, "step": 6874, "time_per_iteration": 2.76538348197937 }, { "auxiliary_loss_clip": 0.01160349, "auxiliary_loss_mlp": 0.01023054, "balance_loss_clip": 1.04322124, "balance_loss_mlp": 1.01614022, "epoch": 0.8266698731437504, "flos": 14099894599680.0, "grad_norm": 1.9751828300376078, "language_loss": 0.76545095, "learning_rate": 3.068180789897521e-07, "loss": 0.78728497, "num_input_tokens_seen": 148462130, "step": 6875, "time_per_iteration": 2.7918331623077393 }, { "auxiliary_loss_clip": 0.01166989, "auxiliary_loss_mlp": 0.01030417, "balance_loss_clip": 1.04665744, "balance_loss_mlp": 1.02244759, "epoch": 0.8267901160343895, "flos": 30777563715840.0, "grad_norm": 1.510263975205103, "language_loss": 0.81632888, "learning_rate": 3.064036043906966e-07, "loss": 0.83830291, "num_input_tokens_seen": 148485570, "step": 6876, "time_per_iteration": 2.7979655265808105 }, { "auxiliary_loss_clip": 0.01168597, "auxiliary_loss_mlp": 0.01030923, "balance_loss_clip": 1.04918396, "balance_loss_mlp": 1.02287006, "epoch": 0.8269103589250285, "flos": 40624915242240.0, "grad_norm": 2.786079148111875, "language_loss": 0.68130815, "learning_rate": 3.059893867088668e-07, "loss": 0.70330328, "num_input_tokens_seen": 148509715, "step": 6877, "time_per_iteration": 3.0186266899108887 }, { "auxiliary_loss_clip": 0.01160184, "auxiliary_loss_mlp": 0.01024352, "balance_loss_clip": 1.04538095, "balance_loss_mlp": 1.01696086, "epoch": 0.8270306018156677, "flos": 30263645877120.0, "grad_norm": 1.8528567170840426, "language_loss": 0.66819298, "learning_rate": 3.055754260071004e-07, "loss": 0.69003832, "num_input_tokens_seen": 148532010, "step": 6878, "time_per_iteration": 2.7543063163757324 }, { "auxiliary_loss_clip": 0.01161695, "auxiliary_loss_mlp": 0.01022559, "balance_loss_clip": 1.04625857, "balance_loss_mlp": 1.0152874, "epoch": 0.8271508447063067, "flos": 25226599812480.0, "grad_norm": 2.423079427972755, "language_loss": 0.73472095, "learning_rate": 3.051617223481948e-07, "loss": 0.75656348, "num_input_tokens_seen": 148553330, "step": 6879, "time_per_iteration": 2.899155855178833 }, { "auxiliary_loss_clip": 0.01170911, "auxiliary_loss_mlp": 0.01033611, "balance_loss_clip": 1.0496726, "balance_loss_mlp": 1.02577257, "epoch": 0.8272710875969458, "flos": 17566602900480.0, "grad_norm": 2.405720865217871, "language_loss": 0.75335121, "learning_rate": 3.047482757949078e-07, "loss": 0.77539647, "num_input_tokens_seen": 148570960, "step": 6880, "time_per_iteration": 2.7524986267089844 }, { "auxiliary_loss_clip": 0.01152426, "auxiliary_loss_mlp": 0.01050562, "balance_loss_clip": 1.04312515, "balance_loss_mlp": 1.01666963, "epoch": 0.827391330487585, "flos": 19755465886080.0, "grad_norm": 1.9881622581523855, "language_loss": 0.85980475, "learning_rate": 3.043350864099605e-07, "loss": 0.88183463, "num_input_tokens_seen": 148589520, "step": 6881, "time_per_iteration": 2.826353073120117 }, { "auxiliary_loss_clip": 0.01166987, "auxiliary_loss_mlp": 0.01024773, "balance_loss_clip": 1.04675567, "balance_loss_mlp": 1.01738214, "epoch": 0.827511573378224, "flos": 16835174254080.0, "grad_norm": 2.1332174485786615, "language_loss": 0.80779755, "learning_rate": 3.039221542560315e-07, "loss": 0.82971513, "num_input_tokens_seen": 148606085, "step": 6882, "time_per_iteration": 3.918139696121216 }, { "auxiliary_loss_clip": 0.01159707, "auxiliary_loss_mlp": 0.01027623, "balance_loss_clip": 1.04693174, "balance_loss_mlp": 1.02003288, "epoch": 0.8276318162688631, "flos": 18369242259840.0, "grad_norm": 1.7997661878250948, "language_loss": 0.73493111, "learning_rate": 3.0350947939576356e-07, "loss": 0.75680441, "num_input_tokens_seen": 148625240, "step": 6883, "time_per_iteration": 2.756028890609741 }, { "auxiliary_loss_clip": 0.0117053, "auxiliary_loss_mlp": 0.01028297, "balance_loss_clip": 1.04918075, "balance_loss_mlp": 1.02057838, "epoch": 0.8277520591595022, "flos": 19352691705600.0, "grad_norm": 1.8221685823474199, "language_loss": 0.72313786, "learning_rate": 3.0309706189175876e-07, "loss": 0.74512613, "num_input_tokens_seen": 148645075, "step": 6884, "time_per_iteration": 3.8097639083862305 }, { "auxiliary_loss_clip": 0.01062186, "auxiliary_loss_mlp": 0.01002601, "balance_loss_clip": 1.00852823, "balance_loss_mlp": 1.00166547, "epoch": 0.8278723020501413, "flos": 67918858329600.0, "grad_norm": 0.7650107145964178, "language_loss": 0.5729546, "learning_rate": 3.0268490180658045e-07, "loss": 0.59360248, "num_input_tokens_seen": 148707855, "step": 6885, "time_per_iteration": 3.31330943107605 }, { "auxiliary_loss_clip": 0.01170291, "auxiliary_loss_mlp": 0.01025849, "balance_loss_clip": 1.05028462, "balance_loss_mlp": 1.01878285, "epoch": 0.8279925449407803, "flos": 18185738653440.0, "grad_norm": 2.062340176075373, "language_loss": 0.79744744, "learning_rate": 3.0227299920275305e-07, "loss": 0.81940889, "num_input_tokens_seen": 148724170, "step": 6886, "time_per_iteration": 2.6647605895996094 }, { "auxiliary_loss_clip": 0.01162134, "auxiliary_loss_mlp": 0.01029312, "balance_loss_clip": 1.05003119, "balance_loss_mlp": 1.02119374, "epoch": 0.8281127878314195, "flos": 20631434860800.0, "grad_norm": 2.103645684816963, "language_loss": 0.85581851, "learning_rate": 3.018613541427613e-07, "loss": 0.87773299, "num_input_tokens_seen": 148743690, "step": 6887, "time_per_iteration": 2.884218692779541 }, { "auxiliary_loss_clip": 0.01164353, "auxiliary_loss_mlp": 0.01023621, "balance_loss_clip": 1.04492116, "balance_loss_mlp": 1.0158844, "epoch": 0.8282330307220586, "flos": 18004282122240.0, "grad_norm": 1.7037991671898896, "language_loss": 0.73677224, "learning_rate": 3.0144996668905243e-07, "loss": 0.75865197, "num_input_tokens_seen": 148761070, "step": 6888, "time_per_iteration": 2.7264068126678467 }, { "auxiliary_loss_clip": 0.01157294, "auxiliary_loss_mlp": 0.010579, "balance_loss_clip": 1.04852819, "balance_loss_mlp": 1.02112579, "epoch": 0.8283532736126976, "flos": 20084120352000.0, "grad_norm": 1.8702354899471707, "language_loss": 0.82050139, "learning_rate": 3.010388369040331e-07, "loss": 0.84265333, "num_input_tokens_seen": 148779730, "step": 6889, "time_per_iteration": 3.9065942764282227 }, { "auxiliary_loss_clip": 0.01164469, "auxiliary_loss_mlp": 0.01025991, "balance_loss_clip": 1.04767203, "balance_loss_mlp": 1.0184567, "epoch": 0.8284735165033368, "flos": 31868421805440.0, "grad_norm": 1.6212464516567204, "language_loss": 0.8275426, "learning_rate": 3.0062796485007156e-07, "loss": 0.84944719, "num_input_tokens_seen": 148800670, "step": 6890, "time_per_iteration": 2.893880605697632 }, { "auxiliary_loss_clip": 0.01167228, "auxiliary_loss_mlp": 0.01053447, "balance_loss_clip": 1.04633582, "balance_loss_mlp": 1.01801956, "epoch": 0.8285937593939758, "flos": 26651319840000.0, "grad_norm": 3.2787372718994634, "language_loss": 0.6541847, "learning_rate": 3.002173505894965e-07, "loss": 0.67639148, "num_input_tokens_seen": 148819820, "step": 6891, "time_per_iteration": 2.72458815574646 }, { "auxiliary_loss_clip": 0.01168213, "auxiliary_loss_mlp": 0.0102878, "balance_loss_clip": 1.04732752, "balance_loss_mlp": 1.02124631, "epoch": 0.8287140022846149, "flos": 20193683811840.0, "grad_norm": 3.7046573113050894, "language_loss": 0.62851369, "learning_rate": 2.998069941845973e-07, "loss": 0.65048355, "num_input_tokens_seen": 148838890, "step": 6892, "time_per_iteration": 2.870225667953491 }, { "auxiliary_loss_clip": 0.01060296, "auxiliary_loss_mlp": 0.01000865, "balance_loss_clip": 1.00775611, "balance_loss_mlp": 0.99987525, "epoch": 0.8288342451752541, "flos": 70755980019840.0, "grad_norm": 0.7096944568628373, "language_loss": 0.57437682, "learning_rate": 2.993968956976258e-07, "loss": 0.59498835, "num_input_tokens_seen": 148906635, "step": 6893, "time_per_iteration": 3.357456922531128 }, { "auxiliary_loss_clip": 0.01171456, "auxiliary_loss_mlp": 0.01026628, "balance_loss_clip": 1.04728734, "balance_loss_mlp": 1.01866519, "epoch": 0.8289544880658931, "flos": 24572235795840.0, "grad_norm": 1.94449571858699, "language_loss": 0.70319217, "learning_rate": 2.9898705519079313e-07, "loss": 0.725173, "num_input_tokens_seen": 148925740, "step": 6894, "time_per_iteration": 3.585702657699585 }, { "auxiliary_loss_clip": 0.01154246, "auxiliary_loss_mlp": 0.01021318, "balance_loss_clip": 1.04722142, "balance_loss_mlp": 1.01431441, "epoch": 0.8290747309565322, "flos": 22273378387200.0, "grad_norm": 1.660593829364798, "language_loss": 0.74823856, "learning_rate": 2.985774727262715e-07, "loss": 0.76999414, "num_input_tokens_seen": 148944585, "step": 6895, "time_per_iteration": 2.90714168548584 }, { "auxiliary_loss_clip": 0.01164743, "auxiliary_loss_mlp": 0.01023378, "balance_loss_clip": 1.04614615, "balance_loss_mlp": 1.01655602, "epoch": 0.8291949738471713, "flos": 23255570856960.0, "grad_norm": 2.38334170656207, "language_loss": 0.81385732, "learning_rate": 2.981681483661949e-07, "loss": 0.83573854, "num_input_tokens_seen": 148964170, "step": 6896, "time_per_iteration": 2.758619546890259 }, { "auxiliary_loss_clip": 0.01167843, "auxiliary_loss_mlp": 0.01030448, "balance_loss_clip": 1.05009556, "balance_loss_mlp": 1.02272367, "epoch": 0.8293152167378104, "flos": 52555768185600.0, "grad_norm": 1.5995983716197337, "language_loss": 0.70817316, "learning_rate": 2.9775908217265633e-07, "loss": 0.73015612, "num_input_tokens_seen": 148989405, "step": 6897, "time_per_iteration": 3.0261549949645996 }, { "auxiliary_loss_clip": 0.0105858, "auxiliary_loss_mlp": 0.00999983, "balance_loss_clip": 1.01143968, "balance_loss_mlp": 0.99911273, "epoch": 0.8294354596284494, "flos": 63356156294400.0, "grad_norm": 0.8317231099791389, "language_loss": 0.50295174, "learning_rate": 2.9735027420771253e-07, "loss": 0.52353734, "num_input_tokens_seen": 149049740, "step": 6898, "time_per_iteration": 3.2881321907043457 }, { "auxiliary_loss_clip": 0.01156015, "auxiliary_loss_mlp": 0.01023125, "balance_loss_clip": 1.0494802, "balance_loss_mlp": 1.01615155, "epoch": 0.8295557025190886, "flos": 24827021942400.0, "grad_norm": 1.8244402819231564, "language_loss": 0.71541905, "learning_rate": 2.969417245333774e-07, "loss": 0.73721039, "num_input_tokens_seen": 149069120, "step": 6899, "time_per_iteration": 2.7489726543426514 }, { "auxiliary_loss_clip": 0.01151465, "auxiliary_loss_mlp": 0.01020541, "balance_loss_clip": 1.045645, "balance_loss_mlp": 1.01341546, "epoch": 0.8296759454097277, "flos": 25118580637440.0, "grad_norm": 1.878679205979098, "language_loss": 0.77992928, "learning_rate": 2.9653343321162915e-07, "loss": 0.80164933, "num_input_tokens_seen": 149088630, "step": 6900, "time_per_iteration": 2.843721389770508 }, { "auxiliary_loss_clip": 0.0115895, "auxiliary_loss_mlp": 0.01023195, "balance_loss_clip": 1.04897928, "balance_loss_mlp": 1.0158335, "epoch": 0.8297961883003667, "flos": 24132581326080.0, "grad_norm": 2.0489766968676753, "language_loss": 0.64892352, "learning_rate": 2.9612540030440446e-07, "loss": 0.67074496, "num_input_tokens_seen": 149109175, "step": 6901, "time_per_iteration": 2.773881435394287 }, { "auxiliary_loss_clip": 0.01060116, "auxiliary_loss_mlp": 0.01001953, "balance_loss_clip": 1.00879276, "balance_loss_mlp": 1.0010891, "epoch": 0.8299164311910058, "flos": 67446561375360.0, "grad_norm": 0.8575280837551957, "language_loss": 0.64099151, "learning_rate": 2.9571762587360206e-07, "loss": 0.66161221, "num_input_tokens_seen": 149165560, "step": 6902, "time_per_iteration": 3.2295126914978027 }, { "auxiliary_loss_clip": 0.01154336, "auxiliary_loss_mlp": 0.01022184, "balance_loss_clip": 1.04572654, "balance_loss_mlp": 1.01529968, "epoch": 0.8300366740816449, "flos": 25228682801280.0, "grad_norm": 1.5755744000998406, "language_loss": 0.73782426, "learning_rate": 2.953101099810806e-07, "loss": 0.75958943, "num_input_tokens_seen": 149185165, "step": 6903, "time_per_iteration": 2.8727991580963135 }, { "auxiliary_loss_clip": 0.01158439, "auxiliary_loss_mlp": 0.01023435, "balance_loss_clip": 1.04751825, "balance_loss_mlp": 1.01616335, "epoch": 0.830156916972284, "flos": 18041018757120.0, "grad_norm": 2.1405532027904774, "language_loss": 0.82605559, "learning_rate": 2.9490285268865965e-07, "loss": 0.84787434, "num_input_tokens_seen": 149202655, "step": 6904, "time_per_iteration": 2.7890255451202393 }, { "auxiliary_loss_clip": 0.01167346, "auxiliary_loss_mlp": 0.01024749, "balance_loss_clip": 1.04937208, "balance_loss_mlp": 1.01721478, "epoch": 0.830277159862923, "flos": 26322485806080.0, "grad_norm": 2.382983480027945, "language_loss": 0.79523253, "learning_rate": 2.9449585405812085e-07, "loss": 0.81715351, "num_input_tokens_seen": 149220035, "step": 6905, "time_per_iteration": 2.904057502746582 }, { "auxiliary_loss_clip": 0.0116003, "auxiliary_loss_mlp": 0.0102827, "balance_loss_clip": 1.04586291, "balance_loss_mlp": 1.02021408, "epoch": 0.8303974027535622, "flos": 19938861751680.0, "grad_norm": 3.1030022486421007, "language_loss": 0.74133956, "learning_rate": 2.940891141512043e-07, "loss": 0.76322258, "num_input_tokens_seen": 149238055, "step": 6906, "time_per_iteration": 2.803478240966797 }, { "auxiliary_loss_clip": 0.01161354, "auxiliary_loss_mlp": 0.01025129, "balance_loss_clip": 1.04864824, "balance_loss_mlp": 1.01707613, "epoch": 0.8305176456442013, "flos": 17165552572800.0, "grad_norm": 2.112978190127009, "language_loss": 0.72136098, "learning_rate": 2.9368263302961385e-07, "loss": 0.74322581, "num_input_tokens_seen": 149256755, "step": 6907, "time_per_iteration": 2.8624839782714844 }, { "auxiliary_loss_clip": 0.01152817, "auxiliary_loss_mlp": 0.01021509, "balance_loss_clip": 1.04685152, "balance_loss_mlp": 1.01379013, "epoch": 0.8306378885348403, "flos": 25627614226560.0, "grad_norm": 1.974755694126247, "language_loss": 0.79674256, "learning_rate": 2.9327641075501075e-07, "loss": 0.81848586, "num_input_tokens_seen": 149275745, "step": 6908, "time_per_iteration": 3.8377039432525635 }, { "auxiliary_loss_clip": 0.01158825, "auxiliary_loss_mlp": 0.01025362, "balance_loss_clip": 1.04919934, "balance_loss_mlp": 1.0175066, "epoch": 0.8307581314254795, "flos": 33947864985600.0, "grad_norm": 3.981071065965797, "language_loss": 0.66478431, "learning_rate": 2.9287044738901866e-07, "loss": 0.6866262, "num_input_tokens_seen": 149293730, "step": 6909, "time_per_iteration": 2.829038619995117 }, { "auxiliary_loss_clip": 0.01164226, "auxiliary_loss_mlp": 0.01054663, "balance_loss_clip": 1.04646373, "balance_loss_mlp": 1.01770842, "epoch": 0.8308783743161186, "flos": 17562724231680.0, "grad_norm": 2.363179009861777, "language_loss": 0.90825176, "learning_rate": 2.9246474299322274e-07, "loss": 0.93044066, "num_input_tokens_seen": 149309290, "step": 6910, "time_per_iteration": 3.734138011932373 }, { "auxiliary_loss_clip": 0.01058225, "auxiliary_loss_mlp": 0.01002246, "balance_loss_clip": 1.00908995, "balance_loss_mlp": 1.00128043, "epoch": 0.8309986172067576, "flos": 69412885649280.0, "grad_norm": 0.8949021135853504, "language_loss": 0.63223553, "learning_rate": 2.920592976291678e-07, "loss": 0.65284026, "num_input_tokens_seen": 149366620, "step": 6911, "time_per_iteration": 3.267516851425171 }, { "auxiliary_loss_clip": 0.01163233, "auxiliary_loss_mlp": 0.01027923, "balance_loss_clip": 1.04723835, "balance_loss_mlp": 1.02015638, "epoch": 0.8311188600973968, "flos": 22309755886080.0, "grad_norm": 2.163834338147555, "language_loss": 0.81096339, "learning_rate": 2.916541113583595e-07, "loss": 0.83287489, "num_input_tokens_seen": 149385120, "step": 6912, "time_per_iteration": 2.7184834480285645 }, { "auxiliary_loss_clip": 0.01161271, "auxiliary_loss_mlp": 0.01026373, "balance_loss_clip": 1.0485028, "balance_loss_mlp": 1.01917315, "epoch": 0.8312391029880358, "flos": 18770077105920.0, "grad_norm": 2.3252531533433447, "language_loss": 0.6651752, "learning_rate": 2.912491842422642e-07, "loss": 0.68705171, "num_input_tokens_seen": 149402825, "step": 6913, "time_per_iteration": 2.7989344596862793 }, { "auxiliary_loss_clip": 0.01166626, "auxiliary_loss_mlp": 0.01028675, "balance_loss_clip": 1.04978442, "balance_loss_mlp": 1.0215404, "epoch": 0.8313593458786749, "flos": 20376648714240.0, "grad_norm": 1.6011653372557864, "language_loss": 0.71009874, "learning_rate": 2.9084451634230857e-07, "loss": 0.73205173, "num_input_tokens_seen": 149422125, "step": 6914, "time_per_iteration": 2.842174768447876 }, { "auxiliary_loss_clip": 0.01156697, "auxiliary_loss_mlp": 0.01026076, "balance_loss_clip": 1.04565907, "balance_loss_mlp": 1.01841104, "epoch": 0.831479588769314, "flos": 32124069878400.0, "grad_norm": 2.3055775141077466, "language_loss": 0.71445668, "learning_rate": 2.9044010771988125e-07, "loss": 0.73628443, "num_input_tokens_seen": 149441940, "step": 6915, "time_per_iteration": 3.751141309738159 }, { "auxiliary_loss_clip": 0.01154253, "auxiliary_loss_mlp": 0.01025438, "balance_loss_clip": 1.046556, "balance_loss_mlp": 1.0181601, "epoch": 0.8315998316599531, "flos": 45185929338240.0, "grad_norm": 2.0066501424428758, "language_loss": 0.72396487, "learning_rate": 2.900359584363303e-07, "loss": 0.74576175, "num_input_tokens_seen": 149465045, "step": 6916, "time_per_iteration": 3.052456855773926 }, { "auxiliary_loss_clip": 0.01156472, "auxiliary_loss_mlp": 0.0102802, "balance_loss_clip": 1.05287266, "balance_loss_mlp": 1.02006531, "epoch": 0.8317200745505922, "flos": 18363747479040.0, "grad_norm": 2.2698125871017076, "language_loss": 0.84502369, "learning_rate": 2.8963206855296494e-07, "loss": 0.86686862, "num_input_tokens_seen": 149481285, "step": 6917, "time_per_iteration": 2.88679575920105 }, { "auxiliary_loss_clip": 0.0116136, "auxiliary_loss_mlp": 0.01022784, "balance_loss_clip": 1.04503441, "balance_loss_mlp": 1.01539898, "epoch": 0.8318403174412313, "flos": 24206557386240.0, "grad_norm": 1.9200952864962122, "language_loss": 0.77162337, "learning_rate": 2.892284381310548e-07, "loss": 0.79346478, "num_input_tokens_seen": 149502700, "step": 6918, "time_per_iteration": 2.9389731884002686 }, { "auxiliary_loss_clip": 0.01157056, "auxiliary_loss_mlp": 0.01027396, "balance_loss_clip": 1.04565573, "balance_loss_mlp": 1.01965678, "epoch": 0.8319605603318704, "flos": 22418780641920.0, "grad_norm": 2.495397882099216, "language_loss": 0.72491658, "learning_rate": 2.888250672318302e-07, "loss": 0.74676108, "num_input_tokens_seen": 149520100, "step": 6919, "time_per_iteration": 2.946030378341675 }, { "auxiliary_loss_clip": 0.01169499, "auxiliary_loss_mlp": 0.01022858, "balance_loss_clip": 1.04874551, "balance_loss_mlp": 1.01515102, "epoch": 0.8320808032225094, "flos": 37414501459200.0, "grad_norm": 1.9376435619996493, "language_loss": 0.68723154, "learning_rate": 2.884219559164831e-07, "loss": 0.70915508, "num_input_tokens_seen": 149543245, "step": 6920, "time_per_iteration": 2.9237565994262695 }, { "auxiliary_loss_clip": 0.01164031, "auxiliary_loss_mlp": 0.01027181, "balance_loss_clip": 1.04690313, "balance_loss_mlp": 1.01974547, "epoch": 0.8322010461131486, "flos": 12787395638400.0, "grad_norm": 1.8377787406293258, "language_loss": 0.81373608, "learning_rate": 2.880191042461635e-07, "loss": 0.8356483, "num_input_tokens_seen": 149559185, "step": 6921, "time_per_iteration": 3.7291758060455322 }, { "auxiliary_loss_clip": 0.01158566, "auxiliary_loss_mlp": 0.01022221, "balance_loss_clip": 1.04629469, "balance_loss_mlp": 1.01517844, "epoch": 0.8323212890037877, "flos": 15815455050240.0, "grad_norm": 1.9534154476557881, "language_loss": 0.8006205, "learning_rate": 2.876165122819849e-07, "loss": 0.82242841, "num_input_tokens_seen": 149577165, "step": 6922, "time_per_iteration": 2.80954909324646 }, { "auxiliary_loss_clip": 0.01164725, "auxiliary_loss_mlp": 0.0102296, "balance_loss_clip": 1.0466001, "balance_loss_mlp": 1.01541448, "epoch": 0.8324415318944267, "flos": 21719276208000.0, "grad_norm": 2.283307108362466, "language_loss": 0.79506111, "learning_rate": 2.872141800850201e-07, "loss": 0.81693792, "num_input_tokens_seen": 149594340, "step": 6923, "time_per_iteration": 2.7993979454040527 }, { "auxiliary_loss_clip": 0.01165711, "auxiliary_loss_mlp": 0.01019432, "balance_loss_clip": 1.04605246, "balance_loss_mlp": 1.01214814, "epoch": 0.8325617747850659, "flos": 34198700636160.0, "grad_norm": 1.8251141839499154, "language_loss": 0.73346341, "learning_rate": 2.868121077163024e-07, "loss": 0.75531483, "num_input_tokens_seen": 149613895, "step": 6924, "time_per_iteration": 2.8490428924560547 }, { "auxiliary_loss_clip": 0.01167885, "auxiliary_loss_mlp": 0.01023008, "balance_loss_clip": 1.04805589, "balance_loss_mlp": 1.01575446, "epoch": 0.8326820176757049, "flos": 18369457741440.0, "grad_norm": 1.8569505900630137, "language_loss": 0.72532982, "learning_rate": 2.864102952368257e-07, "loss": 0.74723876, "num_input_tokens_seen": 149631820, "step": 6925, "time_per_iteration": 2.759214401245117 }, { "auxiliary_loss_clip": 0.01145125, "auxiliary_loss_mlp": 0.01027321, "balance_loss_clip": 1.04506493, "balance_loss_mlp": 1.01977503, "epoch": 0.832802260566344, "flos": 35991325716480.0, "grad_norm": 1.2787597571344107, "language_loss": 0.59257865, "learning_rate": 2.860087427075444e-07, "loss": 0.61430311, "num_input_tokens_seen": 149656070, "step": 6926, "time_per_iteration": 2.970608949661255 }, { "auxiliary_loss_clip": 0.01156591, "auxiliary_loss_mlp": 0.01021977, "balance_loss_clip": 1.04783797, "balance_loss_mlp": 1.01498544, "epoch": 0.8329225034569832, "flos": 14244434928000.0, "grad_norm": 2.246109693189035, "language_loss": 0.8626318, "learning_rate": 2.856074501893744e-07, "loss": 0.88441747, "num_input_tokens_seen": 149671270, "step": 6927, "time_per_iteration": 2.72251296043396 }, { "auxiliary_loss_clip": 0.01167109, "auxiliary_loss_mlp": 0.01029006, "balance_loss_clip": 1.04994321, "balance_loss_mlp": 1.02144241, "epoch": 0.8330427463476222, "flos": 18077468083200.0, "grad_norm": 2.1927960348774964, "language_loss": 0.81870437, "learning_rate": 2.8520641774319054e-07, "loss": 0.84066552, "num_input_tokens_seen": 149689360, "step": 6928, "time_per_iteration": 2.855717420578003 }, { "auxiliary_loss_clip": 0.01160898, "auxiliary_loss_mlp": 0.01030709, "balance_loss_clip": 1.04468429, "balance_loss_mlp": 1.02298427, "epoch": 0.8331629892382613, "flos": 18040839189120.0, "grad_norm": 2.2380968413169966, "language_loss": 0.75573736, "learning_rate": 2.848056454298309e-07, "loss": 0.77765346, "num_input_tokens_seen": 149706685, "step": 6929, "time_per_iteration": 2.802988290786743 }, { "auxiliary_loss_clip": 0.01157539, "auxiliary_loss_mlp": 0.01027773, "balance_loss_clip": 1.04694605, "balance_loss_mlp": 1.02026916, "epoch": 0.8332832321289004, "flos": 17457398576640.0, "grad_norm": 2.6371258319484716, "language_loss": 0.6547206, "learning_rate": 2.844051333100905e-07, "loss": 0.67657375, "num_input_tokens_seen": 149724230, "step": 6930, "time_per_iteration": 2.800645351409912 }, { "auxiliary_loss_clip": 0.01161637, "auxiliary_loss_mlp": 0.01026434, "balance_loss_clip": 1.04956639, "balance_loss_mlp": 1.01976466, "epoch": 0.8334034750195395, "flos": 15084852416640.0, "grad_norm": 5.532474867803943, "language_loss": 0.83535695, "learning_rate": 2.840048814447269e-07, "loss": 0.85723764, "num_input_tokens_seen": 149742395, "step": 6931, "time_per_iteration": 2.765394926071167 }, { "auxiliary_loss_clip": 0.01155461, "auxiliary_loss_mlp": 0.01024122, "balance_loss_clip": 1.04816711, "balance_loss_mlp": 1.01701689, "epoch": 0.8335237179101785, "flos": 19427170556160.0, "grad_norm": 3.0361404081668293, "language_loss": 0.74018788, "learning_rate": 2.836048898944587e-07, "loss": 0.76198375, "num_input_tokens_seen": 149760820, "step": 6932, "time_per_iteration": 2.7767746448516846 }, { "auxiliary_loss_clip": 0.01159208, "auxiliary_loss_mlp": 0.01022044, "balance_loss_clip": 1.04594016, "balance_loss_mlp": 1.01537764, "epoch": 0.8336439608008177, "flos": 21762046327680.0, "grad_norm": 2.8658266662771763, "language_loss": 0.72134203, "learning_rate": 2.832051587199642e-07, "loss": 0.74315459, "num_input_tokens_seen": 149778075, "step": 6933, "time_per_iteration": 2.7504518032073975 }, { "auxiliary_loss_clip": 0.01061091, "auxiliary_loss_mlp": 0.01001776, "balance_loss_clip": 1.00787449, "balance_loss_mlp": 1.00079298, "epoch": 0.8337642036914568, "flos": 59702783990400.0, "grad_norm": 0.8078720511206616, "language_loss": 0.57689506, "learning_rate": 2.828056879818821e-07, "loss": 0.59752375, "num_input_tokens_seen": 149837150, "step": 6934, "time_per_iteration": 4.249389410018921 }, { "auxiliary_loss_clip": 0.0115685, "auxiliary_loss_mlp": 0.01018088, "balance_loss_clip": 1.045187, "balance_loss_mlp": 1.01130497, "epoch": 0.8338844465820958, "flos": 27162185022720.0, "grad_norm": 1.90495777520263, "language_loss": 0.83668125, "learning_rate": 2.824064777408117e-07, "loss": 0.85843056, "num_input_tokens_seen": 149856940, "step": 6935, "time_per_iteration": 2.891754627227783 }, { "auxiliary_loss_clip": 0.01164348, "auxiliary_loss_mlp": 0.010286, "balance_loss_clip": 1.04863214, "balance_loss_mlp": 1.02075624, "epoch": 0.8340046894727349, "flos": 30481264425600.0, "grad_norm": 1.7203337054077705, "language_loss": 0.75715476, "learning_rate": 2.8200752805731263e-07, "loss": 0.77908432, "num_input_tokens_seen": 149879930, "step": 6936, "time_per_iteration": 3.8221559524536133 }, { "auxiliary_loss_clip": 0.01163131, "auxiliary_loss_mlp": 0.01027205, "balance_loss_clip": 1.04766965, "balance_loss_mlp": 1.01947689, "epoch": 0.834124932363374, "flos": 27126166659840.0, "grad_norm": 1.524358775353077, "language_loss": 0.80870247, "learning_rate": 2.8160883899190625e-07, "loss": 0.8306058, "num_input_tokens_seen": 149903200, "step": 6937, "time_per_iteration": 2.8159871101379395 }, { "auxiliary_loss_clip": 0.0115242, "auxiliary_loss_mlp": 0.01025338, "balance_loss_clip": 1.05068648, "balance_loss_mlp": 1.01804852, "epoch": 0.8342451752540131, "flos": 24569865498240.0, "grad_norm": 5.326407109255925, "language_loss": 0.7315647, "learning_rate": 2.8121041060507234e-07, "loss": 0.75334227, "num_input_tokens_seen": 149922230, "step": 6938, "time_per_iteration": 2.818463087081909 }, { "auxiliary_loss_clip": 0.01167725, "auxiliary_loss_mlp": 0.01029467, "balance_loss_clip": 1.04673481, "balance_loss_mlp": 1.02174783, "epoch": 0.8343654181446521, "flos": 26615085995520.0, "grad_norm": 1.6255415756273348, "language_loss": 0.71439976, "learning_rate": 2.808122429572528e-07, "loss": 0.7363717, "num_input_tokens_seen": 149942435, "step": 6939, "time_per_iteration": 2.74534010887146 }, { "auxiliary_loss_clip": 0.0116213, "auxiliary_loss_mlp": 0.01023797, "balance_loss_clip": 1.04512978, "balance_loss_mlp": 1.0161494, "epoch": 0.8344856610352913, "flos": 20777268078720.0, "grad_norm": 3.0490827710020634, "language_loss": 0.76210481, "learning_rate": 2.804143361088489e-07, "loss": 0.78396404, "num_input_tokens_seen": 149961615, "step": 6940, "time_per_iteration": 2.8069446086883545 }, { "auxiliary_loss_clip": 0.01153592, "auxiliary_loss_mlp": 0.01027653, "balance_loss_clip": 1.04625559, "balance_loss_mlp": 1.02007723, "epoch": 0.8346059039259304, "flos": 26095960684800.0, "grad_norm": 2.521048529469844, "language_loss": 0.77794421, "learning_rate": 2.8001669012022277e-07, "loss": 0.79975665, "num_input_tokens_seen": 149979585, "step": 6941, "time_per_iteration": 3.738878011703491 }, { "auxiliary_loss_clip": 0.01163577, "auxiliary_loss_mlp": 0.01025384, "balance_loss_clip": 1.0499661, "balance_loss_mlp": 1.01764786, "epoch": 0.8347261468165694, "flos": 29027708755200.0, "grad_norm": 1.689558189884814, "language_loss": 0.69104475, "learning_rate": 2.7961930505169795e-07, "loss": 0.71293437, "num_input_tokens_seen": 150003830, "step": 6942, "time_per_iteration": 2.832259178161621 }, { "auxiliary_loss_clip": 0.01164405, "auxiliary_loss_mlp": 0.01052372, "balance_loss_clip": 1.04755914, "balance_loss_mlp": 1.01688409, "epoch": 0.8348463897072086, "flos": 26396461866240.0, "grad_norm": 1.9731806147269866, "language_loss": 0.76430821, "learning_rate": 2.792221809635558e-07, "loss": 0.78647602, "num_input_tokens_seen": 150024460, "step": 6943, "time_per_iteration": 2.7448134422302246 }, { "auxiliary_loss_clip": 0.01149422, "auxiliary_loss_mlp": 0.01026896, "balance_loss_clip": 1.04638779, "balance_loss_mlp": 1.01920104, "epoch": 0.8349666325978476, "flos": 23367720096000.0, "grad_norm": 2.0199174715708836, "language_loss": 0.74878794, "learning_rate": 2.788253179160411e-07, "loss": 0.77055109, "num_input_tokens_seen": 150045620, "step": 6944, "time_per_iteration": 2.8076894283294678 }, { "auxiliary_loss_clip": 0.01159001, "auxiliary_loss_mlp": 0.01024482, "balance_loss_clip": 1.04560578, "balance_loss_mlp": 1.01721072, "epoch": 0.8350868754884867, "flos": 12896528135040.0, "grad_norm": 1.7780683203225731, "language_loss": 0.64880192, "learning_rate": 2.7842871596935725e-07, "loss": 0.67063677, "num_input_tokens_seen": 150064135, "step": 6945, "time_per_iteration": 2.8110926151275635 }, { "auxiliary_loss_clip": 0.01166604, "auxiliary_loss_mlp": 0.01027473, "balance_loss_clip": 1.04526806, "balance_loss_mlp": 1.01993334, "epoch": 0.8352071183791259, "flos": 26505522535680.0, "grad_norm": 2.0702397154094885, "language_loss": 0.69220716, "learning_rate": 2.780323751836682e-07, "loss": 0.71414793, "num_input_tokens_seen": 150085350, "step": 6946, "time_per_iteration": 2.8444507122039795 }, { "auxiliary_loss_clip": 0.01161054, "auxiliary_loss_mlp": 0.01053743, "balance_loss_clip": 1.04665184, "balance_loss_mlp": 1.01855493, "epoch": 0.8353273612697649, "flos": 20668063754880.0, "grad_norm": 1.424343288490367, "language_loss": 0.7873829, "learning_rate": 2.7763629561909876e-07, "loss": 0.80953085, "num_input_tokens_seen": 150106180, "step": 6947, "time_per_iteration": 3.763827323913574 }, { "auxiliary_loss_clip": 0.01162637, "auxiliary_loss_mlp": 0.01028197, "balance_loss_clip": 1.04436517, "balance_loss_mlp": 1.02063274, "epoch": 0.835447604160404, "flos": 19754137082880.0, "grad_norm": 2.623403478638838, "language_loss": 0.76954168, "learning_rate": 2.772404773357335e-07, "loss": 0.79145002, "num_input_tokens_seen": 150125585, "step": 6948, "time_per_iteration": 2.7526051998138428 }, { "auxiliary_loss_clip": 0.01151915, "auxiliary_loss_mlp": 0.0102972, "balance_loss_clip": 1.04741526, "balance_loss_mlp": 1.02215576, "epoch": 0.8355678470510431, "flos": 23435842239360.0, "grad_norm": 1.8680420312234902, "language_loss": 0.78554839, "learning_rate": 2.7684492039361853e-07, "loss": 0.80736476, "num_input_tokens_seen": 150144810, "step": 6949, "time_per_iteration": 2.8080952167510986 }, { "auxiliary_loss_clip": 0.0116875, "auxiliary_loss_mlp": 0.01025871, "balance_loss_clip": 1.04834723, "balance_loss_mlp": 1.01827729, "epoch": 0.8356880899416822, "flos": 21214588164480.0, "grad_norm": 1.7262864123790662, "language_loss": 0.83700871, "learning_rate": 2.764496248527586e-07, "loss": 0.85895503, "num_input_tokens_seen": 150163785, "step": 6950, "time_per_iteration": 2.7672901153564453 }, { "auxiliary_loss_clip": 0.01163779, "auxiliary_loss_mlp": 0.01025581, "balance_loss_clip": 1.04609323, "balance_loss_mlp": 1.0176599, "epoch": 0.8358083328323213, "flos": 28037543466240.0, "grad_norm": 1.8967837178670555, "language_loss": 0.78782952, "learning_rate": 2.760545907731211e-07, "loss": 0.80972314, "num_input_tokens_seen": 150184360, "step": 6951, "time_per_iteration": 2.8780550956726074 }, { "auxiliary_loss_clip": 0.01165229, "auxiliary_loss_mlp": 0.01026804, "balance_loss_clip": 1.04687774, "balance_loss_mlp": 1.01963973, "epoch": 0.8359285757229604, "flos": 27783655159680.0, "grad_norm": 1.7060543820403975, "language_loss": 0.67869705, "learning_rate": 2.75659818214631e-07, "loss": 0.70061737, "num_input_tokens_seen": 150205465, "step": 6952, "time_per_iteration": 2.8887994289398193 }, { "auxiliary_loss_clip": 0.01163205, "auxiliary_loss_mlp": 0.010237, "balance_loss_clip": 1.04700923, "balance_loss_mlp": 1.01642275, "epoch": 0.8360488186135995, "flos": 21435115714560.0, "grad_norm": 1.6966230331946477, "language_loss": 0.78219724, "learning_rate": 2.752653072371749e-07, "loss": 0.8040663, "num_input_tokens_seen": 150224900, "step": 6953, "time_per_iteration": 2.7997114658355713 }, { "auxiliary_loss_clip": 0.01150076, "auxiliary_loss_mlp": 0.01028574, "balance_loss_clip": 1.04513526, "balance_loss_mlp": 1.02163863, "epoch": 0.8361690615042385, "flos": 27632327160960.0, "grad_norm": 1.6859411379505327, "language_loss": 0.74868107, "learning_rate": 2.7487105790060105e-07, "loss": 0.77046752, "num_input_tokens_seen": 150244310, "step": 6954, "time_per_iteration": 2.9687678813934326 }, { "auxiliary_loss_clip": 0.0116335, "auxiliary_loss_mlp": 0.01024726, "balance_loss_clip": 1.04452229, "balance_loss_mlp": 1.01752305, "epoch": 0.8362893043948777, "flos": 39202529598720.0, "grad_norm": 1.850189583167143, "language_loss": 0.69076324, "learning_rate": 2.7447707026471587e-07, "loss": 0.71264404, "num_input_tokens_seen": 150267285, "step": 6955, "time_per_iteration": 2.9019086360931396 }, { "auxiliary_loss_clip": 0.01158794, "auxiliary_loss_mlp": 0.01025856, "balance_loss_clip": 1.0465138, "balance_loss_mlp": 1.01865244, "epoch": 0.8364095472855168, "flos": 24785329230720.0, "grad_norm": 2.0919300139921386, "language_loss": 0.79332602, "learning_rate": 2.740833443892874e-07, "loss": 0.81517249, "num_input_tokens_seen": 150285455, "step": 6956, "time_per_iteration": 2.837855100631714 }, { "auxiliary_loss_clip": 0.01159311, "auxiliary_loss_mlp": 0.01026022, "balance_loss_clip": 1.04584408, "balance_loss_mlp": 1.01890779, "epoch": 0.8365297901761558, "flos": 22743412784640.0, "grad_norm": 1.7765486960168275, "language_loss": 0.79685509, "learning_rate": 2.7368988033404327e-07, "loss": 0.81870842, "num_input_tokens_seen": 150302970, "step": 6957, "time_per_iteration": 2.8676939010620117 }, { "auxiliary_loss_clip": 0.01157697, "auxiliary_loss_mlp": 0.01023263, "balance_loss_clip": 1.04557574, "balance_loss_mlp": 1.01629233, "epoch": 0.836650033066795, "flos": 28396003242240.0, "grad_norm": 1.5460774847403556, "language_loss": 0.84476542, "learning_rate": 2.732966781586712e-07, "loss": 0.86657506, "num_input_tokens_seen": 150322715, "step": 6958, "time_per_iteration": 2.829516649246216 }, { "auxiliary_loss_clip": 0.01157374, "auxiliary_loss_mlp": 0.01024084, "balance_loss_clip": 1.04535675, "balance_loss_mlp": 1.01695538, "epoch": 0.836770275957434, "flos": 22236857233920.0, "grad_norm": 1.6922436236785476, "language_loss": 0.66883409, "learning_rate": 2.729037379228205e-07, "loss": 0.69064867, "num_input_tokens_seen": 150342900, "step": 6959, "time_per_iteration": 2.6972880363464355 }, { "auxiliary_loss_clip": 0.0115912, "auxiliary_loss_mlp": 0.01027307, "balance_loss_clip": 1.04866385, "balance_loss_mlp": 1.01987481, "epoch": 0.8368905188480731, "flos": 22491930689280.0, "grad_norm": 1.468932945129131, "language_loss": 0.80539674, "learning_rate": 2.725110596860998e-07, "loss": 0.82726097, "num_input_tokens_seen": 150363580, "step": 6960, "time_per_iteration": 3.739044666290283 }, { "auxiliary_loss_clip": 0.01151426, "auxiliary_loss_mlp": 0.01028125, "balance_loss_clip": 1.04770613, "balance_loss_mlp": 1.02097881, "epoch": 0.8370107617387123, "flos": 13370405287680.0, "grad_norm": 2.126204982581903, "language_loss": 0.69808316, "learning_rate": 2.7211864350807776e-07, "loss": 0.71987867, "num_input_tokens_seen": 150381780, "step": 6961, "time_per_iteration": 2.80293607711792 }, { "auxiliary_loss_clip": 0.01166238, "auxiliary_loss_mlp": 0.01024443, "balance_loss_clip": 1.04721832, "balance_loss_mlp": 1.01721895, "epoch": 0.8371310046293513, "flos": 25261289372160.0, "grad_norm": 1.7446985525673817, "language_loss": 0.73970294, "learning_rate": 2.717264894482836e-07, "loss": 0.76160979, "num_input_tokens_seen": 150402120, "step": 6962, "time_per_iteration": 3.7029595375061035 }, { "auxiliary_loss_clip": 0.01167655, "auxiliary_loss_mlp": 0.01026046, "balance_loss_clip": 1.04938328, "balance_loss_mlp": 1.01827919, "epoch": 0.8372512475199904, "flos": 19792705311360.0, "grad_norm": 2.2068326118633204, "language_loss": 0.80768698, "learning_rate": 2.7133459756620646e-07, "loss": 0.82962394, "num_input_tokens_seen": 150419315, "step": 6963, "time_per_iteration": 2.757495164871216 }, { "auxiliary_loss_clip": 0.01159478, "auxiliary_loss_mlp": 0.01028918, "balance_loss_clip": 1.04762268, "balance_loss_mlp": 1.02127099, "epoch": 0.8373714904106295, "flos": 19391224020480.0, "grad_norm": 2.474576751161846, "language_loss": 0.73559147, "learning_rate": 2.7094296792129733e-07, "loss": 0.75747544, "num_input_tokens_seen": 150438915, "step": 6964, "time_per_iteration": 2.775038480758667 }, { "auxiliary_loss_clip": 0.01161118, "auxiliary_loss_mlp": 0.01025941, "balance_loss_clip": 1.04478705, "balance_loss_mlp": 1.01883924, "epoch": 0.8374917333012686, "flos": 14975935401600.0, "grad_norm": 2.773403997060099, "language_loss": 0.75401843, "learning_rate": 2.7055160057296424e-07, "loss": 0.77588904, "num_input_tokens_seen": 150456155, "step": 6965, "time_per_iteration": 2.74141788482666 }, { "auxiliary_loss_clip": 0.01158348, "auxiliary_loss_mlp": 0.01021722, "balance_loss_clip": 1.04746711, "balance_loss_mlp": 1.01384795, "epoch": 0.8376119761919076, "flos": 30331839847680.0, "grad_norm": 2.226303905642295, "language_loss": 0.7259891, "learning_rate": 2.7016049558057896e-07, "loss": 0.7477898, "num_input_tokens_seen": 150478115, "step": 6966, "time_per_iteration": 2.8461740016937256 }, { "auxiliary_loss_clip": 0.01164643, "auxiliary_loss_mlp": 0.01024982, "balance_loss_clip": 1.04832447, "balance_loss_mlp": 1.01764488, "epoch": 0.8377322190825467, "flos": 29423336129280.0, "grad_norm": 2.2865705915441703, "language_loss": 0.70685977, "learning_rate": 2.6976965300347074e-07, "loss": 0.72875607, "num_input_tokens_seen": 150500725, "step": 6967, "time_per_iteration": 3.7001702785491943 }, { "auxiliary_loss_clip": 0.01155881, "auxiliary_loss_mlp": 0.01024431, "balance_loss_clip": 1.04520726, "balance_loss_mlp": 1.01684928, "epoch": 0.8378524619731859, "flos": 26687086807680.0, "grad_norm": 2.203323880847998, "language_loss": 0.69315845, "learning_rate": 2.693790729009309e-07, "loss": 0.71496159, "num_input_tokens_seen": 150522335, "step": 6968, "time_per_iteration": 2.994068145751953 }, { "auxiliary_loss_clip": 0.01158772, "auxiliary_loss_mlp": 0.01029419, "balance_loss_clip": 1.0455153, "balance_loss_mlp": 1.02239799, "epoch": 0.8379727048638249, "flos": 20703866636160.0, "grad_norm": 2.0506151698953787, "language_loss": 0.88649881, "learning_rate": 2.6898875533220946e-07, "loss": 0.90838075, "num_input_tokens_seen": 150541640, "step": 6969, "time_per_iteration": 2.8355295658111572 }, { "auxiliary_loss_clip": 0.01161391, "auxiliary_loss_mlp": 0.01022721, "balance_loss_clip": 1.04628134, "balance_loss_mlp": 1.0161761, "epoch": 0.838092947754464, "flos": 20084084438400.0, "grad_norm": 1.8044629887847523, "language_loss": 0.81768453, "learning_rate": 2.685987003565171e-07, "loss": 0.8395257, "num_input_tokens_seen": 150559680, "step": 6970, "time_per_iteration": 2.714660167694092 }, { "auxiliary_loss_clip": 0.0115015, "auxiliary_loss_mlp": 0.01024752, "balance_loss_clip": 1.04741812, "balance_loss_mlp": 1.01762605, "epoch": 0.8382131906451031, "flos": 18113270964480.0, "grad_norm": 5.128322382734472, "language_loss": 0.75587499, "learning_rate": 2.6820890803302566e-07, "loss": 0.77762401, "num_input_tokens_seen": 150575205, "step": 6971, "time_per_iteration": 2.721457004547119 }, { "auxiliary_loss_clip": 0.01160332, "auxiliary_loss_mlp": 0.01021757, "balance_loss_clip": 1.04931045, "balance_loss_mlp": 1.01467609, "epoch": 0.8383334335357422, "flos": 17092653920640.0, "grad_norm": 2.0377419656379234, "language_loss": 0.8183679, "learning_rate": 2.6781937842086557e-07, "loss": 0.8401888, "num_input_tokens_seen": 150593995, "step": 6972, "time_per_iteration": 2.7266077995300293 }, { "auxiliary_loss_clip": 0.01163297, "auxiliary_loss_mlp": 0.01025342, "balance_loss_clip": 1.04585397, "balance_loss_mlp": 1.01815403, "epoch": 0.8384536764263812, "flos": 20704728562560.0, "grad_norm": 1.8189258173537641, "language_loss": 0.6721276, "learning_rate": 2.6743011157912933e-07, "loss": 0.69401395, "num_input_tokens_seen": 150613715, "step": 6973, "time_per_iteration": 3.723055839538574 }, { "auxiliary_loss_clip": 0.01157047, "auxiliary_loss_mlp": 0.0102924, "balance_loss_clip": 1.04884732, "balance_loss_mlp": 1.02166736, "epoch": 0.8385739193170204, "flos": 28986842056320.0, "grad_norm": 3.1162940347350774, "language_loss": 0.65256691, "learning_rate": 2.6704110756686725e-07, "loss": 0.67442983, "num_input_tokens_seen": 150634540, "step": 6974, "time_per_iteration": 2.8353474140167236 }, { "auxiliary_loss_clip": 0.01158107, "auxiliary_loss_mlp": 0.01055631, "balance_loss_clip": 1.04635644, "balance_loss_mlp": 1.01993847, "epoch": 0.8386941622076595, "flos": 23438068882560.0, "grad_norm": 1.8923427168635518, "language_loss": 0.84123915, "learning_rate": 2.6665236644309085e-07, "loss": 0.86337656, "num_input_tokens_seen": 150654850, "step": 6975, "time_per_iteration": 2.835358142852783 }, { "auxiliary_loss_clip": 0.01161591, "auxiliary_loss_mlp": 0.01023293, "balance_loss_clip": 1.04539466, "balance_loss_mlp": 1.01653957, "epoch": 0.8388144050982985, "flos": 23002724044800.0, "grad_norm": 2.4602428598391803, "language_loss": 0.80017245, "learning_rate": 2.662638882667727e-07, "loss": 0.82202137, "num_input_tokens_seen": 150673790, "step": 6976, "time_per_iteration": 2.751633882522583 }, { "auxiliary_loss_clip": 0.01166311, "auxiliary_loss_mlp": 0.01025775, "balance_loss_clip": 1.04496789, "balance_loss_mlp": 1.01849103, "epoch": 0.8389346479889377, "flos": 24280353878400.0, "grad_norm": 1.8688422997530074, "language_loss": 0.72754431, "learning_rate": 2.658756730968443e-07, "loss": 0.74946517, "num_input_tokens_seen": 150692255, "step": 6977, "time_per_iteration": 2.771509885787964 }, { "auxiliary_loss_clip": 0.01164117, "auxiliary_loss_mlp": 0.01021427, "balance_loss_clip": 1.0485121, "balance_loss_mlp": 1.01415241, "epoch": 0.8390548908795767, "flos": 21215019127680.0, "grad_norm": 2.0886970686399864, "language_loss": 0.88456273, "learning_rate": 2.654877209921975e-07, "loss": 0.9064182, "num_input_tokens_seen": 150709790, "step": 6978, "time_per_iteration": 2.761460304260254 }, { "auxiliary_loss_clip": 0.01160494, "auxiliary_loss_mlp": 0.01024862, "balance_loss_clip": 1.04771328, "balance_loss_mlp": 1.01676762, "epoch": 0.8391751337702158, "flos": 35627299332480.0, "grad_norm": 2.2803828138181665, "language_loss": 0.63196146, "learning_rate": 2.651000320116843e-07, "loss": 0.65381503, "num_input_tokens_seen": 150730675, "step": 6979, "time_per_iteration": 3.009519338607788 }, { "auxiliary_loss_clip": 0.01155604, "auxiliary_loss_mlp": 0.01050316, "balance_loss_clip": 1.04752958, "balance_loss_mlp": 1.01519489, "epoch": 0.839295376660855, "flos": 21325229032320.0, "grad_norm": 1.9874731269616617, "language_loss": 0.76351798, "learning_rate": 2.647126062141163e-07, "loss": 0.78557718, "num_input_tokens_seen": 150749750, "step": 6980, "time_per_iteration": 2.8536994457244873 }, { "auxiliary_loss_clip": 0.01164444, "auxiliary_loss_mlp": 0.01023209, "balance_loss_clip": 1.04737997, "balance_loss_mlp": 1.01594329, "epoch": 0.839415619551494, "flos": 18442535961600.0, "grad_norm": 2.786575766737296, "language_loss": 0.8391239, "learning_rate": 2.643254436582669e-07, "loss": 0.86100042, "num_input_tokens_seen": 150769240, "step": 6981, "time_per_iteration": 2.799853563308716 }, { "auxiliary_loss_clip": 0.01156786, "auxiliary_loss_mlp": 0.01019407, "balance_loss_clip": 1.04708385, "balance_loss_mlp": 1.01169991, "epoch": 0.8395358624421331, "flos": 23221958705280.0, "grad_norm": 1.8481418908334188, "language_loss": 0.82860988, "learning_rate": 2.6393854440286743e-07, "loss": 0.85037184, "num_input_tokens_seen": 150788410, "step": 6982, "time_per_iteration": 2.8730709552764893 }, { "auxiliary_loss_clip": 0.01166564, "auxiliary_loss_mlp": 0.0102482, "balance_loss_clip": 1.0494312, "balance_loss_mlp": 1.01813877, "epoch": 0.8396561053327722, "flos": 24381657210240.0, "grad_norm": 1.9537795243702745, "language_loss": 0.70672119, "learning_rate": 2.6355190850661045e-07, "loss": 0.72863507, "num_input_tokens_seen": 150805245, "step": 6983, "time_per_iteration": 2.7488486766815186 }, { "auxiliary_loss_clip": 0.01161302, "auxiliary_loss_mlp": 0.01021738, "balance_loss_clip": 1.04926264, "balance_loss_mlp": 1.01494622, "epoch": 0.8397763482234113, "flos": 22237755073920.0, "grad_norm": 1.5822162784566671, "language_loss": 0.86833835, "learning_rate": 2.631655360281486e-07, "loss": 0.89016879, "num_input_tokens_seen": 150824920, "step": 6984, "time_per_iteration": 2.989769458770752 }, { "auxiliary_loss_clip": 0.01167302, "auxiliary_loss_mlp": 0.01054694, "balance_loss_clip": 1.04635286, "balance_loss_mlp": 1.01911843, "epoch": 0.8398965911140504, "flos": 22163743100160.0, "grad_norm": 1.9111613902342108, "language_loss": 0.65618509, "learning_rate": 2.6277942702609323e-07, "loss": 0.67840505, "num_input_tokens_seen": 150844400, "step": 6985, "time_per_iteration": 2.8762903213500977 }, { "auxiliary_loss_clip": 0.01162727, "auxiliary_loss_mlp": 0.01025013, "balance_loss_clip": 1.04982948, "balance_loss_mlp": 1.01774669, "epoch": 0.8400168340046895, "flos": 21542775753600.0, "grad_norm": 1.8944964826312962, "language_loss": 0.87438571, "learning_rate": 2.623935815590186e-07, "loss": 0.89626318, "num_input_tokens_seen": 150862780, "step": 6986, "time_per_iteration": 3.844318151473999 }, { "auxiliary_loss_clip": 0.01161966, "auxiliary_loss_mlp": 0.01025656, "balance_loss_clip": 1.0484302, "balance_loss_mlp": 1.01840234, "epoch": 0.8401370768953286, "flos": 22491966602880.0, "grad_norm": 1.8019260314391825, "language_loss": 0.80669653, "learning_rate": 2.6200799968545516e-07, "loss": 0.82857275, "num_input_tokens_seen": 150883075, "step": 6987, "time_per_iteration": 2.851304054260254 }, { "auxiliary_loss_clip": 0.01060265, "auxiliary_loss_mlp": 0.01002627, "balance_loss_clip": 1.01130676, "balance_loss_mlp": 1.00179887, "epoch": 0.8402573197859676, "flos": 59238890818560.0, "grad_norm": 0.7937976324137495, "language_loss": 0.56454384, "learning_rate": 2.616226814638969e-07, "loss": 0.58517277, "num_input_tokens_seen": 150948180, "step": 6988, "time_per_iteration": 4.346230745315552 }, { "auxiliary_loss_clip": 0.01161109, "auxiliary_loss_mlp": 0.01022267, "balance_loss_clip": 1.04684114, "balance_loss_mlp": 1.0150907, "epoch": 0.8403775626766068, "flos": 22674608282880.0, "grad_norm": 1.8439376501441043, "language_loss": 0.77497017, "learning_rate": 2.612376269527954e-07, "loss": 0.79680389, "num_input_tokens_seen": 150967885, "step": 6989, "time_per_iteration": 2.8459668159484863 }, { "auxiliary_loss_clip": 0.01155806, "auxiliary_loss_mlp": 0.01029356, "balance_loss_clip": 1.04626882, "balance_loss_mlp": 1.02247179, "epoch": 0.8404978055672458, "flos": 19609704495360.0, "grad_norm": 2.1709998199649783, "language_loss": 0.67428088, "learning_rate": 2.608528362105635e-07, "loss": 0.69613254, "num_input_tokens_seen": 150987255, "step": 6990, "time_per_iteration": 2.7750914096832275 }, { "auxiliary_loss_clip": 0.01159268, "auxiliary_loss_mlp": 0.01020749, "balance_loss_clip": 1.04678285, "balance_loss_mlp": 1.01311958, "epoch": 0.8406180484578849, "flos": 27526929678720.0, "grad_norm": 1.920542954415184, "language_loss": 0.73377252, "learning_rate": 2.6046830929557374e-07, "loss": 0.75557268, "num_input_tokens_seen": 151006905, "step": 6991, "time_per_iteration": 2.762279748916626 }, { "auxiliary_loss_clip": 0.01155291, "auxiliary_loss_mlp": 0.01027806, "balance_loss_clip": 1.04924452, "balance_loss_mlp": 1.02011442, "epoch": 0.8407382913485241, "flos": 22127473342080.0, "grad_norm": 2.366918367897451, "language_loss": 0.84813559, "learning_rate": 2.6008404626615776e-07, "loss": 0.86996651, "num_input_tokens_seen": 151025405, "step": 6992, "time_per_iteration": 2.745694398880005 }, { "auxiliary_loss_clip": 0.01169238, "auxiliary_loss_mlp": 0.01023716, "balance_loss_clip": 1.05054402, "balance_loss_mlp": 1.01639652, "epoch": 0.8408585342391631, "flos": 13918473982080.0, "grad_norm": 3.625069350600834, "language_loss": 0.73890901, "learning_rate": 2.597000471806092e-07, "loss": 0.76083851, "num_input_tokens_seen": 151041970, "step": 6993, "time_per_iteration": 3.632779359817505 }, { "auxiliary_loss_clip": 0.01159292, "auxiliary_loss_mlp": 0.0103071, "balance_loss_clip": 1.0506227, "balance_loss_mlp": 1.02263916, "epoch": 0.8409787771298022, "flos": 20187865808640.0, "grad_norm": 2.34547985731187, "language_loss": 0.73107183, "learning_rate": 2.593163120971793e-07, "loss": 0.75297189, "num_input_tokens_seen": 151060835, "step": 6994, "time_per_iteration": 2.7838761806488037 }, { "auxiliary_loss_clip": 0.01149828, "auxiliary_loss_mlp": 0.01021538, "balance_loss_clip": 1.04746318, "balance_loss_mlp": 1.01461184, "epoch": 0.8410990200204413, "flos": 23142523777920.0, "grad_norm": 2.015051855350199, "language_loss": 0.68826139, "learning_rate": 2.5893284107408165e-07, "loss": 0.709975, "num_input_tokens_seen": 151078205, "step": 6995, "time_per_iteration": 2.827915906906128 }, { "auxiliary_loss_clip": 0.01156016, "auxiliary_loss_mlp": 0.0102777, "balance_loss_clip": 1.0510124, "balance_loss_mlp": 1.02031922, "epoch": 0.8412192629110804, "flos": 24027219757440.0, "grad_norm": 4.69942525625176, "language_loss": 0.77986079, "learning_rate": 2.5854963416948726e-07, "loss": 0.80169868, "num_input_tokens_seen": 151100470, "step": 6996, "time_per_iteration": 2.7832818031311035 }, { "auxiliary_loss_clip": 0.01156492, "auxiliary_loss_mlp": 0.01029679, "balance_loss_clip": 1.04698157, "balance_loss_mlp": 1.02236545, "epoch": 0.8413395058017195, "flos": 25591703604480.0, "grad_norm": 1.6825396621266613, "language_loss": 0.69576097, "learning_rate": 2.5816669144152816e-07, "loss": 0.71762264, "num_input_tokens_seen": 151121650, "step": 6997, "time_per_iteration": 2.794173002243042 }, { "auxiliary_loss_clip": 0.01060485, "auxiliary_loss_mlp": 0.01002207, "balance_loss_clip": 1.00766182, "balance_loss_mlp": 1.00122976, "epoch": 0.8414597486923585, "flos": 63635396624640.0, "grad_norm": 0.8735471508030072, "language_loss": 0.66333479, "learning_rate": 2.5778401294829777e-07, "loss": 0.68396169, "num_input_tokens_seen": 151180390, "step": 6998, "time_per_iteration": 3.290274143218994 }, { "auxiliary_loss_clip": 0.01159816, "auxiliary_loss_mlp": 0.0105168, "balance_loss_clip": 1.0466032, "balance_loss_mlp": 1.01680565, "epoch": 0.8415799915829977, "flos": 19098731571840.0, "grad_norm": 1.6978311799480403, "language_loss": 0.65035164, "learning_rate": 2.574015987478473e-07, "loss": 0.67246664, "num_input_tokens_seen": 151198520, "step": 6999, "time_per_iteration": 3.6927125453948975 }, { "auxiliary_loss_clip": 0.01167746, "auxiliary_loss_mlp": 0.01026353, "balance_loss_clip": 1.04798794, "balance_loss_mlp": 1.01875687, "epoch": 0.8417002344736367, "flos": 19821612781440.0, "grad_norm": 2.354716675905176, "language_loss": 0.87074554, "learning_rate": 2.570194488981887e-07, "loss": 0.89268655, "num_input_tokens_seen": 151215065, "step": 7000, "time_per_iteration": 2.7613158226013184 }, { "auxiliary_loss_clip": 0.01060251, "auxiliary_loss_mlp": 0.01001912, "balance_loss_clip": 1.0075773, "balance_loss_mlp": 1.00089884, "epoch": 0.8418204773642758, "flos": 62161516834560.0, "grad_norm": 0.8369998854241064, "language_loss": 0.60280371, "learning_rate": 2.566375634572939e-07, "loss": 0.62342536, "num_input_tokens_seen": 151275705, "step": 7001, "time_per_iteration": 3.228722095489502 }, { "auxiliary_loss_clip": 0.01164601, "auxiliary_loss_mlp": 0.01027046, "balance_loss_clip": 1.04755473, "balance_loss_mlp": 1.01916885, "epoch": 0.841940720254915, "flos": 17092905315840.0, "grad_norm": 1.9192756841276424, "language_loss": 0.76042438, "learning_rate": 2.562559424830943e-07, "loss": 0.78234088, "num_input_tokens_seen": 151293665, "step": 7002, "time_per_iteration": 2.7903106212615967 }, { "auxiliary_loss_clip": 0.0115933, "auxiliary_loss_mlp": 0.01023812, "balance_loss_clip": 1.04834867, "balance_loss_mlp": 1.01630819, "epoch": 0.842060963145554, "flos": 16283586026880.0, "grad_norm": 2.792334436307396, "language_loss": 0.70480156, "learning_rate": 2.5587458603348256e-07, "loss": 0.72663295, "num_input_tokens_seen": 151310955, "step": 7003, "time_per_iteration": 2.6958065032958984 }, { "auxiliary_loss_clip": 0.01152453, "auxiliary_loss_mlp": 0.01028666, "balance_loss_clip": 1.04677248, "balance_loss_mlp": 1.02086091, "epoch": 0.8421812060361931, "flos": 21908238681600.0, "grad_norm": 1.8539406783976025, "language_loss": 0.84161282, "learning_rate": 2.554934941663085e-07, "loss": 0.86342394, "num_input_tokens_seen": 151328490, "step": 7004, "time_per_iteration": 2.7540500164031982 }, { "auxiliary_loss_clip": 0.01158713, "auxiliary_loss_mlp": 0.01025359, "balance_loss_clip": 1.04741549, "balance_loss_mlp": 1.0173068, "epoch": 0.8423014489268322, "flos": 27777693502080.0, "grad_norm": 3.1494469393193123, "language_loss": 0.73336154, "learning_rate": 2.5511266693938484e-07, "loss": 0.75520229, "num_input_tokens_seen": 151346950, "step": 7005, "time_per_iteration": 2.755786895751953 }, { "auxiliary_loss_clip": 0.01158601, "auxiliary_loss_mlp": 0.01025126, "balance_loss_clip": 1.04916453, "balance_loss_mlp": 1.01756525, "epoch": 0.8424216918174713, "flos": 25117610970240.0, "grad_norm": 1.9128212881761153, "language_loss": 0.77755916, "learning_rate": 2.547321044104822e-07, "loss": 0.79939646, "num_input_tokens_seen": 151368445, "step": 7006, "time_per_iteration": 2.670009136199951 }, { "auxiliary_loss_clip": 0.01170926, "auxiliary_loss_mlp": 0.0102977, "balance_loss_clip": 1.04969907, "balance_loss_mlp": 1.02187824, "epoch": 0.8425419347081103, "flos": 24748448941440.0, "grad_norm": 1.9472655334799769, "language_loss": 0.768408, "learning_rate": 2.5435180663733113e-07, "loss": 0.79041493, "num_input_tokens_seen": 151388745, "step": 7007, "time_per_iteration": 2.738847494125366 }, { "auxiliary_loss_clip": 0.01164359, "auxiliary_loss_mlp": 0.01025133, "balance_loss_clip": 1.04828024, "balance_loss_mlp": 1.01736951, "epoch": 0.8426621775987495, "flos": 24820916630400.0, "grad_norm": 2.5617752737007415, "language_loss": 0.71999025, "learning_rate": 2.539717736776241e-07, "loss": 0.74188519, "num_input_tokens_seen": 151404970, "step": 7008, "time_per_iteration": 2.775681734085083 }, { "auxiliary_loss_clip": 0.01158234, "auxiliary_loss_mlp": 0.01019242, "balance_loss_clip": 1.04515052, "balance_loss_mlp": 1.01208687, "epoch": 0.8427824204893886, "flos": 23550074467200.0, "grad_norm": 1.4504269348437624, "language_loss": 0.76420665, "learning_rate": 2.535920055890097e-07, "loss": 0.78598142, "num_input_tokens_seen": 151426265, "step": 7009, "time_per_iteration": 2.7235915660858154 }, { "auxiliary_loss_clip": 0.01152976, "auxiliary_loss_mlp": 0.0102332, "balance_loss_clip": 1.0483675, "balance_loss_mlp": 1.01522541, "epoch": 0.8429026633800276, "flos": 16143858120960.0, "grad_norm": 2.7057400915908567, "language_loss": 0.64611745, "learning_rate": 2.5321250242910006e-07, "loss": 0.66788042, "num_input_tokens_seen": 151444180, "step": 7010, "time_per_iteration": 2.7497384548187256 }, { "auxiliary_loss_clip": 0.01166044, "auxiliary_loss_mlp": 0.01024516, "balance_loss_clip": 1.04754138, "balance_loss_mlp": 1.0170238, "epoch": 0.8430229062706668, "flos": 22198540400640.0, "grad_norm": 1.6592837389885697, "language_loss": 0.86995435, "learning_rate": 2.5283326425546493e-07, "loss": 0.89185995, "num_input_tokens_seen": 151463290, "step": 7011, "time_per_iteration": 2.7409956455230713 }, { "auxiliary_loss_clip": 0.01151574, "auxiliary_loss_mlp": 0.01024544, "balance_loss_clip": 1.04933596, "balance_loss_mlp": 1.0176481, "epoch": 0.8431431491613058, "flos": 35330317683840.0, "grad_norm": 2.034877310749859, "language_loss": 0.69644153, "learning_rate": 2.5245429112563443e-07, "loss": 0.71820271, "num_input_tokens_seen": 151483965, "step": 7012, "time_per_iteration": 3.951488494873047 }, { "auxiliary_loss_clip": 0.01163214, "auxiliary_loss_mlp": 0.01025631, "balance_loss_clip": 1.04804516, "balance_loss_mlp": 1.01838613, "epoch": 0.8432633920519449, "flos": 25812374808960.0, "grad_norm": 2.281721257974891, "language_loss": 0.8218652, "learning_rate": 2.5207558309709865e-07, "loss": 0.8437537, "num_input_tokens_seen": 151503700, "step": 7013, "time_per_iteration": 2.7451705932617188 }, { "auxiliary_loss_clip": 0.01064605, "auxiliary_loss_mlp": 0.01034518, "balance_loss_clip": 1.00778306, "balance_loss_mlp": 1.00190771, "epoch": 0.8433836349425841, "flos": 64959531592320.0, "grad_norm": 0.6584386572891955, "language_loss": 0.56202579, "learning_rate": 2.516971402273065e-07, "loss": 0.58301699, "num_input_tokens_seen": 151569765, "step": 7014, "time_per_iteration": 4.249814987182617 }, { "auxiliary_loss_clip": 0.0115942, "auxiliary_loss_mlp": 0.01026976, "balance_loss_clip": 1.04564703, "balance_loss_mlp": 1.01984429, "epoch": 0.8435038778332231, "flos": 20229989483520.0, "grad_norm": 1.7783111491624912, "language_loss": 0.68046099, "learning_rate": 2.513189625736687e-07, "loss": 0.70232499, "num_input_tokens_seen": 151586660, "step": 7015, "time_per_iteration": 2.71315860748291 }, { "auxiliary_loss_clip": 0.01160199, "auxiliary_loss_mlp": 0.01030874, "balance_loss_clip": 1.04542208, "balance_loss_mlp": 1.02295887, "epoch": 0.8436241207238622, "flos": 20992229020800.0, "grad_norm": 2.0123815090048187, "language_loss": 0.71167803, "learning_rate": 2.509410501935534e-07, "loss": 0.73358881, "num_input_tokens_seen": 151602295, "step": 7016, "time_per_iteration": 2.691115617752075 }, { "auxiliary_loss_clip": 0.01164216, "auxiliary_loss_mlp": 0.01025434, "balance_loss_clip": 1.04788089, "balance_loss_mlp": 1.01772988, "epoch": 0.8437443636145013, "flos": 14682257804160.0, "grad_norm": 2.311312301161585, "language_loss": 0.75196779, "learning_rate": 2.5056340314429116e-07, "loss": 0.77386427, "num_input_tokens_seen": 151619760, "step": 7017, "time_per_iteration": 2.6759448051452637 }, { "auxiliary_loss_clip": 0.01159464, "auxiliary_loss_mlp": 0.0102651, "balance_loss_clip": 1.04756892, "balance_loss_mlp": 1.01897645, "epoch": 0.8438646065051404, "flos": 21608814908160.0, "grad_norm": 4.141508648015448, "language_loss": 0.80322754, "learning_rate": 2.5018602148316904e-07, "loss": 0.82508731, "num_input_tokens_seen": 151635795, "step": 7018, "time_per_iteration": 2.7341692447662354 }, { "auxiliary_loss_clip": 0.01163066, "auxiliary_loss_mlp": 0.01026066, "balance_loss_clip": 1.04727006, "balance_loss_mlp": 1.01955426, "epoch": 0.8439848493957794, "flos": 23289937194240.0, "grad_norm": 1.7275460941655558, "language_loss": 0.80164838, "learning_rate": 2.498089052674359e-07, "loss": 0.82353973, "num_input_tokens_seen": 151653770, "step": 7019, "time_per_iteration": 3.5486979484558105 }, { "auxiliary_loss_clip": 0.01164906, "auxiliary_loss_mlp": 0.01023476, "balance_loss_clip": 1.04917955, "balance_loss_mlp": 1.01654696, "epoch": 0.8441050922864186, "flos": 19719339782400.0, "grad_norm": 1.8772492185836211, "language_loss": 0.75164473, "learning_rate": 2.494320545543007e-07, "loss": 0.77352858, "num_input_tokens_seen": 151673340, "step": 7020, "time_per_iteration": 2.6834282875061035 }, { "auxiliary_loss_clip": 0.01170199, "auxiliary_loss_mlp": 0.01023775, "balance_loss_clip": 1.04726744, "balance_loss_mlp": 1.01573384, "epoch": 0.8442253351770577, "flos": 21835268202240.0, "grad_norm": 1.9161825982852465, "language_loss": 0.66766101, "learning_rate": 2.490554694009308e-07, "loss": 0.68960077, "num_input_tokens_seen": 151694205, "step": 7021, "time_per_iteration": 2.681995391845703 }, { "auxiliary_loss_clip": 0.01166987, "auxiliary_loss_mlp": 0.0102577, "balance_loss_clip": 1.04670787, "balance_loss_mlp": 1.01863575, "epoch": 0.8443455780676967, "flos": 34346365447680.0, "grad_norm": 1.5950832991844288, "language_loss": 0.78431123, "learning_rate": 2.4867914986445426e-07, "loss": 0.80623883, "num_input_tokens_seen": 151716595, "step": 7022, "time_per_iteration": 2.8992550373077393 }, { "auxiliary_loss_clip": 0.0116425, "auxiliary_loss_mlp": 0.0102335, "balance_loss_clip": 1.04556704, "balance_loss_mlp": 1.01628745, "epoch": 0.8444658209583359, "flos": 48214599281280.0, "grad_norm": 1.953217777734909, "language_loss": 0.71284878, "learning_rate": 2.483030960019581e-07, "loss": 0.73472476, "num_input_tokens_seen": 151740525, "step": 7023, "time_per_iteration": 2.915282964706421 }, { "auxiliary_loss_clip": 0.01058835, "auxiliary_loss_mlp": 0.01001479, "balance_loss_clip": 1.01054549, "balance_loss_mlp": 1.00048947, "epoch": 0.8445860638489749, "flos": 68484773105280.0, "grad_norm": 0.7322376497571085, "language_loss": 0.55423844, "learning_rate": 2.479273078704891e-07, "loss": 0.57484162, "num_input_tokens_seen": 151793890, "step": 7024, "time_per_iteration": 3.180150270462036 }, { "auxiliary_loss_clip": 0.01054286, "auxiliary_loss_mlp": 0.01002715, "balance_loss_clip": 1.01199317, "balance_loss_mlp": 1.00180268, "epoch": 0.844706306739614, "flos": 62833331882880.0, "grad_norm": 0.7810522185061765, "language_loss": 0.64713937, "learning_rate": 2.475517855270552e-07, "loss": 0.66770941, "num_input_tokens_seen": 151853970, "step": 7025, "time_per_iteration": 4.162234544754028 }, { "auxiliary_loss_clip": 0.01165754, "auxiliary_loss_mlp": 0.01020906, "balance_loss_clip": 1.04782987, "balance_loss_mlp": 1.01408124, "epoch": 0.8448265496302532, "flos": 14976114969600.0, "grad_norm": 3.3530078714828466, "language_loss": 0.72868609, "learning_rate": 2.4717652902862143e-07, "loss": 0.75055265, "num_input_tokens_seen": 151872945, "step": 7026, "time_per_iteration": 2.6038758754730225 }, { "auxiliary_loss_clip": 0.01162055, "auxiliary_loss_mlp": 0.01026078, "balance_loss_clip": 1.04442155, "balance_loss_mlp": 1.01890182, "epoch": 0.8449467925208922, "flos": 23441265192960.0, "grad_norm": 1.7843713967190826, "language_loss": 0.81223541, "learning_rate": 2.4680153843211495e-07, "loss": 0.83411676, "num_input_tokens_seen": 151892875, "step": 7027, "time_per_iteration": 2.7025930881500244 }, { "auxiliary_loss_clip": 0.01161862, "auxiliary_loss_mlp": 0.01021261, "balance_loss_clip": 1.05063343, "balance_loss_mlp": 1.01401615, "epoch": 0.8450670354115313, "flos": 22748045639040.0, "grad_norm": 1.7482580410701123, "language_loss": 0.7217263, "learning_rate": 2.464268137944212e-07, "loss": 0.74355751, "num_input_tokens_seen": 151914170, "step": 7028, "time_per_iteration": 2.693554401397705 }, { "auxiliary_loss_clip": 0.01144887, "auxiliary_loss_mlp": 0.01023477, "balance_loss_clip": 1.04795241, "balance_loss_mlp": 1.01563334, "epoch": 0.8451872783021703, "flos": 29825571605760.0, "grad_norm": 1.9633350290943563, "language_loss": 0.78175068, "learning_rate": 2.46052355172385e-07, "loss": 0.80343437, "num_input_tokens_seen": 151932210, "step": 7029, "time_per_iteration": 2.7488739490509033 }, { "auxiliary_loss_clip": 0.01167328, "auxiliary_loss_mlp": 0.0102413, "balance_loss_clip": 1.04618871, "balance_loss_mlp": 1.01663733, "epoch": 0.8453075211928095, "flos": 21870029589120.0, "grad_norm": 1.819374985132603, "language_loss": 0.74597287, "learning_rate": 2.456781626228128e-07, "loss": 0.76788741, "num_input_tokens_seen": 151951715, "step": 7030, "time_per_iteration": 2.648256540298462 }, { "auxiliary_loss_clip": 0.01058334, "auxiliary_loss_mlp": 0.01036749, "balance_loss_clip": 1.00863004, "balance_loss_mlp": 1.00337315, "epoch": 0.8454277640834486, "flos": 58751869288320.0, "grad_norm": 0.9290600987111421, "language_loss": 0.66249907, "learning_rate": 2.453042362024675e-07, "loss": 0.68344986, "num_input_tokens_seen": 152004960, "step": 7031, "time_per_iteration": 3.2720932960510254 }, { "auxiliary_loss_clip": 0.0116473, "auxiliary_loss_mlp": 0.0102498, "balance_loss_clip": 1.04651141, "balance_loss_mlp": 1.01798224, "epoch": 0.8455480069740876, "flos": 27090076469760.0, "grad_norm": 1.668582274785048, "language_loss": 0.73068649, "learning_rate": 2.449305759680751e-07, "loss": 0.75258362, "num_input_tokens_seen": 152026285, "step": 7032, "time_per_iteration": 2.698554754257202 }, { "auxiliary_loss_clip": 0.01155944, "auxiliary_loss_mlp": 0.01025733, "balance_loss_clip": 1.04925251, "balance_loss_mlp": 1.01840484, "epoch": 0.8456682498647268, "flos": 27198670262400.0, "grad_norm": 1.502050699805549, "language_loss": 0.75066555, "learning_rate": 2.445571819763188e-07, "loss": 0.77248228, "num_input_tokens_seen": 152048585, "step": 7033, "time_per_iteration": 2.7993721961975098 }, { "auxiliary_loss_clip": 0.01167207, "auxiliary_loss_mlp": 0.0102445, "balance_loss_clip": 1.04873514, "balance_loss_mlp": 1.01715446, "epoch": 0.8457884927553658, "flos": 20631901737600.0, "grad_norm": 1.7019002721643226, "language_loss": 0.58316928, "learning_rate": 2.4418405428384227e-07, "loss": 0.60508585, "num_input_tokens_seen": 152068795, "step": 7034, "time_per_iteration": 2.7132110595703125 }, { "auxiliary_loss_clip": 0.01162603, "auxiliary_loss_mlp": 0.01057138, "balance_loss_clip": 1.04511261, "balance_loss_mlp": 1.02106452, "epoch": 0.8459087356460049, "flos": 15299023259520.0, "grad_norm": 6.169527719504166, "language_loss": 0.71816838, "learning_rate": 2.4381119294724864e-07, "loss": 0.74036574, "num_input_tokens_seen": 152086240, "step": 7035, "time_per_iteration": 2.6325066089630127 }, { "auxiliary_loss_clip": 0.01164809, "auxiliary_loss_mlp": 0.01022252, "balance_loss_clip": 1.04584646, "balance_loss_mlp": 1.01507902, "epoch": 0.846028978536644, "flos": 18843155326080.0, "grad_norm": 1.8705287278618592, "language_loss": 0.53566611, "learning_rate": 2.434385980231004e-07, "loss": 0.55753684, "num_input_tokens_seen": 152105080, "step": 7036, "time_per_iteration": 2.73309326171875 }, { "auxiliary_loss_clip": 0.01161527, "auxiliary_loss_mlp": 0.01025988, "balance_loss_clip": 1.0459882, "balance_loss_mlp": 1.01909208, "epoch": 0.8461492214272831, "flos": 52661740285440.0, "grad_norm": 1.4907284875390838, "language_loss": 0.654369, "learning_rate": 2.4306626956792043e-07, "loss": 0.6762442, "num_input_tokens_seen": 152130025, "step": 7037, "time_per_iteration": 3.0006051063537598 }, { "auxiliary_loss_clip": 0.0116008, "auxiliary_loss_mlp": 0.01024333, "balance_loss_clip": 1.04427564, "balance_loss_mlp": 1.01727247, "epoch": 0.8462694643179222, "flos": 18588405093120.0, "grad_norm": 1.7712562153600466, "language_loss": 0.75430453, "learning_rate": 2.4269420763819017e-07, "loss": 0.77614862, "num_input_tokens_seen": 152148070, "step": 7038, "time_per_iteration": 3.766233205795288 }, { "auxiliary_loss_clip": 0.01159834, "auxiliary_loss_mlp": 0.01025227, "balance_loss_clip": 1.04451454, "balance_loss_mlp": 1.01846218, "epoch": 0.8463897072085613, "flos": 24387080163840.0, "grad_norm": 2.8875574328875357, "language_loss": 0.83655256, "learning_rate": 2.4232241229035223e-07, "loss": 0.85840321, "num_input_tokens_seen": 152165825, "step": 7039, "time_per_iteration": 2.682194471359253 }, { "auxiliary_loss_clip": 0.01060455, "auxiliary_loss_mlp": 0.01001856, "balance_loss_clip": 1.0073272, "balance_loss_mlp": 1.00088429, "epoch": 0.8465099500992004, "flos": 68702140258560.0, "grad_norm": 0.7425277595409515, "language_loss": 0.56719565, "learning_rate": 2.419508835808064e-07, "loss": 0.5878188, "num_input_tokens_seen": 152222380, "step": 7040, "time_per_iteration": 4.119365692138672 }, { "auxiliary_loss_clip": 0.01157635, "auxiliary_loss_mlp": 0.01024839, "balance_loss_clip": 1.0460031, "balance_loss_mlp": 1.01743662, "epoch": 0.8466301929898394, "flos": 13735724561280.0, "grad_norm": 1.9969600204015052, "language_loss": 0.63077503, "learning_rate": 2.415796215659134e-07, "loss": 0.65259975, "num_input_tokens_seen": 152239085, "step": 7041, "time_per_iteration": 2.7221763134002686 }, { "auxiliary_loss_clip": 0.01159364, "auxiliary_loss_mlp": 0.01032227, "balance_loss_clip": 1.04459953, "balance_loss_mlp": 1.02458251, "epoch": 0.8467504358804786, "flos": 19241260738560.0, "grad_norm": 3.3192766955280475, "language_loss": 0.77045935, "learning_rate": 2.412086263019939e-07, "loss": 0.79237521, "num_input_tokens_seen": 152257110, "step": 7042, "time_per_iteration": 2.6765408515930176 }, { "auxiliary_loss_clip": 0.01161711, "auxiliary_loss_mlp": 0.01024778, "balance_loss_clip": 1.0463376, "balance_loss_mlp": 1.01767015, "epoch": 0.8468706787711177, "flos": 21324115710720.0, "grad_norm": 1.696912474859114, "language_loss": 0.80096817, "learning_rate": 2.408378978453276e-07, "loss": 0.82283306, "num_input_tokens_seen": 152277230, "step": 7043, "time_per_iteration": 2.6883561611175537 }, { "auxiliary_loss_clip": 0.0106012, "auxiliary_loss_mlp": 0.01002551, "balance_loss_clip": 1.00716472, "balance_loss_mlp": 1.00151372, "epoch": 0.8469909216617567, "flos": 64877439058560.0, "grad_norm": 0.8180500995762761, "language_loss": 0.63921487, "learning_rate": 2.404674362521533e-07, "loss": 0.6598416, "num_input_tokens_seen": 152335725, "step": 7044, "time_per_iteration": 3.118992805480957 }, { "auxiliary_loss_clip": 0.01161352, "auxiliary_loss_mlp": 0.01028797, "balance_loss_clip": 1.04700696, "balance_loss_mlp": 1.0211556, "epoch": 0.8471111645523959, "flos": 19280583152640.0, "grad_norm": 2.261650311210781, "language_loss": 0.74561125, "learning_rate": 2.4009724157866997e-07, "loss": 0.76751268, "num_input_tokens_seen": 152352785, "step": 7045, "time_per_iteration": 3.4957566261291504 }, { "auxiliary_loss_clip": 0.01164433, "auxiliary_loss_mlp": 0.0102663, "balance_loss_clip": 1.04700565, "balance_loss_mlp": 1.02007902, "epoch": 0.8472314074430349, "flos": 22015826893440.0, "grad_norm": 1.9848669536082075, "language_loss": 0.76779795, "learning_rate": 2.3972731388103564e-07, "loss": 0.78970861, "num_input_tokens_seen": 152371265, "step": 7046, "time_per_iteration": 2.718735456466675 }, { "auxiliary_loss_clip": 0.0105129, "auxiliary_loss_mlp": 0.0100033, "balance_loss_clip": 1.00885534, "balance_loss_mlp": 0.99933428, "epoch": 0.847351650333674, "flos": 57882580243200.0, "grad_norm": 0.8020664296001199, "language_loss": 0.62323779, "learning_rate": 2.393576532153687e-07, "loss": 0.64375395, "num_input_tokens_seen": 152435050, "step": 7047, "time_per_iteration": 3.3749024868011475 }, { "auxiliary_loss_clip": 0.01058044, "auxiliary_loss_mlp": 0.01002207, "balance_loss_clip": 1.00737715, "balance_loss_mlp": 1.00125301, "epoch": 0.8474718932243132, "flos": 41284238313600.0, "grad_norm": 0.9262467309464367, "language_loss": 0.57774311, "learning_rate": 2.389882596377453e-07, "loss": 0.59834564, "num_input_tokens_seen": 152489315, "step": 7048, "time_per_iteration": 3.1507842540740967 }, { "auxiliary_loss_clip": 0.01160094, "auxiliary_loss_mlp": 0.0102078, "balance_loss_clip": 1.04205906, "balance_loss_mlp": 1.01283741, "epoch": 0.8475921361149522, "flos": 38180906974080.0, "grad_norm": 1.8453787856572037, "language_loss": 0.76365733, "learning_rate": 2.386191332042031e-07, "loss": 0.78546607, "num_input_tokens_seen": 152511210, "step": 7049, "time_per_iteration": 2.9102816581726074 }, { "auxiliary_loss_clip": 0.0116778, "auxiliary_loss_mlp": 0.01023667, "balance_loss_clip": 1.04601276, "balance_loss_mlp": 1.01618958, "epoch": 0.8477123790055913, "flos": 25375054723200.0, "grad_norm": 10.521979507592714, "language_loss": 0.72650123, "learning_rate": 2.3825027397073794e-07, "loss": 0.74841571, "num_input_tokens_seen": 152531685, "step": 7050, "time_per_iteration": 2.6729750633239746 }, { "auxiliary_loss_clip": 0.01161051, "auxiliary_loss_mlp": 0.01025114, "balance_loss_clip": 1.04903102, "balance_loss_mlp": 1.01798558, "epoch": 0.8478326218962304, "flos": 30225185389440.0, "grad_norm": 3.2814238279897996, "language_loss": 0.67268997, "learning_rate": 2.3788168199330515e-07, "loss": 0.69455159, "num_input_tokens_seen": 152553245, "step": 7051, "time_per_iteration": 2.7336466312408447 }, { "auxiliary_loss_clip": 0.01145846, "auxiliary_loss_mlp": 0.01020751, "balance_loss_clip": 1.04286456, "balance_loss_mlp": 1.01348197, "epoch": 0.8479528647868695, "flos": 38213800853760.0, "grad_norm": 1.5792413911509837, "language_loss": 0.72536969, "learning_rate": 2.3751335732782074e-07, "loss": 0.74703562, "num_input_tokens_seen": 152574505, "step": 7052, "time_per_iteration": 3.8001439571380615 }, { "auxiliary_loss_clip": 0.01160998, "auxiliary_loss_mlp": 0.0102319, "balance_loss_clip": 1.04755187, "balance_loss_mlp": 1.01573372, "epoch": 0.8480731076775085, "flos": 20957790856320.0, "grad_norm": 1.9118103445779735, "language_loss": 0.79511547, "learning_rate": 2.371453000301582e-07, "loss": 0.81695741, "num_input_tokens_seen": 152593190, "step": 7053, "time_per_iteration": 2.744137763977051 }, { "auxiliary_loss_clip": 0.01152006, "auxiliary_loss_mlp": 0.01018628, "balance_loss_clip": 1.04487109, "balance_loss_mlp": 1.01152325, "epoch": 0.8481933505681477, "flos": 32596510487040.0, "grad_norm": 1.7855411388347027, "language_loss": 0.7415787, "learning_rate": 2.3677751015615222e-07, "loss": 0.76328504, "num_input_tokens_seen": 152615265, "step": 7054, "time_per_iteration": 2.792837619781494 }, { "auxiliary_loss_clip": 0.01153585, "auxiliary_loss_mlp": 0.01028911, "balance_loss_clip": 1.04903746, "balance_loss_mlp": 1.02128768, "epoch": 0.8483135934587868, "flos": 20741177888640.0, "grad_norm": 2.0309581774150174, "language_loss": 0.85333091, "learning_rate": 2.3640998776159593e-07, "loss": 0.87515581, "num_input_tokens_seen": 152632770, "step": 7055, "time_per_iteration": 2.662710189819336 }, { "auxiliary_loss_clip": 0.01158358, "auxiliary_loss_mlp": 0.01026118, "balance_loss_clip": 1.04744625, "balance_loss_mlp": 1.0189718, "epoch": 0.8484338363494258, "flos": 21653057485440.0, "grad_norm": 1.8979034700336423, "language_loss": 0.81586963, "learning_rate": 2.3604273290224253e-07, "loss": 0.83771443, "num_input_tokens_seen": 152653485, "step": 7056, "time_per_iteration": 2.7009522914886475 }, { "auxiliary_loss_clip": 0.011636, "auxiliary_loss_mlp": 0.01028072, "balance_loss_clip": 1.04929709, "balance_loss_mlp": 1.02044809, "epoch": 0.848554079240065, "flos": 15013964926080.0, "grad_norm": 2.161054984114841, "language_loss": 0.74655581, "learning_rate": 2.356757456338039e-07, "loss": 0.76847255, "num_input_tokens_seen": 152670970, "step": 7057, "time_per_iteration": 2.6718666553497314 }, { "auxiliary_loss_clip": 0.01060038, "auxiliary_loss_mlp": 0.01001869, "balance_loss_clip": 1.01163387, "balance_loss_mlp": 1.00080192, "epoch": 0.848674322130704, "flos": 68060453742720.0, "grad_norm": 0.7505171147270503, "language_loss": 0.59034902, "learning_rate": 2.3530902601195147e-07, "loss": 0.61096811, "num_input_tokens_seen": 152739460, "step": 7058, "time_per_iteration": 3.515972852706909 }, { "auxiliary_loss_clip": 0.01164386, "auxiliary_loss_mlp": 0.01025912, "balance_loss_clip": 1.04773819, "balance_loss_mlp": 1.01857424, "epoch": 0.8487945650213431, "flos": 18475788977280.0, "grad_norm": 2.193954660870486, "language_loss": 0.79318058, "learning_rate": 2.34942574092317e-07, "loss": 0.8150835, "num_input_tokens_seen": 152754710, "step": 7059, "time_per_iteration": 2.8282783031463623 }, { "auxiliary_loss_clip": 0.01166235, "auxiliary_loss_mlp": 0.01032974, "balance_loss_clip": 1.04689932, "balance_loss_mlp": 1.0257138, "epoch": 0.8489148079119821, "flos": 23473189405440.0, "grad_norm": 3.216534183183625, "language_loss": 0.76733261, "learning_rate": 2.3457638993049045e-07, "loss": 0.7893247, "num_input_tokens_seen": 152772700, "step": 7060, "time_per_iteration": 2.795375108718872 }, { "auxiliary_loss_clip": 0.01150796, "auxiliary_loss_mlp": 0.01030164, "balance_loss_clip": 1.04837358, "balance_loss_mlp": 1.02159905, "epoch": 0.8490350508026213, "flos": 19937604775680.0, "grad_norm": 2.352796985557426, "language_loss": 0.6449182, "learning_rate": 2.3421047358202252e-07, "loss": 0.66672778, "num_input_tokens_seen": 152791550, "step": 7061, "time_per_iteration": 2.8636298179626465 }, { "auxiliary_loss_clip": 0.0116504, "auxiliary_loss_mlp": 0.01024132, "balance_loss_clip": 1.04764283, "balance_loss_mlp": 1.01704478, "epoch": 0.8491552936932604, "flos": 24279958828800.0, "grad_norm": 3.207241770904439, "language_loss": 0.82926995, "learning_rate": 2.3384482510242144e-07, "loss": 0.8511616, "num_input_tokens_seen": 152809410, "step": 7062, "time_per_iteration": 2.7257113456726074 }, { "auxiliary_loss_clip": 0.01165862, "auxiliary_loss_mlp": 0.0102553, "balance_loss_clip": 1.04505873, "balance_loss_mlp": 1.01832986, "epoch": 0.8492755365838994, "flos": 22522526098560.0, "grad_norm": 1.9114613043644912, "language_loss": 0.77435291, "learning_rate": 2.3347944454715575e-07, "loss": 0.79626679, "num_input_tokens_seen": 152825800, "step": 7063, "time_per_iteration": 2.6790666580200195 }, { "auxiliary_loss_clip": 0.01168924, "auxiliary_loss_mlp": 0.0102537, "balance_loss_clip": 1.0476315, "balance_loss_mlp": 1.01761889, "epoch": 0.8493957794745386, "flos": 26980441182720.0, "grad_norm": 1.6955852302515106, "language_loss": 0.67311907, "learning_rate": 2.331143319716542e-07, "loss": 0.69506204, "num_input_tokens_seen": 152845330, "step": 7064, "time_per_iteration": 3.6947836875915527 }, { "auxiliary_loss_clip": 0.01161418, "auxiliary_loss_mlp": 0.01025214, "balance_loss_clip": 1.04668379, "balance_loss_mlp": 1.01777565, "epoch": 0.8495160223651776, "flos": 29861985018240.0, "grad_norm": 1.979809603932882, "language_loss": 0.66048861, "learning_rate": 2.3274948743130363e-07, "loss": 0.68235493, "num_input_tokens_seen": 152865165, "step": 7065, "time_per_iteration": 2.7799322605133057 }, { "auxiliary_loss_clip": 0.01164652, "auxiliary_loss_mlp": 0.0102106, "balance_loss_clip": 1.0451448, "balance_loss_mlp": 1.01371717, "epoch": 0.8496362652558167, "flos": 23075443128960.0, "grad_norm": 1.6730107283602929, "language_loss": 0.79305971, "learning_rate": 2.3238491098145085e-07, "loss": 0.81491685, "num_input_tokens_seen": 152884695, "step": 7066, "time_per_iteration": 3.615176200866699 }, { "auxiliary_loss_clip": 0.01161325, "auxiliary_loss_mlp": 0.01026288, "balance_loss_clip": 1.04512143, "balance_loss_mlp": 1.01856935, "epoch": 0.8497565081464559, "flos": 14609107756800.0, "grad_norm": 2.200515667864684, "language_loss": 0.73392665, "learning_rate": 2.3202060267740141e-07, "loss": 0.75580275, "num_input_tokens_seen": 152902220, "step": 7067, "time_per_iteration": 2.652909994125366 }, { "auxiliary_loss_clip": 0.01151645, "auxiliary_loss_mlp": 0.01025823, "balance_loss_clip": 1.04569638, "balance_loss_mlp": 1.0186646, "epoch": 0.8498767510370949, "flos": 21136446126720.0, "grad_norm": 2.372012695121629, "language_loss": 0.76964962, "learning_rate": 2.3165656257442044e-07, "loss": 0.79142433, "num_input_tokens_seen": 152920740, "step": 7068, "time_per_iteration": 2.7757205963134766 }, { "auxiliary_loss_clip": 0.01159167, "auxiliary_loss_mlp": 0.01028293, "balance_loss_clip": 1.0469985, "balance_loss_mlp": 1.0209353, "epoch": 0.849996993927734, "flos": 23654538195840.0, "grad_norm": 1.8539032910061612, "language_loss": 0.90618473, "learning_rate": 2.31292790727734e-07, "loss": 0.92805934, "num_input_tokens_seen": 152938305, "step": 7069, "time_per_iteration": 2.7266483306884766 }, { "auxiliary_loss_clip": 0.01162183, "auxiliary_loss_mlp": 0.01027397, "balance_loss_clip": 1.04440033, "balance_loss_mlp": 1.02005112, "epoch": 0.8501172368183731, "flos": 20558069331840.0, "grad_norm": 2.2251916713399447, "language_loss": 0.80454189, "learning_rate": 2.3092928719252392e-07, "loss": 0.82643771, "num_input_tokens_seen": 152956705, "step": 7070, "time_per_iteration": 2.6630349159240723 }, { "auxiliary_loss_clip": 0.01162968, "auxiliary_loss_mlp": 0.01022531, "balance_loss_clip": 1.04782963, "balance_loss_mlp": 1.01409721, "epoch": 0.8502374797090122, "flos": 22272624201600.0, "grad_norm": 3.878279586155606, "language_loss": 0.78196108, "learning_rate": 2.3056605202393475e-07, "loss": 0.80381608, "num_input_tokens_seen": 152974265, "step": 7071, "time_per_iteration": 3.5185720920562744 }, { "auxiliary_loss_clip": 0.01158635, "auxiliary_loss_mlp": 0.01052965, "balance_loss_clip": 1.04446268, "balance_loss_mlp": 1.01679111, "epoch": 0.8503577225996513, "flos": 23659817495040.0, "grad_norm": 2.540981787158019, "language_loss": 0.67048866, "learning_rate": 2.3020308527706888e-07, "loss": 0.69260466, "num_input_tokens_seen": 152993680, "step": 7072, "time_per_iteration": 2.659386157989502 }, { "auxiliary_loss_clip": 0.0116657, "auxiliary_loss_mlp": 0.01025528, "balance_loss_clip": 1.04759002, "balance_loss_mlp": 1.01823854, "epoch": 0.8504779654902904, "flos": 26758513002240.0, "grad_norm": 1.7704507874013657, "language_loss": 0.89054513, "learning_rate": 2.2984038700698715e-07, "loss": 0.91246605, "num_input_tokens_seen": 153012990, "step": 7073, "time_per_iteration": 2.731623888015747 }, { "auxiliary_loss_clip": 0.01157959, "auxiliary_loss_mlp": 0.01025424, "balance_loss_clip": 1.04565716, "balance_loss_mlp": 1.01820278, "epoch": 0.8505982083809295, "flos": 26468247196800.0, "grad_norm": 1.6077250366988454, "language_loss": 0.79193008, "learning_rate": 2.2947795726871222e-07, "loss": 0.81376392, "num_input_tokens_seen": 153034015, "step": 7074, "time_per_iteration": 2.667261838912964 }, { "auxiliary_loss_clip": 0.01160005, "auxiliary_loss_mlp": 0.0105184, "balance_loss_clip": 1.04874969, "balance_loss_mlp": 1.01762509, "epoch": 0.8507184512715685, "flos": 20303390926080.0, "grad_norm": 2.0213435952476817, "language_loss": 0.8623867, "learning_rate": 2.2911579611722253e-07, "loss": 0.88450515, "num_input_tokens_seen": 153053160, "step": 7075, "time_per_iteration": 2.731065034866333 }, { "auxiliary_loss_clip": 0.01156782, "auxiliary_loss_mlp": 0.01026718, "balance_loss_clip": 1.04598904, "balance_loss_mlp": 1.01904643, "epoch": 0.8508386941622077, "flos": 19025186474880.0, "grad_norm": 2.2254255146193254, "language_loss": 0.87228107, "learning_rate": 2.2875390360745905e-07, "loss": 0.89411604, "num_input_tokens_seen": 153072565, "step": 7076, "time_per_iteration": 2.7338898181915283 }, { "auxiliary_loss_clip": 0.01162486, "auxiliary_loss_mlp": 0.01026984, "balance_loss_clip": 1.04628098, "balance_loss_mlp": 1.01884198, "epoch": 0.8509589370528468, "flos": 16433405654400.0, "grad_norm": 2.302980247196714, "language_loss": 0.77804649, "learning_rate": 2.2839227979432008e-07, "loss": 0.79994118, "num_input_tokens_seen": 153090215, "step": 7077, "time_per_iteration": 3.705533266067505 }, { "auxiliary_loss_clip": 0.01160719, "auxiliary_loss_mlp": 0.01022495, "balance_loss_clip": 1.04530311, "balance_loss_mlp": 1.0153842, "epoch": 0.8510791799434858, "flos": 18259714713600.0, "grad_norm": 2.1395267966971643, "language_loss": 0.85147631, "learning_rate": 2.2803092473266373e-07, "loss": 0.87330842, "num_input_tokens_seen": 153107740, "step": 7078, "time_per_iteration": 2.7036068439483643 }, { "auxiliary_loss_clip": 0.0116707, "auxiliary_loss_mlp": 0.01029201, "balance_loss_clip": 1.0475508, "balance_loss_mlp": 1.02198291, "epoch": 0.851199422834125, "flos": 23441372933760.0, "grad_norm": 2.1796036908828524, "language_loss": 0.8684839, "learning_rate": 2.2766983847730724e-07, "loss": 0.8904466, "num_input_tokens_seen": 153127410, "step": 7079, "time_per_iteration": 2.6800715923309326 }, { "auxiliary_loss_clip": 0.01167739, "auxiliary_loss_mlp": 0.01028965, "balance_loss_clip": 1.0480113, "balance_loss_mlp": 1.02120471, "epoch": 0.851319665724764, "flos": 16289404030080.0, "grad_norm": 1.8802708600184987, "language_loss": 0.66378766, "learning_rate": 2.2730902108302663e-07, "loss": 0.68575472, "num_input_tokens_seen": 153144325, "step": 7080, "time_per_iteration": 2.7607803344726562 }, { "auxiliary_loss_clip": 0.01155828, "auxiliary_loss_mlp": 0.01028568, "balance_loss_clip": 1.04689825, "balance_loss_mlp": 1.02095687, "epoch": 0.8514399086154031, "flos": 18989347680000.0, "grad_norm": 1.6612564754632577, "language_loss": 0.6848473, "learning_rate": 2.269484726045583e-07, "loss": 0.70669127, "num_input_tokens_seen": 153163240, "step": 7081, "time_per_iteration": 2.6993916034698486 }, { "auxiliary_loss_clip": 0.01159871, "auxiliary_loss_mlp": 0.01026457, "balance_loss_clip": 1.04759824, "balance_loss_mlp": 1.01961768, "epoch": 0.8515601515060423, "flos": 24571194301440.0, "grad_norm": 2.93633500951675, "language_loss": 0.78979731, "learning_rate": 2.2658819309659672e-07, "loss": 0.81166053, "num_input_tokens_seen": 153183440, "step": 7082, "time_per_iteration": 2.926323890686035 }, { "auxiliary_loss_clip": 0.01155363, "auxiliary_loss_mlp": 0.01021818, "balance_loss_clip": 1.04582536, "balance_loss_mlp": 1.01534235, "epoch": 0.8516803943966813, "flos": 19529443555200.0, "grad_norm": 1.8141567102350735, "language_loss": 0.84845471, "learning_rate": 2.2622818261379706e-07, "loss": 0.8702265, "num_input_tokens_seen": 153200460, "step": 7083, "time_per_iteration": 2.673196315765381 }, { "auxiliary_loss_clip": 0.01162059, "auxiliary_loss_mlp": 0.01022934, "balance_loss_clip": 1.04996014, "balance_loss_mlp": 1.01548302, "epoch": 0.8518006372873204, "flos": 20265792364800.0, "grad_norm": 2.850018407260595, "language_loss": 0.75104141, "learning_rate": 2.2586844121077142e-07, "loss": 0.77289134, "num_input_tokens_seen": 153218970, "step": 7084, "time_per_iteration": 2.702173948287964 }, { "auxiliary_loss_clip": 0.01156632, "auxiliary_loss_mlp": 0.01024627, "balance_loss_clip": 1.04717672, "balance_loss_mlp": 1.01685452, "epoch": 0.8519208801779595, "flos": 24133227770880.0, "grad_norm": 1.663354025241238, "language_loss": 0.72075111, "learning_rate": 2.2550896894209215e-07, "loss": 0.74256372, "num_input_tokens_seen": 153238485, "step": 7085, "time_per_iteration": 2.8119781017303467 }, { "auxiliary_loss_clip": 0.010588, "auxiliary_loss_mlp": 0.01004125, "balance_loss_clip": 1.0087918, "balance_loss_mlp": 1.00289142, "epoch": 0.8520411230685986, "flos": 63035223252480.0, "grad_norm": 0.6808572230543768, "language_loss": 0.56611186, "learning_rate": 2.2514976586229184e-07, "loss": 0.58674109, "num_input_tokens_seen": 153306430, "step": 7086, "time_per_iteration": 3.4531333446502686 }, { "auxiliary_loss_clip": 0.0106144, "auxiliary_loss_mlp": 0.01000656, "balance_loss_clip": 1.0089879, "balance_loss_mlp": 0.99972647, "epoch": 0.8521613659592376, "flos": 65836865283840.0, "grad_norm": 0.8038940789240426, "language_loss": 0.54729337, "learning_rate": 2.247908320258609e-07, "loss": 0.56791431, "num_input_tokens_seen": 153366520, "step": 7087, "time_per_iteration": 3.283536672592163 }, { "auxiliary_loss_clip": 0.01146717, "auxiliary_loss_mlp": 0.01024773, "balance_loss_clip": 1.04559708, "balance_loss_mlp": 1.01727462, "epoch": 0.8522816088498768, "flos": 23112323418240.0, "grad_norm": 2.0531122104298083, "language_loss": 0.80155188, "learning_rate": 2.2443216748724914e-07, "loss": 0.82326674, "num_input_tokens_seen": 153387230, "step": 7088, "time_per_iteration": 2.7387053966522217 }, { "auxiliary_loss_clip": 0.01165606, "auxiliary_loss_mlp": 0.01054994, "balance_loss_clip": 1.04826736, "balance_loss_mlp": 1.01963806, "epoch": 0.8524018517405159, "flos": 31758140073600.0, "grad_norm": 7.705773137411858, "language_loss": 0.74411744, "learning_rate": 2.2407377230086588e-07, "loss": 0.76632345, "num_input_tokens_seen": 153409585, "step": 7089, "time_per_iteration": 2.7635507583618164 }, { "auxiliary_loss_clip": 0.01155973, "auxiliary_loss_mlp": 0.01023761, "balance_loss_clip": 1.04938924, "balance_loss_mlp": 1.01652813, "epoch": 0.8525220946311549, "flos": 18690318956160.0, "grad_norm": 2.298377651288019, "language_loss": 0.83769345, "learning_rate": 2.23715646521079e-07, "loss": 0.85949075, "num_input_tokens_seen": 153427105, "step": 7090, "time_per_iteration": 3.619295835494995 }, { "auxiliary_loss_clip": 0.01166731, "auxiliary_loss_mlp": 0.01053388, "balance_loss_clip": 1.04699337, "balance_loss_mlp": 1.01827145, "epoch": 0.852642337521794, "flos": 21793216354560.0, "grad_norm": 1.8082217753600163, "language_loss": 0.8395983, "learning_rate": 2.2335779020221724e-07, "loss": 0.86179948, "num_input_tokens_seen": 153443725, "step": 7091, "time_per_iteration": 2.647270441055298 }, { "auxiliary_loss_clip": 0.01058517, "auxiliary_loss_mlp": 0.01002188, "balance_loss_clip": 1.0123539, "balance_loss_mlp": 1.00129402, "epoch": 0.8527625804124331, "flos": 69040132260480.0, "grad_norm": 0.7971901002873361, "language_loss": 0.56410897, "learning_rate": 2.2300020339856497e-07, "loss": 0.58471608, "num_input_tokens_seen": 153506410, "step": 7092, "time_per_iteration": 4.184038162231445 }, { "auxiliary_loss_clip": 0.01156616, "auxiliary_loss_mlp": 0.01023667, "balance_loss_clip": 1.04405403, "balance_loss_mlp": 1.01690221, "epoch": 0.8528828233030722, "flos": 26979399688320.0, "grad_norm": 1.9986390201275837, "language_loss": 0.77906984, "learning_rate": 2.2264288616436966e-07, "loss": 0.80087268, "num_input_tokens_seen": 153526665, "step": 7093, "time_per_iteration": 2.7326273918151855 }, { "auxiliary_loss_clip": 0.01155748, "auxiliary_loss_mlp": 0.01023347, "balance_loss_clip": 1.04637766, "balance_loss_mlp": 1.01622725, "epoch": 0.8530030661937112, "flos": 17487598936320.0, "grad_norm": 2.0040402596853055, "language_loss": 0.72600627, "learning_rate": 2.222858385538351e-07, "loss": 0.74779725, "num_input_tokens_seen": 153543465, "step": 7094, "time_per_iteration": 2.692812204360962 }, { "auxiliary_loss_clip": 0.01160729, "auxiliary_loss_mlp": 0.01024224, "balance_loss_clip": 1.04700637, "balance_loss_mlp": 1.0172298, "epoch": 0.8531233090843504, "flos": 22160798184960.0, "grad_norm": 3.02038004363589, "language_loss": 0.68236786, "learning_rate": 2.2192906062112527e-07, "loss": 0.70421743, "num_input_tokens_seen": 153563340, "step": 7095, "time_per_iteration": 2.679861068725586 }, { "auxiliary_loss_clip": 0.01164992, "auxiliary_loss_mlp": 0.01028334, "balance_loss_clip": 1.04502106, "balance_loss_mlp": 1.02104759, "epoch": 0.8532435519749895, "flos": 37635388145280.0, "grad_norm": 1.521150024453762, "language_loss": 0.70845866, "learning_rate": 2.2157255242036377e-07, "loss": 0.73039186, "num_input_tokens_seen": 153587005, "step": 7096, "time_per_iteration": 2.815972328186035 }, { "auxiliary_loss_clip": 0.01154697, "auxiliary_loss_mlp": 0.01023573, "balance_loss_clip": 1.04554296, "balance_loss_mlp": 1.01609278, "epoch": 0.8533637948656285, "flos": 21398163598080.0, "grad_norm": 2.5098453102153457, "language_loss": 0.7436378, "learning_rate": 2.2121631400563135e-07, "loss": 0.76542044, "num_input_tokens_seen": 153606835, "step": 7097, "time_per_iteration": 3.7858126163482666 }, { "auxiliary_loss_clip": 0.01060406, "auxiliary_loss_mlp": 0.010016, "balance_loss_clip": 1.01185298, "balance_loss_mlp": 1.00057507, "epoch": 0.8534840377562677, "flos": 53345122490880.0, "grad_norm": 0.7693427184464546, "language_loss": 0.52986634, "learning_rate": 2.208603454309701e-07, "loss": 0.55048645, "num_input_tokens_seen": 153664925, "step": 7098, "time_per_iteration": 3.1961233615875244 }, { "auxiliary_loss_clip": 0.01154902, "auxiliary_loss_mlp": 0.01025246, "balance_loss_clip": 1.04878354, "balance_loss_mlp": 1.01728249, "epoch": 0.8536042806469067, "flos": 20814148368000.0, "grad_norm": 1.8911776955817499, "language_loss": 0.71152425, "learning_rate": 2.2050464675037994e-07, "loss": 0.73332572, "num_input_tokens_seen": 153683550, "step": 7099, "time_per_iteration": 2.732070207595825 }, { "auxiliary_loss_clip": 0.01157822, "auxiliary_loss_mlp": 0.01025198, "balance_loss_clip": 1.04557562, "balance_loss_mlp": 1.01766682, "epoch": 0.8537245235375458, "flos": 24681368292480.0, "grad_norm": 2.032114607470783, "language_loss": 0.7293731, "learning_rate": 2.2014921801782016e-07, "loss": 0.75120324, "num_input_tokens_seen": 153703040, "step": 7100, "time_per_iteration": 2.8008384704589844 }, { "auxiliary_loss_clip": 0.01161596, "auxiliary_loss_mlp": 0.01024358, "balance_loss_clip": 1.04497969, "balance_loss_mlp": 1.01712251, "epoch": 0.853844766428185, "flos": 24384817607040.0, "grad_norm": 1.8726409017027847, "language_loss": 0.74110061, "learning_rate": 2.1979405928720872e-07, "loss": 0.76296014, "num_input_tokens_seen": 153722695, "step": 7101, "time_per_iteration": 2.7790939807891846 }, { "auxiliary_loss_clip": 0.01161666, "auxiliary_loss_mlp": 0.01023142, "balance_loss_clip": 1.04679191, "balance_loss_mlp": 1.01548278, "epoch": 0.853965009318824, "flos": 20955707867520.0, "grad_norm": 1.443578324301886, "language_loss": 0.79230607, "learning_rate": 2.1943917061242257e-07, "loss": 0.81415421, "num_input_tokens_seen": 153742550, "step": 7102, "time_per_iteration": 2.7449707984924316 }, { "auxiliary_loss_clip": 0.01170587, "auxiliary_loss_mlp": 0.01052163, "balance_loss_clip": 1.04900181, "balance_loss_mlp": 1.01678598, "epoch": 0.8540852522094631, "flos": 24201816791040.0, "grad_norm": 1.7094533479974134, "language_loss": 0.66420346, "learning_rate": 2.1908455204729903e-07, "loss": 0.68643093, "num_input_tokens_seen": 153761700, "step": 7103, "time_per_iteration": 2.718092918395996 }, { "auxiliary_loss_clip": 0.01161917, "auxiliary_loss_mlp": 0.0102835, "balance_loss_clip": 1.04756784, "balance_loss_mlp": 1.02112937, "epoch": 0.8542054951001022, "flos": 25082921410560.0, "grad_norm": 2.1278113235658287, "language_loss": 0.7861656, "learning_rate": 2.1873020364563265e-07, "loss": 0.80806828, "num_input_tokens_seen": 153780765, "step": 7104, "time_per_iteration": 3.607419013977051 }, { "auxiliary_loss_clip": 0.011607, "auxiliary_loss_mlp": 0.01025157, "balance_loss_clip": 1.04726386, "balance_loss_mlp": 1.01787615, "epoch": 0.8543257379907413, "flos": 24316551809280.0, "grad_norm": 2.2447866013912443, "language_loss": 0.76088524, "learning_rate": 2.183761254611789e-07, "loss": 0.78274387, "num_input_tokens_seen": 153801090, "step": 7105, "time_per_iteration": 2.7201225757598877 }, { "auxiliary_loss_clip": 0.01161471, "auxiliary_loss_mlp": 0.0101903, "balance_loss_clip": 1.04744148, "balance_loss_mlp": 1.01221454, "epoch": 0.8544459808813804, "flos": 55286630467200.0, "grad_norm": 1.9896531087446696, "language_loss": 0.70132893, "learning_rate": 2.1802231754764987e-07, "loss": 0.72313398, "num_input_tokens_seen": 153826530, "step": 7106, "time_per_iteration": 2.953253984451294 }, { "auxiliary_loss_clip": 0.01163423, "auxiliary_loss_mlp": 0.01031892, "balance_loss_clip": 1.04665589, "balance_loss_mlp": 1.02407205, "epoch": 0.8545662237720195, "flos": 25776248705280.0, "grad_norm": 1.8164372713385089, "language_loss": 0.76349342, "learning_rate": 2.17668779958718e-07, "loss": 0.78544664, "num_input_tokens_seen": 153849110, "step": 7107, "time_per_iteration": 2.7535526752471924 }, { "auxiliary_loss_clip": 0.01168216, "auxiliary_loss_mlp": 0.01024508, "balance_loss_clip": 1.04843569, "balance_loss_mlp": 1.01760912, "epoch": 0.8546864666626586, "flos": 11108320427520.0, "grad_norm": 2.283267141434433, "language_loss": 0.80645114, "learning_rate": 2.1731551274801553e-07, "loss": 0.82837832, "num_input_tokens_seen": 153865550, "step": 7108, "time_per_iteration": 2.66377854347229 }, { "auxiliary_loss_clip": 0.01160067, "auxiliary_loss_mlp": 0.01026854, "balance_loss_clip": 1.04582703, "balance_loss_mlp": 1.01930785, "epoch": 0.8548067095532976, "flos": 25520169669120.0, "grad_norm": 2.0016085266722583, "language_loss": 0.61330467, "learning_rate": 2.169625159691324e-07, "loss": 0.63517392, "num_input_tokens_seen": 153885425, "step": 7109, "time_per_iteration": 2.817065954208374 }, { "auxiliary_loss_clip": 0.01161835, "auxiliary_loss_mlp": 0.01021578, "balance_loss_clip": 1.046175, "balance_loss_mlp": 1.01414537, "epoch": 0.8549269524439368, "flos": 24717853532160.0, "grad_norm": 2.115729507683127, "language_loss": 0.74279624, "learning_rate": 2.1660978967561784e-07, "loss": 0.76463038, "num_input_tokens_seen": 153904760, "step": 7110, "time_per_iteration": 2.8046088218688965 }, { "auxiliary_loss_clip": 0.01162431, "auxiliary_loss_mlp": 0.01024996, "balance_loss_clip": 1.04366672, "balance_loss_mlp": 1.01749802, "epoch": 0.8550471953345758, "flos": 19825599191040.0, "grad_norm": 2.7984809782205344, "language_loss": 0.79235506, "learning_rate": 2.1625733392098035e-07, "loss": 0.81422931, "num_input_tokens_seen": 153920370, "step": 7111, "time_per_iteration": 2.635699510574341 }, { "auxiliary_loss_clip": 0.01162991, "auxiliary_loss_mlp": 0.01021229, "balance_loss_clip": 1.04478824, "balance_loss_mlp": 1.01469636, "epoch": 0.8551674382252149, "flos": 22820441500800.0, "grad_norm": 1.639173310124653, "language_loss": 0.79604107, "learning_rate": 2.159051487586867e-07, "loss": 0.81788331, "num_input_tokens_seen": 153940500, "step": 7112, "time_per_iteration": 2.6085925102233887 }, { "auxiliary_loss_clip": 0.01163326, "auxiliary_loss_mlp": 0.01026961, "balance_loss_clip": 1.04832029, "balance_loss_mlp": 1.01963568, "epoch": 0.8552876811158541, "flos": 20631255292800.0, "grad_norm": 2.385730341244688, "language_loss": 0.72384012, "learning_rate": 2.155532342421642e-07, "loss": 0.74574298, "num_input_tokens_seen": 153958500, "step": 7113, "time_per_iteration": 2.7663345336914062 }, { "auxiliary_loss_clip": 0.01167481, "auxiliary_loss_mlp": 0.01027602, "balance_loss_clip": 1.04801857, "balance_loss_mlp": 1.02002001, "epoch": 0.8554079240064931, "flos": 23112359331840.0, "grad_norm": 1.9047138940870458, "language_loss": 0.7829923, "learning_rate": 2.1520159042479636e-07, "loss": 0.80494314, "num_input_tokens_seen": 153976790, "step": 7114, "time_per_iteration": 2.6806344985961914 }, { "auxiliary_loss_clip": 0.01162526, "auxiliary_loss_mlp": 0.01023781, "balance_loss_clip": 1.04815698, "balance_loss_mlp": 1.01634264, "epoch": 0.8555281668971322, "flos": 22128047959680.0, "grad_norm": 2.104884010075349, "language_loss": 0.70741916, "learning_rate": 2.148502173599287e-07, "loss": 0.72928226, "num_input_tokens_seen": 153994930, "step": 7115, "time_per_iteration": 2.653862476348877 }, { "auxiliary_loss_clip": 0.01157425, "auxiliary_loss_mlp": 0.01022092, "balance_loss_clip": 1.04826045, "balance_loss_mlp": 1.01472831, "epoch": 0.8556484097877713, "flos": 31139040234240.0, "grad_norm": 1.6863493513807386, "language_loss": 0.65610272, "learning_rate": 2.1449911510086372e-07, "loss": 0.67789793, "num_input_tokens_seen": 154014400, "step": 7116, "time_per_iteration": 2.707271099090576 }, { "auxiliary_loss_clip": 0.01157726, "auxiliary_loss_mlp": 0.0102292, "balance_loss_clip": 1.04390097, "balance_loss_mlp": 1.01531434, "epoch": 0.8557686526784104, "flos": 24316551809280.0, "grad_norm": 2.064180075586323, "language_loss": 0.76587695, "learning_rate": 2.141482837008628e-07, "loss": 0.78768343, "num_input_tokens_seen": 154034940, "step": 7117, "time_per_iteration": 3.597296953201294 }, { "auxiliary_loss_clip": 0.01157653, "auxiliary_loss_mlp": 0.01024654, "balance_loss_clip": 1.04694462, "balance_loss_mlp": 1.01727152, "epoch": 0.8558888955690495, "flos": 17712723427200.0, "grad_norm": 2.7315184529016965, "language_loss": 0.72088277, "learning_rate": 2.1379772321314826e-07, "loss": 0.74270594, "num_input_tokens_seen": 154052985, "step": 7118, "time_per_iteration": 3.6646640300750732 }, { "auxiliary_loss_clip": 0.01143423, "auxiliary_loss_mlp": 0.01030802, "balance_loss_clip": 1.04756093, "balance_loss_mlp": 1.02349472, "epoch": 0.8560091384596886, "flos": 19171702051200.0, "grad_norm": 2.1658893363238856, "language_loss": 0.81621331, "learning_rate": 2.1344743369089802e-07, "loss": 0.83795559, "num_input_tokens_seen": 154068765, "step": 7119, "time_per_iteration": 2.9433419704437256 }, { "auxiliary_loss_clip": 0.01158398, "auxiliary_loss_mlp": 0.0102861, "balance_loss_clip": 1.04695868, "balance_loss_mlp": 1.02197909, "epoch": 0.8561293813503277, "flos": 23914855036800.0, "grad_norm": 1.7539110620563345, "language_loss": 0.82009518, "learning_rate": 2.130974151872522e-07, "loss": 0.84196532, "num_input_tokens_seen": 154089100, "step": 7120, "time_per_iteration": 2.669433832168579 }, { "auxiliary_loss_clip": 0.01156408, "auxiliary_loss_mlp": 0.01024911, "balance_loss_clip": 1.04567862, "balance_loss_mlp": 1.01731706, "epoch": 0.8562496242409667, "flos": 22529206028160.0, "grad_norm": 1.968056128974908, "language_loss": 0.78504521, "learning_rate": 2.1274766775530773e-07, "loss": 0.80685842, "num_input_tokens_seen": 154108965, "step": 7121, "time_per_iteration": 2.700687885284424 }, { "auxiliary_loss_clip": 0.01166571, "auxiliary_loss_mlp": 0.01024887, "balance_loss_clip": 1.04509497, "balance_loss_mlp": 1.01729953, "epoch": 0.8563698671316058, "flos": 14712745472640.0, "grad_norm": 1.9570416978041583, "language_loss": 0.79744363, "learning_rate": 2.1239819144812077e-07, "loss": 0.81935823, "num_input_tokens_seen": 154123425, "step": 7122, "time_per_iteration": 2.6152660846710205 }, { "auxiliary_loss_clip": 0.01154004, "auxiliary_loss_mlp": 0.01021525, "balance_loss_clip": 1.04868865, "balance_loss_mlp": 1.01392579, "epoch": 0.856490110022245, "flos": 39167768211840.0, "grad_norm": 1.6626212131921623, "language_loss": 0.70009458, "learning_rate": 2.1204898631870716e-07, "loss": 0.72184992, "num_input_tokens_seen": 154148315, "step": 7123, "time_per_iteration": 3.8482770919799805 }, { "auxiliary_loss_clip": 0.01160008, "auxiliary_loss_mlp": 0.01023061, "balance_loss_clip": 1.04792714, "balance_loss_mlp": 1.0161674, "epoch": 0.856610352912884, "flos": 29059345658880.0, "grad_norm": 1.9312238843320988, "language_loss": 0.76138049, "learning_rate": 2.1170005242004006e-07, "loss": 0.78321117, "num_input_tokens_seen": 154169665, "step": 7124, "time_per_iteration": 2.8182766437530518 }, { "auxiliary_loss_clip": 0.01163393, "auxiliary_loss_mlp": 0.0102468, "balance_loss_clip": 1.04592204, "balance_loss_mlp": 1.01727712, "epoch": 0.8567305958035231, "flos": 23878333883520.0, "grad_norm": 2.1327929224762134, "language_loss": 0.77523458, "learning_rate": 2.1135138980505384e-07, "loss": 0.79711533, "num_input_tokens_seen": 154190335, "step": 7125, "time_per_iteration": 2.707066535949707 }, { "auxiliary_loss_clip": 0.01154781, "auxiliary_loss_mlp": 0.01026375, "balance_loss_clip": 1.04626799, "balance_loss_mlp": 1.01916254, "epoch": 0.8568508386941622, "flos": 22200120599040.0, "grad_norm": 2.4113074420728875, "language_loss": 0.72265798, "learning_rate": 2.110029985266395e-07, "loss": 0.74446952, "num_input_tokens_seen": 154210040, "step": 7126, "time_per_iteration": 2.729793071746826 }, { "auxiliary_loss_clip": 0.01165181, "auxiliary_loss_mlp": 0.01023435, "balance_loss_clip": 1.04706895, "balance_loss_mlp": 1.01622856, "epoch": 0.8569710815848013, "flos": 17307507121920.0, "grad_norm": 1.5863677340501032, "language_loss": 0.73955536, "learning_rate": 2.1065487863764787e-07, "loss": 0.76144153, "num_input_tokens_seen": 154228385, "step": 7127, "time_per_iteration": 2.5761396884918213 }, { "auxiliary_loss_clip": 0.01148201, "auxiliary_loss_mlp": 0.01027616, "balance_loss_clip": 1.04551303, "balance_loss_mlp": 1.0203476, "epoch": 0.8570913244754403, "flos": 23732285184000.0, "grad_norm": 3.266831267959897, "language_loss": 0.85545015, "learning_rate": 2.1030703019088846e-07, "loss": 0.87720829, "num_input_tokens_seen": 154249015, "step": 7128, "time_per_iteration": 2.8144776821136475 }, { "auxiliary_loss_clip": 0.01157109, "auxiliary_loss_mlp": 0.01021555, "balance_loss_clip": 1.04476368, "balance_loss_mlp": 1.01467633, "epoch": 0.8572115673660795, "flos": 20048748433920.0, "grad_norm": 2.2005351387501526, "language_loss": 0.7095052, "learning_rate": 2.099594532391291e-07, "loss": 0.73129189, "num_input_tokens_seen": 154267700, "step": 7129, "time_per_iteration": 2.777895212173462 }, { "auxiliary_loss_clip": 0.01154321, "auxiliary_loss_mlp": 0.01027873, "balance_loss_clip": 1.04492784, "balance_loss_mlp": 1.02035427, "epoch": 0.8573318102567186, "flos": 27160389342720.0, "grad_norm": 1.5420753405283023, "language_loss": 0.79164404, "learning_rate": 2.0961214783509806e-07, "loss": 0.81346595, "num_input_tokens_seen": 154290580, "step": 7130, "time_per_iteration": 3.777606248855591 }, { "auxiliary_loss_clip": 0.01164788, "auxiliary_loss_mlp": 0.01024021, "balance_loss_clip": 1.04651511, "balance_loss_mlp": 1.01681471, "epoch": 0.8574520531473576, "flos": 24936585402240.0, "grad_norm": 1.7044761745309718, "language_loss": 0.75047302, "learning_rate": 2.0926511403148051e-07, "loss": 0.7723611, "num_input_tokens_seen": 154309545, "step": 7131, "time_per_iteration": 2.805387496948242 }, { "auxiliary_loss_clip": 0.01165518, "auxiliary_loss_mlp": 0.01026217, "balance_loss_clip": 1.04877043, "balance_loss_mlp": 1.01901412, "epoch": 0.8575722960379968, "flos": 18771154513920.0, "grad_norm": 1.7689193022264462, "language_loss": 0.76119626, "learning_rate": 2.0891835188092143e-07, "loss": 0.78311366, "num_input_tokens_seen": 154326545, "step": 7132, "time_per_iteration": 2.734698534011841 }, { "auxiliary_loss_clip": 0.01163355, "auxiliary_loss_mlp": 0.01020832, "balance_loss_clip": 1.0469079, "balance_loss_mlp": 1.01362014, "epoch": 0.8576925389286358, "flos": 22200300167040.0, "grad_norm": 2.6513761931878665, "language_loss": 0.81501579, "learning_rate": 2.0857186143602434e-07, "loss": 0.83685768, "num_input_tokens_seen": 154345190, "step": 7133, "time_per_iteration": 2.8203108310699463 }, { "auxiliary_loss_clip": 0.01150955, "auxiliary_loss_mlp": 0.01027416, "balance_loss_clip": 1.04754853, "balance_loss_mlp": 1.02010798, "epoch": 0.8578127818192749, "flos": 22894345733760.0, "grad_norm": 1.6690511878800205, "language_loss": 0.67209661, "learning_rate": 2.0822564274935094e-07, "loss": 0.69388032, "num_input_tokens_seen": 154364615, "step": 7134, "time_per_iteration": 2.8166580200195312 }, { "auxiliary_loss_clip": 0.01157162, "auxiliary_loss_mlp": 0.01030764, "balance_loss_clip": 1.04744911, "balance_loss_mlp": 1.02316165, "epoch": 0.8579330247099141, "flos": 34824839541120.0, "grad_norm": 1.7137101306406015, "language_loss": 0.66775215, "learning_rate": 2.078796958734239e-07, "loss": 0.68963134, "num_input_tokens_seen": 154387335, "step": 7135, "time_per_iteration": 2.833127737045288 }, { "auxiliary_loss_clip": 0.01164049, "auxiliary_loss_mlp": 0.01023654, "balance_loss_clip": 1.04796457, "balance_loss_mlp": 1.0158999, "epoch": 0.8580532676005531, "flos": 19755681367680.0, "grad_norm": 1.899811362449382, "language_loss": 0.75015187, "learning_rate": 2.0753402086072124e-07, "loss": 0.77202886, "num_input_tokens_seen": 154405965, "step": 7136, "time_per_iteration": 2.686129570007324 }, { "auxiliary_loss_clip": 0.01160044, "auxiliary_loss_mlp": 0.0102923, "balance_loss_clip": 1.0485003, "balance_loss_mlp": 1.02177989, "epoch": 0.8581735104911922, "flos": 22739318634240.0, "grad_norm": 2.4054689314803523, "language_loss": 0.75393283, "learning_rate": 2.071886177636828e-07, "loss": 0.77582556, "num_input_tokens_seen": 154422750, "step": 7137, "time_per_iteration": 2.8142149448394775 }, { "auxiliary_loss_clip": 0.01160663, "auxiliary_loss_mlp": 0.01024558, "balance_loss_clip": 1.04728913, "balance_loss_mlp": 1.01698518, "epoch": 0.8582937533818313, "flos": 23149131880320.0, "grad_norm": 1.7999586367496014, "language_loss": 0.82978225, "learning_rate": 2.0684348663470575e-07, "loss": 0.8516345, "num_input_tokens_seen": 154442930, "step": 7138, "time_per_iteration": 2.6925859451293945 }, { "auxiliary_loss_clip": 0.01160659, "auxiliary_loss_mlp": 0.01028586, "balance_loss_clip": 1.04672766, "balance_loss_mlp": 1.02040863, "epoch": 0.8584139962724704, "flos": 19498668577920.0, "grad_norm": 1.8804364522404047, "language_loss": 0.61813468, "learning_rate": 2.0649862752614555e-07, "loss": 0.64002711, "num_input_tokens_seen": 154461640, "step": 7139, "time_per_iteration": 2.6043083667755127 }, { "auxiliary_loss_clip": 0.01060512, "auxiliary_loss_mlp": 0.01003683, "balance_loss_clip": 1.00687265, "balance_loss_mlp": 1.00263393, "epoch": 0.8585342391631094, "flos": 71276577788160.0, "grad_norm": 0.7540517049122538, "language_loss": 0.56984961, "learning_rate": 2.0615404049031838e-07, "loss": 0.59049153, "num_input_tokens_seen": 154518610, "step": 7140, "time_per_iteration": 3.2118396759033203 }, { "auxiliary_loss_clip": 0.01165506, "auxiliary_loss_mlp": 0.01020797, "balance_loss_clip": 1.04820704, "balance_loss_mlp": 1.0131644, "epoch": 0.8586544820537486, "flos": 10815432929280.0, "grad_norm": 2.668176349616355, "language_loss": 0.78260624, "learning_rate": 2.0580972557949616e-07, "loss": 0.80446929, "num_input_tokens_seen": 154533700, "step": 7141, "time_per_iteration": 2.689624309539795 }, { "auxiliary_loss_clip": 0.01060188, "auxiliary_loss_mlp": 0.01003364, "balance_loss_clip": 1.00676012, "balance_loss_mlp": 1.00232112, "epoch": 0.8587747249443877, "flos": 64811184422400.0, "grad_norm": 0.8137287690227958, "language_loss": 0.54169345, "learning_rate": 2.054656828459125e-07, "loss": 0.56232899, "num_input_tokens_seen": 154597810, "step": 7142, "time_per_iteration": 4.272130012512207 }, { "auxiliary_loss_clip": 0.01153546, "auxiliary_loss_mlp": 0.01023741, "balance_loss_clip": 1.0481782, "balance_loss_mlp": 1.01604021, "epoch": 0.8588949678350267, "flos": 26834607964800.0, "grad_norm": 1.6760496057723047, "language_loss": 0.7741183, "learning_rate": 2.051219123417578e-07, "loss": 0.79589117, "num_input_tokens_seen": 154617870, "step": 7143, "time_per_iteration": 2.828047037124634 }, { "auxiliary_loss_clip": 0.01167112, "auxiliary_loss_mlp": 0.01024353, "balance_loss_clip": 1.04603291, "balance_loss_mlp": 1.01709342, "epoch": 0.8590152107256659, "flos": 26104256726400.0, "grad_norm": 7.601821908509463, "language_loss": 0.59961694, "learning_rate": 2.0477841411918196e-07, "loss": 0.62153161, "num_input_tokens_seen": 154637395, "step": 7144, "time_per_iteration": 3.666792154312134 }, { "auxiliary_loss_clip": 0.01156271, "auxiliary_loss_mlp": 0.01025682, "balance_loss_clip": 1.04322267, "balance_loss_mlp": 1.01872301, "epoch": 0.859135453616305, "flos": 26140885620480.0, "grad_norm": 1.8949990811607809, "language_loss": 0.74756557, "learning_rate": 2.0443518823029326e-07, "loss": 0.7693851, "num_input_tokens_seen": 154657935, "step": 7145, "time_per_iteration": 2.7279250621795654 }, { "auxiliary_loss_clip": 0.01152529, "auxiliary_loss_mlp": 0.01026502, "balance_loss_clip": 1.04654694, "balance_loss_mlp": 1.01924801, "epoch": 0.859255696506944, "flos": 12969319046400.0, "grad_norm": 2.3184858273685633, "language_loss": 0.76328278, "learning_rate": 2.0409223472715854e-07, "loss": 0.7850731, "num_input_tokens_seen": 154675080, "step": 7146, "time_per_iteration": 2.676116943359375 }, { "auxiliary_loss_clip": 0.01153937, "auxiliary_loss_mlp": 0.01051461, "balance_loss_clip": 1.04391909, "balance_loss_mlp": 1.01653075, "epoch": 0.8593759393975832, "flos": 18475753063680.0, "grad_norm": 2.3839994892483047, "language_loss": 0.75203276, "learning_rate": 2.0374955366180434e-07, "loss": 0.77408671, "num_input_tokens_seen": 154692720, "step": 7147, "time_per_iteration": 2.7918403148651123 }, { "auxiliary_loss_clip": 0.01160985, "auxiliary_loss_mlp": 0.01026844, "balance_loss_clip": 1.04565823, "balance_loss_mlp": 1.01937246, "epoch": 0.8594961822882222, "flos": 22200156512640.0, "grad_norm": 1.7094484802112484, "language_loss": 0.72767949, "learning_rate": 2.034071450862147e-07, "loss": 0.74955773, "num_input_tokens_seen": 154710190, "step": 7148, "time_per_iteration": 2.7676138877868652 }, { "auxiliary_loss_clip": 0.01165924, "auxiliary_loss_mlp": 0.01026195, "balance_loss_clip": 1.04830909, "balance_loss_mlp": 1.01865506, "epoch": 0.8596164251788613, "flos": 23294749616640.0, "grad_norm": 2.365038063007189, "language_loss": 0.76692295, "learning_rate": 2.030650090523327e-07, "loss": 0.78884417, "num_input_tokens_seen": 154729380, "step": 7149, "time_per_iteration": 3.6166181564331055 }, { "auxiliary_loss_clip": 0.01157702, "auxiliary_loss_mlp": 0.01025551, "balance_loss_clip": 1.04850078, "balance_loss_mlp": 1.01841009, "epoch": 0.8597366680695004, "flos": 31649905416960.0, "grad_norm": 1.561498831793576, "language_loss": 0.59390533, "learning_rate": 2.0272314561205995e-07, "loss": 0.61573786, "num_input_tokens_seen": 154749775, "step": 7150, "time_per_iteration": 2.8222267627716064 }, { "auxiliary_loss_clip": 0.01152095, "auxiliary_loss_mlp": 0.01027327, "balance_loss_clip": 1.0444839, "balance_loss_mlp": 1.01989424, "epoch": 0.8598569109601395, "flos": 21287738211840.0, "grad_norm": 2.9644569636052194, "language_loss": 0.73126405, "learning_rate": 2.023815548172567e-07, "loss": 0.75305831, "num_input_tokens_seen": 154769845, "step": 7151, "time_per_iteration": 2.7468814849853516 }, { "auxiliary_loss_clip": 0.01163327, "auxiliary_loss_mlp": 0.01026418, "balance_loss_clip": 1.04616618, "balance_loss_mlp": 1.01861584, "epoch": 0.8599771538507786, "flos": 25447809720960.0, "grad_norm": 1.8917978737064216, "language_loss": 0.65753794, "learning_rate": 2.0204023671974267e-07, "loss": 0.67943537, "num_input_tokens_seen": 154789230, "step": 7152, "time_per_iteration": 2.821265459060669 }, { "auxiliary_loss_clip": 0.01157377, "auxiliary_loss_mlp": 0.01026879, "balance_loss_clip": 1.04359484, "balance_loss_mlp": 1.01916599, "epoch": 0.8600973967414177, "flos": 16723958768640.0, "grad_norm": 2.1807909245261845, "language_loss": 0.80972052, "learning_rate": 2.0169919137129532e-07, "loss": 0.83156312, "num_input_tokens_seen": 154807670, "step": 7153, "time_per_iteration": 2.848670482635498 }, { "auxiliary_loss_clip": 0.01165966, "auxiliary_loss_mlp": 0.01026326, "balance_loss_clip": 1.04876876, "balance_loss_mlp": 1.01806521, "epoch": 0.8602176396320568, "flos": 25227928615680.0, "grad_norm": 2.435005281309044, "language_loss": 0.70637274, "learning_rate": 2.013584188236508e-07, "loss": 0.72829568, "num_input_tokens_seen": 154825575, "step": 7154, "time_per_iteration": 2.7156479358673096 }, { "auxiliary_loss_clip": 0.01165715, "auxiliary_loss_mlp": 0.01024915, "balance_loss_clip": 1.04512286, "balance_loss_mlp": 1.01759601, "epoch": 0.8603378825226958, "flos": 20412236113920.0, "grad_norm": 1.8288987923963298, "language_loss": 0.79365849, "learning_rate": 2.0101791912850396e-07, "loss": 0.81556475, "num_input_tokens_seen": 154845115, "step": 7155, "time_per_iteration": 2.7925848960876465 }, { "auxiliary_loss_clip": 0.01160911, "auxiliary_loss_mlp": 0.01023699, "balance_loss_clip": 1.04754686, "balance_loss_mlp": 1.01629591, "epoch": 0.8604581254133349, "flos": 34930201109760.0, "grad_norm": 1.7914505163033707, "language_loss": 0.6401493, "learning_rate": 2.006776923375082e-07, "loss": 0.66199541, "num_input_tokens_seen": 154866770, "step": 7156, "time_per_iteration": 3.795435905456543 }, { "auxiliary_loss_clip": 0.01163694, "auxiliary_loss_mlp": 0.01028337, "balance_loss_clip": 1.04485965, "balance_loss_mlp": 1.0208025, "epoch": 0.860578368303974, "flos": 22596538072320.0, "grad_norm": 1.6354447783153605, "language_loss": 0.71371245, "learning_rate": 2.003377385022764e-07, "loss": 0.73563278, "num_input_tokens_seen": 154885595, "step": 7157, "time_per_iteration": 2.7235212326049805 }, { "auxiliary_loss_clip": 0.01162876, "auxiliary_loss_mlp": 0.01021131, "balance_loss_clip": 1.04747343, "balance_loss_mlp": 1.0137105, "epoch": 0.8606986111946131, "flos": 21324331192320.0, "grad_norm": 1.8365702411681881, "language_loss": 0.77079064, "learning_rate": 1.9999805767437826e-07, "loss": 0.79263067, "num_input_tokens_seen": 154904485, "step": 7158, "time_per_iteration": 2.6535794734954834 }, { "auxiliary_loss_clip": 0.01152209, "auxiliary_loss_mlp": 0.01019993, "balance_loss_clip": 1.04315019, "balance_loss_mlp": 1.01274526, "epoch": 0.8608188540852522, "flos": 28877206769280.0, "grad_norm": 1.9069626039674616, "language_loss": 0.71553981, "learning_rate": 1.9965864990534386e-07, "loss": 0.73726189, "num_input_tokens_seen": 154925010, "step": 7159, "time_per_iteration": 2.8088295459747314 }, { "auxiliary_loss_clip": 0.01149243, "auxiliary_loss_mlp": 0.0102378, "balance_loss_clip": 1.04373813, "balance_loss_mlp": 1.0169847, "epoch": 0.8609390969758913, "flos": 29716187713920.0, "grad_norm": 1.5840402521442978, "language_loss": 0.77661484, "learning_rate": 1.9931951524666092e-07, "loss": 0.79834509, "num_input_tokens_seen": 154946100, "step": 7160, "time_per_iteration": 2.7747459411621094 }, { "auxiliary_loss_clip": 0.01165474, "auxiliary_loss_mlp": 0.01051685, "balance_loss_clip": 1.04593086, "balance_loss_mlp": 1.01636982, "epoch": 0.8610593398665304, "flos": 21249349551360.0, "grad_norm": 1.6860868082120362, "language_loss": 0.81367767, "learning_rate": 1.9898065374977534e-07, "loss": 0.83584929, "num_input_tokens_seen": 154966305, "step": 7161, "time_per_iteration": 2.740100383758545 }, { "auxiliary_loss_clip": 0.0115137, "auxiliary_loss_mlp": 0.01023244, "balance_loss_clip": 1.0432713, "balance_loss_mlp": 1.01706851, "epoch": 0.8611795827571694, "flos": 14830102183680.0, "grad_norm": 2.2026159731558206, "language_loss": 0.73167926, "learning_rate": 1.9864206546609342e-07, "loss": 0.75342542, "num_input_tokens_seen": 154985145, "step": 7162, "time_per_iteration": 2.717754364013672 }, { "auxiliary_loss_clip": 0.01164086, "auxiliary_loss_mlp": 0.01023246, "balance_loss_clip": 1.04476571, "balance_loss_mlp": 1.01592636, "epoch": 0.8612998256478086, "flos": 24243258107520.0, "grad_norm": 1.8651961286647392, "language_loss": 0.8460381, "learning_rate": 1.983037504469771e-07, "loss": 0.86791146, "num_input_tokens_seen": 155003855, "step": 7163, "time_per_iteration": 2.655578374862671 }, { "auxiliary_loss_clip": 0.01163793, "auxiliary_loss_mlp": 0.01025704, "balance_loss_clip": 1.04703593, "balance_loss_mlp": 1.01775861, "epoch": 0.8614200685384477, "flos": 21252653602560.0, "grad_norm": 1.5895150304991046, "language_loss": 0.66888833, "learning_rate": 1.9796570874374984e-07, "loss": 0.69078332, "num_input_tokens_seen": 155023960, "step": 7164, "time_per_iteration": 2.739630699157715 }, { "auxiliary_loss_clip": 0.01162389, "auxiliary_loss_mlp": 0.01020741, "balance_loss_clip": 1.04603457, "balance_loss_mlp": 1.01314163, "epoch": 0.8615403114290867, "flos": 20007738080640.0, "grad_norm": 1.6306833009587984, "language_loss": 0.77572161, "learning_rate": 1.976279404076917e-07, "loss": 0.79755294, "num_input_tokens_seen": 155043360, "step": 7165, "time_per_iteration": 2.797696352005005 }, { "auxiliary_loss_clip": 0.0115585, "auxiliary_loss_mlp": 0.01024452, "balance_loss_clip": 1.04680598, "balance_loss_mlp": 1.01690054, "epoch": 0.8616605543197259, "flos": 29789373674880.0, "grad_norm": 1.928155270840286, "language_loss": 0.76317835, "learning_rate": 1.9729044549004193e-07, "loss": 0.78498137, "num_input_tokens_seen": 155064745, "step": 7166, "time_per_iteration": 2.831599712371826 }, { "auxiliary_loss_clip": 0.0115993, "auxiliary_loss_mlp": 0.01025379, "balance_loss_clip": 1.04633951, "balance_loss_mlp": 1.01841128, "epoch": 0.8617807972103649, "flos": 28911609020160.0, "grad_norm": 1.59505188377023, "language_loss": 0.70159882, "learning_rate": 1.9695322404199822e-07, "loss": 0.72345197, "num_input_tokens_seen": 155086790, "step": 7167, "time_per_iteration": 2.7610533237457275 }, { "auxiliary_loss_clip": 0.01158162, "auxiliary_loss_mlp": 0.01027596, "balance_loss_clip": 1.0460794, "balance_loss_mlp": 1.02032423, "epoch": 0.861901040101004, "flos": 27673804391040.0, "grad_norm": 2.7565727092331445, "language_loss": 0.82227075, "learning_rate": 1.9661627611471654e-07, "loss": 0.84412825, "num_input_tokens_seen": 155106585, "step": 7168, "time_per_iteration": 2.797733783721924 }, { "auxiliary_loss_clip": 0.01165072, "auxiliary_loss_mlp": 0.01023634, "balance_loss_clip": 1.04661465, "balance_loss_mlp": 1.01611805, "epoch": 0.8620212829916432, "flos": 49748056755840.0, "grad_norm": 1.8036458780937952, "language_loss": 0.7014327, "learning_rate": 1.9627960175931246e-07, "loss": 0.72331977, "num_input_tokens_seen": 155131285, "step": 7169, "time_per_iteration": 3.8718180656433105 }, { "auxiliary_loss_clip": 0.01161956, "auxiliary_loss_mlp": 0.01024612, "balance_loss_clip": 1.04674864, "balance_loss_mlp": 1.01810026, "epoch": 0.8621415258822822, "flos": 21138672769920.0, "grad_norm": 1.7606176074518634, "language_loss": 0.74313188, "learning_rate": 1.9594320102685847e-07, "loss": 0.7649976, "num_input_tokens_seen": 155150555, "step": 7170, "time_per_iteration": 2.663115978240967 }, { "auxiliary_loss_clip": 0.01150932, "auxiliary_loss_mlp": 0.01048558, "balance_loss_clip": 1.04595613, "balance_loss_mlp": 1.01431632, "epoch": 0.8622617687729213, "flos": 21689039934720.0, "grad_norm": 2.325714339602276, "language_loss": 0.63843143, "learning_rate": 1.956070739683864e-07, "loss": 0.66042638, "num_input_tokens_seen": 155169890, "step": 7171, "time_per_iteration": 3.7130661010742188 }, { "auxiliary_loss_clip": 0.01143493, "auxiliary_loss_mlp": 0.01027153, "balance_loss_clip": 1.04525089, "balance_loss_mlp": 1.0199852, "epoch": 0.8623820116635604, "flos": 26250592734720.0, "grad_norm": 1.7880739075378362, "language_loss": 0.74465173, "learning_rate": 1.9527122063488678e-07, "loss": 0.76635814, "num_input_tokens_seen": 155191005, "step": 7172, "time_per_iteration": 2.8402700424194336 }, { "auxiliary_loss_clip": 0.01157654, "auxiliary_loss_mlp": 0.01023122, "balance_loss_clip": 1.04390454, "balance_loss_mlp": 1.01559687, "epoch": 0.8625022545541995, "flos": 19647554451840.0, "grad_norm": 5.489961310705217, "language_loss": 0.80726123, "learning_rate": 1.9493564107730755e-07, "loss": 0.82906902, "num_input_tokens_seen": 155211005, "step": 7173, "time_per_iteration": 2.6771907806396484 }, { "auxiliary_loss_clip": 0.01155654, "auxiliary_loss_mlp": 0.01027034, "balance_loss_clip": 1.04556346, "balance_loss_mlp": 1.02014351, "epoch": 0.8626224974448385, "flos": 21908382336000.0, "grad_norm": 2.12795574109357, "language_loss": 0.61313373, "learning_rate": 1.9460033534655684e-07, "loss": 0.63496059, "num_input_tokens_seen": 155230365, "step": 7174, "time_per_iteration": 2.7261009216308594 }, { "auxiliary_loss_clip": 0.0115533, "auxiliary_loss_mlp": 0.01022687, "balance_loss_clip": 1.0451405, "balance_loss_mlp": 1.0150578, "epoch": 0.8627427403354777, "flos": 23331198942720.0, "grad_norm": 1.5505160458385923, "language_loss": 0.83808219, "learning_rate": 1.9426530349349978e-07, "loss": 0.85986239, "num_input_tokens_seen": 155250815, "step": 7175, "time_per_iteration": 3.6261961460113525 }, { "auxiliary_loss_clip": 0.01162367, "auxiliary_loss_mlp": 0.01053254, "balance_loss_clip": 1.04614484, "balance_loss_mlp": 1.01729012, "epoch": 0.8628629832261168, "flos": 16362877299840.0, "grad_norm": 2.595733012353571, "language_loss": 0.64730918, "learning_rate": 1.9393054556896038e-07, "loss": 0.66946542, "num_input_tokens_seen": 155268515, "step": 7176, "time_per_iteration": 2.7653727531433105 }, { "auxiliary_loss_clip": 0.01153536, "auxiliary_loss_mlp": 0.01028235, "balance_loss_clip": 1.04566121, "balance_loss_mlp": 1.02051926, "epoch": 0.8629832261167558, "flos": 28103941756800.0, "grad_norm": 2.235883152623517, "language_loss": 0.68992937, "learning_rate": 1.9359606162372133e-07, "loss": 0.71174705, "num_input_tokens_seen": 155290120, "step": 7177, "time_per_iteration": 2.804795265197754 }, { "auxiliary_loss_clip": 0.011646, "auxiliary_loss_mlp": 0.01025993, "balance_loss_clip": 1.04639363, "balance_loss_mlp": 1.01910555, "epoch": 0.863103469007395, "flos": 20230061310720.0, "grad_norm": 1.651894980470906, "language_loss": 0.70775872, "learning_rate": 1.9326185170852293e-07, "loss": 0.72966468, "num_input_tokens_seen": 155309085, "step": 7178, "time_per_iteration": 2.629856586456299 }, { "auxiliary_loss_clip": 0.01162087, "auxiliary_loss_mlp": 0.01028945, "balance_loss_clip": 1.04815054, "balance_loss_mlp": 1.02194452, "epoch": 0.863223711898034, "flos": 24498547044480.0, "grad_norm": 2.0653852455242165, "language_loss": 0.71880925, "learning_rate": 1.9292791587406598e-07, "loss": 0.74071956, "num_input_tokens_seen": 155327945, "step": 7179, "time_per_iteration": 2.7198421955108643 }, { "auxiliary_loss_clip": 0.01161389, "auxiliary_loss_mlp": 0.0105363, "balance_loss_clip": 1.04629076, "balance_loss_mlp": 1.01848078, "epoch": 0.8633439547886731, "flos": 17675376261120.0, "grad_norm": 2.0959147785338352, "language_loss": 0.86621213, "learning_rate": 1.9259425417100661e-07, "loss": 0.88836229, "num_input_tokens_seen": 155344060, "step": 7180, "time_per_iteration": 2.690364122390747 }, { "auxiliary_loss_clip": 0.01151863, "auxiliary_loss_mlp": 0.01024369, "balance_loss_clip": 1.04617763, "balance_loss_mlp": 1.01689506, "epoch": 0.8634641976793123, "flos": 12895055677440.0, "grad_norm": 2.1023690880060935, "language_loss": 0.74753428, "learning_rate": 1.9226086664996234e-07, "loss": 0.76929659, "num_input_tokens_seen": 155362305, "step": 7181, "time_per_iteration": 2.820688009262085 }, { "auxiliary_loss_clip": 0.0115953, "auxiliary_loss_mlp": 0.01027502, "balance_loss_clip": 1.04642963, "balance_loss_mlp": 1.02042377, "epoch": 0.8635844405699513, "flos": 23878980328320.0, "grad_norm": 2.281339465721464, "language_loss": 0.74058247, "learning_rate": 1.9192775336150712e-07, "loss": 0.76245278, "num_input_tokens_seen": 155382605, "step": 7182, "time_per_iteration": 3.6135544776916504 }, { "auxiliary_loss_clip": 0.0105617, "auxiliary_loss_mlp": 0.01001291, "balance_loss_clip": 1.0075711, "balance_loss_mlp": 1.00021768, "epoch": 0.8637046834605904, "flos": 60453387521280.0, "grad_norm": 0.7537443538912781, "language_loss": 0.56250536, "learning_rate": 1.915949143561739e-07, "loss": 0.58308005, "num_input_tokens_seen": 155437280, "step": 7183, "time_per_iteration": 3.2711308002471924 }, { "auxiliary_loss_clip": 0.01163454, "auxiliary_loss_mlp": 0.01023363, "balance_loss_clip": 1.04732847, "balance_loss_mlp": 1.01613569, "epoch": 0.8638249263512295, "flos": 20558751690240.0, "grad_norm": 1.8411360665139351, "language_loss": 0.78188479, "learning_rate": 1.9126234968445498e-07, "loss": 0.80375296, "num_input_tokens_seen": 155456970, "step": 7184, "time_per_iteration": 2.759690999984741 }, { "auxiliary_loss_clip": 0.01166287, "auxiliary_loss_mlp": 0.01025868, "balance_loss_clip": 1.04665303, "balance_loss_mlp": 1.01846468, "epoch": 0.8639451692418686, "flos": 26615768353920.0, "grad_norm": 1.3880109519084074, "language_loss": 0.67469704, "learning_rate": 1.9093005939679884e-07, "loss": 0.69661856, "num_input_tokens_seen": 155478925, "step": 7185, "time_per_iteration": 2.754906177520752 }, { "auxiliary_loss_clip": 0.01163582, "auxiliary_loss_mlp": 0.01026034, "balance_loss_clip": 1.04802966, "balance_loss_mlp": 1.0181129, "epoch": 0.8640654121325076, "flos": 15122450977920.0, "grad_norm": 1.8566334182563755, "language_loss": 0.76596755, "learning_rate": 1.9059804354361452e-07, "loss": 0.78786367, "num_input_tokens_seen": 155496700, "step": 7186, "time_per_iteration": 2.7657670974731445 }, { "auxiliary_loss_clip": 0.0115551, "auxiliary_loss_mlp": 0.01027503, "balance_loss_clip": 1.04693007, "balance_loss_mlp": 1.02032042, "epoch": 0.8641856550231467, "flos": 31869068250240.0, "grad_norm": 3.234027205662517, "language_loss": 0.70327443, "learning_rate": 1.902663021752684e-07, "loss": 0.72510457, "num_input_tokens_seen": 155518130, "step": 7187, "time_per_iteration": 2.897019386291504 }, { "auxiliary_loss_clip": 0.01170447, "auxiliary_loss_mlp": 0.01031629, "balance_loss_clip": 1.05001462, "balance_loss_mlp": 1.0240413, "epoch": 0.8643058979137859, "flos": 14976545932800.0, "grad_norm": 2.5255906829836707, "language_loss": 0.82543498, "learning_rate": 1.8993483534208556e-07, "loss": 0.84745574, "num_input_tokens_seen": 155537040, "step": 7188, "time_per_iteration": 2.6823995113372803 }, { "auxiliary_loss_clip": 0.011534, "auxiliary_loss_mlp": 0.01023454, "balance_loss_clip": 1.04723191, "balance_loss_mlp": 1.0163399, "epoch": 0.8644261408044249, "flos": 13115726881920.0, "grad_norm": 2.7639492802633048, "language_loss": 0.75093204, "learning_rate": 1.8960364309434884e-07, "loss": 0.77270055, "num_input_tokens_seen": 155554535, "step": 7189, "time_per_iteration": 2.609647035598755 }, { "auxiliary_loss_clip": 0.01146551, "auxiliary_loss_mlp": 0.01055876, "balance_loss_clip": 1.04700541, "balance_loss_mlp": 1.02026713, "epoch": 0.864546383695064, "flos": 20850920916480.0, "grad_norm": 1.8277897458330852, "language_loss": 0.78362393, "learning_rate": 1.8927272548229967e-07, "loss": 0.80564821, "num_input_tokens_seen": 155574225, "step": 7190, "time_per_iteration": 2.771601438522339 }, { "auxiliary_loss_clip": 0.01152922, "auxiliary_loss_mlp": 0.01030607, "balance_loss_clip": 1.04713345, "balance_loss_mlp": 1.02304637, "epoch": 0.8646666265857031, "flos": 21324582587520.0, "grad_norm": 1.5752159407453246, "language_loss": 0.83064187, "learning_rate": 1.8894208255613876e-07, "loss": 0.85247719, "num_input_tokens_seen": 155593540, "step": 7191, "time_per_iteration": 2.752115488052368 }, { "auxiliary_loss_clip": 0.01164218, "auxiliary_loss_mlp": 0.01023363, "balance_loss_clip": 1.04560292, "balance_loss_mlp": 1.01610315, "epoch": 0.8647868694763422, "flos": 19750833031680.0, "grad_norm": 2.2809314178270372, "language_loss": 0.77510452, "learning_rate": 1.8861171436602397e-07, "loss": 0.79698038, "num_input_tokens_seen": 155610655, "step": 7192, "time_per_iteration": 2.701707363128662 }, { "auxiliary_loss_clip": 0.01165126, "auxiliary_loss_mlp": 0.01023165, "balance_loss_clip": 1.04784894, "balance_loss_mlp": 1.0161736, "epoch": 0.8649071123669813, "flos": 26176760328960.0, "grad_norm": 2.1697759359382256, "language_loss": 0.8035624, "learning_rate": 1.882816209620719e-07, "loss": 0.82544529, "num_input_tokens_seen": 155627365, "step": 7193, "time_per_iteration": 2.69345760345459 }, { "auxiliary_loss_clip": 0.01162944, "auxiliary_loss_mlp": 0.01028908, "balance_loss_clip": 1.05014968, "balance_loss_mlp": 1.02158308, "epoch": 0.8650273552576204, "flos": 20302888135680.0, "grad_norm": 1.8911878264165103, "language_loss": 0.76985395, "learning_rate": 1.8795180239435738e-07, "loss": 0.79177248, "num_input_tokens_seen": 155646220, "step": 7194, "time_per_iteration": 2.7037503719329834 }, { "auxiliary_loss_clip": 0.01164542, "auxiliary_loss_mlp": 0.01026753, "balance_loss_clip": 1.04723704, "balance_loss_mlp": 1.01961231, "epoch": 0.8651475981482595, "flos": 23951088881280.0, "grad_norm": 2.823239535289084, "language_loss": 0.75597703, "learning_rate": 1.8762225871291348e-07, "loss": 0.77788997, "num_input_tokens_seen": 155662095, "step": 7195, "time_per_iteration": 3.8019630908966064 }, { "auxiliary_loss_clip": 0.01163791, "auxiliary_loss_mlp": 0.01052717, "balance_loss_clip": 1.04549289, "balance_loss_mlp": 1.01695704, "epoch": 0.8652678410388985, "flos": 21684622561920.0, "grad_norm": 1.675070440881161, "language_loss": 0.81075341, "learning_rate": 1.8729298996773201e-07, "loss": 0.83291847, "num_input_tokens_seen": 155680845, "step": 7196, "time_per_iteration": 2.752291679382324 }, { "auxiliary_loss_clip": 0.01056384, "auxiliary_loss_mlp": 0.01000833, "balance_loss_clip": 1.00800419, "balance_loss_mlp": 0.999951, "epoch": 0.8653880839295377, "flos": 65224660855680.0, "grad_norm": 0.8297545243594139, "language_loss": 0.60878778, "learning_rate": 1.8696399620876301e-07, "loss": 0.62935996, "num_input_tokens_seen": 155737875, "step": 7197, "time_per_iteration": 4.175026893615723 }, { "auxiliary_loss_clip": 0.01158243, "auxiliary_loss_mlp": 0.01028885, "balance_loss_clip": 1.04770565, "balance_loss_mlp": 1.02094531, "epoch": 0.8655083268201768, "flos": 17749172753280.0, "grad_norm": 2.1852489865594564, "language_loss": 0.79574203, "learning_rate": 1.866352774859141e-07, "loss": 0.81761336, "num_input_tokens_seen": 155753100, "step": 7198, "time_per_iteration": 2.7073564529418945 }, { "auxiliary_loss_clip": 0.01160533, "auxiliary_loss_mlp": 0.01021472, "balance_loss_clip": 1.04575849, "balance_loss_mlp": 1.01469457, "epoch": 0.8656285697108158, "flos": 20703974376960.0, "grad_norm": 2.976700212304197, "language_loss": 0.69314694, "learning_rate": 1.8630683384905188e-07, "loss": 0.71496701, "num_input_tokens_seen": 155772430, "step": 7199, "time_per_iteration": 2.697662353515625 }, { "auxiliary_loss_clip": 0.01165859, "auxiliary_loss_mlp": 0.01051455, "balance_loss_clip": 1.0466274, "balance_loss_mlp": 1.01732743, "epoch": 0.865748812601455, "flos": 18653833716480.0, "grad_norm": 2.612093368099651, "language_loss": 0.88589084, "learning_rate": 1.8597866534800045e-07, "loss": 0.90806401, "num_input_tokens_seen": 155787545, "step": 7200, "time_per_iteration": 2.6797616481781006 }, { "auxiliary_loss_clip": 0.01167446, "auxiliary_loss_mlp": 0.01053426, "balance_loss_clip": 1.04870617, "balance_loss_mlp": 1.01748121, "epoch": 0.865869055492094, "flos": 70652554807680.0, "grad_norm": 2.120031175159807, "language_loss": 0.74494565, "learning_rate": 1.8565077203254398e-07, "loss": 0.76715434, "num_input_tokens_seen": 155813005, "step": 7201, "time_per_iteration": 3.958447217941284 }, { "auxiliary_loss_clip": 0.01158115, "auxiliary_loss_mlp": 0.01033057, "balance_loss_clip": 1.05023813, "balance_loss_mlp": 1.02552307, "epoch": 0.8659892983827331, "flos": 17383961220480.0, "grad_norm": 2.6600286091323935, "language_loss": 0.72887301, "learning_rate": 1.8532315395242203e-07, "loss": 0.75078475, "num_input_tokens_seen": 155829455, "step": 7202, "time_per_iteration": 2.6925857067108154 }, { "auxiliary_loss_clip": 0.0115849, "auxiliary_loss_mlp": 0.01024151, "balance_loss_clip": 1.04617214, "balance_loss_mlp": 1.01702523, "epoch": 0.8661095412733723, "flos": 17895221452800.0, "grad_norm": 1.9306309939625594, "language_loss": 0.72124827, "learning_rate": 1.849958111573353e-07, "loss": 0.74307472, "num_input_tokens_seen": 155848060, "step": 7203, "time_per_iteration": 2.742762565612793 }, { "auxiliary_loss_clip": 0.01163463, "auxiliary_loss_mlp": 0.0102425, "balance_loss_clip": 1.04538095, "balance_loss_mlp": 1.01720476, "epoch": 0.8662297841640113, "flos": 18224163227520.0, "grad_norm": 1.6911039420280929, "language_loss": 0.63982618, "learning_rate": 1.8466874369694074e-07, "loss": 0.66170335, "num_input_tokens_seen": 155865755, "step": 7204, "time_per_iteration": 2.606382131576538 }, { "auxiliary_loss_clip": 0.0115878, "auxiliary_loss_mlp": 0.01022174, "balance_loss_clip": 1.04617405, "balance_loss_mlp": 1.01543605, "epoch": 0.8663500270546504, "flos": 16362159027840.0, "grad_norm": 2.532275105106885, "language_loss": 0.70758003, "learning_rate": 1.843419516208542e-07, "loss": 0.72938955, "num_input_tokens_seen": 155882680, "step": 7205, "time_per_iteration": 2.68969988822937 }, { "auxiliary_loss_clip": 0.01167398, "auxiliary_loss_mlp": 0.01029787, "balance_loss_clip": 1.04996133, "balance_loss_mlp": 1.02171648, "epoch": 0.8664702699452895, "flos": 17894431353600.0, "grad_norm": 2.9122571174046192, "language_loss": 0.79604805, "learning_rate": 1.8401543497865047e-07, "loss": 0.81801987, "num_input_tokens_seen": 155900680, "step": 7206, "time_per_iteration": 2.604738473892212 }, { "auxiliary_loss_clip": 0.01162744, "auxiliary_loss_mlp": 0.01052321, "balance_loss_clip": 1.04366779, "balance_loss_mlp": 1.01723289, "epoch": 0.8665905128359286, "flos": 30736373794560.0, "grad_norm": 2.131639944243492, "language_loss": 0.64407146, "learning_rate": 1.836891938198608e-07, "loss": 0.6662221, "num_input_tokens_seen": 155921105, "step": 7207, "time_per_iteration": 3.764038562774658 }, { "auxiliary_loss_clip": 0.01159967, "auxiliary_loss_mlp": 0.01025736, "balance_loss_clip": 1.04715109, "balance_loss_mlp": 1.01869082, "epoch": 0.8667107557265676, "flos": 18656419495680.0, "grad_norm": 2.3445805543773752, "language_loss": 0.70949054, "learning_rate": 1.8336322819397677e-07, "loss": 0.7313475, "num_input_tokens_seen": 155938640, "step": 7208, "time_per_iteration": 2.731086254119873 }, { "auxiliary_loss_clip": 0.0116202, "auxiliary_loss_mlp": 0.01025543, "balance_loss_clip": 1.04644299, "balance_loss_mlp": 1.0178833, "epoch": 0.8668309986172068, "flos": 20083725302400.0, "grad_norm": 2.2080761376189963, "language_loss": 0.62288868, "learning_rate": 1.8303753815044654e-07, "loss": 0.6447643, "num_input_tokens_seen": 155957945, "step": 7209, "time_per_iteration": 2.66054630279541 }, { "auxiliary_loss_clip": 0.01170849, "auxiliary_loss_mlp": 0.01025388, "balance_loss_clip": 1.04845142, "balance_loss_mlp": 1.01758003, "epoch": 0.8669512415078459, "flos": 21615099788160.0, "grad_norm": 2.7931315363866225, "language_loss": 0.71041358, "learning_rate": 1.827121237386773e-07, "loss": 0.73237598, "num_input_tokens_seen": 155975390, "step": 7210, "time_per_iteration": 2.6692986488342285 }, { "auxiliary_loss_clip": 0.01162773, "auxiliary_loss_mlp": 0.01023485, "balance_loss_clip": 1.04746604, "balance_loss_mlp": 1.01553988, "epoch": 0.8670714843984849, "flos": 17703601372800.0, "grad_norm": 2.3376212757211787, "language_loss": 0.75144398, "learning_rate": 1.8238698500803374e-07, "loss": 0.77330655, "num_input_tokens_seen": 155988155, "step": 7211, "time_per_iteration": 2.620891809463501 }, { "auxiliary_loss_clip": 0.01060774, "auxiliary_loss_mlp": 0.0100083, "balance_loss_clip": 1.00658178, "balance_loss_mlp": 0.99987024, "epoch": 0.8671917272891241, "flos": 60705483125760.0, "grad_norm": 0.7193872199289184, "language_loss": 0.5624975, "learning_rate": 1.820621220078391e-07, "loss": 0.58311343, "num_input_tokens_seen": 156052065, "step": 7212, "time_per_iteration": 3.3257358074188232 }, { "auxiliary_loss_clip": 0.0116493, "auxiliary_loss_mlp": 0.01024065, "balance_loss_clip": 1.04491854, "balance_loss_mlp": 1.01687706, "epoch": 0.8673119701797631, "flos": 20451881750400.0, "grad_norm": 1.7777779887651852, "language_loss": 0.6794709, "learning_rate": 1.8173753478737553e-07, "loss": 0.70136082, "num_input_tokens_seen": 156072500, "step": 7213, "time_per_iteration": 2.6559696197509766 }, { "auxiliary_loss_clip": 0.01165475, "auxiliary_loss_mlp": 0.01029908, "balance_loss_clip": 1.04534698, "balance_loss_mlp": 1.02228463, "epoch": 0.8674322130704022, "flos": 19647410797440.0, "grad_norm": 5.774404893820269, "language_loss": 0.79811144, "learning_rate": 1.8141322339588205e-07, "loss": 0.82006526, "num_input_tokens_seen": 156089840, "step": 7214, "time_per_iteration": 2.633787155151367 }, { "auxiliary_loss_clip": 0.01164307, "auxiliary_loss_mlp": 0.01027055, "balance_loss_clip": 1.04730999, "balance_loss_mlp": 1.01991987, "epoch": 0.8675524559610414, "flos": 26025001367040.0, "grad_norm": 1.8471824874195377, "language_loss": 0.6993984, "learning_rate": 1.810891878825569e-07, "loss": 0.72131193, "num_input_tokens_seen": 156109815, "step": 7215, "time_per_iteration": 2.721086025238037 }, { "auxiliary_loss_clip": 0.01159373, "auxiliary_loss_mlp": 0.01028419, "balance_loss_clip": 1.04629111, "balance_loss_mlp": 1.02137375, "epoch": 0.8676726988516804, "flos": 15049444584960.0, "grad_norm": 2.328054688582718, "language_loss": 0.7216748, "learning_rate": 1.8076542829655561e-07, "loss": 0.7435528, "num_input_tokens_seen": 156128620, "step": 7216, "time_per_iteration": 2.6404287815093994 }, { "auxiliary_loss_clip": 0.01157962, "auxiliary_loss_mlp": 0.01031115, "balance_loss_clip": 1.04644704, "balance_loss_mlp": 1.02344418, "epoch": 0.8677929417423195, "flos": 16288111140480.0, "grad_norm": 4.391179345869821, "language_loss": 0.79383039, "learning_rate": 1.8044194468699203e-07, "loss": 0.81572121, "num_input_tokens_seen": 156145930, "step": 7217, "time_per_iteration": 2.723228931427002 }, { "auxiliary_loss_clip": 0.01157251, "auxiliary_loss_mlp": 0.01024675, "balance_loss_clip": 1.04835796, "balance_loss_mlp": 1.01730824, "epoch": 0.8679131846329585, "flos": 18844160906880.0, "grad_norm": 2.278584079976492, "language_loss": 0.75848615, "learning_rate": 1.8011873710293912e-07, "loss": 0.78030539, "num_input_tokens_seen": 156164435, "step": 7218, "time_per_iteration": 2.7695443630218506 }, { "auxiliary_loss_clip": 0.01161117, "auxiliary_loss_mlp": 0.01022738, "balance_loss_clip": 1.04716945, "balance_loss_mlp": 1.01500726, "epoch": 0.8680334275235977, "flos": 33620718890880.0, "grad_norm": 13.500256313461891, "language_loss": 0.69622624, "learning_rate": 1.7979580559342677e-07, "loss": 0.71806484, "num_input_tokens_seen": 156185165, "step": 7219, "time_per_iteration": 2.8526182174682617 }, { "auxiliary_loss_clip": 0.01160571, "auxiliary_loss_mlp": 0.010252, "balance_loss_clip": 1.04922009, "balance_loss_mlp": 1.017892, "epoch": 0.8681536704142367, "flos": 24681152810880.0, "grad_norm": 2.343385620651694, "language_loss": 0.66532207, "learning_rate": 1.7947315020744358e-07, "loss": 0.6871798, "num_input_tokens_seen": 156206260, "step": 7220, "time_per_iteration": 2.752448558807373 }, { "auxiliary_loss_clip": 0.0115626, "auxiliary_loss_mlp": 0.01024203, "balance_loss_clip": 1.04370797, "balance_loss_mlp": 1.0169524, "epoch": 0.8682739133048758, "flos": 20011042131840.0, "grad_norm": 1.853465879134935, "language_loss": 0.79958165, "learning_rate": 1.7915077099393594e-07, "loss": 0.82138622, "num_input_tokens_seen": 156222860, "step": 7221, "time_per_iteration": 3.60408878326416 }, { "auxiliary_loss_clip": 0.01166363, "auxiliary_loss_mlp": 0.0102594, "balance_loss_clip": 1.04661679, "balance_loss_mlp": 1.01863265, "epoch": 0.868394156195515, "flos": 16654759217280.0, "grad_norm": 1.8248703492354488, "language_loss": 0.7345221, "learning_rate": 1.788286680018083e-07, "loss": 0.75644505, "num_input_tokens_seen": 156241570, "step": 7222, "time_per_iteration": 2.825406789779663 }, { "auxiliary_loss_clip": 0.01162907, "auxiliary_loss_mlp": 0.0102687, "balance_loss_clip": 1.04695559, "balance_loss_mlp": 1.01958084, "epoch": 0.868514399086154, "flos": 28001381448960.0, "grad_norm": 2.0088678738030583, "language_loss": 0.726201, "learning_rate": 1.7850684127992443e-07, "loss": 0.74809879, "num_input_tokens_seen": 156261315, "step": 7223, "time_per_iteration": 3.7651243209838867 }, { "auxiliary_loss_clip": 0.01155591, "auxiliary_loss_mlp": 0.01028328, "balance_loss_clip": 1.04798198, "balance_loss_mlp": 1.02049613, "epoch": 0.8686346419767931, "flos": 20084587228800.0, "grad_norm": 1.5007241614229354, "language_loss": 0.70304751, "learning_rate": 1.7818529087710378e-07, "loss": 0.72488666, "num_input_tokens_seen": 156281670, "step": 7224, "time_per_iteration": 2.7557551860809326 }, { "auxiliary_loss_clip": 0.01158222, "auxiliary_loss_mlp": 0.01053379, "balance_loss_clip": 1.04484582, "balance_loss_mlp": 1.01848066, "epoch": 0.8687548848674322, "flos": 18223516782720.0, "grad_norm": 1.857097365897871, "language_loss": 0.8443858, "learning_rate": 1.7786401684212637e-07, "loss": 0.86650181, "num_input_tokens_seen": 156300500, "step": 7225, "time_per_iteration": 2.7118356227874756 }, { "auxiliary_loss_clip": 0.01061329, "auxiliary_loss_mlp": 0.0100038, "balance_loss_clip": 1.01110148, "balance_loss_mlp": 0.99948561, "epoch": 0.8688751277580713, "flos": 70457885049600.0, "grad_norm": 0.741854806958205, "language_loss": 0.5596559, "learning_rate": 1.7754301922372883e-07, "loss": 0.58027303, "num_input_tokens_seen": 156350145, "step": 7226, "time_per_iteration": 3.1441855430603027 }, { "auxiliary_loss_clip": 0.01155148, "auxiliary_loss_mlp": 0.01026247, "balance_loss_clip": 1.04548049, "balance_loss_mlp": 1.01904964, "epoch": 0.8689953706487104, "flos": 26906788344960.0, "grad_norm": 1.9584091126641698, "language_loss": 0.80996102, "learning_rate": 1.7722229807060617e-07, "loss": 0.83177501, "num_input_tokens_seen": 156368725, "step": 7227, "time_per_iteration": 3.653690814971924 }, { "auxiliary_loss_clip": 0.01150312, "auxiliary_loss_mlp": 0.01022569, "balance_loss_clip": 1.04623187, "balance_loss_mlp": 1.01554465, "epoch": 0.8691156135393495, "flos": 34637385438720.0, "grad_norm": 5.625193570201871, "language_loss": 0.81676126, "learning_rate": 1.7690185343141172e-07, "loss": 0.83849001, "num_input_tokens_seen": 156388640, "step": 7228, "time_per_iteration": 2.7793853282928467 }, { "auxiliary_loss_clip": 0.01160015, "auxiliary_loss_mlp": 0.01022503, "balance_loss_clip": 1.04661274, "balance_loss_mlp": 1.01543689, "epoch": 0.8692358564299886, "flos": 18989814556800.0, "grad_norm": 2.011818836505664, "language_loss": 0.70047224, "learning_rate": 1.7658168535475615e-07, "loss": 0.72229743, "num_input_tokens_seen": 156406425, "step": 7229, "time_per_iteration": 2.733191728591919 }, { "auxiliary_loss_clip": 0.01163648, "auxiliary_loss_mlp": 0.01024034, "balance_loss_clip": 1.04808152, "balance_loss_mlp": 1.01633358, "epoch": 0.8693560993206276, "flos": 30370839039360.0, "grad_norm": 2.0823008418625943, "language_loss": 0.64196932, "learning_rate": 1.7626179388920948e-07, "loss": 0.66384614, "num_input_tokens_seen": 156427705, "step": 7230, "time_per_iteration": 2.8630788326263428 }, { "auxiliary_loss_clip": 0.01157719, "auxiliary_loss_mlp": 0.01051795, "balance_loss_clip": 1.0447998, "balance_loss_mlp": 1.01643944, "epoch": 0.8694763422112668, "flos": 27200430028800.0, "grad_norm": 1.638944388658236, "language_loss": 0.80534637, "learning_rate": 1.7594217908329866e-07, "loss": 0.82744145, "num_input_tokens_seen": 156449890, "step": 7231, "time_per_iteration": 2.7756505012512207 }, { "auxiliary_loss_clip": 0.01151114, "auxiliary_loss_mlp": 0.0101912, "balance_loss_clip": 1.04559743, "balance_loss_mlp": 1.0123136, "epoch": 0.8695965851019059, "flos": 26139161767680.0, "grad_norm": 2.772767940145187, "language_loss": 0.73953587, "learning_rate": 1.7562284098550895e-07, "loss": 0.76123822, "num_input_tokens_seen": 156469600, "step": 7232, "time_per_iteration": 2.7631895542144775 }, { "auxiliary_loss_clip": 0.01059048, "auxiliary_loss_mlp": 0.01004088, "balance_loss_clip": 1.01237512, "balance_loss_mlp": 1.00311053, "epoch": 0.8697168279925449, "flos": 67332616456320.0, "grad_norm": 0.9700469265725351, "language_loss": 0.62215543, "learning_rate": 1.753037796442838e-07, "loss": 0.64278674, "num_input_tokens_seen": 156529040, "step": 7233, "time_per_iteration": 4.164982318878174 }, { "auxiliary_loss_clip": 0.01164941, "auxiliary_loss_mlp": 0.01031165, "balance_loss_clip": 1.04495168, "balance_loss_mlp": 1.0236907, "epoch": 0.8698370708831841, "flos": 19718693337600.0, "grad_norm": 2.507424146380812, "language_loss": 0.75132501, "learning_rate": 1.74984995108024e-07, "loss": 0.7732861, "num_input_tokens_seen": 156546970, "step": 7234, "time_per_iteration": 2.6462273597717285 }, { "auxiliary_loss_clip": 0.01163425, "auxiliary_loss_mlp": 0.01026747, "balance_loss_clip": 1.0465796, "balance_loss_mlp": 1.01887321, "epoch": 0.8699573137738231, "flos": 12859971068160.0, "grad_norm": 3.0294561457249882, "language_loss": 0.8332957, "learning_rate": 1.7466648742508981e-07, "loss": 0.85519743, "num_input_tokens_seen": 156563155, "step": 7235, "time_per_iteration": 2.6663451194763184 }, { "auxiliary_loss_clip": 0.0115689, "auxiliary_loss_mlp": 0.01028424, "balance_loss_clip": 1.04585445, "balance_loss_mlp": 1.02126551, "epoch": 0.8700775566644622, "flos": 17420733768960.0, "grad_norm": 1.8877744278081023, "language_loss": 0.84456575, "learning_rate": 1.7434825664379837e-07, "loss": 0.86641884, "num_input_tokens_seen": 156581660, "step": 7236, "time_per_iteration": 2.6689231395721436 }, { "auxiliary_loss_clip": 0.01163281, "auxiliary_loss_mlp": 0.01023738, "balance_loss_clip": 1.04680085, "balance_loss_mlp": 1.01660383, "epoch": 0.8701977995551013, "flos": 13735221770880.0, "grad_norm": 3.1008340510075594, "language_loss": 0.85838193, "learning_rate": 1.740303028124246e-07, "loss": 0.88025212, "num_input_tokens_seen": 156597720, "step": 7237, "time_per_iteration": 2.6603031158447266 }, { "auxiliary_loss_clip": 0.0114219, "auxiliary_loss_mlp": 0.0102805, "balance_loss_clip": 1.04496896, "balance_loss_mlp": 1.02093339, "epoch": 0.8703180424457404, "flos": 30555707362560.0, "grad_norm": 1.7798515314933427, "language_loss": 0.7581116, "learning_rate": 1.7371262597920212e-07, "loss": 0.779814, "num_input_tokens_seen": 156619780, "step": 7238, "time_per_iteration": 2.8268566131591797 }, { "auxiliary_loss_clip": 0.01151557, "auxiliary_loss_mlp": 0.010311, "balance_loss_clip": 1.04725885, "balance_loss_mlp": 1.0235393, "epoch": 0.8704382853363795, "flos": 19608986223360.0, "grad_norm": 1.7077378279651672, "language_loss": 0.76405692, "learning_rate": 1.7339522619232195e-07, "loss": 0.78588355, "num_input_tokens_seen": 156638160, "step": 7239, "time_per_iteration": 2.74474835395813 }, { "auxiliary_loss_clip": 0.01168025, "auxiliary_loss_mlp": 0.01024733, "balance_loss_clip": 1.04693031, "balance_loss_mlp": 1.01725602, "epoch": 0.8705585282270186, "flos": 26613900846720.0, "grad_norm": 2.041769154646051, "language_loss": 0.75542468, "learning_rate": 1.730781034999338e-07, "loss": 0.77735221, "num_input_tokens_seen": 156659740, "step": 7240, "time_per_iteration": 2.7509374618530273 }, { "auxiliary_loss_clip": 0.01163349, "auxiliary_loss_mlp": 0.0102473, "balance_loss_clip": 1.04779148, "balance_loss_mlp": 1.01779163, "epoch": 0.8706787711176577, "flos": 34090465979520.0, "grad_norm": 2.376982675680446, "language_loss": 0.73279005, "learning_rate": 1.7276125795014497e-07, "loss": 0.75467086, "num_input_tokens_seen": 156678190, "step": 7241, "time_per_iteration": 2.7912850379943848 }, { "auxiliary_loss_clip": 0.01165687, "auxiliary_loss_mlp": 0.01028307, "balance_loss_clip": 1.04718554, "balance_loss_mlp": 1.0208323, "epoch": 0.8707990140082967, "flos": 14611513968000.0, "grad_norm": 2.0087946417381306, "language_loss": 0.67435801, "learning_rate": 1.7244468959102054e-07, "loss": 0.69629788, "num_input_tokens_seen": 156695245, "step": 7242, "time_per_iteration": 2.6562345027923584 }, { "auxiliary_loss_clip": 0.01162521, "auxiliary_loss_mlp": 0.01024659, "balance_loss_clip": 1.04754567, "balance_loss_mlp": 1.01736975, "epoch": 0.8709192568989359, "flos": 20084156265600.0, "grad_norm": 1.982629290751775, "language_loss": 0.8544445, "learning_rate": 1.7212839847058348e-07, "loss": 0.87631631, "num_input_tokens_seen": 156710375, "step": 7243, "time_per_iteration": 2.721613645553589 }, { "auxiliary_loss_clip": 0.01160252, "auxiliary_loss_mlp": 0.01028877, "balance_loss_clip": 1.04697847, "balance_loss_mlp": 1.02149463, "epoch": 0.871039499789575, "flos": 16727083251840.0, "grad_norm": 1.9716822912197751, "language_loss": 0.73599112, "learning_rate": 1.718123846368147e-07, "loss": 0.75788242, "num_input_tokens_seen": 156729420, "step": 7244, "time_per_iteration": 2.710968017578125 }, { "auxiliary_loss_clip": 0.01157719, "auxiliary_loss_mlp": 0.01050528, "balance_loss_clip": 1.04603767, "balance_loss_mlp": 1.01618791, "epoch": 0.871159742680214, "flos": 21068790860160.0, "grad_norm": 1.7930678752772176, "language_loss": 0.71725124, "learning_rate": 1.714966481376543e-07, "loss": 0.73933375, "num_input_tokens_seen": 156746100, "step": 7245, "time_per_iteration": 2.6747260093688965 }, { "auxiliary_loss_clip": 0.01159076, "auxiliary_loss_mlp": 0.01025234, "balance_loss_clip": 1.04384422, "balance_loss_mlp": 1.01760459, "epoch": 0.8712799855708532, "flos": 28256526731520.0, "grad_norm": 4.14742117762589, "language_loss": 0.8289901, "learning_rate": 1.7118118902099797e-07, "loss": 0.85083318, "num_input_tokens_seen": 156764185, "step": 7246, "time_per_iteration": 3.7253403663635254 }, { "auxiliary_loss_clip": 0.01162506, "auxiliary_loss_mlp": 0.01022171, "balance_loss_clip": 1.04691494, "balance_loss_mlp": 1.01480103, "epoch": 0.8714002284614922, "flos": 22236677665920.0, "grad_norm": 1.627466464830523, "language_loss": 0.80741477, "learning_rate": 1.7086600733470146e-07, "loss": 0.82926154, "num_input_tokens_seen": 156784855, "step": 7247, "time_per_iteration": 2.6786983013153076 }, { "auxiliary_loss_clip": 0.01157969, "auxiliary_loss_mlp": 0.0102367, "balance_loss_clip": 1.04537606, "balance_loss_mlp": 1.01647282, "epoch": 0.8715204713521313, "flos": 21431919404160.0, "grad_norm": 1.7875738758001494, "language_loss": 0.76828492, "learning_rate": 1.7055110312657738e-07, "loss": 0.79010129, "num_input_tokens_seen": 156804350, "step": 7248, "time_per_iteration": 2.704986333847046 }, { "auxiliary_loss_clip": 0.0115372, "auxiliary_loss_mlp": 0.01025553, "balance_loss_clip": 1.04670119, "balance_loss_mlp": 1.01812625, "epoch": 0.8716407142427703, "flos": 23440439180160.0, "grad_norm": 4.536281772230197, "language_loss": 0.74270123, "learning_rate": 1.702364764443962e-07, "loss": 0.76449394, "num_input_tokens_seen": 156823425, "step": 7249, "time_per_iteration": 3.6430141925811768 }, { "auxiliary_loss_clip": 0.01149676, "auxiliary_loss_mlp": 0.01022799, "balance_loss_clip": 1.04693186, "balance_loss_mlp": 1.0159564, "epoch": 0.8717609571334095, "flos": 27958683156480.0, "grad_norm": 2.5769506427013043, "language_loss": 0.72188199, "learning_rate": 1.6992212733588685e-07, "loss": 0.74360669, "num_input_tokens_seen": 156843090, "step": 7250, "time_per_iteration": 2.8095521926879883 }, { "auxiliary_loss_clip": 0.01158614, "auxiliary_loss_mlp": 0.01029088, "balance_loss_clip": 1.0479213, "balance_loss_mlp": 1.02169442, "epoch": 0.8718812000240486, "flos": 25479482538240.0, "grad_norm": 1.9082578501312004, "language_loss": 0.74920416, "learning_rate": 1.6960805584873538e-07, "loss": 0.77108115, "num_input_tokens_seen": 156861090, "step": 7251, "time_per_iteration": 2.7048046588897705 }, { "auxiliary_loss_clip": 0.01156174, "auxiliary_loss_mlp": 0.01021873, "balance_loss_clip": 1.04482293, "balance_loss_mlp": 1.01465476, "epoch": 0.8720014429146876, "flos": 23403056100480.0, "grad_norm": 2.8532951438550196, "language_loss": 0.78143895, "learning_rate": 1.6929426203058684e-07, "loss": 0.80321944, "num_input_tokens_seen": 156881515, "step": 7252, "time_per_iteration": 2.7179949283599854 }, { "auxiliary_loss_clip": 0.01169112, "auxiliary_loss_mlp": 0.01054256, "balance_loss_clip": 1.0468626, "balance_loss_mlp": 1.01896811, "epoch": 0.8721216858053268, "flos": 24352821567360.0, "grad_norm": 2.370402320151104, "language_loss": 0.79741162, "learning_rate": 1.689807459290431e-07, "loss": 0.81964529, "num_input_tokens_seen": 156900170, "step": 7253, "time_per_iteration": 3.3420159816741943 }, { "auxiliary_loss_clip": 0.01162334, "auxiliary_loss_mlp": 0.01026077, "balance_loss_clip": 1.04810262, "balance_loss_mlp": 1.01945186, "epoch": 0.8722419286959658, "flos": 33869687034240.0, "grad_norm": 1.8231961374594192, "language_loss": 0.7089119, "learning_rate": 1.6866750759166437e-07, "loss": 0.73079604, "num_input_tokens_seen": 156920150, "step": 7254, "time_per_iteration": 2.6327874660491943 }, { "auxiliary_loss_clip": 0.01154378, "auxiliary_loss_mlp": 0.01028535, "balance_loss_clip": 1.04492831, "balance_loss_mlp": 1.02120924, "epoch": 0.8723621715866049, "flos": 18369385914240.0, "grad_norm": 2.9830409456313958, "language_loss": 0.76874292, "learning_rate": 1.6835454706596865e-07, "loss": 0.79057211, "num_input_tokens_seen": 156937980, "step": 7255, "time_per_iteration": 2.574918031692505 }, { "auxiliary_loss_clip": 0.01168711, "auxiliary_loss_mlp": 0.01024888, "balance_loss_clip": 1.04796934, "balance_loss_mlp": 1.01707947, "epoch": 0.8724824144772441, "flos": 22013348855040.0, "grad_norm": 2.139226927038278, "language_loss": 0.73695594, "learning_rate": 1.680418643994317e-07, "loss": 0.75889194, "num_input_tokens_seen": 156956550, "step": 7256, "time_per_iteration": 2.533632278442383 }, { "auxiliary_loss_clip": 0.01059346, "auxiliary_loss_mlp": 0.0100176, "balance_loss_clip": 1.00697017, "balance_loss_mlp": 1.00079465, "epoch": 0.8726026573678831, "flos": 66698720213760.0, "grad_norm": 0.8836175903770586, "language_loss": 0.64486885, "learning_rate": 1.6772945963948738e-07, "loss": 0.6654799, "num_input_tokens_seen": 157014715, "step": 7257, "time_per_iteration": 3.1494686603546143 }, { "auxiliary_loss_clip": 0.0115694, "auxiliary_loss_mlp": 0.01022424, "balance_loss_clip": 1.04674816, "balance_loss_mlp": 1.01501226, "epoch": 0.8727229002585222, "flos": 13370908078080.0, "grad_norm": 3.2975929374281754, "language_loss": 0.77498543, "learning_rate": 1.6741733283352733e-07, "loss": 0.7967791, "num_input_tokens_seen": 157032320, "step": 7258, "time_per_iteration": 2.660074472427368 }, { "auxiliary_loss_clip": 0.01157772, "auxiliary_loss_mlp": 0.01025387, "balance_loss_clip": 1.0469327, "balance_loss_mlp": 1.01789427, "epoch": 0.8728431431491613, "flos": 21796987282560.0, "grad_norm": 1.545678959830294, "language_loss": 0.84096706, "learning_rate": 1.6710548402890102e-07, "loss": 0.86279863, "num_input_tokens_seen": 157052845, "step": 7259, "time_per_iteration": 3.7335739135742188 }, { "auxiliary_loss_clip": 0.01169733, "auxiliary_loss_mlp": 0.01026155, "balance_loss_clip": 1.04710424, "balance_loss_mlp": 1.01885307, "epoch": 0.8729633860398004, "flos": 36173823742080.0, "grad_norm": 1.792081219981178, "language_loss": 0.66653562, "learning_rate": 1.6679391327291527e-07, "loss": 0.68849444, "num_input_tokens_seen": 157074050, "step": 7260, "time_per_iteration": 2.8179104328155518 }, { "auxiliary_loss_clip": 0.01160751, "auxiliary_loss_mlp": 0.01031053, "balance_loss_clip": 1.04618502, "balance_loss_mlp": 1.02399325, "epoch": 0.8730836289304394, "flos": 16359680989440.0, "grad_norm": 2.758409538599547, "language_loss": 0.68272746, "learning_rate": 1.6648262061283492e-07, "loss": 0.70464551, "num_input_tokens_seen": 157089350, "step": 7261, "time_per_iteration": 2.7208738327026367 }, { "auxiliary_loss_clip": 0.01156857, "auxiliary_loss_mlp": 0.01023163, "balance_loss_clip": 1.04466724, "balance_loss_mlp": 1.01636505, "epoch": 0.8732038718210786, "flos": 21215126868480.0, "grad_norm": 2.2818895955581286, "language_loss": 0.73287868, "learning_rate": 1.6617160609588353e-07, "loss": 0.75467896, "num_input_tokens_seen": 157108525, "step": 7262, "time_per_iteration": 2.788545608520508 }, { "auxiliary_loss_clip": 0.0116539, "auxiliary_loss_mlp": 0.01027122, "balance_loss_clip": 1.04658437, "balance_loss_mlp": 1.01950431, "epoch": 0.8733241147117177, "flos": 16610696208000.0, "grad_norm": 2.104378441149214, "language_loss": 0.71810746, "learning_rate": 1.6586086976924163e-07, "loss": 0.74003261, "num_input_tokens_seen": 157124025, "step": 7263, "time_per_iteration": 2.6377601623535156 }, { "auxiliary_loss_clip": 0.01161829, "auxiliary_loss_mlp": 0.01030223, "balance_loss_clip": 1.04457092, "balance_loss_mlp": 1.02334511, "epoch": 0.8734443576023567, "flos": 20193935207040.0, "grad_norm": 3.1877560439442445, "language_loss": 0.78412724, "learning_rate": 1.6555041168004747e-07, "loss": 0.8060478, "num_input_tokens_seen": 157143345, "step": 7264, "time_per_iteration": 2.660166025161743 }, { "auxiliary_loss_clip": 0.0115808, "auxiliary_loss_mlp": 0.01019463, "balance_loss_clip": 1.048195, "balance_loss_mlp": 1.01238835, "epoch": 0.8735646004929959, "flos": 18041162411520.0, "grad_norm": 1.6406582450501126, "language_loss": 0.6901691, "learning_rate": 1.6524023187539715e-07, "loss": 0.71194458, "num_input_tokens_seen": 157161630, "step": 7265, "time_per_iteration": 2.653726577758789 }, { "auxiliary_loss_clip": 0.01159529, "auxiliary_loss_mlp": 0.0102603, "balance_loss_clip": 1.04624605, "balance_loss_mlp": 1.01835012, "epoch": 0.873684843383635, "flos": 20262344659200.0, "grad_norm": 2.033641693168739, "language_loss": 0.7491042, "learning_rate": 1.649303304023446e-07, "loss": 0.77095985, "num_input_tokens_seen": 157181385, "step": 7266, "time_per_iteration": 2.6746809482574463 }, { "auxiliary_loss_clip": 0.01150077, "auxiliary_loss_mlp": 0.01025589, "balance_loss_clip": 1.04704475, "balance_loss_mlp": 1.01892793, "epoch": 0.873805086274274, "flos": 16947287579520.0, "grad_norm": 1.6635361371731454, "language_loss": 0.78688514, "learning_rate": 1.6462070730790246e-07, "loss": 0.80864179, "num_input_tokens_seen": 157200545, "step": 7267, "time_per_iteration": 2.8102352619171143 }, { "auxiliary_loss_clip": 0.01155959, "auxiliary_loss_mlp": 0.0102333, "balance_loss_clip": 1.04519272, "balance_loss_mlp": 1.01633859, "epoch": 0.8739253291649132, "flos": 18041270152320.0, "grad_norm": 2.8823052892195915, "language_loss": 0.78597844, "learning_rate": 1.6431136263903912e-07, "loss": 0.80777133, "num_input_tokens_seen": 157219545, "step": 7268, "time_per_iteration": 2.680633068084717 }, { "auxiliary_loss_clip": 0.01165912, "auxiliary_loss_mlp": 0.01051354, "balance_loss_clip": 1.04560566, "balance_loss_mlp": 1.01586652, "epoch": 0.8740455720555522, "flos": 21325085377920.0, "grad_norm": 1.947733729353148, "language_loss": 0.73435962, "learning_rate": 1.6400229644268282e-07, "loss": 0.75653231, "num_input_tokens_seen": 157237900, "step": 7269, "time_per_iteration": 2.702875852584839 }, { "auxiliary_loss_clip": 0.01149515, "auxiliary_loss_mlp": 0.01022686, "balance_loss_clip": 1.04786563, "balance_loss_mlp": 1.01520514, "epoch": 0.8741658149461913, "flos": 15158684822400.0, "grad_norm": 1.9593141974497583, "language_loss": 0.81807274, "learning_rate": 1.6369350876571852e-07, "loss": 0.83979475, "num_input_tokens_seen": 157256055, "step": 7270, "time_per_iteration": 2.69069242477417 }, { "auxiliary_loss_clip": 0.01150234, "auxiliary_loss_mlp": 0.01024901, "balance_loss_clip": 1.04580545, "balance_loss_mlp": 1.01774848, "epoch": 0.8742860578368304, "flos": 23039855729280.0, "grad_norm": 1.9319563550603973, "language_loss": 0.815418, "learning_rate": 1.6338499965498874e-07, "loss": 0.83716929, "num_input_tokens_seen": 157274785, "step": 7271, "time_per_iteration": 2.8661751747131348 }, { "auxiliary_loss_clip": 0.01152694, "auxiliary_loss_mlp": 0.01028417, "balance_loss_clip": 1.04605854, "balance_loss_mlp": 1.0208894, "epoch": 0.8744063007274695, "flos": 28145347159680.0, "grad_norm": 1.4555553067667841, "language_loss": 0.77320671, "learning_rate": 1.630767691572943e-07, "loss": 0.79501784, "num_input_tokens_seen": 157294805, "step": 7272, "time_per_iteration": 2.742053270339966 }, { "auxiliary_loss_clip": 0.01060893, "auxiliary_loss_mlp": 0.01002729, "balance_loss_clip": 1.00682676, "balance_loss_mlp": 1.0017451, "epoch": 0.8745265436181086, "flos": 64034076654720.0, "grad_norm": 0.7425287987325248, "language_loss": 0.53462231, "learning_rate": 1.6276881731939306e-07, "loss": 0.55525851, "num_input_tokens_seen": 157356695, "step": 7273, "time_per_iteration": 4.314028978347778 }, { "auxiliary_loss_clip": 0.01157604, "auxiliary_loss_mlp": 0.01021781, "balance_loss_clip": 1.04486895, "balance_loss_mlp": 1.01472414, "epoch": 0.8746467865087477, "flos": 28658618553600.0, "grad_norm": 1.8356414386836208, "language_loss": 0.75286281, "learning_rate": 1.6246114418800193e-07, "loss": 0.77465671, "num_input_tokens_seen": 157376975, "step": 7274, "time_per_iteration": 3.7240312099456787 }, { "auxiliary_loss_clip": 0.01157173, "auxiliary_loss_mlp": 0.01027884, "balance_loss_clip": 1.04619014, "balance_loss_mlp": 1.02074647, "epoch": 0.8747670293993868, "flos": 23985850268160.0, "grad_norm": 1.8369167554219996, "language_loss": 0.76641095, "learning_rate": 1.6215374980979423e-07, "loss": 0.78826147, "num_input_tokens_seen": 157397385, "step": 7275, "time_per_iteration": 2.663574457168579 }, { "auxiliary_loss_clip": 0.01159537, "auxiliary_loss_mlp": 0.0102223, "balance_loss_clip": 1.04856217, "balance_loss_mlp": 1.01522684, "epoch": 0.8748872722900258, "flos": 45221624478720.0, "grad_norm": 2.201953306568744, "language_loss": 0.69179451, "learning_rate": 1.6184663423140133e-07, "loss": 0.71361214, "num_input_tokens_seen": 157417685, "step": 7276, "time_per_iteration": 2.8705692291259766 }, { "auxiliary_loss_clip": 0.0115932, "auxiliary_loss_mlp": 0.01029106, "balance_loss_clip": 1.05022502, "balance_loss_mlp": 1.02212644, "epoch": 0.875007515180665, "flos": 19754280737280.0, "grad_norm": 2.0044282352444753, "language_loss": 0.63921982, "learning_rate": 1.615397974994126e-07, "loss": 0.66110408, "num_input_tokens_seen": 157435490, "step": 7277, "time_per_iteration": 2.8049983978271484 }, { "auxiliary_loss_clip": 0.01162655, "auxiliary_loss_mlp": 0.01024317, "balance_loss_clip": 1.04571366, "balance_loss_mlp": 1.01768637, "epoch": 0.875127758071304, "flos": 22710734386560.0, "grad_norm": 1.7845714386262617, "language_loss": 0.80682039, "learning_rate": 1.6123323966037438e-07, "loss": 0.82869005, "num_input_tokens_seen": 157454010, "step": 7278, "time_per_iteration": 2.70414137840271 }, { "auxiliary_loss_clip": 0.01165542, "auxiliary_loss_mlp": 0.0102894, "balance_loss_clip": 1.04798853, "balance_loss_mlp": 1.02187657, "epoch": 0.8752480009619431, "flos": 23403846199680.0, "grad_norm": 2.0215816294370814, "language_loss": 0.78600764, "learning_rate": 1.6092696076079216e-07, "loss": 0.80795252, "num_input_tokens_seen": 157472385, "step": 7279, "time_per_iteration": 3.5495693683624268 }, { "auxiliary_loss_clip": 0.0114568, "auxiliary_loss_mlp": 0.01024612, "balance_loss_clip": 1.04535842, "balance_loss_mlp": 1.01766229, "epoch": 0.8753682438525822, "flos": 26213101914240.0, "grad_norm": 1.847062942057179, "language_loss": 0.73656368, "learning_rate": 1.6062096084712785e-07, "loss": 0.75826657, "num_input_tokens_seen": 157493735, "step": 7280, "time_per_iteration": 2.746656894683838 }, { "auxiliary_loss_clip": 0.01153654, "auxiliary_loss_mlp": 0.01049327, "balance_loss_clip": 1.04689765, "balance_loss_mlp": 1.01377726, "epoch": 0.8754884867432213, "flos": 23326745656320.0, "grad_norm": 1.7817748023201827, "language_loss": 0.70397413, "learning_rate": 1.6031523996580098e-07, "loss": 0.726004, "num_input_tokens_seen": 157511295, "step": 7281, "time_per_iteration": 2.7625958919525146 }, { "auxiliary_loss_clip": 0.0116423, "auxiliary_loss_mlp": 0.01032327, "balance_loss_clip": 1.0478673, "balance_loss_mlp": 1.02513885, "epoch": 0.8756087296338604, "flos": 12495226412160.0, "grad_norm": 2.308134510901258, "language_loss": 0.66423881, "learning_rate": 1.6000979816318981e-07, "loss": 0.68620431, "num_input_tokens_seen": 157529760, "step": 7282, "time_per_iteration": 2.8267791271209717 }, { "auxiliary_loss_clip": 0.0115924, "auxiliary_loss_mlp": 0.01026368, "balance_loss_clip": 1.04686224, "balance_loss_mlp": 1.01869678, "epoch": 0.8757289725244994, "flos": 18952898353920.0, "grad_norm": 2.0807708561389338, "language_loss": 0.75437486, "learning_rate": 1.5970463548562886e-07, "loss": 0.77623093, "num_input_tokens_seen": 157548915, "step": 7283, "time_per_iteration": 2.684889316558838 }, { "auxiliary_loss_clip": 0.01154414, "auxiliary_loss_mlp": 0.01023267, "balance_loss_clip": 1.04484165, "balance_loss_mlp": 1.01584935, "epoch": 0.8758492154151386, "flos": 25265958140160.0, "grad_norm": 1.6381312384744977, "language_loss": 0.7097224, "learning_rate": 1.5939975197941192e-07, "loss": 0.73149925, "num_input_tokens_seen": 157570570, "step": 7284, "time_per_iteration": 2.8174121379852295 }, { "auxiliary_loss_clip": 0.01060542, "auxiliary_loss_mlp": 0.01002602, "balance_loss_clip": 1.00742865, "balance_loss_mlp": 1.00160658, "epoch": 0.8759694583057777, "flos": 65571664193280.0, "grad_norm": 0.83298533909313, "language_loss": 0.53342438, "learning_rate": 1.5909514769078892e-07, "loss": 0.55405581, "num_input_tokens_seen": 157635675, "step": 7285, "time_per_iteration": 4.34218430519104 }, { "auxiliary_loss_clip": 0.01149869, "auxiliary_loss_mlp": 0.01028795, "balance_loss_clip": 1.04984999, "balance_loss_mlp": 1.02190781, "epoch": 0.8760897011964167, "flos": 25446193608960.0, "grad_norm": 1.752411506328841, "language_loss": 0.77473974, "learning_rate": 1.5879082266596867e-07, "loss": 0.79652643, "num_input_tokens_seen": 157657015, "step": 7286, "time_per_iteration": 2.768460988998413 }, { "auxiliary_loss_clip": 0.0115419, "auxiliary_loss_mlp": 0.01025928, "balance_loss_clip": 1.04485035, "balance_loss_mlp": 1.01895404, "epoch": 0.8762099440870559, "flos": 28984830894720.0, "grad_norm": 1.755122984011333, "language_loss": 0.71883655, "learning_rate": 1.5848677695111645e-07, "loss": 0.74063766, "num_input_tokens_seen": 157678615, "step": 7287, "time_per_iteration": 2.740692615509033 }, { "auxiliary_loss_clip": 0.01164296, "auxiliary_loss_mlp": 0.0102896, "balance_loss_clip": 1.04879999, "balance_loss_mlp": 1.02183747, "epoch": 0.8763301869776949, "flos": 21609461352960.0, "grad_norm": 2.644858319083089, "language_loss": 0.69759119, "learning_rate": 1.5818301059235562e-07, "loss": 0.71952379, "num_input_tokens_seen": 157693790, "step": 7288, "time_per_iteration": 2.749746322631836 }, { "auxiliary_loss_clip": 0.01158366, "auxiliary_loss_mlp": 0.01026626, "balance_loss_clip": 1.04702544, "balance_loss_mlp": 1.01867485, "epoch": 0.876450429868334, "flos": 24644416176000.0, "grad_norm": 1.569395604465888, "language_loss": 0.81244624, "learning_rate": 1.578795236357684e-07, "loss": 0.83429623, "num_input_tokens_seen": 157715255, "step": 7289, "time_per_iteration": 2.766071319580078 }, { "auxiliary_loss_clip": 0.01159212, "auxiliary_loss_mlp": 0.01028396, "balance_loss_clip": 1.04765606, "balance_loss_mlp": 1.02113914, "epoch": 0.8765706727589732, "flos": 20260046188800.0, "grad_norm": 3.376293394970914, "language_loss": 0.85588384, "learning_rate": 1.5757631612739218e-07, "loss": 0.87775999, "num_input_tokens_seen": 157728800, "step": 7290, "time_per_iteration": 2.655994176864624 }, { "auxiliary_loss_clip": 0.01059124, "auxiliary_loss_mlp": 0.01002449, "balance_loss_clip": 1.00684595, "balance_loss_mlp": 1.00152493, "epoch": 0.8766909156496122, "flos": 71371165276800.0, "grad_norm": 0.7739670119244373, "language_loss": 0.61386633, "learning_rate": 1.572733881132242e-07, "loss": 0.63448203, "num_input_tokens_seen": 157789445, "step": 7291, "time_per_iteration": 3.279787063598633 }, { "auxiliary_loss_clip": 0.01058592, "auxiliary_loss_mlp": 0.01001491, "balance_loss_clip": 1.01140833, "balance_loss_mlp": 1.00040615, "epoch": 0.8768111585402513, "flos": 69523490603520.0, "grad_norm": 0.7809400157415145, "language_loss": 0.58505028, "learning_rate": 1.5697073963921814e-07, "loss": 0.60565108, "num_input_tokens_seen": 157848685, "step": 7292, "time_per_iteration": 3.212836265563965 }, { "auxiliary_loss_clip": 0.01164359, "auxiliary_loss_mlp": 0.01024038, "balance_loss_clip": 1.04838204, "balance_loss_mlp": 1.01646197, "epoch": 0.8769314014308904, "flos": 18838558385280.0, "grad_norm": 4.518722378212439, "language_loss": 0.84882885, "learning_rate": 1.566683707512857e-07, "loss": 0.87071288, "num_input_tokens_seen": 157866360, "step": 7293, "time_per_iteration": 2.761385917663574 }, { "auxiliary_loss_clip": 0.01158798, "auxiliary_loss_mlp": 0.01027284, "balance_loss_clip": 1.04730237, "balance_loss_mlp": 1.01908839, "epoch": 0.8770516443215295, "flos": 14976402278400.0, "grad_norm": 2.6451197115864358, "language_loss": 0.79923165, "learning_rate": 1.5636628149529553e-07, "loss": 0.82109249, "num_input_tokens_seen": 157884150, "step": 7294, "time_per_iteration": 2.733107328414917 }, { "auxiliary_loss_clip": 0.01159021, "auxiliary_loss_mlp": 0.01026963, "balance_loss_clip": 1.04595053, "balance_loss_mlp": 1.02027512, "epoch": 0.8771718872121685, "flos": 31649654021760.0, "grad_norm": 2.0671983589280165, "language_loss": 0.79355967, "learning_rate": 1.560644719170743e-07, "loss": 0.81541955, "num_input_tokens_seen": 157905020, "step": 7295, "time_per_iteration": 2.8003740310668945 }, { "auxiliary_loss_clip": 0.01155879, "auxiliary_loss_mlp": 0.01029408, "balance_loss_clip": 1.04627943, "balance_loss_mlp": 1.02153397, "epoch": 0.8772921301028077, "flos": 36095466222720.0, "grad_norm": 1.8456283803575424, "language_loss": 0.72533929, "learning_rate": 1.5576294206240692e-07, "loss": 0.74719214, "num_input_tokens_seen": 157924545, "step": 7296, "time_per_iteration": 2.8086931705474854 }, { "auxiliary_loss_clip": 0.01155881, "auxiliary_loss_mlp": 0.01026073, "balance_loss_clip": 1.04765654, "balance_loss_mlp": 1.01867604, "epoch": 0.8774123729934468, "flos": 57116961849600.0, "grad_norm": 1.6640861188489955, "language_loss": 0.6753782, "learning_rate": 1.5546169197703507e-07, "loss": 0.69719774, "num_input_tokens_seen": 157950820, "step": 7297, "time_per_iteration": 2.98531174659729 }, { "auxiliary_loss_clip": 0.01165032, "auxiliary_loss_mlp": 0.0102651, "balance_loss_clip": 1.04652297, "balance_loss_mlp": 1.01930678, "epoch": 0.8775326158840858, "flos": 23914495900800.0, "grad_norm": 2.463792188231063, "language_loss": 0.7708838, "learning_rate": 1.5516072170665774e-07, "loss": 0.79279917, "num_input_tokens_seen": 157968790, "step": 7298, "time_per_iteration": 3.6084227561950684 }, { "auxiliary_loss_clip": 0.01163071, "auxiliary_loss_mlp": 0.01027648, "balance_loss_clip": 1.04623187, "balance_loss_mlp": 1.02008986, "epoch": 0.877652858774725, "flos": 17123285243520.0, "grad_norm": 1.8780204060213639, "language_loss": 0.86921334, "learning_rate": 1.5486003129693214e-07, "loss": 0.89112055, "num_input_tokens_seen": 157986155, "step": 7299, "time_per_iteration": 2.6704764366149902 }, { "auxiliary_loss_clip": 0.01164937, "auxiliary_loss_mlp": 0.01024988, "balance_loss_clip": 1.04630113, "balance_loss_mlp": 1.01765966, "epoch": 0.877773101665364, "flos": 16508961912960.0, "grad_norm": 1.8348454560346286, "language_loss": 0.78332597, "learning_rate": 1.545596207934725e-07, "loss": 0.80522525, "num_input_tokens_seen": 158004640, "step": 7300, "time_per_iteration": 2.606032371520996 }, { "auxiliary_loss_clip": 0.01154238, "auxiliary_loss_mlp": 0.01021258, "balance_loss_clip": 1.04554176, "balance_loss_mlp": 1.01443315, "epoch": 0.8778933445560031, "flos": 22053209973120.0, "grad_norm": 1.798100035695444, "language_loss": 0.77959299, "learning_rate": 1.5425949024185147e-07, "loss": 0.80134797, "num_input_tokens_seen": 158024665, "step": 7301, "time_per_iteration": 3.684670925140381 }, { "auxiliary_loss_clip": 0.01161399, "auxiliary_loss_mlp": 0.01025108, "balance_loss_clip": 1.04617047, "balance_loss_mlp": 1.01833653, "epoch": 0.8780135874466423, "flos": 22564757514240.0, "grad_norm": 1.8870187532547906, "language_loss": 0.67137825, "learning_rate": 1.5395963968759818e-07, "loss": 0.69324327, "num_input_tokens_seen": 158044940, "step": 7302, "time_per_iteration": 2.666329860687256 }, { "auxiliary_loss_clip": 0.01159124, "auxiliary_loss_mlp": 0.01027873, "balance_loss_clip": 1.04429317, "balance_loss_mlp": 1.02028847, "epoch": 0.8781338303372813, "flos": 61531999073280.0, "grad_norm": 5.267636245688016, "language_loss": 0.64500266, "learning_rate": 1.536600691761998e-07, "loss": 0.66687262, "num_input_tokens_seen": 158070770, "step": 7303, "time_per_iteration": 3.057379961013794 }, { "auxiliary_loss_clip": 0.01160895, "auxiliary_loss_mlp": 0.01023953, "balance_loss_clip": 1.04892254, "balance_loss_mlp": 1.01654446, "epoch": 0.8782540732279204, "flos": 22674751937280.0, "grad_norm": 1.8970306330856312, "language_loss": 0.71554828, "learning_rate": 1.5336077875310084e-07, "loss": 0.73739684, "num_input_tokens_seen": 158089995, "step": 7304, "time_per_iteration": 2.757845878601074 }, { "auxiliary_loss_clip": 0.01158193, "auxiliary_loss_mlp": 0.01023896, "balance_loss_clip": 1.04690158, "balance_loss_mlp": 1.01648116, "epoch": 0.8783743161185595, "flos": 16070348937600.0, "grad_norm": 2.033344794723487, "language_loss": 0.74096, "learning_rate": 1.5306176846370321e-07, "loss": 0.7627809, "num_input_tokens_seen": 158108140, "step": 7305, "time_per_iteration": 3.6212615966796875 }, { "auxiliary_loss_clip": 0.01167956, "auxiliary_loss_mlp": 0.01024055, "balance_loss_clip": 1.04704571, "balance_loss_mlp": 1.01689005, "epoch": 0.8784945590091986, "flos": 26067879227520.0, "grad_norm": 1.9284099720321184, "language_loss": 0.73808181, "learning_rate": 1.5276303835336712e-07, "loss": 0.7600019, "num_input_tokens_seen": 158128680, "step": 7306, "time_per_iteration": 2.774078369140625 }, { "auxiliary_loss_clip": 0.01059938, "auxiliary_loss_mlp": 0.0100066, "balance_loss_clip": 1.0071013, "balance_loss_mlp": 0.99970639, "epoch": 0.8786148018998376, "flos": 62720643939840.0, "grad_norm": 0.760192161425047, "language_loss": 0.53506732, "learning_rate": 1.524645884674094e-07, "loss": 0.55567324, "num_input_tokens_seen": 158185610, "step": 7307, "time_per_iteration": 3.187288284301758 }, { "auxiliary_loss_clip": 0.01168899, "auxiliary_loss_mlp": 0.01059214, "balance_loss_clip": 1.04788589, "balance_loss_mlp": 1.02351189, "epoch": 0.8787350447904768, "flos": 21652734263040.0, "grad_norm": 2.2272043811431943, "language_loss": 0.79305494, "learning_rate": 1.521664188511047e-07, "loss": 0.81533611, "num_input_tokens_seen": 158205635, "step": 7308, "time_per_iteration": 2.682086944580078 }, { "auxiliary_loss_clip": 0.01158241, "auxiliary_loss_mlp": 0.01048436, "balance_loss_clip": 1.04930043, "balance_loss_mlp": 1.01395822, "epoch": 0.8788552876811159, "flos": 25478476957440.0, "grad_norm": 2.1495524806482127, "language_loss": 0.80396402, "learning_rate": 1.518685295496851e-07, "loss": 0.82603085, "num_input_tokens_seen": 158223495, "step": 7309, "time_per_iteration": 2.742492914199829 }, { "auxiliary_loss_clip": 0.01162054, "auxiliary_loss_mlp": 0.0102472, "balance_loss_clip": 1.04513144, "balance_loss_mlp": 1.01768363, "epoch": 0.8789755305717549, "flos": 22310222762880.0, "grad_norm": 1.599119522908622, "language_loss": 0.85556233, "learning_rate": 1.5157092060833975e-07, "loss": 0.87743008, "num_input_tokens_seen": 158243145, "step": 7310, "time_per_iteration": 2.6874873638153076 }, { "auxiliary_loss_clip": 0.01157088, "auxiliary_loss_mlp": 0.01023613, "balance_loss_clip": 1.04505277, "balance_loss_mlp": 1.01661849, "epoch": 0.879095773462394, "flos": 29310971408640.0, "grad_norm": 2.141499783922369, "language_loss": 0.65926105, "learning_rate": 1.5127359207221658e-07, "loss": 0.68106806, "num_input_tokens_seen": 158262625, "step": 7311, "time_per_iteration": 2.8075573444366455 }, { "auxiliary_loss_clip": 0.01145497, "auxiliary_loss_mlp": 0.01025416, "balance_loss_clip": 1.04746747, "balance_loss_mlp": 1.01703, "epoch": 0.8792160163530331, "flos": 16690023394560.0, "grad_norm": 1.8489700267633693, "language_loss": 0.73407638, "learning_rate": 1.5097654398641923e-07, "loss": 0.75578547, "num_input_tokens_seen": 158280530, "step": 7312, "time_per_iteration": 3.594822883605957 }, { "auxiliary_loss_clip": 0.01166615, "auxiliary_loss_mlp": 0.01024878, "balance_loss_clip": 1.0462532, "balance_loss_mlp": 1.01779664, "epoch": 0.8793362592436722, "flos": 24499301230080.0, "grad_norm": 1.373624504997526, "language_loss": 0.72917587, "learning_rate": 1.5067977639601014e-07, "loss": 0.75109082, "num_input_tokens_seen": 158303290, "step": 7313, "time_per_iteration": 2.750692129135132 }, { "auxiliary_loss_clip": 0.0115555, "auxiliary_loss_mlp": 0.01025825, "balance_loss_clip": 1.04499149, "balance_loss_mlp": 1.01915526, "epoch": 0.8794565021343113, "flos": 14538399834240.0, "grad_norm": 2.1549142003684696, "language_loss": 0.71063268, "learning_rate": 1.5038328934600864e-07, "loss": 0.73244637, "num_input_tokens_seen": 158319925, "step": 7314, "time_per_iteration": 2.6361756324768066 }, { "auxiliary_loss_clip": 0.01158083, "auxiliary_loss_mlp": 0.01029539, "balance_loss_clip": 1.04613233, "balance_loss_mlp": 1.02257693, "epoch": 0.8795767450249504, "flos": 39530286224640.0, "grad_norm": 1.8881087126037959, "language_loss": 0.70297307, "learning_rate": 1.5008708288139161e-07, "loss": 0.72484928, "num_input_tokens_seen": 158342285, "step": 7315, "time_per_iteration": 2.8364861011505127 }, { "auxiliary_loss_clip": 0.01160938, "auxiliary_loss_mlp": 0.01025104, "balance_loss_clip": 1.04742146, "balance_loss_mlp": 1.01758206, "epoch": 0.8796969879155895, "flos": 22960672197120.0, "grad_norm": 2.422678526866316, "language_loss": 0.7316767, "learning_rate": 1.497911570470931e-07, "loss": 0.75353718, "num_input_tokens_seen": 158362290, "step": 7316, "time_per_iteration": 2.725088596343994 }, { "auxiliary_loss_clip": 0.01144238, "auxiliary_loss_mlp": 0.01023426, "balance_loss_clip": 1.0441196, "balance_loss_mlp": 1.01617861, "epoch": 0.8798172308062285, "flos": 28362427004160.0, "grad_norm": 1.9031293022175098, "language_loss": 0.85719043, "learning_rate": 1.494955118880048e-07, "loss": 0.87886709, "num_input_tokens_seen": 158383275, "step": 7317, "time_per_iteration": 2.8213460445404053 }, { "auxiliary_loss_clip": 0.01159704, "auxiliary_loss_mlp": 0.01023309, "balance_loss_clip": 1.0437727, "balance_loss_mlp": 1.01632905, "epoch": 0.8799374736968677, "flos": 23988974751360.0, "grad_norm": 1.6991849799130259, "language_loss": 0.72889471, "learning_rate": 1.4920014744897634e-07, "loss": 0.75072491, "num_input_tokens_seen": 158402690, "step": 7318, "time_per_iteration": 2.75136137008667 }, { "auxiliary_loss_clip": 0.01154734, "auxiliary_loss_mlp": 0.0102095, "balance_loss_clip": 1.04814148, "balance_loss_mlp": 1.01377344, "epoch": 0.8800577165875068, "flos": 25630271832960.0, "grad_norm": 1.9163583084244848, "language_loss": 0.86108589, "learning_rate": 1.4890506377481392e-07, "loss": 0.88284266, "num_input_tokens_seen": 158421780, "step": 7319, "time_per_iteration": 2.741877794265747 }, { "auxiliary_loss_clip": 0.01146033, "auxiliary_loss_mlp": 0.01024552, "balance_loss_clip": 1.0457561, "balance_loss_mlp": 1.01788831, "epoch": 0.8801779594781458, "flos": 23440331439360.0, "grad_norm": 1.5700150325846918, "language_loss": 0.63951033, "learning_rate": 1.486102609102815e-07, "loss": 0.6612162, "num_input_tokens_seen": 158442330, "step": 7320, "time_per_iteration": 2.7624332904815674 }, { "auxiliary_loss_clip": 0.0115363, "auxiliary_loss_mlp": 0.01023172, "balance_loss_clip": 1.0440762, "balance_loss_mlp": 1.01626968, "epoch": 0.880298202368785, "flos": 11508580656000.0, "grad_norm": 4.481431640440943, "language_loss": 0.85666811, "learning_rate": 1.483157389001004e-07, "loss": 0.87843615, "num_input_tokens_seen": 158459890, "step": 7321, "time_per_iteration": 2.648066520690918 }, { "auxiliary_loss_clip": 0.01162495, "auxiliary_loss_mlp": 0.01030017, "balance_loss_clip": 1.04849958, "balance_loss_mlp": 1.02221465, "epoch": 0.880418445259424, "flos": 22671447886080.0, "grad_norm": 2.0776967808661784, "language_loss": 0.78697991, "learning_rate": 1.4802149778894933e-07, "loss": 0.80890501, "num_input_tokens_seen": 158478680, "step": 7322, "time_per_iteration": 2.793262481689453 }, { "auxiliary_loss_clip": 0.01151104, "auxiliary_loss_mlp": 0.01021434, "balance_loss_clip": 1.0414294, "balance_loss_mlp": 1.01444554, "epoch": 0.8805386881500631, "flos": 20522158709760.0, "grad_norm": 1.7666537487208562, "language_loss": 0.87815964, "learning_rate": 1.4772753762146484e-07, "loss": 0.899885, "num_input_tokens_seen": 158497935, "step": 7323, "time_per_iteration": 2.6192197799682617 }, { "auxiliary_loss_clip": 0.01157672, "auxiliary_loss_mlp": 0.01022492, "balance_loss_clip": 1.04544604, "balance_loss_mlp": 1.01491058, "epoch": 0.8806589310407023, "flos": 36538891620480.0, "grad_norm": 1.4935566241070752, "language_loss": 0.70562786, "learning_rate": 1.474338584422401e-07, "loss": 0.72742951, "num_input_tokens_seen": 158523145, "step": 7324, "time_per_iteration": 2.7902023792266846 }, { "auxiliary_loss_clip": 0.01158645, "auxiliary_loss_mlp": 0.01024726, "balance_loss_clip": 1.04665411, "balance_loss_mlp": 1.01772249, "epoch": 0.8807791739313413, "flos": 23440187784960.0, "grad_norm": 2.213221605135367, "language_loss": 0.75831175, "learning_rate": 1.4714046029582595e-07, "loss": 0.78014541, "num_input_tokens_seen": 158542210, "step": 7325, "time_per_iteration": 3.6862688064575195 }, { "auxiliary_loss_clip": 0.01158193, "auxiliary_loss_mlp": 0.01023071, "balance_loss_clip": 1.04552352, "balance_loss_mlp": 1.01553106, "epoch": 0.8808994168219804, "flos": 25956843310080.0, "grad_norm": 1.666084494012622, "language_loss": 0.75793529, "learning_rate": 1.46847343226731e-07, "loss": 0.77974796, "num_input_tokens_seen": 158563250, "step": 7326, "time_per_iteration": 2.7149817943573 }, { "auxiliary_loss_clip": 0.01164792, "auxiliary_loss_mlp": 0.01023786, "balance_loss_clip": 1.04530931, "balance_loss_mlp": 1.01693118, "epoch": 0.8810196597126195, "flos": 17092079303040.0, "grad_norm": 1.8237249703431455, "language_loss": 0.69487548, "learning_rate": 1.465545072794203e-07, "loss": 0.71676129, "num_input_tokens_seen": 158581125, "step": 7327, "time_per_iteration": 3.612766981124878 }, { "auxiliary_loss_clip": 0.01147671, "auxiliary_loss_mlp": 0.01022448, "balance_loss_clip": 1.04579103, "balance_loss_mlp": 1.01536953, "epoch": 0.8811399026032586, "flos": 23002831785600.0, "grad_norm": 1.6891699292695197, "language_loss": 0.75899625, "learning_rate": 1.4626195249831774e-07, "loss": 0.78069741, "num_input_tokens_seen": 158602025, "step": 7328, "time_per_iteration": 2.7491848468780518 }, { "auxiliary_loss_clip": 0.01158993, "auxiliary_loss_mlp": 0.01023267, "balance_loss_clip": 1.04498005, "balance_loss_mlp": 1.01573277, "epoch": 0.8812601454938976, "flos": 14463813242880.0, "grad_norm": 1.8665596254574222, "language_loss": 0.71688014, "learning_rate": 1.4596967892780244e-07, "loss": 0.73870277, "num_input_tokens_seen": 158618355, "step": 7329, "time_per_iteration": 2.7179348468780518 }, { "auxiliary_loss_clip": 0.01164217, "auxiliary_loss_mlp": 0.01026251, "balance_loss_clip": 1.04673386, "balance_loss_mlp": 1.01905072, "epoch": 0.8813803883845368, "flos": 22493223578880.0, "grad_norm": 1.8934654594744567, "language_loss": 0.74867809, "learning_rate": 1.4567768661221314e-07, "loss": 0.7705828, "num_input_tokens_seen": 158638925, "step": 7330, "time_per_iteration": 2.743654489517212 }, { "auxiliary_loss_clip": 0.01164997, "auxiliary_loss_mlp": 0.01051216, "balance_loss_clip": 1.0468049, "balance_loss_mlp": 1.01652384, "epoch": 0.8815006312751759, "flos": 21506901045120.0, "grad_norm": 2.617008334832725, "language_loss": 0.74649477, "learning_rate": 1.4538597559584442e-07, "loss": 0.76865685, "num_input_tokens_seen": 158656715, "step": 7331, "time_per_iteration": 3.4943883419036865 }, { "auxiliary_loss_clip": 0.01156298, "auxiliary_loss_mlp": 0.0102588, "balance_loss_clip": 1.04685664, "balance_loss_mlp": 1.01750541, "epoch": 0.8816208741658149, "flos": 22784566792320.0, "grad_norm": 1.8154027394644805, "language_loss": 0.78789371, "learning_rate": 1.4509454592294823e-07, "loss": 0.80971551, "num_input_tokens_seen": 158677200, "step": 7332, "time_per_iteration": 2.7075414657592773 }, { "auxiliary_loss_clip": 0.01158399, "auxiliary_loss_mlp": 0.01051281, "balance_loss_clip": 1.04655397, "balance_loss_mlp": 1.01748347, "epoch": 0.8817411170564541, "flos": 17779409026560.0, "grad_norm": 2.067831469595026, "language_loss": 0.78790557, "learning_rate": 1.448033976377354e-07, "loss": 0.81000245, "num_input_tokens_seen": 158692185, "step": 7333, "time_per_iteration": 2.6675970554351807 }, { "auxiliary_loss_clip": 0.011633, "auxiliary_loss_mlp": 0.01023825, "balance_loss_clip": 1.04396391, "balance_loss_mlp": 1.01663041, "epoch": 0.8818613599470931, "flos": 18551812112640.0, "grad_norm": 2.56652266132461, "language_loss": 0.74110621, "learning_rate": 1.445125307843713e-07, "loss": 0.76297748, "num_input_tokens_seen": 158710410, "step": 7334, "time_per_iteration": 2.6920392513275146 }, { "auxiliary_loss_clip": 0.01158083, "auxiliary_loss_mlp": 0.01024603, "balance_loss_clip": 1.04638255, "balance_loss_mlp": 1.01765919, "epoch": 0.8819816028377322, "flos": 27599792417280.0, "grad_norm": 1.6734898159531626, "language_loss": 0.75727248, "learning_rate": 1.442219454069813e-07, "loss": 0.77909935, "num_input_tokens_seen": 158731435, "step": 7335, "time_per_iteration": 2.6912739276885986 }, { "auxiliary_loss_clip": 0.01156301, "auxiliary_loss_mlp": 0.01025885, "balance_loss_clip": 1.04632592, "balance_loss_mlp": 1.01875293, "epoch": 0.8821018457283714, "flos": 23404600385280.0, "grad_norm": 2.751785641829804, "language_loss": 0.66293252, "learning_rate": 1.4393164154964676e-07, "loss": 0.68475443, "num_input_tokens_seen": 158750965, "step": 7336, "time_per_iteration": 2.757746458053589 }, { "auxiliary_loss_clip": 0.01159815, "auxiliary_loss_mlp": 0.01024536, "balance_loss_clip": 1.04611444, "balance_loss_mlp": 1.01752031, "epoch": 0.8822220886190104, "flos": 29132459792640.0, "grad_norm": 1.7060589882932184, "language_loss": 0.94184494, "learning_rate": 1.4364161925640649e-07, "loss": 0.96368843, "num_input_tokens_seen": 158772365, "step": 7337, "time_per_iteration": 2.675248146057129 }, { "auxiliary_loss_clip": 0.01164927, "auxiliary_loss_mlp": 0.01026568, "balance_loss_clip": 1.04674113, "balance_loss_mlp": 1.0197556, "epoch": 0.8823423315096495, "flos": 20485422074880.0, "grad_norm": 2.1808237190487514, "language_loss": 0.85043287, "learning_rate": 1.4335187857125663e-07, "loss": 0.87234783, "num_input_tokens_seen": 158791065, "step": 7338, "time_per_iteration": 3.512661933898926 }, { "auxiliary_loss_clip": 0.01165215, "auxiliary_loss_mlp": 0.01025688, "balance_loss_clip": 1.04624438, "balance_loss_mlp": 1.01873469, "epoch": 0.8824625744002886, "flos": 24206377818240.0, "grad_norm": 1.8850575851746756, "language_loss": 0.7551896, "learning_rate": 1.4306241953815023e-07, "loss": 0.77709866, "num_input_tokens_seen": 158812125, "step": 7339, "time_per_iteration": 2.7564756870269775 }, { "auxiliary_loss_clip": 0.01162037, "auxiliary_loss_mlp": 0.01030707, "balance_loss_clip": 1.04497683, "balance_loss_mlp": 1.02372432, "epoch": 0.8825828172909277, "flos": 24679500785280.0, "grad_norm": 1.57359402659077, "language_loss": 0.71126842, "learning_rate": 1.4277324220099862e-07, "loss": 0.7331959, "num_input_tokens_seen": 158834035, "step": 7340, "time_per_iteration": 2.6879966259002686 }, { "auxiliary_loss_clip": 0.01154386, "auxiliary_loss_mlp": 0.01022986, "balance_loss_clip": 1.04598653, "balance_loss_mlp": 1.01566052, "epoch": 0.8827030601815667, "flos": 22456163721600.0, "grad_norm": 1.7785979385147899, "language_loss": 0.74285531, "learning_rate": 1.4248434660366938e-07, "loss": 0.76462901, "num_input_tokens_seen": 158853510, "step": 7341, "time_per_iteration": 2.7892236709594727 }, { "auxiliary_loss_clip": 0.01158655, "auxiliary_loss_mlp": 0.01024207, "balance_loss_clip": 1.04686272, "balance_loss_mlp": 1.01664865, "epoch": 0.8828233030722058, "flos": 19865639877120.0, "grad_norm": 1.718672871994582, "language_loss": 0.70609349, "learning_rate": 1.4219573278998808e-07, "loss": 0.72792208, "num_input_tokens_seen": 158871970, "step": 7342, "time_per_iteration": 2.799225330352783 }, { "auxiliary_loss_clip": 0.01158812, "auxiliary_loss_mlp": 0.01025476, "balance_loss_clip": 1.0461688, "balance_loss_mlp": 1.01793015, "epoch": 0.882943545962845, "flos": 39347213581440.0, "grad_norm": 3.0902357941518885, "language_loss": 0.64722461, "learning_rate": 1.4190740080373685e-07, "loss": 0.66906744, "num_input_tokens_seen": 158892250, "step": 7343, "time_per_iteration": 2.8486013412475586 }, { "auxiliary_loss_clip": 0.01153098, "auxiliary_loss_mlp": 0.01030268, "balance_loss_clip": 1.04854035, "balance_loss_mlp": 1.02235889, "epoch": 0.883063788853484, "flos": 19054524908160.0, "grad_norm": 1.8494412579219763, "language_loss": 0.84193349, "learning_rate": 1.4161935068865538e-07, "loss": 0.86376715, "num_input_tokens_seen": 158907395, "step": 7344, "time_per_iteration": 2.68411922454834 }, { "auxiliary_loss_clip": 0.01165078, "auxiliary_loss_mlp": 0.0102375, "balance_loss_clip": 1.04607725, "balance_loss_mlp": 1.01547134, "epoch": 0.8831840317441231, "flos": 18733196816640.0, "grad_norm": 2.172451596982897, "language_loss": 0.75500548, "learning_rate": 1.4133158248844113e-07, "loss": 0.77689373, "num_input_tokens_seen": 158926300, "step": 7345, "time_per_iteration": 2.6328012943267822 }, { "auxiliary_loss_clip": 0.01156468, "auxiliary_loss_mlp": 0.01024861, "balance_loss_clip": 1.04470098, "balance_loss_mlp": 1.01754487, "epoch": 0.8833042746347622, "flos": 26827712553600.0, "grad_norm": 1.8345597884329443, "language_loss": 0.73541272, "learning_rate": 1.4104409624674785e-07, "loss": 0.75722599, "num_input_tokens_seen": 158946085, "step": 7346, "time_per_iteration": 2.8223934173583984 }, { "auxiliary_loss_clip": 0.01161228, "auxiliary_loss_mlp": 0.0102282, "balance_loss_clip": 1.04726267, "balance_loss_mlp": 1.01630831, "epoch": 0.8834245175254013, "flos": 26104077158400.0, "grad_norm": 2.3690969782047784, "language_loss": 0.78633797, "learning_rate": 1.407568920071873e-07, "loss": 0.80817854, "num_input_tokens_seen": 158964950, "step": 7347, "time_per_iteration": 2.721820592880249 }, { "auxiliary_loss_clip": 0.01170169, "auxiliary_loss_mlp": 0.01023932, "balance_loss_clip": 1.04817724, "balance_loss_mlp": 1.01611805, "epoch": 0.8835447604160404, "flos": 30629036977920.0, "grad_norm": 2.488602486333614, "language_loss": 0.68461788, "learning_rate": 1.4046996981332782e-07, "loss": 0.70655888, "num_input_tokens_seen": 158984835, "step": 7348, "time_per_iteration": 2.7422521114349365 }, { "auxiliary_loss_clip": 0.01156556, "auxiliary_loss_mlp": 0.01023709, "balance_loss_clip": 1.0442189, "balance_loss_mlp": 1.01600194, "epoch": 0.8836650033066795, "flos": 24718356322560.0, "grad_norm": 1.8474172363189139, "language_loss": 0.78169256, "learning_rate": 1.4018332970869516e-07, "loss": 0.80349517, "num_input_tokens_seen": 159002775, "step": 7349, "time_per_iteration": 2.667843818664551 }, { "auxiliary_loss_clip": 0.01152916, "auxiliary_loss_mlp": 0.01024871, "balance_loss_clip": 1.04704142, "balance_loss_mlp": 1.01714337, "epoch": 0.8837852461973186, "flos": 25413371556480.0, "grad_norm": 2.238462853705715, "language_loss": 0.8529563, "learning_rate": 1.398969717367733e-07, "loss": 0.8747341, "num_input_tokens_seen": 159024100, "step": 7350, "time_per_iteration": 3.6507370471954346 }, { "auxiliary_loss_clip": 0.01150416, "auxiliary_loss_mlp": 0.01023333, "balance_loss_clip": 1.04759979, "balance_loss_mlp": 1.01672864, "epoch": 0.8839054890879576, "flos": 17822574195840.0, "grad_norm": 1.7225319374646464, "language_loss": 0.76410377, "learning_rate": 1.396108959410014e-07, "loss": 0.78584123, "num_input_tokens_seen": 159043315, "step": 7351, "time_per_iteration": 2.6471784114837646 }, { "auxiliary_loss_clip": 0.01161672, "auxiliary_loss_mlp": 0.0105775, "balance_loss_clip": 1.04777765, "balance_loss_mlp": 1.0224309, "epoch": 0.8840257319785968, "flos": 23769021818880.0, "grad_norm": 1.501179926815131, "language_loss": 0.81267118, "learning_rate": 1.3932510236477745e-07, "loss": 0.83486539, "num_input_tokens_seen": 159063985, "step": 7352, "time_per_iteration": 2.7550714015960693 }, { "auxiliary_loss_clip": 0.0116161, "auxiliary_loss_mlp": 0.01022035, "balance_loss_clip": 1.04532361, "balance_loss_mlp": 1.01445341, "epoch": 0.8841459748692359, "flos": 29059776622080.0, "grad_norm": 1.759900334587084, "language_loss": 0.56001937, "learning_rate": 1.3903959105145636e-07, "loss": 0.58185583, "num_input_tokens_seen": 159084475, "step": 7353, "time_per_iteration": 3.703557252883911 }, { "auxiliary_loss_clip": 0.01165085, "auxiliary_loss_mlp": 0.0102269, "balance_loss_clip": 1.04626536, "balance_loss_mlp": 1.01518607, "epoch": 0.8842662177598749, "flos": 24311523905280.0, "grad_norm": 2.326779978237172, "language_loss": 0.83247876, "learning_rate": 1.387543620443492e-07, "loss": 0.85435653, "num_input_tokens_seen": 159101320, "step": 7354, "time_per_iteration": 2.6095731258392334 }, { "auxiliary_loss_clip": 0.01165195, "auxiliary_loss_mlp": 0.01023666, "balance_loss_clip": 1.04656696, "balance_loss_mlp": 1.01678133, "epoch": 0.8843864606505141, "flos": 25007867942400.0, "grad_norm": 3.1797211979942173, "language_loss": 0.84341168, "learning_rate": 1.3846941538672606e-07, "loss": 0.8653003, "num_input_tokens_seen": 159120025, "step": 7355, "time_per_iteration": 2.6433517932891846 }, { "auxiliary_loss_clip": 0.01155563, "auxiliary_loss_mlp": 0.01020525, "balance_loss_clip": 1.04829597, "balance_loss_mlp": 1.01343799, "epoch": 0.8845067035411531, "flos": 28183915388160.0, "grad_norm": 2.092434056641558, "language_loss": 0.80988401, "learning_rate": 1.3818475112181193e-07, "loss": 0.83164489, "num_input_tokens_seen": 159138820, "step": 7356, "time_per_iteration": 2.7449984550476074 }, { "auxiliary_loss_clip": 0.01156896, "auxiliary_loss_mlp": 0.01028824, "balance_loss_clip": 1.04488242, "balance_loss_mlp": 1.02224958, "epoch": 0.8846269464317922, "flos": 12853219311360.0, "grad_norm": 3.9886552384671297, "language_loss": 0.79425013, "learning_rate": 1.3790036929279091e-07, "loss": 0.81610733, "num_input_tokens_seen": 159155975, "step": 7357, "time_per_iteration": 2.718747854232788 }, { "auxiliary_loss_clip": 0.01164284, "auxiliary_loss_mlp": 0.01055057, "balance_loss_clip": 1.04713082, "balance_loss_mlp": 1.01900339, "epoch": 0.8847471893224313, "flos": 18624351628800.0, "grad_norm": 2.2060372857959982, "language_loss": 0.59123623, "learning_rate": 1.3761626994280363e-07, "loss": 0.61342967, "num_input_tokens_seen": 159173445, "step": 7358, "time_per_iteration": 3.5657296180725098 }, { "auxiliary_loss_clip": 0.01159128, "auxiliary_loss_mlp": 0.01022884, "balance_loss_clip": 1.04487252, "balance_loss_mlp": 1.01596987, "epoch": 0.8848674322130704, "flos": 35769433449600.0, "grad_norm": 1.7386646392554896, "language_loss": 0.73442209, "learning_rate": 1.3733245311494735e-07, "loss": 0.75624216, "num_input_tokens_seen": 159196100, "step": 7359, "time_per_iteration": 2.796328544616699 }, { "auxiliary_loss_clip": 0.01163495, "auxiliary_loss_mlp": 0.01028549, "balance_loss_clip": 1.04789591, "balance_loss_mlp": 1.02092516, "epoch": 0.8849876751037095, "flos": 24243760897920.0, "grad_norm": 1.9596964553341913, "language_loss": 0.70759296, "learning_rate": 1.3704891885227676e-07, "loss": 0.72951341, "num_input_tokens_seen": 159216145, "step": 7360, "time_per_iteration": 2.681694984436035 }, { "auxiliary_loss_clip": 0.01160211, "auxiliary_loss_mlp": 0.01028581, "balance_loss_clip": 1.04773569, "balance_loss_mlp": 1.02021813, "epoch": 0.8851079179943486, "flos": 21500580251520.0, "grad_norm": 2.6889642719775715, "language_loss": 0.77902758, "learning_rate": 1.367656671978037e-07, "loss": 0.80091554, "num_input_tokens_seen": 159233610, "step": 7361, "time_per_iteration": 2.7154111862182617 }, { "auxiliary_loss_clip": 0.0116661, "auxiliary_loss_mlp": 0.01027012, "balance_loss_clip": 1.04655123, "balance_loss_mlp": 1.01981187, "epoch": 0.8852281608849877, "flos": 15300711198720.0, "grad_norm": 1.7603821563660553, "language_loss": 0.73516214, "learning_rate": 1.36482698194498e-07, "loss": 0.75709838, "num_input_tokens_seen": 159250155, "step": 7362, "time_per_iteration": 2.7074241638183594 }, { "auxiliary_loss_clip": 0.01159269, "auxiliary_loss_mlp": 0.01025863, "balance_loss_clip": 1.04608274, "balance_loss_mlp": 1.01815033, "epoch": 0.8853484037756267, "flos": 23295719283840.0, "grad_norm": 19.60682343073018, "language_loss": 0.71872532, "learning_rate": 1.3620001188528506e-07, "loss": 0.74057662, "num_input_tokens_seen": 159270875, "step": 7363, "time_per_iteration": 3.8223676681518555 }, { "auxiliary_loss_clip": 0.0116508, "auxiliary_loss_mlp": 0.01020944, "balance_loss_clip": 1.04479218, "balance_loss_mlp": 1.01318979, "epoch": 0.8854686466662659, "flos": 25114773795840.0, "grad_norm": 2.9342342890875153, "language_loss": 0.74020696, "learning_rate": 1.3591760831304865e-07, "loss": 0.7620672, "num_input_tokens_seen": 159288565, "step": 7364, "time_per_iteration": 2.6733322143554688 }, { "auxiliary_loss_clip": 0.0116418, "auxiliary_loss_mlp": 0.01021999, "balance_loss_clip": 1.04574728, "balance_loss_mlp": 1.01461363, "epoch": 0.885588889556905, "flos": 21390873137280.0, "grad_norm": 2.3736536635435153, "language_loss": 0.79452109, "learning_rate": 1.356354875206287e-07, "loss": 0.81638288, "num_input_tokens_seen": 159306400, "step": 7365, "time_per_iteration": 2.6593966484069824 }, { "auxiliary_loss_clip": 0.0115305, "auxiliary_loss_mlp": 0.01028932, "balance_loss_clip": 1.04750371, "balance_loss_mlp": 1.0213083, "epoch": 0.885709132447544, "flos": 26906752431360.0, "grad_norm": 2.0024761356452387, "language_loss": 0.70345223, "learning_rate": 1.3535364955082296e-07, "loss": 0.72527206, "num_input_tokens_seen": 159326250, "step": 7366, "time_per_iteration": 2.7239890098571777 }, { "auxiliary_loss_clip": 0.01163479, "auxiliary_loss_mlp": 0.01028961, "balance_loss_clip": 1.04712343, "balance_loss_mlp": 1.02195132, "epoch": 0.8858293753381832, "flos": 26103394800000.0, "grad_norm": 2.01589109269404, "language_loss": 0.64321113, "learning_rate": 1.3507209444638613e-07, "loss": 0.6651355, "num_input_tokens_seen": 159348250, "step": 7367, "time_per_iteration": 2.695732355117798 }, { "auxiliary_loss_clip": 0.01163618, "auxiliary_loss_mlp": 0.0102405, "balance_loss_clip": 1.04775262, "balance_loss_mlp": 1.01655197, "epoch": 0.8859496182288222, "flos": 23292810282240.0, "grad_norm": 1.9899746678865784, "language_loss": 0.73927295, "learning_rate": 1.347908222500298e-07, "loss": 0.76114964, "num_input_tokens_seen": 159368325, "step": 7368, "time_per_iteration": 2.7698020935058594 }, { "auxiliary_loss_clip": 0.01145958, "auxiliary_loss_mlp": 0.01023823, "balance_loss_clip": 1.0475601, "balance_loss_mlp": 1.017097, "epoch": 0.8860698611194613, "flos": 16872916469760.0, "grad_norm": 1.921993600854683, "language_loss": 0.69837618, "learning_rate": 1.3450983300442276e-07, "loss": 0.72007394, "num_input_tokens_seen": 159387555, "step": 7369, "time_per_iteration": 2.7630794048309326 }, { "auxiliary_loss_clip": 0.01162064, "auxiliary_loss_mlp": 0.01022035, "balance_loss_clip": 1.0447855, "balance_loss_mlp": 1.01524925, "epoch": 0.8861901040101005, "flos": 24681404206080.0, "grad_norm": 2.096459693339764, "language_loss": 0.73596561, "learning_rate": 1.3422912675219068e-07, "loss": 0.7578066, "num_input_tokens_seen": 159407310, "step": 7370, "time_per_iteration": 2.647900342941284 }, { "auxiliary_loss_clip": 0.01162691, "auxiliary_loss_mlp": 0.0102654, "balance_loss_clip": 1.04690647, "balance_loss_mlp": 1.01984382, "epoch": 0.8863103469007395, "flos": 24423026699520.0, "grad_norm": 1.5398678834038386, "language_loss": 0.78988373, "learning_rate": 1.339487035359166e-07, "loss": 0.8117761, "num_input_tokens_seen": 159427680, "step": 7371, "time_per_iteration": 2.6726293563842773 }, { "auxiliary_loss_clip": 0.01158031, "auxiliary_loss_mlp": 0.01051387, "balance_loss_clip": 1.04607797, "balance_loss_mlp": 1.01637161, "epoch": 0.8864305897913786, "flos": 22053964158720.0, "grad_norm": 1.9303745294547137, "language_loss": 0.84928602, "learning_rate": 1.336685633981409e-07, "loss": 0.87138021, "num_input_tokens_seen": 159448765, "step": 7372, "time_per_iteration": 2.7156999111175537 }, { "auxiliary_loss_clip": 0.01163985, "auxiliary_loss_mlp": 0.01032334, "balance_loss_clip": 1.04596734, "balance_loss_mlp": 1.02488375, "epoch": 0.8865508326820177, "flos": 19099449843840.0, "grad_norm": 2.8058166887757983, "language_loss": 0.74513197, "learning_rate": 1.333887063813597e-07, "loss": 0.76709509, "num_input_tokens_seen": 159466870, "step": 7373, "time_per_iteration": 2.6699554920196533 }, { "auxiliary_loss_clip": 0.0115979, "auxiliary_loss_mlp": 0.01023636, "balance_loss_clip": 1.04459739, "balance_loss_mlp": 1.01679373, "epoch": 0.8866710755726568, "flos": 15414189240960.0, "grad_norm": 1.8476987138382939, "language_loss": 0.66759032, "learning_rate": 1.331091325280278e-07, "loss": 0.68942463, "num_input_tokens_seen": 159485840, "step": 7374, "time_per_iteration": 2.7602198123931885 }, { "auxiliary_loss_clip": 0.01148395, "auxiliary_loss_mlp": 0.01020913, "balance_loss_clip": 1.04785991, "balance_loss_mlp": 1.01321232, "epoch": 0.8867913184632958, "flos": 20083689388800.0, "grad_norm": 1.5598658861532835, "language_loss": 0.78731525, "learning_rate": 1.3282984188055625e-07, "loss": 0.8090083, "num_input_tokens_seen": 159505630, "step": 7375, "time_per_iteration": 2.7502100467681885 }, { "auxiliary_loss_clip": 0.01164383, "auxiliary_loss_mlp": 0.01022268, "balance_loss_clip": 1.04529405, "balance_loss_mlp": 1.01521134, "epoch": 0.8869115613539349, "flos": 23365852588800.0, "grad_norm": 1.8938845394245982, "language_loss": 0.79801446, "learning_rate": 1.3255083448131288e-07, "loss": 0.81988096, "num_input_tokens_seen": 159524675, "step": 7376, "time_per_iteration": 2.678199052810669 }, { "auxiliary_loss_clip": 0.01163995, "auxiliary_loss_mlp": 0.01026982, "balance_loss_clip": 1.04416776, "balance_loss_mlp": 1.01927495, "epoch": 0.8870318042445741, "flos": 21286840371840.0, "grad_norm": 2.1142712473942002, "language_loss": 0.79177284, "learning_rate": 1.3227211037262365e-07, "loss": 0.81368268, "num_input_tokens_seen": 159541915, "step": 7377, "time_per_iteration": 3.656809091567993 }, { "auxiliary_loss_clip": 0.0115818, "auxiliary_loss_mlp": 0.01027115, "balance_loss_clip": 1.04703057, "balance_loss_mlp": 1.01985836, "epoch": 0.8871520471352131, "flos": 20010862563840.0, "grad_norm": 1.862019262235457, "language_loss": 0.8512603, "learning_rate": 1.319936695967696e-07, "loss": 0.87311316, "num_input_tokens_seen": 159559740, "step": 7378, "time_per_iteration": 2.75654673576355 }, { "auxiliary_loss_clip": 0.01172164, "auxiliary_loss_mlp": 0.01023336, "balance_loss_clip": 1.04759622, "balance_loss_mlp": 1.01486015, "epoch": 0.8872722900258522, "flos": 22601422321920.0, "grad_norm": 2.1745113350488188, "language_loss": 0.82253861, "learning_rate": 1.3171551219599097e-07, "loss": 0.84449363, "num_input_tokens_seen": 159578265, "step": 7379, "time_per_iteration": 2.624488115310669 }, { "auxiliary_loss_clip": 0.01167211, "auxiliary_loss_mlp": 0.01025124, "balance_loss_clip": 1.04810381, "balance_loss_mlp": 1.01825142, "epoch": 0.8873925329164913, "flos": 22163276223360.0, "grad_norm": 2.8775797244697294, "language_loss": 0.78487957, "learning_rate": 1.3143763821248377e-07, "loss": 0.80680287, "num_input_tokens_seen": 159595350, "step": 7380, "time_per_iteration": 3.6212692260742188 }, { "auxiliary_loss_clip": 0.01161554, "auxiliary_loss_mlp": 0.01023677, "balance_loss_clip": 1.04496944, "balance_loss_mlp": 1.01667619, "epoch": 0.8875127758071304, "flos": 19208223204480.0, "grad_norm": 1.9573480106580685, "language_loss": 0.72422987, "learning_rate": 1.3116004768840118e-07, "loss": 0.74608219, "num_input_tokens_seen": 159613725, "step": 7381, "time_per_iteration": 2.6083459854125977 }, { "auxiliary_loss_clip": 0.01166033, "auxiliary_loss_mlp": 0.01022221, "balance_loss_clip": 1.04696155, "balance_loss_mlp": 1.01497865, "epoch": 0.8876330186977694, "flos": 18110900666880.0, "grad_norm": 1.6800831722052376, "language_loss": 0.74074733, "learning_rate": 1.3088274066585348e-07, "loss": 0.76262987, "num_input_tokens_seen": 159631335, "step": 7382, "time_per_iteration": 2.59344220161438 }, { "auxiliary_loss_clip": 0.01161283, "auxiliary_loss_mlp": 0.01022639, "balance_loss_clip": 1.0454998, "balance_loss_mlp": 1.01544523, "epoch": 0.8877532615884086, "flos": 22009434272640.0, "grad_norm": 2.119738925751176, "language_loss": 0.90702248, "learning_rate": 1.3060571718690749e-07, "loss": 0.92886174, "num_input_tokens_seen": 159648830, "step": 7383, "time_per_iteration": 2.747788667678833 }, { "auxiliary_loss_clip": 0.01059827, "auxiliary_loss_mlp": 0.01033109, "balance_loss_clip": 1.00719297, "balance_loss_mlp": 0.99979985, "epoch": 0.8878735044790477, "flos": 72136924346880.0, "grad_norm": 0.7438304221042653, "language_loss": 0.56884879, "learning_rate": 1.3032897729358805e-07, "loss": 0.58977818, "num_input_tokens_seen": 159709785, "step": 7384, "time_per_iteration": 4.1417059898376465 }, { "auxiliary_loss_clip": 0.01141285, "auxiliary_loss_mlp": 0.01048926, "balance_loss_clip": 1.04524779, "balance_loss_mlp": 1.01373267, "epoch": 0.8879937473696867, "flos": 27526355061120.0, "grad_norm": 1.8833188814178243, "language_loss": 0.80067909, "learning_rate": 1.3005252102787645e-07, "loss": 0.82258117, "num_input_tokens_seen": 159728725, "step": 7385, "time_per_iteration": 2.763072967529297 }, { "auxiliary_loss_clip": 0.01164698, "auxiliary_loss_mlp": 0.01026561, "balance_loss_clip": 1.04637468, "balance_loss_mlp": 1.01877666, "epoch": 0.8881139902603259, "flos": 22234091886720.0, "grad_norm": 1.507455721395529, "language_loss": 0.73353851, "learning_rate": 1.297763484317105e-07, "loss": 0.75545108, "num_input_tokens_seen": 159747020, "step": 7386, "time_per_iteration": 2.7669260501861572 }, { "auxiliary_loss_clip": 0.01153451, "auxiliary_loss_mlp": 0.01061972, "balance_loss_clip": 1.0471344, "balance_loss_mlp": 1.02525091, "epoch": 0.888234233150965, "flos": 20299548170880.0, "grad_norm": 2.647938197704679, "language_loss": 0.70621663, "learning_rate": 1.2950045954698551e-07, "loss": 0.72837085, "num_input_tokens_seen": 159764855, "step": 7387, "time_per_iteration": 2.6611738204956055 }, { "auxiliary_loss_clip": 0.01147107, "auxiliary_loss_mlp": 0.0102098, "balance_loss_clip": 1.04575002, "balance_loss_mlp": 1.01371753, "epoch": 0.888354476041604, "flos": 18147996437760.0, "grad_norm": 1.7059937976383097, "language_loss": 0.75153601, "learning_rate": 1.2922485441555343e-07, "loss": 0.77321684, "num_input_tokens_seen": 159783935, "step": 7388, "time_per_iteration": 2.7264134883880615 }, { "auxiliary_loss_clip": 0.01164935, "auxiliary_loss_mlp": 0.01024978, "balance_loss_clip": 1.04508007, "balance_loss_mlp": 1.01744425, "epoch": 0.8884747189322432, "flos": 22014282608640.0, "grad_norm": 1.7287466458464, "language_loss": 0.81926501, "learning_rate": 1.2894953307922363e-07, "loss": 0.84116411, "num_input_tokens_seen": 159802895, "step": 7389, "time_per_iteration": 2.7089056968688965 }, { "auxiliary_loss_clip": 0.01150235, "auxiliary_loss_mlp": 0.01026874, "balance_loss_clip": 1.04652691, "balance_loss_mlp": 1.01944184, "epoch": 0.8885949618228822, "flos": 19786779567360.0, "grad_norm": 1.9537637207237473, "language_loss": 0.84129947, "learning_rate": 1.2867449557976208e-07, "loss": 0.86307061, "num_input_tokens_seen": 159820995, "step": 7390, "time_per_iteration": 3.5912749767303467 }, { "auxiliary_loss_clip": 0.01160768, "auxiliary_loss_mlp": 0.01023755, "balance_loss_clip": 1.04701197, "balance_loss_mlp": 1.01623559, "epoch": 0.8887152047135213, "flos": 20047599198720.0, "grad_norm": 2.0803186553056374, "language_loss": 0.75772166, "learning_rate": 1.283997419588916e-07, "loss": 0.77956688, "num_input_tokens_seen": 159840465, "step": 7391, "time_per_iteration": 2.695773124694824 }, { "auxiliary_loss_clip": 0.01165864, "auxiliary_loss_mlp": 0.01024638, "balance_loss_clip": 1.04625368, "balance_loss_mlp": 1.01747108, "epoch": 0.8888354476041604, "flos": 18588117784320.0, "grad_norm": 1.9759534842396103, "language_loss": 0.61984652, "learning_rate": 1.2812527225829216e-07, "loss": 0.64175153, "num_input_tokens_seen": 159858690, "step": 7392, "time_per_iteration": 2.64241623878479 }, { "auxiliary_loss_clip": 0.01167776, "auxiliary_loss_mlp": 0.01031258, "balance_loss_clip": 1.04714835, "balance_loss_mlp": 1.02384067, "epoch": 0.8889556904947995, "flos": 21689794120320.0, "grad_norm": 1.9966857739742334, "language_loss": 0.76335371, "learning_rate": 1.2785108651960052e-07, "loss": 0.785344, "num_input_tokens_seen": 159880325, "step": 7393, "time_per_iteration": 2.8053677082061768 }, { "auxiliary_loss_clip": 0.01166847, "auxiliary_loss_mlp": 0.01027217, "balance_loss_clip": 1.04736662, "balance_loss_mlp": 1.01999927, "epoch": 0.8890759333854386, "flos": 27381204201600.0, "grad_norm": 2.645628856945051, "language_loss": 0.80386961, "learning_rate": 1.2757718478441094e-07, "loss": 0.82581019, "num_input_tokens_seen": 159901070, "step": 7394, "time_per_iteration": 2.7428860664367676 }, { "auxiliary_loss_clip": 0.01156456, "auxiliary_loss_mlp": 0.01025063, "balance_loss_clip": 1.04402936, "balance_loss_mlp": 1.01806223, "epoch": 0.8891961762760777, "flos": 24498834353280.0, "grad_norm": 2.273166191024877, "language_loss": 0.77524662, "learning_rate": 1.2730356709427302e-07, "loss": 0.7970618, "num_input_tokens_seen": 159919750, "step": 7395, "time_per_iteration": 2.75718355178833 }, { "auxiliary_loss_clip": 0.01159199, "auxiliary_loss_mlp": 0.01026271, "balance_loss_clip": 1.04744565, "balance_loss_mlp": 1.01894569, "epoch": 0.8893164191667168, "flos": 41499770895360.0, "grad_norm": 1.5542949983515364, "language_loss": 0.5962345, "learning_rate": 1.2703023349069542e-07, "loss": 0.6180892, "num_input_tokens_seen": 159944600, "step": 7396, "time_per_iteration": 2.780488967895508 }, { "auxiliary_loss_clip": 0.01159234, "auxiliary_loss_mlp": 0.01022677, "balance_loss_clip": 1.04626262, "balance_loss_mlp": 1.01545942, "epoch": 0.8894366620573558, "flos": 33583623120000.0, "grad_norm": 2.0156706753295954, "language_loss": 0.6162625, "learning_rate": 1.2675718401514223e-07, "loss": 0.63808167, "num_input_tokens_seen": 159968780, "step": 7397, "time_per_iteration": 2.767138719558716 }, { "auxiliary_loss_clip": 0.01160425, "auxiliary_loss_mlp": 0.01031559, "balance_loss_clip": 1.04758239, "balance_loss_mlp": 1.02433491, "epoch": 0.889556904947995, "flos": 16909832672640.0, "grad_norm": 6.802636579303273, "language_loss": 0.74448645, "learning_rate": 1.264844187090346e-07, "loss": 0.7664063, "num_input_tokens_seen": 159985905, "step": 7398, "time_per_iteration": 2.721442222595215 }, { "auxiliary_loss_clip": 0.01156353, "auxiliary_loss_mlp": 0.01025749, "balance_loss_clip": 1.04651034, "balance_loss_mlp": 1.01914442, "epoch": 0.889677147838634, "flos": 26030855283840.0, "grad_norm": 1.706972672217974, "language_loss": 0.75177634, "learning_rate": 1.262119376137516e-07, "loss": 0.77359736, "num_input_tokens_seen": 160006965, "step": 7399, "time_per_iteration": 2.756943941116333 }, { "auxiliary_loss_clip": 0.01155026, "auxiliary_loss_mlp": 0.0102371, "balance_loss_clip": 1.04647923, "balance_loss_mlp": 1.01624441, "epoch": 0.8897973907292731, "flos": 26468283110400.0, "grad_norm": 2.114020204542601, "language_loss": 0.85047972, "learning_rate": 1.2593974077062707e-07, "loss": 0.87226713, "num_input_tokens_seen": 160028585, "step": 7400, "time_per_iteration": 2.65813946723938 }, { "auxiliary_loss_clip": 0.01150765, "auxiliary_loss_mlp": 0.0103234, "balance_loss_clip": 1.04657507, "balance_loss_mlp": 1.02507114, "epoch": 0.8899176336199123, "flos": 26249694894720.0, "grad_norm": 1.6969481127101715, "language_loss": 0.63495767, "learning_rate": 1.2566782822095423e-07, "loss": 0.65678871, "num_input_tokens_seen": 160048840, "step": 7401, "time_per_iteration": 2.750120162963867 }, { "auxiliary_loss_clip": 0.01161696, "auxiliary_loss_mlp": 0.01024688, "balance_loss_clip": 1.04859042, "balance_loss_mlp": 1.01718354, "epoch": 0.8900378765105513, "flos": 20811742156800.0, "grad_norm": 2.4300802860688666, "language_loss": 0.71424282, "learning_rate": 1.2539620000598162e-07, "loss": 0.73610663, "num_input_tokens_seen": 160068175, "step": 7402, "time_per_iteration": 2.7137224674224854 }, { "auxiliary_loss_clip": 0.01163292, "auxiliary_loss_mlp": 0.01026339, "balance_loss_clip": 1.04497349, "balance_loss_mlp": 1.01890028, "epoch": 0.8901581194011904, "flos": 16472333018880.0, "grad_norm": 1.7777190383969834, "language_loss": 0.79673779, "learning_rate": 1.2512485616691492e-07, "loss": 0.81863415, "num_input_tokens_seen": 160085230, "step": 7403, "time_per_iteration": 3.5466229915618896 }, { "auxiliary_loss_clip": 0.01161001, "auxiliary_loss_mlp": 0.01023963, "balance_loss_clip": 1.04852104, "balance_loss_mlp": 1.0163393, "epoch": 0.8902783622918296, "flos": 35155253773440.0, "grad_norm": 1.4420791165349656, "language_loss": 0.80829293, "learning_rate": 1.2485379674491681e-07, "loss": 0.8301425, "num_input_tokens_seen": 160111425, "step": 7404, "time_per_iteration": 2.8796184062957764 }, { "auxiliary_loss_clip": 0.01159375, "auxiliary_loss_mlp": 0.01029757, "balance_loss_clip": 1.04859328, "balance_loss_mlp": 1.02231228, "epoch": 0.8903986051824686, "flos": 17201068145280.0, "grad_norm": 3.029298095950701, "language_loss": 0.79460943, "learning_rate": 1.2458302178110657e-07, "loss": 0.81650072, "num_input_tokens_seen": 160129790, "step": 7405, "time_per_iteration": 2.704040288925171 }, { "auxiliary_loss_clip": 0.0114701, "auxiliary_loss_mlp": 0.01020137, "balance_loss_clip": 1.04481292, "balance_loss_mlp": 1.01353872, "epoch": 0.8905188480731077, "flos": 25483863997440.0, "grad_norm": 1.84895698476124, "language_loss": 0.82381231, "learning_rate": 1.2431253131656118e-07, "loss": 0.84548378, "num_input_tokens_seen": 160149265, "step": 7406, "time_per_iteration": 3.6853697299957275 }, { "auxiliary_loss_clip": 0.01154554, "auxiliary_loss_mlp": 0.01025819, "balance_loss_clip": 1.04704118, "balance_loss_mlp": 1.01776338, "epoch": 0.8906390909637467, "flos": 23365888502400.0, "grad_norm": 1.698001215017173, "language_loss": 0.76497948, "learning_rate": 1.240423253923133e-07, "loss": 0.78678322, "num_input_tokens_seen": 160168870, "step": 7407, "time_per_iteration": 2.7385404109954834 }, { "auxiliary_loss_clip": 0.01164344, "auxiliary_loss_mlp": 0.01027406, "balance_loss_clip": 1.04760146, "balance_loss_mlp": 1.01993108, "epoch": 0.8907593338543859, "flos": 21068790860160.0, "grad_norm": 2.009762715345234, "language_loss": 0.69620705, "learning_rate": 1.237724040493533e-07, "loss": 0.71812463, "num_input_tokens_seen": 160187495, "step": 7408, "time_per_iteration": 2.6070492267608643 }, { "auxiliary_loss_clip": 0.011724, "auxiliary_loss_mlp": 0.01029493, "balance_loss_clip": 1.05075932, "balance_loss_mlp": 1.02131498, "epoch": 0.8908795767450249, "flos": 21869562712320.0, "grad_norm": 5.25928201569262, "language_loss": 0.73487222, "learning_rate": 1.2350276732862773e-07, "loss": 0.75689113, "num_input_tokens_seen": 160208520, "step": 7409, "time_per_iteration": 2.6625816822052 }, { "auxiliary_loss_clip": 0.01060428, "auxiliary_loss_mlp": 0.01001773, "balance_loss_clip": 1.00714123, "balance_loss_mlp": 1.00084305, "epoch": 0.890999819635664, "flos": 66307869348480.0, "grad_norm": 0.8464701014638696, "language_loss": 0.56593907, "learning_rate": 1.2323341527103993e-07, "loss": 0.58656108, "num_input_tokens_seen": 160263720, "step": 7410, "time_per_iteration": 3.922747850418091 }, { "auxiliary_loss_clip": 0.01162853, "auxiliary_loss_mlp": 0.01022811, "balance_loss_clip": 1.04494858, "balance_loss_mlp": 1.01554179, "epoch": 0.8911200625263032, "flos": 26869908055680.0, "grad_norm": 2.0090457985619357, "language_loss": 0.853562, "learning_rate": 1.2296434791745135e-07, "loss": 0.8754186, "num_input_tokens_seen": 160282170, "step": 7411, "time_per_iteration": 2.6713571548461914 }, { "auxiliary_loss_clip": 0.01164942, "auxiliary_loss_mlp": 0.01023755, "balance_loss_clip": 1.04769969, "balance_loss_mlp": 1.01634061, "epoch": 0.8912403054169422, "flos": 20885825957760.0, "grad_norm": 2.1965982888426585, "language_loss": 0.76703292, "learning_rate": 1.2269556530867875e-07, "loss": 0.78891993, "num_input_tokens_seen": 160300725, "step": 7412, "time_per_iteration": 2.662712812423706 }, { "auxiliary_loss_clip": 0.01172085, "auxiliary_loss_mlp": 0.01025295, "balance_loss_clip": 1.04887402, "balance_loss_mlp": 1.01692045, "epoch": 0.8913605483075813, "flos": 27016567286400.0, "grad_norm": 2.130377472063556, "language_loss": 0.82210505, "learning_rate": 1.2242706748549614e-07, "loss": 0.84407878, "num_input_tokens_seen": 160318720, "step": 7413, "time_per_iteration": 2.681262731552124 }, { "auxiliary_loss_clip": 0.01160352, "auxiliary_loss_mlp": 0.01023485, "balance_loss_clip": 1.04479289, "balance_loss_mlp": 1.01660097, "epoch": 0.8914807911982204, "flos": 23621500661760.0, "grad_norm": 2.4262207700127765, "language_loss": 0.82235408, "learning_rate": 1.2215885448863473e-07, "loss": 0.8441925, "num_input_tokens_seen": 160339595, "step": 7414, "time_per_iteration": 2.7744932174682617 }, { "auxiliary_loss_clip": 0.01157611, "auxiliary_loss_mlp": 0.01025165, "balance_loss_clip": 1.04661942, "balance_loss_mlp": 1.01824236, "epoch": 0.8916010340888595, "flos": 24462277286400.0, "grad_norm": 2.0722807912750554, "language_loss": 0.80479544, "learning_rate": 1.2189092635878152e-07, "loss": 0.8266232, "num_input_tokens_seen": 160361045, "step": 7415, "time_per_iteration": 3.6037757396698 }, { "auxiliary_loss_clip": 0.01147167, "auxiliary_loss_mlp": 0.01024567, "balance_loss_clip": 1.04593325, "balance_loss_mlp": 1.01693809, "epoch": 0.8917212769794985, "flos": 21215773313280.0, "grad_norm": 1.9269193276965637, "language_loss": 0.7760675, "learning_rate": 1.216232831365822e-07, "loss": 0.79778486, "num_input_tokens_seen": 160379990, "step": 7416, "time_per_iteration": 2.793813467025757 }, { "auxiliary_loss_clip": 0.01163874, "auxiliary_loss_mlp": 0.01023634, "balance_loss_clip": 1.04579544, "balance_loss_mlp": 1.01613271, "epoch": 0.8918415198701377, "flos": 25513992529920.0, "grad_norm": 2.793727143316925, "language_loss": 0.8105455, "learning_rate": 1.2135592486263678e-07, "loss": 0.83242059, "num_input_tokens_seen": 160399240, "step": 7417, "time_per_iteration": 2.707909345626831 }, { "auxiliary_loss_clip": 0.01157181, "auxiliary_loss_mlp": 0.01021973, "balance_loss_clip": 1.04473054, "balance_loss_mlp": 1.01402795, "epoch": 0.8919617627607768, "flos": 37853006693760.0, "grad_norm": 1.6979769903646407, "language_loss": 0.61338109, "learning_rate": 1.2108885157750415e-07, "loss": 0.63517261, "num_input_tokens_seen": 160421600, "step": 7418, "time_per_iteration": 2.8317885398864746 }, { "auxiliary_loss_clip": 0.01152446, "auxiliary_loss_mlp": 0.01054791, "balance_loss_clip": 1.04754364, "balance_loss_mlp": 1.0194881, "epoch": 0.8920820056514158, "flos": 26213676531840.0, "grad_norm": 1.6633516326419358, "language_loss": 0.80132151, "learning_rate": 1.2082206332169897e-07, "loss": 0.82339394, "num_input_tokens_seen": 160441695, "step": 7419, "time_per_iteration": 2.761230945587158 }, { "auxiliary_loss_clip": 0.01153137, "auxiliary_loss_mlp": 0.01024669, "balance_loss_clip": 1.04538965, "balance_loss_mlp": 1.01775181, "epoch": 0.892202248542055, "flos": 17383135207680.0, "grad_norm": 2.907144731244039, "language_loss": 0.73319793, "learning_rate": 1.2055556013569225e-07, "loss": 0.75497603, "num_input_tokens_seen": 160457205, "step": 7420, "time_per_iteration": 2.6491692066192627 }, { "auxiliary_loss_clip": 0.01160893, "auxiliary_loss_mlp": 0.01022138, "balance_loss_clip": 1.04557014, "balance_loss_mlp": 1.01491952, "epoch": 0.892322491432694, "flos": 21324223451520.0, "grad_norm": 1.7067799187010664, "language_loss": 0.82385135, "learning_rate": 1.2028934205991315e-07, "loss": 0.84568167, "num_input_tokens_seen": 160476525, "step": 7421, "time_per_iteration": 2.7408056259155273 }, { "auxiliary_loss_clip": 0.01163298, "auxiliary_loss_mlp": 0.0102443, "balance_loss_clip": 1.04697692, "balance_loss_mlp": 1.01670218, "epoch": 0.8924427343233331, "flos": 24029374573440.0, "grad_norm": 1.4112305792078899, "language_loss": 0.76733768, "learning_rate": 1.2002340913474607e-07, "loss": 0.78921497, "num_input_tokens_seen": 160500160, "step": 7422, "time_per_iteration": 2.838414192199707 }, { "auxiliary_loss_clip": 0.01167079, "auxiliary_loss_mlp": 0.01024571, "balance_loss_clip": 1.04574132, "balance_loss_mlp": 1.0173465, "epoch": 0.8925629772139723, "flos": 30008069631360.0, "grad_norm": 5.0934164526689525, "language_loss": 0.73800027, "learning_rate": 1.1975776140053317e-07, "loss": 0.75991678, "num_input_tokens_seen": 160520130, "step": 7423, "time_per_iteration": 2.7752442359924316 }, { "auxiliary_loss_clip": 0.01157225, "auxiliary_loss_mlp": 0.01030304, "balance_loss_clip": 1.04875636, "balance_loss_mlp": 1.02216792, "epoch": 0.8926832201046113, "flos": 22601709630720.0, "grad_norm": 2.064452425148934, "language_loss": 0.74055362, "learning_rate": 1.194923988975729e-07, "loss": 0.76242888, "num_input_tokens_seen": 160539730, "step": 7424, "time_per_iteration": 2.7804529666900635 }, { "auxiliary_loss_clip": 0.01154127, "auxiliary_loss_mlp": 0.01024834, "balance_loss_clip": 1.04550552, "balance_loss_mlp": 1.01722789, "epoch": 0.8928034629952504, "flos": 13297722117120.0, "grad_norm": 2.033128047692246, "language_loss": 0.73348105, "learning_rate": 1.192273216661206e-07, "loss": 0.7552706, "num_input_tokens_seen": 160557820, "step": 7425, "time_per_iteration": 2.765132427215576 }, { "auxiliary_loss_clip": 0.0105836, "auxiliary_loss_mlp": 0.0099996, "balance_loss_clip": 1.00853527, "balance_loss_mlp": 0.99893534, "epoch": 0.8929237058858895, "flos": 54854556744960.0, "grad_norm": 0.7659160286435589, "language_loss": 0.57467711, "learning_rate": 1.189625297463881e-07, "loss": 0.59526038, "num_input_tokens_seen": 160619510, "step": 7426, "time_per_iteration": 3.274254322052002 }, { "auxiliary_loss_clip": 0.01146969, "auxiliary_loss_mlp": 0.01022174, "balance_loss_clip": 1.04452205, "balance_loss_mlp": 1.0150336, "epoch": 0.8930439487765286, "flos": 28883850785280.0, "grad_norm": 1.6610497937656514, "language_loss": 0.79508883, "learning_rate": 1.1869802317854394e-07, "loss": 0.81678027, "num_input_tokens_seen": 160643295, "step": 7427, "time_per_iteration": 2.861429452896118 }, { "auxiliary_loss_clip": 0.01156589, "auxiliary_loss_mlp": 0.01023759, "balance_loss_clip": 1.04536211, "balance_loss_mlp": 1.01616216, "epoch": 0.8931641916671677, "flos": 22419283432320.0, "grad_norm": 1.8465897384998993, "language_loss": 0.71960288, "learning_rate": 1.1843380200271425e-07, "loss": 0.74140644, "num_input_tokens_seen": 160662495, "step": 7428, "time_per_iteration": 3.056187629699707 }, { "auxiliary_loss_clip": 0.01151824, "auxiliary_loss_mlp": 0.01025619, "balance_loss_clip": 1.04619622, "balance_loss_mlp": 1.01799572, "epoch": 0.8932844345578068, "flos": 25843149786240.0, "grad_norm": 1.7601657780799127, "language_loss": 0.80399424, "learning_rate": 1.181698662589805e-07, "loss": 0.82576865, "num_input_tokens_seen": 160682080, "step": 7429, "time_per_iteration": 3.7383172512054443 }, { "auxiliary_loss_clip": 0.01161017, "auxiliary_loss_mlp": 0.01025876, "balance_loss_clip": 1.04539108, "balance_loss_mlp": 1.01804435, "epoch": 0.8934046774484459, "flos": 22925803069440.0, "grad_norm": 1.9018891209483832, "language_loss": 0.75920212, "learning_rate": 1.1790621598738249e-07, "loss": 0.78107107, "num_input_tokens_seen": 160700395, "step": 7430, "time_per_iteration": 2.7150943279266357 }, { "auxiliary_loss_clip": 0.01161148, "auxiliary_loss_mlp": 0.0102629, "balance_loss_clip": 1.04552221, "balance_loss_mlp": 1.02018905, "epoch": 0.8935249203390849, "flos": 24462097718400.0, "grad_norm": 1.9420422311967498, "language_loss": 0.75018895, "learning_rate": 1.1764285122791461e-07, "loss": 0.77206326, "num_input_tokens_seen": 160721115, "step": 7431, "time_per_iteration": 3.655944347381592 }, { "auxiliary_loss_clip": 0.01162445, "auxiliary_loss_mlp": 0.01019559, "balance_loss_clip": 1.04522371, "balance_loss_mlp": 1.01272821, "epoch": 0.8936451632297241, "flos": 15742735966080.0, "grad_norm": 2.6243060448151447, "language_loss": 0.76868945, "learning_rate": 1.173797720205294e-07, "loss": 0.79050946, "num_input_tokens_seen": 160739150, "step": 7432, "time_per_iteration": 2.59797739982605 }, { "auxiliary_loss_clip": 0.01163745, "auxiliary_loss_mlp": 0.01025373, "balance_loss_clip": 1.04696751, "balance_loss_mlp": 1.01782727, "epoch": 0.8937654061203631, "flos": 35115500396160.0, "grad_norm": 3.1995370839823187, "language_loss": 0.71786594, "learning_rate": 1.1711697840513602e-07, "loss": 0.73975718, "num_input_tokens_seen": 160758585, "step": 7433, "time_per_iteration": 2.7683117389678955 }, { "auxiliary_loss_clip": 0.011561, "auxiliary_loss_mlp": 0.01023783, "balance_loss_clip": 1.04513836, "balance_loss_mlp": 1.01680624, "epoch": 0.8938856490110022, "flos": 16107444708480.0, "grad_norm": 2.266289458607603, "language_loss": 0.71110743, "learning_rate": 1.1685447042160012e-07, "loss": 0.73290622, "num_input_tokens_seen": 160776620, "step": 7434, "time_per_iteration": 2.6044838428497314 }, { "auxiliary_loss_clip": 0.01168238, "auxiliary_loss_mlp": 0.01021237, "balance_loss_clip": 1.04683375, "balance_loss_mlp": 1.01373577, "epoch": 0.8940058919016414, "flos": 20704189858560.0, "grad_norm": 1.7845634836733266, "language_loss": 0.71514934, "learning_rate": 1.1659224810974367e-07, "loss": 0.7370441, "num_input_tokens_seen": 160796580, "step": 7435, "time_per_iteration": 2.6406118869781494 }, { "auxiliary_loss_clip": 0.01157664, "auxiliary_loss_mlp": 0.01029353, "balance_loss_clip": 1.04643607, "balance_loss_mlp": 1.02212358, "epoch": 0.8941261347922804, "flos": 25229041937280.0, "grad_norm": 1.556107930567828, "language_loss": 0.68517363, "learning_rate": 1.1633031150934591e-07, "loss": 0.70704377, "num_input_tokens_seen": 160819610, "step": 7436, "time_per_iteration": 3.6597328186035156 }, { "auxiliary_loss_clip": 0.01163884, "auxiliary_loss_mlp": 0.01027421, "balance_loss_clip": 1.04808724, "balance_loss_mlp": 1.01980329, "epoch": 0.8942463776829195, "flos": 19537236806400.0, "grad_norm": 2.269211996522154, "language_loss": 0.80056167, "learning_rate": 1.1606866066014176e-07, "loss": 0.82247472, "num_input_tokens_seen": 160838660, "step": 7437, "time_per_iteration": 2.63535737991333 }, { "auxiliary_loss_clip": 0.01152639, "auxiliary_loss_mlp": 0.01023457, "balance_loss_clip": 1.04538167, "balance_loss_mlp": 1.01608372, "epoch": 0.8943666205735585, "flos": 22301567585280.0, "grad_norm": 2.4265496528131347, "language_loss": 0.75401813, "learning_rate": 1.1580729560182434e-07, "loss": 0.77577907, "num_input_tokens_seen": 160854515, "step": 7438, "time_per_iteration": 2.738452434539795 }, { "auxiliary_loss_clip": 0.01164568, "auxiliary_loss_mlp": 0.01050631, "balance_loss_clip": 1.04635298, "balance_loss_mlp": 1.01568961, "epoch": 0.8944868634641977, "flos": 18912893581440.0, "grad_norm": 2.0557172411609757, "language_loss": 0.7081309, "learning_rate": 1.1554621637404171e-07, "loss": 0.7302829, "num_input_tokens_seen": 160872605, "step": 7439, "time_per_iteration": 2.7048587799072266 }, { "auxiliary_loss_clip": 0.01160822, "auxiliary_loss_mlp": 0.01020125, "balance_loss_clip": 1.04450512, "balance_loss_mlp": 1.01312709, "epoch": 0.8946071063548368, "flos": 14460904241280.0, "grad_norm": 37.971852240086974, "language_loss": 0.6117602, "learning_rate": 1.1528542301639999e-07, "loss": 0.63356966, "num_input_tokens_seen": 160889395, "step": 7440, "time_per_iteration": 2.719214677810669 }, { "auxiliary_loss_clip": 0.01157585, "auxiliary_loss_mlp": 0.0102377, "balance_loss_clip": 1.04472578, "balance_loss_mlp": 1.01630771, "epoch": 0.8947273492454758, "flos": 20084084438400.0, "grad_norm": 3.935413988615577, "language_loss": 0.82932365, "learning_rate": 1.1502491556846105e-07, "loss": 0.85113716, "num_input_tokens_seen": 160907890, "step": 7441, "time_per_iteration": 3.599958896636963 }, { "auxiliary_loss_clip": 0.01159583, "auxiliary_loss_mlp": 0.01021171, "balance_loss_clip": 1.04708886, "balance_loss_mlp": 1.01387298, "epoch": 0.894847592136115, "flos": 18550555136640.0, "grad_norm": 2.330822915650023, "language_loss": 0.81005299, "learning_rate": 1.1476469406974331e-07, "loss": 0.83186054, "num_input_tokens_seen": 160923490, "step": 7442, "time_per_iteration": 2.8063900470733643 }, { "auxiliary_loss_clip": 0.0116255, "auxiliary_loss_mlp": 0.01022363, "balance_loss_clip": 1.04646194, "balance_loss_mlp": 1.01476073, "epoch": 0.894967835026754, "flos": 23478468704640.0, "grad_norm": 1.6689917161929717, "language_loss": 0.7715345, "learning_rate": 1.1450475855972341e-07, "loss": 0.79338372, "num_input_tokens_seen": 160944280, "step": 7443, "time_per_iteration": 2.675968885421753 }, { "auxiliary_loss_clip": 0.01156804, "auxiliary_loss_mlp": 0.01056808, "balance_loss_clip": 1.0449053, "balance_loss_mlp": 1.02097917, "epoch": 0.8950880779173931, "flos": 15188310564480.0, "grad_norm": 1.8786917110997718, "language_loss": 0.70569623, "learning_rate": 1.1424510907783158e-07, "loss": 0.72783232, "num_input_tokens_seen": 160961560, "step": 7444, "time_per_iteration": 2.6185457706451416 }, { "auxiliary_loss_clip": 0.01161516, "auxiliary_loss_mlp": 0.01021179, "balance_loss_clip": 1.0431881, "balance_loss_mlp": 1.01438439, "epoch": 0.8952083208080323, "flos": 22091957769600.0, "grad_norm": 1.5722458457098962, "language_loss": 0.82687926, "learning_rate": 1.1398574566345787e-07, "loss": 0.84870619, "num_input_tokens_seen": 160982195, "step": 7445, "time_per_iteration": 2.6823575496673584 }, { "auxiliary_loss_clip": 0.01163788, "auxiliary_loss_mlp": 0.01024298, "balance_loss_clip": 1.04509819, "balance_loss_mlp": 1.0164485, "epoch": 0.8953285636986713, "flos": 23254026572160.0, "grad_norm": 2.2522107895675596, "language_loss": 0.82294452, "learning_rate": 1.1372666835594702e-07, "loss": 0.84482539, "num_input_tokens_seen": 161000520, "step": 7446, "time_per_iteration": 2.7187530994415283 }, { "auxiliary_loss_clip": 0.0115583, "auxiliary_loss_mlp": 0.01022711, "balance_loss_clip": 1.0457592, "balance_loss_mlp": 1.01518559, "epoch": 0.8954488065893104, "flos": 16362661818240.0, "grad_norm": 1.9092888794293057, "language_loss": 0.71820658, "learning_rate": 1.1346787719460071e-07, "loss": 0.73999202, "num_input_tokens_seen": 161019405, "step": 7447, "time_per_iteration": 2.7886595726013184 }, { "auxiliary_loss_clip": 0.01155759, "auxiliary_loss_mlp": 0.01022465, "balance_loss_clip": 1.04464245, "balance_loss_mlp": 1.01546729, "epoch": 0.8955690494799495, "flos": 18257883120000.0, "grad_norm": 1.9921602988719527, "language_loss": 0.72455537, "learning_rate": 1.1320937221867732e-07, "loss": 0.74633765, "num_input_tokens_seen": 161036985, "step": 7448, "time_per_iteration": 2.6803247928619385 }, { "auxiliary_loss_clip": 0.01156776, "auxiliary_loss_mlp": 0.01023143, "balance_loss_clip": 1.04511189, "balance_loss_mlp": 1.01637518, "epoch": 0.8956892923705886, "flos": 25447486498560.0, "grad_norm": 1.7985083651192024, "language_loss": 0.79782093, "learning_rate": 1.1295115346739192e-07, "loss": 0.81962013, "num_input_tokens_seen": 161056985, "step": 7449, "time_per_iteration": 2.797257661819458 }, { "auxiliary_loss_clip": 0.01161954, "auxiliary_loss_mlp": 0.01025029, "balance_loss_clip": 1.04633951, "balance_loss_mlp": 1.01716685, "epoch": 0.8958095352612276, "flos": 52661883939840.0, "grad_norm": 2.313325459210528, "language_loss": 0.72953248, "learning_rate": 1.1269322097991629e-07, "loss": 0.75140226, "num_input_tokens_seen": 161080270, "step": 7450, "time_per_iteration": 3.009721040725708 }, { "auxiliary_loss_clip": 0.01165947, "auxiliary_loss_mlp": 0.01028188, "balance_loss_clip": 1.04875088, "balance_loss_mlp": 1.0198313, "epoch": 0.8959297781518668, "flos": 23186335392000.0, "grad_norm": 2.386746458639781, "language_loss": 0.67951024, "learning_rate": 1.1243557479537846e-07, "loss": 0.70145154, "num_input_tokens_seen": 161100160, "step": 7451, "time_per_iteration": 2.8311452865600586 }, { "auxiliary_loss_clip": 0.01164941, "auxiliary_loss_mlp": 0.01028472, "balance_loss_clip": 1.04492688, "balance_loss_mlp": 1.02078891, "epoch": 0.8960500210425059, "flos": 20334309557760.0, "grad_norm": 1.9535327179384363, "language_loss": 0.68646842, "learning_rate": 1.121782149528634e-07, "loss": 0.70840257, "num_input_tokens_seen": 161117260, "step": 7452, "time_per_iteration": 2.649841547012329 }, { "auxiliary_loss_clip": 0.01161264, "auxiliary_loss_mlp": 0.0102665, "balance_loss_clip": 1.04679441, "balance_loss_mlp": 1.0188179, "epoch": 0.8961702639331449, "flos": 19901694153600.0, "grad_norm": 2.54606683826222, "language_loss": 0.78906983, "learning_rate": 1.1192114149141208e-07, "loss": 0.81094897, "num_input_tokens_seen": 161136895, "step": 7453, "time_per_iteration": 2.6184356212615967 }, { "auxiliary_loss_clip": 0.01163203, "auxiliary_loss_mlp": 0.01023539, "balance_loss_clip": 1.04661775, "balance_loss_mlp": 1.015939, "epoch": 0.8962905068237841, "flos": 12896348567040.0, "grad_norm": 2.4174659539680263, "language_loss": 0.65515321, "learning_rate": 1.1166435445002197e-07, "loss": 0.67702067, "num_input_tokens_seen": 161154565, "step": 7454, "time_per_iteration": 2.7563915252685547 }, { "auxiliary_loss_clip": 0.01166375, "auxiliary_loss_mlp": 0.01023059, "balance_loss_clip": 1.04868221, "balance_loss_mlp": 1.01542914, "epoch": 0.8964107497144231, "flos": 23440331439360.0, "grad_norm": 3.348189850315268, "language_loss": 0.68803734, "learning_rate": 1.1140785386764818e-07, "loss": 0.70993173, "num_input_tokens_seen": 161173265, "step": 7455, "time_per_iteration": 3.7033119201660156 }, { "auxiliary_loss_clip": 0.01159216, "auxiliary_loss_mlp": 0.01025484, "balance_loss_clip": 1.04796219, "balance_loss_mlp": 1.01816773, "epoch": 0.8965309926050622, "flos": 19500176949120.0, "grad_norm": 2.2562487997626945, "language_loss": 0.69300103, "learning_rate": 1.1115163978320153e-07, "loss": 0.71484804, "num_input_tokens_seen": 161191995, "step": 7456, "time_per_iteration": 2.7038347721099854 }, { "auxiliary_loss_clip": 0.01167975, "auxiliary_loss_mlp": 0.01057461, "balance_loss_clip": 1.04808199, "balance_loss_mlp": 1.02107286, "epoch": 0.8966512354957014, "flos": 28658008022400.0, "grad_norm": 2.038907903066192, "language_loss": 0.8237648, "learning_rate": 1.1089571223554917e-07, "loss": 0.84601915, "num_input_tokens_seen": 161212880, "step": 7457, "time_per_iteration": 2.6828842163085938 }, { "auxiliary_loss_clip": 0.01164257, "auxiliary_loss_mlp": 0.01028513, "balance_loss_clip": 1.04512811, "balance_loss_mlp": 1.02118778, "epoch": 0.8967714783863404, "flos": 23370916406400.0, "grad_norm": 5.06262251053438, "language_loss": 0.85366809, "learning_rate": 1.1064007126351537e-07, "loss": 0.87559581, "num_input_tokens_seen": 161233595, "step": 7458, "time_per_iteration": 3.7075934410095215 }, { "auxiliary_loss_clip": 0.01156667, "auxiliary_loss_mlp": 0.01021325, "balance_loss_clip": 1.04736006, "balance_loss_mlp": 1.01430082, "epoch": 0.8968917212769795, "flos": 24535175938560.0, "grad_norm": 2.2622310035349136, "language_loss": 0.76193845, "learning_rate": 1.1038471690588003e-07, "loss": 0.78371835, "num_input_tokens_seen": 161252740, "step": 7459, "time_per_iteration": 2.810422658920288 }, { "auxiliary_loss_clip": 0.01153141, "auxiliary_loss_mlp": 0.01029618, "balance_loss_clip": 1.04733431, "balance_loss_mlp": 1.0223676, "epoch": 0.8970119641676186, "flos": 23475416048640.0, "grad_norm": 1.8606437883585807, "language_loss": 0.79944962, "learning_rate": 1.1012964920138145e-07, "loss": 0.8212772, "num_input_tokens_seen": 161272325, "step": 7460, "time_per_iteration": 2.7708449363708496 }, { "auxiliary_loss_clip": 0.01155315, "auxiliary_loss_mlp": 0.01028253, "balance_loss_clip": 1.04512429, "balance_loss_mlp": 1.02048016, "epoch": 0.8971322070582577, "flos": 24538192680960.0, "grad_norm": 1.7702722026987587, "language_loss": 0.75870728, "learning_rate": 1.0987486818871205e-07, "loss": 0.78054297, "num_input_tokens_seen": 161295915, "step": 7461, "time_per_iteration": 2.859823703765869 }, { "auxiliary_loss_clip": 0.01161362, "auxiliary_loss_mlp": 0.01050857, "balance_loss_clip": 1.04606509, "balance_loss_mlp": 1.01421654, "epoch": 0.8972524499488967, "flos": 21797454159360.0, "grad_norm": 2.4579626856969714, "language_loss": 0.73147857, "learning_rate": 1.0962037390652245e-07, "loss": 0.75360078, "num_input_tokens_seen": 161314935, "step": 7462, "time_per_iteration": 3.5687127113342285 }, { "auxiliary_loss_clip": 0.01159956, "auxiliary_loss_mlp": 0.01023832, "balance_loss_clip": 1.04706836, "balance_loss_mlp": 1.01605356, "epoch": 0.8973726928395359, "flos": 21726243446400.0, "grad_norm": 1.8306099673632161, "language_loss": 0.71647549, "learning_rate": 1.0936616639341911e-07, "loss": 0.73831338, "num_input_tokens_seen": 161335225, "step": 7463, "time_per_iteration": 2.6726326942443848 }, { "auxiliary_loss_clip": 0.01057249, "auxiliary_loss_mlp": 0.0099997, "balance_loss_clip": 1.01018929, "balance_loss_mlp": 0.99905795, "epoch": 0.897492935730175, "flos": 53837100097920.0, "grad_norm": 0.8022577818753823, "language_loss": 0.54710114, "learning_rate": 1.0911224568796473e-07, "loss": 0.56767333, "num_input_tokens_seen": 161393420, "step": 7464, "time_per_iteration": 3.316437244415283 }, { "auxiliary_loss_clip": 0.01162159, "auxiliary_loss_mlp": 0.01025129, "balance_loss_clip": 1.04797029, "balance_loss_mlp": 1.01762795, "epoch": 0.897613178620814, "flos": 18290346036480.0, "grad_norm": 1.8754005344130849, "language_loss": 0.70714992, "learning_rate": 1.0885861182867984e-07, "loss": 0.72902286, "num_input_tokens_seen": 161411525, "step": 7465, "time_per_iteration": 2.7160027027130127 }, { "auxiliary_loss_clip": 0.01162905, "auxiliary_loss_mlp": 0.01027041, "balance_loss_clip": 1.04788172, "balance_loss_mlp": 1.01995146, "epoch": 0.8977334215114532, "flos": 32993718059520.0, "grad_norm": 1.7231733077352422, "language_loss": 0.70754743, "learning_rate": 1.0860526485403942e-07, "loss": 0.72944689, "num_input_tokens_seen": 161432800, "step": 7466, "time_per_iteration": 2.7731807231903076 }, { "auxiliary_loss_clip": 0.011643, "auxiliary_loss_mlp": 0.01026971, "balance_loss_clip": 1.04567885, "balance_loss_mlp": 1.01978838, "epoch": 0.8978536644020922, "flos": 15195636938880.0, "grad_norm": 1.82095946042314, "language_loss": 0.77438855, "learning_rate": 1.0835220480247675e-07, "loss": 0.79630125, "num_input_tokens_seen": 161451295, "step": 7467, "time_per_iteration": 3.547980785369873 }, { "auxiliary_loss_clip": 0.01154878, "auxiliary_loss_mlp": 0.01024334, "balance_loss_clip": 1.0452292, "balance_loss_mlp": 1.01731801, "epoch": 0.8979739072927313, "flos": 18004389863040.0, "grad_norm": 1.9119204039491255, "language_loss": 0.83475775, "learning_rate": 1.0809943171238067e-07, "loss": 0.85654992, "num_input_tokens_seen": 161469220, "step": 7468, "time_per_iteration": 2.6334033012390137 }, { "auxiliary_loss_clip": 0.01166331, "auxiliary_loss_mlp": 0.01027274, "balance_loss_clip": 1.04690158, "balance_loss_mlp": 1.01953149, "epoch": 0.8980941501833704, "flos": 22271546793600.0, "grad_norm": 2.5258859986263857, "language_loss": 0.62882537, "learning_rate": 1.078469456220965e-07, "loss": 0.65076143, "num_input_tokens_seen": 161489375, "step": 7469, "time_per_iteration": 2.7012572288513184 }, { "auxiliary_loss_clip": 0.01163967, "auxiliary_loss_mlp": 0.01026071, "balance_loss_clip": 1.04559851, "balance_loss_mlp": 1.01858497, "epoch": 0.8982143930740095, "flos": 37560729726720.0, "grad_norm": 1.696117155560358, "language_loss": 0.6953662, "learning_rate": 1.0759474656992606e-07, "loss": 0.71726656, "num_input_tokens_seen": 161512145, "step": 7470, "time_per_iteration": 2.78173828125 }, { "auxiliary_loss_clip": 0.01162737, "auxiliary_loss_mlp": 0.01023521, "balance_loss_clip": 1.04419482, "balance_loss_mlp": 1.01582932, "epoch": 0.8983346359646486, "flos": 18076893465600.0, "grad_norm": 2.230211547130594, "language_loss": 0.78422838, "learning_rate": 1.0734283459412785e-07, "loss": 0.80609101, "num_input_tokens_seen": 161528995, "step": 7471, "time_per_iteration": 2.71417498588562 }, { "auxiliary_loss_clip": 0.01159925, "auxiliary_loss_mlp": 0.01022476, "balance_loss_clip": 1.04919457, "balance_loss_mlp": 1.01451278, "epoch": 0.8984548788552876, "flos": 20558895344640.0, "grad_norm": 1.8170328896091235, "language_loss": 0.80404043, "learning_rate": 1.0709120973291707e-07, "loss": 0.82586443, "num_input_tokens_seen": 161548775, "step": 7472, "time_per_iteration": 2.8159244060516357 }, { "auxiliary_loss_clip": 0.0116839, "auxiliary_loss_mlp": 0.01032341, "balance_loss_clip": 1.0485189, "balance_loss_mlp": 1.02448535, "epoch": 0.8985751217459268, "flos": 17785442511360.0, "grad_norm": 2.951523324703914, "language_loss": 0.77700537, "learning_rate": 1.0683987202446475e-07, "loss": 0.79901266, "num_input_tokens_seen": 161566960, "step": 7473, "time_per_iteration": 2.682927370071411 }, { "auxiliary_loss_clip": 0.01166521, "auxiliary_loss_mlp": 0.01023681, "balance_loss_clip": 1.04707408, "balance_loss_mlp": 1.01603937, "epoch": 0.8986953646365659, "flos": 21617003208960.0, "grad_norm": 1.9430599070616428, "language_loss": 0.69914138, "learning_rate": 1.0658882150689862e-07, "loss": 0.72104335, "num_input_tokens_seen": 161585820, "step": 7474, "time_per_iteration": 2.679816484451294 }, { "auxiliary_loss_clip": 0.01159795, "auxiliary_loss_mlp": 0.01024618, "balance_loss_clip": 1.0462141, "balance_loss_mlp": 1.01740003, "epoch": 0.8988156075272049, "flos": 14027355083520.0, "grad_norm": 2.5213716853104646, "language_loss": 0.78577542, "learning_rate": 1.0633805821830288e-07, "loss": 0.80761957, "num_input_tokens_seen": 161602505, "step": 7475, "time_per_iteration": 2.9162380695343018 }, { "auxiliary_loss_clip": 0.0116183, "auxiliary_loss_mlp": 0.01024188, "balance_loss_clip": 1.04788959, "balance_loss_mlp": 1.01655602, "epoch": 0.8989358504178441, "flos": 29059202004480.0, "grad_norm": 3.2956576440408876, "language_loss": 0.83072609, "learning_rate": 1.0608758219671753e-07, "loss": 0.85258633, "num_input_tokens_seen": 161621545, "step": 7476, "time_per_iteration": 2.7675836086273193 }, { "auxiliary_loss_clip": 0.0116417, "auxiliary_loss_mlp": 0.01027228, "balance_loss_clip": 1.04642427, "balance_loss_mlp": 1.01977146, "epoch": 0.8990560933084831, "flos": 20230420446720.0, "grad_norm": 2.051809356487541, "language_loss": 0.70577067, "learning_rate": 1.0583739348014065e-07, "loss": 0.72768462, "num_input_tokens_seen": 161642630, "step": 7477, "time_per_iteration": 2.6784989833831787 }, { "auxiliary_loss_clip": 0.01167901, "auxiliary_loss_mlp": 0.01022963, "balance_loss_clip": 1.04864812, "balance_loss_mlp": 1.01573265, "epoch": 0.8991763361991222, "flos": 25520672459520.0, "grad_norm": 1.8147099398009816, "language_loss": 0.84697223, "learning_rate": 1.0558749210652518e-07, "loss": 0.86888087, "num_input_tokens_seen": 161662560, "step": 7478, "time_per_iteration": 2.657611608505249 }, { "auxiliary_loss_clip": 0.01160108, "auxiliary_loss_mlp": 0.01022163, "balance_loss_clip": 1.04593885, "balance_loss_mlp": 1.0148797, "epoch": 0.8992965790897613, "flos": 25119191168640.0, "grad_norm": 1.6428527461654043, "language_loss": 0.85422206, "learning_rate": 1.053378781137808e-07, "loss": 0.87604475, "num_input_tokens_seen": 161683480, "step": 7479, "time_per_iteration": 2.7058842182159424 }, { "auxiliary_loss_clip": 0.01160489, "auxiliary_loss_mlp": 0.01029647, "balance_loss_clip": 1.04514575, "balance_loss_mlp": 1.02208591, "epoch": 0.8994168219804004, "flos": 16070815814400.0, "grad_norm": 3.7199677733889795, "language_loss": 0.77488244, "learning_rate": 1.0508855153977392e-07, "loss": 0.79678375, "num_input_tokens_seen": 161699945, "step": 7480, "time_per_iteration": 2.679572582244873 }, { "auxiliary_loss_clip": 0.01161371, "auxiliary_loss_mlp": 0.01022782, "balance_loss_clip": 1.04403257, "balance_loss_mlp": 1.01546288, "epoch": 0.8995370648710395, "flos": 24825764966400.0, "grad_norm": 2.128274944069055, "language_loss": 0.66924191, "learning_rate": 1.0483951242232669e-07, "loss": 0.69108343, "num_input_tokens_seen": 161720420, "step": 7481, "time_per_iteration": 3.6727283000946045 }, { "auxiliary_loss_clip": 0.01059081, "auxiliary_loss_mlp": 0.01000723, "balance_loss_clip": 1.0065136, "balance_loss_mlp": 0.99979305, "epoch": 0.8996573077616786, "flos": 63116238378240.0, "grad_norm": 0.9781781112206253, "language_loss": 0.5765658, "learning_rate": 1.0459076079921936e-07, "loss": 0.5971638, "num_input_tokens_seen": 161773080, "step": 7482, "time_per_iteration": 3.2844574451446533 }, { "auxiliary_loss_clip": 0.0115627, "auxiliary_loss_mlp": 0.01026266, "balance_loss_clip": 1.04808521, "balance_loss_mlp": 1.01901221, "epoch": 0.8997775506523177, "flos": 18219674027520.0, "grad_norm": 2.583070636139323, "language_loss": 0.85030627, "learning_rate": 1.0434229670818618e-07, "loss": 0.87213165, "num_input_tokens_seen": 161789755, "step": 7483, "time_per_iteration": 2.7623791694641113 }, { "auxiliary_loss_clip": 0.01155596, "auxiliary_loss_mlp": 0.01023999, "balance_loss_clip": 1.04843664, "balance_loss_mlp": 1.01656365, "epoch": 0.8998977935429567, "flos": 24166768095360.0, "grad_norm": 1.670832557757819, "language_loss": 0.79957771, "learning_rate": 1.0409412018691944e-07, "loss": 0.8213737, "num_input_tokens_seen": 161810220, "step": 7484, "time_per_iteration": 3.665121078491211 }, { "auxiliary_loss_clip": 0.0115709, "auxiliary_loss_mlp": 0.01031762, "balance_loss_clip": 1.04850769, "balance_loss_mlp": 1.02490187, "epoch": 0.9000180364335959, "flos": 20773030273920.0, "grad_norm": 2.0165426401205093, "language_loss": 0.75430977, "learning_rate": 1.0384623127306724e-07, "loss": 0.77619827, "num_input_tokens_seen": 161827565, "step": 7485, "time_per_iteration": 2.725320339202881 }, { "auxiliary_loss_clip": 0.01155313, "auxiliary_loss_mlp": 0.01026209, "balance_loss_clip": 1.04633892, "balance_loss_mlp": 1.01910996, "epoch": 0.900138279324235, "flos": 19205745166080.0, "grad_norm": 2.189698106731875, "language_loss": 0.7961868, "learning_rate": 1.0359863000423397e-07, "loss": 0.81800205, "num_input_tokens_seen": 161845700, "step": 7486, "time_per_iteration": 2.6868369579315186 }, { "auxiliary_loss_clip": 0.01166717, "auxiliary_loss_mlp": 0.01023314, "balance_loss_clip": 1.04643297, "balance_loss_mlp": 1.01631606, "epoch": 0.900258522214874, "flos": 28731158069760.0, "grad_norm": 1.647208789352249, "language_loss": 0.72039914, "learning_rate": 1.0335131641798112e-07, "loss": 0.74229938, "num_input_tokens_seen": 161867660, "step": 7487, "time_per_iteration": 2.7370097637176514 }, { "auxiliary_loss_clip": 0.01058877, "auxiliary_loss_mlp": 0.01002494, "balance_loss_clip": 1.00771713, "balance_loss_mlp": 1.00159991, "epoch": 0.9003787651055132, "flos": 58280685655680.0, "grad_norm": 0.8055692817293373, "language_loss": 0.5559842, "learning_rate": 1.0310429055182512e-07, "loss": 0.57659787, "num_input_tokens_seen": 161921980, "step": 7488, "time_per_iteration": 4.032141447067261 }, { "auxiliary_loss_clip": 0.01159391, "auxiliary_loss_mlp": 0.01027814, "balance_loss_clip": 1.04936516, "balance_loss_mlp": 1.02033925, "epoch": 0.9004990079961522, "flos": 25556475340800.0, "grad_norm": 1.8526012042303217, "language_loss": 0.73989916, "learning_rate": 1.0285755244324024e-07, "loss": 0.76177114, "num_input_tokens_seen": 161942725, "step": 7489, "time_per_iteration": 2.725644111633301 }, { "auxiliary_loss_clip": 0.01160794, "auxiliary_loss_mlp": 0.01053439, "balance_loss_clip": 1.04346228, "balance_loss_mlp": 1.01804638, "epoch": 0.9006192508867913, "flos": 23335185352320.0, "grad_norm": 1.400024435212947, "language_loss": 0.68501425, "learning_rate": 1.0261110212965629e-07, "loss": 0.70715666, "num_input_tokens_seen": 161964520, "step": 7490, "time_per_iteration": 2.755270004272461 }, { "auxiliary_loss_clip": 0.01160173, "auxiliary_loss_mlp": 0.01028323, "balance_loss_clip": 1.04732251, "balance_loss_mlp": 1.02111363, "epoch": 0.9007394937774305, "flos": 18040300485120.0, "grad_norm": 1.8687580953015226, "language_loss": 0.79113579, "learning_rate": 1.023649396484596e-07, "loss": 0.81302077, "num_input_tokens_seen": 161983575, "step": 7491, "time_per_iteration": 2.7146029472351074 }, { "auxiliary_loss_clip": 0.01164663, "auxiliary_loss_mlp": 0.01025488, "balance_loss_clip": 1.04659176, "balance_loss_mlp": 1.0178411, "epoch": 0.9008597366680695, "flos": 43068456633600.0, "grad_norm": 3.5009986152751593, "language_loss": 0.67537135, "learning_rate": 1.0211906503699275e-07, "loss": 0.69727278, "num_input_tokens_seen": 162006550, "step": 7492, "time_per_iteration": 2.881863832473755 }, { "auxiliary_loss_clip": 0.01165238, "auxiliary_loss_mlp": 0.01026954, "balance_loss_clip": 1.04743457, "balance_loss_mlp": 1.01984286, "epoch": 0.9009799795587086, "flos": 14939055112320.0, "grad_norm": 2.4564268621526835, "language_loss": 0.820135, "learning_rate": 1.0187347833255455e-07, "loss": 0.84205693, "num_input_tokens_seen": 162022455, "step": 7493, "time_per_iteration": 3.673006296157837 }, { "auxiliary_loss_clip": 0.01163147, "auxiliary_loss_mlp": 0.01023745, "balance_loss_clip": 1.04705477, "balance_loss_mlp": 1.01703393, "epoch": 0.9011002224493477, "flos": 21579584215680.0, "grad_norm": 1.744155996337481, "language_loss": 0.79210007, "learning_rate": 1.0162817957240056e-07, "loss": 0.81396908, "num_input_tokens_seen": 162042350, "step": 7494, "time_per_iteration": 2.670206308364868 }, { "auxiliary_loss_clip": 0.01061338, "auxiliary_loss_mlp": 0.01000619, "balance_loss_clip": 1.00899518, "balance_loss_mlp": 0.99970692, "epoch": 0.9012204653399868, "flos": 71166367883520.0, "grad_norm": 0.8776723376938792, "language_loss": 0.63005984, "learning_rate": 1.0138316879374253e-07, "loss": 0.65067947, "num_input_tokens_seen": 162111640, "step": 7495, "time_per_iteration": 3.412677049636841 }, { "auxiliary_loss_clip": 0.0116043, "auxiliary_loss_mlp": 0.01027385, "balance_loss_clip": 1.04687536, "balance_loss_mlp": 1.02022982, "epoch": 0.9013407082306258, "flos": 15594963413760.0, "grad_norm": 2.220740659180425, "language_loss": 0.74156582, "learning_rate": 1.0113844603374833e-07, "loss": 0.76344401, "num_input_tokens_seen": 162128165, "step": 7496, "time_per_iteration": 2.6384546756744385 }, { "auxiliary_loss_clip": 0.01159985, "auxiliary_loss_mlp": 0.01032068, "balance_loss_clip": 1.04572725, "balance_loss_mlp": 1.02377105, "epoch": 0.901460951121265, "flos": 15049157276160.0, "grad_norm": 2.1810904175226384, "language_loss": 0.72190237, "learning_rate": 1.0089401132954178e-07, "loss": 0.74382293, "num_input_tokens_seen": 162146145, "step": 7497, "time_per_iteration": 2.6699695587158203 }, { "auxiliary_loss_clip": 0.01160161, "auxiliary_loss_mlp": 0.01026175, "balance_loss_clip": 1.04695153, "balance_loss_mlp": 1.01888204, "epoch": 0.9015811940119041, "flos": 22236857233920.0, "grad_norm": 1.761904198955539, "language_loss": 0.72683185, "learning_rate": 1.006498647182037e-07, "loss": 0.74869525, "num_input_tokens_seen": 162164800, "step": 7498, "time_per_iteration": 2.743673801422119 }, { "auxiliary_loss_clip": 0.01150654, "auxiliary_loss_mlp": 0.01027014, "balance_loss_clip": 1.0491817, "balance_loss_mlp": 1.01939905, "epoch": 0.9017014369025431, "flos": 24973824827520.0, "grad_norm": 1.9379955633526025, "language_loss": 0.71303707, "learning_rate": 1.004060062367713e-07, "loss": 0.73481369, "num_input_tokens_seen": 162185895, "step": 7499, "time_per_iteration": 2.8388547897338867 }, { "auxiliary_loss_clip": 0.01161967, "auxiliary_loss_mlp": 0.01025245, "balance_loss_clip": 1.04438996, "balance_loss_mlp": 1.01787508, "epoch": 0.9018216797931822, "flos": 18114168804480.0, "grad_norm": 1.716625355808084, "language_loss": 0.69719201, "learning_rate": 1.0016243592223728e-07, "loss": 0.71906412, "num_input_tokens_seen": 162206295, "step": 7500, "time_per_iteration": 2.9542429447174072 }, { "auxiliary_loss_clip": 0.01147691, "auxiliary_loss_mlp": 0.01027726, "balance_loss_clip": 1.04692471, "balance_loss_mlp": 1.02056158, "epoch": 0.9019419226838213, "flos": 37268452759680.0, "grad_norm": 2.339578838389029, "language_loss": 0.65701318, "learning_rate": 9.991915381155114e-08, "loss": 0.67876738, "num_input_tokens_seen": 162229275, "step": 7501, "time_per_iteration": 2.884382486343384 }, { "auxiliary_loss_clip": 0.01169461, "auxiliary_loss_mlp": 0.01026734, "balance_loss_clip": 1.04893231, "balance_loss_mlp": 1.01926517, "epoch": 0.9020621655744604, "flos": 23441121538560.0, "grad_norm": 1.9963053630754064, "language_loss": 0.74967551, "learning_rate": 9.967615994161871e-08, "loss": 0.77163744, "num_input_tokens_seen": 162248935, "step": 7502, "time_per_iteration": 2.7665185928344727 }, { "auxiliary_loss_clip": 0.01164882, "auxiliary_loss_mlp": 0.0102229, "balance_loss_clip": 1.04538083, "balance_loss_mlp": 1.01487541, "epoch": 0.9021824084650995, "flos": 22857465444480.0, "grad_norm": 2.894119843536758, "language_loss": 0.78088731, "learning_rate": 9.943345434930161e-08, "loss": 0.80275905, "num_input_tokens_seen": 162269185, "step": 7503, "time_per_iteration": 2.6758246421813965 }, { "auxiliary_loss_clip": 0.01156445, "auxiliary_loss_mlp": 0.0102416, "balance_loss_clip": 1.04861021, "balance_loss_mlp": 1.016626, "epoch": 0.9023026513557386, "flos": 22127581082880.0, "grad_norm": 1.960946964005373, "language_loss": 0.69092882, "learning_rate": 9.919103707141885e-08, "loss": 0.71273482, "num_input_tokens_seen": 162288065, "step": 7504, "time_per_iteration": 2.8985819816589355 }, { "auxiliary_loss_clip": 0.01164188, "auxiliary_loss_mlp": 0.01027131, "balance_loss_clip": 1.04715085, "balance_loss_mlp": 1.01914346, "epoch": 0.9024228942463777, "flos": 24199087357440.0, "grad_norm": 1.8629925063185646, "language_loss": 0.76420808, "learning_rate": 9.89489081447441e-08, "loss": 0.78612125, "num_input_tokens_seen": 162305265, "step": 7505, "time_per_iteration": 2.908243417739868 }, { "auxiliary_loss_clip": 0.01159283, "auxiliary_loss_mlp": 0.01022409, "balance_loss_clip": 1.04687572, "balance_loss_mlp": 1.01493442, "epoch": 0.9025431371370167, "flos": 25008262992000.0, "grad_norm": 1.8915251257083505, "language_loss": 0.82740879, "learning_rate": 9.870706760600844e-08, "loss": 0.84922576, "num_input_tokens_seen": 162325215, "step": 7506, "time_per_iteration": 2.7789294719696045 }, { "auxiliary_loss_clip": 0.01159354, "auxiliary_loss_mlp": 0.01025485, "balance_loss_clip": 1.05010962, "balance_loss_mlp": 1.01828432, "epoch": 0.9026633800276559, "flos": 18952862440320.0, "grad_norm": 1.9270790275238576, "language_loss": 0.72509599, "learning_rate": 9.846551549189918e-08, "loss": 0.74694431, "num_input_tokens_seen": 162344820, "step": 7507, "time_per_iteration": 3.791170358657837 }, { "auxiliary_loss_clip": 0.01157241, "auxiliary_loss_mlp": 0.01024739, "balance_loss_clip": 1.04532981, "balance_loss_mlp": 1.01692474, "epoch": 0.902783622918295, "flos": 32416059536640.0, "grad_norm": 2.044441873849622, "language_loss": 0.68943447, "learning_rate": 9.822425183905902e-08, "loss": 0.71125424, "num_input_tokens_seen": 162365345, "step": 7508, "time_per_iteration": 2.7748963832855225 }, { "auxiliary_loss_clip": 0.01059744, "auxiliary_loss_mlp": 0.01002318, "balance_loss_clip": 1.00905085, "balance_loss_mlp": 1.0014714, "epoch": 0.902903865808934, "flos": 63717453244800.0, "grad_norm": 0.9140831745032572, "language_loss": 0.75096762, "learning_rate": 9.798327668408823e-08, "loss": 0.77158827, "num_input_tokens_seen": 162426980, "step": 7509, "time_per_iteration": 3.345794439315796 }, { "auxiliary_loss_clip": 0.01167623, "auxiliary_loss_mlp": 0.01023523, "balance_loss_clip": 1.04633546, "balance_loss_mlp": 1.01557767, "epoch": 0.9030241086995732, "flos": 23804034600960.0, "grad_norm": 3.7741836891200315, "language_loss": 0.68945843, "learning_rate": 9.774259006354158e-08, "loss": 0.71136987, "num_input_tokens_seen": 162447050, "step": 7510, "time_per_iteration": 2.7077767848968506 }, { "auxiliary_loss_clip": 0.01164272, "auxiliary_loss_mlp": 0.01024031, "balance_loss_clip": 1.04624522, "balance_loss_mlp": 1.01691461, "epoch": 0.9031443515902122, "flos": 26395887248640.0, "grad_norm": 1.942471739433067, "language_loss": 0.76025641, "learning_rate": 9.750219201393184e-08, "loss": 0.78213954, "num_input_tokens_seen": 162467015, "step": 7511, "time_per_iteration": 3.695441722869873 }, { "auxiliary_loss_clip": 0.01159321, "auxiliary_loss_mlp": 0.01025295, "balance_loss_clip": 1.04501104, "balance_loss_mlp": 1.01865196, "epoch": 0.9032645944808513, "flos": 24939350749440.0, "grad_norm": 1.8181659976732112, "language_loss": 0.77669823, "learning_rate": 9.726208257172697e-08, "loss": 0.79854441, "num_input_tokens_seen": 162488710, "step": 7512, "time_per_iteration": 2.7195446491241455 }, { "auxiliary_loss_clip": 0.01165076, "auxiliary_loss_mlp": 0.01020259, "balance_loss_clip": 1.04681909, "balance_loss_mlp": 1.01330328, "epoch": 0.9033848373714904, "flos": 21178821196800.0, "grad_norm": 2.42093771088627, "language_loss": 0.75041974, "learning_rate": 9.702226177335115e-08, "loss": 0.77227306, "num_input_tokens_seen": 162507205, "step": 7513, "time_per_iteration": 2.5803933143615723 }, { "auxiliary_loss_clip": 0.01159988, "auxiliary_loss_mlp": 0.01025893, "balance_loss_clip": 1.04733896, "balance_loss_mlp": 1.0180254, "epoch": 0.9035050802621295, "flos": 26286359702400.0, "grad_norm": 1.6430789451019514, "language_loss": 0.7246927, "learning_rate": 9.67827296551853e-08, "loss": 0.74655151, "num_input_tokens_seen": 162528490, "step": 7514, "time_per_iteration": 3.6498329639434814 }, { "auxiliary_loss_clip": 0.01157537, "auxiliary_loss_mlp": 0.01056934, "balance_loss_clip": 1.05021071, "balance_loss_mlp": 1.0214889, "epoch": 0.9036253231527686, "flos": 24204546224640.0, "grad_norm": 2.490887602779203, "language_loss": 0.68705559, "learning_rate": 9.65434862535659e-08, "loss": 0.70920026, "num_input_tokens_seen": 162547860, "step": 7515, "time_per_iteration": 2.8224918842315674 }, { "auxiliary_loss_clip": 0.01163213, "auxiliary_loss_mlp": 0.01024159, "balance_loss_clip": 1.04549646, "balance_loss_mlp": 1.01675653, "epoch": 0.9037455660434077, "flos": 18072655660800.0, "grad_norm": 2.5494673081590125, "language_loss": 0.65114737, "learning_rate": 9.630453160478635e-08, "loss": 0.67302108, "num_input_tokens_seen": 162563215, "step": 7516, "time_per_iteration": 2.6971917152404785 }, { "auxiliary_loss_clip": 0.01154912, "auxiliary_loss_mlp": 0.01027729, "balance_loss_clip": 1.04812062, "balance_loss_mlp": 1.02048707, "epoch": 0.9038658089340468, "flos": 24060795995520.0, "grad_norm": 1.5791941304776702, "language_loss": 0.82256389, "learning_rate": 9.60658657450959e-08, "loss": 0.84439027, "num_input_tokens_seen": 162583515, "step": 7517, "time_per_iteration": 2.792848825454712 }, { "auxiliary_loss_clip": 0.01147639, "auxiliary_loss_mlp": 0.01021728, "balance_loss_clip": 1.04332876, "balance_loss_mlp": 1.01468265, "epoch": 0.9039860518246858, "flos": 21834298535040.0, "grad_norm": 2.334108537899109, "language_loss": 0.79479086, "learning_rate": 9.582748871069979e-08, "loss": 0.81648451, "num_input_tokens_seen": 162602955, "step": 7518, "time_per_iteration": 2.7083420753479004 }, { "auxiliary_loss_clip": 0.0116145, "auxiliary_loss_mlp": 0.01058358, "balance_loss_clip": 1.04492188, "balance_loss_mlp": 1.02337623, "epoch": 0.904106294715325, "flos": 26614870513920.0, "grad_norm": 2.617188771823715, "language_loss": 0.83171999, "learning_rate": 9.558940053775954e-08, "loss": 0.85391814, "num_input_tokens_seen": 162621595, "step": 7519, "time_per_iteration": 3.721005916595459 }, { "auxiliary_loss_clip": 0.01163704, "auxiliary_loss_mlp": 0.01026454, "balance_loss_clip": 1.04744411, "balance_loss_mlp": 1.01884866, "epoch": 0.904226537605964, "flos": 17785693906560.0, "grad_norm": 2.1878244800814097, "language_loss": 0.68117523, "learning_rate": 9.535160126239294e-08, "loss": 0.70307684, "num_input_tokens_seen": 162638220, "step": 7520, "time_per_iteration": 2.6728017330169678 }, { "auxiliary_loss_clip": 0.01162957, "auxiliary_loss_mlp": 0.01022916, "balance_loss_clip": 1.04788446, "balance_loss_mlp": 1.01568043, "epoch": 0.9043467804966031, "flos": 24790428961920.0, "grad_norm": 1.6429531982107088, "language_loss": 0.7085712, "learning_rate": 9.511409092067424e-08, "loss": 0.73043001, "num_input_tokens_seen": 162658575, "step": 7521, "time_per_iteration": 2.6780362129211426 }, { "auxiliary_loss_clip": 0.01158966, "auxiliary_loss_mlp": 0.01025174, "balance_loss_clip": 1.04674363, "balance_loss_mlp": 1.01805711, "epoch": 0.9044670233872423, "flos": 22632125472000.0, "grad_norm": 2.0619860476557057, "language_loss": 0.67318785, "learning_rate": 9.487686954863327e-08, "loss": 0.69502926, "num_input_tokens_seen": 162678295, "step": 7522, "time_per_iteration": 2.7682409286499023 }, { "auxiliary_loss_clip": 0.01161521, "auxiliary_loss_mlp": 0.01023807, "balance_loss_clip": 1.04657435, "balance_loss_mlp": 1.01658893, "epoch": 0.9045872662778813, "flos": 23771320289280.0, "grad_norm": 3.3573132331730586, "language_loss": 0.77534652, "learning_rate": 9.46399371822566e-08, "loss": 0.79719985, "num_input_tokens_seen": 162698070, "step": 7523, "time_per_iteration": 2.717646360397339 }, { "auxiliary_loss_clip": 0.01166152, "auxiliary_loss_mlp": 0.0102026, "balance_loss_clip": 1.04661727, "balance_loss_mlp": 1.01253557, "epoch": 0.9047075091685204, "flos": 15191039998080.0, "grad_norm": 2.4459297330969765, "language_loss": 0.72304833, "learning_rate": 9.440329385748657e-08, "loss": 0.74491239, "num_input_tokens_seen": 162715140, "step": 7524, "time_per_iteration": 2.726663112640381 }, { "auxiliary_loss_clip": 0.01157212, "auxiliary_loss_mlp": 0.01020485, "balance_loss_clip": 1.04826176, "balance_loss_mlp": 1.01420307, "epoch": 0.9048277520591596, "flos": 18003707504640.0, "grad_norm": 1.7055497562173156, "language_loss": 0.70715916, "learning_rate": 9.416693961022137e-08, "loss": 0.72893608, "num_input_tokens_seen": 162733390, "step": 7525, "time_per_iteration": 2.7159764766693115 }, { "auxiliary_loss_clip": 0.01138731, "auxiliary_loss_mlp": 0.01027471, "balance_loss_clip": 1.04583573, "balance_loss_mlp": 1.01953781, "epoch": 0.9049479949497986, "flos": 21872471713920.0, "grad_norm": 1.7344238970957537, "language_loss": 0.77451372, "learning_rate": 9.393087447631654e-08, "loss": 0.79617572, "num_input_tokens_seen": 162751670, "step": 7526, "time_per_iteration": 2.8283467292785645 }, { "auxiliary_loss_clip": 0.0115704, "auxiliary_loss_mlp": 0.01029062, "balance_loss_clip": 1.04285455, "balance_loss_mlp": 1.02198076, "epoch": 0.9050682378404377, "flos": 20773928113920.0, "grad_norm": 2.0772289249046856, "language_loss": 0.73135418, "learning_rate": 9.36950984915823e-08, "loss": 0.75321519, "num_input_tokens_seen": 162770025, "step": 7527, "time_per_iteration": 2.689940929412842 }, { "auxiliary_loss_clip": 0.01166654, "auxiliary_loss_mlp": 0.01030203, "balance_loss_clip": 1.04719877, "balance_loss_mlp": 1.02275872, "epoch": 0.9051884807310768, "flos": 21580015178880.0, "grad_norm": 1.7051871513409183, "language_loss": 0.69117916, "learning_rate": 9.345961169178607e-08, "loss": 0.71314776, "num_input_tokens_seen": 162789710, "step": 7528, "time_per_iteration": 2.698312997817993 }, { "auxiliary_loss_clip": 0.01146752, "auxiliary_loss_mlp": 0.01020183, "balance_loss_clip": 1.04951024, "balance_loss_mlp": 1.0132091, "epoch": 0.9053087236217159, "flos": 21908059113600.0, "grad_norm": 5.741297773157944, "language_loss": 0.72849935, "learning_rate": 9.322441411265081e-08, "loss": 0.75016868, "num_input_tokens_seen": 162810695, "step": 7529, "time_per_iteration": 2.7246196269989014 }, { "auxiliary_loss_clip": 0.01155926, "auxiliary_loss_mlp": 0.01023461, "balance_loss_clip": 1.04845214, "balance_loss_mlp": 1.01538491, "epoch": 0.9054289665123549, "flos": 17055809544960.0, "grad_norm": 2.0307705974072734, "language_loss": 0.7353999, "learning_rate": 9.298950578985554e-08, "loss": 0.75719374, "num_input_tokens_seen": 162827770, "step": 7530, "time_per_iteration": 2.7933242321014404 }, { "auxiliary_loss_clip": 0.01159726, "auxiliary_loss_mlp": 0.01055761, "balance_loss_clip": 1.04696023, "balance_loss_mlp": 1.0189867, "epoch": 0.905549209402994, "flos": 20777268078720.0, "grad_norm": 1.7169751364278996, "language_loss": 0.7082209, "learning_rate": 9.275488675903665e-08, "loss": 0.73037577, "num_input_tokens_seen": 162846715, "step": 7531, "time_per_iteration": 2.7882025241851807 }, { "auxiliary_loss_clip": 0.01152511, "auxiliary_loss_mlp": 0.01024545, "balance_loss_clip": 1.04850924, "balance_loss_mlp": 1.01691008, "epoch": 0.9056694522936332, "flos": 21686813291520.0, "grad_norm": 1.9403420942118452, "language_loss": 0.73709881, "learning_rate": 9.252055705578454e-08, "loss": 0.75886935, "num_input_tokens_seen": 162866215, "step": 7532, "time_per_iteration": 2.8522517681121826 }, { "auxiliary_loss_clip": 0.01162813, "auxiliary_loss_mlp": 0.01023675, "balance_loss_clip": 1.04676175, "balance_loss_mlp": 1.01697838, "epoch": 0.9057896951842722, "flos": 29569133433600.0, "grad_norm": 1.5355232470860387, "language_loss": 0.72209716, "learning_rate": 9.228651671564747e-08, "loss": 0.74396205, "num_input_tokens_seen": 162888245, "step": 7533, "time_per_iteration": 2.8163626194000244 }, { "auxiliary_loss_clip": 0.01151562, "auxiliary_loss_mlp": 0.01023586, "balance_loss_clip": 1.04850435, "balance_loss_mlp": 1.01631474, "epoch": 0.9059099380749113, "flos": 27892248952320.0, "grad_norm": 1.5036006124878487, "language_loss": 0.77891123, "learning_rate": 9.205276577412901e-08, "loss": 0.80066276, "num_input_tokens_seen": 162911025, "step": 7534, "time_per_iteration": 3.8680708408355713 }, { "auxiliary_loss_clip": 0.01167096, "auxiliary_loss_mlp": 0.01055948, "balance_loss_clip": 1.04626238, "balance_loss_mlp": 1.01994443, "epoch": 0.9060301809655504, "flos": 17748993185280.0, "grad_norm": 2.325844452680144, "language_loss": 0.77397323, "learning_rate": 9.181930426668905e-08, "loss": 0.79620367, "num_input_tokens_seen": 162927820, "step": 7535, "time_per_iteration": 2.6410326957702637 }, { "auxiliary_loss_clip": 0.01151885, "auxiliary_loss_mlp": 0.01027056, "balance_loss_clip": 1.04885566, "balance_loss_mlp": 1.01961708, "epoch": 0.9061504238561895, "flos": 31759432963200.0, "grad_norm": 1.5748196709854632, "language_loss": 0.67849338, "learning_rate": 9.158613222874346e-08, "loss": 0.70028281, "num_input_tokens_seen": 162949445, "step": 7536, "time_per_iteration": 2.8195443153381348 }, { "auxiliary_loss_clip": 0.01158633, "auxiliary_loss_mlp": 0.01023158, "balance_loss_clip": 1.04599941, "balance_loss_mlp": 1.01626801, "epoch": 0.9062706667468285, "flos": 20048066075520.0, "grad_norm": 1.5242340900731612, "language_loss": 0.81807053, "learning_rate": 9.135324969566394e-08, "loss": 0.83988839, "num_input_tokens_seen": 162968945, "step": 7537, "time_per_iteration": 3.6016151905059814 }, { "auxiliary_loss_clip": 0.01168779, "auxiliary_loss_mlp": 0.01025138, "balance_loss_clip": 1.04897881, "balance_loss_mlp": 1.01805687, "epoch": 0.9063909096374677, "flos": 18437292576000.0, "grad_norm": 1.9894794346690867, "language_loss": 0.75612986, "learning_rate": 9.112065670277913e-08, "loss": 0.77806902, "num_input_tokens_seen": 162985310, "step": 7538, "time_per_iteration": 2.7171146869659424 }, { "auxiliary_loss_clip": 0.01154926, "auxiliary_loss_mlp": 0.01025345, "balance_loss_clip": 1.04471779, "balance_loss_mlp": 1.01829338, "epoch": 0.9065111525281068, "flos": 33547353361920.0, "grad_norm": 1.8456518771478323, "language_loss": 0.73110479, "learning_rate": 9.088835328537303e-08, "loss": 0.75290751, "num_input_tokens_seen": 163006900, "step": 7539, "time_per_iteration": 3.7047107219696045 }, { "auxiliary_loss_clip": 0.01165702, "auxiliary_loss_mlp": 0.01026169, "balance_loss_clip": 1.04835403, "balance_loss_mlp": 1.01896, "epoch": 0.9066313954187458, "flos": 23367863750400.0, "grad_norm": 4.0808148076433355, "language_loss": 0.71216464, "learning_rate": 9.065633947868568e-08, "loss": 0.73408335, "num_input_tokens_seen": 163026505, "step": 7540, "time_per_iteration": 2.668973922729492 }, { "auxiliary_loss_clip": 0.01154322, "auxiliary_loss_mlp": 0.01053858, "balance_loss_clip": 1.04702234, "balance_loss_mlp": 1.01909852, "epoch": 0.906751638309385, "flos": 26249623067520.0, "grad_norm": 2.169597683016169, "language_loss": 0.80039108, "learning_rate": 9.042461531791379e-08, "loss": 0.82247293, "num_input_tokens_seen": 163044925, "step": 7541, "time_per_iteration": 2.7266151905059814 }, { "auxiliary_loss_clip": 0.01161541, "auxiliary_loss_mlp": 0.01025973, "balance_loss_clip": 1.04462004, "balance_loss_mlp": 1.01892197, "epoch": 0.906871881200024, "flos": 16544477485440.0, "grad_norm": 1.964630735764503, "language_loss": 0.78207636, "learning_rate": 9.019318083820903e-08, "loss": 0.8039515, "num_input_tokens_seen": 163063505, "step": 7542, "time_per_iteration": 2.5755879878997803 }, { "auxiliary_loss_clip": 0.01159837, "auxiliary_loss_mlp": 0.01025434, "balance_loss_clip": 1.04571867, "balance_loss_mlp": 1.01765609, "epoch": 0.9069921240906631, "flos": 24605129675520.0, "grad_norm": 1.554029253683363, "language_loss": 0.85341674, "learning_rate": 8.996203607468045e-08, "loss": 0.87526947, "num_input_tokens_seen": 163082505, "step": 7543, "time_per_iteration": 2.6690335273742676 }, { "auxiliary_loss_clip": 0.01158819, "auxiliary_loss_mlp": 0.01027471, "balance_loss_clip": 1.04530013, "balance_loss_mlp": 1.02021742, "epoch": 0.9071123669813023, "flos": 25374731500800.0, "grad_norm": 1.5439169408810278, "language_loss": 0.7568031, "learning_rate": 8.973118106239241e-08, "loss": 0.77866596, "num_input_tokens_seen": 163105110, "step": 7544, "time_per_iteration": 2.747476816177368 }, { "auxiliary_loss_clip": 0.0115113, "auxiliary_loss_mlp": 0.01025486, "balance_loss_clip": 1.04582024, "balance_loss_mlp": 1.01845837, "epoch": 0.9072326098719413, "flos": 26725798690560.0, "grad_norm": 2.0402366393130063, "language_loss": 0.94673628, "learning_rate": 8.95006158363656e-08, "loss": 0.96850246, "num_input_tokens_seen": 163125295, "step": 7545, "time_per_iteration": 3.744102716445923 }, { "auxiliary_loss_clip": 0.01160271, "auxiliary_loss_mlp": 0.01030035, "balance_loss_clip": 1.0465548, "balance_loss_mlp": 1.02230692, "epoch": 0.9073528527625804, "flos": 23878800760320.0, "grad_norm": 1.7850033278006234, "language_loss": 0.77513707, "learning_rate": 8.9270340431576e-08, "loss": 0.7970401, "num_input_tokens_seen": 163144385, "step": 7546, "time_per_iteration": 2.7858760356903076 }, { "auxiliary_loss_clip": 0.01163109, "auxiliary_loss_mlp": 0.01020107, "balance_loss_clip": 1.04409003, "balance_loss_mlp": 1.01319885, "epoch": 0.9074730956532195, "flos": 37852144767360.0, "grad_norm": 2.1627538405766966, "language_loss": 0.73423529, "learning_rate": 8.904035488295658e-08, "loss": 0.7560674, "num_input_tokens_seen": 163163885, "step": 7547, "time_per_iteration": 2.773219585418701 }, { "auxiliary_loss_clip": 0.01059282, "auxiliary_loss_mlp": 0.01036113, "balance_loss_clip": 1.00677073, "balance_loss_mlp": 1.00242579, "epoch": 0.9075933385438586, "flos": 65173307385600.0, "grad_norm": 0.7007767478441568, "language_loss": 0.53264403, "learning_rate": 8.881065922539632e-08, "loss": 0.55359805, "num_input_tokens_seen": 163224325, "step": 7548, "time_per_iteration": 3.261824369430542 }, { "auxiliary_loss_clip": 0.01148569, "auxiliary_loss_mlp": 0.0102435, "balance_loss_clip": 1.04529262, "balance_loss_mlp": 1.01740015, "epoch": 0.9077135814344977, "flos": 19931571290880.0, "grad_norm": 1.703551091144375, "language_loss": 0.73226476, "learning_rate": 8.85812534937389e-08, "loss": 0.75399399, "num_input_tokens_seen": 163242425, "step": 7549, "time_per_iteration": 2.6987931728363037 }, { "auxiliary_loss_clip": 0.01168222, "auxiliary_loss_mlp": 0.0102243, "balance_loss_clip": 1.04799294, "balance_loss_mlp": 1.01561117, "epoch": 0.9078338243251368, "flos": 17529650784000.0, "grad_norm": 2.891274946408509, "language_loss": 0.67740726, "learning_rate": 8.835213772278583e-08, "loss": 0.69931382, "num_input_tokens_seen": 163259280, "step": 7550, "time_per_iteration": 2.688847303390503 }, { "auxiliary_loss_clip": 0.01144967, "auxiliary_loss_mlp": 0.01018914, "balance_loss_clip": 1.04461598, "balance_loss_mlp": 1.01182747, "epoch": 0.9079540672157759, "flos": 28803410277120.0, "grad_norm": 3.2524478421719336, "language_loss": 0.79169929, "learning_rate": 8.812331194729373e-08, "loss": 0.81333816, "num_input_tokens_seen": 163278925, "step": 7551, "time_per_iteration": 2.760101318359375 }, { "auxiliary_loss_clip": 0.01171044, "auxiliary_loss_mlp": 0.01026127, "balance_loss_clip": 1.05080307, "balance_loss_mlp": 1.01868844, "epoch": 0.9080743101064149, "flos": 23513840622720.0, "grad_norm": 1.9658493708332185, "language_loss": 0.7184515, "learning_rate": 8.789477620197461e-08, "loss": 0.74042326, "num_input_tokens_seen": 163298450, "step": 7552, "time_per_iteration": 2.687601327896118 }, { "auxiliary_loss_clip": 0.01161072, "auxiliary_loss_mlp": 0.01026684, "balance_loss_clip": 1.04843998, "balance_loss_mlp": 1.01987696, "epoch": 0.9081945529970541, "flos": 22778102344320.0, "grad_norm": 2.4245904051077876, "language_loss": 0.79317874, "learning_rate": 8.766653052149831e-08, "loss": 0.81505626, "num_input_tokens_seen": 163313635, "step": 7553, "time_per_iteration": 2.713221788406372 }, { "auxiliary_loss_clip": 0.01157434, "auxiliary_loss_mlp": 0.01022921, "balance_loss_clip": 1.04551923, "balance_loss_mlp": 1.01550364, "epoch": 0.9083147958876931, "flos": 18873714821760.0, "grad_norm": 1.9079742484868385, "language_loss": 0.74708652, "learning_rate": 8.743857494048823e-08, "loss": 0.76889002, "num_input_tokens_seen": 163330450, "step": 7554, "time_per_iteration": 2.681337833404541 }, { "auxiliary_loss_clip": 0.01158047, "auxiliary_loss_mlp": 0.01023557, "balance_loss_clip": 1.04772496, "balance_loss_mlp": 1.01612711, "epoch": 0.9084350387783322, "flos": 18909374048640.0, "grad_norm": 1.9630519302467893, "language_loss": 0.62949246, "learning_rate": 8.721090949352605e-08, "loss": 0.65130848, "num_input_tokens_seen": 163346690, "step": 7555, "time_per_iteration": 2.7464792728424072 }, { "auxiliary_loss_clip": 0.01174586, "auxiliary_loss_mlp": 0.01022346, "balance_loss_clip": 1.05021548, "balance_loss_mlp": 1.01437664, "epoch": 0.9085552816689714, "flos": 20595488325120.0, "grad_norm": 1.9424788769137695, "language_loss": 0.72629571, "learning_rate": 8.698353421514793e-08, "loss": 0.74826503, "num_input_tokens_seen": 163365065, "step": 7556, "time_per_iteration": 2.6892483234405518 }, { "auxiliary_loss_clip": 0.01160541, "auxiliary_loss_mlp": 0.01024113, "balance_loss_clip": 1.04574871, "balance_loss_mlp": 1.01735663, "epoch": 0.9086755245596104, "flos": 18113163223680.0, "grad_norm": 2.0105877277709125, "language_loss": 0.8061468, "learning_rate": 8.67564491398467e-08, "loss": 0.82799333, "num_input_tokens_seen": 163382070, "step": 7557, "time_per_iteration": 2.7211170196533203 }, { "auxiliary_loss_clip": 0.01164116, "auxiliary_loss_mlp": 0.01025198, "balance_loss_clip": 1.04591966, "balance_loss_mlp": 1.01830173, "epoch": 0.9087957674502495, "flos": 19129793857920.0, "grad_norm": 1.867167449402625, "language_loss": 0.7400744, "learning_rate": 8.652965430207104e-08, "loss": 0.76196754, "num_input_tokens_seen": 163399975, "step": 7558, "time_per_iteration": 2.630600929260254 }, { "auxiliary_loss_clip": 0.01162562, "auxiliary_loss_mlp": 0.01023046, "balance_loss_clip": 1.04387546, "balance_loss_mlp": 1.01601875, "epoch": 0.9089160103408886, "flos": 18109930999680.0, "grad_norm": 1.9638760427854187, "language_loss": 0.65391123, "learning_rate": 8.630314973622521e-08, "loss": 0.67576724, "num_input_tokens_seen": 163417520, "step": 7559, "time_per_iteration": 2.7007946968078613 }, { "auxiliary_loss_clip": 0.01160545, "auxiliary_loss_mlp": 0.01022816, "balance_loss_clip": 1.04817247, "balance_loss_mlp": 1.01565123, "epoch": 0.9090362532315277, "flos": 33364855336320.0, "grad_norm": 1.8473715591958075, "language_loss": 0.70786738, "learning_rate": 8.607693547666995e-08, "loss": 0.72970092, "num_input_tokens_seen": 163440060, "step": 7560, "time_per_iteration": 3.857097864151001 }, { "auxiliary_loss_clip": 0.01060348, "auxiliary_loss_mlp": 0.01000402, "balance_loss_clip": 1.00746441, "balance_loss_mlp": 0.99934703, "epoch": 0.9091564961221668, "flos": 71480585082240.0, "grad_norm": 0.879720235528118, "language_loss": 0.57956344, "learning_rate": 8.585101155772201e-08, "loss": 0.60017097, "num_input_tokens_seen": 163502180, "step": 7561, "time_per_iteration": 3.3226215839385986 }, { "auxiliary_loss_clip": 0.01156754, "auxiliary_loss_mlp": 0.01024383, "balance_loss_clip": 1.04678011, "balance_loss_mlp": 1.0167594, "epoch": 0.9092767390128058, "flos": 24712574232960.0, "grad_norm": 1.7991795622504214, "language_loss": 0.68728185, "learning_rate": 8.562537801365377e-08, "loss": 0.70909327, "num_input_tokens_seen": 163521915, "step": 7562, "time_per_iteration": 2.752920150756836 }, { "auxiliary_loss_clip": 0.0116725, "auxiliary_loss_mlp": 0.01028032, "balance_loss_clip": 1.04789948, "balance_loss_mlp": 1.02038515, "epoch": 0.909396981903445, "flos": 23586487879680.0, "grad_norm": 1.7498552865688022, "language_loss": 0.69807839, "learning_rate": 8.540003487869362e-08, "loss": 0.72003126, "num_input_tokens_seen": 163543585, "step": 7563, "time_per_iteration": 3.6245338916778564 }, { "auxiliary_loss_clip": 0.01147555, "auxiliary_loss_mlp": 0.01021375, "balance_loss_clip": 1.04595804, "balance_loss_mlp": 1.01407933, "epoch": 0.909517224794084, "flos": 23404169422080.0, "grad_norm": 1.7205782394820721, "language_loss": 0.79538989, "learning_rate": 8.517498218702557e-08, "loss": 0.81707919, "num_input_tokens_seen": 163561515, "step": 7564, "time_per_iteration": 2.759439468383789 }, { "auxiliary_loss_clip": 0.0115182, "auxiliary_loss_mlp": 0.01024917, "balance_loss_clip": 1.04612458, "balance_loss_mlp": 1.0177021, "epoch": 0.9096374676847231, "flos": 19208618254080.0, "grad_norm": 1.6711186515799206, "language_loss": 0.69494247, "learning_rate": 8.49502199727905e-08, "loss": 0.71670979, "num_input_tokens_seen": 163579540, "step": 7565, "time_per_iteration": 3.5946364402770996 }, { "auxiliary_loss_clip": 0.01159121, "auxiliary_loss_mlp": 0.01028992, "balance_loss_clip": 1.04470015, "balance_loss_mlp": 1.02131498, "epoch": 0.9097577105753623, "flos": 33292495388160.0, "grad_norm": 2.2276545638839766, "language_loss": 0.66532886, "learning_rate": 8.472574827008428e-08, "loss": 0.68721002, "num_input_tokens_seen": 163600425, "step": 7566, "time_per_iteration": 2.735935926437378 }, { "auxiliary_loss_clip": 0.01160078, "auxiliary_loss_mlp": 0.01025332, "balance_loss_clip": 1.04460299, "balance_loss_mlp": 1.01801574, "epoch": 0.9098779534660013, "flos": 21906443001600.0, "grad_norm": 2.0070960362428427, "language_loss": 0.8419944, "learning_rate": 8.450156711295942e-08, "loss": 0.86384845, "num_input_tokens_seen": 163620595, "step": 7567, "time_per_iteration": 2.6773974895477295 }, { "auxiliary_loss_clip": 0.01156359, "auxiliary_loss_mlp": 0.0102405, "balance_loss_clip": 1.04740334, "balance_loss_mlp": 1.01721656, "epoch": 0.9099981963566404, "flos": 25730354102400.0, "grad_norm": 2.095235706508056, "language_loss": 0.86641622, "learning_rate": 8.427767653542383e-08, "loss": 0.88822019, "num_input_tokens_seen": 163635765, "step": 7568, "time_per_iteration": 2.7414627075195312 }, { "auxiliary_loss_clip": 0.01150661, "auxiliary_loss_mlp": 0.01024697, "balance_loss_clip": 1.04692197, "balance_loss_mlp": 1.01752627, "epoch": 0.9101184392472795, "flos": 21069437304960.0, "grad_norm": 2.7632777700951046, "language_loss": 0.70008194, "learning_rate": 8.405407657144125e-08, "loss": 0.72183549, "num_input_tokens_seen": 163654925, "step": 7569, "time_per_iteration": 2.7893614768981934 }, { "auxiliary_loss_clip": 0.01155567, "auxiliary_loss_mlp": 0.01023111, "balance_loss_clip": 1.04780149, "balance_loss_mlp": 1.01591384, "epoch": 0.9102386821379186, "flos": 24752614919040.0, "grad_norm": 1.8728283112182633, "language_loss": 0.7234689, "learning_rate": 8.383076725493232e-08, "loss": 0.74525571, "num_input_tokens_seen": 163672245, "step": 7570, "time_per_iteration": 2.780043840408325 }, { "auxiliary_loss_clip": 0.0116221, "auxiliary_loss_mlp": 0.01028863, "balance_loss_clip": 1.0448873, "balance_loss_mlp": 1.021281, "epoch": 0.9103589250285576, "flos": 22562818179840.0, "grad_norm": 15.772420039814202, "language_loss": 0.68112755, "learning_rate": 8.360774861977216e-08, "loss": 0.70303828, "num_input_tokens_seen": 163691365, "step": 7571, "time_per_iteration": 3.678879976272583 }, { "auxiliary_loss_clip": 0.01161332, "auxiliary_loss_mlp": 0.01025492, "balance_loss_clip": 1.04617918, "balance_loss_mlp": 1.01803589, "epoch": 0.9104791679191968, "flos": 25373474524800.0, "grad_norm": 2.2625853148906914, "language_loss": 0.74359936, "learning_rate": 8.338502069979281e-08, "loss": 0.76546758, "num_input_tokens_seen": 163711675, "step": 7572, "time_per_iteration": 2.765681743621826 }, { "auxiliary_loss_clip": 0.01165252, "auxiliary_loss_mlp": 0.01021257, "balance_loss_clip": 1.04469061, "balance_loss_mlp": 1.01386333, "epoch": 0.9105994108098359, "flos": 14426681558400.0, "grad_norm": 2.909647911547489, "language_loss": 0.80138314, "learning_rate": 8.316258352878214e-08, "loss": 0.82324827, "num_input_tokens_seen": 163728095, "step": 7573, "time_per_iteration": 2.6097867488861084 }, { "auxiliary_loss_clip": 0.01166218, "auxiliary_loss_mlp": 0.01022813, "balance_loss_clip": 1.04529786, "balance_loss_mlp": 1.01574373, "epoch": 0.9107196537004749, "flos": 26718292748160.0, "grad_norm": 1.8498312158752737, "language_loss": 0.7126199, "learning_rate": 8.294043714048338e-08, "loss": 0.73451024, "num_input_tokens_seen": 163747175, "step": 7574, "time_per_iteration": 2.7597763538360596 }, { "auxiliary_loss_clip": 0.0105848, "auxiliary_loss_mlp": 0.01001626, "balance_loss_clip": 1.0071857, "balance_loss_mlp": 1.00061274, "epoch": 0.9108398965911141, "flos": 66532634703360.0, "grad_norm": 0.7613196165306364, "language_loss": 0.60432786, "learning_rate": 8.271858156859624e-08, "loss": 0.62492889, "num_input_tokens_seen": 163812545, "step": 7575, "time_per_iteration": 3.3335647583007812 }, { "auxiliary_loss_clip": 0.01164384, "auxiliary_loss_mlp": 0.0102479, "balance_loss_clip": 1.04608536, "balance_loss_mlp": 1.0172565, "epoch": 0.9109601394817531, "flos": 25411073086080.0, "grad_norm": 1.7747242619219536, "language_loss": 0.74133086, "learning_rate": 8.249701684677557e-08, "loss": 0.76322263, "num_input_tokens_seen": 163833870, "step": 7576, "time_per_iteration": 2.724172592163086 }, { "auxiliary_loss_clip": 0.01164102, "auxiliary_loss_mlp": 0.01027501, "balance_loss_clip": 1.0490365, "balance_loss_mlp": 1.02020288, "epoch": 0.9110803823723922, "flos": 22747794243840.0, "grad_norm": 1.9052455643588246, "language_loss": 0.81242836, "learning_rate": 8.227574300863294e-08, "loss": 0.83434439, "num_input_tokens_seen": 163854040, "step": 7577, "time_per_iteration": 2.749133348464966 }, { "auxiliary_loss_clip": 0.01165697, "auxiliary_loss_mlp": 0.0102422, "balance_loss_clip": 1.04912722, "balance_loss_mlp": 1.01670957, "epoch": 0.9112006252630314, "flos": 48469924131840.0, "grad_norm": 1.9343586103094292, "language_loss": 0.69713902, "learning_rate": 8.205476008773548e-08, "loss": 0.71903813, "num_input_tokens_seen": 163878040, "step": 7578, "time_per_iteration": 2.927471876144409 }, { "auxiliary_loss_clip": 0.01150285, "auxiliary_loss_mlp": 0.01026069, "balance_loss_clip": 1.04741955, "balance_loss_mlp": 1.01891029, "epoch": 0.9113208681536704, "flos": 30009649829760.0, "grad_norm": 2.0649663533445577, "language_loss": 0.82777488, "learning_rate": 8.183406811760596e-08, "loss": 0.84953839, "num_input_tokens_seen": 163897770, "step": 7579, "time_per_iteration": 2.83383846282959 }, { "auxiliary_loss_clip": 0.01145514, "auxiliary_loss_mlp": 0.01026731, "balance_loss_clip": 1.04636037, "balance_loss_mlp": 1.01978683, "epoch": 0.9114411110443095, "flos": 25594971742080.0, "grad_norm": 1.5260519830019277, "language_loss": 0.74173808, "learning_rate": 8.161366713172313e-08, "loss": 0.76346052, "num_input_tokens_seen": 163920160, "step": 7580, "time_per_iteration": 2.754514455795288 }, { "auxiliary_loss_clip": 0.01163584, "auxiliary_loss_mlp": 0.01028642, "balance_loss_clip": 1.04909575, "balance_loss_mlp": 1.02101839, "epoch": 0.9115613539349486, "flos": 18399729928320.0, "grad_norm": 3.059247512696878, "language_loss": 0.84199286, "learning_rate": 8.139355716352137e-08, "loss": 0.86391509, "num_input_tokens_seen": 163935000, "step": 7581, "time_per_iteration": 2.7109217643737793 }, { "auxiliary_loss_clip": 0.01163294, "auxiliary_loss_mlp": 0.0102319, "balance_loss_clip": 1.04679441, "balance_loss_mlp": 1.01569223, "epoch": 0.9116815968255877, "flos": 21726171619200.0, "grad_norm": 2.243341942240972, "language_loss": 0.70215631, "learning_rate": 8.117373824639196e-08, "loss": 0.7240212, "num_input_tokens_seen": 163955265, "step": 7582, "time_per_iteration": 2.746311902999878 }, { "auxiliary_loss_clip": 0.01059476, "auxiliary_loss_mlp": 0.01002178, "balance_loss_clip": 1.00691676, "balance_loss_mlp": 1.00126016, "epoch": 0.9118018397162267, "flos": 65363526835200.0, "grad_norm": 0.7218857713593688, "language_loss": 0.5920561, "learning_rate": 8.095421041368067e-08, "loss": 0.61267269, "num_input_tokens_seen": 164014680, "step": 7583, "time_per_iteration": 3.1597251892089844 }, { "auxiliary_loss_clip": 0.01157593, "auxiliary_loss_mlp": 0.01050998, "balance_loss_clip": 1.04590857, "balance_loss_mlp": 1.01705074, "epoch": 0.9119220826068659, "flos": 20922885815040.0, "grad_norm": 1.8827208074304853, "language_loss": 0.705984, "learning_rate": 8.073497369868999e-08, "loss": 0.72806996, "num_input_tokens_seen": 164033140, "step": 7584, "time_per_iteration": 2.716709613800049 }, { "auxiliary_loss_clip": 0.01166651, "auxiliary_loss_mlp": 0.01022392, "balance_loss_clip": 1.04695427, "balance_loss_mlp": 1.01463723, "epoch": 0.912042325497505, "flos": 28366449327360.0, "grad_norm": 2.82198182703533, "language_loss": 0.75582707, "learning_rate": 8.051602813467772e-08, "loss": 0.77771747, "num_input_tokens_seen": 164054995, "step": 7585, "time_per_iteration": 2.7991435527801514 }, { "auxiliary_loss_clip": 0.01165767, "auxiliary_loss_mlp": 0.01023342, "balance_loss_clip": 1.04806149, "balance_loss_mlp": 1.01597476, "epoch": 0.912162568388144, "flos": 17566782468480.0, "grad_norm": 2.559434700945847, "language_loss": 0.71191514, "learning_rate": 8.029737375485756e-08, "loss": 0.73380625, "num_input_tokens_seen": 164074225, "step": 7586, "time_per_iteration": 3.657538890838623 }, { "auxiliary_loss_clip": 0.01166445, "auxiliary_loss_mlp": 0.01027005, "balance_loss_clip": 1.04589558, "balance_loss_mlp": 1.01982903, "epoch": 0.9122828112787832, "flos": 19827897661440.0, "grad_norm": 1.8277859815533584, "language_loss": 0.7246033, "learning_rate": 8.007901059239986e-08, "loss": 0.74653774, "num_input_tokens_seen": 164093505, "step": 7587, "time_per_iteration": 2.625513792037964 }, { "auxiliary_loss_clip": 0.01159132, "auxiliary_loss_mlp": 0.01022004, "balance_loss_clip": 1.04465437, "balance_loss_mlp": 1.01495576, "epoch": 0.9124030541694222, "flos": 20813789232000.0, "grad_norm": 1.592174561104518, "language_loss": 0.80360132, "learning_rate": 7.986093868042964e-08, "loss": 0.82541263, "num_input_tokens_seen": 164113750, "step": 7588, "time_per_iteration": 2.6770594120025635 }, { "auxiliary_loss_clip": 0.01159515, "auxiliary_loss_mlp": 0.01025661, "balance_loss_clip": 1.0448041, "balance_loss_mlp": 1.01891124, "epoch": 0.9125232970600613, "flos": 25192305302400.0, "grad_norm": 1.6642593666199341, "language_loss": 0.67941439, "learning_rate": 7.964315805202826e-08, "loss": 0.70126617, "num_input_tokens_seen": 164134330, "step": 7589, "time_per_iteration": 3.6516990661621094 }, { "auxiliary_loss_clip": 0.01158727, "auxiliary_loss_mlp": 0.01025398, "balance_loss_clip": 1.04566014, "balance_loss_mlp": 1.01817966, "epoch": 0.9126435399507005, "flos": 19719591177600.0, "grad_norm": 1.9536103158909985, "language_loss": 0.73205519, "learning_rate": 7.942566874023304e-08, "loss": 0.75389636, "num_input_tokens_seen": 164153515, "step": 7590, "time_per_iteration": 2.6349806785583496 }, { "auxiliary_loss_clip": 0.01160084, "auxiliary_loss_mlp": 0.01025615, "balance_loss_clip": 1.04730344, "balance_loss_mlp": 1.01811647, "epoch": 0.9127637828413395, "flos": 19573614305280.0, "grad_norm": 2.3902312994316857, "language_loss": 0.6982404, "learning_rate": 7.920847077803649e-08, "loss": 0.72009736, "num_input_tokens_seen": 164171305, "step": 7591, "time_per_iteration": 3.57517671585083 }, { "auxiliary_loss_clip": 0.01147182, "auxiliary_loss_mlp": 0.01028015, "balance_loss_clip": 1.04750562, "balance_loss_mlp": 1.02104163, "epoch": 0.9128840257319786, "flos": 20230635928320.0, "grad_norm": 2.3012008671605835, "language_loss": 0.82206953, "learning_rate": 7.899156419838826e-08, "loss": 0.84382153, "num_input_tokens_seen": 164190275, "step": 7592, "time_per_iteration": 2.679769277572632 }, { "auxiliary_loss_clip": 0.01155095, "auxiliary_loss_mlp": 0.01020011, "balance_loss_clip": 1.04661131, "balance_loss_mlp": 1.01296616, "epoch": 0.9130042686226177, "flos": 24858658846080.0, "grad_norm": 5.129861018285088, "language_loss": 0.65544498, "learning_rate": 7.87749490341918e-08, "loss": 0.67719603, "num_input_tokens_seen": 164210550, "step": 7593, "time_per_iteration": 2.862096071243286 }, { "auxiliary_loss_clip": 0.01168499, "auxiliary_loss_mlp": 0.01028455, "balance_loss_clip": 1.04697752, "balance_loss_mlp": 1.02073014, "epoch": 0.9131245115132568, "flos": 23581747284480.0, "grad_norm": 2.1824931075872174, "language_loss": 0.83455586, "learning_rate": 7.855862531830836e-08, "loss": 0.8565253, "num_input_tokens_seen": 164226660, "step": 7594, "time_per_iteration": 2.664755344390869 }, { "auxiliary_loss_clip": 0.011636, "auxiliary_loss_mlp": 0.01026473, "balance_loss_clip": 1.04806376, "balance_loss_mlp": 1.0190196, "epoch": 0.9132447544038959, "flos": 19931607204480.0, "grad_norm": 1.6004234047225907, "language_loss": 0.72832471, "learning_rate": 7.834259308355373e-08, "loss": 0.75022548, "num_input_tokens_seen": 164245425, "step": 7595, "time_per_iteration": 2.6431031227111816 }, { "auxiliary_loss_clip": 0.01138934, "auxiliary_loss_mlp": 0.01021885, "balance_loss_clip": 1.04683316, "balance_loss_mlp": 1.01495337, "epoch": 0.9133649972945349, "flos": 21981747864960.0, "grad_norm": 1.83340417553241, "language_loss": 0.74602211, "learning_rate": 7.812685236269989e-08, "loss": 0.76763034, "num_input_tokens_seen": 164264085, "step": 7596, "time_per_iteration": 2.8561716079711914 }, { "auxiliary_loss_clip": 0.01056606, "auxiliary_loss_mlp": 0.0099971, "balance_loss_clip": 1.00981092, "balance_loss_mlp": 0.99876225, "epoch": 0.9134852401851741, "flos": 71240523511680.0, "grad_norm": 0.7931400205672748, "language_loss": 0.58665693, "learning_rate": 7.791140318847445e-08, "loss": 0.60722011, "num_input_tokens_seen": 164322220, "step": 7597, "time_per_iteration": 3.2679975032806396 }, { "auxiliary_loss_clip": 0.01151544, "auxiliary_loss_mlp": 0.01024378, "balance_loss_clip": 1.04677236, "balance_loss_mlp": 1.01731515, "epoch": 0.9136054830758131, "flos": 23626923615360.0, "grad_norm": 1.3605946392643888, "language_loss": 0.80194724, "learning_rate": 7.769624559356081e-08, "loss": 0.82370645, "num_input_tokens_seen": 164345615, "step": 7598, "time_per_iteration": 3.649104118347168 }, { "auxiliary_loss_clip": 0.01162999, "auxiliary_loss_mlp": 0.01023707, "balance_loss_clip": 1.0471909, "balance_loss_mlp": 1.01596439, "epoch": 0.9137257259664522, "flos": 23438858981760.0, "grad_norm": 3.174451123685763, "language_loss": 0.7539953, "learning_rate": 7.748137961059842e-08, "loss": 0.7758624, "num_input_tokens_seen": 164359595, "step": 7599, "time_per_iteration": 2.6720471382141113 }, { "auxiliary_loss_clip": 0.01162817, "auxiliary_loss_mlp": 0.01021674, "balance_loss_clip": 1.04625559, "balance_loss_mlp": 1.01509976, "epoch": 0.9138459688570914, "flos": 19127854523520.0, "grad_norm": 2.4046473356210334, "language_loss": 0.65434194, "learning_rate": 7.726680527218211e-08, "loss": 0.67618686, "num_input_tokens_seen": 164376635, "step": 7600, "time_per_iteration": 2.62898588180542 }, { "auxiliary_loss_clip": 0.01163742, "auxiliary_loss_mlp": 0.01025375, "balance_loss_clip": 1.04305565, "balance_loss_mlp": 1.01792157, "epoch": 0.9139662117477304, "flos": 46281240714240.0, "grad_norm": 1.6872632445716467, "language_loss": 0.75609386, "learning_rate": 7.70525226108627e-08, "loss": 0.77798498, "num_input_tokens_seen": 164400305, "step": 7601, "time_per_iteration": 2.9454426765441895 }, { "auxiliary_loss_clip": 0.01166161, "auxiliary_loss_mlp": 0.01024302, "balance_loss_clip": 1.05018854, "balance_loss_mlp": 1.01698267, "epoch": 0.9140864546383695, "flos": 22273198819200.0, "grad_norm": 2.3676276804168523, "language_loss": 0.79776913, "learning_rate": 7.683853165914666e-08, "loss": 0.81967378, "num_input_tokens_seen": 164418075, "step": 7602, "time_per_iteration": 2.64967942237854 }, { "auxiliary_loss_clip": 0.01155909, "auxiliary_loss_mlp": 0.0102616, "balance_loss_clip": 1.04790187, "balance_loss_mlp": 1.01898408, "epoch": 0.9142066975290086, "flos": 17530009920000.0, "grad_norm": 1.6794210874029571, "language_loss": 0.7733773, "learning_rate": 7.662483244949602e-08, "loss": 0.79519796, "num_input_tokens_seen": 164435335, "step": 7603, "time_per_iteration": 2.833050489425659 }, { "auxiliary_loss_clip": 0.01147565, "auxiliary_loss_mlp": 0.01027678, "balance_loss_clip": 1.04376435, "balance_loss_mlp": 1.02018011, "epoch": 0.9143269404196477, "flos": 17712148809600.0, "grad_norm": 2.250854535248552, "language_loss": 0.80766857, "learning_rate": 7.641142501432951e-08, "loss": 0.82942098, "num_input_tokens_seen": 164451530, "step": 7604, "time_per_iteration": 2.7618567943573 }, { "auxiliary_loss_clip": 0.01156712, "auxiliary_loss_mlp": 0.01024991, "balance_loss_clip": 1.04674339, "balance_loss_mlp": 1.01775789, "epoch": 0.9144471833102867, "flos": 33323414019840.0, "grad_norm": 1.6844596441235593, "language_loss": 0.73831534, "learning_rate": 7.619830938602013e-08, "loss": 0.76013237, "num_input_tokens_seen": 164472755, "step": 7605, "time_per_iteration": 2.7434797286987305 }, { "auxiliary_loss_clip": 0.0115981, "auxiliary_loss_mlp": 0.01025978, "balance_loss_clip": 1.04875755, "balance_loss_mlp": 1.0185008, "epoch": 0.9145674262009259, "flos": 21068970428160.0, "grad_norm": 2.1196163580857807, "language_loss": 0.82877278, "learning_rate": 7.598548559689777e-08, "loss": 0.85063064, "num_input_tokens_seen": 164491155, "step": 7606, "time_per_iteration": 2.6604115962982178 }, { "auxiliary_loss_clip": 0.01150679, "auxiliary_loss_mlp": 0.01023267, "balance_loss_clip": 1.04576886, "balance_loss_mlp": 1.01603985, "epoch": 0.914687669091565, "flos": 16800269212800.0, "grad_norm": 2.1832414057064944, "language_loss": 0.81061906, "learning_rate": 7.577295367924751e-08, "loss": 0.83235854, "num_input_tokens_seen": 164507555, "step": 7607, "time_per_iteration": 2.7274632453918457 }, { "auxiliary_loss_clip": 0.01163699, "auxiliary_loss_mlp": 0.01022616, "balance_loss_clip": 1.04680276, "balance_loss_mlp": 1.01484942, "epoch": 0.914807911982204, "flos": 25773627012480.0, "grad_norm": 1.7464543232716003, "language_loss": 0.82418698, "learning_rate": 7.556071366531002e-08, "loss": 0.84605014, "num_input_tokens_seen": 164528525, "step": 7608, "time_per_iteration": 2.723698377609253 }, { "auxiliary_loss_clip": 0.01162161, "auxiliary_loss_mlp": 0.01028046, "balance_loss_clip": 1.0468452, "balance_loss_mlp": 1.02042282, "epoch": 0.9149281548728432, "flos": 19208043636480.0, "grad_norm": 1.8049601179668644, "language_loss": 0.79541409, "learning_rate": 7.53487655872822e-08, "loss": 0.81731617, "num_input_tokens_seen": 164547695, "step": 7609, "time_per_iteration": 2.6466007232666016 }, { "auxiliary_loss_clip": 0.01155592, "auxiliary_loss_mlp": 0.01020753, "balance_loss_clip": 1.04476452, "balance_loss_mlp": 1.01361203, "epoch": 0.9150483977634822, "flos": 26870554500480.0, "grad_norm": 1.7425281726966775, "language_loss": 0.73869866, "learning_rate": 7.513710947731656e-08, "loss": 0.76046205, "num_input_tokens_seen": 164568905, "step": 7610, "time_per_iteration": 2.786428928375244 }, { "auxiliary_loss_clip": 0.01156482, "auxiliary_loss_mlp": 0.01027934, "balance_loss_clip": 1.04878259, "balance_loss_mlp": 1.02084684, "epoch": 0.9151686406541213, "flos": 21908956953600.0, "grad_norm": 1.7107523576528716, "language_loss": 0.85126054, "learning_rate": 7.492574536752095e-08, "loss": 0.87310463, "num_input_tokens_seen": 164588895, "step": 7611, "time_per_iteration": 2.711966037750244 }, { "auxiliary_loss_clip": 0.0115722, "auxiliary_loss_mlp": 0.01028555, "balance_loss_clip": 1.04634571, "balance_loss_mlp": 1.0214529, "epoch": 0.9152888835447605, "flos": 27308556944640.0, "grad_norm": 2.203204623095251, "language_loss": 0.78113294, "learning_rate": 7.471467328995907e-08, "loss": 0.80299067, "num_input_tokens_seen": 164607705, "step": 7612, "time_per_iteration": 3.6509971618652344 }, { "auxiliary_loss_clip": 0.01154852, "auxiliary_loss_mlp": 0.01028528, "balance_loss_clip": 1.04787219, "balance_loss_mlp": 1.02027237, "epoch": 0.9154091264353995, "flos": 13370728510080.0, "grad_norm": 2.565936261481893, "language_loss": 0.60825753, "learning_rate": 7.450389327665018e-08, "loss": 0.63009131, "num_input_tokens_seen": 164625540, "step": 7613, "time_per_iteration": 2.696455717086792 }, { "auxiliary_loss_clip": 0.01159818, "auxiliary_loss_mlp": 0.01028232, "balance_loss_clip": 1.05016208, "balance_loss_mlp": 1.02048945, "epoch": 0.9155293693260386, "flos": 20193037367040.0, "grad_norm": 4.021882277709113, "language_loss": 0.67719233, "learning_rate": 7.429340535957029e-08, "loss": 0.69907278, "num_input_tokens_seen": 164640735, "step": 7614, "time_per_iteration": 2.705294609069824 }, { "auxiliary_loss_clip": 0.01161934, "auxiliary_loss_mlp": 0.01025448, "balance_loss_clip": 1.04795098, "balance_loss_mlp": 1.01873922, "epoch": 0.9156496122166777, "flos": 19354990176000.0, "grad_norm": 2.4694193860512623, "language_loss": 0.70717049, "learning_rate": 7.40832095706494e-08, "loss": 0.72904432, "num_input_tokens_seen": 164657430, "step": 7615, "time_per_iteration": 3.653610944747925 }, { "auxiliary_loss_clip": 0.0116105, "auxiliary_loss_mlp": 0.0102278, "balance_loss_clip": 1.04588485, "balance_loss_mlp": 1.01547813, "epoch": 0.9157698551073168, "flos": 21107287261440.0, "grad_norm": 2.4900683874709673, "language_loss": 0.80362606, "learning_rate": 7.387330594177443e-08, "loss": 0.82546437, "num_input_tokens_seen": 164679505, "step": 7616, "time_per_iteration": 2.8216302394866943 }, { "auxiliary_loss_clip": 0.01149909, "auxiliary_loss_mlp": 0.01023564, "balance_loss_clip": 1.04542994, "balance_loss_mlp": 1.01640236, "epoch": 0.9158900979979558, "flos": 25193167228800.0, "grad_norm": 1.7114794348299849, "language_loss": 0.79117501, "learning_rate": 7.366369450478749e-08, "loss": 0.81290972, "num_input_tokens_seen": 164700615, "step": 7617, "time_per_iteration": 3.6777710914611816 }, { "auxiliary_loss_clip": 0.01151755, "auxiliary_loss_mlp": 0.01021935, "balance_loss_clip": 1.04536295, "balance_loss_mlp": 1.01467824, "epoch": 0.916010340888595, "flos": 30146648302080.0, "grad_norm": 1.9638629189607806, "language_loss": 0.66550934, "learning_rate": 7.345437529148646e-08, "loss": 0.68724632, "num_input_tokens_seen": 164719625, "step": 7618, "time_per_iteration": 2.7994117736816406 }, { "auxiliary_loss_clip": 0.0115706, "auxiliary_loss_mlp": 0.01027455, "balance_loss_clip": 1.04551816, "balance_loss_mlp": 1.02028775, "epoch": 0.9161305837792341, "flos": 17091827907840.0, "grad_norm": 2.438752328988303, "language_loss": 0.72853982, "learning_rate": 7.324534833362483e-08, "loss": 0.75038499, "num_input_tokens_seen": 164737200, "step": 7619, "time_per_iteration": 2.6796112060546875 }, { "auxiliary_loss_clip": 0.01157703, "auxiliary_loss_mlp": 0.01025806, "balance_loss_clip": 1.04566562, "balance_loss_mlp": 1.0190047, "epoch": 0.9162508266698731, "flos": 22893699288960.0, "grad_norm": 1.590690489058169, "language_loss": 0.68398452, "learning_rate": 7.303661366291192e-08, "loss": 0.70581961, "num_input_tokens_seen": 164757870, "step": 7620, "time_per_iteration": 2.7780942916870117 }, { "auxiliary_loss_clip": 0.01154894, "auxiliary_loss_mlp": 0.01023491, "balance_loss_clip": 1.05008531, "balance_loss_mlp": 1.01649952, "epoch": 0.9163710695605123, "flos": 19974808287360.0, "grad_norm": 1.7973037745955236, "language_loss": 0.81546175, "learning_rate": 7.28281713110126e-08, "loss": 0.83724558, "num_input_tokens_seen": 164775945, "step": 7621, "time_per_iteration": 2.7755203247070312 }, { "auxiliary_loss_clip": 0.011529, "auxiliary_loss_mlp": 0.01027275, "balance_loss_clip": 1.04507136, "balance_loss_mlp": 1.02006316, "epoch": 0.9164913124511513, "flos": 22783812606720.0, "grad_norm": 2.0310143715976468, "language_loss": 0.77310866, "learning_rate": 7.262002130954759e-08, "loss": 0.79491043, "num_input_tokens_seen": 164794400, "step": 7622, "time_per_iteration": 2.7175045013427734 }, { "auxiliary_loss_clip": 0.0115745, "auxiliary_loss_mlp": 0.01027566, "balance_loss_clip": 1.04824495, "balance_loss_mlp": 1.02017462, "epoch": 0.9166115553417904, "flos": 24900854348160.0, "grad_norm": 1.6385850283205272, "language_loss": 0.79086304, "learning_rate": 7.241216369009296e-08, "loss": 0.81271315, "num_input_tokens_seen": 164814585, "step": 7623, "time_per_iteration": 2.742279291152954 }, { "auxiliary_loss_clip": 0.01164537, "auxiliary_loss_mlp": 0.01022292, "balance_loss_clip": 1.04489779, "balance_loss_mlp": 1.01510346, "epoch": 0.9167317982324296, "flos": 25702919089920.0, "grad_norm": 2.445885976099769, "language_loss": 0.66621089, "learning_rate": 7.220459848418037e-08, "loss": 0.68807912, "num_input_tokens_seen": 164834660, "step": 7624, "time_per_iteration": 3.5404839515686035 }, { "auxiliary_loss_clip": 0.01162344, "auxiliary_loss_mlp": 0.01026262, "balance_loss_clip": 1.0461086, "balance_loss_mlp": 1.01936018, "epoch": 0.9168520411230686, "flos": 15632813370240.0, "grad_norm": 1.8189396615261495, "language_loss": 0.79603928, "learning_rate": 7.199732572329708e-08, "loss": 0.81792533, "num_input_tokens_seen": 164852560, "step": 7625, "time_per_iteration": 2.6690056324005127 }, { "auxiliary_loss_clip": 0.01160198, "auxiliary_loss_mlp": 0.01025566, "balance_loss_clip": 1.04788685, "balance_loss_mlp": 1.01805556, "epoch": 0.9169722840137077, "flos": 30258151096320.0, "grad_norm": 2.1074510324915092, "language_loss": 0.75591713, "learning_rate": 7.179034543888684e-08, "loss": 0.77777475, "num_input_tokens_seen": 164872065, "step": 7626, "time_per_iteration": 2.7793753147125244 }, { "auxiliary_loss_clip": 0.01164828, "auxiliary_loss_mlp": 0.01023139, "balance_loss_clip": 1.04522312, "balance_loss_mlp": 1.01599562, "epoch": 0.9170925269043467, "flos": 22491643380480.0, "grad_norm": 4.262940662556682, "language_loss": 0.77456921, "learning_rate": 7.158365766234808e-08, "loss": 0.79644889, "num_input_tokens_seen": 164890915, "step": 7627, "time_per_iteration": 2.7249152660369873 }, { "auxiliary_loss_clip": 0.01151288, "auxiliary_loss_mlp": 0.01022723, "balance_loss_clip": 1.04638433, "balance_loss_mlp": 1.01471829, "epoch": 0.9172127697949859, "flos": 22893914770560.0, "grad_norm": 1.7710129938280004, "language_loss": 0.7234478, "learning_rate": 7.137726242503527e-08, "loss": 0.74518788, "num_input_tokens_seen": 164909835, "step": 7628, "time_per_iteration": 2.7475457191467285 }, { "auxiliary_loss_clip": 0.0116507, "auxiliary_loss_mlp": 0.01049171, "balance_loss_clip": 1.04937363, "balance_loss_mlp": 1.01367736, "epoch": 0.917333012685625, "flos": 17451867882240.0, "grad_norm": 2.192442817881827, "language_loss": 0.78350019, "learning_rate": 7.11711597582585e-08, "loss": 0.80564266, "num_input_tokens_seen": 164927195, "step": 7629, "time_per_iteration": 2.6648213863372803 }, { "auxiliary_loss_clip": 0.01158976, "auxiliary_loss_mlp": 0.01024271, "balance_loss_clip": 1.04594922, "balance_loss_mlp": 1.01765442, "epoch": 0.917453255576264, "flos": 14318949692160.0, "grad_norm": 1.966637050341857, "language_loss": 0.8029961, "learning_rate": 7.096534969328271e-08, "loss": 0.82482857, "num_input_tokens_seen": 164944640, "step": 7630, "time_per_iteration": 2.6929383277893066 }, { "auxiliary_loss_clip": 0.0116193, "auxiliary_loss_mlp": 0.01027948, "balance_loss_clip": 1.044765, "balance_loss_mlp": 1.0210489, "epoch": 0.9175734984669032, "flos": 20741177888640.0, "grad_norm": 1.964940435269979, "language_loss": 0.84110332, "learning_rate": 7.075983226132987e-08, "loss": 0.86300212, "num_input_tokens_seen": 164963570, "step": 7631, "time_per_iteration": 2.7089595794677734 }, { "auxiliary_loss_clip": 0.01164652, "auxiliary_loss_mlp": 0.01059619, "balance_loss_clip": 1.04750896, "balance_loss_mlp": 1.02409601, "epoch": 0.9176937413575422, "flos": 14830497233280.0, "grad_norm": 4.4164771439659605, "language_loss": 0.79032034, "learning_rate": 7.055460749357656e-08, "loss": 0.81256306, "num_input_tokens_seen": 164979850, "step": 7632, "time_per_iteration": 2.675693988800049 }, { "auxiliary_loss_clip": 0.01158922, "auxiliary_loss_mlp": 0.01025146, "balance_loss_clip": 1.04714191, "balance_loss_mlp": 1.01760006, "epoch": 0.9178139842481813, "flos": 18474603828480.0, "grad_norm": 1.6774169948441715, "language_loss": 0.70385098, "learning_rate": 7.034967542115521e-08, "loss": 0.72569168, "num_input_tokens_seen": 164998115, "step": 7633, "time_per_iteration": 2.74955415725708 }, { "auxiliary_loss_clip": 0.01153485, "auxiliary_loss_mlp": 0.01053379, "balance_loss_clip": 1.04546654, "balance_loss_mlp": 1.01733851, "epoch": 0.9179342271388204, "flos": 20047455544320.0, "grad_norm": 2.2475882820912934, "language_loss": 0.75082695, "learning_rate": 7.014503607515388e-08, "loss": 0.77289557, "num_input_tokens_seen": 165017420, "step": 7634, "time_per_iteration": 2.6664469242095947 }, { "auxiliary_loss_clip": 0.01157651, "auxiliary_loss_mlp": 0.01022206, "balance_loss_clip": 1.04686749, "balance_loss_mlp": 1.01533961, "epoch": 0.9180544700294595, "flos": 24676232647680.0, "grad_norm": 2.110775270509874, "language_loss": 0.67927986, "learning_rate": 6.994068948661592e-08, "loss": 0.70107841, "num_input_tokens_seen": 165035575, "step": 7635, "time_per_iteration": 2.7281715869903564 }, { "auxiliary_loss_clip": 0.01163109, "auxiliary_loss_mlp": 0.01027736, "balance_loss_clip": 1.04741716, "balance_loss_mlp": 1.01949859, "epoch": 0.9181747129200986, "flos": 16727478301440.0, "grad_norm": 2.1824862166674253, "language_loss": 0.76693207, "learning_rate": 6.973663568654142e-08, "loss": 0.78884053, "num_input_tokens_seen": 165053280, "step": 7636, "time_per_iteration": 2.684828519821167 }, { "auxiliary_loss_clip": 0.01164928, "auxiliary_loss_mlp": 0.01024174, "balance_loss_clip": 1.04652071, "balance_loss_mlp": 1.01689661, "epoch": 0.9182949558107377, "flos": 24271626873600.0, "grad_norm": 2.0180153860105263, "language_loss": 0.65895081, "learning_rate": 6.953287470588386e-08, "loss": 0.68084192, "num_input_tokens_seen": 165071235, "step": 7637, "time_per_iteration": 2.6673362255096436 }, { "auxiliary_loss_clip": 0.0116591, "auxiliary_loss_mlp": 0.01026865, "balance_loss_clip": 1.04589891, "balance_loss_mlp": 1.01937246, "epoch": 0.9184151987013768, "flos": 22082117443200.0, "grad_norm": 2.0974433982942347, "language_loss": 0.85655868, "learning_rate": 6.932940657555452e-08, "loss": 0.87848639, "num_input_tokens_seen": 165087365, "step": 7638, "time_per_iteration": 3.62211012840271 }, { "auxiliary_loss_clip": 0.01158211, "auxiliary_loss_mlp": 0.01021376, "balance_loss_clip": 1.04312313, "balance_loss_mlp": 1.01458693, "epoch": 0.9185354415920158, "flos": 32166732257280.0, "grad_norm": 2.8755238674355574, "language_loss": 0.76320964, "learning_rate": 6.912623132641938e-08, "loss": 0.78500551, "num_input_tokens_seen": 165112455, "step": 7639, "time_per_iteration": 2.8162221908569336 }, { "auxiliary_loss_clip": 0.01162549, "auxiliary_loss_mlp": 0.01027208, "balance_loss_clip": 1.04724574, "balance_loss_mlp": 1.01973093, "epoch": 0.918655684482655, "flos": 20997831542400.0, "grad_norm": 1.8913950167876301, "language_loss": 0.76871657, "learning_rate": 6.892334898929952e-08, "loss": 0.79061419, "num_input_tokens_seen": 165132700, "step": 7640, "time_per_iteration": 3.7166929244995117 }, { "auxiliary_loss_clip": 0.01158143, "auxiliary_loss_mlp": 0.01025767, "balance_loss_clip": 1.04648757, "balance_loss_mlp": 1.018543, "epoch": 0.918775927373294, "flos": 15560704817280.0, "grad_norm": 1.8684291259433374, "language_loss": 0.84726077, "learning_rate": 6.872075959497236e-08, "loss": 0.86909992, "num_input_tokens_seen": 165151475, "step": 7641, "time_per_iteration": 2.6564362049102783 }, { "auxiliary_loss_clip": 0.01164595, "auxiliary_loss_mlp": 0.01022844, "balance_loss_clip": 1.04561245, "balance_loss_mlp": 1.01558399, "epoch": 0.9188961702639331, "flos": 29934057657600.0, "grad_norm": 1.9104853820827687, "language_loss": 0.82533395, "learning_rate": 6.85184631741702e-08, "loss": 0.84720832, "num_input_tokens_seen": 165172040, "step": 7642, "time_per_iteration": 2.69547700881958 }, { "auxiliary_loss_clip": 0.01159062, "auxiliary_loss_mlp": 0.01029415, "balance_loss_clip": 1.04407978, "balance_loss_mlp": 1.02195549, "epoch": 0.9190164131545723, "flos": 20701244943360.0, "grad_norm": 2.186676947796964, "language_loss": 0.77023488, "learning_rate": 6.831645975758161e-08, "loss": 0.79211962, "num_input_tokens_seen": 165189980, "step": 7643, "time_per_iteration": 3.674971103668213 }, { "auxiliary_loss_clip": 0.01154403, "auxiliary_loss_mlp": 0.01027301, "balance_loss_clip": 1.04776382, "balance_loss_mlp": 1.01946867, "epoch": 0.9191366560452113, "flos": 25629912696960.0, "grad_norm": 2.0294935602650006, "language_loss": 0.67596471, "learning_rate": 6.811474937585026e-08, "loss": 0.69778174, "num_input_tokens_seen": 165209770, "step": 7644, "time_per_iteration": 2.642223834991455 }, { "auxiliary_loss_clip": 0.01154879, "auxiliary_loss_mlp": 0.01021329, "balance_loss_clip": 1.04849827, "balance_loss_mlp": 1.0148021, "epoch": 0.9192568989358504, "flos": 21434325615360.0, "grad_norm": 1.902247798972464, "language_loss": 0.79198492, "learning_rate": 6.79133320595755e-08, "loss": 0.81374705, "num_input_tokens_seen": 165229690, "step": 7645, "time_per_iteration": 2.732020378112793 }, { "auxiliary_loss_clip": 0.01160372, "auxiliary_loss_mlp": 0.01021642, "balance_loss_clip": 1.04620135, "balance_loss_mlp": 1.01431668, "epoch": 0.9193771418264896, "flos": 23185078416000.0, "grad_norm": 1.875060790190191, "language_loss": 0.75109816, "learning_rate": 6.771220783931198e-08, "loss": 0.77291834, "num_input_tokens_seen": 165249850, "step": 7646, "time_per_iteration": 2.7119672298431396 }, { "auxiliary_loss_clip": 0.01063306, "auxiliary_loss_mlp": 0.01035649, "balance_loss_clip": 1.01987004, "balance_loss_mlp": 1.00278497, "epoch": 0.9194973847171286, "flos": 70582963184640.0, "grad_norm": 0.8520510349537617, "language_loss": 0.64592171, "learning_rate": 6.751137674556994e-08, "loss": 0.66691124, "num_input_tokens_seen": 165310235, "step": 7647, "time_per_iteration": 3.419908285140991 }, { "auxiliary_loss_clip": 0.01163771, "auxiliary_loss_mlp": 0.01027771, "balance_loss_clip": 1.04436707, "balance_loss_mlp": 1.02010226, "epoch": 0.9196176276077677, "flos": 14720682378240.0, "grad_norm": 1.979874334863797, "language_loss": 0.77228516, "learning_rate": 6.731083880881572e-08, "loss": 0.79420054, "num_input_tokens_seen": 165326455, "step": 7648, "time_per_iteration": 2.7755420207977295 }, { "auxiliary_loss_clip": 0.01158907, "auxiliary_loss_mlp": 0.01026046, "balance_loss_clip": 1.04606509, "balance_loss_mlp": 1.01889086, "epoch": 0.9197378704984068, "flos": 23294893271040.0, "grad_norm": 2.1305031718909486, "language_loss": 0.81400806, "learning_rate": 6.711059405947072e-08, "loss": 0.83585757, "num_input_tokens_seen": 165344645, "step": 7649, "time_per_iteration": 2.7140448093414307 }, { "auxiliary_loss_clip": 0.01149355, "auxiliary_loss_mlp": 0.01020914, "balance_loss_clip": 1.04512191, "balance_loss_mlp": 1.01393437, "epoch": 0.9198581133890459, "flos": 20302564913280.0, "grad_norm": 1.730965703614865, "language_loss": 0.76909924, "learning_rate": 6.691064252791156e-08, "loss": 0.79080194, "num_input_tokens_seen": 165364120, "step": 7650, "time_per_iteration": 3.62434983253479 }, { "auxiliary_loss_clip": 0.01144811, "auxiliary_loss_mlp": 0.01024105, "balance_loss_clip": 1.04394341, "balance_loss_mlp": 1.01649988, "epoch": 0.9199783562796849, "flos": 17675663569920.0, "grad_norm": 1.6918738237037818, "language_loss": 0.7802406, "learning_rate": 6.67109842444713e-08, "loss": 0.80192971, "num_input_tokens_seen": 165383050, "step": 7651, "time_per_iteration": 2.7349956035614014 }, { "auxiliary_loss_clip": 0.01157312, "auxiliary_loss_mlp": 0.01051325, "balance_loss_clip": 1.04564905, "balance_loss_mlp": 1.01702011, "epoch": 0.9200985991703241, "flos": 17676022705920.0, "grad_norm": 1.9012622986727343, "language_loss": 0.76854569, "learning_rate": 6.651161923943704e-08, "loss": 0.79063207, "num_input_tokens_seen": 165400955, "step": 7652, "time_per_iteration": 2.6555588245391846 }, { "auxiliary_loss_clip": 0.01158494, "auxiliary_loss_mlp": 0.01029313, "balance_loss_clip": 1.04633152, "balance_loss_mlp": 1.02157044, "epoch": 0.9202188420609632, "flos": 20996574566400.0, "grad_norm": 2.0766813249577427, "language_loss": 0.76873857, "learning_rate": 6.631254754305326e-08, "loss": 0.79061663, "num_input_tokens_seen": 165420415, "step": 7653, "time_per_iteration": 2.7643795013427734 }, { "auxiliary_loss_clip": 0.01165207, "auxiliary_loss_mlp": 0.01024428, "balance_loss_clip": 1.04495692, "balance_loss_mlp": 1.01667917, "epoch": 0.9203390849516022, "flos": 13918222586880.0, "grad_norm": 2.8452431139270797, "language_loss": 0.78292513, "learning_rate": 6.611376918551848e-08, "loss": 0.80482143, "num_input_tokens_seen": 165439200, "step": 7654, "time_per_iteration": 2.675814390182495 }, { "auxiliary_loss_clip": 0.01155236, "auxiliary_loss_mlp": 0.0105004, "balance_loss_clip": 1.04669189, "balance_loss_mlp": 1.01481557, "epoch": 0.9204593278422414, "flos": 21175912195200.0, "grad_norm": 5.711840017817243, "language_loss": 0.79422504, "learning_rate": 6.591528419698744e-08, "loss": 0.81627786, "num_input_tokens_seen": 165458985, "step": 7655, "time_per_iteration": 2.811122179031372 }, { "auxiliary_loss_clip": 0.01160547, "auxiliary_loss_mlp": 0.01022801, "balance_loss_clip": 1.04426575, "balance_loss_mlp": 1.01582742, "epoch": 0.9205795707328804, "flos": 14501375890560.0, "grad_norm": 2.356620863648492, "language_loss": 0.83416188, "learning_rate": 6.571709260756986e-08, "loss": 0.85599536, "num_input_tokens_seen": 165475630, "step": 7656, "time_per_iteration": 2.5832552909851074 }, { "auxiliary_loss_clip": 0.0116336, "auxiliary_loss_mlp": 0.01029916, "balance_loss_clip": 1.04817033, "balance_loss_mlp": 1.02271879, "epoch": 0.9206998136235195, "flos": 22417559579520.0, "grad_norm": 3.0974133294630213, "language_loss": 0.76419228, "learning_rate": 6.551919444733122e-08, "loss": 0.78612506, "num_input_tokens_seen": 165493445, "step": 7657, "time_per_iteration": 2.696312189102173 }, { "auxiliary_loss_clip": 0.0115797, "auxiliary_loss_mlp": 0.01023393, "balance_loss_clip": 1.04674506, "balance_loss_mlp": 1.01601088, "epoch": 0.9208200565141585, "flos": 53358407544960.0, "grad_norm": 1.8509306338340552, "language_loss": 0.66128308, "learning_rate": 6.53215897462931e-08, "loss": 0.68309671, "num_input_tokens_seen": 165517200, "step": 7658, "time_per_iteration": 3.022111415863037 }, { "auxiliary_loss_clip": 0.01162161, "auxiliary_loss_mlp": 0.01021075, "balance_loss_clip": 1.04761982, "balance_loss_mlp": 1.01408315, "epoch": 0.9209402994047977, "flos": 30589139946240.0, "grad_norm": 3.174885733379253, "language_loss": 0.74926782, "learning_rate": 6.512427853443103e-08, "loss": 0.77110016, "num_input_tokens_seen": 165539280, "step": 7659, "time_per_iteration": 2.7258734703063965 }, { "auxiliary_loss_clip": 0.01165007, "auxiliary_loss_mlp": 0.01029644, "balance_loss_clip": 1.04726982, "balance_loss_mlp": 1.02208614, "epoch": 0.9210605422954368, "flos": 29132711187840.0, "grad_norm": 1.5320261540630928, "language_loss": 0.7567724, "learning_rate": 6.492726084167799e-08, "loss": 0.77871895, "num_input_tokens_seen": 165561395, "step": 7660, "time_per_iteration": 2.7422351837158203 }, { "auxiliary_loss_clip": 0.01059406, "auxiliary_loss_mlp": 0.01001927, "balance_loss_clip": 1.00683188, "balance_loss_mlp": 1.00099075, "epoch": 0.9211807851860758, "flos": 54853838472960.0, "grad_norm": 0.7795983780025333, "language_loss": 0.57479906, "learning_rate": 6.473053669792072e-08, "loss": 0.59541237, "num_input_tokens_seen": 165616085, "step": 7661, "time_per_iteration": 3.1021342277526855 }, { "auxiliary_loss_clip": 0.01163779, "auxiliary_loss_mlp": 0.01023617, "balance_loss_clip": 1.04814017, "balance_loss_mlp": 1.01599312, "epoch": 0.921301028076715, "flos": 19201974238080.0, "grad_norm": 2.1536691258593694, "language_loss": 0.73356205, "learning_rate": 6.453410613300248e-08, "loss": 0.75543594, "num_input_tokens_seen": 165634015, "step": 7662, "time_per_iteration": 2.7197425365448 }, { "auxiliary_loss_clip": 0.01151632, "auxiliary_loss_mlp": 0.01029215, "balance_loss_clip": 1.04781508, "balance_loss_mlp": 1.02236986, "epoch": 0.921421270967354, "flos": 27526893765120.0, "grad_norm": 1.6852724424672023, "language_loss": 0.58235103, "learning_rate": 6.43379691767214e-08, "loss": 0.60415947, "num_input_tokens_seen": 165653220, "step": 7663, "time_per_iteration": 2.8337368965148926 }, { "auxiliary_loss_clip": 0.01062165, "auxiliary_loss_mlp": 0.01000303, "balance_loss_clip": 1.00997305, "balance_loss_mlp": 0.99936163, "epoch": 0.9215415138579931, "flos": 70209311955840.0, "grad_norm": 0.7243078152704728, "language_loss": 0.55141896, "learning_rate": 6.414212585883105e-08, "loss": 0.57204366, "num_input_tokens_seen": 165715850, "step": 7664, "time_per_iteration": 4.422542333602905 }, { "auxiliary_loss_clip": 0.01161867, "auxiliary_loss_mlp": 0.0102043, "balance_loss_clip": 1.047436, "balance_loss_mlp": 1.01288962, "epoch": 0.9216617567486323, "flos": 35553107790720.0, "grad_norm": 1.5009945625849075, "language_loss": 0.70011163, "learning_rate": 6.394657620904143e-08, "loss": 0.72193456, "num_input_tokens_seen": 165738960, "step": 7665, "time_per_iteration": 2.848473072052002 }, { "auxiliary_loss_clip": 0.0116837, "auxiliary_loss_mlp": 0.0102419, "balance_loss_clip": 1.04800642, "balance_loss_mlp": 1.01690066, "epoch": 0.9217819996392713, "flos": 29533330552320.0, "grad_norm": 1.6430420651065192, "language_loss": 0.71756661, "learning_rate": 6.375132025701657e-08, "loss": 0.7394923, "num_input_tokens_seen": 165761260, "step": 7666, "time_per_iteration": 2.6382834911346436 }, { "auxiliary_loss_clip": 0.01170208, "auxiliary_loss_mlp": 0.01027986, "balance_loss_clip": 1.04968357, "balance_loss_mlp": 1.02094328, "epoch": 0.9219022425299104, "flos": 14574669592320.0, "grad_norm": 2.6816488622527475, "language_loss": 0.69739151, "learning_rate": 6.355635803237724e-08, "loss": 0.71937346, "num_input_tokens_seen": 165776960, "step": 7667, "time_per_iteration": 3.5951240062713623 }, { "auxiliary_loss_clip": 0.01162376, "auxiliary_loss_mlp": 0.01031402, "balance_loss_clip": 1.0472672, "balance_loss_mlp": 1.02408552, "epoch": 0.9220224854205495, "flos": 18077503996800.0, "grad_norm": 2.543229391265913, "language_loss": 0.79958355, "learning_rate": 6.336168956469867e-08, "loss": 0.82152134, "num_input_tokens_seen": 165795435, "step": 7668, "time_per_iteration": 2.692430019378662 }, { "auxiliary_loss_clip": 0.01154109, "auxiliary_loss_mlp": 0.01027241, "balance_loss_clip": 1.04826117, "balance_loss_mlp": 1.02044892, "epoch": 0.9221427283111886, "flos": 24790464875520.0, "grad_norm": 1.6916044607463963, "language_loss": 0.72106099, "learning_rate": 6.316731488351168e-08, "loss": 0.7428745, "num_input_tokens_seen": 165816625, "step": 7669, "time_per_iteration": 3.6685707569122314 }, { "auxiliary_loss_clip": 0.01161166, "auxiliary_loss_mlp": 0.01026115, "balance_loss_clip": 1.04555428, "balance_loss_mlp": 1.0186795, "epoch": 0.9222629712018277, "flos": 13845036625920.0, "grad_norm": 2.272241985812172, "language_loss": 0.63357568, "learning_rate": 6.297323401830334e-08, "loss": 0.65544856, "num_input_tokens_seen": 165835410, "step": 7670, "time_per_iteration": 2.672896146774292 }, { "auxiliary_loss_clip": 0.01163564, "auxiliary_loss_mlp": 0.01026662, "balance_loss_clip": 1.044523, "balance_loss_mlp": 1.0190624, "epoch": 0.9223832140924668, "flos": 21616177196160.0, "grad_norm": 2.5481300704389205, "language_loss": 0.69169444, "learning_rate": 6.277944699851523e-08, "loss": 0.7135967, "num_input_tokens_seen": 165854930, "step": 7671, "time_per_iteration": 2.6778416633605957 }, { "auxiliary_loss_clip": 0.01165294, "auxiliary_loss_mlp": 0.01028439, "balance_loss_clip": 1.04554367, "balance_loss_mlp": 1.02098811, "epoch": 0.9225034569831059, "flos": 21142084561920.0, "grad_norm": 1.9676766969806854, "language_loss": 0.73303258, "learning_rate": 6.25859538535447e-08, "loss": 0.75496995, "num_input_tokens_seen": 165875725, "step": 7672, "time_per_iteration": 2.736398935317993 }, { "auxiliary_loss_clip": 0.01160068, "auxiliary_loss_mlp": 0.01025923, "balance_loss_clip": 1.04653311, "balance_loss_mlp": 1.01864481, "epoch": 0.9226236998737449, "flos": 12495046844160.0, "grad_norm": 3.0750334438863347, "language_loss": 0.77456486, "learning_rate": 6.239275461274474e-08, "loss": 0.79642475, "num_input_tokens_seen": 165892100, "step": 7673, "time_per_iteration": 2.712744951248169 }, { "auxiliary_loss_clip": 0.01162978, "auxiliary_loss_mlp": 0.01028633, "balance_loss_clip": 1.04723525, "balance_loss_mlp": 1.02153707, "epoch": 0.9227439427643841, "flos": 26214071581440.0, "grad_norm": 1.9361633845963593, "language_loss": 0.85917842, "learning_rate": 6.219984930542299e-08, "loss": 0.88109457, "num_input_tokens_seen": 165912840, "step": 7674, "time_per_iteration": 2.717331647872925 }, { "auxiliary_loss_clip": 0.01167676, "auxiliary_loss_mlp": 0.01021363, "balance_loss_clip": 1.04894435, "balance_loss_mlp": 1.01350749, "epoch": 0.9228641856550232, "flos": 17967581400960.0, "grad_norm": 1.942736328170593, "language_loss": 0.75926828, "learning_rate": 6.200723796084383e-08, "loss": 0.78115875, "num_input_tokens_seen": 165930935, "step": 7675, "time_per_iteration": 2.638571262359619 }, { "auxiliary_loss_clip": 0.01061495, "auxiliary_loss_mlp": 0.01005505, "balance_loss_clip": 1.00739002, "balance_loss_mlp": 1.00439632, "epoch": 0.9229844285456622, "flos": 70420609710720.0, "grad_norm": 0.8030240560085509, "language_loss": 0.63070714, "learning_rate": 6.181492060822546e-08, "loss": 0.65137714, "num_input_tokens_seen": 165991110, "step": 7676, "time_per_iteration": 4.088911533355713 }, { "auxiliary_loss_clip": 0.01151341, "auxiliary_loss_mlp": 0.01027381, "balance_loss_clip": 1.04619205, "balance_loss_mlp": 1.02036536, "epoch": 0.9231046714363014, "flos": 17967832796160.0, "grad_norm": 2.3332282577644206, "language_loss": 0.81870067, "learning_rate": 6.162289727674274e-08, "loss": 0.84048784, "num_input_tokens_seen": 166008790, "step": 7677, "time_per_iteration": 2.7592275142669678 }, { "auxiliary_loss_clip": 0.01155109, "auxiliary_loss_mlp": 0.01023939, "balance_loss_clip": 1.04501319, "balance_loss_mlp": 1.01726913, "epoch": 0.9232249143269404, "flos": 17858233422720.0, "grad_norm": 2.4511661354440593, "language_loss": 0.87688434, "learning_rate": 6.143116799552527e-08, "loss": 0.89867485, "num_input_tokens_seen": 166025035, "step": 7678, "time_per_iteration": 2.6629295349121094 }, { "auxiliary_loss_clip": 0.01163847, "auxiliary_loss_mlp": 0.01027628, "balance_loss_clip": 1.0459137, "balance_loss_mlp": 1.02048779, "epoch": 0.9233451572175795, "flos": 23404384903680.0, "grad_norm": 9.829229702476393, "language_loss": 0.56576228, "learning_rate": 6.123973279365802e-08, "loss": 0.58767712, "num_input_tokens_seen": 166044010, "step": 7679, "time_per_iteration": 2.704592704772949 }, { "auxiliary_loss_clip": 0.01167338, "auxiliary_loss_mlp": 0.0102853, "balance_loss_clip": 1.04868996, "balance_loss_mlp": 1.02142823, "epoch": 0.9234654001082186, "flos": 17999326045440.0, "grad_norm": 2.3186337391278498, "language_loss": 0.77513987, "learning_rate": 6.10485917001824e-08, "loss": 0.79709858, "num_input_tokens_seen": 166061865, "step": 7680, "time_per_iteration": 2.6591005325317383 }, { "auxiliary_loss_clip": 0.01163487, "auxiliary_loss_mlp": 0.01027738, "balance_loss_clip": 1.04606485, "balance_loss_mlp": 1.02088976, "epoch": 0.9235856429988577, "flos": 24750747411840.0, "grad_norm": 2.1406087078741525, "language_loss": 0.80739075, "learning_rate": 6.085774474409322e-08, "loss": 0.82930297, "num_input_tokens_seen": 166082425, "step": 7681, "time_per_iteration": 2.7795355319976807 }, { "auxiliary_loss_clip": 0.01158553, "auxiliary_loss_mlp": 0.01027688, "balance_loss_clip": 1.04730344, "balance_loss_mlp": 1.02039242, "epoch": 0.9237058858894968, "flos": 14099894599680.0, "grad_norm": 2.2702913811512793, "language_loss": 0.7038151, "learning_rate": 6.066719195434267e-08, "loss": 0.72567755, "num_input_tokens_seen": 166100225, "step": 7682, "time_per_iteration": 2.775380849838257 }, { "auxiliary_loss_clip": 0.01166297, "auxiliary_loss_mlp": 0.01026461, "balance_loss_clip": 1.04762363, "balance_loss_mlp": 1.01825988, "epoch": 0.9238261287801359, "flos": 28694529175680.0, "grad_norm": 2.377220126217277, "language_loss": 0.66363442, "learning_rate": 6.047693335983717e-08, "loss": 0.68556201, "num_input_tokens_seen": 166122570, "step": 7683, "time_per_iteration": 2.8578860759735107 }, { "auxiliary_loss_clip": 0.01164091, "auxiliary_loss_mlp": 0.01024308, "balance_loss_clip": 1.04538751, "balance_loss_mlp": 1.01663387, "epoch": 0.923946371670775, "flos": 23111856541440.0, "grad_norm": 2.7064805659426305, "language_loss": 0.82539618, "learning_rate": 6.028696898943853e-08, "loss": 0.84728014, "num_input_tokens_seen": 166141630, "step": 7684, "time_per_iteration": 2.804868221282959 }, { "auxiliary_loss_clip": 0.01160092, "auxiliary_loss_mlp": 0.01057842, "balance_loss_clip": 1.04641688, "balance_loss_mlp": 1.02153242, "epoch": 0.924066614561414, "flos": 21867120587520.0, "grad_norm": 1.9640524489753295, "language_loss": 0.70905292, "learning_rate": 6.00972988719648e-08, "loss": 0.73123223, "num_input_tokens_seen": 166159865, "step": 7685, "time_per_iteration": 2.665088415145874 }, { "auxiliary_loss_clip": 0.01159809, "auxiliary_loss_mlp": 0.01054662, "balance_loss_clip": 1.04712224, "balance_loss_mlp": 1.018242, "epoch": 0.9241868574520532, "flos": 28511887495680.0, "grad_norm": 2.3596296670648704, "language_loss": 0.70703709, "learning_rate": 5.990792303618807e-08, "loss": 0.72918183, "num_input_tokens_seen": 166179445, "step": 7686, "time_per_iteration": 2.7963576316833496 }, { "auxiliary_loss_clip": 0.0115303, "auxiliary_loss_mlp": 0.01020348, "balance_loss_clip": 1.04475534, "balance_loss_mlp": 1.0132457, "epoch": 0.9243071003426923, "flos": 30518324282880.0, "grad_norm": 1.5760024781929391, "language_loss": 0.69336152, "learning_rate": 5.971884151083695e-08, "loss": 0.71509528, "num_input_tokens_seen": 166201855, "step": 7687, "time_per_iteration": 2.7367656230926514 }, { "auxiliary_loss_clip": 0.01165603, "auxiliary_loss_mlp": 0.01024658, "balance_loss_clip": 1.04950261, "balance_loss_mlp": 1.01764238, "epoch": 0.9244273432333313, "flos": 28658331244800.0, "grad_norm": 1.8542013339249364, "language_loss": 0.74495697, "learning_rate": 5.9530054324595124e-08, "loss": 0.76685953, "num_input_tokens_seen": 166221970, "step": 7688, "time_per_iteration": 2.8574419021606445 }, { "auxiliary_loss_clip": 0.01056224, "auxiliary_loss_mlp": 0.0103421, "balance_loss_clip": 1.00797749, "balance_loss_mlp": 1.00077343, "epoch": 0.9245475861239704, "flos": 66230589237120.0, "grad_norm": 0.7344184515927906, "language_loss": 0.57555056, "learning_rate": 5.934156150610103e-08, "loss": 0.59645486, "num_input_tokens_seen": 166279335, "step": 7689, "time_per_iteration": 3.357329845428467 }, { "auxiliary_loss_clip": 0.01157418, "auxiliary_loss_mlp": 0.01023927, "balance_loss_clip": 1.04593444, "balance_loss_mlp": 1.01535022, "epoch": 0.9246678290146095, "flos": 24239918142720.0, "grad_norm": 2.144459479817309, "language_loss": 0.79277992, "learning_rate": 5.915336308394914e-08, "loss": 0.81459337, "num_input_tokens_seen": 166298170, "step": 7690, "time_per_iteration": 3.689954996109009 }, { "auxiliary_loss_clip": 0.01157465, "auxiliary_loss_mlp": 0.01028146, "balance_loss_clip": 1.04446483, "balance_loss_mlp": 1.02140784, "epoch": 0.9247880719052486, "flos": 18988808976000.0, "grad_norm": 1.5841097780687643, "language_loss": 0.77089769, "learning_rate": 5.89654590866886e-08, "loss": 0.79275382, "num_input_tokens_seen": 166317670, "step": 7691, "time_per_iteration": 2.6778042316436768 }, { "auxiliary_loss_clip": 0.0115313, "auxiliary_loss_mlp": 0.01025181, "balance_loss_clip": 1.04808867, "balance_loss_mlp": 1.01766539, "epoch": 0.9249083147958876, "flos": 24024095274240.0, "grad_norm": 2.1802151543436716, "language_loss": 0.88471347, "learning_rate": 5.877784954282483e-08, "loss": 0.90649652, "num_input_tokens_seen": 166337010, "step": 7692, "time_per_iteration": 2.7924485206604004 }, { "auxiliary_loss_clip": 0.01164356, "auxiliary_loss_mlp": 0.01024759, "balance_loss_clip": 1.0455488, "balance_loss_mlp": 1.01725745, "epoch": 0.9250285576865268, "flos": 30773972355840.0, "grad_norm": 1.8648507804359953, "language_loss": 0.72558093, "learning_rate": 5.8590534480817963e-08, "loss": 0.74747211, "num_input_tokens_seen": 166358735, "step": 7693, "time_per_iteration": 3.697273015975952 }, { "auxiliary_loss_clip": 0.01166655, "auxiliary_loss_mlp": 0.01025369, "balance_loss_clip": 1.04808021, "balance_loss_mlp": 1.01759672, "epoch": 0.9251488005771659, "flos": 10633581348480.0, "grad_norm": 2.115733805153659, "language_loss": 0.72326934, "learning_rate": 5.840351392908349e-08, "loss": 0.74518955, "num_input_tokens_seen": 166374455, "step": 7694, "time_per_iteration": 2.7463958263397217 }, { "auxiliary_loss_clip": 0.01170221, "auxiliary_loss_mlp": 0.01055552, "balance_loss_clip": 1.04929805, "balance_loss_mlp": 1.01972151, "epoch": 0.9252690434678049, "flos": 23586416052480.0, "grad_norm": 2.258142555199332, "language_loss": 0.70560676, "learning_rate": 5.821678791599205e-08, "loss": 0.7278645, "num_input_tokens_seen": 166393900, "step": 7695, "time_per_iteration": 3.7007813453674316 }, { "auxiliary_loss_clip": 0.01159866, "auxiliary_loss_mlp": 0.01027288, "balance_loss_clip": 1.04814458, "balance_loss_mlp": 1.01992345, "epoch": 0.9253892863584441, "flos": 21469158829440.0, "grad_norm": 2.752695388105728, "language_loss": 0.80702603, "learning_rate": 5.803035646986965e-08, "loss": 0.82889754, "num_input_tokens_seen": 166413235, "step": 7696, "time_per_iteration": 2.7264161109924316 }, { "auxiliary_loss_clip": 0.01166538, "auxiliary_loss_mlp": 0.0102771, "balance_loss_clip": 1.0457294, "balance_loss_mlp": 1.02009296, "epoch": 0.9255095292490831, "flos": 17456680304640.0, "grad_norm": 3.316925952476924, "language_loss": 0.67536157, "learning_rate": 5.7844219618998766e-08, "loss": 0.69730407, "num_input_tokens_seen": 166427560, "step": 7697, "time_per_iteration": 2.6484267711639404 }, { "auxiliary_loss_clip": 0.01146492, "auxiliary_loss_mlp": 0.01025947, "balance_loss_clip": 1.04589391, "balance_loss_mlp": 1.01897931, "epoch": 0.9256297721397222, "flos": 24750675584640.0, "grad_norm": 2.069609710950503, "language_loss": 0.71846116, "learning_rate": 5.765837739161505e-08, "loss": 0.74018556, "num_input_tokens_seen": 166446680, "step": 7698, "time_per_iteration": 2.788817882537842 }, { "auxiliary_loss_clip": 0.01158718, "auxiliary_loss_mlp": 0.01023752, "balance_loss_clip": 1.04748678, "balance_loss_mlp": 1.01653123, "epoch": 0.9257500150303614, "flos": 23112215677440.0, "grad_norm": 1.7436875381586299, "language_loss": 0.74533403, "learning_rate": 5.7472829815911504e-08, "loss": 0.76715875, "num_input_tokens_seen": 166465505, "step": 7699, "time_per_iteration": 2.7740674018859863 }, { "auxiliary_loss_clip": 0.01154782, "auxiliary_loss_mlp": 0.01023671, "balance_loss_clip": 1.04588819, "balance_loss_mlp": 1.01633954, "epoch": 0.9258702579210004, "flos": 22564685687040.0, "grad_norm": 2.487885271571924, "language_loss": 0.81643957, "learning_rate": 5.7287576920035164e-08, "loss": 0.83822411, "num_input_tokens_seen": 166484520, "step": 7700, "time_per_iteration": 2.692495822906494 }, { "auxiliary_loss_clip": 0.0115162, "auxiliary_loss_mlp": 0.01020896, "balance_loss_clip": 1.04518104, "balance_loss_mlp": 1.01395774, "epoch": 0.9259905008116395, "flos": 30004298703360.0, "grad_norm": 1.837878463507158, "language_loss": 0.76575786, "learning_rate": 5.7102618732088435e-08, "loss": 0.7874831, "num_input_tokens_seen": 166503850, "step": 7701, "time_per_iteration": 3.6900949478149414 }, { "auxiliary_loss_clip": 0.01166395, "auxiliary_loss_mlp": 0.01023516, "balance_loss_clip": 1.0482583, "balance_loss_mlp": 1.01636958, "epoch": 0.9261107437022786, "flos": 24572128055040.0, "grad_norm": 1.6742805590519096, "language_loss": 0.74919885, "learning_rate": 5.6917955280130216e-08, "loss": 0.7710979, "num_input_tokens_seen": 166525330, "step": 7702, "time_per_iteration": 2.6775574684143066 }, { "auxiliary_loss_clip": 0.01158991, "auxiliary_loss_mlp": 0.0102479, "balance_loss_clip": 1.04682112, "balance_loss_mlp": 1.01748204, "epoch": 0.9262309865929177, "flos": 22018448586240.0, "grad_norm": 2.6155545754720473, "language_loss": 0.7181977, "learning_rate": 5.6733586592172755e-08, "loss": 0.74003553, "num_input_tokens_seen": 166544825, "step": 7703, "time_per_iteration": 2.7316672801971436 }, { "auxiliary_loss_clip": 0.01155004, "auxiliary_loss_mlp": 0.01050015, "balance_loss_clip": 1.04460621, "balance_loss_mlp": 1.0155139, "epoch": 0.9263512294835567, "flos": 20339481116160.0, "grad_norm": 1.7101177166164885, "language_loss": 0.79590094, "learning_rate": 5.6549512696185244e-08, "loss": 0.8179512, "num_input_tokens_seen": 166563325, "step": 7704, "time_per_iteration": 2.7719297409057617 }, { "auxiliary_loss_clip": 0.01163801, "auxiliary_loss_mlp": 0.01026504, "balance_loss_clip": 1.04708946, "balance_loss_mlp": 1.01924968, "epoch": 0.9264714723741959, "flos": 21215378263680.0, "grad_norm": 1.671377880896909, "language_loss": 0.6804654, "learning_rate": 5.636573362009156e-08, "loss": 0.7023685, "num_input_tokens_seen": 166583385, "step": 7705, "time_per_iteration": 2.7811944484710693 }, { "auxiliary_loss_clip": 0.01167893, "auxiliary_loss_mlp": 0.01026874, "balance_loss_clip": 1.04671383, "balance_loss_mlp": 1.01933718, "epoch": 0.926591715264835, "flos": 18004964480640.0, "grad_norm": 2.9035286889080147, "language_loss": 0.76967204, "learning_rate": 5.618224939177074e-08, "loss": 0.79161966, "num_input_tokens_seen": 166601290, "step": 7706, "time_per_iteration": 2.720966339111328 }, { "auxiliary_loss_clip": 0.01147902, "auxiliary_loss_mlp": 0.01026609, "balance_loss_clip": 1.04513705, "balance_loss_mlp": 1.01905751, "epoch": 0.926711958155474, "flos": 36167969825280.0, "grad_norm": 1.8901532048821894, "language_loss": 0.70281011, "learning_rate": 5.599906003905719e-08, "loss": 0.72455525, "num_input_tokens_seen": 166623835, "step": 7707, "time_per_iteration": 2.846055507659912 }, { "auxiliary_loss_clip": 0.0115416, "auxiliary_loss_mlp": 0.01023345, "balance_loss_clip": 1.04550934, "balance_loss_mlp": 1.01631141, "epoch": 0.9268322010461132, "flos": 21032736583680.0, "grad_norm": 2.2081018962789893, "language_loss": 0.81457621, "learning_rate": 5.581616558974023e-08, "loss": 0.83635134, "num_input_tokens_seen": 166642400, "step": 7708, "time_per_iteration": 2.765209197998047 }, { "auxiliary_loss_clip": 0.01169373, "auxiliary_loss_mlp": 0.01051815, "balance_loss_clip": 1.04856062, "balance_loss_mlp": 1.01660097, "epoch": 0.9269524439367522, "flos": 22964838174720.0, "grad_norm": 1.8144511961813223, "language_loss": 0.79224437, "learning_rate": 5.5633566071565444e-08, "loss": 0.81445628, "num_input_tokens_seen": 166661640, "step": 7709, "time_per_iteration": 2.895923376083374 }, { "auxiliary_loss_clip": 0.01152654, "auxiliary_loss_mlp": 0.01022674, "balance_loss_clip": 1.04581356, "balance_loss_mlp": 1.01585829, "epoch": 0.9270726868273913, "flos": 41975551468800.0, "grad_norm": 2.0612899878123043, "language_loss": 0.70666528, "learning_rate": 5.5451261512232896e-08, "loss": 0.72841859, "num_input_tokens_seen": 166684320, "step": 7710, "time_per_iteration": 3.0430617332458496 }, { "auxiliary_loss_clip": 0.01165599, "auxiliary_loss_mlp": 0.01025566, "balance_loss_clip": 1.04369557, "balance_loss_mlp": 1.01721501, "epoch": 0.9271929297180305, "flos": 19791771557760.0, "grad_norm": 2.0246332781376357, "language_loss": 0.62514937, "learning_rate": 5.5269251939397576e-08, "loss": 0.64706105, "num_input_tokens_seen": 166703835, "step": 7711, "time_per_iteration": 2.6965842247009277 }, { "auxiliary_loss_clip": 0.0115699, "auxiliary_loss_mlp": 0.01027507, "balance_loss_clip": 1.04421782, "balance_loss_mlp": 1.0199368, "epoch": 0.9273131726086695, "flos": 19968343839360.0, "grad_norm": 2.362982149596971, "language_loss": 0.76450861, "learning_rate": 5.508753738067073e-08, "loss": 0.78635353, "num_input_tokens_seen": 166723375, "step": 7712, "time_per_iteration": 2.754206895828247 }, { "auxiliary_loss_clip": 0.0116492, "auxiliary_loss_mlp": 0.01019791, "balance_loss_clip": 1.04559672, "balance_loss_mlp": 1.01266551, "epoch": 0.9274334154993086, "flos": 23258587599360.0, "grad_norm": 2.1936568333284665, "language_loss": 0.79396021, "learning_rate": 5.4906117863617875e-08, "loss": 0.81580734, "num_input_tokens_seen": 166742760, "step": 7713, "time_per_iteration": 2.7012243270874023 }, { "auxiliary_loss_clip": 0.01151953, "auxiliary_loss_mlp": 0.0101945, "balance_loss_clip": 1.0440371, "balance_loss_mlp": 1.0118978, "epoch": 0.9275536583899477, "flos": 31795343585280.0, "grad_norm": 2.303009443117207, "language_loss": 0.77945274, "learning_rate": 5.4724993415760533e-08, "loss": 0.80116677, "num_input_tokens_seen": 166761115, "step": 7714, "time_per_iteration": 2.778249502182007 }, { "auxiliary_loss_clip": 0.01166623, "auxiliary_loss_mlp": 0.01056369, "balance_loss_clip": 1.04835439, "balance_loss_mlp": 1.01924086, "epoch": 0.9276739012805868, "flos": 18696998885760.0, "grad_norm": 3.546256904470547, "language_loss": 0.74195093, "learning_rate": 5.454416406457496e-08, "loss": 0.7641809, "num_input_tokens_seen": 166780210, "step": 7715, "time_per_iteration": 2.835278272628784 }, { "auxiliary_loss_clip": 0.01161372, "auxiliary_loss_mlp": 0.01029928, "balance_loss_clip": 1.04621553, "balance_loss_mlp": 1.02261424, "epoch": 0.9277941441712259, "flos": 13879079740800.0, "grad_norm": 3.061508803808682, "language_loss": 0.74165595, "learning_rate": 5.436362983749299e-08, "loss": 0.76356894, "num_input_tokens_seen": 166795380, "step": 7716, "time_per_iteration": 2.762834310531616 }, { "auxiliary_loss_clip": 0.01149466, "auxiliary_loss_mlp": 0.01021298, "balance_loss_clip": 1.04626656, "balance_loss_mlp": 1.01439595, "epoch": 0.927914387061865, "flos": 23258659426560.0, "grad_norm": 2.8959224579922265, "language_loss": 0.64708817, "learning_rate": 5.418339076190137e-08, "loss": 0.66879582, "num_input_tokens_seen": 166814890, "step": 7717, "time_per_iteration": 3.707702875137329 }, { "auxiliary_loss_clip": 0.01154449, "auxiliary_loss_mlp": 0.01021289, "balance_loss_clip": 1.04894185, "balance_loss_mlp": 1.01368308, "epoch": 0.9280346299525041, "flos": 18073733068800.0, "grad_norm": 1.8473254107288013, "language_loss": 0.88790751, "learning_rate": 5.400344686514202e-08, "loss": 0.90966499, "num_input_tokens_seen": 166832475, "step": 7718, "time_per_iteration": 3.6329405307769775 }, { "auxiliary_loss_clip": 0.01161496, "auxiliary_loss_mlp": 0.01020458, "balance_loss_clip": 1.04788983, "balance_loss_mlp": 1.01351404, "epoch": 0.9281548728431431, "flos": 22342901160960.0, "grad_norm": 3.7574039599930664, "language_loss": 0.66361284, "learning_rate": 5.38237981745131e-08, "loss": 0.68543243, "num_input_tokens_seen": 166850590, "step": 7719, "time_per_iteration": 2.710501194000244 }, { "auxiliary_loss_clip": 0.01165086, "auxiliary_loss_mlp": 0.01055443, "balance_loss_clip": 1.04746699, "balance_loss_mlp": 1.01819372, "epoch": 0.9282751157337822, "flos": 18843765857280.0, "grad_norm": 2.302237833535728, "language_loss": 0.81256032, "learning_rate": 5.364444471726592e-08, "loss": 0.83476561, "num_input_tokens_seen": 166869795, "step": 7720, "time_per_iteration": 2.7411956787109375 }, { "auxiliary_loss_clip": 0.01161376, "auxiliary_loss_mlp": 0.01020559, "balance_loss_clip": 1.04634786, "balance_loss_mlp": 1.0134722, "epoch": 0.9283953586244214, "flos": 25556834476800.0, "grad_norm": 3.335484167732486, "language_loss": 0.80123955, "learning_rate": 5.346538652060939e-08, "loss": 0.8230589, "num_input_tokens_seen": 166891150, "step": 7721, "time_per_iteration": 3.75345516204834 }, { "auxiliary_loss_clip": 0.01156687, "auxiliary_loss_mlp": 0.01030339, "balance_loss_clip": 1.0471009, "balance_loss_mlp": 1.02354383, "epoch": 0.9285156015150604, "flos": 18223480869120.0, "grad_norm": 1.7424203099832587, "language_loss": 0.70334023, "learning_rate": 5.3286623611705994e-08, "loss": 0.72521043, "num_input_tokens_seen": 166909195, "step": 7722, "time_per_iteration": 2.750068187713623 }, { "auxiliary_loss_clip": 0.01059138, "auxiliary_loss_mlp": 0.01002141, "balance_loss_clip": 1.00671518, "balance_loss_mlp": 1.00118113, "epoch": 0.9286358444056995, "flos": 66400017690240.0, "grad_norm": 0.8339420316038444, "language_loss": 0.60611886, "learning_rate": 5.3108156017673824e-08, "loss": 0.62673163, "num_input_tokens_seen": 166970955, "step": 7723, "time_per_iteration": 3.356328010559082 }, { "auxiliary_loss_clip": 0.0116674, "auxiliary_loss_mlp": 0.01027731, "balance_loss_clip": 1.04733658, "balance_loss_mlp": 1.02008379, "epoch": 0.9287560872963386, "flos": 22345630594560.0, "grad_norm": 1.7992796099515838, "language_loss": 0.71673715, "learning_rate": 5.2929983765586775e-08, "loss": 0.73868185, "num_input_tokens_seen": 166989735, "step": 7724, "time_per_iteration": 2.673311710357666 }, { "auxiliary_loss_clip": 0.01164325, "auxiliary_loss_mlp": 0.01028037, "balance_loss_clip": 1.04726171, "balance_loss_mlp": 1.02101898, "epoch": 0.9288763301869777, "flos": 25700225569920.0, "grad_norm": 3.2370672672767227, "language_loss": 0.6281426, "learning_rate": 5.275210688247278e-08, "loss": 0.65006626, "num_input_tokens_seen": 167010060, "step": 7725, "time_per_iteration": 2.71014666557312 }, { "auxiliary_loss_clip": 0.01152833, "auxiliary_loss_mlp": 0.01021197, "balance_loss_clip": 1.04655719, "balance_loss_mlp": 1.01411021, "epoch": 0.9289965730776167, "flos": 12312046028160.0, "grad_norm": 1.8382517656092499, "language_loss": 0.85348797, "learning_rate": 5.257452539531604e-08, "loss": 0.87522817, "num_input_tokens_seen": 167027130, "step": 7726, "time_per_iteration": 2.8258519172668457 }, { "auxiliary_loss_clip": 0.01163721, "auxiliary_loss_mlp": 0.01025713, "balance_loss_clip": 1.04609489, "balance_loss_mlp": 1.01877785, "epoch": 0.9291168159682559, "flos": 26685973486080.0, "grad_norm": 1.61429909735368, "language_loss": 0.68582165, "learning_rate": 5.2397239331055445e-08, "loss": 0.70771599, "num_input_tokens_seen": 167049130, "step": 7727, "time_per_iteration": 2.7325899600982666 }, { "auxiliary_loss_clip": 0.01156125, "auxiliary_loss_mlp": 0.01024432, "balance_loss_clip": 1.04684341, "balance_loss_mlp": 1.01677608, "epoch": 0.929237058858895, "flos": 14538256179840.0, "grad_norm": 4.819568747152911, "language_loss": 0.81205082, "learning_rate": 5.2220248716585036e-08, "loss": 0.8338564, "num_input_tokens_seen": 167066810, "step": 7728, "time_per_iteration": 3.6251025199890137 }, { "auxiliary_loss_clip": 0.01155882, "auxiliary_loss_mlp": 0.01027225, "balance_loss_clip": 1.04461956, "balance_loss_mlp": 1.01992989, "epoch": 0.929357301749534, "flos": 23835456023040.0, "grad_norm": 2.442540712156563, "language_loss": 0.75251001, "learning_rate": 5.204355357875445e-08, "loss": 0.77434111, "num_input_tokens_seen": 167085155, "step": 7729, "time_per_iteration": 2.682879686355591 }, { "auxiliary_loss_clip": 0.01159464, "auxiliary_loss_mlp": 0.0102663, "balance_loss_clip": 1.04737425, "balance_loss_mlp": 1.01900017, "epoch": 0.9294775446401732, "flos": 12969319046400.0, "grad_norm": 2.5387243289956998, "language_loss": 0.70608616, "learning_rate": 5.1867153944367584e-08, "loss": 0.72794706, "num_input_tokens_seen": 167101545, "step": 7730, "time_per_iteration": 2.715437173843384 }, { "auxiliary_loss_clip": 0.01162525, "auxiliary_loss_mlp": 0.01025441, "balance_loss_clip": 1.04734159, "balance_loss_mlp": 1.01766217, "epoch": 0.9295977875308122, "flos": 26211809024640.0, "grad_norm": 1.5181260261039045, "language_loss": 0.73426151, "learning_rate": 5.16910498401848e-08, "loss": 0.75614119, "num_input_tokens_seen": 167120995, "step": 7731, "time_per_iteration": 2.7643349170684814 }, { "auxiliary_loss_clip": 0.01163006, "auxiliary_loss_mlp": 0.01028423, "balance_loss_clip": 1.04526329, "balance_loss_mlp": 1.02137721, "epoch": 0.9297180304214513, "flos": 16472297105280.0, "grad_norm": 2.0447687382551107, "language_loss": 0.83518422, "learning_rate": 5.151524129292073e-08, "loss": 0.85709846, "num_input_tokens_seen": 167138890, "step": 7732, "time_per_iteration": 2.6761794090270996 }, { "auxiliary_loss_clip": 0.01160024, "auxiliary_loss_mlp": 0.01029045, "balance_loss_clip": 1.04592514, "balance_loss_mlp": 1.02200294, "epoch": 0.9298382733120905, "flos": 24060436859520.0, "grad_norm": 2.4885951725234454, "language_loss": 0.66791075, "learning_rate": 5.1339728329245155e-08, "loss": 0.68980145, "num_input_tokens_seen": 167159455, "step": 7733, "time_per_iteration": 2.8982443809509277 }, { "auxiliary_loss_clip": 0.01168799, "auxiliary_loss_mlp": 0.01024511, "balance_loss_clip": 1.04735208, "balance_loss_mlp": 1.01695311, "epoch": 0.9299585162027295, "flos": 22127652910080.0, "grad_norm": 2.1131763817117895, "language_loss": 0.79713941, "learning_rate": 5.116451097578367e-08, "loss": 0.81907248, "num_input_tokens_seen": 167178495, "step": 7734, "time_per_iteration": 2.6965558528900146 }, { "auxiliary_loss_clip": 0.01154659, "auxiliary_loss_mlp": 0.01025945, "balance_loss_clip": 1.04711413, "balance_loss_mlp": 1.01895356, "epoch": 0.9300787590933686, "flos": 21471780522240.0, "grad_norm": 1.7499957996605202, "language_loss": 0.74318337, "learning_rate": 5.0989589259115895e-08, "loss": 0.7649895, "num_input_tokens_seen": 167199380, "step": 7735, "time_per_iteration": 2.7233481407165527 }, { "auxiliary_loss_clip": 0.01160811, "auxiliary_loss_mlp": 0.01022961, "balance_loss_clip": 1.04614961, "balance_loss_mlp": 1.01476586, "epoch": 0.9301990019840077, "flos": 17779588594560.0, "grad_norm": 2.519901246919091, "language_loss": 0.71671206, "learning_rate": 5.081496320577816e-08, "loss": 0.73854977, "num_input_tokens_seen": 167216500, "step": 7736, "time_per_iteration": 2.6244492530822754 }, { "auxiliary_loss_clip": 0.01060562, "auxiliary_loss_mlp": 0.01003398, "balance_loss_clip": 1.01336503, "balance_loss_mlp": 1.00248575, "epoch": 0.9303192448746468, "flos": 58896122307840.0, "grad_norm": 0.9124792662444448, "language_loss": 0.6119737, "learning_rate": 5.0640632842260835e-08, "loss": 0.6326133, "num_input_tokens_seen": 167276760, "step": 7737, "time_per_iteration": 3.357956886291504 }, { "auxiliary_loss_clip": 0.01155411, "auxiliary_loss_mlp": 0.01057698, "balance_loss_clip": 1.04871905, "balance_loss_mlp": 1.02136517, "epoch": 0.9304394877652858, "flos": 57663522172800.0, "grad_norm": 1.791782861387344, "language_loss": 0.72513509, "learning_rate": 5.0466598195009426e-08, "loss": 0.74726617, "num_input_tokens_seen": 167303630, "step": 7738, "time_per_iteration": 3.0300793647766113 }, { "auxiliary_loss_clip": 0.01156924, "auxiliary_loss_mlp": 0.01019591, "balance_loss_clip": 1.04537249, "balance_loss_mlp": 1.01221752, "epoch": 0.930559730655925, "flos": 20996143603200.0, "grad_norm": 1.900367532429542, "language_loss": 0.70381236, "learning_rate": 5.0292859290425036e-08, "loss": 0.72557747, "num_input_tokens_seen": 167321500, "step": 7739, "time_per_iteration": 2.6846303939819336 }, { "auxiliary_loss_clip": 0.0116394, "auxiliary_loss_mlp": 0.01021094, "balance_loss_clip": 1.04690647, "balance_loss_mlp": 1.01465631, "epoch": 0.9306799735465641, "flos": 23258264376960.0, "grad_norm": 2.4357707065846994, "language_loss": 0.77319205, "learning_rate": 5.011941615486348e-08, "loss": 0.7950424, "num_input_tokens_seen": 167340615, "step": 7740, "time_per_iteration": 2.722482204437256 }, { "auxiliary_loss_clip": 0.01163884, "auxiliary_loss_mlp": 0.01022565, "balance_loss_clip": 1.04558921, "balance_loss_mlp": 1.01579344, "epoch": 0.9308002164372031, "flos": 15231547560960.0, "grad_norm": 2.988696060336324, "language_loss": 0.84852362, "learning_rate": 4.994626881463659e-08, "loss": 0.87038815, "num_input_tokens_seen": 167356870, "step": 7741, "time_per_iteration": 2.656198024749756 }, { "auxiliary_loss_clip": 0.0114295, "auxiliary_loss_mlp": 0.01024102, "balance_loss_clip": 1.04636681, "balance_loss_mlp": 1.01653218, "epoch": 0.9309204593278423, "flos": 30847481539200.0, "grad_norm": 1.868414540937962, "language_loss": 0.71169943, "learning_rate": 4.9773417296009814e-08, "loss": 0.73336995, "num_input_tokens_seen": 167378390, "step": 7742, "time_per_iteration": 2.820258378982544 }, { "auxiliary_loss_clip": 0.01169831, "auxiliary_loss_mlp": 0.01025754, "balance_loss_clip": 1.04933441, "balance_loss_mlp": 1.0179038, "epoch": 0.9310407022184813, "flos": 23037269950080.0, "grad_norm": 1.8838729996935417, "language_loss": 0.65268171, "learning_rate": 4.960086162520527e-08, "loss": 0.67463756, "num_input_tokens_seen": 167398480, "step": 7743, "time_per_iteration": 3.7487618923187256 }, { "auxiliary_loss_clip": 0.01157869, "auxiliary_loss_mlp": 0.01029998, "balance_loss_clip": 1.04638016, "balance_loss_mlp": 1.02238023, "epoch": 0.9311609451091204, "flos": 22127976132480.0, "grad_norm": 1.9090974824087585, "language_loss": 0.82347482, "learning_rate": 4.942860182839936e-08, "loss": 0.84535348, "num_input_tokens_seen": 167416825, "step": 7744, "time_per_iteration": 3.7672228813171387 }, { "auxiliary_loss_clip": 0.01160356, "auxiliary_loss_mlp": 0.01030524, "balance_loss_clip": 1.04784727, "balance_loss_mlp": 1.0224359, "epoch": 0.9312811879997596, "flos": 21099206701440.0, "grad_norm": 1.9338804687111129, "language_loss": 0.79569286, "learning_rate": 4.925663793172341e-08, "loss": 0.81760168, "num_input_tokens_seen": 167434785, "step": 7745, "time_per_iteration": 2.7330853939056396 }, { "auxiliary_loss_clip": 0.01053865, "auxiliary_loss_mlp": 0.01032965, "balance_loss_clip": 1.00760031, "balance_loss_mlp": 0.99958318, "epoch": 0.9314014308903986, "flos": 67148179096320.0, "grad_norm": 0.7836264423928138, "language_loss": 0.56514639, "learning_rate": 4.908496996126477e-08, "loss": 0.58601469, "num_input_tokens_seen": 167498245, "step": 7746, "time_per_iteration": 3.2526848316192627 }, { "auxiliary_loss_clip": 0.01161886, "auxiliary_loss_mlp": 0.01022172, "balance_loss_clip": 1.04965484, "balance_loss_mlp": 1.01519489, "epoch": 0.9315216737810377, "flos": 22565583527040.0, "grad_norm": 1.8314159719147844, "language_loss": 0.76477462, "learning_rate": 4.89135979430646e-08, "loss": 0.78661525, "num_input_tokens_seen": 167518290, "step": 7747, "time_per_iteration": 3.723726749420166 }, { "auxiliary_loss_clip": 0.01165516, "auxiliary_loss_mlp": 0.01025897, "balance_loss_clip": 1.04650772, "balance_loss_mlp": 1.01845288, "epoch": 0.9316419166716768, "flos": 23984054588160.0, "grad_norm": 1.8473714877548637, "language_loss": 0.85499275, "learning_rate": 4.874252190312078e-08, "loss": 0.87690687, "num_input_tokens_seen": 167538675, "step": 7748, "time_per_iteration": 2.7077906131744385 }, { "auxiliary_loss_clip": 0.01167331, "auxiliary_loss_mlp": 0.01024916, "balance_loss_clip": 1.04744244, "balance_loss_mlp": 1.01773059, "epoch": 0.9317621595623159, "flos": 30230464688640.0, "grad_norm": 1.5133432786511924, "language_loss": 0.64809102, "learning_rate": 4.857174186738477e-08, "loss": 0.67001349, "num_input_tokens_seen": 167562025, "step": 7749, "time_per_iteration": 2.7287757396698 }, { "auxiliary_loss_clip": 0.01167946, "auxiliary_loss_mlp": 0.01024675, "balance_loss_clip": 1.04882669, "balance_loss_mlp": 1.01721287, "epoch": 0.931882402452955, "flos": 15742735966080.0, "grad_norm": 2.5738830710933316, "language_loss": 0.73428714, "learning_rate": 4.840125786176408e-08, "loss": 0.75621337, "num_input_tokens_seen": 167578230, "step": 7750, "time_per_iteration": 2.6062536239624023 }, { "auxiliary_loss_clip": 0.01156791, "auxiliary_loss_mlp": 0.01028311, "balance_loss_clip": 1.0474, "balance_loss_mlp": 1.02124226, "epoch": 0.932002645343594, "flos": 28366521154560.0, "grad_norm": 2.10449765904225, "language_loss": 0.77452546, "learning_rate": 4.823106991212067e-08, "loss": 0.79637647, "num_input_tokens_seen": 167597470, "step": 7751, "time_per_iteration": 2.704082489013672 }, { "auxiliary_loss_clip": 0.01161293, "auxiliary_loss_mlp": 0.01024126, "balance_loss_clip": 1.04405951, "balance_loss_mlp": 1.01738739, "epoch": 0.9321228882342332, "flos": 15341146934400.0, "grad_norm": 2.479123258030319, "language_loss": 0.83373022, "learning_rate": 4.806117804427212e-08, "loss": 0.85558438, "num_input_tokens_seen": 167615405, "step": 7752, "time_per_iteration": 2.683931589126587 }, { "auxiliary_loss_clip": 0.01162116, "auxiliary_loss_mlp": 0.01025708, "balance_loss_clip": 1.0488224, "balance_loss_mlp": 1.01814163, "epoch": 0.9322431311248722, "flos": 17895365107200.0, "grad_norm": 1.8913824706711548, "language_loss": 0.64506626, "learning_rate": 4.7891582283990926e-08, "loss": 0.6669445, "num_input_tokens_seen": 167634130, "step": 7753, "time_per_iteration": 2.626962184906006 }, { "auxiliary_loss_clip": 0.01155536, "auxiliary_loss_mlp": 0.01025182, "balance_loss_clip": 1.04571891, "balance_loss_mlp": 1.01826179, "epoch": 0.9323633740155113, "flos": 24169713010560.0, "grad_norm": 2.5570938449275356, "language_loss": 0.72727752, "learning_rate": 4.772228265700473e-08, "loss": 0.74908471, "num_input_tokens_seen": 167654990, "step": 7754, "time_per_iteration": 3.6201891899108887 }, { "auxiliary_loss_clip": 0.01164955, "auxiliary_loss_mlp": 0.01027524, "balance_loss_clip": 1.04750693, "balance_loss_mlp": 1.01997459, "epoch": 0.9324836169061504, "flos": 15043482927360.0, "grad_norm": 2.1281942017768416, "language_loss": 0.75924957, "learning_rate": 4.75532791889961e-08, "loss": 0.7811743, "num_input_tokens_seen": 167671690, "step": 7755, "time_per_iteration": 2.5792076587677 }, { "auxiliary_loss_clip": 0.01161099, "auxiliary_loss_mlp": 0.01023917, "balance_loss_clip": 1.0453043, "balance_loss_mlp": 1.01689517, "epoch": 0.9326038597967895, "flos": 18624890332800.0, "grad_norm": 1.7896468379376156, "language_loss": 0.65643823, "learning_rate": 4.738457190560252e-08, "loss": 0.67828834, "num_input_tokens_seen": 167690800, "step": 7756, "time_per_iteration": 2.6877388954162598 }, { "auxiliary_loss_clip": 0.01151408, "auxiliary_loss_mlp": 0.0102722, "balance_loss_clip": 1.04711282, "balance_loss_mlp": 1.02027321, "epoch": 0.9327241026874286, "flos": 18952646958720.0, "grad_norm": 5.625343228792317, "language_loss": 0.79064637, "learning_rate": 4.721616083241664e-08, "loss": 0.81243265, "num_input_tokens_seen": 167709055, "step": 7757, "time_per_iteration": 2.676502227783203 }, { "auxiliary_loss_clip": 0.01158115, "auxiliary_loss_mlp": 0.01025299, "balance_loss_clip": 1.04653978, "balance_loss_mlp": 1.01769984, "epoch": 0.9328443455780677, "flos": 29570282668800.0, "grad_norm": 2.2272849167181703, "language_loss": 0.77939892, "learning_rate": 4.7048045994986684e-08, "loss": 0.80123305, "num_input_tokens_seen": 167729915, "step": 7758, "time_per_iteration": 2.766972541809082 }, { "auxiliary_loss_clip": 0.01168824, "auxiliary_loss_mlp": 0.01020844, "balance_loss_clip": 1.04901493, "balance_loss_mlp": 1.01393306, "epoch": 0.9329645884687068, "flos": 30081722469120.0, "grad_norm": 3.5237670279688453, "language_loss": 0.91031301, "learning_rate": 4.688022741881559e-08, "loss": 0.93220973, "num_input_tokens_seen": 167750440, "step": 7759, "time_per_iteration": 2.8454089164733887 }, { "auxiliary_loss_clip": 0.0115869, "auxiliary_loss_mlp": 0.01026179, "balance_loss_clip": 1.04501092, "balance_loss_mlp": 1.01935482, "epoch": 0.9330848313593458, "flos": 21867982513920.0, "grad_norm": 1.6198123674747111, "language_loss": 0.75280321, "learning_rate": 4.671270512936076e-08, "loss": 0.77465189, "num_input_tokens_seen": 167769600, "step": 7760, "time_per_iteration": 2.715816020965576 }, { "auxiliary_loss_clip": 0.01152401, "auxiliary_loss_mlp": 0.01026677, "balance_loss_clip": 1.04691219, "balance_loss_mlp": 1.01939368, "epoch": 0.933205074249985, "flos": 22127221946880.0, "grad_norm": 1.81798480863118, "language_loss": 0.82828164, "learning_rate": 4.6545479152035884e-08, "loss": 0.85007244, "num_input_tokens_seen": 167788770, "step": 7761, "time_per_iteration": 2.7238311767578125 }, { "auxiliary_loss_clip": 0.01161628, "auxiliary_loss_mlp": 0.01022374, "balance_loss_clip": 1.04682016, "balance_loss_mlp": 1.01543617, "epoch": 0.9333253171406241, "flos": 15341254675200.0, "grad_norm": 1.8923084934415881, "language_loss": 0.75854504, "learning_rate": 4.637854951220821e-08, "loss": 0.78038508, "num_input_tokens_seen": 167805555, "step": 7762, "time_per_iteration": 2.6113758087158203 }, { "auxiliary_loss_clip": 0.01150829, "auxiliary_loss_mlp": 0.0102339, "balance_loss_clip": 1.04542303, "balance_loss_mlp": 1.01633859, "epoch": 0.9334455600312631, "flos": 15706142985600.0, "grad_norm": 1.9526623325374075, "language_loss": 0.75007379, "learning_rate": 4.621191623520171e-08, "loss": 0.7718159, "num_input_tokens_seen": 167823985, "step": 7763, "time_per_iteration": 2.7293941974639893 }, { "auxiliary_loss_clip": 0.01163577, "auxiliary_loss_mlp": 0.0102367, "balance_loss_clip": 1.0468359, "balance_loss_mlp": 1.01677084, "epoch": 0.9335658029219023, "flos": 22163563532160.0, "grad_norm": 2.215149216065871, "language_loss": 0.84583795, "learning_rate": 4.604557934629372e-08, "loss": 0.86771041, "num_input_tokens_seen": 167843060, "step": 7764, "time_per_iteration": 2.6874241828918457 }, { "auxiliary_loss_clip": 0.01152923, "auxiliary_loss_mlp": 0.01022047, "balance_loss_clip": 1.04630458, "balance_loss_mlp": 1.01509964, "epoch": 0.9336860458125413, "flos": 20266833859200.0, "grad_norm": 12.378171619891583, "language_loss": 0.80276561, "learning_rate": 4.587953887071805e-08, "loss": 0.82451528, "num_input_tokens_seen": 167862880, "step": 7765, "time_per_iteration": 2.6895291805267334 }, { "auxiliary_loss_clip": 0.01154701, "auxiliary_loss_mlp": 0.01024348, "balance_loss_clip": 1.0444541, "balance_loss_mlp": 1.01702809, "epoch": 0.9338062887031804, "flos": 20919689504640.0, "grad_norm": 2.5239676134657167, "language_loss": 0.85672665, "learning_rate": 4.5713794833662554e-08, "loss": 0.87851715, "num_input_tokens_seen": 167882095, "step": 7766, "time_per_iteration": 2.671340227127075 }, { "auxiliary_loss_clip": 0.01168288, "auxiliary_loss_mlp": 0.01025124, "balance_loss_clip": 1.04766774, "balance_loss_mlp": 1.01760769, "epoch": 0.9339265315938196, "flos": 23221635482880.0, "grad_norm": 1.7552161862654947, "language_loss": 0.63456184, "learning_rate": 4.5548347260270236e-08, "loss": 0.65649593, "num_input_tokens_seen": 167901385, "step": 7767, "time_per_iteration": 2.683948278427124 }, { "auxiliary_loss_clip": 0.0115154, "auxiliary_loss_mlp": 0.01027554, "balance_loss_clip": 1.0460906, "balance_loss_mlp": 1.02064037, "epoch": 0.9340467744844586, "flos": 22820261932800.0, "grad_norm": 1.6336298531175515, "language_loss": 0.69493943, "learning_rate": 4.538319617564012e-08, "loss": 0.71673036, "num_input_tokens_seen": 167920405, "step": 7768, "time_per_iteration": 2.6875803470611572 }, { "auxiliary_loss_clip": 0.01159997, "auxiliary_loss_mlp": 0.01024262, "balance_loss_clip": 1.04701424, "balance_loss_mlp": 1.01681697, "epoch": 0.9341670173750977, "flos": 23660428026240.0, "grad_norm": 1.9735514261724751, "language_loss": 0.74472678, "learning_rate": 4.521834160482485e-08, "loss": 0.76656938, "num_input_tokens_seen": 167939145, "step": 7769, "time_per_iteration": 3.6929121017456055 }, { "auxiliary_loss_clip": 0.01164193, "auxiliary_loss_mlp": 0.01023419, "balance_loss_clip": 1.045434, "balance_loss_mlp": 1.01620412, "epoch": 0.9342872602657368, "flos": 24824256595200.0, "grad_norm": 1.797788834351311, "language_loss": 0.81999433, "learning_rate": 4.5053783572832846e-08, "loss": 0.84187043, "num_input_tokens_seen": 167959325, "step": 7770, "time_per_iteration": 2.8113911151885986 }, { "auxiliary_loss_clip": 0.01162832, "auxiliary_loss_mlp": 0.01026036, "balance_loss_clip": 1.04697692, "balance_loss_mlp": 1.01866555, "epoch": 0.9344075031563759, "flos": 25771831332480.0, "grad_norm": 1.669578127745169, "language_loss": 0.76600844, "learning_rate": 4.488952210462771e-08, "loss": 0.78789711, "num_input_tokens_seen": 167979530, "step": 7771, "time_per_iteration": 3.7261829376220703 }, { "auxiliary_loss_clip": 0.01165733, "auxiliary_loss_mlp": 0.01023037, "balance_loss_clip": 1.04739952, "balance_loss_mlp": 1.01615512, "epoch": 0.9345277460470149, "flos": 25551303782400.0, "grad_norm": 2.1769699023669773, "language_loss": 0.85888076, "learning_rate": 4.4725557225127495e-08, "loss": 0.88076842, "num_input_tokens_seen": 167997870, "step": 7772, "time_per_iteration": 2.8287928104400635 }, { "auxiliary_loss_clip": 0.01164003, "auxiliary_loss_mlp": 0.01021091, "balance_loss_clip": 1.04805827, "balance_loss_mlp": 1.01452279, "epoch": 0.9346479889376541, "flos": 34313112432000.0, "grad_norm": 1.6077212768650946, "language_loss": 0.79413837, "learning_rate": 4.456188895920565e-08, "loss": 0.81598926, "num_input_tokens_seen": 168019625, "step": 7773, "time_per_iteration": 3.7609481811523438 }, { "auxiliary_loss_clip": 0.01166971, "auxiliary_loss_mlp": 0.01023645, "balance_loss_clip": 1.04764998, "balance_loss_mlp": 1.01625371, "epoch": 0.9347682318282932, "flos": 19093739581440.0, "grad_norm": 2.134255088358987, "language_loss": 0.85984737, "learning_rate": 4.439851733169031e-08, "loss": 0.88175344, "num_input_tokens_seen": 168037415, "step": 7774, "time_per_iteration": 2.6776559352874756 }, { "auxiliary_loss_clip": 0.01156715, "auxiliary_loss_mlp": 0.01023881, "balance_loss_clip": 1.04717004, "balance_loss_mlp": 1.01615059, "epoch": 0.9348884747189322, "flos": 26249587153920.0, "grad_norm": 2.2005639084966764, "language_loss": 0.69589865, "learning_rate": 4.4235442367365204e-08, "loss": 0.71770459, "num_input_tokens_seen": 168057725, "step": 7775, "time_per_iteration": 2.7392866611480713 }, { "auxiliary_loss_clip": 0.0115759, "auxiliary_loss_mlp": 0.01027731, "balance_loss_clip": 1.04633069, "balance_loss_mlp": 1.02025628, "epoch": 0.9350087176095714, "flos": 18333080242560.0, "grad_norm": 1.9632579740497151, "language_loss": 0.79202634, "learning_rate": 4.4072664090968545e-08, "loss": 0.81387955, "num_input_tokens_seen": 168076110, "step": 7776, "time_per_iteration": 2.6341865062713623 }, { "auxiliary_loss_clip": 0.01161883, "auxiliary_loss_mlp": 0.01028313, "balance_loss_clip": 1.04746795, "balance_loss_mlp": 1.02126431, "epoch": 0.9351289605002104, "flos": 19318253541120.0, "grad_norm": 1.7618593643663019, "language_loss": 0.84466803, "learning_rate": 4.391018252719347e-08, "loss": 0.86657, "num_input_tokens_seen": 168095905, "step": 7777, "time_per_iteration": 2.816884756088257 }, { "auxiliary_loss_clip": 0.01163487, "auxiliary_loss_mlp": 0.01027681, "balance_loss_clip": 1.04827762, "balance_loss_mlp": 1.02006102, "epoch": 0.9352492033908495, "flos": 18799990156800.0, "grad_norm": 1.7507534772009528, "language_loss": 0.68975806, "learning_rate": 4.374799770068849e-08, "loss": 0.71166974, "num_input_tokens_seen": 168112580, "step": 7778, "time_per_iteration": 2.729377269744873 }, { "auxiliary_loss_clip": 0.01158955, "auxiliary_loss_mlp": 0.01024202, "balance_loss_clip": 1.04583037, "balance_loss_mlp": 1.01656067, "epoch": 0.9353694462814887, "flos": 29530134241920.0, "grad_norm": 2.0784502666625335, "language_loss": 0.74882877, "learning_rate": 4.358610963605658e-08, "loss": 0.77066034, "num_input_tokens_seen": 168133030, "step": 7779, "time_per_iteration": 3.6487514972686768 }, { "auxiliary_loss_clip": 0.01171325, "auxiliary_loss_mlp": 0.01027769, "balance_loss_clip": 1.04950321, "balance_loss_mlp": 1.01988292, "epoch": 0.9354896891721277, "flos": 30665450390400.0, "grad_norm": 2.040333210049757, "language_loss": 0.68631107, "learning_rate": 4.342451835785677e-08, "loss": 0.70830196, "num_input_tokens_seen": 168153940, "step": 7780, "time_per_iteration": 2.7703166007995605 }, { "auxiliary_loss_clip": 0.01156284, "auxiliary_loss_mlp": 0.0102306, "balance_loss_clip": 1.04565835, "balance_loss_mlp": 1.01619959, "epoch": 0.9356099320627668, "flos": 19463907191040.0, "grad_norm": 1.8698089058425815, "language_loss": 0.74711108, "learning_rate": 4.3263223890601665e-08, "loss": 0.76890445, "num_input_tokens_seen": 168172650, "step": 7781, "time_per_iteration": 2.863478422164917 }, { "auxiliary_loss_clip": 0.0115951, "auxiliary_loss_mlp": 0.01058252, "balance_loss_clip": 1.04795396, "balance_loss_mlp": 1.0230484, "epoch": 0.9357301749534058, "flos": 19098156954240.0, "grad_norm": 3.1154329314593805, "language_loss": 0.79581273, "learning_rate": 4.31022262587597e-08, "loss": 0.81799042, "num_input_tokens_seen": 168191325, "step": 7782, "time_per_iteration": 2.7800633907318115 }, { "auxiliary_loss_clip": 0.01164391, "auxiliary_loss_mlp": 0.01026056, "balance_loss_clip": 1.04806852, "balance_loss_mlp": 1.01807475, "epoch": 0.935850417844045, "flos": 23550361776000.0, "grad_norm": 1.614638484537395, "language_loss": 0.66005731, "learning_rate": 4.2941525486754225e-08, "loss": 0.68196177, "num_input_tokens_seen": 168211645, "step": 7783, "time_per_iteration": 2.685415029525757 }, { "auxiliary_loss_clip": 0.01149771, "auxiliary_loss_mlp": 0.01023318, "balance_loss_clip": 1.04645157, "balance_loss_mlp": 1.01639771, "epoch": 0.935970660734684, "flos": 18588333265920.0, "grad_norm": 2.0320498354198993, "language_loss": 0.79464006, "learning_rate": 4.278112159896286e-08, "loss": 0.81637096, "num_input_tokens_seen": 168229485, "step": 7784, "time_per_iteration": 2.7561709880828857 }, { "auxiliary_loss_clip": 0.0115216, "auxiliary_loss_mlp": 0.01023735, "balance_loss_clip": 1.04394722, "balance_loss_mlp": 1.01673126, "epoch": 0.9360909036253231, "flos": 20631255292800.0, "grad_norm": 1.8422916014580228, "language_loss": 0.67526686, "learning_rate": 4.2621014619719896e-08, "loss": 0.69702578, "num_input_tokens_seen": 168247250, "step": 7785, "time_per_iteration": 2.6528704166412354 }, { "auxiliary_loss_clip": 0.01056959, "auxiliary_loss_mlp": 0.01000636, "balance_loss_clip": 1.00807405, "balance_loss_mlp": 0.99956894, "epoch": 0.9362111465159623, "flos": 61791421052160.0, "grad_norm": 0.7231469257301363, "language_loss": 0.58572024, "learning_rate": 4.246120457331215e-08, "loss": 0.60629618, "num_input_tokens_seen": 168309425, "step": 7786, "time_per_iteration": 3.263519048690796 }, { "auxiliary_loss_clip": 0.01156434, "auxiliary_loss_mlp": 0.01023244, "balance_loss_clip": 1.0501591, "balance_loss_mlp": 1.01589775, "epoch": 0.9363313894066013, "flos": 24170395368960.0, "grad_norm": 2.142652687529099, "language_loss": 0.72221518, "learning_rate": 4.2301691483983325e-08, "loss": 0.74401188, "num_input_tokens_seen": 168329545, "step": 7787, "time_per_iteration": 2.7070064544677734 }, { "auxiliary_loss_clip": 0.01164931, "auxiliary_loss_mlp": 0.01025986, "balance_loss_clip": 1.04615951, "balance_loss_mlp": 1.01889062, "epoch": 0.9364516322972404, "flos": 20120354196480.0, "grad_norm": 1.78631121171496, "language_loss": 0.75822663, "learning_rate": 4.214247537593163e-08, "loss": 0.78013575, "num_input_tokens_seen": 168348795, "step": 7788, "time_per_iteration": 2.6872293949127197 }, { "auxiliary_loss_clip": 0.01162051, "auxiliary_loss_mlp": 0.01028236, "balance_loss_clip": 1.04786813, "balance_loss_mlp": 1.02104735, "epoch": 0.9365718751878795, "flos": 20703758895360.0, "grad_norm": 1.7432647336960396, "language_loss": 0.80573004, "learning_rate": 4.1983556273309293e-08, "loss": 0.8276329, "num_input_tokens_seen": 168367545, "step": 7789, "time_per_iteration": 2.7468976974487305 }, { "auxiliary_loss_clip": 0.01169093, "auxiliary_loss_mlp": 0.01025202, "balance_loss_clip": 1.04770827, "balance_loss_mlp": 1.01738203, "epoch": 0.9366921180785186, "flos": 18655270260480.0, "grad_norm": 3.220063855813664, "language_loss": 0.69269401, "learning_rate": 4.182493420022526e-08, "loss": 0.71463692, "num_input_tokens_seen": 168383215, "step": 7790, "time_per_iteration": 2.5697295665740967 }, { "auxiliary_loss_clip": 0.01154085, "auxiliary_loss_mlp": 0.01021975, "balance_loss_clip": 1.0435605, "balance_loss_mlp": 1.01492393, "epoch": 0.9368123609691577, "flos": 25774955815680.0, "grad_norm": 1.8249478770576641, "language_loss": 0.78297436, "learning_rate": 4.166660918074139e-08, "loss": 0.80473495, "num_input_tokens_seen": 168403120, "step": 7791, "time_per_iteration": 2.6817843914031982 }, { "auxiliary_loss_clip": 0.01152851, "auxiliary_loss_mlp": 0.01023482, "balance_loss_clip": 1.04610968, "balance_loss_mlp": 1.0165534, "epoch": 0.9369326038597968, "flos": 25553386771200.0, "grad_norm": 1.3993861498229758, "language_loss": 0.73578918, "learning_rate": 4.15085812388758e-08, "loss": 0.75755244, "num_input_tokens_seen": 168425340, "step": 7792, "time_per_iteration": 2.704258680343628 }, { "auxiliary_loss_clip": 0.0115946, "auxiliary_loss_mlp": 0.01024311, "balance_loss_clip": 1.04713297, "balance_loss_mlp": 1.01721203, "epoch": 0.9370528467504359, "flos": 23220019370880.0, "grad_norm": 1.7577538701703193, "language_loss": 0.7866261, "learning_rate": 4.135085039860153e-08, "loss": 0.80846381, "num_input_tokens_seen": 168444740, "step": 7793, "time_per_iteration": 2.5628650188446045 }, { "auxiliary_loss_clip": 0.01157445, "auxiliary_loss_mlp": 0.01024757, "balance_loss_clip": 1.04804039, "balance_loss_mlp": 1.01731849, "epoch": 0.9371730896410749, "flos": 24967468120320.0, "grad_norm": 2.289475968187147, "language_loss": 0.78719532, "learning_rate": 4.1193416683845906e-08, "loss": 0.80901736, "num_input_tokens_seen": 168463670, "step": 7794, "time_per_iteration": 2.600837469100952 }, { "auxiliary_loss_clip": 0.01157572, "auxiliary_loss_mlp": 0.01024871, "balance_loss_clip": 1.0461067, "balance_loss_mlp": 1.01842189, "epoch": 0.9372933325317141, "flos": 15553091134080.0, "grad_norm": 2.352592781346358, "language_loss": 0.83541983, "learning_rate": 4.103628011849136e-08, "loss": 0.85724431, "num_input_tokens_seen": 168479030, "step": 7795, "time_per_iteration": 3.457653522491455 }, { "auxiliary_loss_clip": 0.01161239, "auxiliary_loss_mlp": 0.01028715, "balance_loss_clip": 1.0469017, "balance_loss_mlp": 1.02134776, "epoch": 0.9374135754223532, "flos": 21871861182720.0, "grad_norm": 2.117097473575787, "language_loss": 0.76087809, "learning_rate": 4.0879440726375506e-08, "loss": 0.78277761, "num_input_tokens_seen": 168496815, "step": 7796, "time_per_iteration": 3.428220272064209 }, { "auxiliary_loss_clip": 0.01158779, "auxiliary_loss_mlp": 0.01027755, "balance_loss_clip": 1.04553056, "balance_loss_mlp": 1.01963425, "epoch": 0.9375338183129922, "flos": 22631048064000.0, "grad_norm": 2.4039888182150264, "language_loss": 0.5653367, "learning_rate": 4.0722898531291074e-08, "loss": 0.58720207, "num_input_tokens_seen": 168514055, "step": 7797, "time_per_iteration": 2.7835419178009033 }, { "auxiliary_loss_clip": 0.01165814, "auxiliary_loss_mlp": 0.01027961, "balance_loss_clip": 1.04713285, "balance_loss_mlp": 1.02082002, "epoch": 0.9376540612036314, "flos": 26104292640000.0, "grad_norm": 1.665061131855299, "language_loss": 0.76520985, "learning_rate": 4.0566653556985295e-08, "loss": 0.78714758, "num_input_tokens_seen": 168534600, "step": 7798, "time_per_iteration": 2.7897229194641113 }, { "auxiliary_loss_clip": 0.0115393, "auxiliary_loss_mlp": 0.01018243, "balance_loss_clip": 1.05029702, "balance_loss_mlp": 1.01053572, "epoch": 0.9377743040942704, "flos": 19717580016000.0, "grad_norm": 2.3291398652043553, "language_loss": 0.81711435, "learning_rate": 4.0410705827159886e-08, "loss": 0.83883607, "num_input_tokens_seen": 168551895, "step": 7799, "time_per_iteration": 4.023524045944214 }, { "auxiliary_loss_clip": 0.0115839, "auxiliary_loss_mlp": 0.01024742, "balance_loss_clip": 1.04733336, "balance_loss_mlp": 1.01759195, "epoch": 0.9378945469849095, "flos": 15267530010240.0, "grad_norm": 2.108870336008134, "language_loss": 0.71450603, "learning_rate": 4.0255055365472356e-08, "loss": 0.73633736, "num_input_tokens_seen": 168569990, "step": 7800, "time_per_iteration": 2.7483086585998535 }, { "auxiliary_loss_clip": 0.01151324, "auxiliary_loss_mlp": 0.01026492, "balance_loss_clip": 1.04522669, "balance_loss_mlp": 1.01937246, "epoch": 0.9380147898755486, "flos": 20591394174720.0, "grad_norm": 2.649079491851497, "language_loss": 0.74935782, "learning_rate": 4.009970219553471e-08, "loss": 0.77113599, "num_input_tokens_seen": 168586940, "step": 7801, "time_per_iteration": 2.7407798767089844 }, { "auxiliary_loss_clip": 0.01166777, "auxiliary_loss_mlp": 0.01024413, "balance_loss_clip": 1.0478487, "balance_loss_mlp": 1.01655126, "epoch": 0.9381350327661877, "flos": 26281116316800.0, "grad_norm": 3.554252347310015, "language_loss": 0.76206189, "learning_rate": 3.99446463409141e-08, "loss": 0.78397381, "num_input_tokens_seen": 168604795, "step": 7802, "time_per_iteration": 2.6194770336151123 }, { "auxiliary_loss_clip": 0.01167696, "auxiliary_loss_mlp": 0.01029492, "balance_loss_clip": 1.04587698, "balance_loss_mlp": 1.02217889, "epoch": 0.9382552756568268, "flos": 23586344225280.0, "grad_norm": 2.0976616369978474, "language_loss": 0.6905055, "learning_rate": 3.978988782513215e-08, "loss": 0.71247733, "num_input_tokens_seen": 168622290, "step": 7803, "time_per_iteration": 2.6798832416534424 }, { "auxiliary_loss_clip": 0.01165901, "auxiliary_loss_mlp": 0.01022893, "balance_loss_clip": 1.04661953, "balance_loss_mlp": 1.01512074, "epoch": 0.9383755185474659, "flos": 28438809275520.0, "grad_norm": 2.3260019558197698, "language_loss": 0.76556671, "learning_rate": 3.963542667166586e-08, "loss": 0.78745461, "num_input_tokens_seen": 168642395, "step": 7804, "time_per_iteration": 2.674884080886841 }, { "auxiliary_loss_clip": 0.01155642, "auxiliary_loss_mlp": 0.01023984, "balance_loss_clip": 1.04698229, "balance_loss_mlp": 1.01709712, "epoch": 0.938495761438105, "flos": 20449583280000.0, "grad_norm": 1.7129673798375395, "language_loss": 0.68208593, "learning_rate": 3.9481262903946486e-08, "loss": 0.70388222, "num_input_tokens_seen": 168661840, "step": 7805, "time_per_iteration": 2.7632358074188232 }, { "auxiliary_loss_clip": 0.01062102, "auxiliary_loss_mlp": 0.01001473, "balance_loss_clip": 1.00933957, "balance_loss_mlp": 1.00041223, "epoch": 0.938616004328744, "flos": 69302711658240.0, "grad_norm": 0.7615778654896678, "language_loss": 0.54484099, "learning_rate": 3.932739654536066e-08, "loss": 0.56547672, "num_input_tokens_seen": 168724540, "step": 7806, "time_per_iteration": 4.177046060562134 }, { "auxiliary_loss_clip": 0.01160681, "auxiliary_loss_mlp": 0.01020081, "balance_loss_clip": 1.046453, "balance_loss_mlp": 1.01268995, "epoch": 0.9387362472193832, "flos": 18911636605440.0, "grad_norm": 2.2351828231930733, "language_loss": 0.7411319, "learning_rate": 3.917382761925014e-08, "loss": 0.76293957, "num_input_tokens_seen": 168740375, "step": 7807, "time_per_iteration": 2.6624157428741455 }, { "auxiliary_loss_clip": 0.01158, "auxiliary_loss_mlp": 0.01021984, "balance_loss_clip": 1.04517341, "balance_loss_mlp": 1.015028, "epoch": 0.9388564901100223, "flos": 26501967089280.0, "grad_norm": 2.6083467935463247, "language_loss": 0.79452258, "learning_rate": 3.9020556148910754e-08, "loss": 0.81632245, "num_input_tokens_seen": 168759730, "step": 7808, "time_per_iteration": 2.829169511795044 }, { "auxiliary_loss_clip": 0.01061499, "auxiliary_loss_mlp": 0.01005129, "balance_loss_clip": 1.00708675, "balance_loss_mlp": 1.00412202, "epoch": 0.9389767330006613, "flos": 58941083157120.0, "grad_norm": 0.706678436144045, "language_loss": 0.56678486, "learning_rate": 3.8867582157593895e-08, "loss": 0.5874511, "num_input_tokens_seen": 168813935, "step": 7809, "time_per_iteration": 3.208364963531494 }, { "auxiliary_loss_clip": 0.01162492, "auxiliary_loss_mlp": 0.0102391, "balance_loss_clip": 1.04880047, "balance_loss_mlp": 1.01681983, "epoch": 0.9390969758913005, "flos": 31102554994560.0, "grad_norm": 1.6235240175123886, "language_loss": 0.76454043, "learning_rate": 3.871490566850544e-08, "loss": 0.78640437, "num_input_tokens_seen": 168838145, "step": 7810, "time_per_iteration": 2.868297576904297 }, { "auxiliary_loss_clip": 0.01156153, "auxiliary_loss_mlp": 0.01025849, "balance_loss_clip": 1.04751146, "balance_loss_mlp": 1.01822007, "epoch": 0.9392172187819395, "flos": 22419391173120.0, "grad_norm": 1.6923679630987045, "language_loss": 0.70770353, "learning_rate": 3.856252670480642e-08, "loss": 0.72952354, "num_input_tokens_seen": 168856805, "step": 7811, "time_per_iteration": 2.867126941680908 }, { "auxiliary_loss_clip": 0.01160389, "auxiliary_loss_mlp": 0.01020114, "balance_loss_clip": 1.04785955, "balance_loss_mlp": 1.01264548, "epoch": 0.9393374616725786, "flos": 19719483436800.0, "grad_norm": 2.199082276442431, "language_loss": 0.80998725, "learning_rate": 3.841044528961279e-08, "loss": 0.83179235, "num_input_tokens_seen": 168874600, "step": 7812, "time_per_iteration": 2.8259358406066895 }, { "auxiliary_loss_clip": 0.01167184, "auxiliary_loss_mlp": 0.01023031, "balance_loss_clip": 1.04591691, "balance_loss_mlp": 1.01519871, "epoch": 0.9394577045632178, "flos": 24170215800960.0, "grad_norm": 1.6616856411864513, "language_loss": 0.7913568, "learning_rate": 3.825866144599477e-08, "loss": 0.81325895, "num_input_tokens_seen": 168893655, "step": 7813, "time_per_iteration": 2.7223846912384033 }, { "auxiliary_loss_clip": 0.01159287, "auxiliary_loss_mlp": 0.01035719, "balance_loss_clip": 1.04521966, "balance_loss_mlp": 1.02823889, "epoch": 0.9395779474538568, "flos": 19023929498880.0, "grad_norm": 2.104651249689713, "language_loss": 0.75400794, "learning_rate": 3.8107175196978145e-08, "loss": 0.77595794, "num_input_tokens_seen": 168909960, "step": 7814, "time_per_iteration": 2.780233383178711 }, { "auxiliary_loss_clip": 0.01153354, "auxiliary_loss_mlp": 0.01027183, "balance_loss_clip": 1.04638267, "balance_loss_mlp": 1.01979184, "epoch": 0.9396981903444959, "flos": 14319129260160.0, "grad_norm": 1.9625625740322035, "language_loss": 0.76547277, "learning_rate": 3.7955986565542996e-08, "loss": 0.78727812, "num_input_tokens_seen": 168928040, "step": 7815, "time_per_iteration": 2.7024216651916504 }, { "auxiliary_loss_clip": 0.01155106, "auxiliary_loss_mlp": 0.01024204, "balance_loss_clip": 1.04614282, "balance_loss_mlp": 1.016837, "epoch": 0.9398184332351349, "flos": 34787564202240.0, "grad_norm": 2.1993061797183016, "language_loss": 0.68002987, "learning_rate": 3.780509557462497e-08, "loss": 0.701823, "num_input_tokens_seen": 168948240, "step": 7816, "time_per_iteration": 2.8519105911254883 }, { "auxiliary_loss_clip": 0.01159776, "auxiliary_loss_mlp": 0.01029639, "balance_loss_clip": 1.04601216, "balance_loss_mlp": 1.02210212, "epoch": 0.9399386761257741, "flos": 25372253462400.0, "grad_norm": 1.522583163388054, "language_loss": 0.75569153, "learning_rate": 3.765450224711375e-08, "loss": 0.77758563, "num_input_tokens_seen": 168968745, "step": 7817, "time_per_iteration": 2.7750933170318604 }, { "auxiliary_loss_clip": 0.01152513, "auxiliary_loss_mlp": 0.01023519, "balance_loss_clip": 1.04584146, "balance_loss_mlp": 1.01671791, "epoch": 0.9400589190164131, "flos": 27304965584640.0, "grad_norm": 2.162972248538326, "language_loss": 0.79962397, "learning_rate": 3.750420660585396e-08, "loss": 0.82138431, "num_input_tokens_seen": 168990685, "step": 7818, "time_per_iteration": 2.7825074195861816 }, { "auxiliary_loss_clip": 0.01163058, "auxiliary_loss_mlp": 0.01027544, "balance_loss_clip": 1.04671693, "balance_loss_mlp": 1.02023923, "epoch": 0.9401791619070522, "flos": 23399859790080.0, "grad_norm": 2.246923552141043, "language_loss": 0.79360986, "learning_rate": 3.735420867364603e-08, "loss": 0.81551588, "num_input_tokens_seen": 169011665, "step": 7819, "time_per_iteration": 2.696143627166748 }, { "auxiliary_loss_clip": 0.01143245, "auxiliary_loss_mlp": 0.01021071, "balance_loss_clip": 1.04426754, "balance_loss_mlp": 1.014467, "epoch": 0.9402994047976914, "flos": 35881403120640.0, "grad_norm": 1.5696685277550986, "language_loss": 0.61922443, "learning_rate": 3.7204508473244186e-08, "loss": 0.64086759, "num_input_tokens_seen": 169035290, "step": 7820, "time_per_iteration": 3.022822141647339 }, { "auxiliary_loss_clip": 0.01141053, "auxiliary_loss_mlp": 0.01023016, "balance_loss_clip": 1.04488909, "balance_loss_mlp": 1.0159173, "epoch": 0.9404196476883304, "flos": 22236821320320.0, "grad_norm": 1.5616806538347352, "language_loss": 0.69313771, "learning_rate": 3.7055106027357395e-08, "loss": 0.71477836, "num_input_tokens_seen": 169055155, "step": 7821, "time_per_iteration": 3.68176531791687 }, { "auxiliary_loss_clip": 0.01158228, "auxiliary_loss_mlp": 0.010245, "balance_loss_clip": 1.04607081, "balance_loss_mlp": 1.01719809, "epoch": 0.9405398905789695, "flos": 18915802583040.0, "grad_norm": 2.2066263178568586, "language_loss": 0.71813655, "learning_rate": 3.690600135865063e-08, "loss": 0.73996383, "num_input_tokens_seen": 169072080, "step": 7822, "time_per_iteration": 2.6813535690307617 }, { "auxiliary_loss_clip": 0.01058844, "auxiliary_loss_mlp": 0.01002259, "balance_loss_clip": 1.00759149, "balance_loss_mlp": 1.00113869, "epoch": 0.9406601334696086, "flos": 70274130048000.0, "grad_norm": 0.7823525561767003, "language_loss": 0.58055031, "learning_rate": 3.675719448974246e-08, "loss": 0.60116136, "num_input_tokens_seen": 169137170, "step": 7823, "time_per_iteration": 4.279159784317017 }, { "auxiliary_loss_clip": 0.01147306, "auxiliary_loss_mlp": 0.01055526, "balance_loss_clip": 1.04537511, "balance_loss_mlp": 1.02009571, "epoch": 0.9407803763602477, "flos": 22165071903360.0, "grad_norm": 2.302256965236765, "language_loss": 0.6016022, "learning_rate": 3.6608685443207054e-08, "loss": 0.62363052, "num_input_tokens_seen": 169156320, "step": 7824, "time_per_iteration": 2.9171338081359863 }, { "auxiliary_loss_clip": 0.01157378, "auxiliary_loss_mlp": 0.0102724, "balance_loss_clip": 1.04697537, "balance_loss_mlp": 1.01983666, "epoch": 0.9409006192508867, "flos": 18879496911360.0, "grad_norm": 2.6611036716857455, "language_loss": 0.66794491, "learning_rate": 3.646047424157306e-08, "loss": 0.68979108, "num_input_tokens_seen": 169173295, "step": 7825, "time_per_iteration": 3.7215960025787354 }, { "auxiliary_loss_clip": 0.01160146, "auxiliary_loss_mlp": 0.01029345, "balance_loss_clip": 1.04750264, "balance_loss_mlp": 1.02133465, "epoch": 0.9410208621415259, "flos": 23368258800000.0, "grad_norm": 2.2119551540800138, "language_loss": 0.6836164, "learning_rate": 3.631256090732382e-08, "loss": 0.70551133, "num_input_tokens_seen": 169193755, "step": 7826, "time_per_iteration": 2.7984604835510254 }, { "auxiliary_loss_clip": 0.01155951, "auxiliary_loss_mlp": 0.01026066, "balance_loss_clip": 1.04637909, "balance_loss_mlp": 1.0191009, "epoch": 0.941141105032165, "flos": 22742227635840.0, "grad_norm": 2.0613867366963055, "language_loss": 0.82695615, "learning_rate": 3.6164945462897833e-08, "loss": 0.84877634, "num_input_tokens_seen": 169213045, "step": 7827, "time_per_iteration": 2.7311625480651855 }, { "auxiliary_loss_clip": 0.01162002, "auxiliary_loss_mlp": 0.01048738, "balance_loss_clip": 1.04765463, "balance_loss_mlp": 1.01425755, "epoch": 0.941261347922804, "flos": 20704908130560.0, "grad_norm": 1.742369826204811, "language_loss": 0.75677383, "learning_rate": 3.6017627930687856e-08, "loss": 0.77888125, "num_input_tokens_seen": 169232870, "step": 7828, "time_per_iteration": 2.79691743850708 }, { "auxiliary_loss_clip": 0.01149081, "auxiliary_loss_mlp": 0.01019833, "balance_loss_clip": 1.04440045, "balance_loss_mlp": 1.01290071, "epoch": 0.9413815908134432, "flos": 19421998997760.0, "grad_norm": 3.074212403563875, "language_loss": 0.77593541, "learning_rate": 3.587060833304267e-08, "loss": 0.79762459, "num_input_tokens_seen": 169251060, "step": 7829, "time_per_iteration": 2.8403141498565674 }, { "auxiliary_loss_clip": 0.01165471, "auxiliary_loss_mlp": 0.01023521, "balance_loss_clip": 1.04734659, "balance_loss_mlp": 1.01646674, "epoch": 0.9415018337040822, "flos": 17493452853120.0, "grad_norm": 2.02025133910099, "language_loss": 0.64617711, "learning_rate": 3.5723886692264225e-08, "loss": 0.66806698, "num_input_tokens_seen": 169268600, "step": 7830, "time_per_iteration": 2.7217612266540527 }, { "auxiliary_loss_clip": 0.01156911, "auxiliary_loss_mlp": 0.01023841, "balance_loss_clip": 1.04404211, "balance_loss_mlp": 1.01676655, "epoch": 0.9416220765947213, "flos": 31831613343360.0, "grad_norm": 5.0607212485889015, "language_loss": 0.61962831, "learning_rate": 3.557746303061071e-08, "loss": 0.64143586, "num_input_tokens_seen": 169290355, "step": 7831, "time_per_iteration": 2.7668256759643555 }, { "auxiliary_loss_clip": 0.01157225, "auxiliary_loss_mlp": 0.01033353, "balance_loss_clip": 1.04627824, "balance_loss_mlp": 1.02637374, "epoch": 0.9417423194853605, "flos": 23511973115520.0, "grad_norm": 2.5197521459450156, "language_loss": 0.72350895, "learning_rate": 3.543133737029391e-08, "loss": 0.74541473, "num_input_tokens_seen": 169310865, "step": 7832, "time_per_iteration": 3.5540597438812256 }, { "auxiliary_loss_clip": 0.01165727, "auxiliary_loss_mlp": 0.01026497, "balance_loss_clip": 1.04765511, "balance_loss_mlp": 1.01871896, "epoch": 0.9418625623759995, "flos": 23915106432000.0, "grad_norm": 1.8480045936813376, "language_loss": 0.69159889, "learning_rate": 3.5285509733481214e-08, "loss": 0.71352112, "num_input_tokens_seen": 169330590, "step": 7833, "time_per_iteration": 2.7659084796905518 }, { "auxiliary_loss_clip": 0.01161358, "auxiliary_loss_mlp": 0.01024131, "balance_loss_clip": 1.04789317, "balance_loss_mlp": 1.01658249, "epoch": 0.9419828052666386, "flos": 18076965292800.0, "grad_norm": 1.9203863066239235, "language_loss": 0.77017546, "learning_rate": 3.513998014229469e-08, "loss": 0.79203033, "num_input_tokens_seen": 169349540, "step": 7834, "time_per_iteration": 2.584845542907715 }, { "auxiliary_loss_clip": 0.01159923, "auxiliary_loss_mlp": 0.01023022, "balance_loss_clip": 1.04653978, "balance_loss_mlp": 1.01569009, "epoch": 0.9421030481572777, "flos": 17712328377600.0, "grad_norm": 6.458285004449003, "language_loss": 0.86460495, "learning_rate": 3.499474861881069e-08, "loss": 0.88643438, "num_input_tokens_seen": 169366765, "step": 7835, "time_per_iteration": 2.7080891132354736 }, { "auxiliary_loss_clip": 0.01151243, "auxiliary_loss_mlp": 0.01025676, "balance_loss_clip": 1.04753053, "balance_loss_mlp": 1.01857734, "epoch": 0.9422232910479168, "flos": 20194114775040.0, "grad_norm": 2.8000576081780792, "language_loss": 0.68326771, "learning_rate": 3.4849815185061136e-08, "loss": 0.70503694, "num_input_tokens_seen": 169386655, "step": 7836, "time_per_iteration": 2.880753517150879 }, { "auxiliary_loss_clip": 0.01161921, "auxiliary_loss_mlp": 0.01020977, "balance_loss_clip": 1.04505026, "balance_loss_mlp": 1.01392269, "epoch": 0.9423435339385559, "flos": 18442571875200.0, "grad_norm": 1.948219271313897, "language_loss": 0.7598244, "learning_rate": 3.470517986303223e-08, "loss": 0.7816534, "num_input_tokens_seen": 169405640, "step": 7837, "time_per_iteration": 2.65305495262146 }, { "auxiliary_loss_clip": 0.01156584, "auxiliary_loss_mlp": 0.01028752, "balance_loss_clip": 1.04952681, "balance_loss_mlp": 1.02155805, "epoch": 0.942463776829195, "flos": 20080636732800.0, "grad_norm": 1.8226603619994368, "language_loss": 0.79020226, "learning_rate": 3.4560842674664856e-08, "loss": 0.81205565, "num_input_tokens_seen": 169424155, "step": 7838, "time_per_iteration": 2.6800925731658936 }, { "auxiliary_loss_clip": 0.01164049, "auxiliary_loss_mlp": 0.01024701, "balance_loss_clip": 1.04539084, "balance_loss_mlp": 1.01721478, "epoch": 0.9425840197198341, "flos": 22636255536000.0, "grad_norm": 2.469134291033238, "language_loss": 0.75393128, "learning_rate": 3.441680364185506e-08, "loss": 0.77581877, "num_input_tokens_seen": 169444025, "step": 7839, "time_per_iteration": 2.7418875694274902 }, { "auxiliary_loss_clip": 0.01161775, "auxiliary_loss_mlp": 0.01026902, "balance_loss_clip": 1.04787886, "balance_loss_mlp": 1.01878405, "epoch": 0.9427042626104731, "flos": 19937892084480.0, "grad_norm": 2.4220426016974543, "language_loss": 0.74760175, "learning_rate": 3.427306278645314e-08, "loss": 0.76948845, "num_input_tokens_seen": 169462480, "step": 7840, "time_per_iteration": 2.7037293910980225 }, { "auxiliary_loss_clip": 0.01152673, "auxiliary_loss_mlp": 0.0102052, "balance_loss_clip": 1.0467236, "balance_loss_mlp": 1.01379657, "epoch": 0.9428245055011123, "flos": 22856998567680.0, "grad_norm": 2.191828278644714, "language_loss": 0.73053402, "learning_rate": 3.4129620130264767e-08, "loss": 0.75226593, "num_input_tokens_seen": 169480840, "step": 7841, "time_per_iteration": 2.7619059085845947 }, { "auxiliary_loss_clip": 0.01163601, "auxiliary_loss_mlp": 0.0105732, "balance_loss_clip": 1.04801464, "balance_loss_mlp": 1.02032042, "epoch": 0.9429447483917514, "flos": 20951757371520.0, "grad_norm": 2.1156205256949487, "language_loss": 0.78223616, "learning_rate": 3.398647569505009e-08, "loss": 0.80444539, "num_input_tokens_seen": 169498265, "step": 7842, "time_per_iteration": 2.7145371437072754 }, { "auxiliary_loss_clip": 0.01159164, "auxiliary_loss_mlp": 0.01027952, "balance_loss_clip": 1.0468359, "balance_loss_mlp": 1.02008438, "epoch": 0.9430649912823904, "flos": 18843658116480.0, "grad_norm": 2.4749628907497994, "language_loss": 0.74466974, "learning_rate": 3.384362950252373e-08, "loss": 0.76654088, "num_input_tokens_seen": 169515235, "step": 7843, "time_per_iteration": 2.676375150680542 }, { "auxiliary_loss_clip": 0.01156885, "auxiliary_loss_mlp": 0.01025777, "balance_loss_clip": 1.04519773, "balance_loss_mlp": 1.01872909, "epoch": 0.9431852341730296, "flos": 32556038837760.0, "grad_norm": 3.0138324858774337, "language_loss": 0.57278264, "learning_rate": 3.3701081574355473e-08, "loss": 0.59460926, "num_input_tokens_seen": 169537195, "step": 7844, "time_per_iteration": 2.7796413898468018 }, { "auxiliary_loss_clip": 0.01060033, "auxiliary_loss_mlp": 0.01002259, "balance_loss_clip": 1.00720406, "balance_loss_mlp": 1.00112677, "epoch": 0.9433054770636686, "flos": 66904490252160.0, "grad_norm": 0.6393971852182586, "language_loss": 0.51642036, "learning_rate": 3.3558831932169796e-08, "loss": 0.53704321, "num_input_tokens_seen": 169605865, "step": 7845, "time_per_iteration": 3.3397128582000732 }, { "auxiliary_loss_clip": 0.0116016, "auxiliary_loss_mlp": 0.01024033, "balance_loss_clip": 1.04568291, "balance_loss_mlp": 1.01660037, "epoch": 0.9434257199543077, "flos": 26140346916480.0, "grad_norm": 1.9406753768131941, "language_loss": 0.88823754, "learning_rate": 3.341688059754588e-08, "loss": 0.91007948, "num_input_tokens_seen": 169621520, "step": 7846, "time_per_iteration": 2.711426258087158 }, { "auxiliary_loss_clip": 0.01165085, "auxiliary_loss_mlp": 0.01052331, "balance_loss_clip": 1.04803061, "balance_loss_mlp": 1.01833439, "epoch": 0.9435459628449467, "flos": 25003486483200.0, "grad_norm": 2.5610629623395393, "language_loss": 0.77573252, "learning_rate": 3.327522759201762e-08, "loss": 0.7979067, "num_input_tokens_seen": 169641390, "step": 7847, "time_per_iteration": 2.8467209339141846 }, { "auxiliary_loss_clip": 0.01155429, "auxiliary_loss_mlp": 0.0102323, "balance_loss_clip": 1.04636908, "balance_loss_mlp": 1.01595497, "epoch": 0.9436662057355859, "flos": 22163240309760.0, "grad_norm": 3.401210349234252, "language_loss": 0.67157221, "learning_rate": 3.313387293707359e-08, "loss": 0.69335878, "num_input_tokens_seen": 169660095, "step": 7848, "time_per_iteration": 4.749023675918579 }, { "auxiliary_loss_clip": 0.01152413, "auxiliary_loss_mlp": 0.01027598, "balance_loss_clip": 1.04661226, "balance_loss_mlp": 1.02017462, "epoch": 0.943786448626225, "flos": 20118522602880.0, "grad_norm": 1.9906886005000544, "language_loss": 0.68568599, "learning_rate": 3.29928166541571e-08, "loss": 0.70748603, "num_input_tokens_seen": 169679050, "step": 7849, "time_per_iteration": 2.7384259700775146 }, { "auxiliary_loss_clip": 0.01151819, "auxiliary_loss_mlp": 0.01024718, "balance_loss_clip": 1.04596698, "balance_loss_mlp": 1.01796508, "epoch": 0.943906691516864, "flos": 22090808534400.0, "grad_norm": 1.9836581111520526, "language_loss": 0.80448979, "learning_rate": 3.2852058764666346e-08, "loss": 0.82625508, "num_input_tokens_seen": 169698150, "step": 7850, "time_per_iteration": 3.690486431121826 }, { "auxiliary_loss_clip": 0.0114834, "auxiliary_loss_mlp": 0.01030669, "balance_loss_clip": 1.05024946, "balance_loss_mlp": 1.02358532, "epoch": 0.9440269344075032, "flos": 35298501212160.0, "grad_norm": 1.7824697423697107, "language_loss": 0.68875462, "learning_rate": 3.2711599289954264e-08, "loss": 0.71054471, "num_input_tokens_seen": 169722185, "step": 7851, "time_per_iteration": 2.8257055282592773 }, { "auxiliary_loss_clip": 0.01149438, "auxiliary_loss_mlp": 0.01028722, "balance_loss_clip": 1.04707897, "balance_loss_mlp": 1.02124751, "epoch": 0.9441471772981422, "flos": 19238136255360.0, "grad_norm": 1.9465679996420795, "language_loss": 0.77777743, "learning_rate": 3.257143825132847e-08, "loss": 0.79955906, "num_input_tokens_seen": 169740355, "step": 7852, "time_per_iteration": 2.7661585807800293 }, { "auxiliary_loss_clip": 0.01158672, "auxiliary_loss_mlp": 0.01021006, "balance_loss_clip": 1.04566407, "balance_loss_mlp": 1.01433039, "epoch": 0.9442674201887813, "flos": 25739799379200.0, "grad_norm": 1.7675792216123098, "language_loss": 0.75982124, "learning_rate": 3.243157567005106e-08, "loss": 0.781618, "num_input_tokens_seen": 169758535, "step": 7853, "time_per_iteration": 2.7128663063049316 }, { "auxiliary_loss_clip": 0.01171598, "auxiliary_loss_mlp": 0.01031384, "balance_loss_clip": 1.05070353, "balance_loss_mlp": 1.02386165, "epoch": 0.9443876630794205, "flos": 15523321737600.0, "grad_norm": 2.0474182793448126, "language_loss": 0.64089942, "learning_rate": 3.2292011567339296e-08, "loss": 0.66292924, "num_input_tokens_seen": 169776340, "step": 7854, "time_per_iteration": 2.6745874881744385 }, { "auxiliary_loss_clip": 0.01164008, "auxiliary_loss_mlp": 0.01052926, "balance_loss_clip": 1.04587197, "balance_loss_mlp": 1.01740634, "epoch": 0.9445079059700595, "flos": 13400821128960.0, "grad_norm": 2.401349674418011, "language_loss": 0.56011093, "learning_rate": 3.21527459643649e-08, "loss": 0.58228022, "num_input_tokens_seen": 169793225, "step": 7855, "time_per_iteration": 2.631566047668457 }, { "auxiliary_loss_clip": 0.0116268, "auxiliary_loss_mlp": 0.01026651, "balance_loss_clip": 1.04564786, "balance_loss_mlp": 1.01926339, "epoch": 0.9446281488606986, "flos": 23659242877440.0, "grad_norm": 2.87336709262581, "language_loss": 0.74284041, "learning_rate": 3.2013778882254536e-08, "loss": 0.76473367, "num_input_tokens_seen": 169812020, "step": 7856, "time_per_iteration": 2.700197219848633 }, { "auxiliary_loss_clip": 0.01154934, "auxiliary_loss_mlp": 0.01027513, "balance_loss_clip": 1.04592252, "balance_loss_mlp": 1.02036929, "epoch": 0.9447483917513377, "flos": 25557337267200.0, "grad_norm": 1.772900121308669, "language_loss": 0.76053202, "learning_rate": 3.1875110342088676e-08, "loss": 0.7823565, "num_input_tokens_seen": 169833470, "step": 7857, "time_per_iteration": 2.7141478061676025 }, { "auxiliary_loss_clip": 0.01150558, "auxiliary_loss_mlp": 0.01024602, "balance_loss_clip": 1.044801, "balance_loss_mlp": 1.01740766, "epoch": 0.9448686346419768, "flos": 24535463247360.0, "grad_norm": 2.027687263604265, "language_loss": 0.65657747, "learning_rate": 3.1736740364904035e-08, "loss": 0.67832905, "num_input_tokens_seen": 169854000, "step": 7858, "time_per_iteration": 3.657183885574341 }, { "auxiliary_loss_clip": 0.01152498, "auxiliary_loss_mlp": 0.01056778, "balance_loss_clip": 1.04781675, "balance_loss_mlp": 1.02117467, "epoch": 0.9449888775326158, "flos": 14721256995840.0, "grad_norm": 2.5181766056308215, "language_loss": 0.76859665, "learning_rate": 3.159866897169094e-08, "loss": 0.79068935, "num_input_tokens_seen": 169872200, "step": 7859, "time_per_iteration": 2.859548330307007 }, { "auxiliary_loss_clip": 0.01160723, "auxiliary_loss_mlp": 0.01022818, "balance_loss_clip": 1.04653621, "balance_loss_mlp": 1.01602018, "epoch": 0.945109120423255, "flos": 15447873219840.0, "grad_norm": 1.847966310943518, "language_loss": 0.75883758, "learning_rate": 3.146089618339487e-08, "loss": 0.78067297, "num_input_tokens_seen": 169889055, "step": 7860, "time_per_iteration": 2.6691200733184814 }, { "auxiliary_loss_clip": 0.01158794, "auxiliary_loss_mlp": 0.0102388, "balance_loss_clip": 1.04794598, "balance_loss_mlp": 1.0163697, "epoch": 0.9452293633138941, "flos": 25448097029760.0, "grad_norm": 1.9087476882264351, "language_loss": 0.68092203, "learning_rate": 3.132342202091554e-08, "loss": 0.70274872, "num_input_tokens_seen": 169909280, "step": 7861, "time_per_iteration": 2.765566110610962 }, { "auxiliary_loss_clip": 0.01166203, "auxiliary_loss_mlp": 0.01023665, "balance_loss_clip": 1.04545927, "balance_loss_mlp": 1.01561236, "epoch": 0.9453496062045331, "flos": 21215342350080.0, "grad_norm": 2.31884664409916, "language_loss": 0.68492496, "learning_rate": 3.1186246505107595e-08, "loss": 0.70682365, "num_input_tokens_seen": 169928420, "step": 7862, "time_per_iteration": 2.785100221633911 }, { "auxiliary_loss_clip": 0.01163927, "auxiliary_loss_mlp": 0.01025036, "balance_loss_clip": 1.04833508, "balance_loss_mlp": 1.01790416, "epoch": 0.9454698490951723, "flos": 20010898477440.0, "grad_norm": 2.4148797836547358, "language_loss": 0.83483899, "learning_rate": 3.104936965678084e-08, "loss": 0.85672855, "num_input_tokens_seen": 169946750, "step": 7863, "time_per_iteration": 2.8021867275238037 }, { "auxiliary_loss_clip": 0.01159818, "auxiliary_loss_mlp": 0.01028037, "balance_loss_clip": 1.04368806, "balance_loss_mlp": 1.02053916, "epoch": 0.9455900919858113, "flos": 21069652786560.0, "grad_norm": 1.9612664948793235, "language_loss": 0.81937766, "learning_rate": 3.091279149669956e-08, "loss": 0.8412562, "num_input_tokens_seen": 169965540, "step": 7864, "time_per_iteration": 2.796009063720703 }, { "auxiliary_loss_clip": 0.0116002, "auxiliary_loss_mlp": 0.01051386, "balance_loss_clip": 1.04488516, "balance_loss_mlp": 1.01654267, "epoch": 0.9457103348764504, "flos": 20740854666240.0, "grad_norm": 2.1016065370766075, "language_loss": 0.73825526, "learning_rate": 3.0776512045581624e-08, "loss": 0.76036936, "num_input_tokens_seen": 169984330, "step": 7865, "time_per_iteration": 2.6877472400665283 }, { "auxiliary_loss_clip": 0.01156203, "auxiliary_loss_mlp": 0.01026772, "balance_loss_clip": 1.04833972, "balance_loss_mlp": 1.01942277, "epoch": 0.9458305777670896, "flos": 21428363957760.0, "grad_norm": 2.1485546192152647, "language_loss": 0.77890962, "learning_rate": 3.0640531324101384e-08, "loss": 0.80073941, "num_input_tokens_seen": 170002095, "step": 7866, "time_per_iteration": 2.743751049041748 }, { "auxiliary_loss_clip": 0.01166858, "auxiliary_loss_mlp": 0.01023544, "balance_loss_clip": 1.05113649, "balance_loss_mlp": 1.01601636, "epoch": 0.9459508206577286, "flos": 20011185786240.0, "grad_norm": 1.6962837607976844, "language_loss": 0.76176035, "learning_rate": 3.0504849352886554e-08, "loss": 0.78366435, "num_input_tokens_seen": 170020240, "step": 7867, "time_per_iteration": 2.6149039268493652 }, { "auxiliary_loss_clip": 0.01162878, "auxiliary_loss_mlp": 0.01023485, "balance_loss_clip": 1.04805088, "balance_loss_mlp": 1.01604068, "epoch": 0.9460710635483677, "flos": 12166428291840.0, "grad_norm": 2.2401985815606094, "language_loss": 0.7191208, "learning_rate": 3.036946615252023e-08, "loss": 0.74098444, "num_input_tokens_seen": 170035770, "step": 7868, "time_per_iteration": 2.772418737411499 }, { "auxiliary_loss_clip": 0.01165446, "auxiliary_loss_mlp": 0.01028206, "balance_loss_clip": 1.04752994, "balance_loss_mlp": 1.02098203, "epoch": 0.9461913064390068, "flos": 34276196229120.0, "grad_norm": 2.1147357649462357, "language_loss": 0.66665173, "learning_rate": 3.0234381743539984e-08, "loss": 0.6885882, "num_input_tokens_seen": 170053385, "step": 7869, "time_per_iteration": 2.820188045501709 }, { "auxiliary_loss_clip": 0.0116685, "auxiliary_loss_mlp": 0.01027249, "balance_loss_clip": 1.049088, "balance_loss_mlp": 1.01942277, "epoch": 0.9463115493296459, "flos": 19463763536640.0, "grad_norm": 1.9771527140678642, "language_loss": 0.80029052, "learning_rate": 3.0099596146437863e-08, "loss": 0.82223153, "num_input_tokens_seen": 170070490, "step": 7870, "time_per_iteration": 2.769691228866577 }, { "auxiliary_loss_clip": 0.01058801, "auxiliary_loss_mlp": 0.01001891, "balance_loss_clip": 1.00642252, "balance_loss_mlp": 1.00093734, "epoch": 0.946431792220285, "flos": 70570824387840.0, "grad_norm": 0.7747254140092722, "language_loss": 0.6003443, "learning_rate": 2.996510938166086e-08, "loss": 0.62095118, "num_input_tokens_seen": 170133465, "step": 7871, "time_per_iteration": 3.3456664085388184 }, { "auxiliary_loss_clip": 0.0116107, "auxiliary_loss_mlp": 0.01021921, "balance_loss_clip": 1.04678464, "balance_loss_mlp": 1.01523089, "epoch": 0.9465520351109241, "flos": 18947906363520.0, "grad_norm": 2.1861898996978746, "language_loss": 0.73544455, "learning_rate": 2.983092146960997e-08, "loss": 0.75727445, "num_input_tokens_seen": 170150810, "step": 7872, "time_per_iteration": 2.6366539001464844 }, { "auxiliary_loss_clip": 0.01163664, "auxiliary_loss_mlp": 0.01024235, "balance_loss_clip": 1.04782486, "balance_loss_mlp": 1.0169127, "epoch": 0.9466722780015632, "flos": 19135647774720.0, "grad_norm": 1.9874288609007662, "language_loss": 0.80310678, "learning_rate": 2.9697032430642256e-08, "loss": 0.82498574, "num_input_tokens_seen": 170169025, "step": 7873, "time_per_iteration": 3.631812334060669 }, { "auxiliary_loss_clip": 0.01160256, "auxiliary_loss_mlp": 0.01020804, "balance_loss_clip": 1.0447017, "balance_loss_mlp": 1.01352, "epoch": 0.9467925208922022, "flos": 17237912520960.0, "grad_norm": 2.94405216619653, "language_loss": 0.73951995, "learning_rate": 2.9563442285067906e-08, "loss": 0.7613306, "num_input_tokens_seen": 170186070, "step": 7874, "time_per_iteration": 3.7736799716949463 }, { "auxiliary_loss_clip": 0.01162301, "auxiliary_loss_mlp": 0.01026075, "balance_loss_clip": 1.046, "balance_loss_mlp": 1.01816595, "epoch": 0.9469127637828414, "flos": 29169016859520.0, "grad_norm": 2.1789383524935646, "language_loss": 0.795771, "learning_rate": 2.943015105315294e-08, "loss": 0.81765479, "num_input_tokens_seen": 170206265, "step": 7875, "time_per_iteration": 2.7276570796966553 }, { "auxiliary_loss_clip": 0.01157498, "auxiliary_loss_mlp": 0.01032174, "balance_loss_clip": 1.04713523, "balance_loss_mlp": 1.02446079, "epoch": 0.9470330066734804, "flos": 26030460234240.0, "grad_norm": 2.3522630982432013, "language_loss": 0.66441143, "learning_rate": 2.929715875511718e-08, "loss": 0.68630821, "num_input_tokens_seen": 170225300, "step": 7876, "time_per_iteration": 2.964416742324829 }, { "auxiliary_loss_clip": 0.01161615, "auxiliary_loss_mlp": 0.01028914, "balance_loss_clip": 1.04296947, "balance_loss_mlp": 1.02166319, "epoch": 0.9471532495641195, "flos": 23440906056960.0, "grad_norm": 1.9834709052401824, "language_loss": 0.69945037, "learning_rate": 2.9164465411135375e-08, "loss": 0.72135568, "num_input_tokens_seen": 170245070, "step": 7877, "time_per_iteration": 3.79898738861084 }, { "auxiliary_loss_clip": 0.01164566, "auxiliary_loss_mlp": 0.01024942, "balance_loss_clip": 1.04883373, "balance_loss_mlp": 1.01767623, "epoch": 0.9472734924547586, "flos": 15815850099840.0, "grad_norm": 1.8941731124104046, "language_loss": 0.80883288, "learning_rate": 2.9032071041337426e-08, "loss": 0.83072793, "num_input_tokens_seen": 170263305, "step": 7878, "time_per_iteration": 2.733664035797119 }, { "auxiliary_loss_clip": 0.01153442, "auxiliary_loss_mlp": 0.01021456, "balance_loss_clip": 1.04797852, "balance_loss_mlp": 1.01452661, "epoch": 0.9473937353453977, "flos": 11181793697280.0, "grad_norm": 1.6743005739767596, "language_loss": 0.7287361, "learning_rate": 2.889997566580704e-08, "loss": 0.75048506, "num_input_tokens_seen": 170281460, "step": 7879, "time_per_iteration": 2.879889965057373 }, { "auxiliary_loss_clip": 0.01167327, "auxiliary_loss_mlp": 0.01029051, "balance_loss_clip": 1.04651225, "balance_loss_mlp": 1.02105808, "epoch": 0.9475139782360368, "flos": 25775530433280.0, "grad_norm": 1.888949213679042, "language_loss": 0.70394468, "learning_rate": 2.8768179304583086e-08, "loss": 0.7259084, "num_input_tokens_seen": 170303515, "step": 7880, "time_per_iteration": 2.8184173107147217 }, { "auxiliary_loss_clip": 0.01157807, "auxiliary_loss_mlp": 0.010268, "balance_loss_clip": 1.04863381, "balance_loss_mlp": 1.01966536, "epoch": 0.9476342211266758, "flos": 22820046451200.0, "grad_norm": 1.637699856952609, "language_loss": 0.73626906, "learning_rate": 2.8636681977659117e-08, "loss": 0.75811517, "num_input_tokens_seen": 170323165, "step": 7881, "time_per_iteration": 2.8455114364624023 }, { "auxiliary_loss_clip": 0.01153238, "auxiliary_loss_mlp": 0.01028212, "balance_loss_clip": 1.05021429, "balance_loss_mlp": 1.0205915, "epoch": 0.947754464017315, "flos": 20193611984640.0, "grad_norm": 2.1286106060698486, "language_loss": 0.78030801, "learning_rate": 2.850548370498318e-08, "loss": 0.80212247, "num_input_tokens_seen": 170341005, "step": 7882, "time_per_iteration": 2.7557458877563477 }, { "auxiliary_loss_clip": 0.01161912, "auxiliary_loss_mlp": 0.01023699, "balance_loss_clip": 1.04465246, "balance_loss_mlp": 1.01657963, "epoch": 0.9478747069079541, "flos": 24717925359360.0, "grad_norm": 1.5200014388357652, "language_loss": 0.71222043, "learning_rate": 2.8374584506457798e-08, "loss": 0.7340765, "num_input_tokens_seen": 170362280, "step": 7883, "time_per_iteration": 2.7811522483825684 }, { "auxiliary_loss_clip": 0.01156989, "auxiliary_loss_mlp": 0.01024611, "balance_loss_clip": 1.04739761, "balance_loss_mlp": 1.01703548, "epoch": 0.9479949497985931, "flos": 21361355136000.0, "grad_norm": 2.5145145728156577, "language_loss": 0.67012465, "learning_rate": 2.824398440193998e-08, "loss": 0.69194067, "num_input_tokens_seen": 170381080, "step": 7884, "time_per_iteration": 2.7320659160614014 }, { "auxiliary_loss_clip": 0.0114857, "auxiliary_loss_mlp": 0.01023935, "balance_loss_clip": 1.04770863, "balance_loss_mlp": 1.01642489, "epoch": 0.9481151926892323, "flos": 18148606968960.0, "grad_norm": 2.0354419985669043, "language_loss": 0.71266043, "learning_rate": 2.811368341124232e-08, "loss": 0.73438549, "num_input_tokens_seen": 170400150, "step": 7885, "time_per_iteration": 3.5983757972717285 }, { "auxiliary_loss_clip": 0.01162072, "auxiliary_loss_mlp": 0.01022006, "balance_loss_clip": 1.04970777, "balance_loss_mlp": 1.01513958, "epoch": 0.9482354355798713, "flos": 22128012046080.0, "grad_norm": 3.958786790694676, "language_loss": 0.68132305, "learning_rate": 2.7983681554131222e-08, "loss": 0.70316386, "num_input_tokens_seen": 170420410, "step": 7886, "time_per_iteration": 2.7013931274414062 }, { "auxiliary_loss_clip": 0.01161016, "auxiliary_loss_mlp": 0.01024269, "balance_loss_clip": 1.04917717, "balance_loss_mlp": 1.01630569, "epoch": 0.9483556784705104, "flos": 19063072344960.0, "grad_norm": 4.357672274156402, "language_loss": 0.70367956, "learning_rate": 2.7853978850327365e-08, "loss": 0.72553235, "num_input_tokens_seen": 170439580, "step": 7887, "time_per_iteration": 2.650756597518921 }, { "auxiliary_loss_clip": 0.01156558, "auxiliary_loss_mlp": 0.01023728, "balance_loss_clip": 1.04914999, "balance_loss_mlp": 1.01741624, "epoch": 0.9484759213611496, "flos": 25777110631680.0, "grad_norm": 1.834857957409911, "language_loss": 0.86931384, "learning_rate": 2.7724575319507225e-08, "loss": 0.89111674, "num_input_tokens_seen": 170459290, "step": 7888, "time_per_iteration": 2.7445223331451416 }, { "auxiliary_loss_clip": 0.01160134, "auxiliary_loss_mlp": 0.0102297, "balance_loss_clip": 1.04327977, "balance_loss_mlp": 1.01588356, "epoch": 0.9485961642517886, "flos": 20667740532480.0, "grad_norm": 2.0759671324991085, "language_loss": 0.7737307, "learning_rate": 2.759547098130044e-08, "loss": 0.79556173, "num_input_tokens_seen": 170478020, "step": 7889, "time_per_iteration": 2.6761300563812256 }, { "auxiliary_loss_clip": 0.01164419, "auxiliary_loss_mlp": 0.01020987, "balance_loss_clip": 1.04764462, "balance_loss_mlp": 1.01400697, "epoch": 0.9487164071424277, "flos": 22674069578880.0, "grad_norm": 1.7453091404333483, "language_loss": 0.76812887, "learning_rate": 2.746666585529267e-08, "loss": 0.78998303, "num_input_tokens_seen": 170498295, "step": 7890, "time_per_iteration": 2.704204797744751 }, { "auxiliary_loss_clip": 0.01155595, "auxiliary_loss_mlp": 0.01031484, "balance_loss_clip": 1.04541862, "balance_loss_mlp": 1.02431059, "epoch": 0.9488366500330668, "flos": 38726461716480.0, "grad_norm": 2.2978728784136786, "language_loss": 0.74282324, "learning_rate": 2.73381599610234e-08, "loss": 0.76469398, "num_input_tokens_seen": 170518695, "step": 7891, "time_per_iteration": 2.7921998500823975 }, { "auxiliary_loss_clip": 0.0115833, "auxiliary_loss_mlp": 0.01025191, "balance_loss_clip": 1.04527843, "balance_loss_mlp": 1.01738834, "epoch": 0.9489568929237059, "flos": 27890920149120.0, "grad_norm": 1.6514924162151776, "language_loss": 0.71413684, "learning_rate": 2.7209953317987033e-08, "loss": 0.73597205, "num_input_tokens_seen": 170539735, "step": 7892, "time_per_iteration": 2.7651286125183105 }, { "auxiliary_loss_clip": 0.01159609, "auxiliary_loss_mlp": 0.01027011, "balance_loss_clip": 1.04514098, "balance_loss_mlp": 1.02002561, "epoch": 0.9490771358143449, "flos": 33580642291200.0, "grad_norm": 2.0355763258234303, "language_loss": 0.78017688, "learning_rate": 2.7082045945631793e-08, "loss": 0.80204308, "num_input_tokens_seen": 170561950, "step": 7893, "time_per_iteration": 2.7819149494171143 }, { "auxiliary_loss_clip": 0.01150474, "auxiliary_loss_mlp": 0.01022673, "balance_loss_clip": 1.04623449, "balance_loss_mlp": 1.01595533, "epoch": 0.9491973787049841, "flos": 14793796512000.0, "grad_norm": 2.4038832240365595, "language_loss": 0.69703454, "learning_rate": 2.6954437863361712e-08, "loss": 0.71876597, "num_input_tokens_seen": 170579865, "step": 7894, "time_per_iteration": 2.7710514068603516 }, { "auxiliary_loss_clip": 0.01151734, "auxiliary_loss_mlp": 0.01021558, "balance_loss_clip": 1.04549623, "balance_loss_mlp": 1.01488566, "epoch": 0.9493176215956232, "flos": 25332535998720.0, "grad_norm": 1.9559438479804698, "language_loss": 0.70960605, "learning_rate": 2.6827129090534862e-08, "loss": 0.73133898, "num_input_tokens_seen": 170600165, "step": 7895, "time_per_iteration": 2.8186910152435303 }, { "auxiliary_loss_clip": 0.01155103, "auxiliary_loss_mlp": 0.01023587, "balance_loss_clip": 1.04477239, "balance_loss_mlp": 1.01615143, "epoch": 0.9494378644862622, "flos": 21029971236480.0, "grad_norm": 1.9898088812877524, "language_loss": 0.77904397, "learning_rate": 2.670011964646335e-08, "loss": 0.80083084, "num_input_tokens_seen": 170618845, "step": 7896, "time_per_iteration": 2.6760270595550537 }, { "auxiliary_loss_clip": 0.01156524, "auxiliary_loss_mlp": 0.01030059, "balance_loss_clip": 1.04797125, "balance_loss_mlp": 1.02194118, "epoch": 0.9495581073769014, "flos": 15195134148480.0, "grad_norm": 9.97282785108106, "language_loss": 0.67706442, "learning_rate": 2.657340955041487e-08, "loss": 0.69893014, "num_input_tokens_seen": 170637620, "step": 7897, "time_per_iteration": 2.805347204208374 }, { "auxiliary_loss_clip": 0.01157208, "auxiliary_loss_mlp": 0.01024807, "balance_loss_clip": 1.04760122, "balance_loss_mlp": 1.01747012, "epoch": 0.9496783502675404, "flos": 28616566705920.0, "grad_norm": 3.602937241260397, "language_loss": 0.7152968, "learning_rate": 2.6446998821611167e-08, "loss": 0.73711693, "num_input_tokens_seen": 170657815, "step": 7898, "time_per_iteration": 2.7357826232910156 }, { "auxiliary_loss_clip": 0.0115912, "auxiliary_loss_mlp": 0.01022054, "balance_loss_clip": 1.04950857, "balance_loss_mlp": 1.01484442, "epoch": 0.9497985931581795, "flos": 14866874732160.0, "grad_norm": 5.69054213210564, "language_loss": 0.71848702, "learning_rate": 2.6320887479228228e-08, "loss": 0.74029875, "num_input_tokens_seen": 170674415, "step": 7899, "time_per_iteration": 2.7106995582580566 }, { "auxiliary_loss_clip": 0.01161569, "auxiliary_loss_mlp": 0.01030433, "balance_loss_clip": 1.04605722, "balance_loss_mlp": 1.02323842, "epoch": 0.9499188360488187, "flos": 27193319136000.0, "grad_norm": 2.3402702442562306, "language_loss": 0.72484499, "learning_rate": 2.619507554239786e-08, "loss": 0.74676502, "num_input_tokens_seen": 170692975, "step": 7900, "time_per_iteration": 4.612868309020996 }, { "auxiliary_loss_clip": 0.01157062, "auxiliary_loss_mlp": 0.0102537, "balance_loss_clip": 1.04630399, "balance_loss_mlp": 1.01800025, "epoch": 0.9500390789394577, "flos": 24316479982080.0, "grad_norm": 1.8967244188371133, "language_loss": 0.69818461, "learning_rate": 2.606956303020502e-08, "loss": 0.72000891, "num_input_tokens_seen": 170713780, "step": 7901, "time_per_iteration": 2.7162253856658936 }, { "auxiliary_loss_clip": 0.01162451, "auxiliary_loss_mlp": 0.01027106, "balance_loss_clip": 1.04732335, "balance_loss_mlp": 1.01965857, "epoch": 0.9501593218300968, "flos": 14354752573440.0, "grad_norm": 1.803449811688652, "language_loss": 0.84243703, "learning_rate": 2.5944349961690036e-08, "loss": 0.86433256, "num_input_tokens_seen": 170730800, "step": 7902, "time_per_iteration": 2.6474051475524902 }, { "auxiliary_loss_clip": 0.01154092, "auxiliary_loss_mlp": 0.01024911, "balance_loss_clip": 1.04492176, "balance_loss_mlp": 1.01796746, "epoch": 0.9502795647207359, "flos": 38728113742080.0, "grad_norm": 1.5843917995274142, "language_loss": 0.73081827, "learning_rate": 2.581943635584749e-08, "loss": 0.7526083, "num_input_tokens_seen": 170753630, "step": 7903, "time_per_iteration": 3.7899768352508545 }, { "auxiliary_loss_clip": 0.01150292, "auxiliary_loss_mlp": 0.01025434, "balance_loss_clip": 1.0450269, "balance_loss_mlp": 1.01865125, "epoch": 0.950399807611375, "flos": 40808023799040.0, "grad_norm": 1.9642359982011404, "language_loss": 0.65397596, "learning_rate": 2.569482223162689e-08, "loss": 0.67573321, "num_input_tokens_seen": 170777605, "step": 7904, "time_per_iteration": 2.818716287612915 }, { "auxiliary_loss_clip": 0.01160554, "auxiliary_loss_mlp": 0.01026087, "balance_loss_clip": 1.0439651, "balance_loss_mlp": 1.01844597, "epoch": 0.950520050502014, "flos": 23440403266560.0, "grad_norm": 1.7891610865189196, "language_loss": 0.72659427, "learning_rate": 2.5570507607932e-08, "loss": 0.74846071, "num_input_tokens_seen": 170797520, "step": 7905, "time_per_iteration": 2.66584849357605 }, { "auxiliary_loss_clip": 0.01168045, "auxiliary_loss_mlp": 0.01026655, "balance_loss_clip": 1.04836893, "balance_loss_mlp": 1.01909685, "epoch": 0.9506402933926532, "flos": 17783718658560.0, "grad_norm": 3.34104528504882, "language_loss": 0.63992631, "learning_rate": 2.54464925036213e-08, "loss": 0.66187334, "num_input_tokens_seen": 170814810, "step": 7906, "time_per_iteration": 2.5659821033477783 }, { "auxiliary_loss_clip": 0.0116115, "auxiliary_loss_mlp": 0.01024711, "balance_loss_clip": 1.04673755, "balance_loss_mlp": 1.01729286, "epoch": 0.9507605362832923, "flos": 32561928668160.0, "grad_norm": 1.7864676897451777, "language_loss": 0.61113071, "learning_rate": 2.532277693750773e-08, "loss": 0.63298929, "num_input_tokens_seen": 170835735, "step": 7907, "time_per_iteration": 2.763040781021118 }, { "auxiliary_loss_clip": 0.01153749, "auxiliary_loss_mlp": 0.01026044, "balance_loss_clip": 1.0514909, "balance_loss_mlp": 1.01872158, "epoch": 0.9508807791739313, "flos": 19602054898560.0, "grad_norm": 2.1104093855152892, "language_loss": 0.75520241, "learning_rate": 2.5199360928358948e-08, "loss": 0.77700037, "num_input_tokens_seen": 170852970, "step": 7908, "time_per_iteration": 2.744154214859009 }, { "auxiliary_loss_clip": 0.0115235, "auxiliary_loss_mlp": 0.01047534, "balance_loss_clip": 1.04412723, "balance_loss_mlp": 1.01428962, "epoch": 0.9510010220645704, "flos": 21471852349440.0, "grad_norm": 2.328634392315735, "language_loss": 0.86970949, "learning_rate": 2.507624449489665e-08, "loss": 0.89170837, "num_input_tokens_seen": 170871600, "step": 7909, "time_per_iteration": 2.7492387294769287 }, { "auxiliary_loss_clip": 0.0116021, "auxiliary_loss_mlp": 0.01021611, "balance_loss_clip": 1.04731381, "balance_loss_mlp": 1.01419914, "epoch": 0.9511212649552095, "flos": 18879999701760.0, "grad_norm": 2.2797168821547436, "language_loss": 0.65336514, "learning_rate": 2.495342765579811e-08, "loss": 0.6751833, "num_input_tokens_seen": 170890260, "step": 7910, "time_per_iteration": 2.6872191429138184 }, { "auxiliary_loss_clip": 0.01146363, "auxiliary_loss_mlp": 0.01022593, "balance_loss_clip": 1.04394746, "balance_loss_mlp": 1.0151093, "epoch": 0.9512415078458486, "flos": 20810521094400.0, "grad_norm": 2.4407843395572235, "language_loss": 0.70970482, "learning_rate": 2.4830910429693984e-08, "loss": 0.73139441, "num_input_tokens_seen": 170910220, "step": 7911, "time_per_iteration": 3.7407214641571045 }, { "auxiliary_loss_clip": 0.01163477, "auxiliary_loss_mlp": 0.01020339, "balance_loss_clip": 1.04512501, "balance_loss_mlp": 1.0129869, "epoch": 0.9513617507364877, "flos": 18369565482240.0, "grad_norm": 1.7830050116562504, "language_loss": 0.7945621, "learning_rate": 2.470869283517052e-08, "loss": 0.81640023, "num_input_tokens_seen": 170928255, "step": 7912, "time_per_iteration": 2.7759287357330322 }, { "auxiliary_loss_clip": 0.01157488, "auxiliary_loss_mlp": 0.01028894, "balance_loss_clip": 1.04598761, "balance_loss_mlp": 1.02143192, "epoch": 0.9514819936271268, "flos": 25010166412800.0, "grad_norm": 1.5656117652822092, "language_loss": 0.7693702, "learning_rate": 2.458677489076777e-08, "loss": 0.79123408, "num_input_tokens_seen": 170949265, "step": 7913, "time_per_iteration": 2.717360496520996 }, { "auxiliary_loss_clip": 0.01152614, "auxiliary_loss_mlp": 0.01024969, "balance_loss_clip": 1.04403019, "balance_loss_mlp": 1.01793289, "epoch": 0.9516022365177659, "flos": 18662129758080.0, "grad_norm": 1.772521770539674, "language_loss": 0.82917535, "learning_rate": 2.446515661498072e-08, "loss": 0.85095125, "num_input_tokens_seen": 170968595, "step": 7914, "time_per_iteration": 2.684833288192749 }, { "auxiliary_loss_clip": 0.0114782, "auxiliary_loss_mlp": 0.01022527, "balance_loss_clip": 1.04623389, "balance_loss_mlp": 1.01536822, "epoch": 0.9517224794084049, "flos": 25372109808000.0, "grad_norm": 4.154490607615752, "language_loss": 0.74741721, "learning_rate": 2.434383802625861e-08, "loss": 0.76912069, "num_input_tokens_seen": 170987550, "step": 7915, "time_per_iteration": 2.855290651321411 }, { "auxiliary_loss_clip": 0.01156918, "auxiliary_loss_mlp": 0.01020152, "balance_loss_clip": 1.04516518, "balance_loss_mlp": 1.01313066, "epoch": 0.9518427222990441, "flos": 21470918595840.0, "grad_norm": 1.9330907787796556, "language_loss": 0.74161255, "learning_rate": 2.4222819143005168e-08, "loss": 0.76338327, "num_input_tokens_seen": 171007145, "step": 7916, "time_per_iteration": 2.7106480598449707 }, { "auxiliary_loss_clip": 0.01164177, "auxiliary_loss_mlp": 0.01024344, "balance_loss_clip": 1.04653168, "balance_loss_mlp": 1.01716161, "epoch": 0.9519629651896832, "flos": 21033634423680.0, "grad_norm": 2.0948464000052156, "language_loss": 0.80591452, "learning_rate": 2.4102099983579706e-08, "loss": 0.8277998, "num_input_tokens_seen": 171026295, "step": 7917, "time_per_iteration": 2.600750207901001 }, { "auxiliary_loss_clip": 0.01163334, "auxiliary_loss_mlp": 0.0102726, "balance_loss_clip": 1.04733682, "balance_loss_mlp": 1.01985121, "epoch": 0.9520832080803222, "flos": 21689219502720.0, "grad_norm": 1.6167325592945498, "language_loss": 0.77171838, "learning_rate": 2.3981680566294236e-08, "loss": 0.79362434, "num_input_tokens_seen": 171045895, "step": 7918, "time_per_iteration": 2.6606998443603516 }, { "auxiliary_loss_clip": 0.01164549, "auxiliary_loss_mlp": 0.01024706, "balance_loss_clip": 1.04762638, "balance_loss_mlp": 1.01773572, "epoch": 0.9522034509709614, "flos": 23145289125120.0, "grad_norm": 1.9699539031759885, "language_loss": 0.73750615, "learning_rate": 2.3861560909416822e-08, "loss": 0.7593987, "num_input_tokens_seen": 171065445, "step": 7919, "time_per_iteration": 2.6307740211486816 }, { "auxiliary_loss_clip": 0.01153163, "auxiliary_loss_mlp": 0.01027753, "balance_loss_clip": 1.04954267, "balance_loss_mlp": 1.02075505, "epoch": 0.9523236938616004, "flos": 24679428958080.0, "grad_norm": 3.65359113898818, "language_loss": 0.82622546, "learning_rate": 2.3741741031169325e-08, "loss": 0.84803462, "num_input_tokens_seen": 171085015, "step": 7920, "time_per_iteration": 2.7385971546173096 }, { "auxiliary_loss_clip": 0.01149996, "auxiliary_loss_mlp": 0.01023715, "balance_loss_clip": 1.04611969, "balance_loss_mlp": 1.0163244, "epoch": 0.9524439367522395, "flos": 22672309812480.0, "grad_norm": 1.9661960418892712, "language_loss": 0.71697962, "learning_rate": 2.3622220949728544e-08, "loss": 0.73871672, "num_input_tokens_seen": 171103900, "step": 7921, "time_per_iteration": 2.6754982471466064 }, { "auxiliary_loss_clip": 0.01158042, "auxiliary_loss_mlp": 0.01025269, "balance_loss_clip": 1.04710817, "balance_loss_mlp": 1.01810408, "epoch": 0.9525641796428787, "flos": 34055525024640.0, "grad_norm": 2.6481755668491296, "language_loss": 0.61461091, "learning_rate": 2.3503000683225526e-08, "loss": 0.63644409, "num_input_tokens_seen": 171121615, "step": 7922, "time_per_iteration": 2.7543833255767822 }, { "auxiliary_loss_clip": 0.01165075, "auxiliary_loss_mlp": 0.01025444, "balance_loss_clip": 1.04540443, "balance_loss_mlp": 1.01808929, "epoch": 0.9526844225335177, "flos": 16727083251840.0, "grad_norm": 2.2896409754904328, "language_loss": 0.84295404, "learning_rate": 2.3384080249745585e-08, "loss": 0.86485922, "num_input_tokens_seen": 171139505, "step": 7923, "time_per_iteration": 2.766362190246582 }, { "auxiliary_loss_clip": 0.01157573, "auxiliary_loss_mlp": 0.01024499, "balance_loss_clip": 1.04724932, "balance_loss_mlp": 1.01735818, "epoch": 0.9528046654241568, "flos": 36939367330560.0, "grad_norm": 2.273037063399404, "language_loss": 0.82948732, "learning_rate": 2.3265459667329178e-08, "loss": 0.85130805, "num_input_tokens_seen": 171158995, "step": 7924, "time_per_iteration": 2.814236879348755 }, { "auxiliary_loss_clip": 0.01158914, "auxiliary_loss_mlp": 0.01024091, "balance_loss_clip": 1.04480767, "balance_loss_mlp": 1.01755261, "epoch": 0.9529249083147959, "flos": 18255010032000.0, "grad_norm": 22.752961241923604, "language_loss": 0.86094731, "learning_rate": 2.31471389539708e-08, "loss": 0.88277733, "num_input_tokens_seen": 171176120, "step": 7925, "time_per_iteration": 2.653477668762207 }, { "auxiliary_loss_clip": 0.0116288, "auxiliary_loss_mlp": 0.0105094, "balance_loss_clip": 1.04691434, "balance_loss_mlp": 1.01655805, "epoch": 0.953045151205435, "flos": 28658438985600.0, "grad_norm": 2.8800619180971254, "language_loss": 0.73049903, "learning_rate": 2.3029118127619872e-08, "loss": 0.75263727, "num_input_tokens_seen": 171195835, "step": 7926, "time_per_iteration": 3.5614805221557617 }, { "auxiliary_loss_clip": 0.01154539, "auxiliary_loss_mlp": 0.01028563, "balance_loss_clip": 1.04822981, "balance_loss_mlp": 1.02127969, "epoch": 0.953165394096074, "flos": 21835232288640.0, "grad_norm": 2.2374027519334776, "language_loss": 0.8670792, "learning_rate": 2.2911397206179628e-08, "loss": 0.88891017, "num_input_tokens_seen": 171212585, "step": 7927, "time_per_iteration": 3.655564308166504 }, { "auxiliary_loss_clip": 0.0116377, "auxiliary_loss_mlp": 0.01024472, "balance_loss_clip": 1.04679823, "balance_loss_mlp": 1.01740932, "epoch": 0.9532856369867132, "flos": 19975059682560.0, "grad_norm": 1.7562286487982692, "language_loss": 0.62726134, "learning_rate": 2.279397620750845e-08, "loss": 0.64914376, "num_input_tokens_seen": 171231630, "step": 7928, "time_per_iteration": 2.661067008972168 }, { "auxiliary_loss_clip": 0.01155102, "auxiliary_loss_mlp": 0.01025087, "balance_loss_clip": 1.04528046, "balance_loss_mlp": 1.01811624, "epoch": 0.9534058798773523, "flos": 15049588239360.0, "grad_norm": 1.9512281897670782, "language_loss": 0.78758508, "learning_rate": 2.2676855149419195e-08, "loss": 0.80938697, "num_input_tokens_seen": 171248800, "step": 7929, "time_per_iteration": 3.523359537124634 }, { "auxiliary_loss_clip": 0.01154676, "auxiliary_loss_mlp": 0.0102528, "balance_loss_clip": 1.04719448, "balance_loss_mlp": 1.01830029, "epoch": 0.9535261227679913, "flos": 17602800831360.0, "grad_norm": 2.344174058351579, "language_loss": 0.75122809, "learning_rate": 2.2560034049678988e-08, "loss": 0.77302766, "num_input_tokens_seen": 171263150, "step": 7930, "time_per_iteration": 2.6808156967163086 }, { "auxiliary_loss_clip": 0.01168846, "auxiliary_loss_mlp": 0.01026473, "balance_loss_clip": 1.0483973, "balance_loss_mlp": 1.01878393, "epoch": 0.9536463656586305, "flos": 23142954741120.0, "grad_norm": 1.7246657787934068, "language_loss": 0.75441885, "learning_rate": 2.2443512926008988e-08, "loss": 0.77637208, "num_input_tokens_seen": 171282480, "step": 7931, "time_per_iteration": 2.599142074584961 }, { "auxiliary_loss_clip": 0.01160311, "auxiliary_loss_mlp": 0.0102268, "balance_loss_clip": 1.0456655, "balance_loss_mlp": 1.0155009, "epoch": 0.9537666085492695, "flos": 18625033987200.0, "grad_norm": 2.5414212473849536, "language_loss": 0.69957614, "learning_rate": 2.2327291796085946e-08, "loss": 0.72140604, "num_input_tokens_seen": 171300840, "step": 7932, "time_per_iteration": 2.724402666091919 }, { "auxiliary_loss_clip": 0.01165478, "auxiliary_loss_mlp": 0.01023153, "balance_loss_clip": 1.0461905, "balance_loss_mlp": 1.01545215, "epoch": 0.9538868514399086, "flos": 18989347680000.0, "grad_norm": 3.7578853258457805, "language_loss": 0.77032471, "learning_rate": 2.2211370677540197e-08, "loss": 0.79221106, "num_input_tokens_seen": 171317365, "step": 7933, "time_per_iteration": 2.6190786361694336 }, { "auxiliary_loss_clip": 0.01167073, "auxiliary_loss_mlp": 0.01028529, "balance_loss_clip": 1.04662251, "balance_loss_mlp": 1.02132261, "epoch": 0.9540070943305478, "flos": 16800556521600.0, "grad_norm": 2.542533540973932, "language_loss": 0.78294659, "learning_rate": 2.2095749587957012e-08, "loss": 0.80490267, "num_input_tokens_seen": 171335270, "step": 7934, "time_per_iteration": 2.6480159759521484 }, { "auxiliary_loss_clip": 0.0116156, "auxiliary_loss_mlp": 0.01027501, "balance_loss_clip": 1.04871726, "balance_loss_mlp": 1.01997304, "epoch": 0.9541273372211868, "flos": 20156911263360.0, "grad_norm": 2.3752972710331806, "language_loss": 0.69349754, "learning_rate": 2.1980428544876138e-08, "loss": 0.71538812, "num_input_tokens_seen": 171353910, "step": 7935, "time_per_iteration": 2.7319366931915283 }, { "auxiliary_loss_clip": 0.01151618, "auxiliary_loss_mlp": 0.01025746, "balance_loss_clip": 1.04708338, "balance_loss_mlp": 1.01790822, "epoch": 0.9542475801118259, "flos": 26725511381760.0, "grad_norm": 1.5970817054850786, "language_loss": 0.74116635, "learning_rate": 2.1865407565791584e-08, "loss": 0.76293999, "num_input_tokens_seen": 171375480, "step": 7936, "time_per_iteration": 2.721261739730835 }, { "auxiliary_loss_clip": 0.01160625, "auxiliary_loss_mlp": 0.01023918, "balance_loss_clip": 1.04516935, "balance_loss_mlp": 1.0160265, "epoch": 0.954367823002465, "flos": 23330911633920.0, "grad_norm": 1.8060968984670205, "language_loss": 0.77197397, "learning_rate": 2.175068666815183e-08, "loss": 0.79381937, "num_input_tokens_seen": 171396320, "step": 7937, "time_per_iteration": 3.6750712394714355 }, { "auxiliary_loss_clip": 0.01158062, "auxiliary_loss_mlp": 0.01025309, "balance_loss_clip": 1.04827678, "balance_loss_mlp": 1.01722121, "epoch": 0.9544880658931041, "flos": 14902713527040.0, "grad_norm": 2.148961537442881, "language_loss": 0.78749341, "learning_rate": 2.163626586935985e-08, "loss": 0.80932713, "num_input_tokens_seen": 171412860, "step": 7938, "time_per_iteration": 2.762643575668335 }, { "auxiliary_loss_clip": 0.01161721, "auxiliary_loss_mlp": 0.01026535, "balance_loss_clip": 1.04740644, "balance_loss_mlp": 1.01874149, "epoch": 0.9546083087837431, "flos": 29095902725760.0, "grad_norm": 1.7828094241679577, "language_loss": 0.63029075, "learning_rate": 2.1522145186773755e-08, "loss": 0.65217334, "num_input_tokens_seen": 171431780, "step": 7939, "time_per_iteration": 2.891554117202759 }, { "auxiliary_loss_clip": 0.01158414, "auxiliary_loss_mlp": 0.01027476, "balance_loss_clip": 1.04681063, "balance_loss_mlp": 1.02007949, "epoch": 0.9547285516743822, "flos": 21142335957120.0, "grad_norm": 1.8848223993262005, "language_loss": 0.85366201, "learning_rate": 2.140832463770481e-08, "loss": 0.87552094, "num_input_tokens_seen": 171450975, "step": 7940, "time_per_iteration": 2.721081495285034 }, { "auxiliary_loss_clip": 0.01163273, "auxiliary_loss_mlp": 0.01027861, "balance_loss_clip": 1.04729509, "balance_loss_mlp": 1.02012992, "epoch": 0.9548487945650214, "flos": 27490157130240.0, "grad_norm": 2.219802788179885, "language_loss": 0.76207685, "learning_rate": 2.129480423941987e-08, "loss": 0.78398812, "num_input_tokens_seen": 171467645, "step": 7941, "time_per_iteration": 2.777864933013916 }, { "auxiliary_loss_clip": 0.01164181, "auxiliary_loss_mlp": 0.0102442, "balance_loss_clip": 1.04803538, "balance_loss_mlp": 1.01742268, "epoch": 0.9549690374556604, "flos": 22273198819200.0, "grad_norm": 1.7571594256129062, "language_loss": 0.80273175, "learning_rate": 2.1181584009140052e-08, "loss": 0.82461774, "num_input_tokens_seen": 171487185, "step": 7942, "time_per_iteration": 2.779954195022583 }, { "auxiliary_loss_clip": 0.01163108, "auxiliary_loss_mlp": 0.010227, "balance_loss_clip": 1.04688239, "balance_loss_mlp": 1.01625419, "epoch": 0.9550892803462995, "flos": 17595294888960.0, "grad_norm": 2.0054059694539967, "language_loss": 0.84121192, "learning_rate": 2.10686639640405e-08, "loss": 0.86307001, "num_input_tokens_seen": 171501275, "step": 7943, "time_per_iteration": 2.6813533306121826 }, { "auxiliary_loss_clip": 0.01165335, "auxiliary_loss_mlp": 0.01026604, "balance_loss_clip": 1.04625428, "balance_loss_mlp": 1.01996112, "epoch": 0.9552095232369386, "flos": 24353144789760.0, "grad_norm": 1.9047916228244723, "language_loss": 0.81364834, "learning_rate": 2.0956044121251294e-08, "loss": 0.83556771, "num_input_tokens_seen": 171520060, "step": 7944, "time_per_iteration": 2.7649340629577637 }, { "auxiliary_loss_clip": 0.011541, "auxiliary_loss_mlp": 0.01021538, "balance_loss_clip": 1.04652381, "balance_loss_mlp": 1.01425433, "epoch": 0.9553297661275777, "flos": 22746860490240.0, "grad_norm": 1.6929807780650954, "language_loss": 0.81093967, "learning_rate": 2.084372449785654e-08, "loss": 0.83269608, "num_input_tokens_seen": 171539895, "step": 7945, "time_per_iteration": 2.684309959411621 }, { "auxiliary_loss_clip": 0.01156013, "auxiliary_loss_mlp": 0.01024945, "balance_loss_clip": 1.0461843, "balance_loss_mlp": 1.01755452, "epoch": 0.9554500090182168, "flos": 15413866018560.0, "grad_norm": 4.045878296455101, "language_loss": 0.68548727, "learning_rate": 2.0731705110895282e-08, "loss": 0.70729685, "num_input_tokens_seen": 171557385, "step": 7946, "time_per_iteration": 2.748098134994507 }, { "auxiliary_loss_clip": 0.01169081, "auxiliary_loss_mlp": 0.01025889, "balance_loss_clip": 1.05028844, "balance_loss_mlp": 1.01855803, "epoch": 0.9555702519088559, "flos": 23513517400320.0, "grad_norm": 1.709923580019245, "language_loss": 0.87104797, "learning_rate": 2.0619985977360587e-08, "loss": 0.89299774, "num_input_tokens_seen": 171575705, "step": 7947, "time_per_iteration": 2.842031478881836 }, { "auxiliary_loss_clip": 0.0115464, "auxiliary_loss_mlp": 0.01028085, "balance_loss_clip": 1.04548717, "balance_loss_mlp": 1.02162695, "epoch": 0.955690494799495, "flos": 22962072827520.0, "grad_norm": 1.6814260103703427, "language_loss": 0.77144718, "learning_rate": 2.0508567114200237e-08, "loss": 0.79327446, "num_input_tokens_seen": 171595620, "step": 7948, "time_per_iteration": 2.8607516288757324 }, { "auxiliary_loss_clip": 0.01161323, "auxiliary_loss_mlp": 0.01021492, "balance_loss_clip": 1.04607201, "balance_loss_mlp": 1.01464009, "epoch": 0.955810737690134, "flos": 26031250333440.0, "grad_norm": 1.8850554741756447, "language_loss": 0.78305471, "learning_rate": 2.0397448538316485e-08, "loss": 0.80488288, "num_input_tokens_seen": 171616660, "step": 7949, "time_per_iteration": 2.8400890827178955 }, { "auxiliary_loss_clip": 0.01150509, "auxiliary_loss_mlp": 0.01028687, "balance_loss_clip": 1.04571939, "balance_loss_mlp": 1.02196932, "epoch": 0.9559309805807732, "flos": 20849951249280.0, "grad_norm": 2.134772820497827, "language_loss": 0.67011249, "learning_rate": 2.028663026656563e-08, "loss": 0.69190443, "num_input_tokens_seen": 171635515, "step": 7950, "time_per_iteration": 2.8113930225372314 }, { "auxiliary_loss_clip": 0.01159559, "auxiliary_loss_mlp": 0.01054592, "balance_loss_clip": 1.04378152, "balance_loss_mlp": 1.01860762, "epoch": 0.9560512234714122, "flos": 21578219498880.0, "grad_norm": 3.363171201976799, "language_loss": 0.72076023, "learning_rate": 2.0176112315758885e-08, "loss": 0.74290168, "num_input_tokens_seen": 171653305, "step": 7951, "time_per_iteration": 2.7576141357421875 }, { "auxiliary_loss_clip": 0.01160624, "auxiliary_loss_mlp": 0.01023617, "balance_loss_clip": 1.04652929, "balance_loss_mlp": 1.01589262, "epoch": 0.9561714663620513, "flos": 17450144029440.0, "grad_norm": 2.9385741496893205, "language_loss": 0.69263077, "learning_rate": 2.0065894702661957e-08, "loss": 0.71447325, "num_input_tokens_seen": 171669980, "step": 7952, "time_per_iteration": 3.6866252422332764 }, { "auxiliary_loss_clip": 0.01151969, "auxiliary_loss_mlp": 0.01053511, "balance_loss_clip": 1.04661644, "balance_loss_mlp": 1.01766658, "epoch": 0.9562917092526905, "flos": 26098510550400.0, "grad_norm": 1.9305268879383894, "language_loss": 0.77931511, "learning_rate": 1.9955977443994577e-08, "loss": 0.80137002, "num_input_tokens_seen": 171689970, "step": 7953, "time_per_iteration": 3.755263090133667 }, { "auxiliary_loss_clip": 0.01161373, "auxiliary_loss_mlp": 0.01028608, "balance_loss_clip": 1.0479598, "balance_loss_mlp": 1.0210855, "epoch": 0.9564119521433295, "flos": 24096742531200.0, "grad_norm": 2.7047389541235174, "language_loss": 0.62150335, "learning_rate": 1.9846360556430965e-08, "loss": 0.64340317, "num_input_tokens_seen": 171708270, "step": 7954, "time_per_iteration": 2.7939467430114746 }, { "auxiliary_loss_clip": 0.0116414, "auxiliary_loss_mlp": 0.01020065, "balance_loss_clip": 1.04587007, "balance_loss_mlp": 1.01308191, "epoch": 0.9565321950339686, "flos": 32008903896960.0, "grad_norm": 2.1533436094461824, "language_loss": 0.61530703, "learning_rate": 1.973704405660004e-08, "loss": 0.6371491, "num_input_tokens_seen": 171729385, "step": 7955, "time_per_iteration": 3.6925208568573 }, { "auxiliary_loss_clip": 0.01150014, "auxiliary_loss_mlp": 0.0102314, "balance_loss_clip": 1.04586959, "balance_loss_mlp": 1.01601434, "epoch": 0.9566524379246077, "flos": 23588642695680.0, "grad_norm": 1.4318622865186592, "language_loss": 0.77967286, "learning_rate": 1.9628027961085203e-08, "loss": 0.80140436, "num_input_tokens_seen": 171752615, "step": 7956, "time_per_iteration": 2.8774361610412598 }, { "auxiliary_loss_clip": 0.01145273, "auxiliary_loss_mlp": 0.01020257, "balance_loss_clip": 1.04448009, "balance_loss_mlp": 1.0134325, "epoch": 0.9567726808152468, "flos": 38067716240640.0, "grad_norm": 1.665924239629296, "language_loss": 0.83781642, "learning_rate": 1.9519312286423894e-08, "loss": 0.85947168, "num_input_tokens_seen": 171775810, "step": 7957, "time_per_iteration": 2.9638378620147705 }, { "auxiliary_loss_clip": 0.01161081, "auxiliary_loss_mlp": 0.0103028, "balance_loss_clip": 1.04728889, "balance_loss_mlp": 1.02290154, "epoch": 0.9568929237058859, "flos": 22744059229440.0, "grad_norm": 1.8617795108713893, "language_loss": 0.77566874, "learning_rate": 1.9410897049108255e-08, "loss": 0.79758239, "num_input_tokens_seen": 171795090, "step": 7958, "time_per_iteration": 2.6562588214874268 }, { "auxiliary_loss_clip": 0.01171288, "auxiliary_loss_mlp": 0.01024345, "balance_loss_clip": 1.04996538, "balance_loss_mlp": 1.01699924, "epoch": 0.957013166596525, "flos": 23841633162240.0, "grad_norm": 1.9391357921375132, "language_loss": 0.91340494, "learning_rate": 1.9302782265584905e-08, "loss": 0.93536127, "num_input_tokens_seen": 171815755, "step": 7959, "time_per_iteration": 2.735238552093506 }, { "auxiliary_loss_clip": 0.01141334, "auxiliary_loss_mlp": 0.01023105, "balance_loss_clip": 1.0455153, "balance_loss_mlp": 1.01629782, "epoch": 0.9571334094871641, "flos": 17639286071040.0, "grad_norm": 2.62839869334732, "language_loss": 0.87231451, "learning_rate": 1.9194967952254282e-08, "loss": 0.89395893, "num_input_tokens_seen": 171834330, "step": 7960, "time_per_iteration": 2.707184076309204 }, { "auxiliary_loss_clip": 0.0116134, "auxiliary_loss_mlp": 0.01028985, "balance_loss_clip": 1.04763389, "balance_loss_mlp": 1.02191567, "epoch": 0.9572536523778031, "flos": 15369623441280.0, "grad_norm": 2.2363243297596482, "language_loss": 0.80800462, "learning_rate": 1.9087454125472635e-08, "loss": 0.82990795, "num_input_tokens_seen": 171848805, "step": 7961, "time_per_iteration": 2.7120182514190674 }, { "auxiliary_loss_clip": 0.01167443, "auxiliary_loss_mlp": 0.01025044, "balance_loss_clip": 1.04754686, "balance_loss_mlp": 1.01746213, "epoch": 0.9573738952684423, "flos": 24969838417920.0, "grad_norm": 2.2680054069983115, "language_loss": 0.78751582, "learning_rate": 1.8980240801548696e-08, "loss": 0.80944067, "num_input_tokens_seen": 171867995, "step": 7962, "time_per_iteration": 2.6141464710235596 }, { "auxiliary_loss_clip": 0.01156893, "auxiliary_loss_mlp": 0.01019693, "balance_loss_clip": 1.04917645, "balance_loss_mlp": 1.01264453, "epoch": 0.9574941381590814, "flos": 25769461034880.0, "grad_norm": 1.657652855496309, "language_loss": 0.74173266, "learning_rate": 1.8873327996747458e-08, "loss": 0.76349849, "num_input_tokens_seen": 171886495, "step": 7963, "time_per_iteration": 2.72883677482605 }, { "auxiliary_loss_clip": 0.01163497, "auxiliary_loss_mlp": 0.01024435, "balance_loss_clip": 1.04424405, "balance_loss_mlp": 1.01682305, "epoch": 0.9576143810497204, "flos": 32307178435200.0, "grad_norm": 1.7199572071560472, "language_loss": 0.6607132, "learning_rate": 1.8766715727287053e-08, "loss": 0.68259251, "num_input_tokens_seen": 171908200, "step": 7964, "time_per_iteration": 3.6325252056121826 }, { "auxiliary_loss_clip": 0.0116601, "auxiliary_loss_mlp": 0.01053634, "balance_loss_clip": 1.04567897, "balance_loss_mlp": 1.01777875, "epoch": 0.9577346239403596, "flos": 27745733376000.0, "grad_norm": 1.6708945961947477, "language_loss": 0.79044175, "learning_rate": 1.8660404009340546e-08, "loss": 0.81263822, "num_input_tokens_seen": 171928650, "step": 7965, "time_per_iteration": 2.753432273864746 }, { "auxiliary_loss_clip": 0.01059742, "auxiliary_loss_mlp": 0.01002559, "balance_loss_clip": 1.00712216, "balance_loss_mlp": 1.00155783, "epoch": 0.9578548668309986, "flos": 57468313710720.0, "grad_norm": 0.8730488663185593, "language_loss": 0.59542727, "learning_rate": 1.8554392859035485e-08, "loss": 0.61605024, "num_input_tokens_seen": 171986400, "step": 7966, "time_per_iteration": 3.2844033241271973 }, { "auxiliary_loss_clip": 0.01150475, "auxiliary_loss_mlp": 0.01027397, "balance_loss_clip": 1.0448972, "balance_loss_mlp": 1.02006018, "epoch": 0.9579751097216377, "flos": 19756040503680.0, "grad_norm": 1.7126736682092139, "language_loss": 0.78948677, "learning_rate": 1.8448682292453444e-08, "loss": 0.81126559, "num_input_tokens_seen": 172005475, "step": 7967, "time_per_iteration": 2.9188663959503174 }, { "auxiliary_loss_clip": 0.01165622, "auxiliary_loss_mlp": 0.01028051, "balance_loss_clip": 1.0469346, "balance_loss_mlp": 1.02071989, "epoch": 0.9580953526122769, "flos": 18041270152320.0, "grad_norm": 1.6926806339732243, "language_loss": 0.65721834, "learning_rate": 1.8343272325631154e-08, "loss": 0.67915505, "num_input_tokens_seen": 172024420, "step": 7968, "time_per_iteration": 2.6376636028289795 }, { "auxiliary_loss_clip": 0.01152708, "auxiliary_loss_mlp": 0.01050247, "balance_loss_clip": 1.04730272, "balance_loss_mlp": 1.01546574, "epoch": 0.9582155955029159, "flos": 24270154416000.0, "grad_norm": 2.2870972413656, "language_loss": 0.78172487, "learning_rate": 1.8238162974558492e-08, "loss": 0.80375445, "num_input_tokens_seen": 172038350, "step": 7969, "time_per_iteration": 2.762202024459839 }, { "auxiliary_loss_clip": 0.01158643, "auxiliary_loss_mlp": 0.01027528, "balance_loss_clip": 1.04804683, "balance_loss_mlp": 1.02059293, "epoch": 0.958335838393555, "flos": 22783309816320.0, "grad_norm": 1.874081601113166, "language_loss": 0.75053257, "learning_rate": 1.8133354255181144e-08, "loss": 0.7723943, "num_input_tokens_seen": 172058665, "step": 7970, "time_per_iteration": 2.728788137435913 }, { "auxiliary_loss_clip": 0.01156317, "auxiliary_loss_mlp": 0.01022717, "balance_loss_clip": 1.04462254, "balance_loss_mlp": 1.01585674, "epoch": 0.958456081284194, "flos": 16911484698240.0, "grad_norm": 2.8998785629837647, "language_loss": 0.74283177, "learning_rate": 1.802884618339795e-08, "loss": 0.76462209, "num_input_tokens_seen": 172077470, "step": 7971, "time_per_iteration": 2.641106128692627 }, { "auxiliary_loss_clip": 0.01165132, "auxiliary_loss_mlp": 0.01029444, "balance_loss_clip": 1.04772449, "balance_loss_mlp": 1.02200818, "epoch": 0.9585763241748332, "flos": 19974951941760.0, "grad_norm": 1.8211006705489574, "language_loss": 0.80826628, "learning_rate": 1.7924638775062894e-08, "loss": 0.830212, "num_input_tokens_seen": 172096590, "step": 7972, "time_per_iteration": 2.6961402893066406 }, { "auxiliary_loss_clip": 0.01149949, "auxiliary_loss_mlp": 0.01025548, "balance_loss_clip": 1.04524076, "balance_loss_mlp": 1.01876163, "epoch": 0.9586965670654722, "flos": 21395649646080.0, "grad_norm": 1.914313096149103, "language_loss": 0.81723511, "learning_rate": 1.7820732045984444e-08, "loss": 0.83899003, "num_input_tokens_seen": 172116735, "step": 7973, "time_per_iteration": 2.7312612533569336 }, { "auxiliary_loss_clip": 0.01159704, "auxiliary_loss_mlp": 0.01027644, "balance_loss_clip": 1.04641008, "balance_loss_mlp": 1.01932323, "epoch": 0.9588168099561113, "flos": 21435115714560.0, "grad_norm": 1.8414428752951226, "language_loss": 0.74150312, "learning_rate": 1.7717126011924655e-08, "loss": 0.76337659, "num_input_tokens_seen": 172138320, "step": 7974, "time_per_iteration": 2.6837315559387207 }, { "auxiliary_loss_clip": 0.01150656, "auxiliary_loss_mlp": 0.01027198, "balance_loss_clip": 1.04695261, "balance_loss_mlp": 1.02000666, "epoch": 0.9589370528467505, "flos": 11763761852160.0, "grad_norm": 4.850184444874017, "language_loss": 0.76054513, "learning_rate": 1.7613820688600957e-08, "loss": 0.78232372, "num_input_tokens_seen": 172154225, "step": 7975, "time_per_iteration": 2.821362257003784 }, { "auxiliary_loss_clip": 0.01164925, "auxiliary_loss_mlp": 0.01021677, "balance_loss_clip": 1.04636931, "balance_loss_mlp": 1.01454806, "epoch": 0.9590572957373895, "flos": 23441516588160.0, "grad_norm": 1.8087091949842893, "language_loss": 0.78573167, "learning_rate": 1.7510816091684588e-08, "loss": 0.8075977, "num_input_tokens_seen": 172174150, "step": 7976, "time_per_iteration": 2.677114248275757 }, { "auxiliary_loss_clip": 0.01162057, "auxiliary_loss_mlp": 0.01021908, "balance_loss_clip": 1.04793334, "balance_loss_mlp": 1.01412916, "epoch": 0.9591775386280286, "flos": 22528272274560.0, "grad_norm": 2.5402110362062174, "language_loss": 0.79238474, "learning_rate": 1.740811223680083e-08, "loss": 0.81422436, "num_input_tokens_seen": 172191005, "step": 7977, "time_per_iteration": 2.723527669906616 }, { "auxiliary_loss_clip": 0.01165244, "auxiliary_loss_mlp": 0.01022299, "balance_loss_clip": 1.04583716, "balance_loss_mlp": 1.01510727, "epoch": 0.9592977815186677, "flos": 18186959715840.0, "grad_norm": 2.7401368783283018, "language_loss": 0.74585408, "learning_rate": 1.7305709139530334e-08, "loss": 0.76772952, "num_input_tokens_seen": 172209785, "step": 7978, "time_per_iteration": 3.605839252471924 }, { "auxiliary_loss_clip": 0.01157005, "auxiliary_loss_mlp": 0.01027485, "balance_loss_clip": 1.04490948, "balance_loss_mlp": 1.01962948, "epoch": 0.9594180244093068, "flos": 16537797555840.0, "grad_norm": 2.2545866933222287, "language_loss": 0.74646628, "learning_rate": 1.7203606815407334e-08, "loss": 0.7683112, "num_input_tokens_seen": 172224380, "step": 7979, "time_per_iteration": 3.653376579284668 }, { "auxiliary_loss_clip": 0.01163189, "auxiliary_loss_mlp": 0.01029421, "balance_loss_clip": 1.04819667, "balance_loss_mlp": 1.02102304, "epoch": 0.9595382672999458, "flos": 20554334317440.0, "grad_norm": 1.7618916329166163, "language_loss": 0.79367739, "learning_rate": 1.7101805279920557e-08, "loss": 0.81560349, "num_input_tokens_seen": 172242540, "step": 7980, "time_per_iteration": 2.8748254776000977 }, { "auxiliary_loss_clip": 0.01168082, "auxiliary_loss_mlp": 0.01028279, "balance_loss_clip": 1.04934204, "balance_loss_mlp": 1.02075684, "epoch": 0.959658510190585, "flos": 22638266697600.0, "grad_norm": 2.000395200971557, "language_loss": 0.81208992, "learning_rate": 1.7000304548513643e-08, "loss": 0.83405358, "num_input_tokens_seen": 172262645, "step": 7981, "time_per_iteration": 3.6018409729003906 }, { "auxiliary_loss_clip": 0.01153137, "auxiliary_loss_mlp": 0.01026942, "balance_loss_clip": 1.04559827, "balance_loss_mlp": 1.01914859, "epoch": 0.9597787530812241, "flos": 19135252725120.0, "grad_norm": 2.1976521419301185, "language_loss": 0.82588506, "learning_rate": 1.6899104636583394e-08, "loss": 0.84768581, "num_input_tokens_seen": 172280695, "step": 7982, "time_per_iteration": 2.698843002319336 }, { "auxiliary_loss_clip": 0.01060803, "auxiliary_loss_mlp": 0.01002076, "balance_loss_clip": 1.00653732, "balance_loss_mlp": 1.00108647, "epoch": 0.9598989959718631, "flos": 60098124055680.0, "grad_norm": 0.7400656152882149, "language_loss": 0.61921263, "learning_rate": 1.6798205559482638e-08, "loss": 0.63984138, "num_input_tokens_seen": 172343075, "step": 7983, "time_per_iteration": 3.3947534561157227 }, { "auxiliary_loss_clip": 0.01154549, "auxiliary_loss_mlp": 0.01023063, "balance_loss_clip": 1.04484093, "balance_loss_mlp": 1.01558554, "epoch": 0.9600192388625023, "flos": 20886795624960.0, "grad_norm": 1.9995661511776506, "language_loss": 0.76663804, "learning_rate": 1.669760733251713e-08, "loss": 0.78841412, "num_input_tokens_seen": 172361950, "step": 7984, "time_per_iteration": 2.79289174079895 }, { "auxiliary_loss_clip": 0.01155711, "auxiliary_loss_mlp": 0.01022376, "balance_loss_clip": 1.04574752, "balance_loss_mlp": 1.01555085, "epoch": 0.9601394817531413, "flos": 20445740524800.0, "grad_norm": 1.636228183180572, "language_loss": 0.82356322, "learning_rate": 1.659730997094755e-08, "loss": 0.84534413, "num_input_tokens_seen": 172380440, "step": 7985, "time_per_iteration": 2.833993673324585 }, { "auxiliary_loss_clip": 0.01157386, "auxiliary_loss_mlp": 0.01024586, "balance_loss_clip": 1.04716992, "balance_loss_mlp": 1.01762986, "epoch": 0.9602597246437804, "flos": 21507152440320.0, "grad_norm": 1.8533569016304838, "language_loss": 0.62322021, "learning_rate": 1.6497313489989283e-08, "loss": 0.64503992, "num_input_tokens_seen": 172400265, "step": 7986, "time_per_iteration": 2.682685136795044 }, { "auxiliary_loss_clip": 0.01153462, "auxiliary_loss_mlp": 0.01024033, "balance_loss_clip": 1.04422307, "balance_loss_mlp": 1.01652014, "epoch": 0.9603799675344196, "flos": 29935099152000.0, "grad_norm": 2.488735932334457, "language_loss": 0.70205885, "learning_rate": 1.639761790481131e-08, "loss": 0.7238338, "num_input_tokens_seen": 172421145, "step": 7987, "time_per_iteration": 2.825338363647461 }, { "auxiliary_loss_clip": 0.01162392, "auxiliary_loss_mlp": 0.01027704, "balance_loss_clip": 1.0458343, "balance_loss_mlp": 1.02082515, "epoch": 0.9605002104250586, "flos": 28001525103360.0, "grad_norm": 2.0020621300624932, "language_loss": 0.79213649, "learning_rate": 1.6298223230537754e-08, "loss": 0.81403744, "num_input_tokens_seen": 172438945, "step": 7988, "time_per_iteration": 2.6778414249420166 }, { "auxiliary_loss_clip": 0.01155439, "auxiliary_loss_mlp": 0.01058674, "balance_loss_clip": 1.04444945, "balance_loss_mlp": 1.02310908, "epoch": 0.9606204533156977, "flos": 35590490870400.0, "grad_norm": 2.648950056143786, "language_loss": 0.69526809, "learning_rate": 1.619912948224611e-08, "loss": 0.71740925, "num_input_tokens_seen": 172460150, "step": 7989, "time_per_iteration": 3.76479434967041 }, { "auxiliary_loss_clip": 0.01154052, "auxiliary_loss_mlp": 0.01028948, "balance_loss_clip": 1.04863596, "balance_loss_mlp": 1.02148008, "epoch": 0.9607406962063368, "flos": 26574614346240.0, "grad_norm": 3.1170103461744647, "language_loss": 0.61548448, "learning_rate": 1.6100336674969682e-08, "loss": 0.63731444, "num_input_tokens_seen": 172478990, "step": 7990, "time_per_iteration": 2.8337833881378174 }, { "auxiliary_loss_clip": 0.01156801, "auxiliary_loss_mlp": 0.01023236, "balance_loss_clip": 1.04673779, "balance_loss_mlp": 1.01617622, "epoch": 0.9608609390969759, "flos": 25331781813120.0, "grad_norm": 1.7694015044711222, "language_loss": 0.76857102, "learning_rate": 1.600184482369449e-08, "loss": 0.79037142, "num_input_tokens_seen": 172498905, "step": 7991, "time_per_iteration": 2.6827168464660645 }, { "auxiliary_loss_clip": 0.01157043, "auxiliary_loss_mlp": 0.01022297, "balance_loss_clip": 1.0448581, "balance_loss_mlp": 1.01383042, "epoch": 0.960981181987615, "flos": 21069114082560.0, "grad_norm": 2.65934979635411, "language_loss": 0.89242637, "learning_rate": 1.5903653943362126e-08, "loss": 0.9142198, "num_input_tokens_seen": 172517900, "step": 7992, "time_per_iteration": 2.720669984817505 }, { "auxiliary_loss_clip": 0.0115999, "auxiliary_loss_mlp": 0.01022914, "balance_loss_clip": 1.04649496, "balance_loss_mlp": 1.01571679, "epoch": 0.9611014248782541, "flos": 17823256554240.0, "grad_norm": 1.8948409733510525, "language_loss": 0.76679784, "learning_rate": 1.580576404886802e-08, "loss": 0.78862691, "num_input_tokens_seen": 172536430, "step": 7993, "time_per_iteration": 2.6612539291381836 }, { "auxiliary_loss_clip": 0.01163768, "auxiliary_loss_mlp": 0.01023606, "balance_loss_clip": 1.04804659, "balance_loss_mlp": 1.01690388, "epoch": 0.9612216677688932, "flos": 19354631040000.0, "grad_norm": 1.9041270379459903, "language_loss": 0.80133516, "learning_rate": 1.570817515506162e-08, "loss": 0.82320893, "num_input_tokens_seen": 172555120, "step": 7994, "time_per_iteration": 2.665646553039551 }, { "auxiliary_loss_clip": 0.01165444, "auxiliary_loss_mlp": 0.01020687, "balance_loss_clip": 1.04776454, "balance_loss_mlp": 1.01330209, "epoch": 0.9613419106595322, "flos": 15808739207040.0, "grad_norm": 2.2920998210357855, "language_loss": 0.81430912, "learning_rate": 1.561088727674753e-08, "loss": 0.83617043, "num_input_tokens_seen": 172569330, "step": 7995, "time_per_iteration": 2.5404586791992188 }, { "auxiliary_loss_clip": 0.01166131, "auxiliary_loss_mlp": 0.01023565, "balance_loss_clip": 1.04783201, "balance_loss_mlp": 1.01556933, "epoch": 0.9614621535501714, "flos": 25702488126720.0, "grad_norm": 3.2895322674705354, "language_loss": 0.71489066, "learning_rate": 1.551390042868417e-08, "loss": 0.73678762, "num_input_tokens_seen": 172591100, "step": 7996, "time_per_iteration": 2.797666072845459 }, { "auxiliary_loss_clip": 0.0116131, "auxiliary_loss_mlp": 0.01022432, "balance_loss_clip": 1.04659986, "balance_loss_mlp": 1.01561904, "epoch": 0.9615823964408104, "flos": 17819054663040.0, "grad_norm": 1.770932374766879, "language_loss": 0.70616078, "learning_rate": 1.5417214625584207e-08, "loss": 0.7279982, "num_input_tokens_seen": 172608755, "step": 7997, "time_per_iteration": 2.6475746631622314 }, { "auxiliary_loss_clip": 0.01158574, "auxiliary_loss_mlp": 0.01025231, "balance_loss_clip": 1.04551053, "balance_loss_mlp": 1.01765573, "epoch": 0.9617026393314495, "flos": 20190020624640.0, "grad_norm": 1.694723193463485, "language_loss": 0.85250342, "learning_rate": 1.5320829882114806e-08, "loss": 0.87434149, "num_input_tokens_seen": 172626830, "step": 7998, "time_per_iteration": 2.6622846126556396 }, { "auxiliary_loss_clip": 0.01162977, "auxiliary_loss_mlp": 0.01026352, "balance_loss_clip": 1.0435344, "balance_loss_mlp": 1.01847196, "epoch": 0.9618228822220887, "flos": 20267013427200.0, "grad_norm": 1.828016594395434, "language_loss": 0.7898317, "learning_rate": 1.5224746212897378e-08, "loss": 0.81172496, "num_input_tokens_seen": 172646125, "step": 7999, "time_per_iteration": 2.6463100910186768 }, { "auxiliary_loss_clip": 0.01160581, "auxiliary_loss_mlp": 0.01023453, "balance_loss_clip": 1.04551673, "balance_loss_mlp": 1.01627946, "epoch": 0.9619431251127277, "flos": 21031300039680.0, "grad_norm": 1.8450047647977175, "language_loss": 0.77288175, "learning_rate": 1.512896363250804e-08, "loss": 0.79472208, "num_input_tokens_seen": 172666235, "step": 8000, "time_per_iteration": 2.6876659393310547 }, { "auxiliary_loss_clip": 0.01165366, "auxiliary_loss_mlp": 0.01022867, "balance_loss_clip": 1.04739726, "balance_loss_mlp": 1.01592362, "epoch": 0.9620633680033668, "flos": 22382654538240.0, "grad_norm": 5.078026736645184, "language_loss": 0.75514281, "learning_rate": 1.503348215547673e-08, "loss": 0.7770251, "num_input_tokens_seen": 172687325, "step": 8001, "time_per_iteration": 2.7576205730438232 }, { "auxiliary_loss_clip": 0.01154744, "auxiliary_loss_mlp": 0.01023629, "balance_loss_clip": 1.04452932, "balance_loss_mlp": 1.01641726, "epoch": 0.962183610894006, "flos": 18471730740480.0, "grad_norm": 1.9161708139385143, "language_loss": 0.80678415, "learning_rate": 1.4938301796288078e-08, "loss": 0.82856786, "num_input_tokens_seen": 172703895, "step": 8002, "time_per_iteration": 2.74813175201416 }, { "auxiliary_loss_clip": 0.01166235, "auxiliary_loss_mlp": 0.01023141, "balance_loss_clip": 1.04713297, "balance_loss_mlp": 1.01593459, "epoch": 0.962303853784645, "flos": 18435245500800.0, "grad_norm": 2.529057178589821, "language_loss": 0.81782204, "learning_rate": 1.4843422569380537e-08, "loss": 0.83971578, "num_input_tokens_seen": 172720650, "step": 8003, "time_per_iteration": 2.6669061183929443 }, { "auxiliary_loss_clip": 0.01151868, "auxiliary_loss_mlp": 0.01023101, "balance_loss_clip": 1.04461825, "balance_loss_mlp": 1.01595402, "epoch": 0.9624240966752841, "flos": 26391074826240.0, "grad_norm": 1.9824929157808848, "language_loss": 0.82875067, "learning_rate": 1.4748844489147483e-08, "loss": 0.85050035, "num_input_tokens_seen": 172737640, "step": 8004, "time_per_iteration": 3.630887746810913 }, { "auxiliary_loss_clip": 0.01156095, "auxiliary_loss_mlp": 0.01029969, "balance_loss_clip": 1.04235673, "balance_loss_mlp": 1.02301002, "epoch": 0.9625443395659231, "flos": 14647675985280.0, "grad_norm": 1.8001732567440458, "language_loss": 0.71065491, "learning_rate": 1.4654567569936326e-08, "loss": 0.73251551, "num_input_tokens_seen": 172755215, "step": 8005, "time_per_iteration": 3.5738627910614014 }, { "auxiliary_loss_clip": 0.01149992, "auxiliary_loss_mlp": 0.01027487, "balance_loss_clip": 1.04688954, "balance_loss_mlp": 1.02025402, "epoch": 0.9626645824565623, "flos": 18367626147840.0, "grad_norm": 3.065539587467584, "language_loss": 0.83244014, "learning_rate": 1.456059182604874e-08, "loss": 0.85421491, "num_input_tokens_seen": 172774020, "step": 8006, "time_per_iteration": 2.7411720752716064 }, { "auxiliary_loss_clip": 0.01166168, "auxiliary_loss_mlp": 0.0102289, "balance_loss_clip": 1.04740977, "balance_loss_mlp": 1.01556492, "epoch": 0.9627848253472013, "flos": 16580424021120.0, "grad_norm": 1.8656965123156213, "language_loss": 0.7621823, "learning_rate": 1.4466917271740653e-08, "loss": 0.78407288, "num_input_tokens_seen": 172792220, "step": 8007, "time_per_iteration": 3.5881221294403076 }, { "auxiliary_loss_clip": 0.01158931, "auxiliary_loss_mlp": 0.01022755, "balance_loss_clip": 1.04847908, "balance_loss_mlp": 1.01473498, "epoch": 0.9629050682378404, "flos": 20886867452160.0, "grad_norm": 2.450378860036566, "language_loss": 0.67633891, "learning_rate": 1.4373543921222697e-08, "loss": 0.69815576, "num_input_tokens_seen": 172811805, "step": 8008, "time_per_iteration": 2.7011075019836426 }, { "auxiliary_loss_clip": 0.01156551, "auxiliary_loss_mlp": 0.01027763, "balance_loss_clip": 1.04560769, "balance_loss_mlp": 1.02012789, "epoch": 0.9630253111284796, "flos": 17019252478080.0, "grad_norm": 1.9926051330547512, "language_loss": 0.77525222, "learning_rate": 1.428047178865932e-08, "loss": 0.7970953, "num_input_tokens_seen": 172828595, "step": 8009, "time_per_iteration": 2.662480115890503 }, { "auxiliary_loss_clip": 0.0115853, "auxiliary_loss_mlp": 0.01029971, "balance_loss_clip": 1.04535651, "balance_loss_mlp": 1.0225625, "epoch": 0.9631455540191186, "flos": 20338942412160.0, "grad_norm": 1.5719626851286232, "language_loss": 0.74574381, "learning_rate": 1.4187700888169451e-08, "loss": 0.76762879, "num_input_tokens_seen": 172847770, "step": 8010, "time_per_iteration": 2.727579116821289 }, { "auxiliary_loss_clip": 0.01057485, "auxiliary_loss_mlp": 0.01000142, "balance_loss_clip": 1.00703597, "balance_loss_mlp": 0.9991464, "epoch": 0.9632657969097577, "flos": 65956700033280.0, "grad_norm": 0.7543938285346926, "language_loss": 0.56965435, "learning_rate": 1.40952312338265e-08, "loss": 0.59023058, "num_input_tokens_seen": 172912415, "step": 8011, "time_per_iteration": 3.2541823387145996 }, { "auxiliary_loss_clip": 0.01159278, "auxiliary_loss_mlp": 0.0102221, "balance_loss_clip": 1.04625642, "balance_loss_mlp": 1.01490557, "epoch": 0.9633860398003968, "flos": 44419523823360.0, "grad_norm": 1.8669759339959908, "language_loss": 0.68344969, "learning_rate": 1.4003062839657909e-08, "loss": 0.70526457, "num_input_tokens_seen": 172934895, "step": 8012, "time_per_iteration": 2.888247013092041 }, { "auxiliary_loss_clip": 0.01156515, "auxiliary_loss_mlp": 0.01023598, "balance_loss_clip": 1.04402947, "balance_loss_mlp": 1.01653457, "epoch": 0.9635062826910359, "flos": 24827704300800.0, "grad_norm": 1.7521916795833625, "language_loss": 0.79941225, "learning_rate": 1.391119571964583e-08, "loss": 0.82121336, "num_input_tokens_seen": 172955835, "step": 8013, "time_per_iteration": 2.7943763732910156 }, { "auxiliary_loss_clip": 0.01158638, "auxiliary_loss_mlp": 0.01020573, "balance_loss_clip": 1.04491723, "balance_loss_mlp": 1.01368523, "epoch": 0.9636265255816749, "flos": 15961360095360.0, "grad_norm": 1.7961394316422516, "language_loss": 0.72963965, "learning_rate": 1.3819629887726225e-08, "loss": 0.75143176, "num_input_tokens_seen": 172973925, "step": 8014, "time_per_iteration": 2.798398733139038 }, { "auxiliary_loss_clip": 0.01163446, "auxiliary_loss_mlp": 0.01027713, "balance_loss_clip": 1.04701471, "balance_loss_mlp": 1.02050054, "epoch": 0.9637467684723141, "flos": 22601781457920.0, "grad_norm": 1.9306157330976712, "language_loss": 0.76274979, "learning_rate": 1.3728365357789317e-08, "loss": 0.78466141, "num_input_tokens_seen": 172993290, "step": 8015, "time_per_iteration": 3.6164631843566895 }, { "auxiliary_loss_clip": 0.01146012, "auxiliary_loss_mlp": 0.01023845, "balance_loss_clip": 1.04961705, "balance_loss_mlp": 1.01682997, "epoch": 0.9638670113629532, "flos": 17565812801280.0, "grad_norm": 2.5318553575710205, "language_loss": 0.7629869, "learning_rate": 1.3637402143680254e-08, "loss": 0.78468549, "num_input_tokens_seen": 173008190, "step": 8016, "time_per_iteration": 2.763638496398926 }, { "auxiliary_loss_clip": 0.01057146, "auxiliary_loss_mlp": 0.01000349, "balance_loss_clip": 1.00976479, "balance_loss_mlp": 0.99941963, "epoch": 0.9639872542535922, "flos": 55072139379840.0, "grad_norm": 0.7237786213526727, "language_loss": 0.55029905, "learning_rate": 1.3546740259197998e-08, "loss": 0.57087404, "num_input_tokens_seen": 173061000, "step": 8017, "time_per_iteration": 3.211787700653076 }, { "auxiliary_loss_clip": 0.01164437, "auxiliary_loss_mlp": 0.01025377, "balance_loss_clip": 1.04930627, "balance_loss_mlp": 1.01814079, "epoch": 0.9641074971442314, "flos": 24134484746880.0, "grad_norm": 2.321671058321846, "language_loss": 0.70116466, "learning_rate": 1.3456379718095989e-08, "loss": 0.72306287, "num_input_tokens_seen": 173081415, "step": 8018, "time_per_iteration": 2.7095916271209717 }, { "auxiliary_loss_clip": 0.01056919, "auxiliary_loss_mlp": 0.01001956, "balance_loss_clip": 1.00771976, "balance_loss_mlp": 1.00084138, "epoch": 0.9642277400348704, "flos": 66747416077440.0, "grad_norm": 0.8399812364944848, "language_loss": 0.61975384, "learning_rate": 1.3366320534081487e-08, "loss": 0.64034259, "num_input_tokens_seen": 173144095, "step": 8019, "time_per_iteration": 3.290876865386963 }, { "auxiliary_loss_clip": 0.01163393, "auxiliary_loss_mlp": 0.01026869, "balance_loss_clip": 1.0489397, "balance_loss_mlp": 1.01949596, "epoch": 0.9643479829255095, "flos": 30920272450560.0, "grad_norm": 1.9936518038727393, "language_loss": 0.75642496, "learning_rate": 1.3276562720816675e-08, "loss": 0.77832758, "num_input_tokens_seen": 173165605, "step": 8020, "time_per_iteration": 2.8206591606140137 }, { "auxiliary_loss_clip": 0.01165878, "auxiliary_loss_mlp": 0.0102893, "balance_loss_clip": 1.04643106, "balance_loss_mlp": 1.02172351, "epoch": 0.9644682258161487, "flos": 20048245643520.0, "grad_norm": 2.3212373500164465, "language_loss": 0.82471776, "learning_rate": 1.3187106291917549e-08, "loss": 0.84666586, "num_input_tokens_seen": 173182595, "step": 8021, "time_per_iteration": 2.715257406234741 }, { "auxiliary_loss_clip": 0.01156584, "auxiliary_loss_mlp": 0.01023438, "balance_loss_clip": 1.04476452, "balance_loss_mlp": 1.01622581, "epoch": 0.9645884687067877, "flos": 21178713456000.0, "grad_norm": 1.9940128520005675, "language_loss": 0.70936871, "learning_rate": 1.309795126095503e-08, "loss": 0.73116887, "num_input_tokens_seen": 173200895, "step": 8022, "time_per_iteration": 2.6662099361419678 }, { "auxiliary_loss_clip": 0.01142133, "auxiliary_loss_mlp": 0.01026912, "balance_loss_clip": 1.04387546, "balance_loss_mlp": 1.01985431, "epoch": 0.9647087115974268, "flos": 18945967029120.0, "grad_norm": 2.604779270505109, "language_loss": 0.80399781, "learning_rate": 1.3009097641453192e-08, "loss": 0.8256883, "num_input_tokens_seen": 173218745, "step": 8023, "time_per_iteration": 2.8247528076171875 }, { "auxiliary_loss_clip": 0.01154594, "auxiliary_loss_mlp": 0.01022069, "balance_loss_clip": 1.04391575, "balance_loss_mlp": 1.01459491, "epoch": 0.9648289544880659, "flos": 16545088016640.0, "grad_norm": 1.8737916870724627, "language_loss": 0.76061612, "learning_rate": 1.2920545446891474e-08, "loss": 0.78238279, "num_input_tokens_seen": 173235465, "step": 8024, "time_per_iteration": 2.7080271244049072 }, { "auxiliary_loss_clip": 0.01159777, "auxiliary_loss_mlp": 0.01025514, "balance_loss_clip": 1.0470134, "balance_loss_mlp": 1.01820385, "epoch": 0.964949197378705, "flos": 24057527857920.0, "grad_norm": 1.6844692824411416, "language_loss": 0.70785356, "learning_rate": 1.2832294690703127e-08, "loss": 0.72970647, "num_input_tokens_seen": 173254440, "step": 8025, "time_per_iteration": 2.803067684173584 }, { "auxiliary_loss_clip": 0.0116276, "auxiliary_loss_mlp": 0.0102314, "balance_loss_clip": 1.04775679, "balance_loss_mlp": 1.01579368, "epoch": 0.965069440269344, "flos": 23365565280000.0, "grad_norm": 1.8655048438028807, "language_loss": 0.77751762, "learning_rate": 1.2744345386275668e-08, "loss": 0.79937661, "num_input_tokens_seen": 173273980, "step": 8026, "time_per_iteration": 2.6682095527648926 }, { "auxiliary_loss_clip": 0.0116407, "auxiliary_loss_mlp": 0.01027617, "balance_loss_clip": 1.04835105, "balance_loss_mlp": 1.02024722, "epoch": 0.9651896831599832, "flos": 25374875155200.0, "grad_norm": 1.908405946279194, "language_loss": 0.7866056, "learning_rate": 1.265669754695109e-08, "loss": 0.80852252, "num_input_tokens_seen": 173293550, "step": 8027, "time_per_iteration": 2.8056678771972656 }, { "auxiliary_loss_clip": 0.01150162, "auxiliary_loss_mlp": 0.01029681, "balance_loss_clip": 1.04390526, "balance_loss_mlp": 1.02164912, "epoch": 0.9653099260506223, "flos": 22272875596800.0, "grad_norm": 2.083525838484907, "language_loss": 0.82218206, "learning_rate": 1.2569351186025201e-08, "loss": 0.84398049, "num_input_tokens_seen": 173312005, "step": 8028, "time_per_iteration": 2.842449188232422 }, { "auxiliary_loss_clip": 0.01144233, "auxiliary_loss_mlp": 0.01025233, "balance_loss_clip": 1.04444504, "balance_loss_mlp": 1.01837826, "epoch": 0.9654301689412613, "flos": 26760847386240.0, "grad_norm": 1.5082784755522014, "language_loss": 0.75533199, "learning_rate": 1.2482306316748737e-08, "loss": 0.77702665, "num_input_tokens_seen": 173332450, "step": 8029, "time_per_iteration": 2.8549997806549072 }, { "auxiliary_loss_clip": 0.01167103, "auxiliary_loss_mlp": 0.01025319, "balance_loss_clip": 1.04551435, "balance_loss_mlp": 1.01797557, "epoch": 0.9655504118319005, "flos": 17412689122560.0, "grad_norm": 1.986500105768517, "language_loss": 0.78412712, "learning_rate": 1.2395562952326021e-08, "loss": 0.80605131, "num_input_tokens_seen": 173349610, "step": 8030, "time_per_iteration": 3.645833730697632 }, { "auxiliary_loss_clip": 0.0116935, "auxiliary_loss_mlp": 0.0102857, "balance_loss_clip": 1.0496186, "balance_loss_mlp": 1.02063036, "epoch": 0.9656706547225395, "flos": 22126970551680.0, "grad_norm": 15.50809341803695, "language_loss": 0.81152213, "learning_rate": 1.2309121105916309e-08, "loss": 0.83350134, "num_input_tokens_seen": 173367900, "step": 8031, "time_per_iteration": 3.7220091819763184 }, { "auxiliary_loss_clip": 0.01164582, "auxiliary_loss_mlp": 0.01023728, "balance_loss_clip": 1.04684031, "balance_loss_mlp": 1.01603889, "epoch": 0.9657908976131786, "flos": 37049289926400.0, "grad_norm": 4.456214879024234, "language_loss": 0.69087315, "learning_rate": 1.222298079063222e-08, "loss": 0.71275628, "num_input_tokens_seen": 173389040, "step": 8032, "time_per_iteration": 2.7979657649993896 }, { "auxiliary_loss_clip": 0.01160514, "auxiliary_loss_mlp": 0.01020965, "balance_loss_clip": 1.04594922, "balance_loss_mlp": 1.01424193, "epoch": 0.9659111405038178, "flos": 24389809597440.0, "grad_norm": 2.125496027170368, "language_loss": 0.72478151, "learning_rate": 1.2137142019541524e-08, "loss": 0.74659634, "num_input_tokens_seen": 173407595, "step": 8033, "time_per_iteration": 3.610818862915039 }, { "auxiliary_loss_clip": 0.01166704, "auxiliary_loss_mlp": 0.01026334, "balance_loss_clip": 1.04728031, "balance_loss_mlp": 1.01872253, "epoch": 0.9660313833944568, "flos": 25009412227200.0, "grad_norm": 2.488447670403783, "language_loss": 0.73804653, "learning_rate": 1.2051604805666027e-08, "loss": 0.75997692, "num_input_tokens_seen": 173424720, "step": 8034, "time_per_iteration": 2.7946627140045166 }, { "auxiliary_loss_clip": 0.01165139, "auxiliary_loss_mlp": 0.01050331, "balance_loss_clip": 1.04702091, "balance_loss_mlp": 1.01431584, "epoch": 0.9661516262850959, "flos": 11801575895040.0, "grad_norm": 2.8187248729644487, "language_loss": 0.7827819, "learning_rate": 1.196636916198135e-08, "loss": 0.80493659, "num_input_tokens_seen": 173442260, "step": 8035, "time_per_iteration": 2.6975862979888916 }, { "auxiliary_loss_clip": 0.01167062, "auxiliary_loss_mlp": 0.01021551, "balance_loss_clip": 1.04644275, "balance_loss_mlp": 1.01427341, "epoch": 0.9662718691757349, "flos": 20047778766720.0, "grad_norm": 1.9409418136027488, "language_loss": 0.77154303, "learning_rate": 1.1881435101418036e-08, "loss": 0.79342914, "num_input_tokens_seen": 173461675, "step": 8036, "time_per_iteration": 2.6856791973114014 }, { "auxiliary_loss_clip": 0.01057888, "auxiliary_loss_mlp": 0.01001479, "balance_loss_clip": 1.00744379, "balance_loss_mlp": 1.00037611, "epoch": 0.9663921120663741, "flos": 68027703517440.0, "grad_norm": 0.7254137298401145, "language_loss": 0.65528882, "learning_rate": 1.1796802636860003e-08, "loss": 0.67588246, "num_input_tokens_seen": 173530205, "step": 8037, "time_per_iteration": 3.3029298782348633 }, { "auxiliary_loss_clip": 0.01168647, "auxiliary_loss_mlp": 0.01025327, "balance_loss_clip": 1.04722953, "balance_loss_mlp": 1.01804364, "epoch": 0.9665123549570132, "flos": 26322916769280.0, "grad_norm": 4.283991215710855, "language_loss": 0.73965818, "learning_rate": 1.1712471781146316e-08, "loss": 0.76159793, "num_input_tokens_seen": 173549540, "step": 8038, "time_per_iteration": 2.77470064163208 }, { "auxiliary_loss_clip": 0.01162846, "auxiliary_loss_mlp": 0.01024863, "balance_loss_clip": 1.04459751, "balance_loss_mlp": 1.01731968, "epoch": 0.9666325978476522, "flos": 43941121557120.0, "grad_norm": 1.8561723240958226, "language_loss": 0.67188287, "learning_rate": 1.1628442547069628e-08, "loss": 0.69375992, "num_input_tokens_seen": 173571740, "step": 8039, "time_per_iteration": 2.822913408279419 }, { "auxiliary_loss_clip": 0.01164468, "auxiliary_loss_mlp": 0.01052035, "balance_loss_clip": 1.04561675, "balance_loss_mlp": 1.01613057, "epoch": 0.9667528407382914, "flos": 21543422198400.0, "grad_norm": 1.956268747795522, "language_loss": 0.77422416, "learning_rate": 1.1544714947377521e-08, "loss": 0.79638922, "num_input_tokens_seen": 173589425, "step": 8040, "time_per_iteration": 2.7323391437530518 }, { "auxiliary_loss_clip": 0.01167453, "auxiliary_loss_mlp": 0.01024875, "balance_loss_clip": 1.04788649, "balance_loss_mlp": 1.01785684, "epoch": 0.9668730836289304, "flos": 23878585278720.0, "grad_norm": 5.8163044741804875, "language_loss": 0.69845402, "learning_rate": 1.1461288994770945e-08, "loss": 0.72037733, "num_input_tokens_seen": 173608500, "step": 8041, "time_per_iteration": 2.643354892730713 }, { "auxiliary_loss_clip": 0.01169099, "auxiliary_loss_mlp": 0.0102677, "balance_loss_clip": 1.0466826, "balance_loss_mlp": 1.01921785, "epoch": 0.9669933265195695, "flos": 28293011971200.0, "grad_norm": 2.025692071099013, "language_loss": 0.77332532, "learning_rate": 1.1378164701906002e-08, "loss": 0.79528397, "num_input_tokens_seen": 173630265, "step": 8042, "time_per_iteration": 3.517012119293213 }, { "auxiliary_loss_clip": 0.01167118, "auxiliary_loss_mlp": 0.0103208, "balance_loss_clip": 1.04716909, "balance_loss_mlp": 1.02395546, "epoch": 0.9671135694102087, "flos": 22454763091200.0, "grad_norm": 1.6413188615869312, "language_loss": 0.66832745, "learning_rate": 1.1295342081392156e-08, "loss": 0.69031942, "num_input_tokens_seen": 173649625, "step": 8043, "time_per_iteration": 2.6994218826293945 }, { "auxiliary_loss_clip": 0.01157926, "auxiliary_loss_mlp": 0.01019883, "balance_loss_clip": 1.04383361, "balance_loss_mlp": 1.01319242, "epoch": 0.9672338123008477, "flos": 20155941596160.0, "grad_norm": 1.6208163079407762, "language_loss": 0.69374013, "learning_rate": 1.1212821145793804e-08, "loss": 0.71551818, "num_input_tokens_seen": 173669240, "step": 8044, "time_per_iteration": 2.6554718017578125 }, { "auxiliary_loss_clip": 0.01159282, "auxiliary_loss_mlp": 0.01032712, "balance_loss_clip": 1.04549694, "balance_loss_mlp": 1.02479923, "epoch": 0.9673540551914868, "flos": 16977487939200.0, "grad_norm": 1.8898842899455732, "language_loss": 0.78829449, "learning_rate": 1.1130601907629156e-08, "loss": 0.8102144, "num_input_tokens_seen": 173686970, "step": 8045, "time_per_iteration": 2.710641622543335 }, { "auxiliary_loss_clip": 0.01059914, "auxiliary_loss_mlp": 0.01005042, "balance_loss_clip": 1.00647044, "balance_loss_mlp": 1.00396276, "epoch": 0.9674742980821259, "flos": 61892903952000.0, "grad_norm": 0.823399807217257, "language_loss": 0.64756691, "learning_rate": 1.1048684379370899e-08, "loss": 0.66821647, "num_input_tokens_seen": 173747655, "step": 8046, "time_per_iteration": 3.2094075679779053 }, { "auxiliary_loss_clip": 0.01149697, "auxiliary_loss_mlp": 0.01025813, "balance_loss_clip": 1.04601717, "balance_loss_mlp": 1.01855326, "epoch": 0.967594540972765, "flos": 18697824898560.0, "grad_norm": 3.858185739460423, "language_loss": 0.74748266, "learning_rate": 1.0967068573445759e-08, "loss": 0.76923782, "num_input_tokens_seen": 173765140, "step": 8047, "time_per_iteration": 2.7214810848236084 }, { "auxiliary_loss_clip": 0.01157527, "auxiliary_loss_mlp": 0.01025051, "balance_loss_clip": 1.04581678, "balance_loss_mlp": 1.01701629, "epoch": 0.967714783863404, "flos": 20777411733120.0, "grad_norm": 2.641664572503341, "language_loss": 0.6559177, "learning_rate": 1.0885754502234945e-08, "loss": 0.67774343, "num_input_tokens_seen": 173784800, "step": 8048, "time_per_iteration": 2.6751058101654053 }, { "auxiliary_loss_clip": 0.01153025, "auxiliary_loss_mlp": 0.0102554, "balance_loss_clip": 1.04498494, "balance_loss_mlp": 1.01865602, "epoch": 0.9678350267540432, "flos": 23185473465600.0, "grad_norm": 5.99849966209853, "language_loss": 0.77834332, "learning_rate": 1.08047421780737e-08, "loss": 0.80012894, "num_input_tokens_seen": 173803990, "step": 8049, "time_per_iteration": 2.71681809425354 }, { "auxiliary_loss_clip": 0.01165956, "auxiliary_loss_mlp": 0.01049662, "balance_loss_clip": 1.04632401, "balance_loss_mlp": 1.01484382, "epoch": 0.9679552696446823, "flos": 21726063878400.0, "grad_norm": 14.143060403880373, "language_loss": 0.73879129, "learning_rate": 1.0724031613251305e-08, "loss": 0.76094747, "num_input_tokens_seen": 173821890, "step": 8050, "time_per_iteration": 2.7875077724456787 }, { "auxiliary_loss_clip": 0.01168155, "auxiliary_loss_mlp": 0.01028977, "balance_loss_clip": 1.04753315, "balance_loss_mlp": 1.02085853, "epoch": 0.9680755125353213, "flos": 26869046129280.0, "grad_norm": 2.058139746540911, "language_loss": 0.66688466, "learning_rate": 1.0643622820011744e-08, "loss": 0.68885601, "num_input_tokens_seen": 173842945, "step": 8051, "time_per_iteration": 2.781801462173462 }, { "auxiliary_loss_clip": 0.01167362, "auxiliary_loss_mlp": 0.01025486, "balance_loss_clip": 1.04606009, "balance_loss_mlp": 1.01760364, "epoch": 0.9681957554259605, "flos": 28325008010880.0, "grad_norm": 2.2311576708704917, "language_loss": 0.68158972, "learning_rate": 1.0563515810552814e-08, "loss": 0.70351821, "num_input_tokens_seen": 173859915, "step": 8052, "time_per_iteration": 2.7353744506835938 }, { "auxiliary_loss_clip": 0.01167906, "auxiliary_loss_mlp": 0.0102397, "balance_loss_clip": 1.04972672, "balance_loss_mlp": 1.0163641, "epoch": 0.9683159983165995, "flos": 20557674282240.0, "grad_norm": 1.5693837173962606, "language_loss": 0.73132545, "learning_rate": 1.0483710597026795e-08, "loss": 0.75324416, "num_input_tokens_seen": 173879775, "step": 8053, "time_per_iteration": 2.620743989944458 }, { "auxiliary_loss_clip": 0.0115531, "auxiliary_loss_mlp": 0.0102422, "balance_loss_clip": 1.04611182, "balance_loss_mlp": 1.01780653, "epoch": 0.9684362412072386, "flos": 24207958016640.0, "grad_norm": 1.9681545444957185, "language_loss": 0.74214172, "learning_rate": 1.0404207191540227e-08, "loss": 0.76393706, "num_input_tokens_seen": 173900230, "step": 8054, "time_per_iteration": 2.808258056640625 }, { "auxiliary_loss_clip": 0.01162666, "auxiliary_loss_mlp": 0.01027458, "balance_loss_clip": 1.04501557, "balance_loss_mlp": 1.02028739, "epoch": 0.9685564840978778, "flos": 22346241125760.0, "grad_norm": 2.2019204396306944, "language_loss": 0.7469517, "learning_rate": 1.0325005606153236e-08, "loss": 0.76885295, "num_input_tokens_seen": 173919690, "step": 8055, "time_per_iteration": 2.7264294624328613 }, { "auxiliary_loss_clip": 0.01155873, "auxiliary_loss_mlp": 0.01024205, "balance_loss_clip": 1.0459348, "balance_loss_mlp": 1.01716614, "epoch": 0.9686767269885168, "flos": 14386389477120.0, "grad_norm": 2.6043498199825446, "language_loss": 0.79389107, "learning_rate": 1.0246105852881104e-08, "loss": 0.81569189, "num_input_tokens_seen": 173934790, "step": 8056, "time_per_iteration": 3.767415761947632 }, { "auxiliary_loss_clip": 0.01166739, "auxiliary_loss_mlp": 0.01033114, "balance_loss_clip": 1.0465169, "balance_loss_mlp": 1.02552879, "epoch": 0.9687969698791559, "flos": 21287630471040.0, "grad_norm": 1.9039669140243216, "language_loss": 0.7909106, "learning_rate": 1.0167507943692476e-08, "loss": 0.81290913, "num_input_tokens_seen": 173953875, "step": 8057, "time_per_iteration": 3.737483263015747 }, { "auxiliary_loss_clip": 0.01162352, "auxiliary_loss_mlp": 0.01027328, "balance_loss_clip": 1.04844451, "balance_loss_mlp": 1.02013099, "epoch": 0.968917212769795, "flos": 19828328624640.0, "grad_norm": 2.404354239604153, "language_loss": 0.71346486, "learning_rate": 1.008921189051093e-08, "loss": 0.7353617, "num_input_tokens_seen": 173971220, "step": 8058, "time_per_iteration": 2.7381398677825928 }, { "auxiliary_loss_clip": 0.01166109, "auxiliary_loss_mlp": 0.01025517, "balance_loss_clip": 1.04748487, "balance_loss_mlp": 1.01780093, "epoch": 0.9690374556604341, "flos": 21681749473920.0, "grad_norm": 3.465107857108473, "language_loss": 0.77625573, "learning_rate": 1.0011217705213848e-08, "loss": 0.798172, "num_input_tokens_seen": 173989095, "step": 8059, "time_per_iteration": 3.7566027641296387 }, { "auxiliary_loss_clip": 0.01159942, "auxiliary_loss_mlp": 0.0102329, "balance_loss_clip": 1.04611361, "balance_loss_mlp": 1.01659012, "epoch": 0.9691576985510731, "flos": 32635437851520.0, "grad_norm": 1.7985669566327305, "language_loss": 0.74568307, "learning_rate": 9.933525399632658e-09, "loss": 0.76751536, "num_input_tokens_seen": 174007330, "step": 8060, "time_per_iteration": 2.889742851257324 }, { "auxiliary_loss_clip": 0.01152439, "auxiliary_loss_mlp": 0.01025067, "balance_loss_clip": 1.04324675, "balance_loss_mlp": 1.01692224, "epoch": 0.9692779414417123, "flos": 35663174040960.0, "grad_norm": 1.8881489852322098, "language_loss": 0.65034437, "learning_rate": 9.856134985553488e-09, "loss": 0.6721195, "num_input_tokens_seen": 174027055, "step": 8061, "time_per_iteration": 2.74923038482666 }, { "auxiliary_loss_clip": 0.01164416, "auxiliary_loss_mlp": 0.010264, "balance_loss_clip": 1.04635406, "balance_loss_mlp": 1.01887798, "epoch": 0.9693981843323514, "flos": 28366952117760.0, "grad_norm": 1.5777529368567118, "language_loss": 0.73767805, "learning_rate": 9.77904647471628e-09, "loss": 0.75958622, "num_input_tokens_seen": 174050235, "step": 8062, "time_per_iteration": 2.7559406757354736 }, { "auxiliary_loss_clip": 0.01146816, "auxiliary_loss_mlp": 0.0102397, "balance_loss_clip": 1.0469625, "balance_loss_mlp": 1.01621556, "epoch": 0.9695184272229904, "flos": 23622865378560.0, "grad_norm": 1.5910689934436262, "language_loss": 0.73984057, "learning_rate": 9.702259878815454e-09, "loss": 0.76154846, "num_input_tokens_seen": 174070560, "step": 8063, "time_per_iteration": 2.7252631187438965 }, { "auxiliary_loss_clip": 0.01166291, "auxiliary_loss_mlp": 0.01026243, "balance_loss_clip": 1.04871857, "balance_loss_mlp": 1.01845849, "epoch": 0.9696386701136296, "flos": 23294677789440.0, "grad_norm": 2.054954877313353, "language_loss": 0.74588549, "learning_rate": 9.625775209499254e-09, "loss": 0.76781088, "num_input_tokens_seen": 174090565, "step": 8064, "time_per_iteration": 2.823744058609009 }, { "auxiliary_loss_clip": 0.01153028, "auxiliary_loss_mlp": 0.01025675, "balance_loss_clip": 1.04786336, "balance_loss_mlp": 1.01858473, "epoch": 0.9697589130042686, "flos": 15121876360320.0, "grad_norm": 1.952127581867762, "language_loss": 0.74069726, "learning_rate": 9.549592478370172e-09, "loss": 0.76248431, "num_input_tokens_seen": 174108745, "step": 8065, "time_per_iteration": 2.724180221557617 }, { "auxiliary_loss_clip": 0.0116111, "auxiliary_loss_mlp": 0.01024327, "balance_loss_clip": 1.04324198, "balance_loss_mlp": 1.01706719, "epoch": 0.9698791558949077, "flos": 18879532824960.0, "grad_norm": 1.574452848759495, "language_loss": 0.79441619, "learning_rate": 9.473711696985632e-09, "loss": 0.81627047, "num_input_tokens_seen": 174128075, "step": 8066, "time_per_iteration": 2.678982734680176 }, { "auxiliary_loss_clip": 0.01157966, "auxiliary_loss_mlp": 0.01022022, "balance_loss_clip": 1.04542553, "balance_loss_mlp": 1.01464915, "epoch": 0.9699993987855468, "flos": 17931455297280.0, "grad_norm": 2.6663945945190926, "language_loss": 0.76229393, "learning_rate": 9.398132876856201e-09, "loss": 0.7840938, "num_input_tokens_seen": 174147040, "step": 8067, "time_per_iteration": 3.6645405292510986 }, { "auxiliary_loss_clip": 0.01058552, "auxiliary_loss_mlp": 0.01003506, "balance_loss_clip": 1.00903034, "balance_loss_mlp": 1.00248039, "epoch": 0.9701196416761859, "flos": 67182186297600.0, "grad_norm": 0.7955466600335405, "language_loss": 0.60768145, "learning_rate": 9.322856029447379e-09, "loss": 0.62830198, "num_input_tokens_seen": 174208225, "step": 8068, "time_per_iteration": 3.3713996410369873 }, { "auxiliary_loss_clip": 0.01162866, "auxiliary_loss_mlp": 0.01028363, "balance_loss_clip": 1.04661202, "balance_loss_mlp": 1.02106786, "epoch": 0.970239884566825, "flos": 24277804012800.0, "grad_norm": 1.9983457118955736, "language_loss": 0.80157113, "learning_rate": 9.247881166178695e-09, "loss": 0.82348341, "num_input_tokens_seen": 174226935, "step": 8069, "time_per_iteration": 2.7728428840637207 }, { "auxiliary_loss_clip": 0.01164881, "auxiliary_loss_mlp": 0.01023836, "balance_loss_clip": 1.04876602, "balance_loss_mlp": 1.01702964, "epoch": 0.970360127457464, "flos": 25301689194240.0, "grad_norm": 2.3878338374199792, "language_loss": 0.76588994, "learning_rate": 9.173208298423274e-09, "loss": 0.78777707, "num_input_tokens_seen": 174248140, "step": 8070, "time_per_iteration": 2.789632797241211 }, { "auxiliary_loss_clip": 0.01151096, "auxiliary_loss_mlp": 0.01053603, "balance_loss_clip": 1.04696274, "balance_loss_mlp": 1.01821589, "epoch": 0.9704803703481032, "flos": 29572473398400.0, "grad_norm": 1.6200076303843824, "language_loss": 0.76210433, "learning_rate": 9.09883743750961e-09, "loss": 0.78415132, "num_input_tokens_seen": 174271030, "step": 8071, "time_per_iteration": 2.7945830821990967 }, { "auxiliary_loss_clip": 0.0115653, "auxiliary_loss_mlp": 0.01021361, "balance_loss_clip": 1.04451239, "balance_loss_mlp": 1.0140419, "epoch": 0.9706006132387422, "flos": 17380046638080.0, "grad_norm": 1.6866785508318105, "language_loss": 0.83752429, "learning_rate": 9.024768594719124e-09, "loss": 0.85930324, "num_input_tokens_seen": 174289410, "step": 8072, "time_per_iteration": 2.755523204803467 }, { "auxiliary_loss_clip": 0.01154254, "auxiliary_loss_mlp": 0.01025867, "balance_loss_clip": 1.04520881, "balance_loss_mlp": 1.01810098, "epoch": 0.9707208561293813, "flos": 18186421011840.0, "grad_norm": 4.443749059377574, "language_loss": 0.72825694, "learning_rate": 8.95100178128816e-09, "loss": 0.75005811, "num_input_tokens_seen": 174308550, "step": 8073, "time_per_iteration": 2.745206594467163 }, { "auxiliary_loss_clip": 0.01163043, "auxiliary_loss_mlp": 0.01021946, "balance_loss_clip": 1.04879797, "balance_loss_mlp": 1.01398897, "epoch": 0.9708410990200205, "flos": 31248388212480.0, "grad_norm": 1.874630437538708, "language_loss": 0.6960749, "learning_rate": 8.877537008407321e-09, "loss": 0.71792483, "num_input_tokens_seen": 174328600, "step": 8074, "time_per_iteration": 2.825430154800415 }, { "auxiliary_loss_clip": 0.0116134, "auxiliary_loss_mlp": 0.01030917, "balance_loss_clip": 1.04559541, "balance_loss_mlp": 1.02359462, "epoch": 0.9709613419106595, "flos": 30554450386560.0, "grad_norm": 1.5110896053475085, "language_loss": 0.68843842, "learning_rate": 8.804374287221028e-09, "loss": 0.710361, "num_input_tokens_seen": 174349835, "step": 8075, "time_per_iteration": 2.714674234390259 }, { "auxiliary_loss_clip": 0.01150613, "auxiliary_loss_mlp": 0.01027417, "balance_loss_clip": 1.04452467, "balance_loss_mlp": 1.02070272, "epoch": 0.9710815848012986, "flos": 23730166281600.0, "grad_norm": 1.6470512359618361, "language_loss": 0.84456813, "learning_rate": 8.731513628827958e-09, "loss": 0.86634839, "num_input_tokens_seen": 174369200, "step": 8076, "time_per_iteration": 2.760542631149292 }, { "auxiliary_loss_clip": 0.01163033, "auxiliary_loss_mlp": 0.01025739, "balance_loss_clip": 1.04681551, "balance_loss_mlp": 1.01900959, "epoch": 0.9712018276919377, "flos": 23761875012480.0, "grad_norm": 2.1813217139269865, "language_loss": 0.82638937, "learning_rate": 8.658955044280825e-09, "loss": 0.84827709, "num_input_tokens_seen": 174388125, "step": 8077, "time_per_iteration": 2.624460458755493 }, { "auxiliary_loss_clip": 0.01157497, "auxiliary_loss_mlp": 0.01023101, "balance_loss_clip": 1.04549241, "balance_loss_mlp": 1.01594245, "epoch": 0.9713220705825768, "flos": 23330983461120.0, "grad_norm": 2.111463466057199, "language_loss": 0.77190977, "learning_rate": 8.586698544587268e-09, "loss": 0.79371572, "num_input_tokens_seen": 174409735, "step": 8078, "time_per_iteration": 2.7249252796173096 }, { "auxiliary_loss_clip": 0.01155244, "auxiliary_loss_mlp": 0.01031144, "balance_loss_clip": 1.04926264, "balance_loss_mlp": 1.02383041, "epoch": 0.9714423134732159, "flos": 22200946611840.0, "grad_norm": 3.3460666737126896, "language_loss": 0.73804724, "learning_rate": 8.514744140707853e-09, "loss": 0.75991118, "num_input_tokens_seen": 174428875, "step": 8079, "time_per_iteration": 2.687389612197876 }, { "auxiliary_loss_clip": 0.01161404, "auxiliary_loss_mlp": 0.01028322, "balance_loss_clip": 1.04460132, "balance_loss_mlp": 1.02122879, "epoch": 0.971562556363855, "flos": 20229917656320.0, "grad_norm": 1.8640462583652924, "language_loss": 0.766608, "learning_rate": 8.443091843558515e-09, "loss": 0.7885052, "num_input_tokens_seen": 174447960, "step": 8080, "time_per_iteration": 2.674616575241089 }, { "auxiliary_loss_clip": 0.01154628, "auxiliary_loss_mlp": 0.0102302, "balance_loss_clip": 1.0473845, "balance_loss_mlp": 1.01524711, "epoch": 0.9716827992544941, "flos": 24970197553920.0, "grad_norm": 3.7617920646530747, "language_loss": 0.64872527, "learning_rate": 8.37174166400878e-09, "loss": 0.67050177, "num_input_tokens_seen": 174463535, "step": 8081, "time_per_iteration": 2.6748011112213135 }, { "auxiliary_loss_clip": 0.01166157, "auxiliary_loss_mlp": 0.01023162, "balance_loss_clip": 1.04892182, "balance_loss_mlp": 1.01635242, "epoch": 0.9718030421451331, "flos": 24681476033280.0, "grad_norm": 2.4004143926571224, "language_loss": 0.85109681, "learning_rate": 8.300693612881992e-09, "loss": 0.87299001, "num_input_tokens_seen": 174483600, "step": 8082, "time_per_iteration": 3.557265043258667 }, { "auxiliary_loss_clip": 0.01161438, "auxiliary_loss_mlp": 0.010493, "balance_loss_clip": 1.04708076, "balance_loss_mlp": 1.01395321, "epoch": 0.9719232850357723, "flos": 22090700793600.0, "grad_norm": 2.0196618490781693, "language_loss": 0.81226575, "learning_rate": 8.22994770095664e-09, "loss": 0.83437312, "num_input_tokens_seen": 174502175, "step": 8083, "time_per_iteration": 3.6486990451812744 }, { "auxiliary_loss_clip": 0.01159917, "auxiliary_loss_mlp": 0.01028626, "balance_loss_clip": 1.04997158, "balance_loss_mlp": 1.02070165, "epoch": 0.9720435279264114, "flos": 23656908493440.0, "grad_norm": 3.7214428997935167, "language_loss": 0.75507718, "learning_rate": 8.159503938964585e-09, "loss": 0.77696264, "num_input_tokens_seen": 174519495, "step": 8084, "time_per_iteration": 2.719672679901123 }, { "auxiliary_loss_clip": 0.01148422, "auxiliary_loss_mlp": 0.01027656, "balance_loss_clip": 1.04706812, "balance_loss_mlp": 1.02028608, "epoch": 0.9721637708170504, "flos": 28365910623360.0, "grad_norm": 1.8801951657409197, "language_loss": 0.70644855, "learning_rate": 8.089362337592164e-09, "loss": 0.72820926, "num_input_tokens_seen": 174543120, "step": 8085, "time_per_iteration": 3.6279284954071045 }, { "auxiliary_loss_clip": 0.01156319, "auxiliary_loss_mlp": 0.01025763, "balance_loss_clip": 1.04538667, "balance_loss_mlp": 1.0184052, "epoch": 0.9722840137076896, "flos": 29130807767040.0, "grad_norm": 1.6686943220350945, "language_loss": 0.72273099, "learning_rate": 8.019522907479536e-09, "loss": 0.74455184, "num_input_tokens_seen": 174563480, "step": 8086, "time_per_iteration": 2.763394832611084 }, { "auxiliary_loss_clip": 0.01167112, "auxiliary_loss_mlp": 0.01026765, "balance_loss_clip": 1.04808199, "balance_loss_mlp": 1.01982963, "epoch": 0.9724042565983286, "flos": 19243954258560.0, "grad_norm": 2.0818885795152515, "language_loss": 0.7737723, "learning_rate": 7.949985659221558e-09, "loss": 0.79571104, "num_input_tokens_seen": 174580745, "step": 8087, "time_per_iteration": 2.626563310623169 }, { "auxiliary_loss_clip": 0.01161498, "auxiliary_loss_mlp": 0.01024515, "balance_loss_clip": 1.04631782, "balance_loss_mlp": 1.01704693, "epoch": 0.9725244994889677, "flos": 23039676161280.0, "grad_norm": 2.0903858705756098, "language_loss": 0.78917974, "learning_rate": 7.880750603366904e-09, "loss": 0.81103981, "num_input_tokens_seen": 174599615, "step": 8088, "time_per_iteration": 2.7217488288879395 }, { "auxiliary_loss_clip": 0.01167564, "auxiliary_loss_mlp": 0.01028298, "balance_loss_clip": 1.04746532, "balance_loss_mlp": 1.01999807, "epoch": 0.9726447423796069, "flos": 23367468700800.0, "grad_norm": 1.999559128023777, "language_loss": 0.79739946, "learning_rate": 7.811817750418282e-09, "loss": 0.81935811, "num_input_tokens_seen": 174618375, "step": 8089, "time_per_iteration": 2.730553388595581 }, { "auxiliary_loss_clip": 0.01153936, "auxiliary_loss_mlp": 0.01024348, "balance_loss_clip": 1.04714382, "balance_loss_mlp": 1.01648653, "epoch": 0.9727649852702459, "flos": 26541648639360.0, "grad_norm": 1.73948201011432, "language_loss": 0.80108345, "learning_rate": 7.743187110833105e-09, "loss": 0.82286632, "num_input_tokens_seen": 174641135, "step": 8090, "time_per_iteration": 2.8540778160095215 }, { "auxiliary_loss_clip": 0.01158922, "auxiliary_loss_mlp": 0.01022025, "balance_loss_clip": 1.04288089, "balance_loss_mlp": 1.01529014, "epoch": 0.972885228160885, "flos": 20522338277760.0, "grad_norm": 1.5659925995655977, "language_loss": 0.80633473, "learning_rate": 7.674858695022602e-09, "loss": 0.82814419, "num_input_tokens_seen": 174659490, "step": 8091, "time_per_iteration": 2.6858103275299072 }, { "auxiliary_loss_clip": 0.01169981, "auxiliary_loss_mlp": 0.0102707, "balance_loss_clip": 1.04974818, "balance_loss_mlp": 1.01976597, "epoch": 0.9730054710515241, "flos": 17566064196480.0, "grad_norm": 2.5209525415747294, "language_loss": 0.75971901, "learning_rate": 7.606832513351591e-09, "loss": 0.78168952, "num_input_tokens_seen": 174677440, "step": 8092, "time_per_iteration": 2.6957080364227295 }, { "auxiliary_loss_clip": 0.01058923, "auxiliary_loss_mlp": 0.01034739, "balance_loss_clip": 1.0066452, "balance_loss_mlp": 1.00122261, "epoch": 0.9731257139421632, "flos": 68972010117120.0, "grad_norm": 0.831340329416895, "language_loss": 0.63946527, "learning_rate": 7.539108576140264e-09, "loss": 0.66040188, "num_input_tokens_seen": 174741550, "step": 8093, "time_per_iteration": 4.225275993347168 }, { "auxiliary_loss_clip": 0.01151365, "auxiliary_loss_mlp": 0.01023491, "balance_loss_clip": 1.04504347, "balance_loss_mlp": 1.01631498, "epoch": 0.9732459568328022, "flos": 18478841633280.0, "grad_norm": 2.1633273206019323, "language_loss": 0.7032665, "learning_rate": 7.471686893661732e-09, "loss": 0.72501504, "num_input_tokens_seen": 174759845, "step": 8094, "time_per_iteration": 2.7420685291290283 }, { "auxiliary_loss_clip": 0.0115737, "auxiliary_loss_mlp": 0.01027972, "balance_loss_clip": 1.04819226, "balance_loss_mlp": 1.02097738, "epoch": 0.9733661997234414, "flos": 20883886623360.0, "grad_norm": 2.3241247878474076, "language_loss": 0.64248765, "learning_rate": 7.4045674761442636e-09, "loss": 0.66434109, "num_input_tokens_seen": 174777175, "step": 8095, "time_per_iteration": 2.7454476356506348 }, { "auxiliary_loss_clip": 0.01163248, "auxiliary_loss_mlp": 0.01050995, "balance_loss_clip": 1.04585981, "balance_loss_mlp": 1.01586699, "epoch": 0.9734864426140805, "flos": 23766795175680.0, "grad_norm": 2.0851419573177408, "language_loss": 0.74501181, "learning_rate": 7.337750333769488e-09, "loss": 0.76715422, "num_input_tokens_seen": 174796980, "step": 8096, "time_per_iteration": 2.6346991062164307 }, { "auxiliary_loss_clip": 0.01166626, "auxiliary_loss_mlp": 0.01028127, "balance_loss_clip": 1.04626501, "balance_loss_mlp": 1.02050924, "epoch": 0.9736066855047195, "flos": 35042422176000.0, "grad_norm": 1.7195799525166293, "language_loss": 0.72842455, "learning_rate": 7.2712354766737425e-09, "loss": 0.75037205, "num_input_tokens_seen": 174817310, "step": 8097, "time_per_iteration": 2.8084402084350586 }, { "auxiliary_loss_clip": 0.01150215, "auxiliary_loss_mlp": 0.01031843, "balance_loss_clip": 1.04898214, "balance_loss_mlp": 1.02439845, "epoch": 0.9737269283953586, "flos": 20410620001920.0, "grad_norm": 1.5155024041651932, "language_loss": 0.81010002, "learning_rate": 7.2050229149469565e-09, "loss": 0.83192062, "num_input_tokens_seen": 174837320, "step": 8098, "time_per_iteration": 2.7598845958709717 }, { "auxiliary_loss_clip": 0.01160248, "auxiliary_loss_mlp": 0.01026515, "balance_loss_clip": 1.04548931, "balance_loss_mlp": 1.01917171, "epoch": 0.9738471712859977, "flos": 28911680847360.0, "grad_norm": 1.8633755105571344, "language_loss": 0.63529778, "learning_rate": 7.139112658633984e-09, "loss": 0.65716535, "num_input_tokens_seen": 174857470, "step": 8099, "time_per_iteration": 2.7660202980041504 }, { "auxiliary_loss_clip": 0.0115395, "auxiliary_loss_mlp": 0.01023762, "balance_loss_clip": 1.04588938, "balance_loss_mlp": 1.01667833, "epoch": 0.9739674141766368, "flos": 27782326356480.0, "grad_norm": 2.3428411507381437, "language_loss": 0.70573044, "learning_rate": 7.073504717733048e-09, "loss": 0.72750753, "num_input_tokens_seen": 174877035, "step": 8100, "time_per_iteration": 2.8359949588775635 }, { "auxiliary_loss_clip": 0.01059484, "auxiliary_loss_mlp": 0.0099948, "balance_loss_clip": 1.01000428, "balance_loss_mlp": 0.99859774, "epoch": 0.9740876570672758, "flos": 68863057188480.0, "grad_norm": 0.7382934433446199, "language_loss": 0.57202196, "learning_rate": 7.008199102196855e-09, "loss": 0.59261161, "num_input_tokens_seen": 174938460, "step": 8101, "time_per_iteration": 3.2707202434539795 }, { "auxiliary_loss_clip": 0.01058313, "auxiliary_loss_mlp": 0.01001233, "balance_loss_clip": 1.01220894, "balance_loss_mlp": 1.00018966, "epoch": 0.974207899957915, "flos": 58236622646400.0, "grad_norm": 0.7912695580665378, "language_loss": 0.58959597, "learning_rate": 6.9431958219321464e-09, "loss": 0.61019146, "num_input_tokens_seen": 174994625, "step": 8102, "time_per_iteration": 3.2818055152893066 }, { "auxiliary_loss_clip": 0.01159294, "auxiliary_loss_mlp": 0.01024611, "balance_loss_clip": 1.04720902, "balance_loss_mlp": 1.01735115, "epoch": 0.9743281428485541, "flos": 22600057605120.0, "grad_norm": 1.7795398825008577, "language_loss": 0.77705264, "learning_rate": 6.878494886800146e-09, "loss": 0.79889166, "num_input_tokens_seen": 175015400, "step": 8103, "time_per_iteration": 2.715643882751465 }, { "auxiliary_loss_clip": 0.01159421, "auxiliary_loss_mlp": 0.01021904, "balance_loss_clip": 1.04643822, "balance_loss_mlp": 1.01499009, "epoch": 0.9744483857391931, "flos": 20008815488640.0, "grad_norm": 1.8632014741934768, "language_loss": 0.76423681, "learning_rate": 6.814096306615669e-09, "loss": 0.78605008, "num_input_tokens_seen": 175033540, "step": 8104, "time_per_iteration": 2.777142286300659 }, { "auxiliary_loss_clip": 0.01165399, "auxiliary_loss_mlp": 0.010223, "balance_loss_clip": 1.04485846, "balance_loss_mlp": 1.01459265, "epoch": 0.9745686286298323, "flos": 17675268520320.0, "grad_norm": 2.240338104603921, "language_loss": 0.65284556, "learning_rate": 6.750000091148011e-09, "loss": 0.67472255, "num_input_tokens_seen": 175050835, "step": 8105, "time_per_iteration": 2.6418890953063965 }, { "auxiliary_loss_clip": 0.01165501, "auxiliary_loss_mlp": 0.01025951, "balance_loss_clip": 1.04695106, "balance_loss_mlp": 1.01818514, "epoch": 0.9746888715204713, "flos": 29460252332160.0, "grad_norm": 2.3792996370819517, "language_loss": 0.72859025, "learning_rate": 6.686206250120729e-09, "loss": 0.75050473, "num_input_tokens_seen": 175072330, "step": 8106, "time_per_iteration": 2.7392749786376953 }, { "auxiliary_loss_clip": 0.01159678, "auxiliary_loss_mlp": 0.01025804, "balance_loss_clip": 1.04337358, "balance_loss_mlp": 1.01828778, "epoch": 0.9748091144111104, "flos": 18479308510080.0, "grad_norm": 1.9361026995612864, "language_loss": 0.75005871, "learning_rate": 6.622714793210749e-09, "loss": 0.77191353, "num_input_tokens_seen": 175091250, "step": 8107, "time_per_iteration": 2.71573543548584 }, { "auxiliary_loss_clip": 0.01165298, "auxiliary_loss_mlp": 0.01025429, "balance_loss_clip": 1.04586756, "balance_loss_mlp": 1.01809144, "epoch": 0.9749293573017496, "flos": 20665154753280.0, "grad_norm": 1.7784759323904782, "language_loss": 0.78845084, "learning_rate": 6.559525730050364e-09, "loss": 0.81035805, "num_input_tokens_seen": 175111350, "step": 8108, "time_per_iteration": 3.5744130611419678 }, { "auxiliary_loss_clip": 0.01158218, "auxiliary_loss_mlp": 0.01025108, "balance_loss_clip": 1.04645884, "balance_loss_mlp": 1.01821494, "epoch": 0.9750496001923886, "flos": 18478590238080.0, "grad_norm": 2.1590914161299715, "language_loss": 0.75984764, "learning_rate": 6.496639070224574e-09, "loss": 0.78168094, "num_input_tokens_seen": 175129835, "step": 8109, "time_per_iteration": 3.691282033920288 }, { "auxiliary_loss_clip": 0.01166449, "auxiliary_loss_mlp": 0.01025413, "balance_loss_clip": 1.04797268, "balance_loss_mlp": 1.01803684, "epoch": 0.9751698430830277, "flos": 19572967860480.0, "grad_norm": 2.7799094009206735, "language_loss": 0.83783311, "learning_rate": 6.4340548232739714e-09, "loss": 0.8597517, "num_input_tokens_seen": 175146035, "step": 8110, "time_per_iteration": 2.7230279445648193 }, { "auxiliary_loss_clip": 0.01160032, "auxiliary_loss_mlp": 0.01026747, "balance_loss_clip": 1.04630136, "balance_loss_mlp": 1.01954055, "epoch": 0.9752900859736668, "flos": 23550325862400.0, "grad_norm": 1.9007770516184332, "language_loss": 0.79264474, "learning_rate": 6.371772998692071e-09, "loss": 0.81451255, "num_input_tokens_seen": 175165290, "step": 8111, "time_per_iteration": 3.582205057144165 }, { "auxiliary_loss_clip": 0.01157842, "auxiliary_loss_mlp": 0.01028662, "balance_loss_clip": 1.04390264, "balance_loss_mlp": 1.02149415, "epoch": 0.9754103288643059, "flos": 20303211358080.0, "grad_norm": 2.6619841740309154, "language_loss": 0.65289927, "learning_rate": 6.309793605927094e-09, "loss": 0.67476422, "num_input_tokens_seen": 175183610, "step": 8112, "time_per_iteration": 2.881105661392212 }, { "auxiliary_loss_clip": 0.01163774, "auxiliary_loss_mlp": 0.01019879, "balance_loss_clip": 1.04589605, "balance_loss_mlp": 1.01312292, "epoch": 0.975530571754945, "flos": 19350680544000.0, "grad_norm": 1.844017604424595, "language_loss": 0.80225486, "learning_rate": 6.248116654381297e-09, "loss": 0.82409137, "num_input_tokens_seen": 175202080, "step": 8113, "time_per_iteration": 2.6474568843841553 }, { "auxiliary_loss_clip": 0.01162375, "auxiliary_loss_mlp": 0.01023418, "balance_loss_clip": 1.04545045, "balance_loss_mlp": 1.01634264, "epoch": 0.9756508146455841, "flos": 23583399310080.0, "grad_norm": 3.052257333706095, "language_loss": 0.72636735, "learning_rate": 6.186742153410751e-09, "loss": 0.74822527, "num_input_tokens_seen": 175221575, "step": 8114, "time_per_iteration": 2.793592929840088 }, { "auxiliary_loss_clip": 0.01159441, "auxiliary_loss_mlp": 0.01031433, "balance_loss_clip": 1.04832006, "balance_loss_mlp": 1.02377057, "epoch": 0.9757710575362232, "flos": 22966921163520.0, "grad_norm": 1.977893851429318, "language_loss": 0.87729657, "learning_rate": 6.125670112326453e-09, "loss": 0.89920533, "num_input_tokens_seen": 175240835, "step": 8115, "time_per_iteration": 2.7732183933258057 }, { "auxiliary_loss_clip": 0.01165049, "auxiliary_loss_mlp": 0.01029126, "balance_loss_clip": 1.04610872, "balance_loss_mlp": 1.02197385, "epoch": 0.9758913004268622, "flos": 27966009530880.0, "grad_norm": 2.2495785599134215, "language_loss": 0.700472, "learning_rate": 6.064900540392548e-09, "loss": 0.72241378, "num_input_tokens_seen": 175262930, "step": 8116, "time_per_iteration": 2.6839253902435303 }, { "auxiliary_loss_clip": 0.01152764, "auxiliary_loss_mlp": 0.01021894, "balance_loss_clip": 1.04521644, "balance_loss_mlp": 1.01505136, "epoch": 0.9760115433175014, "flos": 22200156512640.0, "grad_norm": 1.8826253515950206, "language_loss": 0.78767049, "learning_rate": 6.0044334468278835e-09, "loss": 0.80941707, "num_input_tokens_seen": 175282275, "step": 8117, "time_per_iteration": 2.771493911743164 }, { "auxiliary_loss_clip": 0.01154331, "auxiliary_loss_mlp": 0.01024692, "balance_loss_clip": 1.04644787, "balance_loss_mlp": 1.01751602, "epoch": 0.9761317862081405, "flos": 26250736389120.0, "grad_norm": 1.911355228396067, "language_loss": 0.72026587, "learning_rate": 5.944268840805345e-09, "loss": 0.74205613, "num_input_tokens_seen": 175303020, "step": 8118, "time_per_iteration": 2.7271156311035156 }, { "auxiliary_loss_clip": 0.01152715, "auxiliary_loss_mlp": 0.01025033, "balance_loss_clip": 1.0470947, "balance_loss_mlp": 1.0181365, "epoch": 0.9762520290987795, "flos": 26575440359040.0, "grad_norm": 2.439261439109994, "language_loss": 0.64120126, "learning_rate": 5.88440673145163e-09, "loss": 0.66297877, "num_input_tokens_seen": 175324070, "step": 8119, "time_per_iteration": 3.68282413482666 }, { "auxiliary_loss_clip": 0.01161523, "auxiliary_loss_mlp": 0.01024381, "balance_loss_clip": 1.04801011, "balance_loss_mlp": 1.01739836, "epoch": 0.9763722719894187, "flos": 18005036307840.0, "grad_norm": 7.634972711996731, "language_loss": 0.82723558, "learning_rate": 5.824847127848142e-09, "loss": 0.84909463, "num_input_tokens_seen": 175342595, "step": 8120, "time_per_iteration": 2.647742986679077 }, { "auxiliary_loss_clip": 0.01157476, "auxiliary_loss_mlp": 0.01023284, "balance_loss_clip": 1.04832351, "balance_loss_mlp": 1.01614904, "epoch": 0.9764925148800577, "flos": 22455660931200.0, "grad_norm": 1.8472775942963218, "language_loss": 0.78911018, "learning_rate": 5.765590039029433e-09, "loss": 0.81091774, "num_input_tokens_seen": 175361915, "step": 8121, "time_per_iteration": 2.758059501647949 }, { "auxiliary_loss_clip": 0.01165659, "auxiliary_loss_mlp": 0.01025088, "balance_loss_clip": 1.04756594, "balance_loss_mlp": 1.01785827, "epoch": 0.9766127577706968, "flos": 36757084786560.0, "grad_norm": 2.076181037063038, "language_loss": 0.71252906, "learning_rate": 5.706635473985422e-09, "loss": 0.73443651, "num_input_tokens_seen": 175385785, "step": 8122, "time_per_iteration": 2.795288324356079 }, { "auxiliary_loss_clip": 0.01157806, "auxiliary_loss_mlp": 0.01024273, "balance_loss_clip": 1.04396546, "balance_loss_mlp": 1.0171864, "epoch": 0.976733000661336, "flos": 22309971367680.0, "grad_norm": 1.956224426356344, "language_loss": 0.85032123, "learning_rate": 5.6479834416591764e-09, "loss": 0.87214202, "num_input_tokens_seen": 175405145, "step": 8123, "time_per_iteration": 2.689467430114746 }, { "auxiliary_loss_clip": 0.01158639, "auxiliary_loss_mlp": 0.01051478, "balance_loss_clip": 1.04493618, "balance_loss_mlp": 1.01568198, "epoch": 0.976853243551975, "flos": 25810938264960.0, "grad_norm": 1.7640247872174983, "language_loss": 0.68462026, "learning_rate": 5.589633950947803e-09, "loss": 0.70672143, "num_input_tokens_seen": 175422645, "step": 8124, "time_per_iteration": 2.735140323638916 }, { "auxiliary_loss_clip": 0.0116128, "auxiliary_loss_mlp": 0.0102511, "balance_loss_clip": 1.05006564, "balance_loss_mlp": 1.01726055, "epoch": 0.9769734864426141, "flos": 21397445326080.0, "grad_norm": 2.2590863982706213, "language_loss": 0.70163947, "learning_rate": 5.5315870107035535e-09, "loss": 0.72350341, "num_input_tokens_seen": 175440695, "step": 8125, "time_per_iteration": 2.668635845184326 }, { "auxiliary_loss_clip": 0.01152344, "auxiliary_loss_mlp": 0.01022215, "balance_loss_clip": 1.04522014, "balance_loss_mlp": 1.014902, "epoch": 0.9770937293332532, "flos": 13990977584640.0, "grad_norm": 1.9662093544709955, "language_loss": 0.78902268, "learning_rate": 5.473842629731607e-09, "loss": 0.81076837, "num_input_tokens_seen": 175459195, "step": 8126, "time_per_iteration": 2.685450553894043 }, { "auxiliary_loss_clip": 0.01167201, "auxiliary_loss_mlp": 0.01052975, "balance_loss_clip": 1.0477488, "balance_loss_mlp": 1.01638591, "epoch": 0.9772139722238923, "flos": 17931994001280.0, "grad_norm": 1.9769927819777944, "language_loss": 0.78156847, "learning_rate": 5.416400816792066e-09, "loss": 0.80377024, "num_input_tokens_seen": 175476710, "step": 8127, "time_per_iteration": 2.6134731769561768 }, { "auxiliary_loss_clip": 0.01165254, "auxiliary_loss_mlp": 0.01025909, "balance_loss_clip": 1.04558945, "balance_loss_mlp": 1.01870847, "epoch": 0.9773342151145313, "flos": 20446171488000.0, "grad_norm": 2.3355507143543335, "language_loss": 0.7869637, "learning_rate": 5.359261580598407e-09, "loss": 0.80887532, "num_input_tokens_seen": 175492550, "step": 8128, "time_per_iteration": 2.6074674129486084 }, { "auxiliary_loss_clip": 0.01166851, "auxiliary_loss_mlp": 0.01025904, "balance_loss_clip": 1.04837704, "balance_loss_mlp": 1.0182327, "epoch": 0.9774544580051704, "flos": 11837306949120.0, "grad_norm": 2.546297349873964, "language_loss": 0.78026581, "learning_rate": 5.302424929819027e-09, "loss": 0.80219334, "num_input_tokens_seen": 175506560, "step": 8129, "time_per_iteration": 2.600202798843384 }, { "auxiliary_loss_clip": 0.01165038, "auxiliary_loss_mlp": 0.01023453, "balance_loss_clip": 1.04443169, "balance_loss_mlp": 1.01507282, "epoch": 0.9775747008958096, "flos": 13479932833920.0, "grad_norm": 2.605347812547789, "language_loss": 0.73532718, "learning_rate": 5.24589087307592e-09, "loss": 0.7572121, "num_input_tokens_seen": 175524180, "step": 8130, "time_per_iteration": 2.6465229988098145 }, { "auxiliary_loss_clip": 0.01166011, "auxiliary_loss_mlp": 0.01022817, "balance_loss_clip": 1.04590726, "balance_loss_mlp": 1.01486611, "epoch": 0.9776949437864486, "flos": 59532314042880.0, "grad_norm": 1.3745532731694512, "language_loss": 0.64889348, "learning_rate": 5.189659418944891e-09, "loss": 0.67078173, "num_input_tokens_seen": 175554355, "step": 8131, "time_per_iteration": 3.020312547683716 }, { "auxiliary_loss_clip": 0.01164884, "auxiliary_loss_mlp": 0.01024307, "balance_loss_clip": 1.04676199, "balance_loss_mlp": 1.01762819, "epoch": 0.9778151866770877, "flos": 21178605715200.0, "grad_norm": 1.8774043212011216, "language_loss": 0.78580379, "learning_rate": 5.133730575956674e-09, "loss": 0.80769575, "num_input_tokens_seen": 175574025, "step": 8132, "time_per_iteration": 2.7638416290283203 }, { "auxiliary_loss_clip": 0.01162415, "auxiliary_loss_mlp": 0.01023983, "balance_loss_clip": 1.04693437, "balance_loss_mlp": 1.01649666, "epoch": 0.9779354295677268, "flos": 20886795624960.0, "grad_norm": 2.064145698648196, "language_loss": 0.72106731, "learning_rate": 5.0781043525953696e-09, "loss": 0.74293125, "num_input_tokens_seen": 175592090, "step": 8133, "time_per_iteration": 2.7834055423736572 }, { "auxiliary_loss_clip": 0.01153127, "auxiliary_loss_mlp": 0.0102197, "balance_loss_clip": 1.04712415, "balance_loss_mlp": 1.01530671, "epoch": 0.9780556724583659, "flos": 23440618748160.0, "grad_norm": 1.7380751709704272, "language_loss": 0.73810285, "learning_rate": 5.0227807572995605e-09, "loss": 0.75985384, "num_input_tokens_seen": 175614065, "step": 8134, "time_per_iteration": 3.6114814281463623 }, { "auxiliary_loss_clip": 0.01159567, "auxiliary_loss_mlp": 0.01022512, "balance_loss_clip": 1.04421711, "balance_loss_mlp": 1.01546633, "epoch": 0.9781759153490049, "flos": 20923244951040.0, "grad_norm": 2.8229272916588544, "language_loss": 0.67436898, "learning_rate": 4.967759798461646e-09, "loss": 0.69618982, "num_input_tokens_seen": 175632410, "step": 8135, "time_per_iteration": 3.8375561237335205 }, { "auxiliary_loss_clip": 0.01162929, "auxiliary_loss_mlp": 0.01024526, "balance_loss_clip": 1.04576921, "balance_loss_mlp": 1.01746559, "epoch": 0.9782961582396441, "flos": 28293191539200.0, "grad_norm": 1.9598889723240351, "language_loss": 0.74834627, "learning_rate": 4.913041484428282e-09, "loss": 0.77022082, "num_input_tokens_seen": 175652885, "step": 8136, "time_per_iteration": 2.77154278755188 }, { "auxiliary_loss_clip": 0.01163373, "auxiliary_loss_mlp": 0.01028039, "balance_loss_clip": 1.04684901, "balance_loss_mlp": 1.02103531, "epoch": 0.9784164011302832, "flos": 25552955808000.0, "grad_norm": 1.8806645118086363, "language_loss": 0.74285996, "learning_rate": 4.858625823500384e-09, "loss": 0.76477408, "num_input_tokens_seen": 175670585, "step": 8137, "time_per_iteration": 3.6362173557281494 }, { "auxiliary_loss_clip": 0.0116694, "auxiliary_loss_mlp": 0.01025803, "balance_loss_clip": 1.04686117, "balance_loss_mlp": 1.01835799, "epoch": 0.9785366440209222, "flos": 29965945956480.0, "grad_norm": 2.0498838673772717, "language_loss": 0.73123324, "learning_rate": 4.80451282393246e-09, "loss": 0.75316066, "num_input_tokens_seen": 175690570, "step": 8138, "time_per_iteration": 2.7736833095550537 }, { "auxiliary_loss_clip": 0.01161956, "auxiliary_loss_mlp": 0.01022904, "balance_loss_clip": 1.0468992, "balance_loss_mlp": 1.01545072, "epoch": 0.9786568869115614, "flos": 32343591847680.0, "grad_norm": 2.612371456923748, "language_loss": 0.67283875, "learning_rate": 4.750702493933722e-09, "loss": 0.69468737, "num_input_tokens_seen": 175710455, "step": 8139, "time_per_iteration": 2.6994240283966064 }, { "auxiliary_loss_clip": 0.01157247, "auxiliary_loss_mlp": 0.01054793, "balance_loss_clip": 1.0470891, "balance_loss_mlp": 1.02028537, "epoch": 0.9787771298022004, "flos": 23331414424320.0, "grad_norm": 2.0137083401900955, "language_loss": 0.84781104, "learning_rate": 4.697194841666974e-09, "loss": 0.86993146, "num_input_tokens_seen": 175729380, "step": 8140, "time_per_iteration": 2.672344446182251 }, { "auxiliary_loss_clip": 0.01163941, "auxiliary_loss_mlp": 0.01031727, "balance_loss_clip": 1.04570627, "balance_loss_mlp": 1.0235846, "epoch": 0.9788973726928395, "flos": 21468548298240.0, "grad_norm": 2.1848634469273835, "language_loss": 0.81742066, "learning_rate": 4.6439898752492764e-09, "loss": 0.8393774, "num_input_tokens_seen": 175749520, "step": 8141, "time_per_iteration": 2.6770238876342773 }, { "auxiliary_loss_clip": 0.01060227, "auxiliary_loss_mlp": 0.01035491, "balance_loss_clip": 1.00747359, "balance_loss_mlp": 1.00315702, "epoch": 0.9790176155834787, "flos": 68897459439360.0, "grad_norm": 0.7501598442229609, "language_loss": 0.63675535, "learning_rate": 4.591087602751731e-09, "loss": 0.65771258, "num_input_tokens_seen": 175811380, "step": 8142, "time_per_iteration": 3.3569252490997314 }, { "auxiliary_loss_clip": 0.01161405, "auxiliary_loss_mlp": 0.01023702, "balance_loss_clip": 1.04607606, "balance_loss_mlp": 1.0169847, "epoch": 0.9791378584741177, "flos": 21430877909760.0, "grad_norm": 1.9554774570364075, "language_loss": 0.71825194, "learning_rate": 4.538488032199916e-09, "loss": 0.74010301, "num_input_tokens_seen": 175829480, "step": 8143, "time_per_iteration": 2.8095993995666504 }, { "auxiliary_loss_clip": 0.01164883, "auxiliary_loss_mlp": 0.01027455, "balance_loss_clip": 1.0440017, "balance_loss_mlp": 1.02011454, "epoch": 0.9792581013647568, "flos": 20153032594560.0, "grad_norm": 2.7089306507281385, "language_loss": 0.69005132, "learning_rate": 4.486191171572784e-09, "loss": 0.71197474, "num_input_tokens_seen": 175846750, "step": 8144, "time_per_iteration": 2.6757826805114746 }, { "auxiliary_loss_clip": 0.01166003, "auxiliary_loss_mlp": 0.01021582, "balance_loss_clip": 1.04841805, "balance_loss_mlp": 1.0142746, "epoch": 0.9793783442553959, "flos": 23728191033600.0, "grad_norm": 1.534108866239952, "language_loss": 0.77367687, "learning_rate": 4.434197028803766e-09, "loss": 0.79555273, "num_input_tokens_seen": 175865975, "step": 8145, "time_per_iteration": 3.67901873588562 }, { "auxiliary_loss_clip": 0.01163977, "auxiliary_loss_mlp": 0.01024876, "balance_loss_clip": 1.0463587, "balance_loss_mlp": 1.01709747, "epoch": 0.979498587146035, "flos": 23038742407680.0, "grad_norm": 2.092699234067167, "language_loss": 0.82196295, "learning_rate": 4.3825056117805514e-09, "loss": 0.84385151, "num_input_tokens_seen": 175881860, "step": 8146, "time_per_iteration": 2.776064157485962 }, { "auxiliary_loss_clip": 0.01167495, "auxiliary_loss_mlp": 0.01025525, "balance_loss_clip": 1.04628038, "balance_loss_mlp": 1.01823556, "epoch": 0.979618830036674, "flos": 14318841951360.0, "grad_norm": 4.801567069074544, "language_loss": 0.7949698, "learning_rate": 4.331116928344425e-09, "loss": 0.8168999, "num_input_tokens_seen": 175898175, "step": 8147, "time_per_iteration": 2.6575427055358887 }, { "auxiliary_loss_clip": 0.01163779, "auxiliary_loss_mlp": 0.01051143, "balance_loss_clip": 1.04614377, "balance_loss_mlp": 1.01684117, "epoch": 0.9797390729273132, "flos": 16727514215040.0, "grad_norm": 1.9043262069776694, "language_loss": 0.6259042, "learning_rate": 4.28003098629115e-09, "loss": 0.64805341, "num_input_tokens_seen": 175914310, "step": 8148, "time_per_iteration": 2.7398388385772705 }, { "auxiliary_loss_clip": 0.01154359, "auxiliary_loss_mlp": 0.01024863, "balance_loss_clip": 1.04264426, "balance_loss_mlp": 1.01773739, "epoch": 0.9798593158179523, "flos": 24532661986560.0, "grad_norm": 1.8238211703956113, "language_loss": 0.78537011, "learning_rate": 4.229247793370305e-09, "loss": 0.80716228, "num_input_tokens_seen": 175933435, "step": 8149, "time_per_iteration": 2.771850824356079 }, { "auxiliary_loss_clip": 0.01167318, "auxiliary_loss_mlp": 0.01033677, "balance_loss_clip": 1.04796982, "balance_loss_mlp": 1.02603817, "epoch": 0.9799795587085913, "flos": 27308808339840.0, "grad_norm": 1.6339495463871876, "language_loss": 0.70326674, "learning_rate": 4.178767357285951e-09, "loss": 0.72527659, "num_input_tokens_seen": 175955065, "step": 8150, "time_per_iteration": 2.7863681316375732 }, { "auxiliary_loss_clip": 0.01163865, "auxiliary_loss_mlp": 0.01053259, "balance_loss_clip": 1.0468148, "balance_loss_mlp": 1.01577735, "epoch": 0.9800998015992305, "flos": 26286575184000.0, "grad_norm": 2.0645710662671792, "language_loss": 0.71366858, "learning_rate": 4.128589685695516e-09, "loss": 0.73583984, "num_input_tokens_seen": 175975490, "step": 8151, "time_per_iteration": 2.743373394012451 }, { "auxiliary_loss_clip": 0.01165713, "auxiliary_loss_mlp": 0.01025994, "balance_loss_clip": 1.04611766, "balance_loss_mlp": 1.01839423, "epoch": 0.9802200444898695, "flos": 16723635546240.0, "grad_norm": 2.229491575249115, "language_loss": 0.84683639, "learning_rate": 4.078714786211135e-09, "loss": 0.86875343, "num_input_tokens_seen": 175991340, "step": 8152, "time_per_iteration": 2.7499046325683594 }, { "auxiliary_loss_clip": 0.01158497, "auxiliary_loss_mlp": 0.01023196, "balance_loss_clip": 1.04572022, "balance_loss_mlp": 1.01612127, "epoch": 0.9803402873805086, "flos": 24900459298560.0, "grad_norm": 2.196677511814093, "language_loss": 0.76489812, "learning_rate": 4.029142666398977e-09, "loss": 0.78671509, "num_input_tokens_seen": 176011505, "step": 8153, "time_per_iteration": 2.7827610969543457 }, { "auxiliary_loss_clip": 0.01163984, "auxiliary_loss_mlp": 0.01027599, "balance_loss_clip": 1.04700828, "balance_loss_mlp": 1.02081895, "epoch": 0.9804605302711478, "flos": 22564937082240.0, "grad_norm": 2.0133637842298255, "language_loss": 0.80429471, "learning_rate": 3.979873333778805e-09, "loss": 0.8262105, "num_input_tokens_seen": 176029680, "step": 8154, "time_per_iteration": 2.7437744140625 }, { "auxiliary_loss_clip": 0.01164319, "auxiliary_loss_mlp": 0.01024511, "balance_loss_clip": 1.04726064, "balance_loss_mlp": 1.01756978, "epoch": 0.9805807731617868, "flos": 38905368382080.0, "grad_norm": 2.1532675541056916, "language_loss": 0.73756987, "learning_rate": 3.930906795824862e-09, "loss": 0.75945818, "num_input_tokens_seen": 176050355, "step": 8155, "time_per_iteration": 2.891676187515259 }, { "auxiliary_loss_clip": 0.01158568, "auxiliary_loss_mlp": 0.01024798, "balance_loss_clip": 1.04464912, "balance_loss_mlp": 1.01751709, "epoch": 0.9807010160524259, "flos": 17821999578240.0, "grad_norm": 2.0160217839005394, "language_loss": 0.77142471, "learning_rate": 3.882243059965207e-09, "loss": 0.79325831, "num_input_tokens_seen": 176068070, "step": 8156, "time_per_iteration": 2.706758499145508 }, { "auxiliary_loss_clip": 0.01156405, "auxiliary_loss_mlp": 0.01027206, "balance_loss_clip": 1.04503822, "balance_loss_mlp": 1.01917994, "epoch": 0.980821258943065, "flos": 13552975140480.0, "grad_norm": 3.195502698140819, "language_loss": 0.65806615, "learning_rate": 3.833882133582156e-09, "loss": 0.67990232, "num_input_tokens_seen": 176083730, "step": 8157, "time_per_iteration": 2.835420608520508 }, { "auxiliary_loss_clip": 0.01165931, "auxiliary_loss_mlp": 0.01025304, "balance_loss_clip": 1.04702258, "balance_loss_mlp": 1.01744795, "epoch": 0.9809415018337041, "flos": 21689794120320.0, "grad_norm": 1.746188944798706, "language_loss": 0.7820127, "learning_rate": 3.785824024012285e-09, "loss": 0.80392504, "num_input_tokens_seen": 176102730, "step": 8158, "time_per_iteration": 2.6777918338775635 }, { "auxiliary_loss_clip": 0.0115106, "auxiliary_loss_mlp": 0.01024196, "balance_loss_clip": 1.04643714, "balance_loss_mlp": 1.01672125, "epoch": 0.9810617447243432, "flos": 23294857357440.0, "grad_norm": 1.4834903261768824, "language_loss": 0.78729463, "learning_rate": 3.738068738545541e-09, "loss": 0.80904716, "num_input_tokens_seen": 176121815, "step": 8159, "time_per_iteration": 2.7920241355895996 }, { "auxiliary_loss_clip": 0.01167934, "auxiliary_loss_mlp": 0.01029926, "balance_loss_clip": 1.04889059, "balance_loss_mlp": 1.02234483, "epoch": 0.9811819876149822, "flos": 18332038748160.0, "grad_norm": 2.526534678791638, "language_loss": 0.78796446, "learning_rate": 3.6906162844265733e-09, "loss": 0.80994308, "num_input_tokens_seen": 176138900, "step": 8160, "time_per_iteration": 3.8191466331481934 }, { "auxiliary_loss_clip": 0.01156809, "auxiliary_loss_mlp": 0.01030837, "balance_loss_clip": 1.04760671, "balance_loss_mlp": 1.02306187, "epoch": 0.9813022305056214, "flos": 22601961025920.0, "grad_norm": 1.7735594941161603, "language_loss": 0.71002132, "learning_rate": 3.643466668853845e-09, "loss": 0.73189777, "num_input_tokens_seen": 176156925, "step": 8161, "time_per_iteration": 2.8668930530548096 }, { "auxiliary_loss_clip": 0.01164997, "auxiliary_loss_mlp": 0.01020888, "balance_loss_clip": 1.04965234, "balance_loss_mlp": 1.01389658, "epoch": 0.9814224733962604, "flos": 25413335642880.0, "grad_norm": 2.0048813893250546, "language_loss": 0.75066781, "learning_rate": 3.59661989898008e-09, "loss": 0.77252668, "num_input_tokens_seen": 176177980, "step": 8162, "time_per_iteration": 3.7230660915374756 }, { "auxiliary_loss_clip": 0.01147244, "auxiliary_loss_mlp": 0.01023204, "balance_loss_clip": 1.04408765, "balance_loss_mlp": 1.01648378, "epoch": 0.9815427162868995, "flos": 25007185584000.0, "grad_norm": 1.8389897515933569, "language_loss": 0.76626885, "learning_rate": 3.5500759819115934e-09, "loss": 0.7879734, "num_input_tokens_seen": 176198345, "step": 8163, "time_per_iteration": 3.6711342334747314 }, { "auxiliary_loss_clip": 0.01167567, "auxiliary_loss_mlp": 0.01026724, "balance_loss_clip": 1.0485177, "balance_loss_mlp": 1.01920438, "epoch": 0.9816629591775387, "flos": 20662604887680.0, "grad_norm": 1.9988329523315602, "language_loss": 0.81179422, "learning_rate": 3.5038349247094034e-09, "loss": 0.83373708, "num_input_tokens_seen": 176215605, "step": 8164, "time_per_iteration": 2.7279794216156006 }, { "auxiliary_loss_clip": 0.0116063, "auxiliary_loss_mlp": 0.0102849, "balance_loss_clip": 1.04657662, "balance_loss_mlp": 1.02138782, "epoch": 0.9817832020681777, "flos": 17712220636800.0, "grad_norm": 1.8771581115841331, "language_loss": 0.76993829, "learning_rate": 3.4578967343878994e-09, "loss": 0.79182947, "num_input_tokens_seen": 176231810, "step": 8165, "time_per_iteration": 2.7615127563476562 }, { "auxiliary_loss_clip": 0.01157937, "auxiliary_loss_mlp": 0.01021196, "balance_loss_clip": 1.04662442, "balance_loss_mlp": 1.01455641, "epoch": 0.9819034449588168, "flos": 22530032040960.0, "grad_norm": 2.083504584774252, "language_loss": 0.81133819, "learning_rate": 3.4122614179161733e-09, "loss": 0.83312958, "num_input_tokens_seen": 176251770, "step": 8166, "time_per_iteration": 2.7658567428588867 }, { "auxiliary_loss_clip": 0.01145646, "auxiliary_loss_mlp": 0.01020529, "balance_loss_clip": 1.04469395, "balance_loss_mlp": 1.01380515, "epoch": 0.9820236878494559, "flos": 20011221699840.0, "grad_norm": 2.083978173408206, "language_loss": 0.78265762, "learning_rate": 3.36692898221691e-09, "loss": 0.80431932, "num_input_tokens_seen": 176270135, "step": 8167, "time_per_iteration": 2.9354710578918457 }, { "auxiliary_loss_clip": 0.01162696, "auxiliary_loss_mlp": 0.01022308, "balance_loss_clip": 1.04637921, "balance_loss_mlp": 1.01584136, "epoch": 0.982143930740095, "flos": 18807316531200.0, "grad_norm": 1.9251928030427068, "language_loss": 0.73544759, "learning_rate": 3.3218994341668305e-09, "loss": 0.75729764, "num_input_tokens_seen": 176289065, "step": 8168, "time_per_iteration": 2.770172119140625 }, { "auxiliary_loss_clip": 0.01163401, "auxiliary_loss_mlp": 0.01027058, "balance_loss_clip": 1.04686666, "balance_loss_mlp": 1.01945806, "epoch": 0.982264173630734, "flos": 26578026138240.0, "grad_norm": 1.6967683171274246, "language_loss": 0.75841051, "learning_rate": 3.2771727805971373e-09, "loss": 0.7803151, "num_input_tokens_seen": 176310450, "step": 8169, "time_per_iteration": 2.7174901962280273 }, { "auxiliary_loss_clip": 0.01146892, "auxiliary_loss_mlp": 0.01024243, "balance_loss_clip": 1.044397, "balance_loss_mlp": 1.01715565, "epoch": 0.9823844165213732, "flos": 22014462176640.0, "grad_norm": 1.8957923483006174, "language_loss": 0.77223134, "learning_rate": 3.232749028292847e-09, "loss": 0.79394269, "num_input_tokens_seen": 176327415, "step": 8170, "time_per_iteration": 2.724194288253784 }, { "auxiliary_loss_clip": 0.01164586, "auxiliary_loss_mlp": 0.01031015, "balance_loss_clip": 1.04468715, "balance_loss_mlp": 1.02322161, "epoch": 0.9825046594120123, "flos": 21908166854400.0, "grad_norm": 1.9370221024025642, "language_loss": 0.88321662, "learning_rate": 3.188628183992792e-09, "loss": 0.90517259, "num_input_tokens_seen": 176347680, "step": 8171, "time_per_iteration": 3.554898738861084 }, { "auxiliary_loss_clip": 0.01060382, "auxiliary_loss_mlp": 0.01002403, "balance_loss_clip": 1.00650573, "balance_loss_mlp": 1.00145519, "epoch": 0.9826249023026513, "flos": 59494610718720.0, "grad_norm": 0.7432178370640341, "language_loss": 0.62535572, "learning_rate": 3.1448102543902844e-09, "loss": 0.64598358, "num_input_tokens_seen": 176411595, "step": 8172, "time_per_iteration": 3.2477409839630127 }, { "auxiliary_loss_clip": 0.01155977, "auxiliary_loss_mlp": 0.01031623, "balance_loss_clip": 1.04852509, "balance_loss_mlp": 1.02473843, "epoch": 0.9827451451932905, "flos": 16071031296000.0, "grad_norm": 1.8416140113017216, "language_loss": 0.67657471, "learning_rate": 3.1012952461324515e-09, "loss": 0.69845068, "num_input_tokens_seen": 176430570, "step": 8173, "time_per_iteration": 2.7119643688201904 }, { "auxiliary_loss_clip": 0.01160618, "auxiliary_loss_mlp": 0.01025514, "balance_loss_clip": 1.04789758, "balance_loss_mlp": 1.01850748, "epoch": 0.9828653880839295, "flos": 20262775622400.0, "grad_norm": 2.354296337928453, "language_loss": 0.73968613, "learning_rate": 3.0580831658204575e-09, "loss": 0.76154745, "num_input_tokens_seen": 176448150, "step": 8174, "time_per_iteration": 2.767244577407837 }, { "auxiliary_loss_clip": 0.0116037, "auxiliary_loss_mlp": 0.01025675, "balance_loss_clip": 1.04669952, "balance_loss_mlp": 1.01872516, "epoch": 0.9829856309745686, "flos": 21616141282560.0, "grad_norm": 3.8886932419979643, "language_loss": 0.77931082, "learning_rate": 3.015174020009281e-09, "loss": 0.8011713, "num_input_tokens_seen": 176467475, "step": 8175, "time_per_iteration": 2.7517244815826416 }, { "auxiliary_loss_clip": 0.01161034, "auxiliary_loss_mlp": 0.0102543, "balance_loss_clip": 1.04717898, "balance_loss_mlp": 1.01841199, "epoch": 0.9831058738652078, "flos": 23764209396480.0, "grad_norm": 1.7613283237256965, "language_loss": 0.75169456, "learning_rate": 2.9725678152086043e-09, "loss": 0.77355921, "num_input_tokens_seen": 176486045, "step": 8176, "time_per_iteration": 2.764415979385376 }, { "auxiliary_loss_clip": 0.01149121, "auxiliary_loss_mlp": 0.0102405, "balance_loss_clip": 1.04571807, "balance_loss_mlp": 1.01678133, "epoch": 0.9832261167558468, "flos": 11320911072000.0, "grad_norm": 3.009647723487649, "language_loss": 0.82586241, "learning_rate": 2.930264557881257e-09, "loss": 0.84759414, "num_input_tokens_seen": 176501230, "step": 8177, "time_per_iteration": 2.673741579055786 }, { "auxiliary_loss_clip": 0.01058973, "auxiliary_loss_mlp": 0.0100228, "balance_loss_clip": 1.006531, "balance_loss_mlp": 1.00128484, "epoch": 0.9833463596464859, "flos": 60000304343040.0, "grad_norm": 0.8446260393483613, "language_loss": 0.58166355, "learning_rate": 2.8882642544452163e-09, "loss": 0.60227609, "num_input_tokens_seen": 176565955, "step": 8178, "time_per_iteration": 3.2779014110565186 }, { "auxiliary_loss_clip": 0.01149672, "auxiliary_loss_mlp": 0.01027897, "balance_loss_clip": 1.04514503, "balance_loss_mlp": 1.02037477, "epoch": 0.983466602537125, "flos": 13626699805440.0, "grad_norm": 2.2371300525694022, "language_loss": 0.74329245, "learning_rate": 2.8465669112716083e-09, "loss": 0.76506817, "num_input_tokens_seen": 176583480, "step": 8179, "time_per_iteration": 2.8922219276428223 }, { "auxiliary_loss_clip": 0.0116398, "auxiliary_loss_mlp": 0.01055182, "balance_loss_clip": 1.04612994, "balance_loss_mlp": 1.02028131, "epoch": 0.9835868454277641, "flos": 22926844563840.0, "grad_norm": 1.9440236638548836, "language_loss": 0.76311433, "learning_rate": 2.8051725346858177e-09, "loss": 0.78530598, "num_input_tokens_seen": 176603740, "step": 8180, "time_per_iteration": 2.7057383060455322 }, { "auxiliary_loss_clip": 0.01165411, "auxiliary_loss_mlp": 0.01023629, "balance_loss_clip": 1.04470181, "balance_loss_mlp": 1.01638687, "epoch": 0.9837070883184031, "flos": 27673409341440.0, "grad_norm": 3.3403018685486625, "language_loss": 0.70721352, "learning_rate": 2.7640811309674883e-09, "loss": 0.72910392, "num_input_tokens_seen": 176623240, "step": 8181, "time_per_iteration": 2.6788206100463867 }, { "auxiliary_loss_clip": 0.01148545, "auxiliary_loss_mlp": 0.01026315, "balance_loss_clip": 1.04666376, "balance_loss_mlp": 1.01945472, "epoch": 0.9838273312090423, "flos": 29241951425280.0, "grad_norm": 1.6322735558273924, "language_loss": 0.80689812, "learning_rate": 2.7232927063498557e-09, "loss": 0.82864672, "num_input_tokens_seen": 176643615, "step": 8182, "time_per_iteration": 2.737269878387451 }, { "auxiliary_loss_clip": 0.01163101, "auxiliary_loss_mlp": 0.01024021, "balance_loss_clip": 1.04624295, "balance_loss_mlp": 1.01650524, "epoch": 0.9839475740996814, "flos": 40110207304320.0, "grad_norm": 2.160480273042554, "language_loss": 0.68907833, "learning_rate": 2.682807267020859e-09, "loss": 0.71094954, "num_input_tokens_seen": 176666375, "step": 8183, "time_per_iteration": 2.8594698905944824 }, { "auxiliary_loss_clip": 0.01162543, "auxiliary_loss_mlp": 0.01029125, "balance_loss_clip": 1.04687333, "balance_loss_mlp": 1.02159691, "epoch": 0.9840678169903204, "flos": 24169389788160.0, "grad_norm": 5.888107565607566, "language_loss": 0.62229937, "learning_rate": 2.642624819121808e-09, "loss": 0.64421606, "num_input_tokens_seen": 176686525, "step": 8184, "time_per_iteration": 2.69157075881958 }, { "auxiliary_loss_clip": 0.01157718, "auxiliary_loss_mlp": 0.01023459, "balance_loss_clip": 1.04722047, "balance_loss_mlp": 1.01658642, "epoch": 0.9841880598809596, "flos": 14684484447360.0, "grad_norm": 1.9462574570507885, "language_loss": 0.61776435, "learning_rate": 2.6027453687487154e-09, "loss": 0.63957608, "num_input_tokens_seen": 176703615, "step": 8185, "time_per_iteration": 2.721323013305664 }, { "auxiliary_loss_clip": 0.01159679, "auxiliary_loss_mlp": 0.01024967, "balance_loss_clip": 1.0473721, "balance_loss_mlp": 1.01711071, "epoch": 0.9843083027715986, "flos": 22344768668160.0, "grad_norm": 2.4018385282644186, "language_loss": 0.54125285, "learning_rate": 2.5631689219509643e-09, "loss": 0.56309932, "num_input_tokens_seen": 176722295, "step": 8186, "time_per_iteration": 2.789299726486206 }, { "auxiliary_loss_clip": 0.01158104, "auxiliary_loss_mlp": 0.01025471, "balance_loss_clip": 1.04647446, "balance_loss_mlp": 1.01886392, "epoch": 0.9844285456622377, "flos": 21800111765760.0, "grad_norm": 1.6103901356594297, "language_loss": 0.83662206, "learning_rate": 2.523895484732197e-09, "loss": 0.8584578, "num_input_tokens_seen": 176741750, "step": 8187, "time_per_iteration": 4.533081769943237 }, { "auxiliary_loss_clip": 0.01169645, "auxiliary_loss_mlp": 0.01022975, "balance_loss_clip": 1.04819739, "balance_loss_mlp": 1.01528871, "epoch": 0.9845487885528769, "flos": 18035380321920.0, "grad_norm": 2.141473212094935, "language_loss": 0.74761903, "learning_rate": 2.4849250630505357e-09, "loss": 0.7695452, "num_input_tokens_seen": 176759995, "step": 8188, "time_per_iteration": 2.6274876594543457 }, { "auxiliary_loss_clip": 0.01138472, "auxiliary_loss_mlp": 0.0102315, "balance_loss_clip": 1.04679298, "balance_loss_mlp": 1.0165906, "epoch": 0.9846690314435159, "flos": 25228610974080.0, "grad_norm": 1.8711367170021445, "language_loss": 0.73705494, "learning_rate": 2.4462576628172528e-09, "loss": 0.75867122, "num_input_tokens_seen": 176778625, "step": 8189, "time_per_iteration": 3.8056111335754395 }, { "auxiliary_loss_clip": 0.01158991, "auxiliary_loss_mlp": 0.01027827, "balance_loss_clip": 1.04574418, "balance_loss_mlp": 1.02048111, "epoch": 0.984789274334155, "flos": 18552171248640.0, "grad_norm": 1.9034805303821414, "language_loss": 0.73751378, "learning_rate": 2.407893289898766e-09, "loss": 0.75938201, "num_input_tokens_seen": 176797655, "step": 8190, "time_per_iteration": 2.6952755451202393 }, { "auxiliary_loss_clip": 0.01149498, "auxiliary_loss_mlp": 0.01026428, "balance_loss_clip": 1.04585361, "balance_loss_mlp": 1.01881075, "epoch": 0.984909517224794, "flos": 27345437233920.0, "grad_norm": 2.7162952868053507, "language_loss": 0.83933634, "learning_rate": 2.3698319501144202e-09, "loss": 0.86109561, "num_input_tokens_seen": 176818640, "step": 8191, "time_per_iteration": 2.7825067043304443 }, { "auxiliary_loss_clip": 0.01166144, "auxiliary_loss_mlp": 0.01028109, "balance_loss_clip": 1.04460478, "balance_loss_mlp": 1.02003241, "epoch": 0.9850297601154332, "flos": 18734058743040.0, "grad_norm": 1.6846195215427684, "language_loss": 0.73318881, "learning_rate": 2.3320736492382644e-09, "loss": 0.75513136, "num_input_tokens_seen": 176837475, "step": 8192, "time_per_iteration": 2.6025052070617676 }, { "auxiliary_loss_clip": 0.01163993, "auxiliary_loss_mlp": 0.01022102, "balance_loss_clip": 1.04760742, "balance_loss_mlp": 1.0153904, "epoch": 0.9851500030060723, "flos": 22308247514880.0, "grad_norm": 1.7437575054231973, "language_loss": 0.68062735, "learning_rate": 2.29461839299816e-09, "loss": 0.7024883, "num_input_tokens_seen": 176857190, "step": 8193, "time_per_iteration": 2.6818583011627197 }, { "auxiliary_loss_clip": 0.01159627, "auxiliary_loss_mlp": 0.01022123, "balance_loss_clip": 1.04728138, "balance_loss_mlp": 1.0150007, "epoch": 0.9852702458967113, "flos": 26353691746560.0, "grad_norm": 1.6547702826868134, "language_loss": 0.80069482, "learning_rate": 2.257466187076229e-09, "loss": 0.82251239, "num_input_tokens_seen": 176876395, "step": 8194, "time_per_iteration": 2.7291259765625 }, { "auxiliary_loss_clip": 0.01164769, "auxiliary_loss_mlp": 0.01057727, "balance_loss_clip": 1.04499197, "balance_loss_mlp": 1.02233219, "epoch": 0.9853904887873505, "flos": 20883599314560.0, "grad_norm": 1.9566858952126194, "language_loss": 0.71066743, "learning_rate": 2.2206170371081854e-09, "loss": 0.73289239, "num_input_tokens_seen": 176894980, "step": 8195, "time_per_iteration": 2.7236781120300293 }, { "auxiliary_loss_clip": 0.01161157, "auxiliary_loss_mlp": 0.0102874, "balance_loss_clip": 1.04660118, "balance_loss_mlp": 1.02121186, "epoch": 0.9855107316779895, "flos": 25263444188160.0, "grad_norm": 1.74370862241684, "language_loss": 0.84768873, "learning_rate": 2.1840709486842247e-09, "loss": 0.86958772, "num_input_tokens_seen": 176914600, "step": 8196, "time_per_iteration": 2.6921634674072266 }, { "auxiliary_loss_clip": 0.01153859, "auxiliary_loss_mlp": 0.01024123, "balance_loss_clip": 1.04546428, "balance_loss_mlp": 1.01700342, "epoch": 0.9856309745686286, "flos": 19062102677760.0, "grad_norm": 2.431527693768726, "language_loss": 0.79545712, "learning_rate": 2.1478279273481335e-09, "loss": 0.81723696, "num_input_tokens_seen": 176933085, "step": 8197, "time_per_iteration": 3.600154161453247 }, { "auxiliary_loss_clip": 0.01162208, "auxiliary_loss_mlp": 0.01027572, "balance_loss_clip": 1.04863214, "balance_loss_mlp": 1.02082205, "epoch": 0.9857512174592677, "flos": 34130758060800.0, "grad_norm": 2.260049950407628, "language_loss": 0.80269676, "learning_rate": 2.1118879785981815e-09, "loss": 0.82459462, "num_input_tokens_seen": 176953225, "step": 8198, "time_per_iteration": 2.8189523220062256 }, { "auxiliary_loss_clip": 0.01158495, "auxiliary_loss_mlp": 0.01022017, "balance_loss_clip": 1.04635191, "balance_loss_mlp": 1.01520765, "epoch": 0.9858714603499068, "flos": 25994693266560.0, "grad_norm": 1.7116123484329104, "language_loss": 0.79339552, "learning_rate": 2.0762511078862288e-09, "loss": 0.81520069, "num_input_tokens_seen": 176973570, "step": 8199, "time_per_iteration": 2.6757171154022217 }, { "auxiliary_loss_clip": 0.01164809, "auxiliary_loss_mlp": 0.01022453, "balance_loss_clip": 1.04475367, "balance_loss_mlp": 1.01560092, "epoch": 0.9859917032405459, "flos": 23696230907520.0, "grad_norm": 1.701632133012378, "language_loss": 0.64665616, "learning_rate": 2.0409173206186183e-09, "loss": 0.66852874, "num_input_tokens_seen": 176992810, "step": 8200, "time_per_iteration": 2.747140645980835 }, { "auxiliary_loss_clip": 0.01154939, "auxiliary_loss_mlp": 0.01021375, "balance_loss_clip": 1.04798555, "balance_loss_mlp": 1.01438951, "epoch": 0.986111946131185, "flos": 19938287134080.0, "grad_norm": 2.176757511620517, "language_loss": 0.86874896, "learning_rate": 2.0058866221550617e-09, "loss": 0.89051211, "num_input_tokens_seen": 177011050, "step": 8201, "time_per_iteration": 2.7360870838165283 }, { "auxiliary_loss_clip": 0.0116393, "auxiliary_loss_mlp": 0.01025735, "balance_loss_clip": 1.04492259, "balance_loss_mlp": 1.01825452, "epoch": 0.9862321890218241, "flos": 19828831415040.0, "grad_norm": 1.982055626776624, "language_loss": 0.74964273, "learning_rate": 1.971159017809976e-09, "loss": 0.77153939, "num_input_tokens_seen": 177029340, "step": 8202, "time_per_iteration": 2.6339669227600098 }, { "auxiliary_loss_clip": 0.01161138, "auxiliary_loss_mlp": 0.01030492, "balance_loss_clip": 1.04624331, "balance_loss_mlp": 1.02289855, "epoch": 0.9863524319124631, "flos": 21652051904640.0, "grad_norm": 2.2039205033130993, "language_loss": 0.77617389, "learning_rate": 1.93673451285159e-09, "loss": 0.79809022, "num_input_tokens_seen": 177048390, "step": 8203, "time_per_iteration": 2.7083680629730225 }, { "auxiliary_loss_clip": 0.01060772, "auxiliary_loss_mlp": 0.01001932, "balance_loss_clip": 1.00680327, "balance_loss_mlp": 1.00086522, "epoch": 0.9864726748031023, "flos": 52769977920000.0, "grad_norm": 0.7335066572673832, "language_loss": 0.56517178, "learning_rate": 1.9026131125019495e-09, "loss": 0.5857988, "num_input_tokens_seen": 177105760, "step": 8204, "time_per_iteration": 3.17681884765625 }, { "auxiliary_loss_clip": 0.01157015, "auxiliary_loss_mlp": 0.01028313, "balance_loss_clip": 1.04520512, "balance_loss_mlp": 1.02104759, "epoch": 0.9865929176937414, "flos": 23364631526400.0, "grad_norm": 1.8766321158646768, "language_loss": 0.8693276, "learning_rate": 1.8687948219371363e-09, "loss": 0.89118087, "num_input_tokens_seen": 177124985, "step": 8205, "time_per_iteration": 2.688206911087036 }, { "auxiliary_loss_clip": 0.01167972, "auxiliary_loss_mlp": 0.01027102, "balance_loss_clip": 1.04585624, "balance_loss_mlp": 1.01919234, "epoch": 0.9867131605843804, "flos": 21616679986560.0, "grad_norm": 2.93900282170903, "language_loss": 0.88302016, "learning_rate": 1.835279646287491e-09, "loss": 0.90497088, "num_input_tokens_seen": 177142995, "step": 8206, "time_per_iteration": 2.6024181842803955 }, { "auxiliary_loss_clip": 0.0117335, "auxiliary_loss_mlp": 0.01031323, "balance_loss_clip": 1.05125773, "balance_loss_mlp": 1.02341938, "epoch": 0.9868334034750196, "flos": 22271403139200.0, "grad_norm": 1.6551785190537225, "language_loss": 0.76460016, "learning_rate": 1.8020675906371685e-09, "loss": 0.78664684, "num_input_tokens_seen": 177162390, "step": 8207, "time_per_iteration": 2.6943774223327637 }, { "auxiliary_loss_clip": 0.01149594, "auxiliary_loss_mlp": 0.01021653, "balance_loss_clip": 1.04455435, "balance_loss_mlp": 1.01488233, "epoch": 0.9869536463656586, "flos": 25809573548160.0, "grad_norm": 2.6987183390432614, "language_loss": 0.75425577, "learning_rate": 1.7691586600243612e-09, "loss": 0.77596831, "num_input_tokens_seen": 177181290, "step": 8208, "time_per_iteration": 2.733654737472534 }, { "auxiliary_loss_clip": 0.01156445, "auxiliary_loss_mlp": 0.01024302, "balance_loss_clip": 1.04706597, "balance_loss_mlp": 1.01688671, "epoch": 0.9870738892562977, "flos": 16398500613120.0, "grad_norm": 2.4962312721402737, "language_loss": 0.86867106, "learning_rate": 1.7365528594415202e-09, "loss": 0.89047861, "num_input_tokens_seen": 177195360, "step": 8209, "time_per_iteration": 2.7022504806518555 }, { "auxiliary_loss_clip": 0.01166007, "auxiliary_loss_mlp": 0.01055444, "balance_loss_clip": 1.04610062, "balance_loss_mlp": 1.01953244, "epoch": 0.9871941321469369, "flos": 35481358373760.0, "grad_norm": 3.6050568128118594, "language_loss": 0.67520607, "learning_rate": 1.7042501938346888e-09, "loss": 0.69742054, "num_input_tokens_seen": 177218090, "step": 8210, "time_per_iteration": 2.753053903579712 }, { "auxiliary_loss_clip": 0.01147589, "auxiliary_loss_mlp": 0.01025431, "balance_loss_clip": 1.04314351, "balance_loss_mlp": 1.01860356, "epoch": 0.9873143750375759, "flos": 21434217874560.0, "grad_norm": 1.875839997406283, "language_loss": 0.76607573, "learning_rate": 1.6722506681043913e-09, "loss": 0.78780586, "num_input_tokens_seen": 177237050, "step": 8211, "time_per_iteration": 2.7118358612060547 }, { "auxiliary_loss_clip": 0.01162676, "auxiliary_loss_mlp": 0.01026454, "balance_loss_clip": 1.04630995, "balance_loss_mlp": 1.01960254, "epoch": 0.987434617928215, "flos": 16326499800960.0, "grad_norm": 2.235444376377202, "language_loss": 0.69235671, "learning_rate": 1.640554287104745e-09, "loss": 0.714248, "num_input_tokens_seen": 177255325, "step": 8212, "time_per_iteration": 2.740288019180298 }, { "auxiliary_loss_clip": 0.01161509, "auxiliary_loss_mlp": 0.0102507, "balance_loss_clip": 1.04633558, "balance_loss_mlp": 1.01744092, "epoch": 0.9875548608188541, "flos": 17851984456320.0, "grad_norm": 11.415792777435946, "language_loss": 0.79871899, "learning_rate": 1.609161055644348e-09, "loss": 0.82058477, "num_input_tokens_seen": 177271250, "step": 8213, "time_per_iteration": 3.666745185852051 }, { "auxiliary_loss_clip": 0.01169661, "auxiliary_loss_mlp": 0.0102114, "balance_loss_clip": 1.04741478, "balance_loss_mlp": 1.01347446, "epoch": 0.9876751037094932, "flos": 26132876887680.0, "grad_norm": 2.1228224941431706, "language_loss": 0.68265021, "learning_rate": 1.5780709784849467e-09, "loss": 0.70455819, "num_input_tokens_seen": 177288270, "step": 8214, "time_per_iteration": 2.936660051345825 }, { "auxiliary_loss_clip": 0.01152597, "auxiliary_loss_mlp": 0.01025957, "balance_loss_clip": 1.04816198, "balance_loss_mlp": 1.01815522, "epoch": 0.9877953466001322, "flos": 15991344973440.0, "grad_norm": 1.9875298070571414, "language_loss": 0.8233912, "learning_rate": 1.5472840603436565e-09, "loss": 0.84517676, "num_input_tokens_seen": 177305500, "step": 8215, "time_per_iteration": 3.7143759727478027 }, { "auxiliary_loss_clip": 0.0116095, "auxiliary_loss_mlp": 0.01020912, "balance_loss_clip": 1.04589653, "balance_loss_mlp": 1.01419139, "epoch": 0.9879155894907714, "flos": 18806777827200.0, "grad_norm": 2.095730703156923, "language_loss": 0.7801578, "learning_rate": 1.5168003058900757e-09, "loss": 0.80197644, "num_input_tokens_seen": 177323500, "step": 8216, "time_per_iteration": 2.71429443359375 }, { "auxiliary_loss_clip": 0.0115611, "auxiliary_loss_mlp": 0.01024714, "balance_loss_clip": 1.0461638, "balance_loss_mlp": 1.01723981, "epoch": 0.9880358323814105, "flos": 22382044007040.0, "grad_norm": 2.0098648279499236, "language_loss": 0.91956329, "learning_rate": 1.4866197197491715e-09, "loss": 0.94137156, "num_input_tokens_seen": 177342860, "step": 8217, "time_per_iteration": 2.774540662765503 }, { "auxiliary_loss_clip": 0.01165821, "auxiliary_loss_mlp": 0.01052572, "balance_loss_clip": 1.04680443, "balance_loss_mlp": 1.01505423, "epoch": 0.9881560752720495, "flos": 15668831733120.0, "grad_norm": 4.973417256046709, "language_loss": 0.79371536, "learning_rate": 1.4567423064988371e-09, "loss": 0.81589925, "num_input_tokens_seen": 177360210, "step": 8218, "time_per_iteration": 2.6359288692474365 }, { "auxiliary_loss_clip": 0.01167191, "auxiliary_loss_mlp": 0.01029725, "balance_loss_clip": 1.04632831, "balance_loss_mlp": 1.02254868, "epoch": 0.9882763181626887, "flos": 21500113374720.0, "grad_norm": 2.2279981760964116, "language_loss": 0.77856529, "learning_rate": 1.4271680706718913e-09, "loss": 0.80053437, "num_input_tokens_seen": 177377885, "step": 8219, "time_per_iteration": 2.6864516735076904 }, { "auxiliary_loss_clip": 0.01166283, "auxiliary_loss_mlp": 0.01029331, "balance_loss_clip": 1.04877973, "balance_loss_mlp": 1.02152276, "epoch": 0.9883965610533277, "flos": 28034598551040.0, "grad_norm": 2.495677795636564, "language_loss": 0.82762074, "learning_rate": 1.3978970167543013e-09, "loss": 0.84957683, "num_input_tokens_seen": 177398065, "step": 8220, "time_per_iteration": 2.6904304027557373 }, { "auxiliary_loss_clip": 0.01150656, "auxiliary_loss_mlp": 0.01026167, "balance_loss_clip": 1.04581809, "balance_loss_mlp": 1.01904738, "epoch": 0.9885168039439668, "flos": 14098601710080.0, "grad_norm": 2.8709154293232944, "language_loss": 0.77861863, "learning_rate": 1.3689291491867372e-09, "loss": 0.80038691, "num_input_tokens_seen": 177416380, "step": 8221, "time_per_iteration": 2.665255069732666 }, { "auxiliary_loss_clip": 0.01165926, "auxiliary_loss_mlp": 0.0102563, "balance_loss_clip": 1.04598939, "balance_loss_mlp": 1.01794672, "epoch": 0.988637046834606, "flos": 26432013352320.0, "grad_norm": 2.9608035418023357, "language_loss": 0.73570198, "learning_rate": 1.3402644723636836e-09, "loss": 0.75761753, "num_input_tokens_seen": 177438410, "step": 8222, "time_per_iteration": 2.670077323913574 }, { "auxiliary_loss_clip": 0.01153185, "auxiliary_loss_mlp": 0.01029586, "balance_loss_clip": 1.04571497, "balance_loss_mlp": 1.02186155, "epoch": 0.988757289725245, "flos": 25229113764480.0, "grad_norm": 1.975987894460889, "language_loss": 0.83487117, "learning_rate": 1.311902990633218e-09, "loss": 0.85669881, "num_input_tokens_seen": 177457375, "step": 8223, "time_per_iteration": 3.5957624912261963 }, { "auxiliary_loss_clip": 0.01153484, "auxiliary_loss_mlp": 0.01022304, "balance_loss_clip": 1.04452777, "balance_loss_mlp": 1.01577151, "epoch": 0.9888775326158841, "flos": 26359042872960.0, "grad_norm": 1.712749019297393, "language_loss": 0.71404266, "learning_rate": 1.2838447082978987e-09, "loss": 0.73580062, "num_input_tokens_seen": 177478530, "step": 8224, "time_per_iteration": 2.7827672958374023 }, { "auxiliary_loss_clip": 0.01159742, "auxiliary_loss_mlp": 0.01029412, "balance_loss_clip": 1.04559827, "balance_loss_mlp": 1.02216673, "epoch": 0.9889977755065231, "flos": 24316120846080.0, "grad_norm": 2.573513442909767, "language_loss": 0.82897371, "learning_rate": 1.2560896296143208e-09, "loss": 0.85086524, "num_input_tokens_seen": 177496995, "step": 8225, "time_per_iteration": 2.7030091285705566 }, { "auxiliary_loss_clip": 0.0116385, "auxiliary_loss_mlp": 0.01026984, "balance_loss_clip": 1.04529428, "balance_loss_mlp": 1.01999271, "epoch": 0.9891180183971623, "flos": 18951066760320.0, "grad_norm": 18.07144224740037, "language_loss": 0.82044065, "learning_rate": 1.2286377587926722e-09, "loss": 0.84234893, "num_input_tokens_seen": 177513785, "step": 8226, "time_per_iteration": 2.658484935760498 }, { "auxiliary_loss_clip": 0.01163679, "auxiliary_loss_mlp": 0.01022123, "balance_loss_clip": 1.04405665, "balance_loss_mlp": 1.01503003, "epoch": 0.9892382612878013, "flos": 26176580760960.0, "grad_norm": 3.2136551592977973, "language_loss": 0.75104272, "learning_rate": 1.2014890999973992e-09, "loss": 0.7729007, "num_input_tokens_seen": 177530705, "step": 8227, "time_per_iteration": 2.6272075176239014 }, { "auxiliary_loss_clip": 0.01163281, "auxiliary_loss_mlp": 0.01020729, "balance_loss_clip": 1.04467928, "balance_loss_mlp": 1.01423216, "epoch": 0.9893585041784404, "flos": 25449605400960.0, "grad_norm": 1.5664930194195663, "language_loss": 0.78833944, "learning_rate": 1.1746436573472073e-09, "loss": 0.81017953, "num_input_tokens_seen": 177552440, "step": 8228, "time_per_iteration": 2.7062838077545166 }, { "auxiliary_loss_clip": 0.01169131, "auxiliary_loss_mlp": 0.01025063, "balance_loss_clip": 1.04913306, "balance_loss_mlp": 1.01739168, "epoch": 0.9894787470690796, "flos": 20189302352640.0, "grad_norm": 2.680610249355251, "language_loss": 0.69220722, "learning_rate": 1.1481014349141726e-09, "loss": 0.71414924, "num_input_tokens_seen": 177569660, "step": 8229, "time_per_iteration": 2.640001058578491 }, { "auxiliary_loss_clip": 0.01161242, "auxiliary_loss_mlp": 0.01029859, "balance_loss_clip": 1.04669309, "balance_loss_mlp": 1.02175021, "epoch": 0.9895989899597186, "flos": 24644308435200.0, "grad_norm": 2.967500261130945, "language_loss": 0.8452332, "learning_rate": 1.121862436724852e-09, "loss": 0.86714423, "num_input_tokens_seen": 177588500, "step": 8230, "time_per_iteration": 2.725677251815796 }, { "auxiliary_loss_clip": 0.01165566, "auxiliary_loss_mlp": 0.01026725, "balance_loss_clip": 1.05092859, "balance_loss_mlp": 1.01962614, "epoch": 0.9897192328503577, "flos": 21799034357760.0, "grad_norm": 1.6067916104039375, "language_loss": 0.70405787, "learning_rate": 1.0959266667598388e-09, "loss": 0.72598082, "num_input_tokens_seen": 177607315, "step": 8231, "time_per_iteration": 2.6022045612335205 }, { "auxiliary_loss_clip": 0.01160615, "auxiliary_loss_mlp": 0.01026487, "balance_loss_clip": 1.04807901, "balance_loss_mlp": 1.0183984, "epoch": 0.9898394757409968, "flos": 21325229032320.0, "grad_norm": 1.92149428989831, "language_loss": 0.74629772, "learning_rate": 1.0702941289533196e-09, "loss": 0.76816875, "num_input_tokens_seen": 177625990, "step": 8232, "time_per_iteration": 2.7313430309295654 }, { "auxiliary_loss_clip": 0.01154113, "auxiliary_loss_mlp": 0.01024926, "balance_loss_clip": 1.04576802, "balance_loss_mlp": 1.0179193, "epoch": 0.9899597186316359, "flos": 18545024442240.0, "grad_norm": 1.8262602026023578, "language_loss": 0.89009941, "learning_rate": 1.0449648271939615e-09, "loss": 0.91188979, "num_input_tokens_seen": 177642335, "step": 8233, "time_per_iteration": 2.6452741622924805 }, { "auxiliary_loss_clip": 0.01156695, "auxiliary_loss_mlp": 0.01055416, "balance_loss_clip": 1.04807353, "balance_loss_mlp": 1.0198288, "epoch": 0.990079961522275, "flos": 23766723348480.0, "grad_norm": 1.596555217555989, "language_loss": 0.73243541, "learning_rate": 1.0199387653240243e-09, "loss": 0.75455654, "num_input_tokens_seen": 177662025, "step": 8234, "time_per_iteration": 2.806842088699341 }, { "auxiliary_loss_clip": 0.0115276, "auxiliary_loss_mlp": 0.01024442, "balance_loss_clip": 1.0462954, "balance_loss_mlp": 1.01769471, "epoch": 0.9902002044129141, "flos": 16399182971520.0, "grad_norm": 1.630832564530147, "language_loss": 0.70695919, "learning_rate": 9.952159471400267e-10, "loss": 0.72873116, "num_input_tokens_seen": 177679065, "step": 8235, "time_per_iteration": 2.705556631088257 }, { "auxiliary_loss_clip": 0.01162513, "auxiliary_loss_mlp": 0.01053282, "balance_loss_clip": 1.0450021, "balance_loss_mlp": 1.01683879, "epoch": 0.9903204473035532, "flos": 22559657783040.0, "grad_norm": 1.9875310012260412, "language_loss": 0.84356034, "learning_rate": 9.707963763923022e-10, "loss": 0.86571831, "num_input_tokens_seen": 177698115, "step": 8236, "time_per_iteration": 2.6764886379241943 }, { "auxiliary_loss_clip": 0.01158438, "auxiliary_loss_mlp": 0.0102484, "balance_loss_clip": 1.04576218, "balance_loss_mlp": 1.01807809, "epoch": 0.9904406901941922, "flos": 16144001775360.0, "grad_norm": 1.7003128912826417, "language_loss": 0.79191911, "learning_rate": 9.466800567854427e-10, "loss": 0.81375194, "num_input_tokens_seen": 177716715, "step": 8237, "time_per_iteration": 2.6394710540771484 }, { "auxiliary_loss_clip": 0.01159234, "auxiliary_loss_mlp": 0.01030884, "balance_loss_clip": 1.04701591, "balance_loss_mlp": 1.02307272, "epoch": 0.9905609330848314, "flos": 26651499408000.0, "grad_norm": 1.920273952658413, "language_loss": 0.6822567, "learning_rate": 9.228669919778553e-10, "loss": 0.70415795, "num_input_tokens_seen": 177735640, "step": 8238, "time_per_iteration": 2.7471041679382324 }, { "auxiliary_loss_clip": 0.01157147, "auxiliary_loss_mlp": 0.01027251, "balance_loss_clip": 1.04670286, "balance_loss_mlp": 1.01984251, "epoch": 0.9906811759754705, "flos": 23111820627840.0, "grad_norm": 2.3960570945535213, "language_loss": 0.79425722, "learning_rate": 8.993571855817617e-10, "loss": 0.81610119, "num_input_tokens_seen": 177754470, "step": 8239, "time_per_iteration": 3.734729290008545 }, { "auxiliary_loss_clip": 0.01160898, "auxiliary_loss_mlp": 0.01020807, "balance_loss_clip": 1.04515862, "balance_loss_mlp": 1.01376224, "epoch": 0.9908014188661095, "flos": 22090593052800.0, "grad_norm": 2.0638606462615807, "language_loss": 0.7468394, "learning_rate": 8.761506411638642e-10, "loss": 0.76865649, "num_input_tokens_seen": 177773935, "step": 8240, "time_per_iteration": 2.6366920471191406 }, { "auxiliary_loss_clip": 0.01156748, "auxiliary_loss_mlp": 0.01026214, "balance_loss_clip": 1.04588604, "balance_loss_mlp": 1.0189395, "epoch": 0.9909216617567487, "flos": 19242948677760.0, "grad_norm": 1.723946454715992, "language_loss": 0.73645061, "learning_rate": 8.53247362244236e-10, "loss": 0.75828016, "num_input_tokens_seen": 177792745, "step": 8241, "time_per_iteration": 3.692875623703003 }, { "auxiliary_loss_clip": 0.01159047, "auxiliary_loss_mlp": 0.01031253, "balance_loss_clip": 1.04595101, "balance_loss_mlp": 1.02460146, "epoch": 0.9910419046473877, "flos": 23621213352960.0, "grad_norm": 1.5977835890144267, "language_loss": 0.68290591, "learning_rate": 8.306473522976532e-10, "loss": 0.70480889, "num_input_tokens_seen": 177812150, "step": 8242, "time_per_iteration": 2.725365400314331 }, { "auxiliary_loss_clip": 0.01163815, "auxiliary_loss_mlp": 0.01030945, "balance_loss_clip": 1.04510903, "balance_loss_mlp": 1.02396882, "epoch": 0.9911621475380268, "flos": 22711380831360.0, "grad_norm": 1.7222273220500899, "language_loss": 0.72086823, "learning_rate": 8.083506147522623e-10, "loss": 0.74281579, "num_input_tokens_seen": 177831545, "step": 8243, "time_per_iteration": 2.7920045852661133 }, { "auxiliary_loss_clip": 0.01156454, "auxiliary_loss_mlp": 0.01027854, "balance_loss_clip": 1.04529142, "balance_loss_mlp": 1.02019525, "epoch": 0.991282390428666, "flos": 13516956777600.0, "grad_norm": 2.17447423050606, "language_loss": 0.85135865, "learning_rate": 7.863571529906909e-10, "loss": 0.87320167, "num_input_tokens_seen": 177847130, "step": 8244, "time_per_iteration": 2.705475330352783 }, { "auxiliary_loss_clip": 0.01059569, "auxiliary_loss_mlp": 0.01002457, "balance_loss_clip": 1.007002, "balance_loss_mlp": 1.00146759, "epoch": 0.991402633319305, "flos": 61830492071040.0, "grad_norm": 0.7271261407016355, "language_loss": 0.59657776, "learning_rate": 7.646669703489372e-10, "loss": 0.61719799, "num_input_tokens_seen": 177911440, "step": 8245, "time_per_iteration": 3.3998305797576904 }, { "auxiliary_loss_clip": 0.01149376, "auxiliary_loss_mlp": 0.01022858, "balance_loss_clip": 1.04676676, "balance_loss_mlp": 1.01570225, "epoch": 0.9915228762099441, "flos": 18770148933120.0, "grad_norm": 2.8621194013491755, "language_loss": 0.57247162, "learning_rate": 7.432800701177023e-10, "loss": 0.59419399, "num_input_tokens_seen": 177929440, "step": 8246, "time_per_iteration": 2.8555495738983154 }, { "auxiliary_loss_clip": 0.01061594, "auxiliary_loss_mlp": 0.01001832, "balance_loss_clip": 1.00899124, "balance_loss_mlp": 1.00089002, "epoch": 0.9916431191005832, "flos": 65936660244480.0, "grad_norm": 0.7928284273121137, "language_loss": 0.57758129, "learning_rate": 7.221964555415017e-10, "loss": 0.59821558, "num_input_tokens_seen": 177989100, "step": 8247, "time_per_iteration": 3.2087576389312744 }, { "auxiliary_loss_clip": 0.0115392, "auxiliary_loss_mlp": 0.01021163, "balance_loss_clip": 1.04394054, "balance_loss_mlp": 1.01454091, "epoch": 0.9917633619912223, "flos": 16581573256320.0, "grad_norm": 1.8652046861638503, "language_loss": 0.7514112, "learning_rate": 7.01416129818222e-10, "loss": 0.77316201, "num_input_tokens_seen": 178006720, "step": 8248, "time_per_iteration": 2.724334478378296 }, { "auxiliary_loss_clip": 0.01162564, "auxiliary_loss_mlp": 0.01026974, "balance_loss_clip": 1.04596698, "balance_loss_mlp": 1.02017939, "epoch": 0.9918836048818613, "flos": 25411108999680.0, "grad_norm": 2.313137311112631, "language_loss": 0.5870108, "learning_rate": 6.809390961006745e-10, "loss": 0.60890621, "num_input_tokens_seen": 178026850, "step": 8249, "time_per_iteration": 2.8419907093048096 }, { "auxiliary_loss_clip": 0.01159999, "auxiliary_loss_mlp": 0.01031368, "balance_loss_clip": 1.04648316, "balance_loss_mlp": 1.02412939, "epoch": 0.9920038477725005, "flos": 25046867134080.0, "grad_norm": 1.951221305481963, "language_loss": 0.6853022, "learning_rate": 6.607653574948191e-10, "loss": 0.70721585, "num_input_tokens_seen": 178047630, "step": 8250, "time_per_iteration": 3.6675870418548584 }, { "auxiliary_loss_clip": 0.0115525, "auxiliary_loss_mlp": 0.01021409, "balance_loss_clip": 1.04467559, "balance_loss_mlp": 1.01420236, "epoch": 0.9921240906631396, "flos": 21829773421440.0, "grad_norm": 1.762781392321252, "language_loss": 0.82032388, "learning_rate": 6.408949170613187e-10, "loss": 0.84209049, "num_input_tokens_seen": 178066895, "step": 8251, "time_per_iteration": 2.7111611366271973 }, { "auxiliary_loss_clip": 0.01159182, "auxiliary_loss_mlp": 0.01030859, "balance_loss_clip": 1.04741073, "balance_loss_mlp": 1.02235961, "epoch": 0.9922443335537786, "flos": 24864225454080.0, "grad_norm": 2.5318341925157677, "language_loss": 0.81867582, "learning_rate": 6.213277778144288e-10, "loss": 0.84057623, "num_input_tokens_seen": 178088540, "step": 8252, "time_per_iteration": 2.790966033935547 }, { "auxiliary_loss_clip": 0.01152477, "auxiliary_loss_mlp": 0.010252, "balance_loss_clip": 1.04552186, "balance_loss_mlp": 1.01766551, "epoch": 0.9923645764444178, "flos": 21613088626560.0, "grad_norm": 1.975457128749148, "language_loss": 0.67092454, "learning_rate": 6.020639427224416e-10, "loss": 0.69270134, "num_input_tokens_seen": 178106185, "step": 8253, "time_per_iteration": 2.7655420303344727 }, { "auxiliary_loss_clip": 0.01156575, "auxiliary_loss_mlp": 0.01019707, "balance_loss_clip": 1.04540157, "balance_loss_mlp": 1.01290047, "epoch": 0.9924848193350568, "flos": 25001798544000.0, "grad_norm": 2.010375761102074, "language_loss": 0.72692919, "learning_rate": 5.831034147076864e-10, "loss": 0.74869192, "num_input_tokens_seen": 178123435, "step": 8254, "time_per_iteration": 2.7383618354797363 }, { "auxiliary_loss_clip": 0.01057582, "auxiliary_loss_mlp": 0.01003877, "balance_loss_clip": 1.0075314, "balance_loss_mlp": 1.00300109, "epoch": 0.9926050622256959, "flos": 68912543151360.0, "grad_norm": 0.6881232496529586, "language_loss": 0.5571866, "learning_rate": 5.644461966463065e-10, "loss": 0.57780123, "num_input_tokens_seen": 178191045, "step": 8255, "time_per_iteration": 3.3380630016326904 }, { "auxiliary_loss_clip": 0.01159299, "auxiliary_loss_mlp": 0.01028066, "balance_loss_clip": 1.0466516, "balance_loss_mlp": 1.02161634, "epoch": 0.9927253051163349, "flos": 20923675914240.0, "grad_norm": 1.7296265417616907, "language_loss": 0.75795895, "learning_rate": 5.460922913687049e-10, "loss": 0.7798326, "num_input_tokens_seen": 178210135, "step": 8256, "time_per_iteration": 2.725337505340576 }, { "auxiliary_loss_clip": 0.01152741, "auxiliary_loss_mlp": 0.01048679, "balance_loss_clip": 1.04585671, "balance_loss_mlp": 1.01470578, "epoch": 0.9928455480069741, "flos": 22308211601280.0, "grad_norm": 2.344218758523982, "language_loss": 0.75576401, "learning_rate": 5.280417016593208e-10, "loss": 0.77777821, "num_input_tokens_seen": 178229925, "step": 8257, "time_per_iteration": 2.945189952850342 }, { "auxiliary_loss_clip": 0.01161102, "auxiliary_loss_mlp": 0.01054717, "balance_loss_clip": 1.04948092, "balance_loss_mlp": 1.01892841, "epoch": 0.9929657908976132, "flos": 17383889393280.0, "grad_norm": 1.6297274276414502, "language_loss": 0.74856889, "learning_rate": 5.102944302559642e-10, "loss": 0.77072704, "num_input_tokens_seen": 178247420, "step": 8258, "time_per_iteration": 2.7337734699249268 }, { "auxiliary_loss_clip": 0.01160101, "auxiliary_loss_mlp": 0.01027878, "balance_loss_clip": 1.04938102, "balance_loss_mlp": 1.02047825, "epoch": 0.9930860337882522, "flos": 22674680110080.0, "grad_norm": 2.296763200675079, "language_loss": 0.7983638, "learning_rate": 4.9285047985137e-10, "loss": 0.8202436, "num_input_tokens_seen": 178266840, "step": 8259, "time_per_iteration": 2.913344383239746 }, { "auxiliary_loss_clip": 0.01170627, "auxiliary_loss_mlp": 0.01030584, "balance_loss_clip": 1.05018735, "balance_loss_mlp": 1.02297211, "epoch": 0.9932062766788914, "flos": 28147789284480.0, "grad_norm": 2.07606108691114, "language_loss": 0.74413216, "learning_rate": 4.757098530916436e-10, "loss": 0.76614428, "num_input_tokens_seen": 178287285, "step": 8260, "time_per_iteration": 2.8477890491485596 }, { "auxiliary_loss_clip": 0.01165385, "auxiliary_loss_mlp": 0.01023842, "balance_loss_clip": 1.04794216, "balance_loss_mlp": 1.0158906, "epoch": 0.9933265195695304, "flos": 20156659868160.0, "grad_norm": 2.7094814486403602, "language_loss": 0.77282012, "learning_rate": 4.5887255257670563e-10, "loss": 0.79471242, "num_input_tokens_seen": 178304325, "step": 8261, "time_per_iteration": 2.812209367752075 }, { "auxiliary_loss_clip": 0.01164624, "auxiliary_loss_mlp": 0.0102359, "balance_loss_clip": 1.04497921, "balance_loss_mlp": 1.01626801, "epoch": 0.9934467624601695, "flos": 21362037494400.0, "grad_norm": 3.0077842674705155, "language_loss": 0.76697993, "learning_rate": 4.4233858086117906e-10, "loss": 0.78886205, "num_input_tokens_seen": 178322850, "step": 8262, "time_per_iteration": 2.8007009029388428 }, { "auxiliary_loss_clip": 0.01151523, "auxiliary_loss_mlp": 0.01025982, "balance_loss_clip": 1.04944682, "balance_loss_mlp": 1.01885343, "epoch": 0.9935670053508087, "flos": 19756040503680.0, "grad_norm": 2.0233494923441446, "language_loss": 0.67783833, "learning_rate": 4.261079404528356e-10, "loss": 0.69961333, "num_input_tokens_seen": 178342330, "step": 8263, "time_per_iteration": 2.897472381591797 }, { "auxiliary_loss_clip": 0.01161775, "auxiliary_loss_mlp": 0.01025379, "balance_loss_clip": 1.0484426, "balance_loss_mlp": 1.01752865, "epoch": 0.9936872482414477, "flos": 21978838863360.0, "grad_norm": 2.7949196715699625, "language_loss": 0.69121003, "learning_rate": 4.1018063381437205e-10, "loss": 0.71308154, "num_input_tokens_seen": 178362715, "step": 8264, "time_per_iteration": 2.80926513671875 }, { "auxiliary_loss_clip": 0.01057759, "auxiliary_loss_mlp": 0.01001413, "balance_loss_clip": 1.0105226, "balance_loss_mlp": 1.00044692, "epoch": 0.9938074911320868, "flos": 69810667839360.0, "grad_norm": 0.8650900955308441, "language_loss": 0.61092424, "learning_rate": 3.9455666336141167e-10, "loss": 0.63151598, "num_input_tokens_seen": 178426495, "step": 8265, "time_per_iteration": 5.369746208190918 }, { "auxiliary_loss_clip": 0.01165288, "auxiliary_loss_mlp": 0.01024406, "balance_loss_clip": 1.04749155, "balance_loss_mlp": 1.01714325, "epoch": 0.9939277340227259, "flos": 15084170058240.0, "grad_norm": 2.7353701237444756, "language_loss": 0.82979524, "learning_rate": 3.7923603146450267e-10, "loss": 0.8516922, "num_input_tokens_seen": 178442555, "step": 8266, "time_per_iteration": 2.6187565326690674 }, { "auxiliary_loss_clip": 0.01156438, "auxiliary_loss_mlp": 0.01021378, "balance_loss_clip": 1.04472089, "balance_loss_mlp": 1.01436877, "epoch": 0.994047976913365, "flos": 17712364291200.0, "grad_norm": 1.948124410150637, "language_loss": 0.8105669, "learning_rate": 3.642187404473418e-10, "loss": 0.83234513, "num_input_tokens_seen": 178460715, "step": 8267, "time_per_iteration": 2.6386568546295166 }, { "auxiliary_loss_clip": 0.01162126, "auxiliary_loss_mlp": 0.01022948, "balance_loss_clip": 1.04599786, "balance_loss_mlp": 1.01591742, "epoch": 0.994168219804004, "flos": 19171558396800.0, "grad_norm": 2.1560914896627934, "language_loss": 0.85936272, "learning_rate": 3.495047925885508e-10, "loss": 0.88121349, "num_input_tokens_seen": 178479050, "step": 8268, "time_per_iteration": 3.4749464988708496 }, { "auxiliary_loss_clip": 0.01161656, "auxiliary_loss_mlp": 0.01028188, "balance_loss_clip": 1.04926705, "balance_loss_mlp": 1.02076721, "epoch": 0.9942884626946432, "flos": 17851589406720.0, "grad_norm": 2.206359369020803, "language_loss": 0.82736242, "learning_rate": 3.350941901199e-10, "loss": 0.84926093, "num_input_tokens_seen": 178495970, "step": 8269, "time_per_iteration": 2.6565699577331543 }, { "auxiliary_loss_clip": 0.01163907, "auxiliary_loss_mlp": 0.01023605, "balance_loss_clip": 1.04702306, "balance_loss_mlp": 1.01668453, "epoch": 0.9944087055852823, "flos": 18796578364800.0, "grad_norm": 2.3794067478741487, "language_loss": 0.83475029, "learning_rate": 3.2098693522764066e-10, "loss": 0.85662538, "num_input_tokens_seen": 178509170, "step": 8270, "time_per_iteration": 2.6298582553863525 }, { "auxiliary_loss_clip": 0.01164648, "auxiliary_loss_mlp": 0.01056217, "balance_loss_clip": 1.04634845, "balance_loss_mlp": 1.01974189, "epoch": 0.9945289484759213, "flos": 20996969616000.0, "grad_norm": 2.0628703852206636, "language_loss": 0.8130548, "learning_rate": 3.071830300516165e-10, "loss": 0.83526343, "num_input_tokens_seen": 178527000, "step": 8271, "time_per_iteration": 2.7769296169281006 }, { "auxiliary_loss_clip": 0.01167398, "auxiliary_loss_mlp": 0.01027886, "balance_loss_clip": 1.04688346, "balance_loss_mlp": 1.02032197, "epoch": 0.9946491913665605, "flos": 14756952136320.0, "grad_norm": 2.4050658765018555, "language_loss": 0.71155941, "learning_rate": 2.9368247668615234e-10, "loss": 0.73351222, "num_input_tokens_seen": 178545590, "step": 8272, "time_per_iteration": 2.7019457817077637 }, { "auxiliary_loss_clip": 0.01170948, "auxiliary_loss_mlp": 0.01026611, "balance_loss_clip": 1.04933786, "balance_loss_mlp": 1.01836789, "epoch": 0.9947694342571995, "flos": 12669931186560.0, "grad_norm": 3.8805537039735825, "language_loss": 0.61577046, "learning_rate": 2.804852771789434e-10, "loss": 0.63774604, "num_input_tokens_seen": 178558890, "step": 8273, "time_per_iteration": 2.576727867126465 }, { "auxiliary_loss_clip": 0.01161423, "auxiliary_loss_mlp": 0.01029679, "balance_loss_clip": 1.044209, "balance_loss_mlp": 1.02259183, "epoch": 0.9948896771478386, "flos": 18843442634880.0, "grad_norm": 1.6956902260032667, "language_loss": 0.55757654, "learning_rate": 2.675914335321661e-10, "loss": 0.57948756, "num_input_tokens_seen": 178577645, "step": 8274, "time_per_iteration": 2.606935739517212 }, { "auxiliary_loss_clip": 0.01165964, "auxiliary_loss_mlp": 0.0102774, "balance_loss_clip": 1.04650855, "balance_loss_mlp": 1.01920509, "epoch": 0.9950099200384778, "flos": 24900207903360.0, "grad_norm": 2.694262203717336, "language_loss": 0.78605497, "learning_rate": 2.550009477018111e-10, "loss": 0.80799198, "num_input_tokens_seen": 178596415, "step": 8275, "time_per_iteration": 2.6399641036987305 }, { "auxiliary_loss_clip": 0.01158127, "auxiliary_loss_mlp": 0.01052202, "balance_loss_clip": 1.04666233, "balance_loss_mlp": 1.01653242, "epoch": 0.9951301629291168, "flos": 23733613987200.0, "grad_norm": 2.66798525270853, "language_loss": 0.63005692, "learning_rate": 2.4271382159790634e-10, "loss": 0.65216023, "num_input_tokens_seen": 178613845, "step": 8276, "time_per_iteration": 3.535845994949341 }, { "auxiliary_loss_clip": 0.01162375, "auxiliary_loss_mlp": 0.0102898, "balance_loss_clip": 1.0488081, "balance_loss_mlp": 1.02167535, "epoch": 0.9952504058197559, "flos": 22236893147520.0, "grad_norm": 1.7074647163500662, "language_loss": 0.85845578, "learning_rate": 2.3073005708429406e-10, "loss": 0.88036931, "num_input_tokens_seen": 178633490, "step": 8277, "time_per_iteration": 2.7006025314331055 }, { "auxiliary_loss_clip": 0.01152519, "auxiliary_loss_mlp": 0.01024682, "balance_loss_clip": 1.04729605, "balance_loss_mlp": 1.01854241, "epoch": 0.995370648710395, "flos": 21211032718080.0, "grad_norm": 1.7119034136753388, "language_loss": 0.72174013, "learning_rate": 2.190496559788535e-10, "loss": 0.74351215, "num_input_tokens_seen": 178651775, "step": 8278, "time_per_iteration": 2.679499387741089 }, { "auxiliary_loss_clip": 0.01157253, "auxiliary_loss_mlp": 0.01024063, "balance_loss_clip": 1.04704547, "balance_loss_mlp": 1.01657653, "epoch": 0.9954908916010341, "flos": 14866731077760.0, "grad_norm": 2.702958735081706, "language_loss": 0.76840854, "learning_rate": 2.0767262005372265e-10, "loss": 0.79022169, "num_input_tokens_seen": 178669290, "step": 8279, "time_per_iteration": 2.603111505508423 }, { "auxiliary_loss_clip": 0.01162278, "auxiliary_loss_mlp": 0.01028544, "balance_loss_clip": 1.04577541, "balance_loss_mlp": 1.02103972, "epoch": 0.9956111344916732, "flos": 19208259118080.0, "grad_norm": 1.9795271506957766, "language_loss": 0.75290126, "learning_rate": 1.965989510346322e-10, "loss": 0.77480942, "num_input_tokens_seen": 178688410, "step": 8280, "time_per_iteration": 2.641353130340576 }, { "auxiliary_loss_clip": 0.0114913, "auxiliary_loss_mlp": 0.01026185, "balance_loss_clip": 1.04652345, "balance_loss_mlp": 1.01828098, "epoch": 0.9957313773823123, "flos": 20047060494720.0, "grad_norm": 3.68228955521542, "language_loss": 0.71575809, "learning_rate": 1.8582865060134955e-10, "loss": 0.73751122, "num_input_tokens_seen": 178706600, "step": 8281, "time_per_iteration": 2.6841564178466797 }, { "auxiliary_loss_clip": 0.01058767, "auxiliary_loss_mlp": 0.01002673, "balance_loss_clip": 1.00639856, "balance_loss_mlp": 1.00170159, "epoch": 0.9958516202729514, "flos": 57483253768320.0, "grad_norm": 0.9049900078046552, "language_loss": 0.55701447, "learning_rate": 1.7536172038790098e-10, "loss": 0.57762885, "num_input_tokens_seen": 178766910, "step": 8282, "time_per_iteration": 3.284449577331543 }, { "auxiliary_loss_clip": 0.01159085, "auxiliary_loss_mlp": 0.01028921, "balance_loss_clip": 1.04572606, "balance_loss_mlp": 1.02161932, "epoch": 0.9959718631635904, "flos": 27782900974080.0, "grad_norm": 2.708898213899707, "language_loss": 0.69748807, "learning_rate": 1.651981619819054e-10, "loss": 0.71936816, "num_input_tokens_seen": 178784060, "step": 8283, "time_per_iteration": 2.6919567584991455 }, { "auxiliary_loss_clip": 0.01157439, "auxiliary_loss_mlp": 0.01021782, "balance_loss_clip": 1.0464561, "balance_loss_mlp": 1.01456404, "epoch": 0.9960921060542296, "flos": 24024095274240.0, "grad_norm": 2.3907345409986656, "language_loss": 0.70334375, "learning_rate": 1.5533797692546257e-10, "loss": 0.72513604, "num_input_tokens_seen": 178802795, "step": 8284, "time_per_iteration": 2.68735671043396 }, { "auxiliary_loss_clip": 0.01159471, "auxiliary_loss_mlp": 0.01028394, "balance_loss_clip": 1.04618251, "balance_loss_mlp": 1.02073467, "epoch": 0.9962123489448687, "flos": 18697393935360.0, "grad_norm": 7.338402441413045, "language_loss": 0.83863378, "learning_rate": 1.4578116671404296e-10, "loss": 0.86051244, "num_input_tokens_seen": 178821075, "step": 8285, "time_per_iteration": 2.6673848628997803 }, { "auxiliary_loss_clip": 0.01159847, "auxiliary_loss_mlp": 0.01024144, "balance_loss_clip": 1.04777086, "balance_loss_mlp": 1.01711369, "epoch": 0.9963325918355077, "flos": 20010754823040.0, "grad_norm": 5.289569564498677, "language_loss": 0.71315753, "learning_rate": 1.3652773279759777e-10, "loss": 0.73499739, "num_input_tokens_seen": 178837725, "step": 8286, "time_per_iteration": 2.6114962100982666 }, { "auxiliary_loss_clip": 0.01163788, "auxiliary_loss_mlp": 0.01028034, "balance_loss_clip": 1.04877162, "balance_loss_mlp": 1.01985013, "epoch": 0.9964528347261468, "flos": 33108488991360.0, "grad_norm": 1.7291748903515296, "language_loss": 0.63282561, "learning_rate": 1.2757767657989305e-10, "loss": 0.65474379, "num_input_tokens_seen": 178861515, "step": 8287, "time_per_iteration": 2.731854200363159 }, { "auxiliary_loss_clip": 0.01159737, "auxiliary_loss_mlp": 0.0102603, "balance_loss_clip": 1.04613864, "balance_loss_mlp": 1.01919365, "epoch": 0.9965730776167859, "flos": 23109342589440.0, "grad_norm": 1.9879394216294626, "language_loss": 0.87382358, "learning_rate": 1.1893099941850948e-10, "loss": 0.89568126, "num_input_tokens_seen": 178880410, "step": 8288, "time_per_iteration": 2.683527946472168 }, { "auxiliary_loss_clip": 0.01161433, "auxiliary_loss_mlp": 0.01029542, "balance_loss_clip": 1.04403579, "balance_loss_mlp": 1.02185321, "epoch": 0.996693320507425, "flos": 22965843755520.0, "grad_norm": 2.4404544196502043, "language_loss": 0.77626902, "learning_rate": 1.105877026252866e-10, "loss": 0.79817873, "num_input_tokens_seen": 178898740, "step": 8289, "time_per_iteration": 2.924433946609497 }, { "auxiliary_loss_clip": 0.01168246, "auxiliary_loss_mlp": 0.01025295, "balance_loss_clip": 1.04757822, "balance_loss_mlp": 1.01739717, "epoch": 0.996813563398064, "flos": 13222740476160.0, "grad_norm": 4.673764912279496, "language_loss": 0.72066927, "learning_rate": 1.0254778746565663e-10, "loss": 0.74260467, "num_input_tokens_seen": 178914015, "step": 8290, "time_per_iteration": 2.5819108486175537 }, { "auxiliary_loss_clip": 0.01152084, "auxiliary_loss_mlp": 0.01026054, "balance_loss_clip": 1.04580736, "balance_loss_mlp": 1.0193541, "epoch": 0.9969338062887032, "flos": 14647855553280.0, "grad_norm": 1.9499489614872814, "language_loss": 0.7333957, "learning_rate": 9.481125515953259e-11, "loss": 0.75517708, "num_input_tokens_seen": 178932075, "step": 8291, "time_per_iteration": 4.6893346309661865 }, { "auxiliary_loss_clip": 0.01154734, "auxiliary_loss_mlp": 0.01030185, "balance_loss_clip": 1.04586983, "balance_loss_mlp": 1.02290177, "epoch": 0.9970540491793423, "flos": 25735741142400.0, "grad_norm": 1.963616338190608, "language_loss": 0.79844928, "learning_rate": 8.737810688064228e-11, "loss": 0.82029843, "num_input_tokens_seen": 178951910, "step": 8292, "time_per_iteration": 2.7891643047332764 }, { "auxiliary_loss_clip": 0.01154799, "auxiliary_loss_mlp": 0.01029823, "balance_loss_clip": 1.04888844, "balance_loss_mlp": 1.02110898, "epoch": 0.9971742920699813, "flos": 21470236237440.0, "grad_norm": 2.186544277841048, "language_loss": 0.79199553, "learning_rate": 8.024834375608414e-11, "loss": 0.81384176, "num_input_tokens_seen": 178970500, "step": 8293, "time_per_iteration": 2.716373920440674 }, { "auxiliary_loss_clip": 0.01059008, "auxiliary_loss_mlp": 0.01001938, "balance_loss_clip": 1.00657809, "balance_loss_mlp": 1.00091267, "epoch": 0.9972945349606205, "flos": 72211223629440.0, "grad_norm": 0.834890597792274, "language_loss": 0.62788796, "learning_rate": 7.342196686788149e-11, "loss": 0.64849746, "num_input_tokens_seen": 179023665, "step": 8294, "time_per_iteration": 4.04690957069397 }, { "auxiliary_loss_clip": 0.01157351, "auxiliary_loss_mlp": 0.01030402, "balance_loss_clip": 1.04808354, "balance_loss_mlp": 1.02255559, "epoch": 0.9974147778512595, "flos": 19678293515520.0, "grad_norm": 1.9363788348019282, "language_loss": 0.69016951, "learning_rate": 6.689897725142834e-11, "loss": 0.71204704, "num_input_tokens_seen": 179043140, "step": 8295, "time_per_iteration": 2.6884164810180664 }, { "auxiliary_loss_clip": 0.01160348, "auxiliary_loss_mlp": 0.0102804, "balance_loss_clip": 1.04569435, "balance_loss_mlp": 1.02058363, "epoch": 0.9975350207418986, "flos": 15960821391360.0, "grad_norm": 3.0823024187102908, "language_loss": 0.87978917, "learning_rate": 6.067937589615545e-11, "loss": 0.90167308, "num_input_tokens_seen": 179061215, "step": 8296, "time_per_iteration": 2.669450521469116 }, { "auxiliary_loss_clip": 0.010603, "auxiliary_loss_mlp": 0.01004705, "balance_loss_clip": 1.00677133, "balance_loss_mlp": 1.00351894, "epoch": 0.9976552636325378, "flos": 59961879768960.0, "grad_norm": 0.7563563711266772, "language_loss": 0.57668513, "learning_rate": 5.476316374575241e-11, "loss": 0.59733528, "num_input_tokens_seen": 179124700, "step": 8297, "time_per_iteration": 3.2290000915527344 }, { "auxiliary_loss_clip": 0.01170348, "auxiliary_loss_mlp": 0.01022485, "balance_loss_clip": 1.0495863, "balance_loss_mlp": 1.01448882, "epoch": 0.9977755065231768, "flos": 22487872452480.0, "grad_norm": 1.9086968824223594, "language_loss": 0.7312724, "learning_rate": 4.9150341697723476e-11, "loss": 0.75320077, "num_input_tokens_seen": 179144590, "step": 8298, "time_per_iteration": 2.706310749053955 }, { "auxiliary_loss_clip": 0.01160068, "auxiliary_loss_mlp": 0.01032486, "balance_loss_clip": 1.05052364, "balance_loss_mlp": 1.02471662, "epoch": 0.9978957494138159, "flos": 26030280666240.0, "grad_norm": 1.4685648378200478, "language_loss": 0.6648885, "learning_rate": 4.384091060338768e-11, "loss": 0.68681395, "num_input_tokens_seen": 179165060, "step": 8299, "time_per_iteration": 2.8276078701019287 }, { "auxiliary_loss_clip": 0.01159932, "auxiliary_loss_mlp": 0.01024609, "balance_loss_clip": 1.04627919, "balance_loss_mlp": 1.01717615, "epoch": 0.998015992304455, "flos": 22637835734400.0, "grad_norm": 2.6403129874945055, "language_loss": 0.74143052, "learning_rate": 3.883487126810081e-11, "loss": 0.76327598, "num_input_tokens_seen": 179184320, "step": 8300, "time_per_iteration": 2.791774272918701 }, { "auxiliary_loss_clip": 0.01153256, "auxiliary_loss_mlp": 0.01022203, "balance_loss_clip": 1.04447627, "balance_loss_mlp": 1.01502037, "epoch": 0.9981362351950941, "flos": 18223444955520.0, "grad_norm": 1.9534001749002072, "language_loss": 0.79311025, "learning_rate": 3.41322244516995e-11, "loss": 0.81486481, "num_input_tokens_seen": 179202265, "step": 8301, "time_per_iteration": 2.640058755874634 }, { "auxiliary_loss_clip": 0.01143669, "auxiliary_loss_mlp": 0.01026271, "balance_loss_clip": 1.04657984, "balance_loss_mlp": 1.01905572, "epoch": 0.9982564780857331, "flos": 33474095573760.0, "grad_norm": 3.8173326737187687, "language_loss": 0.63184512, "learning_rate": 2.9732970866946925e-11, "loss": 0.65354455, "num_input_tokens_seen": 179222145, "step": 8302, "time_per_iteration": 3.68723726272583 }, { "auxiliary_loss_clip": 0.01149033, "auxiliary_loss_mlp": 0.01030684, "balance_loss_clip": 1.04871583, "balance_loss_mlp": 1.02273285, "epoch": 0.9983767209763723, "flos": 15523465392000.0, "grad_norm": 2.2439588651454447, "language_loss": 0.78829062, "learning_rate": 2.563711118175327e-11, "loss": 0.81008774, "num_input_tokens_seen": 179239030, "step": 8303, "time_per_iteration": 2.723418712615967 }, { "auxiliary_loss_clip": 0.01153019, "auxiliary_loss_mlp": 0.01023509, "balance_loss_clip": 1.048666, "balance_loss_mlp": 1.01655316, "epoch": 0.9984969638670114, "flos": 19974377324160.0, "grad_norm": 1.7598741001431024, "language_loss": 0.83599859, "learning_rate": 2.184464601717728e-11, "loss": 0.85776389, "num_input_tokens_seen": 179257345, "step": 8304, "time_per_iteration": 2.7128469944000244 }, { "auxiliary_loss_clip": 0.01165237, "auxiliary_loss_mlp": 0.01022548, "balance_loss_clip": 1.04783797, "balance_loss_mlp": 1.01533031, "epoch": 0.9986172067576504, "flos": 20375750874240.0, "grad_norm": 2.4894433704105134, "language_loss": 0.77560544, "learning_rate": 1.8355575948758585e-11, "loss": 0.79748333, "num_input_tokens_seen": 179275330, "step": 8305, "time_per_iteration": 2.718454360961914 }, { "auxiliary_loss_clip": 0.01159894, "auxiliary_loss_mlp": 0.01026479, "balance_loss_clip": 1.04585695, "balance_loss_mlp": 1.01886725, "epoch": 0.9987374496482896, "flos": 23727903724800.0, "grad_norm": 2.4353481685033027, "language_loss": 0.73388541, "learning_rate": 1.5169901505407424e-11, "loss": 0.75574911, "num_input_tokens_seen": 179292395, "step": 8306, "time_per_iteration": 2.8018734455108643 }, { "auxiliary_loss_clip": 0.01155813, "auxiliary_loss_mlp": 0.01027912, "balance_loss_clip": 1.0451566, "balance_loss_mlp": 1.02072334, "epoch": 0.9988576925389286, "flos": 25044029959680.0, "grad_norm": 1.8896416498360726, "language_loss": 0.7423113, "learning_rate": 1.228762317073695e-11, "loss": 0.76414853, "num_input_tokens_seen": 179311225, "step": 8307, "time_per_iteration": 2.788801908493042 }, { "auxiliary_loss_clip": 0.01156953, "auxiliary_loss_mlp": 0.01021773, "balance_loss_clip": 1.04569387, "balance_loss_mlp": 1.01476955, "epoch": 0.9989779354295677, "flos": 31285627637760.0, "grad_norm": 1.9357666229647568, "language_loss": 0.79265261, "learning_rate": 9.70874138195299e-12, "loss": 0.81443989, "num_input_tokens_seen": 179333135, "step": 8308, "time_per_iteration": 2.785557508468628 }, { "auxiliary_loss_clip": 0.01167242, "auxiliary_loss_mlp": 0.01026904, "balance_loss_clip": 1.04716849, "balance_loss_mlp": 1.01964164, "epoch": 0.9990981783202069, "flos": 19573398823680.0, "grad_norm": 1.702141159416835, "language_loss": 0.74659002, "learning_rate": 7.433256530076093e-12, "loss": 0.76853144, "num_input_tokens_seen": 179353090, "step": 8309, "time_per_iteration": 2.6769816875457764 }, { "auxiliary_loss_clip": 0.01156653, "auxiliary_loss_mlp": 0.01021574, "balance_loss_clip": 1.04417658, "balance_loss_mlp": 1.01483917, "epoch": 0.9992184212108459, "flos": 17199667514880.0, "grad_norm": 2.0817293860951858, "language_loss": 0.75909507, "learning_rate": 5.46116896038562e-12, "loss": 0.78087735, "num_input_tokens_seen": 179367500, "step": 8310, "time_per_iteration": 2.7261240482330322 }, { "auxiliary_loss_clip": 0.0115499, "auxiliary_loss_mlp": 0.01024361, "balance_loss_clip": 1.04579473, "balance_loss_mlp": 1.01688075, "epoch": 0.999338664101485, "flos": 46497853681920.0, "grad_norm": 2.8220566116468038, "language_loss": 0.62295711, "learning_rate": 3.792478972197699e-12, "loss": 0.6447506, "num_input_tokens_seen": 179388085, "step": 8311, "time_per_iteration": 3.0397236347198486 }, { "auxiliary_loss_clip": 0.01162937, "auxiliary_loss_mlp": 0.01023437, "balance_loss_clip": 1.04410148, "balance_loss_mlp": 1.01628709, "epoch": 0.9994589069921241, "flos": 15158253859200.0, "grad_norm": 2.185438937594892, "language_loss": 0.70296603, "learning_rate": 2.4271868181990895e-12, "loss": 0.72482979, "num_input_tokens_seen": 179405250, "step": 8312, "time_per_iteration": 2.89276123046875 }, { "auxiliary_loss_clip": 0.01163033, "auxiliary_loss_mlp": 0.01022507, "balance_loss_clip": 1.04584348, "balance_loss_mlp": 1.01531899, "epoch": 0.9995791498827632, "flos": 12531460256640.0, "grad_norm": 1.9887741309926672, "language_loss": 0.80991042, "learning_rate": 1.3652927060014973e-12, "loss": 0.83176577, "num_input_tokens_seen": 179420845, "step": 8313, "time_per_iteration": 2.724740982055664 }, { "auxiliary_loss_clip": 0.01156452, "auxiliary_loss_mlp": 0.01026417, "balance_loss_clip": 1.04554999, "balance_loss_mlp": 1.01918721, "epoch": 0.9996993927734023, "flos": 19245175320960.0, "grad_norm": 5.194054923169705, "language_loss": 0.63880384, "learning_rate": 6.067967965872612e-13, "loss": 0.66063249, "num_input_tokens_seen": 179440455, "step": 8314, "time_per_iteration": 2.753223180770874 }, { "auxiliary_loss_clip": 0.01157374, "auxiliary_loss_mlp": 0.01028161, "balance_loss_clip": 1.04851341, "balance_loss_mlp": 1.02056742, "epoch": 0.9998196356640414, "flos": 62952804518400.0, "grad_norm": 1.649181451649565, "language_loss": 0.76893938, "learning_rate": 1.5169920497548615e-13, "loss": 0.79079467, "num_input_tokens_seen": 179465075, "step": 8315, "time_per_iteration": 3.091611862182617 }, { "auxiliary_loss_clip": 0.01114339, "auxiliary_loss_mlp": 0.0101463, "balance_loss_clip": 1.02685201, "balance_loss_mlp": 1.01027858, "epoch": 0.9999398785546805, "flos": 50922375073920.0, "grad_norm": 1.7214134078485896, "language_loss": 0.55126202, "learning_rate": 0.0, "loss": 0.57255167, "num_input_tokens_seen": 179513955, "step": 8316, "time_per_iteration": 3.2604382038116455 }, { "epoch": 0.9999398785546805, "num_input_tokens_seen": 179513955, "step": 8316, "total_flos": 6.996749092776837e+17, "train_loss": 0.7899877518386781, "train_runtime": 26149.5989, "train_samples_per_second": 12.721, "train_steps_per_second": 0.318 } ], "logging_steps": 1.0, "max_steps": 8316, "num_input_tokens_seen": 179513955, "num_train_epochs": 1, "save_steps": 1664, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.996749092776837e+17, "train_batch_size": 5, "trial_name": null, "trial_params": null }