{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999398785546805, "eval_steps": 500, "global_step": 8316, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "auxiliary_loss_clip": 0.04491599, "auxiliary_loss_mlp": 0.02221579, "balance_loss_clip": 2.47289801, "balance_loss_mlp": 1.82561398, "epoch": 0.00012024289063909097, "flos": 24932483919360.0, "grad_norm": 40.163321662020806, "language_loss": 2.57927752, "learning_rate": 0.0, "loss": 1.89822352, "num_input_tokens_seen": 20375, "step": 1, "time_per_iteration": 16.226963996887207 }, { "auxiliary_loss_clip": 0.0300904, "auxiliary_loss_mlp": 0.01468188, "balance_loss_clip": 1.63999248, "balance_loss_mlp": 1.19372022, "epoch": 0.00024048578127818193, "flos": 30664624377600.0, "grad_norm": 54.95453609262513, "language_loss": 1.88825572, "learning_rate": 5.021476677069823e-07, "loss": 1.93302798, "num_input_tokens_seen": 39035, "step": 2, "time_per_iteration": 2.5960869789123535 }, { "auxiliary_loss_clip": 0.03015296, "auxiliary_loss_mlp": 0.01474571, "balance_loss_clip": 1.64831674, "balance_loss_mlp": 1.19075799, "epoch": 0.0003607286719172729, "flos": 19026227969280.0, "grad_norm": 40.1721642573067, "language_loss": 1.61505961, "learning_rate": 7.958852231401551e-07, "loss": 1.65995836, "num_input_tokens_seen": 57600, "step": 3, "time_per_iteration": 2.535675287246704 }, { "auxiliary_loss_clip": 0.02993917, "auxiliary_loss_mlp": 0.01471694, "balance_loss_clip": 1.64810443, "balance_loss_mlp": 1.21610916, "epoch": 0.00048097156255636386, "flos": 19316314206720.0, "grad_norm": 36.83340058166358, "language_loss": 1.64525616, "learning_rate": 1.0042953354139647e-06, "loss": 1.68991232, "num_input_tokens_seen": 76465, "step": 4, "time_per_iteration": 2.6027774810791016 }, { "auxiliary_loss_clip": 0.02987625, "auxiliary_loss_mlp": 0.0146254, "balance_loss_clip": 1.6403805, "balance_loss_mlp": 1.19169617, "epoch": 0.0006012144531954548, "flos": 13991264893440.0, "grad_norm": 54.983568980586625, "language_loss": 1.93322444, "learning_rate": 1.1659507774310057e-06, "loss": 1.97772598, "num_input_tokens_seen": 94350, "step": 5, "time_per_iteration": 2.77620267868042 }, { "auxiliary_loss_clip": 0.02991995, "auxiliary_loss_mlp": 0.01455131, "balance_loss_clip": 1.64246082, "balance_loss_mlp": 1.18085408, "epoch": 0.0007214573438345458, "flos": 23148988225920.0, "grad_norm": 44.92862293676176, "language_loss": 1.60884368, "learning_rate": 1.2980328908471373e-06, "loss": 1.65331507, "num_input_tokens_seen": 114595, "step": 6, "time_per_iteration": 2.878549814224243 }, { "auxiliary_loss_clip": 0.02899924, "auxiliary_loss_mlp": 0.0152134, "balance_loss_clip": 1.74713707, "balance_loss_mlp": 1.17191362, "epoch": 0.0008417002344736367, "flos": 67663246170240.0, "grad_norm": 4.5917703107897285, "language_loss": 0.81486475, "learning_rate": 1.4097067265369432e-06, "loss": 0.85907733, "num_input_tokens_seen": 179590, "step": 7, "time_per_iteration": 3.304527521133423 }, { "auxiliary_loss_clip": 0.02973682, "auxiliary_loss_mlp": 0.01441301, "balance_loss_clip": 1.64579487, "balance_loss_mlp": 1.17904043, "epoch": 0.0009619431251127277, "flos": 21281381504640.0, "grad_norm": 41.01975025988405, "language_loss": 1.58570433, "learning_rate": 1.506443003120947e-06, "loss": 1.6298542, "num_input_tokens_seen": 195090, "step": 8, "time_per_iteration": 2.810551404953003 }, { "auxiliary_loss_clip": 0.02993168, "auxiliary_loss_mlp": 0.01467015, "balance_loss_clip": 1.64896607, "balance_loss_mlp": 1.19788778, "epoch": 0.0010821860157518186, "flos": 23331342597120.0, "grad_norm": 17.493215014814783, "language_loss": 1.47886181, "learning_rate": 1.5917704462803102e-06, "loss": 1.52346373, "num_input_tokens_seen": 211635, "step": 9, "time_per_iteration": 2.7871103286743164 }, { "auxiliary_loss_clip": 0.02977469, "auxiliary_loss_mlp": 0.01457293, "balance_loss_clip": 1.64932227, "balance_loss_mlp": 1.19140863, "epoch": 0.0012024289063909096, "flos": 17010166337280.0, "grad_norm": 13.220119208854346, "language_loss": 1.52825212, "learning_rate": 1.6680984451379884e-06, "loss": 1.57259977, "num_input_tokens_seen": 224705, "step": 10, "time_per_iteration": 2.7486579418182373 }, { "auxiliary_loss_clip": 0.02995587, "auxiliary_loss_mlp": 0.01466967, "balance_loss_clip": 1.65137935, "balance_loss_mlp": 1.18849444, "epoch": 0.0013226717970300007, "flos": 21288133261440.0, "grad_norm": 19.089142408113418, "language_loss": 1.32471442, "learning_rate": 1.7371455188905097e-06, "loss": 1.36933994, "num_input_tokens_seen": 244635, "step": 11, "time_per_iteration": 2.8070144653320312 }, { "auxiliary_loss_clip": 0.03003385, "auxiliary_loss_mlp": 0.01460643, "balance_loss_clip": 1.64764857, "balance_loss_mlp": 1.17778349, "epoch": 0.0014429146876690916, "flos": 27237884935680.0, "grad_norm": 10.54493964200517, "language_loss": 1.2536633, "learning_rate": 1.8001805585541196e-06, "loss": 1.2983036, "num_input_tokens_seen": 265765, "step": 12, "time_per_iteration": 2.863173246383667 }, { "auxiliary_loss_clip": 0.02981205, "auxiliary_loss_mlp": 0.01472424, "balance_loss_clip": 1.65053058, "balance_loss_mlp": 1.19871926, "epoch": 0.0015631575783081825, "flos": 19062174504960.0, "grad_norm": 6.646512759096275, "language_loss": 1.29329753, "learning_rate": 1.8581671739548328e-06, "loss": 1.337834, "num_input_tokens_seen": 283500, "step": 13, "time_per_iteration": 2.8205015659332275 }, { "auxiliary_loss_clip": 0.02961987, "auxiliary_loss_mlp": 0.01475863, "balance_loss_clip": 1.64167118, "balance_loss_mlp": 1.19815302, "epoch": 0.0016834004689472734, "flos": 48139473985920.0, "grad_norm": 6.278591479558552, "language_loss": 1.13597274, "learning_rate": 1.9118543942439254e-06, "loss": 1.18035126, "num_input_tokens_seen": 305685, "step": 14, "time_per_iteration": 3.964249849319458 }, { "auxiliary_loss_clip": 0.02981122, "auxiliary_loss_mlp": 0.01449155, "balance_loss_clip": 1.64573061, "balance_loss_mlp": 1.17754877, "epoch": 0.0018036433595863645, "flos": 34970026314240.0, "grad_norm": 5.484670188989572, "language_loss": 1.12807953, "learning_rate": 1.961836000571161e-06, "loss": 1.17238235, "num_input_tokens_seen": 327340, "step": 15, "time_per_iteration": 3.9112637042999268 }, { "auxiliary_loss_clip": 0.0280488, "auxiliary_loss_mlp": 0.0147059, "balance_loss_clip": 1.72419047, "balance_loss_mlp": 1.13337076, "epoch": 0.0019238862502254555, "flos": 59768284440960.0, "grad_norm": 3.7866858979891935, "language_loss": 0.64658284, "learning_rate": 2.0085906708279293e-06, "loss": 0.68933755, "num_input_tokens_seen": 382710, "step": 16, "time_per_iteration": 3.3153250217437744 }, { "auxiliary_loss_clip": 0.02950809, "auxiliary_loss_mlp": 0.0143603, "balance_loss_clip": 1.64240313, "balance_loss_mlp": 1.16957331, "epoch": 0.0020441291408645466, "flos": 20814543417600.0, "grad_norm": 4.3541656018488855, "language_loss": 1.15949893, "learning_rate": 2.0525099325728135e-06, "loss": 1.20336747, "num_input_tokens_seen": 400890, "step": 17, "time_per_iteration": 2.841297149658203 }, { "auxiliary_loss_clip": 0.02776008, "auxiliary_loss_mlp": 0.0145669, "balance_loss_clip": 1.71820652, "balance_loss_mlp": 1.12099707, "epoch": 0.0021643720315036373, "flos": 63857001582720.0, "grad_norm": 3.5308769842636045, "language_loss": 0.7215389, "learning_rate": 2.0939181139872922e-06, "loss": 0.76386595, "num_input_tokens_seen": 462605, "step": 18, "time_per_iteration": 3.239359140396118 }, { "auxiliary_loss_clip": 0.02947094, "auxiliary_loss_mlp": 0.01426851, "balance_loss_clip": 1.64437175, "balance_loss_mlp": 1.17450881, "epoch": 0.0022846149221427284, "flos": 31284981192960.0, "grad_norm": 5.057843935804984, "language_loss": 1.01627076, "learning_rate": 2.1330868934640175e-06, "loss": 1.06001019, "num_input_tokens_seen": 483280, "step": 19, "time_per_iteration": 2.8798208236694336 }, { "auxiliary_loss_clip": 0.02738828, "auxiliary_loss_mlp": 0.01431733, "balance_loss_clip": 1.7070657, "balance_loss_mlp": 1.10214293, "epoch": 0.002404857812781819, "flos": 51083648161920.0, "grad_norm": 3.8932250024616897, "language_loss": 0.76434153, "learning_rate": 2.170246112844971e-06, "loss": 0.80604714, "num_input_tokens_seen": 537620, "step": 20, "time_per_iteration": 3.01880145072937 }, { "auxiliary_loss_clip": 0.0287993, "auxiliary_loss_mlp": 0.01429534, "balance_loss_clip": 1.62626266, "balance_loss_mlp": 1.17108786, "epoch": 0.0025251007034209102, "flos": 15815347309440.0, "grad_norm": 4.528375664338746, "language_loss": 1.01726222, "learning_rate": 2.2055919496770983e-06, "loss": 1.06035674, "num_input_tokens_seen": 555760, "step": 21, "time_per_iteration": 2.7465481758117676 }, { "auxiliary_loss_clip": 0.02882308, "auxiliary_loss_mlp": 0.0142223, "balance_loss_clip": 1.62947464, "balance_loss_mlp": 1.16492879, "epoch": 0.0026453435940600014, "flos": 37851857458560.0, "grad_norm": 3.6890059988262593, "language_loss": 0.89403272, "learning_rate": 2.2392931865974923e-06, "loss": 0.93707812, "num_input_tokens_seen": 578450, "step": 22, "time_per_iteration": 3.0133705139160156 }, { "auxiliary_loss_clip": 0.02834745, "auxiliary_loss_mlp": 0.01427875, "balance_loss_clip": 1.61556959, "balance_loss_mlp": 1.16961956, "epoch": 0.002765586484699092, "flos": 21141976821120.0, "grad_norm": 4.089971394814634, "language_loss": 1.01706481, "learning_rate": 2.271496085962064e-06, "loss": 1.05969119, "num_input_tokens_seen": 596145, "step": 23, "time_per_iteration": 2.8292930126190186 }, { "auxiliary_loss_clip": 0.02820214, "auxiliary_loss_mlp": 0.01408314, "balance_loss_clip": 1.61054301, "balance_loss_mlp": 1.16074002, "epoch": 0.002885829375338183, "flos": 20667381396480.0, "grad_norm": 4.491538893383789, "language_loss": 1.02586675, "learning_rate": 2.3023282262611022e-06, "loss": 1.06815195, "num_input_tokens_seen": 614920, "step": 24, "time_per_iteration": 2.7971348762512207 }, { "auxiliary_loss_clip": 0.02805424, "auxiliary_loss_mlp": 0.01400227, "balance_loss_clip": 1.60972285, "balance_loss_mlp": 1.16495526, "epoch": 0.003006072265977274, "flos": 34823869873920.0, "grad_norm": 3.331871990777231, "language_loss": 0.92589688, "learning_rate": 2.3319015548620114e-06, "loss": 0.96795332, "num_input_tokens_seen": 636060, "step": 25, "time_per_iteration": 2.8684446811676025 }, { "auxiliary_loss_clip": 0.02781421, "auxiliary_loss_mlp": 0.01396556, "balance_loss_clip": 1.60625172, "balance_loss_mlp": 1.16614866, "epoch": 0.003126315156616365, "flos": 24422021118720.0, "grad_norm": 2.8577445218707056, "language_loss": 0.92953634, "learning_rate": 2.3603148416618152e-06, "loss": 0.9713161, "num_input_tokens_seen": 655575, "step": 26, "time_per_iteration": 2.885298728942871 }, { "auxiliary_loss_clip": 0.02802371, "auxiliary_loss_mlp": 0.0138588, "balance_loss_clip": 1.6114316, "balance_loss_mlp": 1.1516571, "epoch": 0.003246558047255456, "flos": 23622326674560.0, "grad_norm": 2.3972637729499553, "language_loss": 1.00903463, "learning_rate": 2.3876556694204647e-06, "loss": 1.05091715, "num_input_tokens_seen": 675730, "step": 27, "time_per_iteration": 2.774613618850708 }, { "auxiliary_loss_clip": 0.02760736, "auxiliary_loss_mlp": 0.0139506, "balance_loss_clip": 1.59588456, "balance_loss_mlp": 1.15530634, "epoch": 0.003366800937894547, "flos": 17820275725440.0, "grad_norm": 2.656257689039716, "language_loss": 0.90690333, "learning_rate": 2.414002061950908e-06, "loss": 0.94846123, "num_input_tokens_seen": 694605, "step": 28, "time_per_iteration": 2.8128139972686768 }, { "auxiliary_loss_clip": 0.02749347, "auxiliary_loss_mlp": 0.01396387, "balance_loss_clip": 1.59300232, "balance_loss_mlp": 1.16617036, "epoch": 0.003487043828533638, "flos": 24426115269120.0, "grad_norm": 2.4008841500620517, "language_loss": 0.9993763, "learning_rate": 2.4394238264681557e-06, "loss": 1.04083359, "num_input_tokens_seen": 714340, "step": 29, "time_per_iteration": 2.8445684909820557 }, { "auxiliary_loss_clip": 0.02710701, "auxiliary_loss_mlp": 0.01342423, "balance_loss_clip": 1.58956349, "balance_loss_mlp": 1.10953557, "epoch": 0.003607286719172729, "flos": 26140311002880.0, "grad_norm": 2.0678962121924775, "language_loss": 0.9955495, "learning_rate": 2.4639836682781433e-06, "loss": 1.03608084, "num_input_tokens_seen": 734470, "step": 30, "time_per_iteration": 2.8105156421661377 }, { "auxiliary_loss_clip": 0.02720673, "auxiliary_loss_mlp": 0.01348615, "balance_loss_clip": 1.59222877, "balance_loss_mlp": 1.10218537, "epoch": 0.00372752960981182, "flos": 20593082113920.0, "grad_norm": 2.7295349189738256, "language_loss": 1.0017463, "learning_rate": 2.487738122623307e-06, "loss": 1.04243922, "num_input_tokens_seen": 753380, "step": 31, "time_per_iteration": 2.8798696994781494 }, { "auxiliary_loss_clip": 0.02662504, "auxiliary_loss_mlp": 0.01343031, "balance_loss_clip": 1.57569802, "balance_loss_mlp": 1.10814071, "epoch": 0.003847772500450911, "flos": 22674608282880.0, "grad_norm": 2.513649326653245, "language_loss": 0.98945636, "learning_rate": 2.510738338534912e-06, "loss": 1.02951169, "num_input_tokens_seen": 772105, "step": 32, "time_per_iteration": 2.8228604793548584 }, { "auxiliary_loss_clip": 0.02588818, "auxiliary_loss_mlp": 0.01371655, "balance_loss_clip": 1.55665135, "balance_loss_mlp": 1.13800478, "epoch": 0.003968015391090002, "flos": 17967796882560.0, "grad_norm": 2.70018168897321, "language_loss": 1.02677512, "learning_rate": 2.5330307420306648e-06, "loss": 1.06637979, "num_input_tokens_seen": 788955, "step": 33, "time_per_iteration": 2.7979047298431396 }, { "auxiliary_loss_clip": 0.02564916, "auxiliary_loss_mlp": 0.01320494, "balance_loss_clip": 1.55383253, "balance_loss_mlp": 1.10687065, "epoch": 0.004088258281729093, "flos": 27304103658240.0, "grad_norm": 3.738118320304094, "language_loss": 0.87965846, "learning_rate": 2.554657600279796e-06, "loss": 0.91851258, "num_input_tokens_seen": 810230, "step": 34, "time_per_iteration": 2.8355112075805664 }, { "auxiliary_loss_clip": 0.02527101, "auxiliary_loss_mlp": 0.01326616, "balance_loss_clip": 1.54346848, "balance_loss_mlp": 1.09983253, "epoch": 0.004208501172368184, "flos": 23258587599360.0, "grad_norm": 2.1664910393617194, "language_loss": 1.03335226, "learning_rate": 2.5756575039679493e-06, "loss": 1.0718894, "num_input_tokens_seen": 829780, "step": 35, "time_per_iteration": 2.7929413318634033 }, { "auxiliary_loss_clip": 0.02498675, "auxiliary_loss_mlp": 0.01322131, "balance_loss_clip": 1.53181314, "balance_loss_mlp": 1.1127044, "epoch": 0.0043287440630072746, "flos": 17312104062720.0, "grad_norm": 1.96712159981555, "language_loss": 0.95004499, "learning_rate": 2.5960657816942747e-06, "loss": 0.98825306, "num_input_tokens_seen": 848695, "step": 36, "time_per_iteration": 2.852844476699829 }, { "auxiliary_loss_clip": 0.02189552, "auxiliary_loss_mlp": 0.01334769, "balance_loss_clip": 1.52825308, "balance_loss_mlp": 1.0662148, "epoch": 0.004448986953646365, "flos": 53092491160320.0, "grad_norm": 1.387905975389154, "language_loss": 0.61020124, "learning_rate": 2.6159148575788668e-06, "loss": 0.64544445, "num_input_tokens_seen": 906730, "step": 37, "time_per_iteration": 3.325557231903076 }, { "auxiliary_loss_clip": 0.02461854, "auxiliary_loss_mlp": 0.01286017, "balance_loss_clip": 1.51970005, "balance_loss_mlp": 1.07496929, "epoch": 0.004569229844285457, "flos": 13444165866240.0, "grad_norm": 2.4302642910780907, "language_loss": 0.98947513, "learning_rate": 2.635234561171e-06, "loss": 1.02695382, "num_input_tokens_seen": 925125, "step": 38, "time_per_iteration": 2.8126745223999023 }, { "auxiliary_loss_clip": 0.02429918, "auxiliary_loss_mlp": 0.01287115, "balance_loss_clip": 1.51529503, "balance_loss_mlp": 1.09523535, "epoch": 0.0046894727349245475, "flos": 16209609966720.0, "grad_norm": 3.7219891565306384, "language_loss": 0.9411552, "learning_rate": 2.6540523970949877e-06, "loss": 0.97832555, "num_input_tokens_seen": 939970, "step": 39, "time_per_iteration": 2.777009963989258 }, { "auxiliary_loss_clip": 0.02430649, "auxiliary_loss_mlp": 0.01286289, "balance_loss_clip": 1.51683307, "balance_loss_mlp": 1.08239388, "epoch": 0.004809715625563638, "flos": 23914244505600.0, "grad_norm": 3.145172711394561, "language_loss": 0.92502004, "learning_rate": 2.6723937805519533e-06, "loss": 0.96218938, "num_input_tokens_seen": 957470, "step": 40, "time_per_iteration": 3.8056347370147705 }, { "auxiliary_loss_clip": 0.02380222, "auxiliary_loss_mlp": 0.01288794, "balance_loss_clip": 1.50057077, "balance_loss_mlp": 1.09376717, "epoch": 0.00492995851620273, "flos": 20773030273920.0, "grad_norm": 2.3267289374857247, "language_loss": 0.93226171, "learning_rate": 2.690282243737839e-06, "loss": 0.96895188, "num_input_tokens_seen": 976405, "step": 41, "time_per_iteration": 3.7384727001190186 }, { "auxiliary_loss_clip": 0.0234594, "auxiliary_loss_mlp": 0.01262382, "balance_loss_clip": 1.48543429, "balance_loss_mlp": 1.07202816, "epoch": 0.0050502014068418205, "flos": 20338655103360.0, "grad_norm": 2.848675288888808, "language_loss": 0.99460036, "learning_rate": 2.7077396173840807e-06, "loss": 1.03068352, "num_input_tokens_seen": 994690, "step": 42, "time_per_iteration": 2.7416727542877197 }, { "auxiliary_loss_clip": 0.02346066, "auxiliary_loss_mlp": 0.01275769, "balance_loss_clip": 1.48910332, "balance_loss_mlp": 1.07902563, "epoch": 0.005170444297480911, "flos": 25994872834560.0, "grad_norm": 2.575373489594992, "language_loss": 0.92822099, "learning_rate": 2.7247861909342594e-06, "loss": 0.96443933, "num_input_tokens_seen": 1015615, "step": 43, "time_per_iteration": 2.872300148010254 }, { "auxiliary_loss_clip": 0.0232423, "auxiliary_loss_mlp": 0.01282572, "balance_loss_clip": 1.48540139, "balance_loss_mlp": 1.08830881, "epoch": 0.005290687188120003, "flos": 20954055841920.0, "grad_norm": 2.244574759184508, "language_loss": 0.83192968, "learning_rate": 2.7414408543044743e-06, "loss": 0.86799777, "num_input_tokens_seen": 1031255, "step": 44, "time_per_iteration": 2.755058765411377 }, { "auxiliary_loss_clip": 0.02304633, "auxiliary_loss_mlp": 0.01242318, "balance_loss_clip": 1.47259796, "balance_loss_mlp": 1.05291784, "epoch": 0.005410930078759093, "flos": 15851401585920.0, "grad_norm": 4.630753036823466, "language_loss": 0.79513079, "learning_rate": 2.7577212237113157e-06, "loss": 0.83060026, "num_input_tokens_seen": 1048295, "step": 45, "time_per_iteration": 2.8247509002685547 }, { "auxiliary_loss_clip": 0.02281564, "auxiliary_loss_mlp": 0.01249999, "balance_loss_clip": 1.46995521, "balance_loss_mlp": 1.06603527, "epoch": 0.005531172969398184, "flos": 21104988791040.0, "grad_norm": 1.9719248686857345, "language_loss": 1.04239273, "learning_rate": 2.7736437536690466e-06, "loss": 1.07770824, "num_input_tokens_seen": 1067925, "step": 46, "time_per_iteration": 2.80381178855896 }, { "auxiliary_loss_clip": 0.02263206, "auxiliary_loss_mlp": 0.0123503, "balance_loss_clip": 1.46415186, "balance_loss_mlp": 1.06117487, "epoch": 0.005651415860037276, "flos": 20844887431680.0, "grad_norm": 1.9275815132256588, "language_loss": 1.07867312, "learning_rate": 2.789223836941131e-06, "loss": 1.11365545, "num_input_tokens_seen": 1088060, "step": 47, "time_per_iteration": 2.785576105117798 }, { "auxiliary_loss_clip": 0.02229983, "auxiliary_loss_mlp": 0.01246862, "balance_loss_clip": 1.4529804, "balance_loss_mlp": 1.07224441, "epoch": 0.005771658750676366, "flos": 13260195383040.0, "grad_norm": 2.6716556385447974, "language_loss": 1.08693254, "learning_rate": 2.8044758939680847e-06, "loss": 1.121701, "num_input_tokens_seen": 1104130, "step": 48, "time_per_iteration": 2.769165515899658 }, { "auxiliary_loss_clip": 0.02213242, "auxiliary_loss_mlp": 0.01251453, "balance_loss_clip": 1.45413268, "balance_loss_mlp": 1.07492805, "epoch": 0.005891901641315457, "flos": 24425396997120.0, "grad_norm": 3.313589062545653, "language_loss": 1.02070785, "learning_rate": 2.8194134530738863e-06, "loss": 1.05535483, "num_input_tokens_seen": 1122900, "step": 49, "time_per_iteration": 2.8311607837677 }, { "auxiliary_loss_clip": 0.02203759, "auxiliary_loss_mlp": 0.01247635, "balance_loss_clip": 1.44937098, "balance_loss_mlp": 1.08465219, "epoch": 0.006012144531954548, "flos": 23076197314560.0, "grad_norm": 5.683830445648056, "language_loss": 0.90051711, "learning_rate": 2.834049222568994e-06, "loss": 0.93503106, "num_input_tokens_seen": 1140250, "step": 50, "time_per_iteration": 2.8686678409576416 }, { "auxiliary_loss_clip": 0.02188846, "auxiliary_loss_mlp": 0.01235823, "balance_loss_clip": 1.44044685, "balance_loss_mlp": 1.07551026, "epoch": 0.006132387422593639, "flos": 22528775064960.0, "grad_norm": 2.426369706199979, "language_loss": 0.92602342, "learning_rate": 2.848395155712969e-06, "loss": 0.96027017, "num_input_tokens_seen": 1160470, "step": 51, "time_per_iteration": 2.837876796722412 }, { "auxiliary_loss_clip": 0.02185804, "auxiliary_loss_mlp": 0.01217172, "balance_loss_clip": 1.4457972, "balance_loss_mlp": 1.05142355, "epoch": 0.00625263031323273, "flos": 27628340751360.0, "grad_norm": 2.296299873823433, "language_loss": 0.97654855, "learning_rate": 2.8624625093687977e-06, "loss": 1.01057827, "num_input_tokens_seen": 1177605, "step": 52, "time_per_iteration": 2.8265416622161865 }, { "auxiliary_loss_clip": 0.02169193, "auxiliary_loss_mlp": 0.01228689, "balance_loss_clip": 1.4359324, "balance_loss_mlp": 1.07667327, "epoch": 0.006372873203871821, "flos": 23110671392640.0, "grad_norm": 2.6102562872732027, "language_loss": 0.89007723, "learning_rate": 2.876261897070029e-06, "loss": 0.92405605, "num_input_tokens_seen": 1197735, "step": 53, "time_per_iteration": 2.861121892929077 }, { "auxiliary_loss_clip": 0.02158876, "auxiliary_loss_mlp": 0.01232933, "balance_loss_clip": 1.43290758, "balance_loss_mlp": 1.07900953, "epoch": 0.006493116094510912, "flos": 22856028900480.0, "grad_norm": 2.504731415245237, "language_loss": 0.92582208, "learning_rate": 2.889803337127447e-06, "loss": 0.95974016, "num_input_tokens_seen": 1216335, "step": 54, "time_per_iteration": 2.8140838146209717 }, { "auxiliary_loss_clip": 0.02150208, "auxiliary_loss_mlp": 0.0123312, "balance_loss_clip": 1.4285928, "balance_loss_mlp": 1.06822944, "epoch": 0.006613358985150003, "flos": 23071708114560.0, "grad_norm": 2.42290319628069, "language_loss": 0.84639096, "learning_rate": 2.903096296321516e-06, "loss": 0.88022423, "num_input_tokens_seen": 1234480, "step": 55, "time_per_iteration": 2.856668472290039 }, { "auxiliary_loss_clip": 0.02123154, "auxiliary_loss_mlp": 0.0122991, "balance_loss_clip": 1.42408085, "balance_loss_mlp": 1.08113694, "epoch": 0.006733601875789094, "flos": 26537662229760.0, "grad_norm": 2.0248832025360497, "language_loss": 0.91771901, "learning_rate": 2.9161497296578907e-06, "loss": 0.95124966, "num_input_tokens_seen": 1253870, "step": 56, "time_per_iteration": 2.8442752361297607 }, { "auxiliary_loss_clip": 0.02121915, "auxiliary_loss_mlp": 0.01212995, "balance_loss_clip": 1.42322659, "balance_loss_mlp": 1.0646987, "epoch": 0.006853844766428185, "flos": 15523178083200.0, "grad_norm": 3.5954849599607535, "language_loss": 0.85932827, "learning_rate": 2.928972116604173e-06, "loss": 0.89267743, "num_input_tokens_seen": 1270145, "step": 57, "time_per_iteration": 2.8010828495025635 }, { "auxiliary_loss_clip": 0.02097864, "auxiliary_loss_mlp": 0.01217619, "balance_loss_clip": 1.41159272, "balance_loss_mlp": 1.0836277, "epoch": 0.006974087657067276, "flos": 24243760897920.0, "grad_norm": 2.6591121281541072, "language_loss": 1.02064562, "learning_rate": 2.9415714941751377e-06, "loss": 1.05380046, "num_input_tokens_seen": 1291365, "step": 58, "time_per_iteration": 2.80631947517395 }, { "auxiliary_loss_clip": 0.02089877, "auxiliary_loss_mlp": 0.01228649, "balance_loss_clip": 1.41032386, "balance_loss_mlp": 1.08016181, "epoch": 0.007094330547706367, "flos": 25772513690880.0, "grad_norm": 2.5126778207248255, "language_loss": 0.93590796, "learning_rate": 2.9539554871897396e-06, "loss": 0.96909326, "num_input_tokens_seen": 1311535, "step": 59, "time_per_iteration": 2.8197238445281982 }, { "auxiliary_loss_clip": 0.02074519, "auxiliary_loss_mlp": 0.01213149, "balance_loss_clip": 1.4071002, "balance_loss_mlp": 1.07181454, "epoch": 0.007214573438345458, "flos": 21319015979520.0, "grad_norm": 2.1657645678408883, "language_loss": 0.9750675, "learning_rate": 2.9661313359851253e-06, "loss": 1.00794411, "num_input_tokens_seen": 1329420, "step": 60, "time_per_iteration": 2.7548983097076416 }, { "auxiliary_loss_clip": 0.0206097, "auxiliary_loss_mlp": 0.01208096, "balance_loss_clip": 1.40042186, "balance_loss_mlp": 1.06666601, "epoch": 0.007334816328984549, "flos": 24937088192640.0, "grad_norm": 2.447107206499474, "language_loss": 0.93989247, "learning_rate": 2.978105921839922e-06, "loss": 0.97258312, "num_input_tokens_seen": 1349965, "step": 61, "time_per_iteration": 2.8515472412109375 }, { "auxiliary_loss_clip": 0.02045748, "auxiliary_loss_mlp": 0.01193601, "balance_loss_clip": 1.39928341, "balance_loss_mlp": 1.06361508, "epoch": 0.00745505921962364, "flos": 18510586277760.0, "grad_norm": 2.2628650062593993, "language_loss": 0.72203124, "learning_rate": 2.9898857903302893e-06, "loss": 0.75442475, "num_input_tokens_seen": 1368915, "step": 62, "time_per_iteration": 2.8011655807495117 }, { "auxiliary_loss_clip": 0.02036683, "auxiliary_loss_mlp": 0.01200397, "balance_loss_clip": 1.39645278, "balance_loss_mlp": 1.06335354, "epoch": 0.007575302110262731, "flos": 18477656484480.0, "grad_norm": 3.1766018259275857, "language_loss": 0.88179195, "learning_rate": 3.001477172817253e-06, "loss": 0.91416276, "num_input_tokens_seen": 1386805, "step": 63, "time_per_iteration": 2.7530806064605713 }, { "auxiliary_loss_clip": 0.02040345, "auxiliary_loss_mlp": 0.01202791, "balance_loss_clip": 1.39826393, "balance_loss_mlp": 1.07423592, "epoch": 0.007695545000901822, "flos": 24973178382720.0, "grad_norm": 2.4657284504653796, "language_loss": 0.96154374, "learning_rate": 3.012886006241894e-06, "loss": 0.99397498, "num_input_tokens_seen": 1406190, "step": 64, "time_per_iteration": 2.8475453853607178 }, { "auxiliary_loss_clip": 0.02012405, "auxiliary_loss_mlp": 0.0119366, "balance_loss_clip": 1.38691652, "balance_loss_mlp": 1.07035041, "epoch": 0.007815787891540913, "flos": 21324223451520.0, "grad_norm": 1.9991685820894902, "language_loss": 0.88282537, "learning_rate": 3.0241179513858383e-06, "loss": 0.914886, "num_input_tokens_seen": 1425500, "step": 65, "time_per_iteration": 2.7386250495910645 }, { "auxiliary_loss_clip": 0.02003539, "auxiliary_loss_mlp": 0.01179024, "balance_loss_clip": 1.37830842, "balance_loss_mlp": 1.05647659, "epoch": 0.007936030782180003, "flos": 21575777374080.0, "grad_norm": 2.229825165253108, "language_loss": 0.87873471, "learning_rate": 3.035178409737647e-06, "loss": 0.91056037, "num_input_tokens_seen": 1442950, "step": 66, "time_per_iteration": 3.7674920558929443 }, { "auxiliary_loss_clip": 0.0197785, "auxiliary_loss_mlp": 0.01179481, "balance_loss_clip": 1.36962128, "balance_loss_mlp": 1.06475437, "epoch": 0.008056273672819095, "flos": 20120785159680.0, "grad_norm": 2.0235569430546168, "language_loss": 0.88794947, "learning_rate": 3.046072539090907e-06, "loss": 0.91952282, "num_input_tokens_seen": 1460915, "step": 67, "time_per_iteration": 4.620994329452515 }, { "auxiliary_loss_clip": 0.01975824, "auxiliary_loss_mlp": 0.01176868, "balance_loss_clip": 1.36495888, "balance_loss_mlp": 1.0657177, "epoch": 0.008176516563458186, "flos": 18333116156160.0, "grad_norm": 2.5070186835816815, "language_loss": 1.04548109, "learning_rate": 3.056805267986779e-06, "loss": 1.07700801, "num_input_tokens_seen": 1478385, "step": 68, "time_per_iteration": 2.800161361694336 }, { "auxiliary_loss_clip": 0.01950921, "auxiliary_loss_mlp": 0.01173359, "balance_loss_clip": 1.35912144, "balance_loss_mlp": 1.06692863, "epoch": 0.008296759454097276, "flos": 21872076664320.0, "grad_norm": 2.3185109693254993, "language_loss": 0.95364261, "learning_rate": 3.0673813091022194e-06, "loss": 0.98488545, "num_input_tokens_seen": 1497605, "step": 69, "time_per_iteration": 2.8168087005615234 }, { "auxiliary_loss_clip": 0.01694084, "auxiliary_loss_mlp": 0.01205752, "balance_loss_clip": 1.29890954, "balance_loss_mlp": 1.06842256, "epoch": 0.008417002344736368, "flos": 63408228036480.0, "grad_norm": 1.3499245316351247, "language_loss": 0.62147617, "learning_rate": 3.0778051716749317e-06, "loss": 0.65047455, "num_input_tokens_seen": 1561150, "step": 70, "time_per_iteration": 3.4276390075683594 }, { "auxiliary_loss_clip": 0.01919551, "auxiliary_loss_mlp": 0.01180379, "balance_loss_clip": 1.3435421, "balance_loss_mlp": 1.07633328, "epoch": 0.008537245235375458, "flos": 22966454286720.0, "grad_norm": 2.9357889082858977, "language_loss": 0.90367472, "learning_rate": 3.0880811730470094e-06, "loss": 0.93467402, "num_input_tokens_seen": 1580605, "step": 71, "time_per_iteration": 2.791699171066284 }, { "auxiliary_loss_clip": 0.01664234, "auxiliary_loss_mlp": 0.01175268, "balance_loss_clip": 1.28190291, "balance_loss_mlp": 1.04709399, "epoch": 0.008657488126014549, "flos": 61984046712960.0, "grad_norm": 1.1528580608414556, "language_loss": 0.58654857, "learning_rate": 3.098213449401257e-06, "loss": 0.6149435, "num_input_tokens_seen": 1647535, "step": 72, "time_per_iteration": 3.2722995281219482 }, { "auxiliary_loss_clip": 0.01893758, "auxiliary_loss_mlp": 0.0115481, "balance_loss_clip": 1.33117318, "balance_loss_mlp": 1.05014443, "epoch": 0.00877773101665364, "flos": 30296791152000.0, "grad_norm": 2.097757480071856, "language_loss": 0.98849636, "learning_rate": 3.1082059657570015e-06, "loss": 1.01898205, "num_input_tokens_seen": 1666770, "step": 73, "time_per_iteration": 2.9573159217834473 }, { "auxiliary_loss_clip": 0.01887045, "auxiliary_loss_mlp": 0.01157442, "balance_loss_clip": 1.33067703, "balance_loss_mlp": 1.06078672, "epoch": 0.00889797390729273, "flos": 23514056104320.0, "grad_norm": 2.3820051095163195, "language_loss": 0.96637833, "learning_rate": 3.1180625252858496e-06, "loss": 0.99682319, "num_input_tokens_seen": 1685200, "step": 74, "time_per_iteration": 2.7658281326293945 }, { "auxiliary_loss_clip": 0.0186253, "auxiliary_loss_mlp": 0.01168921, "balance_loss_clip": 1.31867814, "balance_loss_mlp": 1.06950092, "epoch": 0.009018216797931822, "flos": 23075838178560.0, "grad_norm": 3.5958232747645495, "language_loss": 0.80363262, "learning_rate": 3.1277867780021663e-06, "loss": 0.83394718, "num_input_tokens_seen": 1701835, "step": 75, "time_per_iteration": 2.811174154281616 }, { "auxiliary_loss_clip": 0.01841367, "auxiliary_loss_mlp": 0.01145317, "balance_loss_clip": 1.31616116, "balance_loss_mlp": 1.05943847, "epoch": 0.009138459688570914, "flos": 15918877284480.0, "grad_norm": 1.8322208760358931, "language_loss": 0.95819902, "learning_rate": 3.1373822288779824e-06, "loss": 0.98806578, "num_input_tokens_seen": 1718415, "step": 76, "time_per_iteration": 2.7934348583221436 }, { "auxiliary_loss_clip": 0.01849599, "auxiliary_loss_mlp": 0.01170548, "balance_loss_clip": 1.3156352, "balance_loss_mlp": 1.07408416, "epoch": 0.009258702579210003, "flos": 27016531372800.0, "grad_norm": 1.9056880690729283, "language_loss": 0.79426813, "learning_rate": 3.1468522454274533e-06, "loss": 0.82446957, "num_input_tokens_seen": 1738770, "step": 77, "time_per_iteration": 2.8460984230041504 }, { "auxiliary_loss_clip": 0.01822034, "auxiliary_loss_mlp": 0.01167028, "balance_loss_clip": 1.30575919, "balance_loss_mlp": 1.07304299, "epoch": 0.009378945469849095, "flos": 26903196984960.0, "grad_norm": 2.1665553481433077, "language_loss": 0.91903865, "learning_rate": 3.15620006480197e-06, "loss": 0.94892925, "num_input_tokens_seen": 1758040, "step": 78, "time_per_iteration": 2.782259941101074 }, { "auxiliary_loss_clip": 0.01825896, "auxiliary_loss_mlp": 0.01148958, "balance_loss_clip": 1.30329621, "balance_loss_mlp": 1.0603137, "epoch": 0.009499188360488187, "flos": 35694236327040.0, "grad_norm": 3.296738972822618, "language_loss": 0.74674737, "learning_rate": 3.1654288004333087e-06, "loss": 0.77649587, "num_input_tokens_seen": 1776705, "step": 79, "time_per_iteration": 2.8932721614837646 }, { "auxiliary_loss_clip": 0.01805992, "auxiliary_loss_mlp": 0.01146265, "balance_loss_clip": 1.29915977, "balance_loss_mlp": 1.06653786, "epoch": 0.009619431251127276, "flos": 21503201944320.0, "grad_norm": 3.2681294437199204, "language_loss": 0.76094091, "learning_rate": 3.1745414482589353e-06, "loss": 0.79046339, "num_input_tokens_seen": 1795915, "step": 80, "time_per_iteration": 2.7435810565948486 }, { "auxiliary_loss_clip": 0.01784892, "auxiliary_loss_mlp": 0.01153992, "balance_loss_clip": 1.29052162, "balance_loss_mlp": 1.07302499, "epoch": 0.009739674141766368, "flos": 17421056991360.0, "grad_norm": 3.9944259323804734, "language_loss": 0.87182081, "learning_rate": 3.1835408925606204e-06, "loss": 0.90120971, "num_input_tokens_seen": 1814055, "step": 81, "time_per_iteration": 2.8428235054016113 }, { "auxiliary_loss_clip": 0.01775799, "auxiliary_loss_mlp": 0.01145835, "balance_loss_clip": 1.28615808, "balance_loss_mlp": 1.06868315, "epoch": 0.00985991703240546, "flos": 27527109246720.0, "grad_norm": 2.119452305500667, "language_loss": 0.89250362, "learning_rate": 3.1924299114448214e-06, "loss": 0.92171991, "num_input_tokens_seen": 1834535, "step": 82, "time_per_iteration": 2.8540806770324707 }, { "auxiliary_loss_clip": 0.01778879, "auxiliary_loss_mlp": 0.01159706, "balance_loss_clip": 1.28671062, "balance_loss_mlp": 1.07778549, "epoch": 0.00998015992304455, "flos": 13808084509440.0, "grad_norm": 2.0821045700117815, "language_loss": 0.83279049, "learning_rate": 3.2012111819909055e-06, "loss": 0.86217642, "num_input_tokens_seen": 1851865, "step": 83, "time_per_iteration": 2.7830352783203125 }, { "auxiliary_loss_clip": 0.01767898, "auxiliary_loss_mlp": 0.01147947, "balance_loss_clip": 1.28383541, "balance_loss_mlp": 1.07098591, "epoch": 0.010100402813683641, "flos": 20191385341440.0, "grad_norm": 2.200420330636526, "language_loss": 0.95237505, "learning_rate": 3.2098872850910627e-06, "loss": 0.98153353, "num_input_tokens_seen": 1868540, "step": 84, "time_per_iteration": 2.820887565612793 }, { "auxiliary_loss_clip": 0.01769197, "auxiliary_loss_mlp": 0.01140347, "balance_loss_clip": 1.28550935, "balance_loss_mlp": 1.06476879, "epoch": 0.010220645704322733, "flos": 17201642762880.0, "grad_norm": 1.8923507372060147, "language_loss": 0.89354932, "learning_rate": 3.2184607100038194e-06, "loss": 0.92264473, "num_input_tokens_seen": 1887180, "step": 85, "time_per_iteration": 2.800595998764038 }, { "auxiliary_loss_clip": 0.01746566, "auxiliary_loss_mlp": 0.01129378, "balance_loss_clip": 1.27404165, "balance_loss_mlp": 1.05499101, "epoch": 0.010340888594961822, "flos": 21470415805440.0, "grad_norm": 2.378485622949192, "language_loss": 0.93350601, "learning_rate": 3.2269338586412414e-06, "loss": 0.96226549, "num_input_tokens_seen": 1904765, "step": 86, "time_per_iteration": 2.7614243030548096 }, { "auxiliary_loss_clip": 0.01739104, "auxiliary_loss_mlp": 0.01142659, "balance_loss_clip": 1.27134991, "balance_loss_mlp": 1.07518649, "epoch": 0.010461131485600914, "flos": 23002831785600.0, "grad_norm": 2.3198102440964106, "language_loss": 0.9631446, "learning_rate": 3.2353090496083106e-06, "loss": 0.99196231, "num_input_tokens_seen": 1922600, "step": 87, "time_per_iteration": 2.806793212890625 }, { "auxiliary_loss_clip": 0.01729518, "auxiliary_loss_mlp": 0.0112693, "balance_loss_clip": 1.26786017, "balance_loss_mlp": 1.0581224, "epoch": 0.010581374376240005, "flos": 33546850571520.0, "grad_norm": 2.9160813256824896, "language_loss": 0.81401861, "learning_rate": 3.2435885220114572e-06, "loss": 0.84258306, "num_input_tokens_seen": 1943950, "step": 88, "time_per_iteration": 2.8113462924957275 }, { "auxiliary_loss_clip": 0.01728568, "auxiliary_loss_mlp": 0.01132877, "balance_loss_clip": 1.26657844, "balance_loss_mlp": 1.06945801, "epoch": 0.010701617266879095, "flos": 21763087822080.0, "grad_norm": 2.8277232742236866, "language_loss": 0.93954146, "learning_rate": 3.2517744390519113e-06, "loss": 0.96815592, "num_input_tokens_seen": 1962815, "step": 89, "time_per_iteration": 2.8294315338134766 }, { "auxiliary_loss_clip": 0.01709919, "auxiliary_loss_mlp": 0.01115001, "balance_loss_clip": 1.25421834, "balance_loss_mlp": 1.05453777, "epoch": 0.010821860157518187, "flos": 19060199256960.0, "grad_norm": 2.0253127648094074, "language_loss": 0.7519933, "learning_rate": 3.259868891418298e-06, "loss": 0.78024244, "num_input_tokens_seen": 1980580, "step": 90, "time_per_iteration": 2.7273151874542236 }, { "auxiliary_loss_clip": 0.01710881, "auxiliary_loss_mlp": 0.0114234, "balance_loss_clip": 1.25902343, "balance_loss_mlp": 1.07205427, "epoch": 0.010942103048157278, "flos": 25447378757760.0, "grad_norm": 2.198992716400347, "language_loss": 0.8498674, "learning_rate": 3.2678739004917757e-06, "loss": 0.87839967, "num_input_tokens_seen": 2000315, "step": 91, "time_per_iteration": 2.851680278778076 }, { "auxiliary_loss_clip": 0.01704175, "auxiliary_loss_mlp": 0.01132637, "balance_loss_clip": 1.25302362, "balance_loss_mlp": 1.06979024, "epoch": 0.011062345938796368, "flos": 27493928058240.0, "grad_norm": 1.6217159425185765, "language_loss": 0.921597, "learning_rate": 3.275791421376029e-06, "loss": 0.94996512, "num_input_tokens_seen": 2023760, "step": 92, "time_per_iteration": 3.658412218093872 }, { "auxiliary_loss_clip": 0.016959, "auxiliary_loss_mlp": 0.01119514, "balance_loss_clip": 1.25192046, "balance_loss_mlp": 1.06255579, "epoch": 0.01118258882943546, "flos": 16071210864000.0, "grad_norm": 2.2343151093219045, "language_loss": 0.96135616, "learning_rate": 3.2836233457634622e-06, "loss": 0.9895103, "num_input_tokens_seen": 2041895, "step": 93, "time_per_iteration": 4.5933310985565186 }, { "auxiliary_loss_clip": 0.01690163, "auxiliary_loss_mlp": 0.0113285, "balance_loss_clip": 1.24838412, "balance_loss_mlp": 1.06389964, "epoch": 0.011302831720074551, "flos": 20668602458880.0, "grad_norm": 2.271387745778157, "language_loss": 0.85538971, "learning_rate": 3.2913715046481135e-06, "loss": 0.88361984, "num_input_tokens_seen": 2061640, "step": 94, "time_per_iteration": 2.7608888149261475 }, { "auxiliary_loss_clip": 0.01682073, "auxiliary_loss_mlp": 0.01111468, "balance_loss_clip": 1.24334979, "balance_loss_mlp": 1.05305505, "epoch": 0.011423074610713641, "flos": 13072238490240.0, "grad_norm": 2.2394329663049635, "language_loss": 0.88671839, "learning_rate": 3.299037670895023e-06, "loss": 0.91465378, "num_input_tokens_seen": 2078255, "step": 95, "time_per_iteration": 2.7603001594543457 }, { "auxiliary_loss_clip": 0.01685062, "auxiliary_loss_mlp": 0.01109305, "balance_loss_clip": 1.24709272, "balance_loss_mlp": 1.0501771, "epoch": 0.011543317501352733, "flos": 30335646689280.0, "grad_norm": 1.9716010671710815, "language_loss": 0.80162954, "learning_rate": 3.3066235616750667e-06, "loss": 0.82957315, "num_input_tokens_seen": 2099490, "step": 96, "time_per_iteration": 2.835517168045044 }, { "auxiliary_loss_clip": 0.01651588, "auxiliary_loss_mlp": 0.0110807, "balance_loss_clip": 1.22912633, "balance_loss_mlp": 1.05344832, "epoch": 0.011663560391991824, "flos": 15522962601600.0, "grad_norm": 2.426808710548288, "language_loss": 0.92517221, "learning_rate": 3.3141308407736276e-06, "loss": 0.95276886, "num_input_tokens_seen": 2116125, "step": 97, "time_per_iteration": 2.7461814880371094 }, { "auxiliary_loss_clip": 0.01649204, "auxiliary_loss_mlp": 0.01114609, "balance_loss_clip": 1.22166777, "balance_loss_mlp": 1.05996323, "epoch": 0.011783803282630914, "flos": 19902125116800.0, "grad_norm": 4.315403232198553, "language_loss": 0.86542857, "learning_rate": 3.321561120780869e-06, "loss": 0.89306664, "num_input_tokens_seen": 2134835, "step": 98, "time_per_iteration": 2.7719523906707764 }, { "auxiliary_loss_clip": 0.01652237, "auxiliary_loss_mlp": 0.01123995, "balance_loss_clip": 1.23327065, "balance_loss_mlp": 1.07018375, "epoch": 0.011904046173270006, "flos": 22340674517760.0, "grad_norm": 2.3121610371157866, "language_loss": 1.01338661, "learning_rate": 3.3289159651708192e-06, "loss": 1.0411489, "num_input_tokens_seen": 2152410, "step": 99, "time_per_iteration": 2.7645716667175293 }, { "auxiliary_loss_clip": 0.01643335, "auxiliary_loss_mlp": 0.01112099, "balance_loss_clip": 1.22376597, "balance_loss_mlp": 1.0565474, "epoch": 0.012024289063909096, "flos": 19100060375040.0, "grad_norm": 1.9825972759369996, "language_loss": 0.97755855, "learning_rate": 3.3361968902759768e-06, "loss": 1.00511289, "num_input_tokens_seen": 2172090, "step": 100, "time_per_iteration": 2.7129034996032715 }, { "auxiliary_loss_clip": 0.01625094, "auxiliary_loss_mlp": 0.01095759, "balance_loss_clip": 1.21462393, "balance_loss_mlp": 1.04707408, "epoch": 0.012144531954548187, "flos": 15012205159680.0, "grad_norm": 3.20056968164523, "language_loss": 0.93713617, "learning_rate": 3.343405367163663e-06, "loss": 0.96434474, "num_input_tokens_seen": 2189020, "step": 101, "time_per_iteration": 2.7386598587036133 }, { "auxiliary_loss_clip": 0.01636917, "auxiliary_loss_mlp": 0.01112425, "balance_loss_clip": 1.22053385, "balance_loss_mlp": 1.0616653, "epoch": 0.012264774845187279, "flos": 15122020014720.0, "grad_norm": 2.350256743794026, "language_loss": 0.81376719, "learning_rate": 3.350542823419951e-06, "loss": 0.84126055, "num_input_tokens_seen": 2205620, "step": 102, "time_per_iteration": 2.718799114227295 }, { "auxiliary_loss_clip": 0.01636278, "auxiliary_loss_mlp": 0.01100069, "balance_loss_clip": 1.22151434, "balance_loss_mlp": 1.04663968, "epoch": 0.012385017735826368, "flos": 13949248959360.0, "grad_norm": 3.617264296968206, "language_loss": 0.87479031, "learning_rate": 3.3576106448465615e-06, "loss": 0.90215373, "num_input_tokens_seen": 2219000, "step": 103, "time_per_iteration": 2.695237874984741 }, { "auxiliary_loss_clip": 0.01615814, "auxiliary_loss_mlp": 0.01108381, "balance_loss_clip": 1.20921195, "balance_loss_mlp": 1.05962396, "epoch": 0.01250526062646546, "flos": 23623260428160.0, "grad_norm": 2.098614039988764, "language_loss": 0.88140273, "learning_rate": 3.3646101770757797e-06, "loss": 0.9086448, "num_input_tokens_seen": 2237790, "step": 104, "time_per_iteration": 2.7375972270965576 }, { "auxiliary_loss_clip": 0.0160539, "auxiliary_loss_mlp": 0.01103918, "balance_loss_clip": 1.20787764, "balance_loss_mlp": 1.05251515, "epoch": 0.012625503517104552, "flos": 34640078958720.0, "grad_norm": 1.6674427748119232, "language_loss": 0.85632902, "learning_rate": 3.371542727108104e-06, "loss": 0.88342214, "num_input_tokens_seen": 2259965, "step": 105, "time_per_iteration": 2.864018678665161 }, { "auxiliary_loss_clip": 0.01613643, "auxiliary_loss_mlp": 0.01115266, "balance_loss_clip": 1.20845723, "balance_loss_mlp": 1.06376719, "epoch": 0.012745746407743641, "flos": 17821891837440.0, "grad_norm": 3.6040229704753215, "language_loss": 0.90128565, "learning_rate": 3.3784095647770114e-06, "loss": 0.92857468, "num_input_tokens_seen": 2278610, "step": 106, "time_per_iteration": 2.6798036098480225 }, { "auxiliary_loss_clip": 0.01602969, "auxiliary_loss_mlp": 0.01105155, "balance_loss_clip": 1.20407164, "balance_loss_mlp": 1.05179727, "epoch": 0.012865989298382733, "flos": 20595057361920.0, "grad_norm": 5.844317503764526, "language_loss": 0.88332534, "learning_rate": 3.3852119241449547e-06, "loss": 0.91040665, "num_input_tokens_seen": 2297730, "step": 107, "time_per_iteration": 2.7417352199554443 }, { "auxiliary_loss_clip": 0.01595791, "auxiliary_loss_mlp": 0.01107104, "balance_loss_clip": 1.19982409, "balance_loss_mlp": 1.05663037, "epoch": 0.012986232189021825, "flos": 23948969978880.0, "grad_norm": 2.5717829705124173, "language_loss": 0.96081209, "learning_rate": 3.3919510048344295e-06, "loss": 0.98784101, "num_input_tokens_seen": 2315740, "step": 108, "time_per_iteration": 2.7866291999816895 }, { "auxiliary_loss_clip": 0.01585557, "auxiliary_loss_mlp": 0.01115806, "balance_loss_clip": 1.19923759, "balance_loss_mlp": 1.0680989, "epoch": 0.013106475079660914, "flos": 23725425686400.0, "grad_norm": 2.1352850769786533, "language_loss": 0.86749071, "learning_rate": 3.3986279732976907e-06, "loss": 0.89450443, "num_input_tokens_seen": 2334215, "step": 109, "time_per_iteration": 2.7947704792022705 }, { "auxiliary_loss_clip": 0.01578879, "auxiliary_loss_mlp": 0.01101873, "balance_loss_clip": 1.19448781, "balance_loss_mlp": 1.05585861, "epoch": 0.013226717970300006, "flos": 21102438925440.0, "grad_norm": 1.9952479998999384, "language_loss": 0.9541949, "learning_rate": 3.4052439640284983e-06, "loss": 0.98100239, "num_input_tokens_seen": 2353130, "step": 110, "time_per_iteration": 2.774613380432129 }, { "auxiliary_loss_clip": 0.01579156, "auxiliary_loss_mlp": 0.01089863, "balance_loss_clip": 1.1911397, "balance_loss_mlp": 1.04446864, "epoch": 0.013346960860939098, "flos": 24863902231680.0, "grad_norm": 1.8123564720375656, "language_loss": 0.81381464, "learning_rate": 3.4118000807190217e-06, "loss": 0.84050488, "num_input_tokens_seen": 2374010, "step": 111, "time_per_iteration": 2.770792245864868 }, { "auxiliary_loss_clip": 0.01581371, "auxiliary_loss_mlp": 0.01097763, "balance_loss_clip": 1.19124568, "balance_loss_mlp": 1.054919, "epoch": 0.013467203751578187, "flos": 28181940140160.0, "grad_norm": 1.8752645698568884, "language_loss": 0.75926155, "learning_rate": 3.4182973973648723e-06, "loss": 0.78605288, "num_input_tokens_seen": 2395220, "step": 112, "time_per_iteration": 2.8275082111358643 }, { "auxiliary_loss_clip": 0.01573474, "auxiliary_loss_mlp": 0.01100741, "balance_loss_clip": 1.18915594, "balance_loss_mlp": 1.05613327, "epoch": 0.013587446642217279, "flos": 18916233546240.0, "grad_norm": 2.569109691972843, "language_loss": 0.95035571, "learning_rate": 3.424736959321014e-06, "loss": 0.97709781, "num_input_tokens_seen": 2413025, "step": 113, "time_per_iteration": 2.711380958557129 }, { "auxiliary_loss_clip": 0.01579039, "auxiliary_loss_mlp": 0.011035, "balance_loss_clip": 1.19279599, "balance_loss_mlp": 1.05827212, "epoch": 0.01370768953285637, "flos": 23988615615360.0, "grad_norm": 2.146199732362796, "language_loss": 0.88576341, "learning_rate": 3.431119784311155e-06, "loss": 0.91258878, "num_input_tokens_seen": 2432700, "step": 114, "time_per_iteration": 2.7693426609039307 }, { "auxiliary_loss_clip": 0.01565014, "auxiliary_loss_mlp": 0.01101605, "balance_loss_clip": 1.18894136, "balance_loss_mlp": 1.05773604, "epoch": 0.01382793242349546, "flos": 39202565512320.0, "grad_norm": 2.1666806059867594, "language_loss": 0.77601242, "learning_rate": 3.43744686339307e-06, "loss": 0.80267859, "num_input_tokens_seen": 2455020, "step": 115, "time_per_iteration": 2.8842124938964844 }, { "auxiliary_loss_clip": 0.01549065, "auxiliary_loss_mlp": 0.01078877, "balance_loss_clip": 1.17411411, "balance_loss_mlp": 1.04165959, "epoch": 0.013948175314134552, "flos": 41353506714240.0, "grad_norm": 2.1122867395809837, "language_loss": 0.90540463, "learning_rate": 3.44371916188212e-06, "loss": 0.93168402, "num_input_tokens_seen": 2475775, "step": 116, "time_per_iteration": 2.9128360748291016 }, { "auxiliary_loss_clip": 0.01553238, "auxiliary_loss_mlp": 0.01085621, "balance_loss_clip": 1.17802, "balance_loss_mlp": 1.04931021, "epoch": 0.014068418204773643, "flos": 22453542028800.0, "grad_norm": 2.0338129181752675, "language_loss": 0.85834879, "learning_rate": 3.449937620235143e-06, "loss": 0.88473737, "num_input_tokens_seen": 2496370, "step": 117, "time_per_iteration": 2.7472524642944336 }, { "auxiliary_loss_clip": 0.01540398, "auxiliary_loss_mlp": 0.01096417, "balance_loss_clip": 1.1737566, "balance_loss_mlp": 1.05784118, "epoch": 0.014188661095412733, "flos": 23805147922560.0, "grad_norm": 1.8402946869425383, "language_loss": 0.89322424, "learning_rate": 3.456103154896722e-06, "loss": 0.91959238, "num_input_tokens_seen": 2517645, "step": 118, "time_per_iteration": 3.6204569339752197 }, { "auxiliary_loss_clip": 0.01534657, "auxiliary_loss_mlp": 0.01094205, "balance_loss_clip": 1.17189145, "balance_loss_mlp": 1.05212426, "epoch": 0.014308903986051825, "flos": 23660248458240.0, "grad_norm": 1.8717561131930636, "language_loss": 0.92343897, "learning_rate": 3.462216659109757e-06, "loss": 0.94972754, "num_input_tokens_seen": 2537825, "step": 119, "time_per_iteration": 4.65583872795105 }, { "auxiliary_loss_clip": 0.01544189, "auxiliary_loss_mlp": 0.01094502, "balance_loss_clip": 1.17379403, "balance_loss_mlp": 1.05435193, "epoch": 0.014429146876690916, "flos": 20667991927680.0, "grad_norm": 2.4130892512528015, "language_loss": 0.85072464, "learning_rate": 3.4682790036921077e-06, "loss": 0.87711149, "num_input_tokens_seen": 2556485, "step": 120, "time_per_iteration": 3.699442148208618 }, { "auxiliary_loss_clip": 0.01518442, "auxiliary_loss_mlp": 0.01082955, "balance_loss_clip": 1.16455662, "balance_loss_mlp": 1.04931378, "epoch": 0.014549389767330006, "flos": 20229199384320.0, "grad_norm": 1.919712002858095, "language_loss": 0.83186501, "learning_rate": 3.4742910377810193e-06, "loss": 0.85787892, "num_input_tokens_seen": 2573945, "step": 121, "time_per_iteration": 2.709707736968994 }, { "auxiliary_loss_clip": 0.01524272, "auxiliary_loss_mlp": 0.0110188, "balance_loss_clip": 1.16599607, "balance_loss_mlp": 1.06661797, "epoch": 0.014669632657969098, "flos": 18004174381440.0, "grad_norm": 3.2851498112503377, "language_loss": 0.88705194, "learning_rate": 3.4802535895469042e-06, "loss": 0.91331339, "num_input_tokens_seen": 2592695, "step": 122, "time_per_iteration": 2.670933723449707 }, { "auxiliary_loss_clip": 0.01522686, "auxiliary_loss_mlp": 0.01093607, "balance_loss_clip": 1.16536188, "balance_loss_mlp": 1.05636621, "epoch": 0.01478987554860819, "flos": 22741796672640.0, "grad_norm": 2.0979371559302504, "language_loss": 0.89413953, "learning_rate": 3.4861674668779934e-06, "loss": 0.92030245, "num_input_tokens_seen": 2610925, "step": 123, "time_per_iteration": 2.747762441635132 }, { "auxiliary_loss_clip": 0.01518753, "auxiliary_loss_mlp": 0.01094183, "balance_loss_clip": 1.16223788, "balance_loss_mlp": 1.05925477, "epoch": 0.01491011843924728, "flos": 17198590106880.0, "grad_norm": 2.125410033053841, "language_loss": 0.84308249, "learning_rate": 3.492033458037272e-06, "loss": 0.86921191, "num_input_tokens_seen": 2629495, "step": 124, "time_per_iteration": 2.7083122730255127 }, { "auxiliary_loss_clip": 0.01508444, "auxiliary_loss_mlp": 0.01089675, "balance_loss_clip": 1.15483034, "balance_loss_mlp": 1.05603445, "epoch": 0.01503036132988637, "flos": 17673867889920.0, "grad_norm": 2.4145961821687947, "language_loss": 0.8689959, "learning_rate": 3.497852332293018e-06, "loss": 0.89497715, "num_input_tokens_seen": 2645070, "step": 125, "time_per_iteration": 2.7551441192626953 }, { "auxiliary_loss_clip": 0.01509738, "auxiliary_loss_mlp": 0.01090909, "balance_loss_clip": 1.1598649, "balance_loss_mlp": 1.05741096, "epoch": 0.015150604220525462, "flos": 18878239935360.0, "grad_norm": 3.1882841277881284, "language_loss": 0.96611857, "learning_rate": 3.5036248405242356e-06, "loss": 0.99212503, "num_input_tokens_seen": 2663825, "step": 126, "time_per_iteration": 2.7205774784088135 }, { "auxiliary_loss_clip": 0.0150564, "auxiliary_loss_mlp": 0.01113146, "balance_loss_clip": 1.15599477, "balance_loss_mlp": 1.07325864, "epoch": 0.015270847111164552, "flos": 39420184060800.0, "grad_norm": 2.0125373774373183, "language_loss": 0.82604873, "learning_rate": 3.509351715802146e-06, "loss": 0.85223657, "num_input_tokens_seen": 2684710, "step": 127, "time_per_iteration": 2.903897762298584 }, { "auxiliary_loss_clip": 0.01493251, "auxiliary_loss_mlp": 0.01082611, "balance_loss_clip": 1.14927626, "balance_loss_mlp": 1.04711056, "epoch": 0.015391090001803644, "flos": 43762466286720.0, "grad_norm": 2.1210038515489344, "language_loss": 0.78434384, "learning_rate": 3.5150336739488763e-06, "loss": 0.81010252, "num_input_tokens_seen": 2706995, "step": 128, "time_per_iteration": 2.8854024410247803 }, { "auxiliary_loss_clip": 0.01495778, "auxiliary_loss_mlp": 0.01086586, "balance_loss_clip": 1.15144837, "balance_loss_mlp": 1.05652153, "epoch": 0.015511332892442733, "flos": 18916341287040.0, "grad_norm": 1.9920031885938865, "language_loss": 0.83905834, "learning_rate": 3.5206714140744143e-06, "loss": 0.86488199, "num_input_tokens_seen": 2727050, "step": 129, "time_per_iteration": 2.7462501525878906 }, { "auxiliary_loss_clip": 0.01500479, "auxiliary_loss_mlp": 0.01099146, "balance_loss_clip": 1.1551038, "balance_loss_mlp": 1.06297779, "epoch": 0.015631575783081827, "flos": 24535283679360.0, "grad_norm": 2.7927649198648914, "language_loss": 0.87461013, "learning_rate": 3.5262656190928208e-06, "loss": 0.90060633, "num_input_tokens_seen": 2745350, "step": 130, "time_per_iteration": 2.845433235168457 }, { "auxiliary_loss_clip": 0.01513118, "auxiliary_loss_mlp": 0.01090015, "balance_loss_clip": 1.22729838, "balance_loss_mlp": 1.06273985, "epoch": 0.015751818673720917, "flos": 62328536098560.0, "grad_norm": 1.053613045616893, "language_loss": 0.71592617, "learning_rate": 3.5318169562186737e-06, "loss": 0.74195743, "num_input_tokens_seen": 2814195, "step": 131, "time_per_iteration": 3.3020341396331787 }, { "auxiliary_loss_clip": 0.01490268, "auxiliary_loss_mlp": 0.01096562, "balance_loss_clip": 1.14874685, "balance_loss_mlp": 1.064399, "epoch": 0.015872061564360006, "flos": 23878549365120.0, "grad_norm": 2.3704484556815, "language_loss": 0.82033777, "learning_rate": 3.5373260774446292e-06, "loss": 0.84620613, "num_input_tokens_seen": 2834645, "step": 132, "time_per_iteration": 2.75390625 }, { "auxiliary_loss_clip": 0.01481003, "auxiliary_loss_mlp": 0.01094375, "balance_loss_clip": 1.1441102, "balance_loss_mlp": 1.06228364, "epoch": 0.0159923044549991, "flos": 23367899664000.0, "grad_norm": 1.832005124637457, "language_loss": 0.90313101, "learning_rate": 3.542793620000961e-06, "loss": 0.92888486, "num_input_tokens_seen": 2854120, "step": 133, "time_per_iteration": 2.716811418533325 }, { "auxiliary_loss_clip": 0.01486233, "auxiliary_loss_mlp": 0.01099182, "balance_loss_clip": 1.14925385, "balance_loss_mlp": 1.06582761, "epoch": 0.01611254734563819, "flos": 17858305249920.0, "grad_norm": 5.059662976380581, "language_loss": 0.86841142, "learning_rate": 3.5482202067978894e-06, "loss": 0.89426553, "num_input_tokens_seen": 2871330, "step": 134, "time_per_iteration": 2.743974447250366 }, { "auxiliary_loss_clip": 0.01484045, "auxiliary_loss_mlp": 0.01089784, "balance_loss_clip": 1.1492188, "balance_loss_mlp": 1.05826473, "epoch": 0.01623279023627728, "flos": 20954774113920.0, "grad_norm": 1.9500121656249, "language_loss": 0.7600261, "learning_rate": 3.553606446851471e-06, "loss": 0.78576446, "num_input_tokens_seen": 2888070, "step": 135, "time_per_iteration": 2.6938154697418213 }, { "auxiliary_loss_clip": 0.01479727, "auxiliary_loss_mlp": 0.01082928, "balance_loss_clip": 1.144701, "balance_loss_mlp": 1.05357885, "epoch": 0.016353033126916373, "flos": 15742412743680.0, "grad_norm": 1.9634076313710365, "language_loss": 0.83299983, "learning_rate": 3.5589529356937613e-06, "loss": 0.85862637, "num_input_tokens_seen": 2906465, "step": 136, "time_per_iteration": 2.69671630859375 }, { "auxiliary_loss_clip": 0.0147532, "auxiliary_loss_mlp": 0.01087005, "balance_loss_clip": 1.14303315, "balance_loss_mlp": 1.05688047, "epoch": 0.016473276017555463, "flos": 18807280617600.0, "grad_norm": 2.0032207055993547, "language_loss": 0.7676419, "learning_rate": 3.5642602557679627e-06, "loss": 0.79326516, "num_input_tokens_seen": 2924915, "step": 137, "time_per_iteration": 2.7246298789978027 }, { "auxiliary_loss_clip": 0.01479305, "auxiliary_loss_mlp": 0.01088637, "balance_loss_clip": 1.15059805, "balance_loss_mlp": 1.0598954, "epoch": 0.016593518908194552, "flos": 24352641999360.0, "grad_norm": 2.2814956466904905, "language_loss": 0.84126949, "learning_rate": 3.569528976809202e-06, "loss": 0.86694896, "num_input_tokens_seen": 2942130, "step": 138, "time_per_iteration": 2.9137814044952393 }, { "auxiliary_loss_clip": 0.01468537, "auxiliary_loss_mlp": 0.01073625, "balance_loss_clip": 1.14031768, "balance_loss_mlp": 1.04510999, "epoch": 0.016713761798833646, "flos": 22346133384960.0, "grad_norm": 1.7302916707947082, "language_loss": 0.89940566, "learning_rate": 3.5747596562115522e-06, "loss": 0.92482728, "num_input_tokens_seen": 2962745, "step": 139, "time_per_iteration": 2.8356661796569824 }, { "auxiliary_loss_clip": 0.01476201, "auxiliary_loss_mlp": 0.01072274, "balance_loss_clip": 1.14602876, "balance_loss_mlp": 1.03899121, "epoch": 0.016834004689472735, "flos": 17821820010240.0, "grad_norm": 11.659725848067133, "language_loss": 0.90954, "learning_rate": 3.5799528393819138e-06, "loss": 0.93502474, "num_input_tokens_seen": 2981825, "step": 140, "time_per_iteration": 2.7296414375305176 }, { "auxiliary_loss_clip": 0.01462237, "auxiliary_loss_mlp": 0.01093168, "balance_loss_clip": 1.13566709, "balance_loss_mlp": 1.06354427, "epoch": 0.016954247580111825, "flos": 20519501103360.0, "grad_norm": 1.9649385670469819, "language_loss": 0.88076293, "learning_rate": 3.585109060081286e-06, "loss": 0.90631694, "num_input_tokens_seen": 3001625, "step": 141, "time_per_iteration": 2.728119134902954 }, { "auxiliary_loss_clip": 0.01467646, "auxiliary_loss_mlp": 0.01091003, "balance_loss_clip": 1.13944304, "balance_loss_mlp": 1.06133199, "epoch": 0.017074490470750915, "flos": 22088869200000.0, "grad_norm": 1.8505606643583166, "language_loss": 0.78662086, "learning_rate": 3.590228840753992e-06, "loss": 0.81220734, "num_input_tokens_seen": 3022055, "step": 142, "time_per_iteration": 2.8031117916107178 }, { "auxiliary_loss_clip": 0.01466192, "auxiliary_loss_mlp": 0.01068637, "balance_loss_clip": 1.14059305, "balance_loss_mlp": 1.04185033, "epoch": 0.01719473336139001, "flos": 15997270717440.0, "grad_norm": 2.0564516927045267, "language_loss": 0.87326121, "learning_rate": 3.5953126928453423e-06, "loss": 0.89860952, "num_input_tokens_seen": 3039605, "step": 143, "time_per_iteration": 2.743452548980713 }, { "auxiliary_loss_clip": 0.01455996, "auxiliary_loss_mlp": 0.01080635, "balance_loss_clip": 1.13414657, "balance_loss_mlp": 1.05251336, "epoch": 0.017314976252029098, "flos": 22492038430080.0, "grad_norm": 2.0381563751313254, "language_loss": 0.80287719, "learning_rate": 3.600361117108239e-06, "loss": 0.82824349, "num_input_tokens_seen": 3059405, "step": 144, "time_per_iteration": 2.72440505027771 }, { "auxiliary_loss_clip": 0.01456845, "auxiliary_loss_mlp": 0.01083546, "balance_loss_clip": 1.13420248, "balance_loss_mlp": 1.05565143, "epoch": 0.017435219142668188, "flos": 22018053536640.0, "grad_norm": 2.084531643465103, "language_loss": 0.97235394, "learning_rate": 3.6053746038991616e-06, "loss": 0.99775779, "num_input_tokens_seen": 3078490, "step": 145, "time_per_iteration": 4.572920560836792 }, { "auxiliary_loss_clip": 0.0143102, "auxiliary_loss_mlp": 0.01019412, "balance_loss_clip": 1.17987156, "balance_loss_mlp": 1.00081575, "epoch": 0.01755546203330728, "flos": 72240526149120.0, "grad_norm": 1.0501163205754471, "language_loss": 0.58482635, "learning_rate": 3.6103536334639843e-06, "loss": 0.60933065, "num_input_tokens_seen": 3131755, "step": 146, "time_per_iteration": 4.197690486907959 }, { "auxiliary_loss_clip": 0.01445533, "auxiliary_loss_mlp": 0.01083313, "balance_loss_clip": 1.13059616, "balance_loss_mlp": 1.05572844, "epoch": 0.01767570492394637, "flos": 25337061112320.0, "grad_norm": 2.382601794250895, "language_loss": 0.85594136, "learning_rate": 3.615298676214041e-06, "loss": 0.88122976, "num_input_tokens_seen": 3152035, "step": 147, "time_per_iteration": 2.766993999481201 }, { "auxiliary_loss_clip": 0.01441292, "auxiliary_loss_mlp": 0.01071382, "balance_loss_clip": 1.12834907, "balance_loss_mlp": 1.04333234, "epoch": 0.01779594781458546, "flos": 20449188230400.0, "grad_norm": 2.961288353323809, "language_loss": 0.88844216, "learning_rate": 3.6202101929928317e-06, "loss": 0.91356885, "num_input_tokens_seen": 3170625, "step": 148, "time_per_iteration": 2.7267377376556396 }, { "auxiliary_loss_clip": 0.01447279, "auxiliary_loss_mlp": 0.01086578, "balance_loss_clip": 1.13072133, "balance_loss_mlp": 1.06135368, "epoch": 0.017916190705224554, "flos": 16253601148800.0, "grad_norm": 2.436463314393052, "language_loss": 0.88274872, "learning_rate": 3.6250886353337413e-06, "loss": 0.90808731, "num_input_tokens_seen": 3188155, "step": 149, "time_per_iteration": 2.691784620285034 }, { "auxiliary_loss_clip": 0.01444672, "auxiliary_loss_mlp": 0.01080986, "balance_loss_clip": 1.12895632, "balance_loss_mlp": 1.05365133, "epoch": 0.018036433595863644, "flos": 23330588411520.0, "grad_norm": 2.3329753000018636, "language_loss": 0.86464584, "learning_rate": 3.6299344457091488e-06, "loss": 0.88990235, "num_input_tokens_seen": 3209015, "step": 150, "time_per_iteration": 2.7465267181396484 }, { "auxiliary_loss_clip": 0.01437247, "auxiliary_loss_mlp": 0.01076502, "balance_loss_clip": 1.12654889, "balance_loss_mlp": 1.04945302, "epoch": 0.018156676486502734, "flos": 18588010043520.0, "grad_norm": 2.0579689237579877, "language_loss": 0.9378795, "learning_rate": 3.634748057771256e-06, "loss": 0.96301699, "num_input_tokens_seen": 3224955, "step": 151, "time_per_iteration": 2.7523319721221924 }, { "auxiliary_loss_clip": 0.01436746, "auxiliary_loss_mlp": 0.01067535, "balance_loss_clip": 1.12638187, "balance_loss_mlp": 1.04172659, "epoch": 0.018276919377141827, "flos": 25448707560960.0, "grad_norm": 2.482481565513713, "language_loss": 0.85601401, "learning_rate": 3.639529896584965e-06, "loss": 0.88105685, "num_input_tokens_seen": 3246330, "step": 152, "time_per_iteration": 2.812182903289795 }, { "auxiliary_loss_clip": 0.01435856, "auxiliary_loss_mlp": 0.01071972, "balance_loss_clip": 1.12612534, "balance_loss_mlp": 1.04574609, "epoch": 0.018397162267780917, "flos": 20047311889920.0, "grad_norm": 3.061464646453129, "language_loss": 0.88891762, "learning_rate": 3.6442803788531233e-06, "loss": 0.91399592, "num_input_tokens_seen": 3264290, "step": 153, "time_per_iteration": 2.7511959075927734 }, { "auxiliary_loss_clip": 0.01439962, "auxiliary_loss_mlp": 0.01077652, "balance_loss_clip": 1.12550211, "balance_loss_mlp": 1.0514977, "epoch": 0.018517405158420007, "flos": 27565282425600.0, "grad_norm": 2.473585892953188, "language_loss": 0.96023136, "learning_rate": 3.6489999131344357e-06, "loss": 0.98540747, "num_input_tokens_seen": 3287065, "step": 154, "time_per_iteration": 2.837027072906494 }, { "auxiliary_loss_clip": 0.01428734, "auxiliary_loss_mlp": 0.01066119, "balance_loss_clip": 1.12491441, "balance_loss_mlp": 1.04264688, "epoch": 0.0186376480490591, "flos": 19354056422400.0, "grad_norm": 1.8744212005900394, "language_loss": 0.90426761, "learning_rate": 3.653688900054313e-06, "loss": 0.92921615, "num_input_tokens_seen": 3305595, "step": 155, "time_per_iteration": 2.6998584270477295 }, { "auxiliary_loss_clip": 0.01430692, "auxiliary_loss_mlp": 0.01074298, "balance_loss_clip": 1.12151444, "balance_loss_mlp": 1.04810786, "epoch": 0.01875789093969819, "flos": 26687840993280.0, "grad_norm": 2.143159911420179, "language_loss": 0.75835758, "learning_rate": 3.6583477325089526e-06, "loss": 0.78340751, "num_input_tokens_seen": 3326135, "step": 156, "time_per_iteration": 2.798849105834961 }, { "auxiliary_loss_clip": 0.01429337, "auxiliary_loss_mlp": 0.01069369, "balance_loss_clip": 1.12481165, "balance_loss_mlp": 1.04133058, "epoch": 0.01887813383033728, "flos": 24353001135360.0, "grad_norm": 2.2798337698658626, "language_loss": 1.04201221, "learning_rate": 3.6629767958628916e-06, "loss": 1.06699944, "num_input_tokens_seen": 3343510, "step": 157, "time_per_iteration": 2.7249958515167236 }, { "auxiliary_loss_clip": 0.0143294, "auxiliary_loss_mlp": 0.01086132, "balance_loss_clip": 1.12559891, "balance_loss_mlp": 1.0616349, "epoch": 0.018998376720976373, "flos": 14647532330880.0, "grad_norm": 2.208392420949861, "language_loss": 0.85842657, "learning_rate": 3.667576468140291e-06, "loss": 0.88361734, "num_input_tokens_seen": 3361325, "step": 158, "time_per_iteration": 2.7555198669433594 }, { "auxiliary_loss_clip": 0.01423382, "auxiliary_loss_mlp": 0.01067781, "balance_loss_clip": 1.11920881, "balance_loss_mlp": 1.04265153, "epoch": 0.019118619611615463, "flos": 29305261146240.0, "grad_norm": 3.1108215275087976, "language_loss": 0.88954639, "learning_rate": 3.672147120210184e-06, "loss": 0.91445804, "num_input_tokens_seen": 3377925, "step": 159, "time_per_iteration": 2.744236946105957 }, { "auxiliary_loss_clip": 0.01421746, "auxiliary_loss_mlp": 0.01078553, "balance_loss_clip": 1.12347507, "balance_loss_mlp": 1.05454397, "epoch": 0.019238862502254553, "flos": 20886723797760.0, "grad_norm": 2.0268856099516204, "language_loss": 0.86402106, "learning_rate": 3.6766891159659177e-06, "loss": 0.88902396, "num_input_tokens_seen": 3396335, "step": 160, "time_per_iteration": 2.752629518508911 }, { "auxiliary_loss_clip": 0.01424919, "auxiliary_loss_mlp": 0.01069441, "balance_loss_clip": 1.1219734, "balance_loss_mlp": 1.0442518, "epoch": 0.019359105392893646, "flos": 21360672777600.0, "grad_norm": 4.069574723431663, "language_loss": 0.8775329, "learning_rate": 3.6812028124990075e-06, "loss": 0.90247649, "num_input_tokens_seen": 3413605, "step": 161, "time_per_iteration": 2.6554267406463623 }, { "auxiliary_loss_clip": 0.01413899, "auxiliary_loss_mlp": 0.01065284, "balance_loss_clip": 1.11526823, "balance_loss_mlp": 1.04047656, "epoch": 0.019479348283532736, "flos": 16283729681280.0, "grad_norm": 3.645914814587508, "language_loss": 0.81561714, "learning_rate": 3.6856885602676016e-06, "loss": 0.84040892, "num_input_tokens_seen": 3429640, "step": 162, "time_per_iteration": 2.7112302780151367 }, { "auxiliary_loss_clip": 0.0141921, "auxiliary_loss_mlp": 0.01070403, "balance_loss_clip": 1.11683297, "balance_loss_mlp": 1.04674006, "epoch": 0.019599591174171826, "flos": 22091239497600.0, "grad_norm": 2.1810945709548357, "language_loss": 0.94150376, "learning_rate": 3.6901467032597733e-06, "loss": 0.96639991, "num_input_tokens_seen": 3448125, "step": 163, "time_per_iteration": 2.733766794204712 }, { "auxiliary_loss_clip": 0.01419667, "auxiliary_loss_mlp": 0.01074439, "balance_loss_clip": 1.12046766, "balance_loss_mlp": 1.04805756, "epoch": 0.01971983406481092, "flos": 19609668581760.0, "grad_norm": 2.109186211048345, "language_loss": 0.87333548, "learning_rate": 3.694577579151804e-06, "loss": 0.89827651, "num_input_tokens_seen": 3466535, "step": 164, "time_per_iteration": 2.6854336261749268 }, { "auxiliary_loss_clip": 0.01418396, "auxiliary_loss_mlp": 0.01064735, "balance_loss_clip": 1.12055302, "balance_loss_mlp": 1.03986788, "epoch": 0.01984007695545001, "flos": 19099342103040.0, "grad_norm": 2.3304050801001117, "language_loss": 0.73676556, "learning_rate": 3.6989815194616703e-06, "loss": 0.76159692, "num_input_tokens_seen": 3483730, "step": 165, "time_per_iteration": 2.759139060974121 }, { "auxiliary_loss_clip": 0.01420701, "auxiliary_loss_mlp": 0.01068829, "balance_loss_clip": 1.11759734, "balance_loss_mlp": 1.04384291, "epoch": 0.0199603198460891, "flos": 20848406964480.0, "grad_norm": 2.2489207654864676, "language_loss": 0.79755867, "learning_rate": 3.703358849697888e-06, "loss": 0.82245398, "num_input_tokens_seen": 3503640, "step": 166, "time_per_iteration": 2.700531005859375 }, { "auxiliary_loss_clip": 0.0141159, "auxiliary_loss_mlp": 0.0106641, "balance_loss_clip": 1.11741471, "balance_loss_mlp": 1.04410613, "epoch": 0.020080562736728192, "flos": 21870747861120.0, "grad_norm": 1.7294341119100956, "language_loss": 0.82596946, "learning_rate": 3.7077098895038803e-06, "loss": 0.85074943, "num_input_tokens_seen": 3523010, "step": 167, "time_per_iteration": 2.7318639755249023 }, { "auxiliary_loss_clip": 0.01416088, "auxiliary_loss_mlp": 0.01066235, "balance_loss_clip": 1.11691654, "balance_loss_mlp": 1.04090285, "epoch": 0.020200805627367282, "flos": 21688788539520.0, "grad_norm": 3.322848819909023, "language_loss": 0.96673244, "learning_rate": 3.712034952798045e-06, "loss": 0.99155569, "num_input_tokens_seen": 3541125, "step": 168, "time_per_iteration": 2.7357845306396484 }, { "auxiliary_loss_clip": 0.01413543, "auxiliary_loss_mlp": 0.01069605, "balance_loss_clip": 1.11691141, "balance_loss_mlp": 1.04273558, "epoch": 0.02032104851800637, "flos": 33543043729920.0, "grad_norm": 2.1757137189965086, "language_loss": 0.84556484, "learning_rate": 3.7163343479096656e-06, "loss": 0.87039638, "num_input_tokens_seen": 3562700, "step": 169, "time_per_iteration": 2.817506790161133 }, { "auxiliary_loss_clip": 0.01410345, "auxiliary_loss_mlp": 0.01068796, "balance_loss_clip": 1.1162976, "balance_loss_mlp": 1.04572892, "epoch": 0.020441291408645465, "flos": 31686965274240.0, "grad_norm": 2.0076367521977145, "language_loss": 0.82739466, "learning_rate": 3.720608377710802e-06, "loss": 0.85218614, "num_input_tokens_seen": 3582790, "step": 170, "time_per_iteration": 3.701653480529785 }, { "auxiliary_loss_clip": 0.01405187, "auxiliary_loss_mlp": 0.01066815, "balance_loss_clip": 1.11157393, "balance_loss_mlp": 1.04135156, "epoch": 0.020561534299284555, "flos": 20886687884160.0, "grad_norm": 2.1410121371794224, "language_loss": 0.86637318, "learning_rate": 3.7248573397443277e-06, "loss": 0.89109319, "num_input_tokens_seen": 3601715, "step": 171, "time_per_iteration": 3.607347249984741 }, { "auxiliary_loss_clip": 0.01410584, "auxiliary_loss_mlp": 0.01079611, "balance_loss_clip": 1.11724651, "balance_loss_mlp": 1.0536356, "epoch": 0.020681777189923645, "flos": 20996610480000.0, "grad_norm": 2.013478207570808, "language_loss": 0.97540963, "learning_rate": 3.729081526348224e-06, "loss": 1.00031161, "num_input_tokens_seen": 3620245, "step": 172, "time_per_iteration": 4.59281587600708 }, { "auxiliary_loss_clip": 0.01406025, "auxiliary_loss_mlp": 0.01068682, "balance_loss_clip": 1.11373091, "balance_loss_mlp": 1.04472065, "epoch": 0.020802020080562738, "flos": 28257532312320.0, "grad_norm": 1.8447003191776574, "language_loss": 0.85117358, "learning_rate": 3.7332812247762777e-06, "loss": 0.87592059, "num_input_tokens_seen": 3641545, "step": 173, "time_per_iteration": 2.7831366062164307 }, { "auxiliary_loss_clip": 0.01403997, "auxiliary_loss_mlp": 0.0107497, "balance_loss_clip": 1.11315608, "balance_loss_mlp": 1.05270207, "epoch": 0.020922262971201828, "flos": 19681274344320.0, "grad_norm": 2.586859632048097, "language_loss": 0.95557338, "learning_rate": 3.737456717315293e-06, "loss": 0.98036307, "num_input_tokens_seen": 3660510, "step": 174, "time_per_iteration": 2.6288578510284424 }, { "auxiliary_loss_clip": 0.01401339, "auxiliary_loss_mlp": 0.0108346, "balance_loss_clip": 1.11330152, "balance_loss_mlp": 1.06052458, "epoch": 0.021042505861840918, "flos": 15666353694720.0, "grad_norm": 1.7004689553143786, "language_loss": 0.90733087, "learning_rate": 3.7416082813989552e-06, "loss": 0.93217891, "num_input_tokens_seen": 3677505, "step": 175, "time_per_iteration": 2.7079241275787354 }, { "auxiliary_loss_clip": 0.01403174, "auxiliary_loss_mlp": 0.01088252, "balance_loss_clip": 1.11100888, "balance_loss_mlp": 1.0629437, "epoch": 0.02116274875248001, "flos": 21142012734720.0, "grad_norm": 2.143233810620858, "language_loss": 0.89369005, "learning_rate": 3.745736189718439e-06, "loss": 0.91860431, "num_input_tokens_seen": 3696760, "step": 176, "time_per_iteration": 2.6605892181396484 }, { "auxiliary_loss_clip": 0.01394122, "auxiliary_loss_mlp": 0.01059589, "balance_loss_clip": 1.10825133, "balance_loss_mlp": 1.03721344, "epoch": 0.0212829916431191, "flos": 24715770543360.0, "grad_norm": 3.3079660954819037, "language_loss": 0.73108786, "learning_rate": 3.749840710329894e-06, "loss": 0.75562501, "num_input_tokens_seen": 3717465, "step": 177, "time_per_iteration": 2.7484419345855713 }, { "auxiliary_loss_clip": 0.01412669, "auxiliary_loss_mlp": 0.0106895, "balance_loss_clip": 1.11644006, "balance_loss_mlp": 1.04267645, "epoch": 0.02140323453375819, "flos": 16645493508480.0, "grad_norm": 3.3460694310099224, "language_loss": 0.97627079, "learning_rate": 3.7539221067588938e-06, "loss": 1.00108695, "num_input_tokens_seen": 3731440, "step": 178, "time_per_iteration": 2.6608803272247314 }, { "auxiliary_loss_clip": 0.01401184, "auxiliary_loss_mlp": 0.01076376, "balance_loss_clip": 1.10986352, "balance_loss_mlp": 1.05066228, "epoch": 0.021523477424397284, "flos": 20299332689280.0, "grad_norm": 4.385082777790487, "language_loss": 0.93709004, "learning_rate": 3.757980638101964e-06, "loss": 0.96186566, "num_input_tokens_seen": 3744935, "step": 179, "time_per_iteration": 2.6934523582458496 }, { "auxiliary_loss_clip": 0.0140312, "auxiliary_loss_mlp": 0.01082913, "balance_loss_clip": 1.11291373, "balance_loss_mlp": 1.05736637, "epoch": 0.021643720315036374, "flos": 26104005331200.0, "grad_norm": 2.9171192819066856, "language_loss": 0.89243287, "learning_rate": 3.7620165591252806e-06, "loss": 0.91729319, "num_input_tokens_seen": 3763035, "step": 180, "time_per_iteration": 2.785236358642578 }, { "auxiliary_loss_clip": 0.01399639, "auxiliary_loss_mlp": 0.01075465, "balance_loss_clip": 1.11308694, "balance_loss_mlp": 1.0543294, "epoch": 0.021763963205675464, "flos": 24787663614720.0, "grad_norm": 1.8630456881373627, "language_loss": 0.94602054, "learning_rate": 3.766030120360636e-06, "loss": 0.97077155, "num_input_tokens_seen": 3782665, "step": 181, "time_per_iteration": 2.7162742614746094 }, { "auxiliary_loss_clip": 0.01394915, "auxiliary_loss_mlp": 0.01065372, "balance_loss_clip": 1.10885477, "balance_loss_mlp": 1.04236484, "epoch": 0.021884206096314557, "flos": 25813559957760.0, "grad_norm": 3.2113082344119794, "language_loss": 0.90231967, "learning_rate": 3.7700215681987578e-06, "loss": 0.9269225, "num_input_tokens_seen": 3802435, "step": 182, "time_per_iteration": 2.748892307281494 }, { "auxiliary_loss_clip": 0.01397713, "auxiliary_loss_mlp": 0.01077639, "balance_loss_clip": 1.11117792, "balance_loss_mlp": 1.05417824, "epoch": 0.022004448986953647, "flos": 20082719721600.0, "grad_norm": 5.0290064404443084, "language_loss": 0.82320875, "learning_rate": 3.7739911449800767e-06, "loss": 0.84796226, "num_input_tokens_seen": 3822490, "step": 183, "time_per_iteration": 2.682551383972168 }, { "auxiliary_loss_clip": 0.0139254, "auxiliary_loss_mlp": 0.01081798, "balance_loss_clip": 1.1058991, "balance_loss_mlp": 1.05819452, "epoch": 0.022124691877592736, "flos": 20480609652480.0, "grad_norm": 2.6159138476673167, "language_loss": 0.80607408, "learning_rate": 3.7779390890830114e-06, "loss": 0.83081746, "num_input_tokens_seen": 3841140, "step": 184, "time_per_iteration": 2.71777081489563 }, { "auxiliary_loss_clip": 0.01388369, "auxiliary_loss_mlp": 0.01066382, "balance_loss_clip": 1.10517335, "balance_loss_mlp": 1.04205167, "epoch": 0.02224493476823183, "flos": 23586847015680.0, "grad_norm": 1.7441224904116959, "language_loss": 0.860174, "learning_rate": 3.7818656350098723e-06, "loss": 0.88472152, "num_input_tokens_seen": 3862090, "step": 185, "time_per_iteration": 2.7396037578582764 }, { "auxiliary_loss_clip": 0.01392149, "auxiliary_loss_mlp": 0.01077181, "balance_loss_clip": 1.10847151, "balance_loss_mlp": 1.05347037, "epoch": 0.02236517765887092, "flos": 16909940413440.0, "grad_norm": 3.28773701223614, "language_loss": 0.77037346, "learning_rate": 3.7857710134704447e-06, "loss": 0.79506671, "num_input_tokens_seen": 3881025, "step": 186, "time_per_iteration": 2.7352263927459717 }, { "auxiliary_loss_clip": 0.01384312, "auxiliary_loss_mlp": 0.01073251, "balance_loss_clip": 1.10541677, "balance_loss_mlp": 1.0495522, "epoch": 0.02248542054951001, "flos": 43508182930560.0, "grad_norm": 2.516275041734204, "language_loss": 0.79211593, "learning_rate": 3.7896554514633234e-06, "loss": 0.81669158, "num_input_tokens_seen": 3905310, "step": 187, "time_per_iteration": 2.8640170097351074 }, { "auxiliary_loss_clip": 0.01386188, "auxiliary_loss_mlp": 0.01058456, "balance_loss_clip": 1.10521626, "balance_loss_mlp": 1.03610396, "epoch": 0.022605663440149103, "flos": 23367648268800.0, "grad_norm": 1.9684343875014363, "language_loss": 0.84067595, "learning_rate": 3.7935191723550955e-06, "loss": 0.86512244, "num_input_tokens_seen": 3924265, "step": 188, "time_per_iteration": 2.7250490188598633 }, { "auxiliary_loss_clip": 0.01385011, "auxiliary_loss_mlp": 0.01060667, "balance_loss_clip": 1.10462189, "balance_loss_mlp": 1.03788638, "epoch": 0.022725906330788193, "flos": 29019915504000.0, "grad_norm": 1.9555368343417607, "language_loss": 0.88785356, "learning_rate": 3.797362395957408e-06, "loss": 0.91231036, "num_input_tokens_seen": 3944830, "step": 189, "time_per_iteration": 2.7804479598999023 }, { "auxiliary_loss_clip": 0.01388042, "auxiliary_loss_mlp": 0.01067998, "balance_loss_clip": 1.10773063, "balance_loss_mlp": 1.04311883, "epoch": 0.022846149221427282, "flos": 24496176746880.0, "grad_norm": 2.0929264196833572, "language_loss": 0.78068787, "learning_rate": 3.8011853386020055e-06, "loss": 0.80524826, "num_input_tokens_seen": 3965735, "step": 190, "time_per_iteration": 2.7155423164367676 }, { "auxiliary_loss_clip": 0.01390607, "auxiliary_loss_mlp": 0.01075058, "balance_loss_clip": 1.1090467, "balance_loss_mlp": 1.05036998, "epoch": 0.022966392112066376, "flos": 15523537219200.0, "grad_norm": 3.254710492069304, "language_loss": 0.89683783, "learning_rate": 3.804988213213804e-06, "loss": 0.92149448, "num_input_tokens_seen": 3983975, "step": 191, "time_per_iteration": 2.6694326400756836 }, { "auxiliary_loss_clip": 0.01387734, "auxiliary_loss_mlp": 0.01017432, "balance_loss_clip": 1.17144036, "balance_loss_mlp": 1.00532043, "epoch": 0.023086635002705466, "flos": 55650408433920.0, "grad_norm": 1.0196660135367834, "language_loss": 0.63193452, "learning_rate": 3.808771229382049e-06, "loss": 0.65598619, "num_input_tokens_seen": 4043440, "step": 192, "time_per_iteration": 3.1630778312683105 }, { "auxiliary_loss_clip": 0.0137717, "auxiliary_loss_mlp": 0.01069142, "balance_loss_clip": 1.10006905, "balance_loss_mlp": 1.04843569, "epoch": 0.023206877893344555, "flos": 19313441118720.0, "grad_norm": 2.055474719199043, "language_loss": 0.84470254, "learning_rate": 3.8125345934296324e-06, "loss": 0.86916566, "num_input_tokens_seen": 4061750, "step": 193, "time_per_iteration": 2.7106425762176514 }, { "auxiliary_loss_clip": 0.01384897, "auxiliary_loss_mlp": 0.01081176, "balance_loss_clip": 1.10525799, "balance_loss_mlp": 1.05745387, "epoch": 0.02332712078398365, "flos": 23072965090560.0, "grad_norm": 2.6010968155171836, "language_loss": 0.87956256, "learning_rate": 3.81627850848061e-06, "loss": 0.90422326, "num_input_tokens_seen": 4082345, "step": 194, "time_per_iteration": 2.7001044750213623 }, { "auxiliary_loss_clip": 0.01380007, "auxiliary_loss_mlp": 0.01071738, "balance_loss_clip": 1.10329127, "balance_loss_mlp": 1.04933858, "epoch": 0.02344736367462274, "flos": 24425971614720.0, "grad_norm": 2.169277772215798, "language_loss": 0.86128247, "learning_rate": 3.820003174525994e-06, "loss": 0.8858, "num_input_tokens_seen": 4101770, "step": 195, "time_per_iteration": 2.7809293270111084 }, { "auxiliary_loss_clip": 0.01379638, "auxiliary_loss_mlp": 0.01069782, "balance_loss_clip": 1.1038835, "balance_loss_mlp": 1.04468894, "epoch": 0.02356760656526183, "flos": 21579799697280.0, "grad_norm": 2.9043350723453223, "language_loss": 0.82842988, "learning_rate": 3.823708788487851e-06, "loss": 0.85292411, "num_input_tokens_seen": 4118770, "step": 196, "time_per_iteration": 2.7868335247039795 }, { "auxiliary_loss_clip": 0.01379707, "auxiliary_loss_mlp": 0.01073871, "balance_loss_clip": 1.10377216, "balance_loss_mlp": 1.05196023, "epoch": 0.02368784945590092, "flos": 25193598192000.0, "grad_norm": 1.9727864790142842, "language_loss": 0.84473842, "learning_rate": 3.827395544281781e-06, "loss": 0.8692742, "num_input_tokens_seen": 4141110, "step": 197, "time_per_iteration": 4.493316411972046 }, { "auxiliary_loss_clip": 0.01379303, "auxiliary_loss_mlp": 0.01089044, "balance_loss_clip": 1.10354948, "balance_loss_mlp": 1.06597662, "epoch": 0.02380809234654001, "flos": 27562481164800.0, "grad_norm": 1.9952628394020147, "language_loss": 0.79124099, "learning_rate": 3.831063632877802e-06, "loss": 0.81592441, "num_input_tokens_seen": 4161430, "step": 198, "time_per_iteration": 3.6180975437164307 }, { "auxiliary_loss_clip": 0.0137846, "auxiliary_loss_mlp": 0.01061515, "balance_loss_clip": 1.10738134, "balance_loss_mlp": 1.03958011, "epoch": 0.0239283352371791, "flos": 18259786540800.0, "grad_norm": 3.200289910352633, "language_loss": 0.7574442, "learning_rate": 3.834713242359712e-06, "loss": 0.7818439, "num_input_tokens_seen": 4179260, "step": 199, "time_per_iteration": 3.607980728149414 }, { "auxiliary_loss_clip": 0.0137814, "auxiliary_loss_mlp": 0.01066842, "balance_loss_clip": 1.10190654, "balance_loss_mlp": 1.0431428, "epoch": 0.02404857812781819, "flos": 21395110942080.0, "grad_norm": 2.7515519254464444, "language_loss": 0.87201428, "learning_rate": 3.838344557982959e-06, "loss": 0.89646411, "num_input_tokens_seen": 4200640, "step": 200, "time_per_iteration": 2.7081263065338135 }, { "auxiliary_loss_clip": 0.01370098, "auxiliary_loss_mlp": 0.01067301, "balance_loss_clip": 1.09836841, "balance_loss_mlp": 1.04568815, "epoch": 0.024168821018457284, "flos": 16654256426880.0, "grad_norm": 3.1781211489644114, "language_loss": 0.84858656, "learning_rate": 3.841957762231063e-06, "loss": 0.87296057, "num_input_tokens_seen": 4218170, "step": 201, "time_per_iteration": 2.659226417541504 }, { "auxiliary_loss_clip": 0.01370107, "auxiliary_loss_mlp": 0.01071343, "balance_loss_clip": 1.09710014, "balance_loss_mlp": 1.04964721, "epoch": 0.024289063909096374, "flos": 22820872464000.0, "grad_norm": 2.3406761757277157, "language_loss": 0.87808323, "learning_rate": 3.8455530348706454e-06, "loss": 0.90249765, "num_input_tokens_seen": 4237770, "step": 202, "time_per_iteration": 2.6990745067596436 }, { "auxiliary_loss_clip": 0.01372492, "auxiliary_loss_mlp": 0.01062201, "balance_loss_clip": 1.10030937, "balance_loss_mlp": 1.04098153, "epoch": 0.024409306799735464, "flos": 17748598135680.0, "grad_norm": 1.9717380135094764, "language_loss": 0.77401578, "learning_rate": 3.849130553005099e-06, "loss": 0.79836273, "num_input_tokens_seen": 4255985, "step": 203, "time_per_iteration": 2.689941167831421 }, { "auxiliary_loss_clip": 0.01368754, "auxiliary_loss_mlp": 0.01063101, "balance_loss_clip": 1.09911489, "balance_loss_mlp": 1.04078543, "epoch": 0.024529549690374557, "flos": 21616213109760.0, "grad_norm": 2.103325283558823, "language_loss": 0.83555341, "learning_rate": 3.852690491126933e-06, "loss": 0.85987186, "num_input_tokens_seen": 4276035, "step": 204, "time_per_iteration": 2.6832735538482666 }, { "auxiliary_loss_clip": 0.01374371, "auxiliary_loss_mlp": 0.01061436, "balance_loss_clip": 1.10003459, "balance_loss_mlp": 1.04015708, "epoch": 0.024649792581013647, "flos": 25551662918400.0, "grad_norm": 3.556613141061763, "language_loss": 0.91168368, "learning_rate": 3.856233021168845e-06, "loss": 0.93604177, "num_input_tokens_seen": 4295730, "step": 205, "time_per_iteration": 2.7232744693756104 }, { "auxiliary_loss_clip": 0.01361881, "auxiliary_loss_mlp": 0.01058844, "balance_loss_clip": 1.0939976, "balance_loss_mlp": 1.03817272, "epoch": 0.024770035471652737, "flos": 34495574544000.0, "grad_norm": 2.6279391336879017, "language_loss": 0.91106135, "learning_rate": 3.859758312553544e-06, "loss": 0.93526864, "num_input_tokens_seen": 4317950, "step": 206, "time_per_iteration": 2.756917715072632 }, { "auxiliary_loss_clip": 0.01368763, "auxiliary_loss_mlp": 0.01063592, "balance_loss_clip": 1.09857321, "balance_loss_mlp": 1.04133594, "epoch": 0.02489027836229183, "flos": 21505428587520.0, "grad_norm": 2.029577045432846, "language_loss": 0.9187212, "learning_rate": 3.8632665322423735e-06, "loss": 0.94304472, "num_input_tokens_seen": 4337605, "step": 207, "time_per_iteration": 2.7604188919067383 }, { "auxiliary_loss_clip": 0.01361526, "auxiliary_loss_mlp": 0.01066448, "balance_loss_clip": 1.09542036, "balance_loss_mlp": 1.04615879, "epoch": 0.02501052125293092, "flos": 23219013790080.0, "grad_norm": 6.165214836580721, "language_loss": 0.85940605, "learning_rate": 3.866757844782762e-06, "loss": 0.88368583, "num_input_tokens_seen": 4358110, "step": 208, "time_per_iteration": 2.7291061878204346 }, { "auxiliary_loss_clip": 0.01361605, "auxiliary_loss_mlp": 0.01075121, "balance_loss_clip": 1.09571183, "balance_loss_mlp": 1.05302, "epoch": 0.02513076414357001, "flos": 26388920010240.0, "grad_norm": 2.47855774911894, "language_loss": 0.91529131, "learning_rate": 3.870232412354527e-06, "loss": 0.93965858, "num_input_tokens_seen": 4374955, "step": 209, "time_per_iteration": 2.6958751678466797 }, { "auxiliary_loss_clip": 0.01365342, "auxiliary_loss_mlp": 0.01070656, "balance_loss_clip": 1.09574938, "balance_loss_mlp": 1.04894769, "epoch": 0.025251007034209103, "flos": 13590430047360.0, "grad_norm": 2.1225811415845954, "language_loss": 0.92692697, "learning_rate": 3.873690394815086e-06, "loss": 0.95128691, "num_input_tokens_seen": 4391535, "step": 210, "time_per_iteration": 2.704148054122925 }, { "auxiliary_loss_clip": 0.01366294, "auxiliary_loss_mlp": 0.0105962, "balance_loss_clip": 1.09800601, "balance_loss_mlp": 1.03927088, "epoch": 0.025371249924848193, "flos": 15049229103360.0, "grad_norm": 13.003292359778728, "language_loss": 0.91199493, "learning_rate": 3.877131949743587e-06, "loss": 0.93625408, "num_input_tokens_seen": 4408400, "step": 211, "time_per_iteration": 2.692728281021118 }, { "auxiliary_loss_clip": 0.01359485, "auxiliary_loss_mlp": 0.01061634, "balance_loss_clip": 1.09381807, "balance_loss_mlp": 1.04044998, "epoch": 0.025491492815487283, "flos": 25553853648000.0, "grad_norm": 3.817600752169996, "language_loss": 0.7809149, "learning_rate": 3.880557232483993e-06, "loss": 0.80512613, "num_input_tokens_seen": 4427840, "step": 212, "time_per_iteration": 2.775796413421631 }, { "auxiliary_loss_clip": 0.01357592, "auxiliary_loss_mlp": 0.01056672, "balance_loss_clip": 1.09125209, "balance_loss_mlp": 1.03631139, "epoch": 0.025611735706126376, "flos": 20630752502400.0, "grad_norm": 1.9348574651845132, "language_loss": 0.86673051, "learning_rate": 3.883966396187164e-06, "loss": 0.89087319, "num_input_tokens_seen": 4447110, "step": 213, "time_per_iteration": 2.773782253265381 }, { "auxiliary_loss_clip": 0.01357898, "auxiliary_loss_mlp": 0.01065793, "balance_loss_clip": 1.09486008, "balance_loss_mlp": 1.04490805, "epoch": 0.025731978596765466, "flos": 19062282245760.0, "grad_norm": 2.6232203632264395, "language_loss": 0.89797944, "learning_rate": 3.887359591851937e-06, "loss": 0.92221636, "num_input_tokens_seen": 4464715, "step": 214, "time_per_iteration": 2.668405771255493 }, { "auxiliary_loss_clip": 0.0135696, "auxiliary_loss_mlp": 0.01070314, "balance_loss_clip": 1.09464169, "balance_loss_mlp": 1.05023932, "epoch": 0.025852221487404556, "flos": 22163814927360.0, "grad_norm": 1.8396707831657761, "language_loss": 0.92265213, "learning_rate": 3.890736968365265e-06, "loss": 0.94692487, "num_input_tokens_seen": 4485030, "step": 215, "time_per_iteration": 2.6965348720550537 }, { "auxiliary_loss_clip": 0.01354457, "auxiliary_loss_mlp": 0.01050971, "balance_loss_clip": 1.09159636, "balance_loss_mlp": 1.02937007, "epoch": 0.02597246437804365, "flos": 26541971861760.0, "grad_norm": 1.939547369646986, "language_loss": 0.8492285, "learning_rate": 3.894098672541412e-06, "loss": 0.87328279, "num_input_tokens_seen": 4505935, "step": 216, "time_per_iteration": 2.718322277069092 }, { "auxiliary_loss_clip": 0.01358146, "auxiliary_loss_mlp": 0.01070121, "balance_loss_clip": 1.09356809, "balance_loss_mlp": 1.04902077, "epoch": 0.02609270726868274, "flos": 32671671696000.0, "grad_norm": 1.8225429636783088, "language_loss": 0.75406915, "learning_rate": 3.89744484916025e-06, "loss": 0.77835178, "num_input_tokens_seen": 4527045, "step": 217, "time_per_iteration": 2.829684019088745 }, { "auxiliary_loss_clip": 0.01359738, "auxiliary_loss_mlp": 0.01061686, "balance_loss_clip": 1.0942173, "balance_loss_mlp": 1.03965569, "epoch": 0.02621295015932183, "flos": 26243553669120.0, "grad_norm": 2.0058874775593725, "language_loss": 0.87323701, "learning_rate": 3.900775641004673e-06, "loss": 0.89745122, "num_input_tokens_seen": 4546360, "step": 218, "time_per_iteration": 2.7772092819213867 }, { "auxiliary_loss_clip": 0.01362935, "auxiliary_loss_mlp": 0.01066346, "balance_loss_clip": 1.09626436, "balance_loss_mlp": 1.04287338, "epoch": 0.026333193049960922, "flos": 42921402353280.0, "grad_norm": 2.960122245288127, "language_loss": 0.74437767, "learning_rate": 3.904091188897156e-06, "loss": 0.76867044, "num_input_tokens_seen": 4565495, "step": 219, "time_per_iteration": 2.9297783374786377 }, { "auxiliary_loss_clip": 0.01359912, "auxiliary_loss_mlp": 0.01067537, "balance_loss_clip": 1.09447396, "balance_loss_mlp": 1.04557908, "epoch": 0.026453435940600012, "flos": 17963846386560.0, "grad_norm": 2.0298664325234066, "language_loss": 0.8192997, "learning_rate": 3.90739163173548e-06, "loss": 0.84357417, "num_input_tokens_seen": 4583330, "step": 220, "time_per_iteration": 2.7693541049957275 }, { "auxiliary_loss_clip": 0.01356187, "auxiliary_loss_mlp": 0.01048305, "balance_loss_clip": 1.09329796, "balance_loss_mlp": 1.02801538, "epoch": 0.026573678831239102, "flos": 18984319776000.0, "grad_norm": 2.374874771320717, "language_loss": 0.88084781, "learning_rate": 3.910677106527646e-06, "loss": 0.90489274, "num_input_tokens_seen": 4600520, "step": 221, "time_per_iteration": 2.7835569381713867 }, { "auxiliary_loss_clip": 0.01351427, "auxiliary_loss_mlp": 0.01066193, "balance_loss_clip": 1.09182298, "balance_loss_mlp": 1.04636836, "epoch": 0.026693921721878195, "flos": 29241448634880.0, "grad_norm": 2.284663811863766, "language_loss": 0.84000146, "learning_rate": 3.913947748426004e-06, "loss": 0.8641777, "num_input_tokens_seen": 4617340, "step": 222, "time_per_iteration": 2.7775650024414062 }, { "auxiliary_loss_clip": 0.01355981, "auxiliary_loss_mlp": 0.01054237, "balance_loss_clip": 1.0937686, "balance_loss_mlp": 1.03394771, "epoch": 0.026814164612517285, "flos": 14128083797760.0, "grad_norm": 2.760770655916662, "language_loss": 0.76178062, "learning_rate": 3.9172036907606136e-06, "loss": 0.78588283, "num_input_tokens_seen": 4630820, "step": 223, "time_per_iteration": 3.577866315841675 }, { "auxiliary_loss_clip": 0.01351998, "auxiliary_loss_mlp": 0.01064684, "balance_loss_clip": 1.0895499, "balance_loss_mlp": 1.04314291, "epoch": 0.026934407503156375, "flos": 23511973115520.0, "grad_norm": 1.7666934401793748, "language_loss": 0.95082068, "learning_rate": 3.920445065071855e-06, "loss": 0.97498751, "num_input_tokens_seen": 4651985, "step": 224, "time_per_iteration": 3.657883644104004 }, { "auxiliary_loss_clip": 0.01355131, "auxiliary_loss_mlp": 0.01056447, "balance_loss_clip": 1.09389293, "balance_loss_mlp": 1.03687322, "epoch": 0.027054650393795468, "flos": 28950356816640.0, "grad_norm": 2.2370036531447233, "language_loss": 0.79889721, "learning_rate": 3.923672001142322e-06, "loss": 0.82301301, "num_input_tokens_seen": 4672295, "step": 225, "time_per_iteration": 3.6633412837982178 }, { "auxiliary_loss_clip": 0.01349938, "auxiliary_loss_mlp": 0.01066852, "balance_loss_clip": 1.08887017, "balance_loss_mlp": 1.04502439, "epoch": 0.027174893284434558, "flos": 31431568596480.0, "grad_norm": 1.8776618456063423, "language_loss": 0.84347028, "learning_rate": 3.926884627027996e-06, "loss": 0.86763817, "num_input_tokens_seen": 4696065, "step": 226, "time_per_iteration": 3.6602859497070312 }, { "auxiliary_loss_clip": 0.01346401, "auxiliary_loss_mlp": 0.01067774, "balance_loss_clip": 1.08773077, "balance_loss_mlp": 1.04916549, "epoch": 0.027295136175073648, "flos": 22054466949120.0, "grad_norm": 1.9544126421754122, "language_loss": 0.77598101, "learning_rate": 3.930083069088744e-06, "loss": 0.80012274, "num_input_tokens_seen": 4716065, "step": 227, "time_per_iteration": 2.6831133365631104 }, { "auxiliary_loss_clip": 0.0133468, "auxiliary_loss_mlp": 0.0101034, "balance_loss_clip": 1.13474405, "balance_loss_mlp": 1.00018334, "epoch": 0.02741537906571274, "flos": 60800752972800.0, "grad_norm": 0.9869608786419882, "language_loss": 0.59321207, "learning_rate": 3.933267452018137e-06, "loss": 0.61666226, "num_input_tokens_seen": 4775860, "step": 228, "time_per_iteration": 3.2391090393066406 }, { "auxiliary_loss_clip": 0.01348307, "auxiliary_loss_mlp": 0.01069996, "balance_loss_clip": 1.09302735, "balance_loss_mlp": 1.0495038, "epoch": 0.02753562195635183, "flos": 24606278910720.0, "grad_norm": 2.373849544444169, "language_loss": 0.84407276, "learning_rate": 3.936437898872622e-06, "loss": 0.86825579, "num_input_tokens_seen": 4795835, "step": 229, "time_per_iteration": 2.7046124935150146 }, { "auxiliary_loss_clip": 0.01352373, "auxiliary_loss_mlp": 0.01058024, "balance_loss_clip": 1.09058845, "balance_loss_mlp": 1.03896272, "epoch": 0.02765586484699092, "flos": 34094236907520.0, "grad_norm": 2.794993024392365, "language_loss": 0.79793596, "learning_rate": 3.9395945311000525e-06, "loss": 0.82203996, "num_input_tokens_seen": 4817460, "step": 230, "time_per_iteration": 2.8646390438079834 }, { "auxiliary_loss_clip": 0.01351001, "auxiliary_loss_mlp": 0.01062398, "balance_loss_clip": 1.0898087, "balance_loss_mlp": 1.0424664, "epoch": 0.027776107737630014, "flos": 14829922615680.0, "grad_norm": 3.723560560190765, "language_loss": 0.90817654, "learning_rate": 3.942737468567608e-06, "loss": 0.93231046, "num_input_tokens_seen": 4835475, "step": 231, "time_per_iteration": 2.754484176635742 }, { "auxiliary_loss_clip": 0.01349596, "auxiliary_loss_mlp": 0.01071364, "balance_loss_clip": 1.0896405, "balance_loss_mlp": 1.04889286, "epoch": 0.027896350628269104, "flos": 47920347066240.0, "grad_norm": 2.556045947182385, "language_loss": 0.8623867, "learning_rate": 3.9458668295891026e-06, "loss": 0.88659638, "num_input_tokens_seen": 4857760, "step": 232, "time_per_iteration": 2.972388982772827 }, { "auxiliary_loss_clip": 0.01344712, "auxiliary_loss_mlp": 0.01047586, "balance_loss_clip": 1.08578265, "balance_loss_mlp": 1.02825069, "epoch": 0.028016593518908194, "flos": 21684550734720.0, "grad_norm": 2.4899485523183627, "language_loss": 0.86840755, "learning_rate": 3.948982730951712e-06, "loss": 0.89233053, "num_input_tokens_seen": 4875855, "step": 233, "time_per_iteration": 2.778571128845215 }, { "auxiliary_loss_clip": 0.01352215, "auxiliary_loss_mlp": 0.01064662, "balance_loss_clip": 1.09134102, "balance_loss_mlp": 1.0430609, "epoch": 0.028136836409547287, "flos": 18439483305600.0, "grad_norm": 2.377556586178967, "language_loss": 0.82117772, "learning_rate": 3.9520852879421254e-06, "loss": 0.84534645, "num_input_tokens_seen": 4893200, "step": 234, "time_per_iteration": 2.6536307334899902 }, { "auxiliary_loss_clip": 0.01342038, "auxiliary_loss_mlp": 0.01057875, "balance_loss_clip": 1.08670402, "balance_loss_mlp": 1.03882504, "epoch": 0.028257079300186377, "flos": 31576934937600.0, "grad_norm": 2.706803517057041, "language_loss": 0.81870466, "learning_rate": 3.955174614372137e-06, "loss": 0.84270382, "num_input_tokens_seen": 4912965, "step": 235, "time_per_iteration": 2.7041773796081543 }, { "auxiliary_loss_clip": 0.01348325, "auxiliary_loss_mlp": 0.01067592, "balance_loss_clip": 1.08983624, "balance_loss_mlp": 1.04746985, "epoch": 0.028377322190825467, "flos": 23513337832320.0, "grad_norm": 3.638066717470407, "language_loss": 0.84350473, "learning_rate": 3.9582508226037045e-06, "loss": 0.86766392, "num_input_tokens_seen": 4933105, "step": 236, "time_per_iteration": 2.7125213146209717 }, { "auxiliary_loss_clip": 0.01348085, "auxiliary_loss_mlp": 0.0106656, "balance_loss_clip": 1.08931589, "balance_loss_mlp": 1.04486418, "epoch": 0.02849756508146456, "flos": 20479604071680.0, "grad_norm": 4.308818017675863, "language_loss": 0.9413892, "learning_rate": 3.9613140235734636e-06, "loss": 0.96553564, "num_input_tokens_seen": 4950085, "step": 237, "time_per_iteration": 2.6339480876922607 }, { "auxiliary_loss_clip": 0.0134467, "auxiliary_loss_mlp": 0.01067314, "balance_loss_clip": 1.0876863, "balance_loss_mlp": 1.04708433, "epoch": 0.02861780797210365, "flos": 14283362292480.0, "grad_norm": 2.0670559194349334, "language_loss": 0.81187743, "learning_rate": 3.96436432681674e-06, "loss": 0.83599728, "num_input_tokens_seen": 4968075, "step": 238, "time_per_iteration": 2.678725004196167 }, { "auxiliary_loss_clip": 0.0134601, "auxiliary_loss_mlp": 0.01058781, "balance_loss_clip": 1.08830392, "balance_loss_mlp": 1.03883731, "epoch": 0.02873805086274274, "flos": 25808532053760.0, "grad_norm": 2.434832016649358, "language_loss": 0.89231443, "learning_rate": 3.967401840491044e-06, "loss": 0.91636229, "num_input_tokens_seen": 4987355, "step": 239, "time_per_iteration": 2.7481400966644287 }, { "auxiliary_loss_clip": 0.01343489, "auxiliary_loss_mlp": 0.01071726, "balance_loss_clip": 1.08856606, "balance_loss_mlp": 1.05165064, "epoch": 0.028858293753381833, "flos": 17304238984320.0, "grad_norm": 2.238358419806448, "language_loss": 0.87721354, "learning_rate": 3.97042667139909e-06, "loss": 0.90136564, "num_input_tokens_seen": 5004680, "step": 240, "time_per_iteration": 2.664344549179077 }, { "auxiliary_loss_clip": 0.01344444, "auxiliary_loss_mlp": 0.01057876, "balance_loss_clip": 1.08965027, "balance_loss_mlp": 1.03671598, "epoch": 0.028978536644020923, "flos": 23038347358080.0, "grad_norm": 2.130332197407333, "language_loss": 0.87527931, "learning_rate": 3.973438925011327e-06, "loss": 0.89930248, "num_input_tokens_seen": 5022965, "step": 241, "time_per_iteration": 2.7137465476989746 }, { "auxiliary_loss_clip": 0.01344648, "auxiliary_loss_mlp": 0.01063251, "balance_loss_clip": 1.08736086, "balance_loss_mlp": 1.04385567, "epoch": 0.029098779534660012, "flos": 28329712692480.0, "grad_norm": 2.284064681198569, "language_loss": 0.91277969, "learning_rate": 3.976438705488002e-06, "loss": 0.93685871, "num_input_tokens_seen": 5042625, "step": 242, "time_per_iteration": 2.687079906463623 }, { "auxiliary_loss_clip": 0.01342574, "auxiliary_loss_mlp": 0.0105818, "balance_loss_clip": 1.09006476, "balance_loss_mlp": 1.03902268, "epoch": 0.029219022425299106, "flos": 13881665520000.0, "grad_norm": 2.9503076579019822, "language_loss": 0.92902613, "learning_rate": 3.9794261157007744e-06, "loss": 0.95303363, "num_input_tokens_seen": 5060380, "step": 243, "time_per_iteration": 2.6521639823913574 }, { "auxiliary_loss_clip": 0.01347869, "auxiliary_loss_mlp": 0.01071535, "balance_loss_clip": 1.09082711, "balance_loss_mlp": 1.04971933, "epoch": 0.029339265315938196, "flos": 19422501788160.0, "grad_norm": 2.0857838080122044, "language_loss": 0.8482095, "learning_rate": 3.982401257253887e-06, "loss": 0.8724035, "num_input_tokens_seen": 5078720, "step": 244, "time_per_iteration": 2.681300640106201 }, { "auxiliary_loss_clip": 0.01341591, "auxiliary_loss_mlp": 0.01049504, "balance_loss_clip": 1.08732557, "balance_loss_mlp": 1.03109837, "epoch": 0.029459508206577285, "flos": 15669550005120.0, "grad_norm": 2.3976617348060927, "language_loss": 0.89639938, "learning_rate": 3.985364230504893e-06, "loss": 0.92031038, "num_input_tokens_seen": 5096605, "step": 245, "time_per_iteration": 2.69746732711792 }, { "auxiliary_loss_clip": 0.01342913, "auxiliary_loss_mlp": 0.01061634, "balance_loss_clip": 1.08769083, "balance_loss_mlp": 1.04175019, "epoch": 0.02957975109721638, "flos": 28220975245440.0, "grad_norm": 1.9809464072377714, "language_loss": 0.84273672, "learning_rate": 3.988315134584976e-06, "loss": 0.86678219, "num_input_tokens_seen": 5116285, "step": 246, "time_per_iteration": 2.753222942352295 }, { "auxiliary_loss_clip": 0.01345533, "auxiliary_loss_mlp": 0.01061471, "balance_loss_clip": 1.08980727, "balance_loss_mlp": 1.04224288, "epoch": 0.02969999398785547, "flos": 24315869450880.0, "grad_norm": 1.797197459118882, "language_loss": 0.80585629, "learning_rate": 3.991254067418851e-06, "loss": 0.82992631, "num_input_tokens_seen": 5136825, "step": 247, "time_per_iteration": 2.7297940254211426 }, { "auxiliary_loss_clip": 0.01338759, "auxiliary_loss_mlp": 0.01060145, "balance_loss_clip": 1.08798909, "balance_loss_mlp": 1.04173923, "epoch": 0.02982023687849456, "flos": 35078584193280.0, "grad_norm": 2.260179795159027, "language_loss": 0.82688993, "learning_rate": 3.994181125744254e-06, "loss": 0.85087901, "num_input_tokens_seen": 5158630, "step": 248, "time_per_iteration": 2.771456718444824 }, { "auxiliary_loss_clip": 0.0134043, "auxiliary_loss_mlp": 0.01057327, "balance_loss_clip": 1.08870614, "balance_loss_mlp": 1.03834915, "epoch": 0.02994047976913365, "flos": 26177155378560.0, "grad_norm": 2.0669164763145074, "language_loss": 0.73925662, "learning_rate": 3.99709640513106e-06, "loss": 0.76323426, "num_input_tokens_seen": 5179510, "step": 249, "time_per_iteration": 3.649968147277832 }, { "auxiliary_loss_clip": 0.01342174, "auxiliary_loss_mlp": 0.01067167, "balance_loss_clip": 1.08887959, "balance_loss_mlp": 1.0471164, "epoch": 0.03006072265977274, "flos": 25625028447360.0, "grad_norm": 3.4602588582004534, "language_loss": 0.85728574, "learning_rate": 4e-06, "loss": 0.88137913, "num_input_tokens_seen": 5199345, "step": 250, "time_per_iteration": 2.704350709915161 }, { "auxiliary_loss_clip": 0.01345185, "auxiliary_loss_mlp": 0.01063719, "balance_loss_clip": 1.09113336, "balance_loss_mlp": 1.04377556, "epoch": 0.03018096555041183, "flos": 22127078292480.0, "grad_norm": 2.9091942192213764, "language_loss": 0.887025, "learning_rate": 3.999999848300794e-06, "loss": 0.91111398, "num_input_tokens_seen": 5218330, "step": 251, "time_per_iteration": 3.6359775066375732 }, { "auxiliary_loss_clip": 0.01334993, "auxiliary_loss_mlp": 0.01065801, "balance_loss_clip": 1.08472395, "balance_loss_mlp": 1.04741907, "epoch": 0.030301208441050925, "flos": 30188197359360.0, "grad_norm": 1.7336457611264133, "language_loss": 0.89083165, "learning_rate": 3.999999393203203e-06, "loss": 0.91483957, "num_input_tokens_seen": 5240740, "step": 252, "time_per_iteration": 3.6692140102386475 }, { "auxiliary_loss_clip": 0.01335445, "auxiliary_loss_mlp": 0.01062689, "balance_loss_clip": 1.08498359, "balance_loss_mlp": 1.04264951, "epoch": 0.030421451331690014, "flos": 23621392920960.0, "grad_norm": 2.0492958441941425, "language_loss": 0.85130203, "learning_rate": 3.999998634707293e-06, "loss": 0.87528336, "num_input_tokens_seen": 5260290, "step": 253, "time_per_iteration": 2.7231366634368896 }, { "auxiliary_loss_clip": 0.01342733, "auxiliary_loss_mlp": 0.01070677, "balance_loss_clip": 1.089993, "balance_loss_mlp": 1.05060196, "epoch": 0.030541694222329104, "flos": 27928446883200.0, "grad_norm": 2.161956721384621, "language_loss": 0.96453905, "learning_rate": 3.999997572813182e-06, "loss": 0.98867309, "num_input_tokens_seen": 5278100, "step": 254, "time_per_iteration": 3.585601806640625 }, { "auxiliary_loss_clip": 0.01334662, "auxiliary_loss_mlp": 0.01054046, "balance_loss_clip": 1.0869453, "balance_loss_mlp": 1.03580713, "epoch": 0.030661937112968194, "flos": 18588441006720.0, "grad_norm": 1.9011231721078339, "language_loss": 0.87489724, "learning_rate": 3.999996207521028e-06, "loss": 0.8987844, "num_input_tokens_seen": 5296810, "step": 255, "time_per_iteration": 2.6965291500091553 }, { "auxiliary_loss_clip": 0.01337497, "auxiliary_loss_mlp": 0.01061369, "balance_loss_clip": 1.08637857, "balance_loss_mlp": 1.04072237, "epoch": 0.030782180003607287, "flos": 12969139478400.0, "grad_norm": 2.4840798588232986, "language_loss": 0.81944323, "learning_rate": 3.999994538831039e-06, "loss": 0.84343195, "num_input_tokens_seen": 5313395, "step": 256, "time_per_iteration": 2.7494046688079834 }, { "auxiliary_loss_clip": 0.01334121, "auxiliary_loss_mlp": 0.01060036, "balance_loss_clip": 1.08442736, "balance_loss_mlp": 1.03999722, "epoch": 0.030902422894246377, "flos": 23335364920320.0, "grad_norm": 2.741851552231918, "language_loss": 0.85795528, "learning_rate": 3.99999256674347e-06, "loss": 0.88189685, "num_input_tokens_seen": 5333545, "step": 257, "time_per_iteration": 2.738823890686035 }, { "auxiliary_loss_clip": 0.01313265, "auxiliary_loss_mlp": 0.01018576, "balance_loss_clip": 1.11683297, "balance_loss_mlp": 1.00808549, "epoch": 0.031022665784885467, "flos": 55094151438720.0, "grad_norm": 1.0090079864146269, "language_loss": 0.53497308, "learning_rate": 3.999990291258618e-06, "loss": 0.55829144, "num_input_tokens_seen": 5392235, "step": 258, "time_per_iteration": 3.2375783920288086 }, { "auxiliary_loss_clip": 0.01340199, "auxiliary_loss_mlp": 0.01064169, "balance_loss_clip": 1.0909133, "balance_loss_mlp": 1.04466641, "epoch": 0.03114290867552456, "flos": 19317786664320.0, "grad_norm": 3.0243520222376272, "language_loss": 0.86719823, "learning_rate": 3.999987712376829e-06, "loss": 0.89124191, "num_input_tokens_seen": 5410555, "step": 259, "time_per_iteration": 2.63279390335083 }, { "auxiliary_loss_clip": 0.01333569, "auxiliary_loss_mlp": 0.01064223, "balance_loss_clip": 1.08639205, "balance_loss_mlp": 1.04375446, "epoch": 0.031263151566163654, "flos": 20959442881920.0, "grad_norm": 2.4065767753133187, "language_loss": 0.82247615, "learning_rate": 3.999984830098494e-06, "loss": 0.84645402, "num_input_tokens_seen": 5430135, "step": 260, "time_per_iteration": 2.7147560119628906 }, { "auxiliary_loss_clip": 0.01331466, "auxiliary_loss_mlp": 0.01053058, "balance_loss_clip": 1.08425927, "balance_loss_mlp": 1.03341222, "epoch": 0.03138339445680274, "flos": 14793006412800.0, "grad_norm": 4.66319158942458, "language_loss": 0.98433584, "learning_rate": 3.999981644424051e-06, "loss": 1.0081811, "num_input_tokens_seen": 5444935, "step": 261, "time_per_iteration": 2.68422532081604 }, { "auxiliary_loss_clip": 0.01338071, "auxiliary_loss_mlp": 0.01069575, "balance_loss_clip": 1.08772779, "balance_loss_mlp": 1.04873776, "epoch": 0.03150363734744183, "flos": 11655599022720.0, "grad_norm": 2.431294449869067, "language_loss": 0.86173052, "learning_rate": 3.999978155353982e-06, "loss": 0.88580704, "num_input_tokens_seen": 5462080, "step": 262, "time_per_iteration": 2.789813995361328 }, { "auxiliary_loss_clip": 0.01333394, "auxiliary_loss_mlp": 0.01059259, "balance_loss_clip": 1.08367026, "balance_loss_mlp": 1.03974426, "epoch": 0.03162388023808092, "flos": 33727732485120.0, "grad_norm": 2.2692784019642254, "language_loss": 0.80039096, "learning_rate": 3.9999743628888186e-06, "loss": 0.82431751, "num_input_tokens_seen": 5483870, "step": 263, "time_per_iteration": 2.789564609527588 }, { "auxiliary_loss_clip": 0.01332279, "auxiliary_loss_mlp": 0.0105584, "balance_loss_clip": 1.0839963, "balance_loss_mlp": 1.03723145, "epoch": 0.03174412312872001, "flos": 20810952057600.0, "grad_norm": 2.339746730496139, "language_loss": 0.8961612, "learning_rate": 3.999970267029133e-06, "loss": 0.9200424, "num_input_tokens_seen": 5502830, "step": 264, "time_per_iteration": 2.6982367038726807 }, { "auxiliary_loss_clip": 0.01330216, "auxiliary_loss_mlp": 0.01059934, "balance_loss_clip": 1.0854727, "balance_loss_mlp": 1.03977561, "epoch": 0.0318643660193591, "flos": 23727939638400.0, "grad_norm": 2.0060184471477878, "language_loss": 0.79990101, "learning_rate": 3.999965867775548e-06, "loss": 0.82380253, "num_input_tokens_seen": 5523225, "step": 265, "time_per_iteration": 2.6986820697784424 }, { "auxiliary_loss_clip": 0.01334351, "auxiliary_loss_mlp": 0.01052304, "balance_loss_clip": 1.08569264, "balance_loss_mlp": 1.03418386, "epoch": 0.0319846089099982, "flos": 13917863450880.0, "grad_norm": 2.4530576700240143, "language_loss": 0.8695246, "learning_rate": 3.9999611651287315e-06, "loss": 0.89339113, "num_input_tokens_seen": 5541380, "step": 266, "time_per_iteration": 2.673325777053833 }, { "auxiliary_loss_clip": 0.01332973, "auxiliary_loss_mlp": 0.01058216, "balance_loss_clip": 1.08457983, "balance_loss_mlp": 1.03787923, "epoch": 0.03210485180063729, "flos": 14753253035520.0, "grad_norm": 4.924322790995798, "language_loss": 0.78662729, "learning_rate": 3.999956159089396e-06, "loss": 0.81053913, "num_input_tokens_seen": 5558830, "step": 267, "time_per_iteration": 2.698577404022217 }, { "auxiliary_loss_clip": 0.01331411, "auxiliary_loss_mlp": 0.01065573, "balance_loss_clip": 1.0863322, "balance_loss_mlp": 1.04727435, "epoch": 0.03222509469127638, "flos": 28913153304960.0, "grad_norm": 2.0804776966187197, "language_loss": 0.79668897, "learning_rate": 3.999950849658302e-06, "loss": 0.8206588, "num_input_tokens_seen": 5577750, "step": 268, "time_per_iteration": 2.754279851913452 }, { "auxiliary_loss_clip": 0.01334471, "auxiliary_loss_mlp": 0.01064549, "balance_loss_clip": 1.08531642, "balance_loss_mlp": 1.04473615, "epoch": 0.03234533758191547, "flos": 16946389739520.0, "grad_norm": 2.6706541920455504, "language_loss": 0.84314668, "learning_rate": 3.999945236836254e-06, "loss": 0.86713684, "num_input_tokens_seen": 5596715, "step": 269, "time_per_iteration": 2.7007744312286377 }, { "auxiliary_loss_clip": 0.01335398, "auxiliary_loss_mlp": 0.01065218, "balance_loss_clip": 1.08700883, "balance_loss_mlp": 1.04485714, "epoch": 0.03246558047255456, "flos": 18989096284800.0, "grad_norm": 3.177205879447411, "language_loss": 0.94793367, "learning_rate": 3.999939320624103e-06, "loss": 0.9719398, "num_input_tokens_seen": 5611865, "step": 270, "time_per_iteration": 2.7495241165161133 }, { "auxiliary_loss_clip": 0.01329732, "auxiliary_loss_mlp": 0.01068375, "balance_loss_clip": 1.08411133, "balance_loss_mlp": 1.04883647, "epoch": 0.03258582336319365, "flos": 23728334688000.0, "grad_norm": 1.8674007917933901, "language_loss": 0.89979607, "learning_rate": 3.999933101022749e-06, "loss": 0.9237771, "num_input_tokens_seen": 5632270, "step": 271, "time_per_iteration": 2.7252659797668457 }, { "auxiliary_loss_clip": 0.01331895, "auxiliary_loss_mlp": 0.01052261, "balance_loss_clip": 1.08836126, "balance_loss_mlp": 1.03300881, "epoch": 0.032706066253832745, "flos": 27670823562240.0, "grad_norm": 1.867226103730768, "language_loss": 0.86988771, "learning_rate": 3.999926578033132e-06, "loss": 0.89372927, "num_input_tokens_seen": 5652085, "step": 272, "time_per_iteration": 2.6898910999298096 }, { "auxiliary_loss_clip": 0.01332849, "auxiliary_loss_mlp": 0.01060149, "balance_loss_clip": 1.0857079, "balance_loss_mlp": 1.04018128, "epoch": 0.032826309144471835, "flos": 45624685968000.0, "grad_norm": 2.6508221365500626, "language_loss": 0.63324946, "learning_rate": 3.999919751656244e-06, "loss": 0.65717947, "num_input_tokens_seen": 5678985, "step": 273, "time_per_iteration": 2.9197139739990234 }, { "auxiliary_loss_clip": 0.01331014, "auxiliary_loss_mlp": 0.01057466, "balance_loss_clip": 1.08564317, "balance_loss_mlp": 1.0391916, "epoch": 0.032946552035110925, "flos": 25812374808960.0, "grad_norm": 3.720886186327365, "language_loss": 0.75995743, "learning_rate": 3.9999126218931195e-06, "loss": 0.78384221, "num_input_tokens_seen": 5697020, "step": 274, "time_per_iteration": 2.6745901107788086 }, { "auxiliary_loss_clip": 0.01330418, "auxiliary_loss_mlp": 0.010713, "balance_loss_clip": 1.08678365, "balance_loss_mlp": 1.05356216, "epoch": 0.033066794925750015, "flos": 15121984101120.0, "grad_norm": 2.3093727712041865, "language_loss": 0.894436, "learning_rate": 3.99990518874484e-06, "loss": 0.91845322, "num_input_tokens_seen": 5713460, "step": 275, "time_per_iteration": 2.645709753036499 }, { "auxiliary_loss_clip": 0.01332308, "auxiliary_loss_mlp": 0.01067478, "balance_loss_clip": 1.08806872, "balance_loss_mlp": 1.04875016, "epoch": 0.033187037816389105, "flos": 22776593973120.0, "grad_norm": 5.36614730396745, "language_loss": 0.9265433, "learning_rate": 3.999897452212534e-06, "loss": 0.95054114, "num_input_tokens_seen": 5730790, "step": 276, "time_per_iteration": 3.6108014583587646 }, { "auxiliary_loss_clip": 0.01330633, "auxiliary_loss_mlp": 0.01069527, "balance_loss_clip": 1.08387327, "balance_loss_mlp": 1.04914188, "epoch": 0.033307280707028195, "flos": 23331414424320.0, "grad_norm": 2.007433340654826, "language_loss": 1.00389564, "learning_rate": 3.999889412297374e-06, "loss": 1.02789724, "num_input_tokens_seen": 5750215, "step": 277, "time_per_iteration": 3.600492000579834 }, { "auxiliary_loss_clip": 0.01329864, "auxiliary_loss_mlp": 0.01059298, "balance_loss_clip": 1.08548903, "balance_loss_mlp": 1.03910446, "epoch": 0.03342752359766729, "flos": 28840290566400.0, "grad_norm": 5.412266055259722, "language_loss": 0.7915169, "learning_rate": 3.999881069000581e-06, "loss": 0.81540859, "num_input_tokens_seen": 5769945, "step": 278, "time_per_iteration": 3.7194409370422363 }, { "auxiliary_loss_clip": 0.01326784, "auxiliary_loss_mlp": 0.01054641, "balance_loss_clip": 1.08401179, "balance_loss_mlp": 1.03544843, "epoch": 0.03354776648830638, "flos": 19384544090880.0, "grad_norm": 2.512472138473977, "language_loss": 0.86967021, "learning_rate": 3.99987242232342e-06, "loss": 0.89348447, "num_input_tokens_seen": 5784950, "step": 279, "time_per_iteration": 2.7840583324432373 }, { "auxiliary_loss_clip": 0.01330092, "auxiliary_loss_mlp": 0.01059539, "balance_loss_clip": 1.08681631, "balance_loss_mlp": 1.04033422, "epoch": 0.03366800937894547, "flos": 17858628472320.0, "grad_norm": 2.0775584614929263, "language_loss": 0.79687029, "learning_rate": 3.9998634722672026e-06, "loss": 0.82076657, "num_input_tokens_seen": 5805005, "step": 280, "time_per_iteration": 3.6582541465759277 }, { "auxiliary_loss_clip": 0.01330469, "auxiliary_loss_mlp": 0.01063478, "balance_loss_clip": 1.08722067, "balance_loss_mlp": 1.04410684, "epoch": 0.03378825226958456, "flos": 35951033635200.0, "grad_norm": 1.945510130965831, "language_loss": 0.78442359, "learning_rate": 3.999854218833286e-06, "loss": 0.80836308, "num_input_tokens_seen": 5825825, "step": 281, "time_per_iteration": 2.789396047592163 }, { "auxiliary_loss_clip": 0.01329322, "auxiliary_loss_mlp": 0.01074479, "balance_loss_clip": 1.08695626, "balance_loss_mlp": 1.05473757, "epoch": 0.03390849516022365, "flos": 25702488126720.0, "grad_norm": 3.22136030577526, "language_loss": 0.82282412, "learning_rate": 3.999844662023075e-06, "loss": 0.84686214, "num_input_tokens_seen": 5845700, "step": 282, "time_per_iteration": 2.715961456298828 }, { "auxiliary_loss_clip": 0.01321581, "auxiliary_loss_mlp": 0.01062309, "balance_loss_clip": 1.08139455, "balance_loss_mlp": 1.04310441, "epoch": 0.03402873805086274, "flos": 21284505987840.0, "grad_norm": 2.08542584073003, "language_loss": 0.92024875, "learning_rate": 3.999834801838018e-06, "loss": 0.94408762, "num_input_tokens_seen": 5864680, "step": 283, "time_per_iteration": 2.7861926555633545 }, { "auxiliary_loss_clip": 0.01327507, "auxiliary_loss_mlp": 0.01060625, "balance_loss_clip": 1.08660173, "balance_loss_mlp": 1.04251671, "epoch": 0.03414898094150183, "flos": 22710913954560.0, "grad_norm": 2.210332678685034, "language_loss": 0.73841798, "learning_rate": 3.9998246382796115e-06, "loss": 0.7622993, "num_input_tokens_seen": 5884260, "step": 284, "time_per_iteration": 2.7307846546173096 }, { "auxiliary_loss_clip": 0.01327847, "auxiliary_loss_mlp": 0.01061502, "balance_loss_clip": 1.08504248, "balance_loss_mlp": 1.04267883, "epoch": 0.03426922383214093, "flos": 18879927874560.0, "grad_norm": 5.983992762047023, "language_loss": 0.90770316, "learning_rate": 3.999814171349399e-06, "loss": 0.93159664, "num_input_tokens_seen": 5902120, "step": 285, "time_per_iteration": 2.680018663406372 }, { "auxiliary_loss_clip": 0.01323022, "auxiliary_loss_mlp": 0.0105247, "balance_loss_clip": 1.08363426, "balance_loss_mlp": 1.03491044, "epoch": 0.03438946672278002, "flos": 34752012716160.0, "grad_norm": 3.1191517137275575, "language_loss": 0.73603314, "learning_rate": 3.9998034010489655e-06, "loss": 0.75978804, "num_input_tokens_seen": 5925810, "step": 286, "time_per_iteration": 2.875169515609741 }, { "auxiliary_loss_clip": 0.01325629, "auxiliary_loss_mlp": 0.01059788, "balance_loss_clip": 1.08522511, "balance_loss_mlp": 1.04158437, "epoch": 0.03450970961341911, "flos": 22164102236160.0, "grad_norm": 2.0317332111661837, "language_loss": 0.76008719, "learning_rate": 3.999792327379946e-06, "loss": 0.78394133, "num_input_tokens_seen": 5945185, "step": 287, "time_per_iteration": 2.6936113834381104 }, { "auxiliary_loss_clip": 0.01327652, "auxiliary_loss_mlp": 0.01057565, "balance_loss_clip": 1.08708119, "balance_loss_mlp": 1.03825343, "epoch": 0.034629952504058197, "flos": 21725740656000.0, "grad_norm": 2.0885222687021066, "language_loss": 0.96361196, "learning_rate": 3.999780950344021e-06, "loss": 0.98746413, "num_input_tokens_seen": 5963375, "step": 288, "time_per_iteration": 2.710160493850708 }, { "auxiliary_loss_clip": 0.01329198, "auxiliary_loss_mlp": 0.01055755, "balance_loss_clip": 1.0864017, "balance_loss_mlp": 1.03570366, "epoch": 0.034750195394697286, "flos": 20047994248320.0, "grad_norm": 1.9079784791415575, "language_loss": 0.83028495, "learning_rate": 3.999769269942916e-06, "loss": 0.85413456, "num_input_tokens_seen": 5983415, "step": 289, "time_per_iteration": 2.733022451400757 }, { "auxiliary_loss_clip": 0.01321769, "auxiliary_loss_mlp": 0.01055872, "balance_loss_clip": 1.08379507, "balance_loss_mlp": 1.03664339, "epoch": 0.034870438285336376, "flos": 27965865876480.0, "grad_norm": 8.917787922156819, "language_loss": 0.81099391, "learning_rate": 3.999757286178402e-06, "loss": 0.83477032, "num_input_tokens_seen": 6005850, "step": 290, "time_per_iteration": 2.7998557090759277 }, { "auxiliary_loss_clip": 0.01326486, "auxiliary_loss_mlp": 0.01063042, "balance_loss_clip": 1.08628845, "balance_loss_mlp": 1.04417109, "epoch": 0.03499068117597547, "flos": 22017514832640.0, "grad_norm": 1.7574948632895206, "language_loss": 0.90764606, "learning_rate": 3.999744999052299e-06, "loss": 0.93154132, "num_input_tokens_seen": 6027240, "step": 291, "time_per_iteration": 2.7823214530944824 }, { "auxiliary_loss_clip": 0.0128704, "auxiliary_loss_mlp": 0.01013491, "balance_loss_clip": 1.10277903, "balance_loss_mlp": 1.00342929, "epoch": 0.03511092406661456, "flos": 57242147725440.0, "grad_norm": 0.964081432299059, "language_loss": 0.61178768, "learning_rate": 3.9997324085664675e-06, "loss": 0.63479292, "num_input_tokens_seen": 6087470, "step": 292, "time_per_iteration": 3.2987613677978516 }, { "auxiliary_loss_clip": 0.01322582, "auxiliary_loss_mlp": 0.0105824, "balance_loss_clip": 1.08284974, "balance_loss_mlp": 1.04088283, "epoch": 0.03523116695725365, "flos": 22928065626240.0, "grad_norm": 4.084045732936985, "language_loss": 0.92101932, "learning_rate": 3.999719514722821e-06, "loss": 0.94482756, "num_input_tokens_seen": 6107600, "step": 293, "time_per_iteration": 2.7961180210113525 }, { "auxiliary_loss_clip": 0.01316473, "auxiliary_loss_mlp": 0.01064212, "balance_loss_clip": 1.08168936, "balance_loss_mlp": 1.04630661, "epoch": 0.03535140984789274, "flos": 36903241226880.0, "grad_norm": 2.942634049713418, "language_loss": 0.74823284, "learning_rate": 3.999706317523314e-06, "loss": 0.77203965, "num_input_tokens_seen": 6126160, "step": 294, "time_per_iteration": 2.8931026458740234 }, { "auxiliary_loss_clip": 0.01319422, "auxiliary_loss_mlp": 0.01056571, "balance_loss_clip": 1.08375442, "balance_loss_mlp": 1.03843927, "epoch": 0.03547165273853183, "flos": 20449152316800.0, "grad_norm": 2.2386162267260046, "language_loss": 0.86008883, "learning_rate": 3.999692816969948e-06, "loss": 0.88384879, "num_input_tokens_seen": 6145695, "step": 295, "time_per_iteration": 2.745386838912964 }, { "auxiliary_loss_clip": 0.01277518, "auxiliary_loss_mlp": 0.01010998, "balance_loss_clip": 1.09604788, "balance_loss_mlp": 1.00084138, "epoch": 0.03559189562917092, "flos": 69850564871040.0, "grad_norm": 1.006546332665757, "language_loss": 0.69426966, "learning_rate": 3.999679013064772e-06, "loss": 0.71715474, "num_input_tokens_seen": 6212440, "step": 296, "time_per_iteration": 3.282848596572876 }, { "auxiliary_loss_clip": 0.01316366, "auxiliary_loss_mlp": 0.01058533, "balance_loss_clip": 1.08076227, "balance_loss_mlp": 1.04012692, "epoch": 0.03571213851981002, "flos": 21651944163840.0, "grad_norm": 2.4490698384917264, "language_loss": 0.85304523, "learning_rate": 3.99966490580988e-06, "loss": 0.87679422, "num_input_tokens_seen": 6229800, "step": 297, "time_per_iteration": 2.7566468715667725 }, { "auxiliary_loss_clip": 0.01322227, "auxiliary_loss_mlp": 0.01066187, "balance_loss_clip": 1.08458424, "balance_loss_mlp": 1.04800749, "epoch": 0.03583238141044911, "flos": 43945610757120.0, "grad_norm": 2.4280701355997985, "language_loss": 0.65834558, "learning_rate": 3.999650495207411e-06, "loss": 0.68222976, "num_input_tokens_seen": 6255825, "step": 298, "time_per_iteration": 2.928084373474121 }, { "auxiliary_loss_clip": 0.01320002, "auxiliary_loss_mlp": 0.01057809, "balance_loss_clip": 1.08412218, "balance_loss_mlp": 1.03768611, "epoch": 0.0359526243010882, "flos": 18910810592640.0, "grad_norm": 3.8044984192631723, "language_loss": 0.90089405, "learning_rate": 3.999635781259553e-06, "loss": 0.92467213, "num_input_tokens_seen": 6271090, "step": 299, "time_per_iteration": 2.6696536540985107 }, { "auxiliary_loss_clip": 0.01267765, "auxiliary_loss_mlp": 0.01013133, "balance_loss_clip": 1.09013486, "balance_loss_mlp": 1.00311947, "epoch": 0.03607286719172729, "flos": 61668892782720.0, "grad_norm": 0.920493545733023, "language_loss": 0.52254307, "learning_rate": 3.999620763968535e-06, "loss": 0.5453521, "num_input_tokens_seen": 6329965, "step": 300, "time_per_iteration": 3.111809492111206 }, { "auxiliary_loss_clip": 0.01316884, "auxiliary_loss_mlp": 0.0106411, "balance_loss_clip": 1.08282757, "balance_loss_mlp": 1.04606152, "epoch": 0.03619311008236638, "flos": 27819062991360.0, "grad_norm": 1.7392908028723926, "language_loss": 0.86369812, "learning_rate": 3.999605443336638e-06, "loss": 0.88750803, "num_input_tokens_seen": 6352095, "step": 301, "time_per_iteration": 2.7056374549865723 }, { "auxiliary_loss_clip": 0.01319593, "auxiliary_loss_mlp": 0.01062115, "balance_loss_clip": 1.08157349, "balance_loss_mlp": 1.04227853, "epoch": 0.03631335297300547, "flos": 13621133197440.0, "grad_norm": 2.5684757588850347, "language_loss": 0.89244145, "learning_rate": 3.999589819366185e-06, "loss": 0.91625857, "num_input_tokens_seen": 6365885, "step": 302, "time_per_iteration": 2.612623453140259 }, { "auxiliary_loss_clip": 0.01322307, "auxiliary_loss_mlp": 0.01056153, "balance_loss_clip": 1.08431971, "balance_loss_mlp": 1.03790164, "epoch": 0.036433595863644565, "flos": 27631788456960.0, "grad_norm": 2.960710930831052, "language_loss": 0.84683073, "learning_rate": 3.999573892059547e-06, "loss": 0.8706153, "num_input_tokens_seen": 6385015, "step": 303, "time_per_iteration": 3.6686787605285645 }, { "auxiliary_loss_clip": 0.01323174, "auxiliary_loss_mlp": 0.01053787, "balance_loss_clip": 1.08400941, "balance_loss_mlp": 1.03460622, "epoch": 0.036553838754283655, "flos": 24572020314240.0, "grad_norm": 1.9658339668209999, "language_loss": 0.81036204, "learning_rate": 3.999557661419138e-06, "loss": 0.83413166, "num_input_tokens_seen": 6405165, "step": 304, "time_per_iteration": 4.5113770961761475 }, { "auxiliary_loss_clip": 0.01319909, "auxiliary_loss_mlp": 0.01057978, "balance_loss_clip": 1.0834105, "balance_loss_mlp": 1.04023933, "epoch": 0.036674081644922744, "flos": 23404313076480.0, "grad_norm": 2.457993836763286, "language_loss": 0.81225455, "learning_rate": 3.9995411274474225e-06, "loss": 0.83603346, "num_input_tokens_seen": 6424445, "step": 305, "time_per_iteration": 2.813042640686035 }, { "auxiliary_loss_clip": 0.01322504, "auxiliary_loss_mlp": 0.01049888, "balance_loss_clip": 1.08614635, "balance_loss_mlp": 1.03150594, "epoch": 0.036794324535561834, "flos": 27489690253440.0, "grad_norm": 11.821719772457628, "language_loss": 0.81777579, "learning_rate": 3.999524290146908e-06, "loss": 0.84149969, "num_input_tokens_seen": 6444650, "step": 306, "time_per_iteration": 3.6807756423950195 }, { "auxiliary_loss_clip": 0.01318192, "auxiliary_loss_mlp": 0.01064017, "balance_loss_clip": 1.0812788, "balance_loss_mlp": 1.04531312, "epoch": 0.036914567426200924, "flos": 19463476227840.0, "grad_norm": 2.371346813488224, "language_loss": 0.92085314, "learning_rate": 3.9995071495201485e-06, "loss": 0.94467521, "num_input_tokens_seen": 6461755, "step": 307, "time_per_iteration": 2.708165168762207 }, { "auxiliary_loss_clip": 0.01320551, "auxiliary_loss_mlp": 0.01061696, "balance_loss_clip": 1.08453202, "balance_loss_mlp": 1.04218102, "epoch": 0.037034810316840014, "flos": 22309324922880.0, "grad_norm": 2.7102188315558973, "language_loss": 0.98055506, "learning_rate": 3.999489705569744e-06, "loss": 1.00437748, "num_input_tokens_seen": 6479455, "step": 308, "time_per_iteration": 2.7222626209259033 }, { "auxiliary_loss_clip": 0.01317948, "auxiliary_loss_mlp": 0.01059042, "balance_loss_clip": 1.0810914, "balance_loss_mlp": 1.0403266, "epoch": 0.03715505320747911, "flos": 18588333265920.0, "grad_norm": 3.0075478299910454, "language_loss": 0.86244941, "learning_rate": 3.999471958298341e-06, "loss": 0.88621926, "num_input_tokens_seen": 6498365, "step": 309, "time_per_iteration": 2.773552894592285 }, { "auxiliary_loss_clip": 0.01321597, "auxiliary_loss_mlp": 0.0105902, "balance_loss_clip": 1.08356643, "balance_loss_mlp": 1.03995883, "epoch": 0.0372752960981182, "flos": 35955343267200.0, "grad_norm": 1.8692003301890903, "language_loss": 0.76116639, "learning_rate": 3.999453907708631e-06, "loss": 0.78497255, "num_input_tokens_seen": 6520770, "step": 310, "time_per_iteration": 2.8258137702941895 }, { "auxiliary_loss_clip": 0.01317394, "auxiliary_loss_mlp": 0.01049011, "balance_loss_clip": 1.08132231, "balance_loss_mlp": 1.0327152, "epoch": 0.03739553898875729, "flos": 20814040627200.0, "grad_norm": 1.8464873603471188, "language_loss": 0.81401056, "learning_rate": 3.999435553803353e-06, "loss": 0.83767462, "num_input_tokens_seen": 6540170, "step": 311, "time_per_iteration": 2.6896297931671143 }, { "auxiliary_loss_clip": 0.01315405, "auxiliary_loss_mlp": 0.01050329, "balance_loss_clip": 1.08071184, "balance_loss_mlp": 1.03281713, "epoch": 0.03751578187939638, "flos": 20264140339200.0, "grad_norm": 2.3327441891367733, "language_loss": 0.83382356, "learning_rate": 3.999416896585292e-06, "loss": 0.85748088, "num_input_tokens_seen": 6557200, "step": 312, "time_per_iteration": 2.609515905380249 }, { "auxiliary_loss_clip": 0.01313563, "auxiliary_loss_mlp": 0.01059269, "balance_loss_clip": 1.07904983, "balance_loss_mlp": 1.04075599, "epoch": 0.03763602477003547, "flos": 20668063754880.0, "grad_norm": 8.367455210623413, "language_loss": 0.85775554, "learning_rate": 3.9993979360572775e-06, "loss": 0.88148379, "num_input_tokens_seen": 6577340, "step": 313, "time_per_iteration": 2.6352968215942383 }, { "auxiliary_loss_clip": 0.01323002, "auxiliary_loss_mlp": 0.01059054, "balance_loss_clip": 1.08635175, "balance_loss_mlp": 1.04057693, "epoch": 0.03775626766067456, "flos": 16691352197760.0, "grad_norm": 3.503847974872001, "language_loss": 0.82556355, "learning_rate": 3.999378672222185e-06, "loss": 0.84938413, "num_input_tokens_seen": 6595125, "step": 314, "time_per_iteration": 2.61641263961792 }, { "auxiliary_loss_clip": 0.01311844, "auxiliary_loss_mlp": 0.01068888, "balance_loss_clip": 1.08003175, "balance_loss_mlp": 1.0507803, "epoch": 0.03787651055131366, "flos": 21141797253120.0, "grad_norm": 1.9492685489274622, "language_loss": 0.82933319, "learning_rate": 3.9993591050829385e-06, "loss": 0.85314047, "num_input_tokens_seen": 6612990, "step": 315, "time_per_iteration": 2.754539966583252 }, { "auxiliary_loss_clip": 0.0131925, "auxiliary_loss_mlp": 0.01059061, "balance_loss_clip": 1.0850234, "balance_loss_mlp": 1.04047632, "epoch": 0.037996753441952746, "flos": 22018089450240.0, "grad_norm": 1.9007872301534665, "language_loss": 0.79154676, "learning_rate": 3.999339234642506e-06, "loss": 0.81532985, "num_input_tokens_seen": 6632740, "step": 316, "time_per_iteration": 2.7447831630706787 }, { "auxiliary_loss_clip": 0.01315253, "auxiliary_loss_mlp": 0.01061114, "balance_loss_clip": 1.08205259, "balance_loss_mlp": 1.04303038, "epoch": 0.038116996332591836, "flos": 27709391790720.0, "grad_norm": 5.9566805299857455, "language_loss": 0.83794707, "learning_rate": 3.9993190609038994e-06, "loss": 0.86171073, "num_input_tokens_seen": 6651505, "step": 317, "time_per_iteration": 2.6997358798980713 }, { "auxiliary_loss_clip": 0.01313626, "auxiliary_loss_mlp": 0.01055286, "balance_loss_clip": 1.08262026, "balance_loss_mlp": 1.03710675, "epoch": 0.038237239223230926, "flos": 21178067011200.0, "grad_norm": 2.142058489601009, "language_loss": 0.83120823, "learning_rate": 3.999298583870182e-06, "loss": 0.85489726, "num_input_tokens_seen": 6671090, "step": 318, "time_per_iteration": 2.62884783744812 }, { "auxiliary_loss_clip": 0.01313336, "auxiliary_loss_mlp": 0.01055347, "balance_loss_clip": 1.08001947, "balance_loss_mlp": 1.03667879, "epoch": 0.038357482113870016, "flos": 25556618995200.0, "grad_norm": 2.016708247899925, "language_loss": 0.77369535, "learning_rate": 3.999277803544458e-06, "loss": 0.79738218, "num_input_tokens_seen": 6691245, "step": 319, "time_per_iteration": 2.6756930351257324 }, { "auxiliary_loss_clip": 0.01241233, "auxiliary_loss_mlp": 0.01015842, "balance_loss_clip": 1.07434034, "balance_loss_mlp": 1.0065434, "epoch": 0.038477725004509106, "flos": 59227578034560.0, "grad_norm": 0.9554338029834426, "language_loss": 0.62385643, "learning_rate": 3.999256719929882e-06, "loss": 0.64642715, "num_input_tokens_seen": 6752520, "step": 320, "time_per_iteration": 3.1467599868774414 }, { "auxiliary_loss_clip": 0.01241174, "auxiliary_loss_mlp": 0.01012175, "balance_loss_clip": 1.07448196, "balance_loss_mlp": 1.0029242, "epoch": 0.0385979678951482, "flos": 67317676398720.0, "grad_norm": 1.2292841025612062, "language_loss": 0.67077219, "learning_rate": 3.999235333029651e-06, "loss": 0.69330573, "num_input_tokens_seen": 6806460, "step": 321, "time_per_iteration": 3.1459403038024902 }, { "auxiliary_loss_clip": 0.01314791, "auxiliary_loss_mlp": 0.01060542, "balance_loss_clip": 1.08322525, "balance_loss_mlp": 1.04331684, "epoch": 0.03871821078578729, "flos": 22746752749440.0, "grad_norm": 3.0889597050778166, "language_loss": 0.8211453, "learning_rate": 3.999213642847009e-06, "loss": 0.84489858, "num_input_tokens_seen": 6827045, "step": 322, "time_per_iteration": 2.719306230545044 }, { "auxiliary_loss_clip": 0.01311107, "auxiliary_loss_mlp": 0.01066396, "balance_loss_clip": 1.08145475, "balance_loss_mlp": 1.04892004, "epoch": 0.03883845367642638, "flos": 26280613526400.0, "grad_norm": 5.902250473781829, "language_loss": 0.91215533, "learning_rate": 3.999191649385247e-06, "loss": 0.93593037, "num_input_tokens_seen": 6848220, "step": 323, "time_per_iteration": 2.71667742729187 }, { "auxiliary_loss_clip": 0.01236088, "auxiliary_loss_mlp": 0.01009712, "balance_loss_clip": 1.07167816, "balance_loss_mlp": 1.00031817, "epoch": 0.03895869656706547, "flos": 56962835568000.0, "grad_norm": 0.905337378477428, "language_loss": 0.59770799, "learning_rate": 3.999169352647702e-06, "loss": 0.62016594, "num_input_tokens_seen": 6909400, "step": 324, "time_per_iteration": 3.156677722930908 }, { "auxiliary_loss_clip": 0.01316129, "auxiliary_loss_mlp": 0.01070261, "balance_loss_clip": 1.08344948, "balance_loss_mlp": 1.05075836, "epoch": 0.03907893945770456, "flos": 24863363527680.0, "grad_norm": 1.8501606357250273, "language_loss": 0.83130682, "learning_rate": 3.999146752637755e-06, "loss": 0.85517079, "num_input_tokens_seen": 6930445, "step": 325, "time_per_iteration": 2.683835029602051 }, { "auxiliary_loss_clip": 0.0131518, "auxiliary_loss_mlp": 0.0105782, "balance_loss_clip": 1.08185172, "balance_loss_mlp": 1.03935432, "epoch": 0.03919918234834365, "flos": 18368595815040.0, "grad_norm": 2.682868641336022, "language_loss": 0.89205086, "learning_rate": 3.999123849358836e-06, "loss": 0.9157809, "num_input_tokens_seen": 6948110, "step": 326, "time_per_iteration": 2.672957181930542 }, { "auxiliary_loss_clip": 0.0131163, "auxiliary_loss_mlp": 0.01055847, "balance_loss_clip": 1.08043766, "balance_loss_mlp": 1.03745282, "epoch": 0.03931942523898275, "flos": 25225414663680.0, "grad_norm": 4.261353862598198, "language_loss": 0.74749565, "learning_rate": 3.999100642814418e-06, "loss": 0.7711705, "num_input_tokens_seen": 6968550, "step": 327, "time_per_iteration": 2.6969008445739746 }, { "auxiliary_loss_clip": 0.01313191, "auxiliary_loss_mlp": 0.01059462, "balance_loss_clip": 1.08309019, "balance_loss_mlp": 1.04210496, "epoch": 0.03943966812962184, "flos": 23257905240960.0, "grad_norm": 3.2470002138302885, "language_loss": 0.88196522, "learning_rate": 3.999077133008022e-06, "loss": 0.90569174, "num_input_tokens_seen": 6987135, "step": 328, "time_per_iteration": 2.6409268379211426 }, { "auxiliary_loss_clip": 0.01315889, "auxiliary_loss_mlp": 0.01055558, "balance_loss_clip": 1.08357167, "balance_loss_mlp": 1.03709257, "epoch": 0.03955991102026093, "flos": 29168837291520.0, "grad_norm": 1.9703598520233967, "language_loss": 0.90680027, "learning_rate": 3.9990533199432145e-06, "loss": 0.93051475, "num_input_tokens_seen": 7008630, "step": 329, "time_per_iteration": 3.6552486419677734 }, { "auxiliary_loss_clip": 0.01315667, "auxiliary_loss_mlp": 0.01060758, "balance_loss_clip": 1.08353353, "balance_loss_mlp": 1.04231644, "epoch": 0.03968015391090002, "flos": 17602441695360.0, "grad_norm": 2.3818577613506404, "language_loss": 0.75638235, "learning_rate": 3.999029203623608e-06, "loss": 0.7801466, "num_input_tokens_seen": 7026350, "step": 330, "time_per_iteration": 3.5881452560424805 }, { "auxiliary_loss_clip": 0.01310566, "auxiliary_loss_mlp": 0.01058272, "balance_loss_clip": 1.08129394, "balance_loss_mlp": 1.04122508, "epoch": 0.03980039680153911, "flos": 21799285752960.0, "grad_norm": 2.107938380303504, "language_loss": 0.87223387, "learning_rate": 3.99900478405286e-06, "loss": 0.8959223, "num_input_tokens_seen": 7045660, "step": 331, "time_per_iteration": 3.5931057929992676 }, { "auxiliary_loss_clip": 0.01310661, "auxiliary_loss_mlp": 0.01051016, "balance_loss_clip": 1.08240366, "balance_loss_mlp": 1.03396928, "epoch": 0.0399206396921782, "flos": 15195134148480.0, "grad_norm": 2.5831935954435363, "language_loss": 0.82481706, "learning_rate": 3.998980061234676e-06, "loss": 0.84843385, "num_input_tokens_seen": 7063575, "step": 332, "time_per_iteration": 2.6629538536071777 }, { "auxiliary_loss_clip": 0.01316398, "auxiliary_loss_mlp": 0.01054225, "balance_loss_clip": 1.08415699, "balance_loss_mlp": 1.03608119, "epoch": 0.040040882582817294, "flos": 14422910630400.0, "grad_norm": 3.2765878375970265, "language_loss": 0.75989079, "learning_rate": 3.9989550351728055e-06, "loss": 0.78359705, "num_input_tokens_seen": 7080505, "step": 333, "time_per_iteration": 3.4921488761901855 }, { "auxiliary_loss_clip": 0.01312418, "auxiliary_loss_mlp": 0.010673, "balance_loss_clip": 1.08288419, "balance_loss_mlp": 1.04985976, "epoch": 0.040161125473456384, "flos": 19280906375040.0, "grad_norm": 2.5549747578588637, "language_loss": 0.84467292, "learning_rate": 3.998929705871046e-06, "loss": 0.86847007, "num_input_tokens_seen": 7097860, "step": 334, "time_per_iteration": 2.718377113342285 }, { "auxiliary_loss_clip": 0.0131206, "auxiliary_loss_mlp": 0.01054693, "balance_loss_clip": 1.08303678, "balance_loss_mlp": 1.03682327, "epoch": 0.040281368364095474, "flos": 17821101738240.0, "grad_norm": 2.594943846886399, "language_loss": 0.89008689, "learning_rate": 3.99890407333324e-06, "loss": 0.91375446, "num_input_tokens_seen": 7116390, "step": 335, "time_per_iteration": 2.65879225730896 }, { "auxiliary_loss_clip": 0.01305189, "auxiliary_loss_mlp": 0.01053122, "balance_loss_clip": 1.07677388, "balance_loss_mlp": 1.03568137, "epoch": 0.040401611254734564, "flos": 19573757959680.0, "grad_norm": 1.8567481919899163, "language_loss": 0.87158579, "learning_rate": 3.998878137563275e-06, "loss": 0.8951689, "num_input_tokens_seen": 7135940, "step": 336, "time_per_iteration": 2.7029807567596436 }, { "auxiliary_loss_clip": 0.01308126, "auxiliary_loss_mlp": 0.01062454, "balance_loss_clip": 1.08033013, "balance_loss_mlp": 1.04584837, "epoch": 0.040521854145373654, "flos": 22054466949120.0, "grad_norm": 5.184915187114566, "language_loss": 0.85417187, "learning_rate": 3.998851898565085e-06, "loss": 0.87787765, "num_input_tokens_seen": 7155745, "step": 337, "time_per_iteration": 2.7057945728302 }, { "auxiliary_loss_clip": 0.0131052, "auxiliary_loss_mlp": 0.01060716, "balance_loss_clip": 1.07978249, "balance_loss_mlp": 1.04390728, "epoch": 0.04064209703601274, "flos": 22674644196480.0, "grad_norm": 1.9280515583643374, "language_loss": 0.83206749, "learning_rate": 3.998825356342653e-06, "loss": 0.85577989, "num_input_tokens_seen": 7175920, "step": 338, "time_per_iteration": 2.759704828262329 }, { "auxiliary_loss_clip": 0.01309634, "auxiliary_loss_mlp": 0.01046004, "balance_loss_clip": 1.07953835, "balance_loss_mlp": 1.02940989, "epoch": 0.04076233992665183, "flos": 38582172783360.0, "grad_norm": 2.9791869434929774, "language_loss": 0.73254955, "learning_rate": 3.998798510900003e-06, "loss": 0.7561059, "num_input_tokens_seen": 7198720, "step": 339, "time_per_iteration": 2.8123061656951904 }, { "auxiliary_loss_clip": 0.01303911, "auxiliary_loss_mlp": 0.01054731, "balance_loss_clip": 1.07760644, "balance_loss_mlp": 1.03761244, "epoch": 0.04088258281729093, "flos": 25885309374720.0, "grad_norm": 2.8020377577927547, "language_loss": 0.83838844, "learning_rate": 3.998771362241207e-06, "loss": 0.86197484, "num_input_tokens_seen": 7219125, "step": 340, "time_per_iteration": 2.677311658859253 }, { "auxiliary_loss_clip": 0.01305011, "auxiliary_loss_mlp": 0.01055533, "balance_loss_clip": 1.07914829, "balance_loss_mlp": 1.03904581, "epoch": 0.04100282570793002, "flos": 19789832223360.0, "grad_norm": 2.116505356596882, "language_loss": 0.8788749, "learning_rate": 3.998743910370385e-06, "loss": 0.90248036, "num_input_tokens_seen": 7237985, "step": 341, "time_per_iteration": 2.7009551525115967 }, { "auxiliary_loss_clip": 0.01314027, "auxiliary_loss_mlp": 0.01051436, "balance_loss_clip": 1.08639824, "balance_loss_mlp": 1.03416204, "epoch": 0.04112306859856911, "flos": 22565152563840.0, "grad_norm": 2.3140281496041912, "language_loss": 0.73341012, "learning_rate": 3.998716155291702e-06, "loss": 0.7570647, "num_input_tokens_seen": 7255825, "step": 342, "time_per_iteration": 2.632542133331299 }, { "auxiliary_loss_clip": 0.01307555, "auxiliary_loss_mlp": 0.01043454, "balance_loss_clip": 1.08335137, "balance_loss_mlp": 1.02697909, "epoch": 0.0412433114892082, "flos": 25040654081280.0, "grad_norm": 1.6953134538468169, "language_loss": 0.90617108, "learning_rate": 3.998688097009366e-06, "loss": 0.92968118, "num_input_tokens_seen": 7276590, "step": 343, "time_per_iteration": 2.8527209758758545 }, { "auxiliary_loss_clip": 0.01303845, "auxiliary_loss_mlp": 0.01052978, "balance_loss_clip": 1.07916045, "balance_loss_mlp": 1.03610945, "epoch": 0.04136355437984729, "flos": 25191371548800.0, "grad_norm": 3.110155882471581, "language_loss": 0.79673529, "learning_rate": 3.998659735527636e-06, "loss": 0.8203035, "num_input_tokens_seen": 7295680, "step": 344, "time_per_iteration": 2.670529842376709 }, { "auxiliary_loss_clip": 0.01303376, "auxiliary_loss_mlp": 0.01060344, "balance_loss_clip": 1.07962394, "balance_loss_mlp": 1.04312992, "epoch": 0.04148379727048638, "flos": 22966777509120.0, "grad_norm": 2.0288079630142697, "language_loss": 0.77838016, "learning_rate": 3.998631070850813e-06, "loss": 0.80201733, "num_input_tokens_seen": 7316300, "step": 345, "time_per_iteration": 2.7030649185180664 }, { "auxiliary_loss_clip": 0.01304251, "auxiliary_loss_mlp": 0.01059153, "balance_loss_clip": 1.07916498, "balance_loss_mlp": 1.04283285, "epoch": 0.041604040161125476, "flos": 14063481187200.0, "grad_norm": 2.7149746707821008, "language_loss": 0.8338145, "learning_rate": 3.9986021029832455e-06, "loss": 0.85744858, "num_input_tokens_seen": 7333615, "step": 346, "time_per_iteration": 2.6110408306121826 }, { "auxiliary_loss_clip": 0.0130469, "auxiliary_loss_mlp": 0.01052665, "balance_loss_clip": 1.07904434, "balance_loss_mlp": 1.03557062, "epoch": 0.041724283051764566, "flos": 12091877614080.0, "grad_norm": 2.618161004038605, "language_loss": 0.91037285, "learning_rate": 3.9985728319293285e-06, "loss": 0.93394643, "num_input_tokens_seen": 7347590, "step": 347, "time_per_iteration": 2.653554916381836 }, { "auxiliary_loss_clip": 0.01308529, "auxiliary_loss_mlp": 0.01065578, "balance_loss_clip": 1.07882595, "balance_loss_mlp": 1.0465169, "epoch": 0.041844525942403656, "flos": 12385303816320.0, "grad_norm": 2.5259381614977747, "language_loss": 0.85050571, "learning_rate": 3.998543257693501e-06, "loss": 0.87424684, "num_input_tokens_seen": 7364345, "step": 348, "time_per_iteration": 2.8200273513793945 }, { "auxiliary_loss_clip": 0.01304348, "auxiliary_loss_mlp": 0.01054274, "balance_loss_clip": 1.080948, "balance_loss_mlp": 1.03720367, "epoch": 0.041964768833042745, "flos": 23769345041280.0, "grad_norm": 1.7711025230908282, "language_loss": 0.8803162, "learning_rate": 3.998513380280251e-06, "loss": 0.90390241, "num_input_tokens_seen": 7384625, "step": 349, "time_per_iteration": 2.7264652252197266 }, { "auxiliary_loss_clip": 0.01309174, "auxiliary_loss_mlp": 0.01068243, "balance_loss_clip": 1.08033311, "balance_loss_mlp": 1.0509932, "epoch": 0.042085011723681835, "flos": 11875336473600.0, "grad_norm": 3.8200484439665185, "language_loss": 0.95077676, "learning_rate": 3.99848319969411e-06, "loss": 0.9745509, "num_input_tokens_seen": 7402225, "step": 350, "time_per_iteration": 2.9306702613830566 }, { "auxiliary_loss_clip": 0.01311774, "auxiliary_loss_mlp": 0.01067051, "balance_loss_clip": 1.08323812, "balance_loss_mlp": 1.04934812, "epoch": 0.042205254614320925, "flos": 16873957964160.0, "grad_norm": 2.4216160113589194, "language_loss": 0.78877413, "learning_rate": 3.9984527159396564e-06, "loss": 0.81256235, "num_input_tokens_seen": 7420865, "step": 351, "time_per_iteration": 2.8476171493530273 }, { "auxiliary_loss_clip": 0.01302136, "auxiliary_loss_mlp": 0.01054442, "balance_loss_clip": 1.07748055, "balance_loss_mlp": 1.03752637, "epoch": 0.04232549750496002, "flos": 25118508810240.0, "grad_norm": 2.1560167146306735, "language_loss": 0.83963549, "learning_rate": 3.9984219290215154e-06, "loss": 0.86320132, "num_input_tokens_seen": 7441040, "step": 352, "time_per_iteration": 2.7566349506378174 }, { "auxiliary_loss_clip": 0.01303534, "auxiliary_loss_mlp": 0.01059427, "balance_loss_clip": 1.08061063, "balance_loss_mlp": 1.04301178, "epoch": 0.04244574039559911, "flos": 26724541714560.0, "grad_norm": 3.1966676570032213, "language_loss": 0.89213383, "learning_rate": 3.998390838944356e-06, "loss": 0.91576338, "num_input_tokens_seen": 7462545, "step": 353, "time_per_iteration": 2.7652621269226074 }, { "auxiliary_loss_clip": 0.01306941, "auxiliary_loss_mlp": 0.01053478, "balance_loss_clip": 1.07934618, "balance_loss_mlp": 1.0368011, "epoch": 0.0425659832862382, "flos": 20923244951040.0, "grad_norm": 4.429405146484284, "language_loss": 0.90408921, "learning_rate": 3.998359445712895e-06, "loss": 0.92769343, "num_input_tokens_seen": 7481650, "step": 354, "time_per_iteration": 2.69223690032959 }, { "auxiliary_loss_clip": 0.01303635, "auxiliary_loss_mlp": 0.01054416, "balance_loss_clip": 1.07802653, "balance_loss_mlp": 1.03771496, "epoch": 0.04268622617687729, "flos": 23331127115520.0, "grad_norm": 3.0986551063430645, "language_loss": 0.81134617, "learning_rate": 3.9983277493318955e-06, "loss": 0.83492661, "num_input_tokens_seen": 7500945, "step": 355, "time_per_iteration": 2.6368491649627686 }, { "auxiliary_loss_clip": 0.01302412, "auxiliary_loss_mlp": 0.0105426, "balance_loss_clip": 1.07770896, "balance_loss_mlp": 1.03778505, "epoch": 0.04280646906751638, "flos": 25994010908160.0, "grad_norm": 1.653004541875781, "language_loss": 0.81128716, "learning_rate": 3.998295749806165e-06, "loss": 0.83485389, "num_input_tokens_seen": 7522170, "step": 356, "time_per_iteration": 3.516709327697754 }, { "auxiliary_loss_clip": 0.01305651, "auxiliary_loss_mlp": 0.01054769, "balance_loss_clip": 1.07950175, "balance_loss_mlp": 1.03835404, "epoch": 0.04292671195815547, "flos": 26906824258560.0, "grad_norm": 2.0964476951655455, "language_loss": 0.83325434, "learning_rate": 3.998263447140558e-06, "loss": 0.85685855, "num_input_tokens_seen": 7542370, "step": 357, "time_per_iteration": 4.495465517044067 }, { "auxiliary_loss_clip": 0.01302657, "auxiliary_loss_mlp": 0.01052245, "balance_loss_clip": 1.07814622, "balance_loss_mlp": 1.03503132, "epoch": 0.04304695484879457, "flos": 39457315745280.0, "grad_norm": 1.9791897329572008, "language_loss": 0.81494331, "learning_rate": 3.998230841339976e-06, "loss": 0.83849233, "num_input_tokens_seen": 7564380, "step": 358, "time_per_iteration": 3.7497401237487793 }, { "auxiliary_loss_clip": 0.01304031, "auxiliary_loss_mlp": 0.01061637, "balance_loss_clip": 1.08069944, "balance_loss_mlp": 1.0454129, "epoch": 0.04316719773943366, "flos": 19646297475840.0, "grad_norm": 2.9926578627958027, "language_loss": 0.84986901, "learning_rate": 3.998197932409363e-06, "loss": 0.8735258, "num_input_tokens_seen": 7582390, "step": 359, "time_per_iteration": 2.6211178302764893 }, { "auxiliary_loss_clip": 0.01299347, "auxiliary_loss_mlp": 0.01052802, "balance_loss_clip": 1.07755017, "balance_loss_mlp": 1.03618407, "epoch": 0.04328744063007275, "flos": 22452320966400.0, "grad_norm": 2.1391725520609186, "language_loss": 0.86312336, "learning_rate": 3.9981647203537125e-06, "loss": 0.88664484, "num_input_tokens_seen": 7599890, "step": 360, "time_per_iteration": 2.684583902359009 }, { "auxiliary_loss_clip": 0.01301256, "auxiliary_loss_mlp": 0.01052676, "balance_loss_clip": 1.07748687, "balance_loss_mlp": 1.03559351, "epoch": 0.04340768352071184, "flos": 21283033530240.0, "grad_norm": 2.2748306331486976, "language_loss": 0.95937276, "learning_rate": 3.998131205178063e-06, "loss": 0.98291212, "num_input_tokens_seen": 7618360, "step": 361, "time_per_iteration": 2.64471173286438 }, { "auxiliary_loss_clip": 0.013026, "auxiliary_loss_mlp": 0.01056073, "balance_loss_clip": 1.0787288, "balance_loss_mlp": 1.03811979, "epoch": 0.04352792641135093, "flos": 11583705951360.0, "grad_norm": 11.735900798606263, "language_loss": 0.76661503, "learning_rate": 3.998097386887498e-06, "loss": 0.79020178, "num_input_tokens_seen": 7635435, "step": 362, "time_per_iteration": 2.6437597274780273 }, { "auxiliary_loss_clip": 0.01296467, "auxiliary_loss_mlp": 0.010562, "balance_loss_clip": 1.07645893, "balance_loss_mlp": 1.04059577, "epoch": 0.04364816930199002, "flos": 23623547736960.0, "grad_norm": 1.8472651974208198, "language_loss": 0.84921682, "learning_rate": 3.998063265487148e-06, "loss": 0.87274349, "num_input_tokens_seen": 7656485, "step": 363, "time_per_iteration": 2.654468536376953 }, { "auxiliary_loss_clip": 0.01300164, "auxiliary_loss_mlp": 0.01064081, "balance_loss_clip": 1.07740927, "balance_loss_mlp": 1.04808283, "epoch": 0.043768412192629114, "flos": 14429734214400.0, "grad_norm": 4.112434094423012, "language_loss": 0.81128711, "learning_rate": 3.99802884098219e-06, "loss": 0.83492959, "num_input_tokens_seen": 7674595, "step": 364, "time_per_iteration": 2.6554665565490723 }, { "auxiliary_loss_clip": 0.01298253, "auxiliary_loss_mlp": 0.01046731, "balance_loss_clip": 1.07515359, "balance_loss_mlp": 1.0303036, "epoch": 0.043888655083268203, "flos": 26468893641600.0, "grad_norm": 2.478277187301562, "language_loss": 0.82387006, "learning_rate": 3.997994113377845e-06, "loss": 0.8473199, "num_input_tokens_seen": 7693495, "step": 365, "time_per_iteration": 2.6950199604034424 }, { "auxiliary_loss_clip": 0.01302148, "auxiliary_loss_mlp": 0.01063067, "balance_loss_clip": 1.0791018, "balance_loss_mlp": 1.04640126, "epoch": 0.04400889797390729, "flos": 27235263242880.0, "grad_norm": 2.4618489894397824, "language_loss": 0.83243906, "learning_rate": 3.9979590826793815e-06, "loss": 0.8560912, "num_input_tokens_seen": 7714685, "step": 366, "time_per_iteration": 2.705496072769165 }, { "auxiliary_loss_clip": 0.01303664, "auxiliary_loss_mlp": 0.01054857, "balance_loss_clip": 1.08077598, "balance_loss_mlp": 1.03745198, "epoch": 0.04412914086454638, "flos": 20119528183680.0, "grad_norm": 2.1991399256036974, "language_loss": 0.81119907, "learning_rate": 3.997923748892113e-06, "loss": 0.83478421, "num_input_tokens_seen": 7734005, "step": 367, "time_per_iteration": 2.6525352001190186 }, { "auxiliary_loss_clip": 0.01301725, "auxiliary_loss_mlp": 0.01059197, "balance_loss_clip": 1.08004928, "balance_loss_mlp": 1.04267454, "epoch": 0.04424938375518547, "flos": 22604618632320.0, "grad_norm": 1.7871520008729351, "language_loss": 0.88514447, "learning_rate": 3.9978881120214015e-06, "loss": 0.90875375, "num_input_tokens_seen": 7755525, "step": 368, "time_per_iteration": 2.7256197929382324 }, { "auxiliary_loss_clip": 0.0130007, "auxiliary_loss_mlp": 0.01046724, "balance_loss_clip": 1.07588542, "balance_loss_mlp": 1.02989185, "epoch": 0.04436962664582456, "flos": 24132365844480.0, "grad_norm": 1.849002353469361, "language_loss": 0.79390275, "learning_rate": 3.997852172072652e-06, "loss": 0.81737071, "num_input_tokens_seen": 7776740, "step": 369, "time_per_iteration": 2.676048517227173 }, { "auxiliary_loss_clip": 0.01301548, "auxiliary_loss_mlp": 0.0106793, "balance_loss_clip": 1.07754004, "balance_loss_mlp": 1.04942918, "epoch": 0.04448986953646366, "flos": 18222906251520.0, "grad_norm": 3.0680065399828855, "language_loss": 0.89492077, "learning_rate": 3.9978159290513155e-06, "loss": 0.91861558, "num_input_tokens_seen": 7794820, "step": 370, "time_per_iteration": 2.58294677734375 }, { "auxiliary_loss_clip": 0.01307686, "auxiliary_loss_mlp": 0.01062697, "balance_loss_clip": 1.08165538, "balance_loss_mlp": 1.0453763, "epoch": 0.04461011242710275, "flos": 30117920400000.0, "grad_norm": 2.8603399139884647, "language_loss": 0.80434948, "learning_rate": 3.997779382962892e-06, "loss": 0.82805336, "num_input_tokens_seen": 7817705, "step": 371, "time_per_iteration": 2.731043815612793 }, { "auxiliary_loss_clip": 0.01298654, "auxiliary_loss_mlp": 0.01051143, "balance_loss_clip": 1.07828224, "balance_loss_mlp": 1.0345608, "epoch": 0.04473035531774184, "flos": 29752529299200.0, "grad_norm": 2.3908545636859815, "language_loss": 0.74133778, "learning_rate": 3.997742533812924e-06, "loss": 0.76483572, "num_input_tokens_seen": 7840970, "step": 372, "time_per_iteration": 2.782999038696289 }, { "auxiliary_loss_clip": 0.01299294, "auxiliary_loss_mlp": 0.01068106, "balance_loss_clip": 1.0776968, "balance_loss_mlp": 1.05089164, "epoch": 0.04485059820838093, "flos": 13151565676800.0, "grad_norm": 2.7520260148809164, "language_loss": 0.92503977, "learning_rate": 3.997705381607001e-06, "loss": 0.94871378, "num_input_tokens_seen": 7857785, "step": 373, "time_per_iteration": 2.634202003479004 }, { "auxiliary_loss_clip": 0.01229592, "auxiliary_loss_mlp": 0.01031015, "balance_loss_clip": 1.06863999, "balance_loss_mlp": 1.02171695, "epoch": 0.04497084109902002, "flos": 68094209548800.0, "grad_norm": 5.298250616423988, "language_loss": 0.60237736, "learning_rate": 3.997667926350761e-06, "loss": 0.62498343, "num_input_tokens_seen": 7916115, "step": 374, "time_per_iteration": 3.125986337661743 }, { "auxiliary_loss_clip": 0.01229545, "auxiliary_loss_mlp": 0.01026861, "balance_loss_clip": 1.06812024, "balance_loss_mlp": 1.01751459, "epoch": 0.04509108398965911, "flos": 64342263346560.0, "grad_norm": 0.9115823330686188, "language_loss": 0.5778172, "learning_rate": 3.997630168049886e-06, "loss": 0.60038126, "num_input_tokens_seen": 7974480, "step": 375, "time_per_iteration": 3.200150966644287 }, { "auxiliary_loss_clip": 0.01302189, "auxiliary_loss_mlp": 0.01052453, "balance_loss_clip": 1.07994246, "balance_loss_mlp": 1.03531051, "epoch": 0.045211326880298205, "flos": 22271115830400.0, "grad_norm": 1.9778693985668847, "language_loss": 0.77330089, "learning_rate": 3.997592106710101e-06, "loss": 0.79684734, "num_input_tokens_seen": 7993940, "step": 376, "time_per_iteration": 2.6484200954437256 }, { "auxiliary_loss_clip": 0.01297064, "auxiliary_loss_mlp": 0.0105497, "balance_loss_clip": 1.07606614, "balance_loss_mlp": 1.03855538, "epoch": 0.045331569770937295, "flos": 32159441796480.0, "grad_norm": 5.435332119991449, "language_loss": 0.65523994, "learning_rate": 3.997553742337182e-06, "loss": 0.67876029, "num_input_tokens_seen": 8013365, "step": 377, "time_per_iteration": 2.726078510284424 }, { "auxiliary_loss_clip": 0.01300217, "auxiliary_loss_mlp": 0.01064226, "balance_loss_clip": 1.07984042, "balance_loss_mlp": 1.04829991, "epoch": 0.045451812661576385, "flos": 22163455791360.0, "grad_norm": 1.9156859903029109, "language_loss": 0.91562378, "learning_rate": 3.997515074936949e-06, "loss": 0.93926823, "num_input_tokens_seen": 8034240, "step": 378, "time_per_iteration": 2.6822803020477295 }, { "auxiliary_loss_clip": 0.01297574, "auxiliary_loss_mlp": 0.01051214, "balance_loss_clip": 1.07721746, "balance_loss_mlp": 1.0342741, "epoch": 0.045572055552215475, "flos": 16581968305920.0, "grad_norm": 2.6642076871791205, "language_loss": 0.87371695, "learning_rate": 3.997476104515268e-06, "loss": 0.89720482, "num_input_tokens_seen": 8052430, "step": 379, "time_per_iteration": 2.6224355697631836 }, { "auxiliary_loss_clip": 0.01293056, "auxiliary_loss_mlp": 0.01052869, "balance_loss_clip": 1.07755327, "balance_loss_mlp": 1.03671646, "epoch": 0.045692298442854565, "flos": 17603375448960.0, "grad_norm": 2.3321547462077077, "language_loss": 0.77836657, "learning_rate": 3.9974368310780485e-06, "loss": 0.80182588, "num_input_tokens_seen": 8069605, "step": 380, "time_per_iteration": 2.659353494644165 }, { "auxiliary_loss_clip": 0.01304064, "auxiliary_loss_mlp": 0.01061384, "balance_loss_clip": 1.08022654, "balance_loss_mlp": 1.04424191, "epoch": 0.045812541333493655, "flos": 26761098781440.0, "grad_norm": 2.780408237198681, "language_loss": 0.74347103, "learning_rate": 3.997397254631251e-06, "loss": 0.76712549, "num_input_tokens_seen": 8090225, "step": 381, "time_per_iteration": 2.678143262863159 }, { "auxiliary_loss_clip": 0.01218911, "auxiliary_loss_mlp": 0.01012946, "balance_loss_clip": 1.06116951, "balance_loss_mlp": 1.00350475, "epoch": 0.04593278422413275, "flos": 60250349894400.0, "grad_norm": 0.8081417123630017, "language_loss": 0.60029817, "learning_rate": 3.997357375180878e-06, "loss": 0.62261677, "num_input_tokens_seen": 8154505, "step": 382, "time_per_iteration": 4.130557060241699 }, { "auxiliary_loss_clip": 0.01295451, "auxiliary_loss_mlp": 0.01054881, "balance_loss_clip": 1.07608986, "balance_loss_mlp": 1.03870416, "epoch": 0.04605302711477184, "flos": 21799249839360.0, "grad_norm": 1.7384428028315952, "language_loss": 0.75364017, "learning_rate": 3.997317192732979e-06, "loss": 0.77714348, "num_input_tokens_seen": 8173285, "step": 383, "time_per_iteration": 3.5492475032806396 }, { "auxiliary_loss_clip": 0.01298251, "auxiliary_loss_mlp": 0.01081114, "balance_loss_clip": 1.07620525, "balance_loss_mlp": 1.06467474, "epoch": 0.04617327000541093, "flos": 19459705299840.0, "grad_norm": 1.862336881741504, "language_loss": 0.82686806, "learning_rate": 3.99727670729365e-06, "loss": 0.85066175, "num_input_tokens_seen": 8191845, "step": 384, "time_per_iteration": 3.527864694595337 }, { "auxiliary_loss_clip": 0.01297728, "auxiliary_loss_mlp": 0.01052551, "balance_loss_clip": 1.07980442, "balance_loss_mlp": 1.03630292, "epoch": 0.04629351289605002, "flos": 25411468135680.0, "grad_norm": 1.7169656763152812, "language_loss": 0.77886957, "learning_rate": 3.997235918869033e-06, "loss": 0.80237234, "num_input_tokens_seen": 8212880, "step": 385, "time_per_iteration": 3.5657477378845215 }, { "auxiliary_loss_clip": 0.01294538, "auxiliary_loss_mlp": 0.01048789, "balance_loss_clip": 1.07687211, "balance_loss_mlp": 1.03304136, "epoch": 0.04641375578668911, "flos": 20558284813440.0, "grad_norm": 2.0990250560348804, "language_loss": 0.82422608, "learning_rate": 3.997194827465315e-06, "loss": 0.84765935, "num_input_tokens_seen": 8231475, "step": 386, "time_per_iteration": 2.694240093231201 }, { "auxiliary_loss_clip": 0.01293776, "auxiliary_loss_mlp": 0.01048039, "balance_loss_clip": 1.07469344, "balance_loss_mlp": 1.03165984, "epoch": 0.0465339986773282, "flos": 13188661447680.0, "grad_norm": 3.668457870887252, "language_loss": 0.9140296, "learning_rate": 3.997153433088728e-06, "loss": 0.93744779, "num_input_tokens_seen": 8248600, "step": 387, "time_per_iteration": 2.614888906478882 }, { "auxiliary_loss_clip": 0.01294541, "auxiliary_loss_mlp": 0.01048702, "balance_loss_clip": 1.07612348, "balance_loss_mlp": 1.03195333, "epoch": 0.0466542415679673, "flos": 25556547168000.0, "grad_norm": 3.095194407786512, "language_loss": 0.81485796, "learning_rate": 3.997111735745554e-06, "loss": 0.83829039, "num_input_tokens_seen": 8271570, "step": 388, "time_per_iteration": 2.7356226444244385 }, { "auxiliary_loss_clip": 0.01293387, "auxiliary_loss_mlp": 0.01058006, "balance_loss_clip": 1.0762105, "balance_loss_mlp": 1.04109049, "epoch": 0.04677448445860639, "flos": 22236749493120.0, "grad_norm": 1.9160458408406096, "language_loss": 0.82782376, "learning_rate": 3.997069735442118e-06, "loss": 0.85133767, "num_input_tokens_seen": 8291265, "step": 389, "time_per_iteration": 2.6425600051879883 }, { "auxiliary_loss_clip": 0.01289865, "auxiliary_loss_mlp": 0.01049579, "balance_loss_clip": 1.07428908, "balance_loss_mlp": 1.03452289, "epoch": 0.04689472734924548, "flos": 28147825198080.0, "grad_norm": 1.8054039265887218, "language_loss": 0.80316806, "learning_rate": 3.997027432184792e-06, "loss": 0.82656252, "num_input_tokens_seen": 8315925, "step": 390, "time_per_iteration": 2.7311086654663086 }, { "auxiliary_loss_clip": 0.01292172, "auxiliary_loss_mlp": 0.01043596, "balance_loss_clip": 1.07509053, "balance_loss_mlp": 1.02720487, "epoch": 0.04701497023988457, "flos": 23148952312320.0, "grad_norm": 1.9855015041288748, "language_loss": 0.89557445, "learning_rate": 3.99698482597999e-06, "loss": 0.91893214, "num_input_tokens_seen": 8333605, "step": 391, "time_per_iteration": 2.6503472328186035 }, { "auxiliary_loss_clip": 0.01209534, "auxiliary_loss_mlp": 0.01008373, "balance_loss_clip": 1.05423355, "balance_loss_mlp": 0.99917048, "epoch": 0.04713521313052366, "flos": 64827668764800.0, "grad_norm": 0.8819561541659392, "language_loss": 0.63975763, "learning_rate": 3.99694191683418e-06, "loss": 0.66193664, "num_input_tokens_seen": 8394405, "step": 392, "time_per_iteration": 3.2271018028259277 }, { "auxiliary_loss_clip": 0.01300491, "auxiliary_loss_mlp": 0.01049527, "balance_loss_clip": 1.08060265, "balance_loss_mlp": 1.0334574, "epoch": 0.047255456021162746, "flos": 18771585477120.0, "grad_norm": 2.1087823950029008, "language_loss": 0.81835318, "learning_rate": 3.996898704753867e-06, "loss": 0.84185338, "num_input_tokens_seen": 8412355, "step": 393, "time_per_iteration": 2.7225568294525146 }, { "auxiliary_loss_clip": 0.01293015, "auxiliary_loss_mlp": 0.01060726, "balance_loss_clip": 1.07374549, "balance_loss_mlp": 1.04400134, "epoch": 0.04737569891180184, "flos": 22053820504320.0, "grad_norm": 2.4285516811479195, "language_loss": 0.87830842, "learning_rate": 3.996855189745609e-06, "loss": 0.90184581, "num_input_tokens_seen": 8431620, "step": 394, "time_per_iteration": 2.6995530128479004 }, { "auxiliary_loss_clip": 0.01291689, "auxiliary_loss_mlp": 0.01046846, "balance_loss_clip": 1.07387996, "balance_loss_mlp": 1.03215957, "epoch": 0.04749594180244093, "flos": 29057370410880.0, "grad_norm": 2.2024421180345954, "language_loss": 0.92685658, "learning_rate": 3.996811371816007e-06, "loss": 0.95024192, "num_input_tokens_seen": 8454045, "step": 395, "time_per_iteration": 2.738726854324341 }, { "auxiliary_loss_clip": 0.01293593, "auxiliary_loss_mlp": 0.01050369, "balance_loss_clip": 1.07729363, "balance_loss_mlp": 1.0348475, "epoch": 0.04761618469308002, "flos": 35112268172160.0, "grad_norm": 1.9898024634541902, "language_loss": 0.77986836, "learning_rate": 3.996767250971707e-06, "loss": 0.80330795, "num_input_tokens_seen": 8476785, "step": 396, "time_per_iteration": 2.787046194076538 }, { "auxiliary_loss_clip": 0.01293384, "auxiliary_loss_mlp": 0.01056578, "balance_loss_clip": 1.07635403, "balance_loss_mlp": 1.03984058, "epoch": 0.04773642758371911, "flos": 25630702796160.0, "grad_norm": 1.772569646175126, "language_loss": 0.86864698, "learning_rate": 3.996722827219403e-06, "loss": 0.89214659, "num_input_tokens_seen": 8498400, "step": 397, "time_per_iteration": 2.6741998195648193 }, { "auxiliary_loss_clip": 0.01292788, "auxiliary_loss_mlp": 0.01053077, "balance_loss_clip": 1.07613659, "balance_loss_mlp": 1.03673363, "epoch": 0.0478566704743582, "flos": 20631506688000.0, "grad_norm": 2.3970860501203863, "language_loss": 0.82831717, "learning_rate": 3.996678100565833e-06, "loss": 0.85177588, "num_input_tokens_seen": 8517455, "step": 398, "time_per_iteration": 2.639420509338379 }, { "auxiliary_loss_clip": 0.01289822, "auxiliary_loss_mlp": 0.01064366, "balance_loss_clip": 1.07460463, "balance_loss_mlp": 1.04774797, "epoch": 0.04797691336499729, "flos": 18835721210880.0, "grad_norm": 2.1516839352813135, "language_loss": 0.88451064, "learning_rate": 3.996633071017783e-06, "loss": 0.90805256, "num_input_tokens_seen": 8534085, "step": 399, "time_per_iteration": 2.664177656173706 }, { "auxiliary_loss_clip": 0.01290936, "auxiliary_loss_mlp": 0.01052556, "balance_loss_clip": 1.07636344, "balance_loss_mlp": 1.03689194, "epoch": 0.04809715625563638, "flos": 21099673578240.0, "grad_norm": 2.669274155804717, "language_loss": 0.82088906, "learning_rate": 3.996587738582084e-06, "loss": 0.84432399, "num_input_tokens_seen": 8550885, "step": 400, "time_per_iteration": 2.641785144805908 }, { "auxiliary_loss_clip": 0.01293293, "auxiliary_loss_mlp": 0.01048966, "balance_loss_clip": 1.07610917, "balance_loss_mlp": 1.03380287, "epoch": 0.04821739914627548, "flos": 23805650712960.0, "grad_norm": 2.9053764207972157, "language_loss": 0.86313266, "learning_rate": 3.9965421032656115e-06, "loss": 0.88655519, "num_input_tokens_seen": 8570815, "step": 401, "time_per_iteration": 2.7073943614959717 }, { "auxiliary_loss_clip": 0.0129221, "auxiliary_loss_mlp": 0.01051412, "balance_loss_clip": 1.07578194, "balance_loss_mlp": 1.03575957, "epoch": 0.04833764203691457, "flos": 22200587475840.0, "grad_norm": 3.0223910462283547, "language_loss": 0.94327593, "learning_rate": 3.99649616507529e-06, "loss": 0.96671212, "num_input_tokens_seen": 8589910, "step": 402, "time_per_iteration": 2.639754056930542 }, { "auxiliary_loss_clip": 0.01200891, "auxiliary_loss_mlp": 0.01010472, "balance_loss_clip": 1.04910183, "balance_loss_mlp": 1.00165009, "epoch": 0.04845788492755366, "flos": 65904376896000.0, "grad_norm": 0.8911483503930873, "language_loss": 0.63280851, "learning_rate": 3.996449924018088e-06, "loss": 0.65492213, "num_input_tokens_seen": 8650370, "step": 403, "time_per_iteration": 3.1136527061462402 }, { "auxiliary_loss_clip": 0.0128811, "auxiliary_loss_mlp": 0.01048019, "balance_loss_clip": 1.07227468, "balance_loss_mlp": 1.03276062, "epoch": 0.04857812781819275, "flos": 19281301424640.0, "grad_norm": 2.1235192192881294, "language_loss": 0.79580522, "learning_rate": 3.99640338010102e-06, "loss": 0.81916654, "num_input_tokens_seen": 8669475, "step": 404, "time_per_iteration": 2.7265443801879883 }, { "auxiliary_loss_clip": 0.0128719, "auxiliary_loss_mlp": 0.01049882, "balance_loss_clip": 1.07264519, "balance_loss_mlp": 1.03495753, "epoch": 0.04869837070883184, "flos": 24062376193920.0, "grad_norm": 1.9531659266552335, "language_loss": 0.78571874, "learning_rate": 3.996356533331146e-06, "loss": 0.80908942, "num_input_tokens_seen": 8691345, "step": 405, "time_per_iteration": 2.651461362838745 }, { "auxiliary_loss_clip": 0.01293542, "auxiliary_loss_mlp": 0.01048509, "balance_loss_clip": 1.0729543, "balance_loss_mlp": 1.03105664, "epoch": 0.04881861359947093, "flos": 25187169657600.0, "grad_norm": 2.888319179534624, "language_loss": 0.61595905, "learning_rate": 3.996309383715573e-06, "loss": 0.6393795, "num_input_tokens_seen": 8710125, "step": 406, "time_per_iteration": 2.6398096084594727 }, { "auxiliary_loss_clip": 0.01292289, "auxiliary_loss_mlp": 0.01054171, "balance_loss_clip": 1.07597327, "balance_loss_mlp": 1.03665948, "epoch": 0.048938856490110025, "flos": 16362913213440.0, "grad_norm": 1.971694364650014, "language_loss": 0.7383033, "learning_rate": 3.996261931261454e-06, "loss": 0.76176786, "num_input_tokens_seen": 8728705, "step": 407, "time_per_iteration": 2.645510196685791 }, { "auxiliary_loss_clip": 0.01295603, "auxiliary_loss_mlp": 0.01062239, "balance_loss_clip": 1.0782758, "balance_loss_mlp": 1.04571676, "epoch": 0.049059099380749115, "flos": 29895094379520.0, "grad_norm": 1.9163212213885072, "language_loss": 0.86655384, "learning_rate": 3.996214175975987e-06, "loss": 0.89013231, "num_input_tokens_seen": 8749225, "step": 408, "time_per_iteration": 3.588085174560547 }, { "auxiliary_loss_clip": 0.0129555, "auxiliary_loss_mlp": 0.01049339, "balance_loss_clip": 1.07680726, "balance_loss_mlp": 1.03232801, "epoch": 0.049179342271388204, "flos": 35918858027520.0, "grad_norm": 2.2743332457168464, "language_loss": 0.79276836, "learning_rate": 3.996166117866417e-06, "loss": 0.81621718, "num_input_tokens_seen": 8771160, "step": 409, "time_per_iteration": 2.7322707176208496 }, { "auxiliary_loss_clip": 0.01286842, "auxiliary_loss_mlp": 0.01048021, "balance_loss_clip": 1.07119548, "balance_loss_mlp": 1.03320312, "epoch": 0.049299585162027294, "flos": 14611226659200.0, "grad_norm": 2.174391695318995, "language_loss": 0.86905503, "learning_rate": 3.996117756940035e-06, "loss": 0.89240366, "num_input_tokens_seen": 8787845, "step": 410, "time_per_iteration": 2.577420234680176 }, { "auxiliary_loss_clip": 0.01287664, "auxiliary_loss_mlp": 0.01053522, "balance_loss_clip": 1.07339787, "balance_loss_mlp": 1.03821516, "epoch": 0.049419828052666384, "flos": 19567939956480.0, "grad_norm": 2.2102803330424576, "language_loss": 0.97707361, "learning_rate": 3.996069093204175e-06, "loss": 1.00048542, "num_input_tokens_seen": 8803805, "step": 411, "time_per_iteration": 4.427610397338867 }, { "auxiliary_loss_clip": 0.01294352, "auxiliary_loss_mlp": 0.01046866, "balance_loss_clip": 1.07771516, "balance_loss_mlp": 1.03244209, "epoch": 0.049540070943305474, "flos": 13659916907520.0, "grad_norm": 2.352711557636332, "language_loss": 0.88274282, "learning_rate": 3.996020126666221e-06, "loss": 0.90615499, "num_input_tokens_seen": 8820785, "step": 412, "time_per_iteration": 2.5822505950927734 }, { "auxiliary_loss_clip": 0.01292134, "auxiliary_loss_mlp": 0.0105314, "balance_loss_clip": 1.07694256, "balance_loss_mlp": 1.03809571, "epoch": 0.04966031383394457, "flos": 21832035978240.0, "grad_norm": 2.0381910350327015, "language_loss": 0.82182908, "learning_rate": 3.995970857333601e-06, "loss": 0.84528184, "num_input_tokens_seen": 8841195, "step": 413, "time_per_iteration": 2.7868926525115967 }, { "auxiliary_loss_clip": 0.01289397, "auxiliary_loss_mlp": 0.01055598, "balance_loss_clip": 1.07488227, "balance_loss_mlp": 1.03911114, "epoch": 0.04978055672458366, "flos": 28618793349120.0, "grad_norm": 1.7648625853458038, "language_loss": 0.79743814, "learning_rate": 3.995921285213789e-06, "loss": 0.8208881, "num_input_tokens_seen": 8861455, "step": 414, "time_per_iteration": 2.665783643722534 }, { "auxiliary_loss_clip": 0.01288925, "auxiliary_loss_mlp": 0.01044105, "balance_loss_clip": 1.07373428, "balance_loss_mlp": 1.02864313, "epoch": 0.04990079961522275, "flos": 19828220883840.0, "grad_norm": 2.301923957699768, "language_loss": 0.80692011, "learning_rate": 3.995871410314305e-06, "loss": 0.83025038, "num_input_tokens_seen": 8880015, "step": 415, "time_per_iteration": 2.768411159515381 }, { "auxiliary_loss_clip": 0.01184486, "auxiliary_loss_mlp": 0.01009151, "balance_loss_clip": 1.04825163, "balance_loss_mlp": 1.00047255, "epoch": 0.05002104250586184, "flos": 62735045293440.0, "grad_norm": 0.9030998695797329, "language_loss": 0.59643245, "learning_rate": 3.995821232642714e-06, "loss": 0.6183688, "num_input_tokens_seen": 8938420, "step": 416, "time_per_iteration": 3.3072218894958496 }, { "auxiliary_loss_clip": 0.01269257, "auxiliary_loss_mlp": 0.01052915, "balance_loss_clip": 1.07436073, "balance_loss_mlp": 1.03677452, "epoch": 0.05014128539650093, "flos": 27928518710400.0, "grad_norm": 5.421790085037207, "language_loss": 0.8243736, "learning_rate": 3.995770752206629e-06, "loss": 0.84759533, "num_input_tokens_seen": 8959495, "step": 417, "time_per_iteration": 2.7052743434906006 }, { "auxiliary_loss_clip": 0.01291332, "auxiliary_loss_mlp": 0.0104693, "balance_loss_clip": 1.07639861, "balance_loss_mlp": 1.03053904, "epoch": 0.05026152828714002, "flos": 17705576620800.0, "grad_norm": 2.0778779795885534, "language_loss": 0.97145164, "learning_rate": 3.995719969013709e-06, "loss": 0.9948343, "num_input_tokens_seen": 8976675, "step": 418, "time_per_iteration": 2.7102177143096924 }, { "auxiliary_loss_clip": 0.01253479, "auxiliary_loss_mlp": 0.01048254, "balance_loss_clip": 1.06865847, "balance_loss_mlp": 1.03196967, "epoch": 0.05038177117777912, "flos": 19133277477120.0, "grad_norm": 15.389809140542523, "language_loss": 0.85902262, "learning_rate": 3.995668883071655e-06, "loss": 0.8820399, "num_input_tokens_seen": 8992900, "step": 419, "time_per_iteration": 2.7032995223999023 }, { "auxiliary_loss_clip": 0.01292975, "auxiliary_loss_mlp": 0.0105044, "balance_loss_clip": 1.07574022, "balance_loss_mlp": 1.034549, "epoch": 0.050502014068418206, "flos": 20667704618880.0, "grad_norm": 2.279737523478281, "language_loss": 0.91151357, "learning_rate": 3.995617494388219e-06, "loss": 0.93494773, "num_input_tokens_seen": 9011020, "step": 420, "time_per_iteration": 2.6725447177886963 }, { "auxiliary_loss_clip": 0.012497, "auxiliary_loss_mlp": 0.0105205, "balance_loss_clip": 1.06641006, "balance_loss_mlp": 1.0353967, "epoch": 0.050622256959057296, "flos": 21361103740800.0, "grad_norm": 2.3897843969472796, "language_loss": 0.80586004, "learning_rate": 3.995565802971196e-06, "loss": 0.82887757, "num_input_tokens_seen": 9030995, "step": 421, "time_per_iteration": 2.718824863433838 }, { "auxiliary_loss_clip": 0.01250918, "auxiliary_loss_mlp": 0.01057063, "balance_loss_clip": 1.06974006, "balance_loss_mlp": 1.04156613, "epoch": 0.050742499849696386, "flos": 27673588909440.0, "grad_norm": 1.9902930089143869, "language_loss": 0.67726701, "learning_rate": 3.995513808828427e-06, "loss": 0.70034683, "num_input_tokens_seen": 9053790, "step": 422, "time_per_iteration": 2.796081066131592 }, { "auxiliary_loss_clip": 0.01248079, "auxiliary_loss_mlp": 0.01048972, "balance_loss_clip": 1.06643867, "balance_loss_mlp": 1.03370106, "epoch": 0.050862742740335476, "flos": 19865999013120.0, "grad_norm": 2.3613161439233155, "language_loss": 0.7661202, "learning_rate": 3.9954615119678e-06, "loss": 0.78909063, "num_input_tokens_seen": 9072345, "step": 423, "time_per_iteration": 2.702320098876953 }, { "auxiliary_loss_clip": 0.01257682, "auxiliary_loss_mlp": 0.01043432, "balance_loss_clip": 1.06791151, "balance_loss_mlp": 1.0288291, "epoch": 0.050982985630974566, "flos": 22085098272000.0, "grad_norm": 3.4490032738996503, "language_loss": 0.80611616, "learning_rate": 3.995408912397248e-06, "loss": 0.82912737, "num_input_tokens_seen": 9090240, "step": 424, "time_per_iteration": 2.7180817127227783 }, { "auxiliary_loss_clip": 0.01253727, "auxiliary_loss_mlp": 0.01054966, "balance_loss_clip": 1.07150888, "balance_loss_mlp": 1.03944468, "epoch": 0.05110322852161366, "flos": 20740962407040.0, "grad_norm": 2.4383242354792465, "language_loss": 0.93390119, "learning_rate": 3.99535601012475e-06, "loss": 0.95698822, "num_input_tokens_seen": 9105570, "step": 425, "time_per_iteration": 2.72660756111145 }, { "auxiliary_loss_clip": 0.01240089, "auxiliary_loss_mlp": 0.0076717, "balance_loss_clip": 1.06959558, "balance_loss_mlp": 1.00052357, "epoch": 0.05122347141225275, "flos": 28547295327360.0, "grad_norm": 1.8595131581305586, "language_loss": 0.75537908, "learning_rate": 3.995302805158333e-06, "loss": 0.77545166, "num_input_tokens_seen": 9128225, "step": 426, "time_per_iteration": 2.803004741668701 }, { "auxiliary_loss_clip": 0.01242193, "auxiliary_loss_mlp": 0.01059346, "balance_loss_clip": 1.06623244, "balance_loss_mlp": 1.04293144, "epoch": 0.05134371430289184, "flos": 19722679747200.0, "grad_norm": 1.8942337146221704, "language_loss": 0.83664978, "learning_rate": 3.9952492975060665e-06, "loss": 0.85966516, "num_input_tokens_seen": 9148295, "step": 427, "time_per_iteration": 2.8120460510253906 }, { "auxiliary_loss_clip": 0.01268533, "auxiliary_loss_mlp": 0.01050392, "balance_loss_clip": 1.07163215, "balance_loss_mlp": 1.03432274, "epoch": 0.05146395719353093, "flos": 34458945649920.0, "grad_norm": 4.037054317180963, "language_loss": 0.84818387, "learning_rate": 3.995195487176067e-06, "loss": 0.87137312, "num_input_tokens_seen": 9168525, "step": 428, "time_per_iteration": 2.7524492740631104 }, { "auxiliary_loss_clip": 0.01285384, "auxiliary_loss_mlp": 0.01058417, "balance_loss_clip": 1.07193899, "balance_loss_mlp": 1.0431459, "epoch": 0.05158420008417002, "flos": 21760286561280.0, "grad_norm": 1.77901377161528, "language_loss": 0.85738152, "learning_rate": 3.995141374176499e-06, "loss": 0.88081956, "num_input_tokens_seen": 9186920, "step": 429, "time_per_iteration": 2.7009294033050537 }, { "auxiliary_loss_clip": 0.01148415, "auxiliary_loss_mlp": 0.0075989, "balance_loss_clip": 1.04057348, "balance_loss_mlp": 1.00046813, "epoch": 0.05170444297480911, "flos": 72553956226560.0, "grad_norm": 0.8783884477988154, "language_loss": 0.63112628, "learning_rate": 3.995086958515572e-06, "loss": 0.65020931, "num_input_tokens_seen": 9244940, "step": 430, "time_per_iteration": 3.312316656112671 }, { "auxiliary_loss_clip": 0.0119209, "auxiliary_loss_mlp": 0.00759751, "balance_loss_clip": 1.04711723, "balance_loss_mlp": 1.00055933, "epoch": 0.05182468586544821, "flos": 62416159326720.0, "grad_norm": 0.8572921777334838, "language_loss": 0.59979105, "learning_rate": 3.995032240201538e-06, "loss": 0.61930943, "num_input_tokens_seen": 9307335, "step": 431, "time_per_iteration": 3.166560649871826 }, { "auxiliary_loss_clip": 0.01169036, "auxiliary_loss_mlp": 0.01012862, "balance_loss_clip": 1.0438782, "balance_loss_mlp": 1.00480366, "epoch": 0.0519449287560873, "flos": 41225989432320.0, "grad_norm": 0.9299667868115054, "language_loss": 0.63100147, "learning_rate": 3.9949772192427e-06, "loss": 0.65282047, "num_input_tokens_seen": 9353960, "step": 432, "time_per_iteration": 2.9726219177246094 }, { "auxiliary_loss_clip": 0.01249352, "auxiliary_loss_mlp": 0.01050096, "balance_loss_clip": 1.06600463, "balance_loss_mlp": 1.03540373, "epoch": 0.05206517164672639, "flos": 17494530261120.0, "grad_norm": 1.8212929727111247, "language_loss": 0.79519093, "learning_rate": 3.994921895647405e-06, "loss": 0.81818545, "num_input_tokens_seen": 9372130, "step": 433, "time_per_iteration": 3.6096909046173096 }, { "auxiliary_loss_clip": 0.01188595, "auxiliary_loss_mlp": 0.01008375, "balance_loss_clip": 1.04452968, "balance_loss_mlp": 1.00045919, "epoch": 0.05218541453736548, "flos": 64002762973440.0, "grad_norm": 0.8436779026574405, "language_loss": 0.55351132, "learning_rate": 3.994866269424043e-06, "loss": 0.575481, "num_input_tokens_seen": 9428500, "step": 434, "time_per_iteration": 3.1480226516723633 }, { "auxiliary_loss_clip": 0.01198131, "auxiliary_loss_mlp": 0.010505, "balance_loss_clip": 1.05566788, "balance_loss_mlp": 1.03530049, "epoch": 0.05230565742800457, "flos": 19317319787520.0, "grad_norm": 2.7931773097865755, "language_loss": 0.78612697, "learning_rate": 3.9948103405810545e-06, "loss": 0.8086133, "num_input_tokens_seen": 9447450, "step": 435, "time_per_iteration": 2.82161545753479 }, { "auxiliary_loss_clip": 0.01213337, "auxiliary_loss_mlp": 0.01054466, "balance_loss_clip": 1.06108499, "balance_loss_mlp": 1.03948092, "epoch": 0.05242590031864366, "flos": 25298636538240.0, "grad_norm": 1.9585017290646973, "language_loss": 0.86196637, "learning_rate": 3.994754109126923e-06, "loss": 0.88464439, "num_input_tokens_seen": 9468945, "step": 436, "time_per_iteration": 3.732786178588867 }, { "auxiliary_loss_clip": 0.01190644, "auxiliary_loss_mlp": 0.01058864, "balance_loss_clip": 1.06077528, "balance_loss_mlp": 1.04425502, "epoch": 0.052546143209282754, "flos": 26211629456640.0, "grad_norm": 3.102487603203787, "language_loss": 0.93438071, "learning_rate": 3.994697575070181e-06, "loss": 0.9568758, "num_input_tokens_seen": 9488405, "step": 437, "time_per_iteration": 4.724217414855957 }, { "auxiliary_loss_clip": 0.01252051, "auxiliary_loss_mlp": 0.01046012, "balance_loss_clip": 1.07170343, "balance_loss_mlp": 1.03194571, "epoch": 0.052666386099921844, "flos": 22158140578560.0, "grad_norm": 1.8777815103365687, "language_loss": 0.91609454, "learning_rate": 3.994640738419402e-06, "loss": 0.93907517, "num_input_tokens_seen": 9507780, "step": 438, "time_per_iteration": 2.7505035400390625 }, { "auxiliary_loss_clip": 0.01263034, "auxiliary_loss_mlp": 0.01054488, "balance_loss_clip": 1.06862211, "balance_loss_mlp": 1.04023635, "epoch": 0.052786628990560934, "flos": 23881817502720.0, "grad_norm": 2.1098545982706076, "language_loss": 0.80693662, "learning_rate": 3.9945835991832075e-06, "loss": 0.83011186, "num_input_tokens_seen": 9529665, "step": 439, "time_per_iteration": 2.7022833824157715 }, { "auxiliary_loss_clip": 0.01284868, "auxiliary_loss_mlp": 0.01048281, "balance_loss_clip": 1.07439876, "balance_loss_mlp": 1.03372598, "epoch": 0.052906871881200024, "flos": 24605021934720.0, "grad_norm": 2.195139202210898, "language_loss": 0.92865914, "learning_rate": 3.994526157370268e-06, "loss": 0.9519906, "num_input_tokens_seen": 9548280, "step": 440, "time_per_iteration": 2.71628999710083 }, { "auxiliary_loss_clip": 0.01162758, "auxiliary_loss_mlp": 0.01009402, "balance_loss_clip": 1.03917003, "balance_loss_mlp": 1.00120056, "epoch": 0.053027114771839114, "flos": 56461631143680.0, "grad_norm": 0.8919355744608258, "language_loss": 0.59321082, "learning_rate": 3.994468412989296e-06, "loss": 0.61493242, "num_input_tokens_seen": 9609690, "step": 441, "time_per_iteration": 3.394660234451294 }, { "auxiliary_loss_clip": 0.01226917, "auxiliary_loss_mlp": 0.01049701, "balance_loss_clip": 1.06322169, "balance_loss_mlp": 1.03479958, "epoch": 0.053147357662478203, "flos": 17311098481920.0, "grad_norm": 2.7623135640937884, "language_loss": 0.9256649, "learning_rate": 3.994410366049052e-06, "loss": 0.94843102, "num_input_tokens_seen": 9627550, "step": 442, "time_per_iteration": 2.733097791671753 }, { "auxiliary_loss_clip": 0.01261248, "auxiliary_loss_mlp": 0.010415, "balance_loss_clip": 1.06894374, "balance_loss_mlp": 1.02699256, "epoch": 0.0532676005531173, "flos": 17164977955200.0, "grad_norm": 40.59394791320157, "language_loss": 0.83087289, "learning_rate": 3.994352016558341e-06, "loss": 0.85390043, "num_input_tokens_seen": 9644855, "step": 443, "time_per_iteration": 2.833704948425293 }, { "auxiliary_loss_clip": 0.01263772, "auxiliary_loss_mlp": 0.01055575, "balance_loss_clip": 1.07125056, "balance_loss_mlp": 1.0408411, "epoch": 0.05338784344375639, "flos": 27819960831360.0, "grad_norm": 1.9552318702250655, "language_loss": 0.73857021, "learning_rate": 3.994293364526014e-06, "loss": 0.76176369, "num_input_tokens_seen": 9665740, "step": 444, "time_per_iteration": 2.785085916519165 }, { "auxiliary_loss_clip": 0.01241292, "auxiliary_loss_mlp": 0.01049108, "balance_loss_clip": 1.06850517, "balance_loss_mlp": 1.03283572, "epoch": 0.05350808633439548, "flos": 21507691144320.0, "grad_norm": 2.2609387871723112, "language_loss": 0.84899586, "learning_rate": 3.99423440996097e-06, "loss": 0.87189984, "num_input_tokens_seen": 9685280, "step": 445, "time_per_iteration": 2.7917685508728027 }, { "auxiliary_loss_clip": 0.01254602, "auxiliary_loss_mlp": 0.01054763, "balance_loss_clip": 1.07142305, "balance_loss_mlp": 1.03919411, "epoch": 0.05362832922503457, "flos": 20084299920000.0, "grad_norm": 31.025720900556042, "language_loss": 0.81410706, "learning_rate": 3.994175152872152e-06, "loss": 0.8372007, "num_input_tokens_seen": 9704365, "step": 446, "time_per_iteration": 2.7781684398651123 }, { "auxiliary_loss_clip": 0.01265154, "auxiliary_loss_mlp": 0.01060476, "balance_loss_clip": 1.06833529, "balance_loss_mlp": 1.04597461, "epoch": 0.05374857211567366, "flos": 26137222433280.0, "grad_norm": 2.5944538226610674, "language_loss": 0.78682315, "learning_rate": 3.994115593268548e-06, "loss": 0.81007946, "num_input_tokens_seen": 9724145, "step": 447, "time_per_iteration": 2.7722597122192383 }, { "auxiliary_loss_clip": 0.0128086, "auxiliary_loss_mlp": 0.01051094, "balance_loss_clip": 1.07252359, "balance_loss_mlp": 1.03625226, "epoch": 0.05386881500631275, "flos": 27486817165440.0, "grad_norm": 3.4450317371169272, "language_loss": 0.82241881, "learning_rate": 3.994055731159195e-06, "loss": 0.84573835, "num_input_tokens_seen": 9741615, "step": 448, "time_per_iteration": 2.8158562183380127 }, { "auxiliary_loss_clip": 0.01270235, "auxiliary_loss_mlp": 0.01053666, "balance_loss_clip": 1.0761708, "balance_loss_mlp": 1.03918767, "epoch": 0.053989057896951846, "flos": 23585087249280.0, "grad_norm": 1.987517126185603, "language_loss": 0.87227154, "learning_rate": 3.993995566553172e-06, "loss": 0.89551055, "num_input_tokens_seen": 9760580, "step": 449, "time_per_iteration": 2.805704355239868 }, { "auxiliary_loss_clip": 0.01233077, "auxiliary_loss_mlp": 0.01046941, "balance_loss_clip": 1.06122851, "balance_loss_mlp": 1.03199208, "epoch": 0.054109300787590936, "flos": 25228862369280.0, "grad_norm": 1.5755731373425914, "language_loss": 0.77104473, "learning_rate": 3.993935099459607e-06, "loss": 0.79384494, "num_input_tokens_seen": 9782195, "step": 450, "time_per_iteration": 2.8505988121032715 }, { "auxiliary_loss_clip": 0.01274172, "auxiliary_loss_mlp": 0.01043252, "balance_loss_clip": 1.06929421, "balance_loss_mlp": 1.02987683, "epoch": 0.054229543678230026, "flos": 23841525421440.0, "grad_norm": 3.9498097552442273, "language_loss": 0.74033761, "learning_rate": 3.993874329887673e-06, "loss": 0.7635119, "num_input_tokens_seen": 9800850, "step": 451, "time_per_iteration": 2.732203483581543 }, { "auxiliary_loss_clip": 0.01265463, "auxiliary_loss_mlp": 0.01049317, "balance_loss_clip": 1.07213509, "balance_loss_mlp": 1.03590584, "epoch": 0.054349786568869116, "flos": 16320933192960.0, "grad_norm": 2.5003782429865873, "language_loss": 0.86026299, "learning_rate": 3.993813257846589e-06, "loss": 0.88341081, "num_input_tokens_seen": 9817605, "step": 452, "time_per_iteration": 2.789525270462036 }, { "auxiliary_loss_clip": 0.01269379, "auxiliary_loss_mlp": 0.01047437, "balance_loss_clip": 1.07324314, "balance_loss_mlp": 1.03290558, "epoch": 0.054470029459508205, "flos": 18660729127680.0, "grad_norm": 2.1871569514474327, "language_loss": 0.92873228, "learning_rate": 3.993751883345619e-06, "loss": 0.95190042, "num_input_tokens_seen": 9835965, "step": 453, "time_per_iteration": 2.73264741897583 }, { "auxiliary_loss_clip": 0.01245576, "auxiliary_loss_mlp": 0.01052734, "balance_loss_clip": 1.06951928, "balance_loss_mlp": 1.03723717, "epoch": 0.054590272350147295, "flos": 17785298856960.0, "grad_norm": 2.381938006186589, "language_loss": 0.87716043, "learning_rate": 3.993690206394073e-06, "loss": 0.9001435, "num_input_tokens_seen": 9852265, "step": 454, "time_per_iteration": 2.7695465087890625 }, { "auxiliary_loss_clip": 0.01256693, "auxiliary_loss_mlp": 0.01052512, "balance_loss_clip": 1.07246852, "balance_loss_mlp": 1.03707409, "epoch": 0.054710515240786385, "flos": 17785945301760.0, "grad_norm": 2.1616230759216974, "language_loss": 0.87349772, "learning_rate": 3.993628227001307e-06, "loss": 0.89658976, "num_input_tokens_seen": 9870465, "step": 455, "time_per_iteration": 2.713808298110962 }, { "auxiliary_loss_clip": 0.01246105, "auxiliary_loss_mlp": 0.01049585, "balance_loss_clip": 1.06506479, "balance_loss_mlp": 1.03551793, "epoch": 0.05483075813142548, "flos": 48210900180480.0, "grad_norm": 1.9550749895072734, "language_loss": 0.71251023, "learning_rate": 3.993565945176726e-06, "loss": 0.73546714, "num_input_tokens_seen": 9891490, "step": 456, "time_per_iteration": 2.9432170391082764 }, { "auxiliary_loss_clip": 0.01248289, "auxiliary_loss_mlp": 0.01056822, "balance_loss_clip": 1.06962991, "balance_loss_mlp": 1.04238582, "epoch": 0.05495100102206457, "flos": 19682244011520.0, "grad_norm": 2.2005711799880294, "language_loss": 0.83943439, "learning_rate": 3.993503360929776e-06, "loss": 0.86248553, "num_input_tokens_seen": 9910375, "step": 457, "time_per_iteration": 2.6838905811309814 }, { "auxiliary_loss_clip": 0.01187822, "auxiliary_loss_mlp": 0.0105221, "balance_loss_clip": 1.06155205, "balance_loss_mlp": 1.03690338, "epoch": 0.05507124391270366, "flos": 26360048453760.0, "grad_norm": 1.6093704727088132, "language_loss": 0.81191719, "learning_rate": 3.99344047426995e-06, "loss": 0.83431751, "num_input_tokens_seen": 9931635, "step": 458, "time_per_iteration": 3.3502273559570312 }, { "auxiliary_loss_clip": 0.01220043, "auxiliary_loss_mlp": 0.0104713, "balance_loss_clip": 1.06316185, "balance_loss_mlp": 1.03209734, "epoch": 0.05519148680334275, "flos": 22601314581120.0, "grad_norm": 2.462518909384051, "language_loss": 0.93592596, "learning_rate": 3.993377285206789e-06, "loss": 0.95859766, "num_input_tokens_seen": 9951420, "step": 459, "time_per_iteration": 4.177008628845215 }, { "auxiliary_loss_clip": 0.01206826, "auxiliary_loss_mlp": 0.01045207, "balance_loss_clip": 1.06054378, "balance_loss_mlp": 1.03016281, "epoch": 0.05531172969398184, "flos": 40552519380480.0, "grad_norm": 1.6872489127973465, "language_loss": 0.86609066, "learning_rate": 3.99331379374988e-06, "loss": 0.88861102, "num_input_tokens_seen": 9975025, "step": 460, "time_per_iteration": 2.9176807403564453 }, { "auxiliary_loss_clip": 0.01248593, "auxiliary_loss_mlp": 0.01049551, "balance_loss_clip": 1.0618906, "balance_loss_mlp": 1.03538251, "epoch": 0.05543197258462093, "flos": 23477894087040.0, "grad_norm": 1.9805208460318446, "language_loss": 0.79866087, "learning_rate": 3.993249999908852e-06, "loss": 0.82164228, "num_input_tokens_seen": 9995175, "step": 461, "time_per_iteration": 2.7650017738342285 }, { "auxiliary_loss_clip": 0.01278323, "auxiliary_loss_mlp": 0.01053803, "balance_loss_clip": 1.06990552, "balance_loss_mlp": 1.03959322, "epoch": 0.05555221547526003, "flos": 18624603024000.0, "grad_norm": 1.9998614293548942, "language_loss": 0.87319207, "learning_rate": 3.993185903693384e-06, "loss": 0.89651334, "num_input_tokens_seen": 10011975, "step": 462, "time_per_iteration": 4.56042218208313 }, { "auxiliary_loss_clip": 0.01251184, "auxiliary_loss_mlp": 0.0104919, "balance_loss_clip": 1.06962848, "balance_loss_mlp": 1.03504014, "epoch": 0.05567245836589912, "flos": 23587098410880.0, "grad_norm": 3.0132774333864525, "language_loss": 0.82631147, "learning_rate": 3.9931215051131995e-06, "loss": 0.84931517, "num_input_tokens_seen": 10032620, "step": 463, "time_per_iteration": 3.7687015533447266 }, { "auxiliary_loss_clip": 0.01251473, "auxiliary_loss_mlp": 0.01051817, "balance_loss_clip": 1.06551528, "balance_loss_mlp": 1.03670144, "epoch": 0.05579270125653821, "flos": 27746667129600.0, "grad_norm": 1.6337387684428168, "language_loss": 0.80217737, "learning_rate": 3.993056804178068e-06, "loss": 0.82521027, "num_input_tokens_seen": 10054165, "step": 464, "time_per_iteration": 2.761094331741333 }, { "auxiliary_loss_clip": 0.01212272, "auxiliary_loss_mlp": 0.01050303, "balance_loss_clip": 1.06697702, "balance_loss_mlp": 1.0353657, "epoch": 0.0559129441471773, "flos": 27014161075200.0, "grad_norm": 1.888333267000692, "language_loss": 0.84200609, "learning_rate": 3.992991800897803e-06, "loss": 0.86463189, "num_input_tokens_seen": 10073970, "step": 465, "time_per_iteration": 2.839029550552368 }, { "auxiliary_loss_clip": 0.01278467, "auxiliary_loss_mlp": 0.01045914, "balance_loss_clip": 1.07003558, "balance_loss_mlp": 1.03174043, "epoch": 0.05603318703781639, "flos": 15229787794560.0, "grad_norm": 2.5974444680140016, "language_loss": 0.89868486, "learning_rate": 3.9929264952822665e-06, "loss": 0.9219287, "num_input_tokens_seen": 10091505, "step": 466, "time_per_iteration": 2.618161201477051 }, { "auxiliary_loss_clip": 0.01268164, "auxiliary_loss_mlp": 0.01046575, "balance_loss_clip": 1.06962729, "balance_loss_mlp": 1.03172183, "epoch": 0.05615342992845548, "flos": 22266482976000.0, "grad_norm": 2.219720269199115, "language_loss": 0.88394034, "learning_rate": 3.992860887341366e-06, "loss": 0.90708768, "num_input_tokens_seen": 10109675, "step": 467, "time_per_iteration": 2.6831932067871094 }, { "auxiliary_loss_clip": 0.01218039, "auxiliary_loss_mlp": 0.01052648, "balance_loss_clip": 1.06198335, "balance_loss_mlp": 1.03824794, "epoch": 0.056273672819094574, "flos": 23584979508480.0, "grad_norm": 2.0733929362780934, "language_loss": 0.8127926, "learning_rate": 3.992794977085052e-06, "loss": 0.83549953, "num_input_tokens_seen": 10127675, "step": 468, "time_per_iteration": 2.9471590518951416 }, { "auxiliary_loss_clip": 0.01231524, "auxiliary_loss_mlp": 0.010541, "balance_loss_clip": 1.06622851, "balance_loss_mlp": 1.0384475, "epoch": 0.056393915709733664, "flos": 19858708552320.0, "grad_norm": 1.9863433249078764, "language_loss": 0.85034794, "learning_rate": 3.992728764523326e-06, "loss": 0.87320423, "num_input_tokens_seen": 10146620, "step": 469, "time_per_iteration": 2.8474512100219727 }, { "auxiliary_loss_clip": 0.01245232, "auxiliary_loss_mlp": 0.01044407, "balance_loss_clip": 1.06789255, "balance_loss_mlp": 1.02966046, "epoch": 0.05651415860037275, "flos": 22163779013760.0, "grad_norm": 1.8213229695629185, "language_loss": 0.8086611, "learning_rate": 3.99266224966623e-06, "loss": 0.83155745, "num_input_tokens_seen": 10167535, "step": 470, "time_per_iteration": 2.7712759971618652 }, { "auxiliary_loss_clip": 0.01235439, "auxiliary_loss_mlp": 0.01050439, "balance_loss_clip": 1.06718624, "balance_loss_mlp": 1.03572869, "epoch": 0.05663440149101184, "flos": 19463548055040.0, "grad_norm": 2.4560209365963654, "language_loss": 0.87823743, "learning_rate": 3.992595432523855e-06, "loss": 0.90109622, "num_input_tokens_seen": 10184825, "step": 471, "time_per_iteration": 2.7032275199890137 }, { "auxiliary_loss_clip": 0.0122151, "auxiliary_loss_mlp": 0.0104761, "balance_loss_clip": 1.0654614, "balance_loss_mlp": 1.03284049, "epoch": 0.05675464438165093, "flos": 22670226823680.0, "grad_norm": 1.9344620106606596, "language_loss": 0.85925663, "learning_rate": 3.992528313106338e-06, "loss": 0.88194782, "num_input_tokens_seen": 10203025, "step": 472, "time_per_iteration": 2.768698215484619 }, { "auxiliary_loss_clip": 0.01281234, "auxiliary_loss_mlp": 0.00766933, "balance_loss_clip": 1.07621026, "balance_loss_mlp": 1.00052273, "epoch": 0.05687488727229002, "flos": 16901177495040.0, "grad_norm": 2.608944679978251, "language_loss": 0.8213855, "learning_rate": 3.9924608914238595e-06, "loss": 0.84186721, "num_input_tokens_seen": 10218020, "step": 473, "time_per_iteration": 2.6292810440063477 }, { "auxiliary_loss_clip": 0.01265457, "auxiliary_loss_mlp": 0.01050484, "balance_loss_clip": 1.07285976, "balance_loss_mlp": 1.03624475, "epoch": 0.05699513016292912, "flos": 29168980945920.0, "grad_norm": 2.3892819267688936, "language_loss": 0.84047496, "learning_rate": 3.992393167486648e-06, "loss": 0.86363435, "num_input_tokens_seen": 10237170, "step": 474, "time_per_iteration": 2.787321090698242 }, { "auxiliary_loss_clip": 0.01281804, "auxiliary_loss_mlp": 0.01048677, "balance_loss_clip": 1.07408786, "balance_loss_mlp": 1.0342288, "epoch": 0.05711537305356821, "flos": 18916197632640.0, "grad_norm": 2.1886057922156605, "language_loss": 0.80717599, "learning_rate": 3.992325141304977e-06, "loss": 0.83048081, "num_input_tokens_seen": 10255125, "step": 475, "time_per_iteration": 2.690633773803711 }, { "auxiliary_loss_clip": 0.01211362, "auxiliary_loss_mlp": 0.01052831, "balance_loss_clip": 1.06264138, "balance_loss_mlp": 1.03932464, "epoch": 0.0572356159442073, "flos": 26758979879040.0, "grad_norm": 4.199083840588372, "language_loss": 0.86807591, "learning_rate": 3.992256812889166e-06, "loss": 0.89071786, "num_input_tokens_seen": 10271230, "step": 476, "time_per_iteration": 2.7647175788879395 }, { "auxiliary_loss_clip": 0.01281466, "auxiliary_loss_mlp": 0.01047677, "balance_loss_clip": 1.0757134, "balance_loss_mlp": 1.03187037, "epoch": 0.05735585883484639, "flos": 35116146840960.0, "grad_norm": 2.7396963555069553, "language_loss": 0.76558501, "learning_rate": 3.992188182249582e-06, "loss": 0.78887641, "num_input_tokens_seen": 10293125, "step": 477, "time_per_iteration": 2.7652716636657715 }, { "auxiliary_loss_clip": 0.01247579, "auxiliary_loss_mlp": 0.01043338, "balance_loss_clip": 1.07075047, "balance_loss_mlp": 1.02824628, "epoch": 0.05747610172548548, "flos": 18734381965440.0, "grad_norm": 2.0954455570447457, "language_loss": 0.9078356, "learning_rate": 3.992119249396633e-06, "loss": 0.93074483, "num_input_tokens_seen": 10311810, "step": 478, "time_per_iteration": 2.7219924926757812 }, { "auxiliary_loss_clip": 0.01238912, "auxiliary_loss_mlp": 0.00767005, "balance_loss_clip": 1.06746769, "balance_loss_mlp": 1.00043774, "epoch": 0.05759634461612457, "flos": 27964752554880.0, "grad_norm": 2.0770656657548385, "language_loss": 0.82204419, "learning_rate": 3.992050014340778e-06, "loss": 0.84210336, "num_input_tokens_seen": 10332165, "step": 479, "time_per_iteration": 2.8032612800598145 }, { "auxiliary_loss_clip": 0.01166483, "auxiliary_loss_mlp": 0.01014648, "balance_loss_clip": 1.04443502, "balance_loss_mlp": 1.00768602, "epoch": 0.057716587506763666, "flos": 69292009405440.0, "grad_norm": 0.910905180236274, "language_loss": 0.55053842, "learning_rate": 3.99198047709252e-06, "loss": 0.57234979, "num_input_tokens_seen": 10393685, "step": 480, "time_per_iteration": 3.3368217945098877 }, { "auxiliary_loss_clip": 0.01227499, "auxiliary_loss_mlp": 0.01044985, "balance_loss_clip": 1.06382024, "balance_loss_mlp": 1.03000081, "epoch": 0.057836830397402755, "flos": 25009196745600.0, "grad_norm": 1.8878339558894823, "language_loss": 0.7901237, "learning_rate": 3.991910637662408e-06, "loss": 0.81284857, "num_input_tokens_seen": 10413975, "step": 481, "time_per_iteration": 2.8201568126678467 }, { "auxiliary_loss_clip": 0.01275848, "auxiliary_loss_mlp": 0.01048846, "balance_loss_clip": 1.07207441, "balance_loss_mlp": 1.03450465, "epoch": 0.057957073288041845, "flos": 25593894334080.0, "grad_norm": 2.3609264202607214, "language_loss": 0.80945981, "learning_rate": 3.9918404960610355e-06, "loss": 0.83270675, "num_input_tokens_seen": 10433005, "step": 482, "time_per_iteration": 2.705639123916626 }, { "auxiliary_loss_clip": 0.01261971, "auxiliary_loss_mlp": 0.01045315, "balance_loss_clip": 1.06633317, "balance_loss_mlp": 1.03083134, "epoch": 0.058077316178680935, "flos": 20777411733120.0, "grad_norm": 2.382669675124695, "language_loss": 0.77533102, "learning_rate": 3.991770052299043e-06, "loss": 0.79840386, "num_input_tokens_seen": 10451235, "step": 483, "time_per_iteration": 2.689882516860962 }, { "auxiliary_loss_clip": 0.0124657, "auxiliary_loss_mlp": 0.01049021, "balance_loss_clip": 1.06750917, "balance_loss_mlp": 1.03503799, "epoch": 0.058197559069320025, "flos": 18916484941440.0, "grad_norm": 2.438731510245966, "language_loss": 0.87622619, "learning_rate": 3.991699306387118e-06, "loss": 0.89918208, "num_input_tokens_seen": 10469705, "step": 484, "time_per_iteration": 2.7403030395507812 }, { "auxiliary_loss_clip": 0.01264442, "auxiliary_loss_mlp": 0.01049458, "balance_loss_clip": 1.07049692, "balance_loss_mlp": 1.03627968, "epoch": 0.058317801959959115, "flos": 24863327614080.0, "grad_norm": 1.9620359525949826, "language_loss": 0.78014684, "learning_rate": 3.991628258335991e-06, "loss": 0.80328584, "num_input_tokens_seen": 10491910, "step": 485, "time_per_iteration": 2.74094820022583 }, { "auxiliary_loss_clip": 0.01220403, "auxiliary_loss_mlp": 0.01054618, "balance_loss_clip": 1.06550455, "balance_loss_mlp": 1.0404501, "epoch": 0.05843804485059821, "flos": 23257977068160.0, "grad_norm": 3.390579577241081, "language_loss": 0.88070476, "learning_rate": 3.991556908156442e-06, "loss": 0.9034549, "num_input_tokens_seen": 10508435, "step": 486, "time_per_iteration": 3.697441577911377 }, { "auxiliary_loss_clip": 0.01243501, "auxiliary_loss_mlp": 0.01047741, "balance_loss_clip": 1.06441545, "balance_loss_mlp": 1.03260148, "epoch": 0.0585582877412373, "flos": 23150532510720.0, "grad_norm": 1.945461884322535, "language_loss": 0.88192964, "learning_rate": 3.9914852558592914e-06, "loss": 0.90484208, "num_input_tokens_seen": 10529485, "step": 487, "time_per_iteration": 2.7593801021575928 }, { "auxiliary_loss_clip": 0.01261337, "auxiliary_loss_mlp": 0.01050059, "balance_loss_clip": 1.07014239, "balance_loss_mlp": 1.03451419, "epoch": 0.05867853063187639, "flos": 23506406507520.0, "grad_norm": 3.1497987968350474, "language_loss": 0.80457318, "learning_rate": 3.991413301455413e-06, "loss": 0.82768714, "num_input_tokens_seen": 10545935, "step": 488, "time_per_iteration": 3.6796257495880127 }, { "auxiliary_loss_clip": 0.01230689, "auxiliary_loss_mlp": 0.01050321, "balance_loss_clip": 1.06328082, "balance_loss_mlp": 1.036731, "epoch": 0.05879877352251548, "flos": 29495803818240.0, "grad_norm": 2.664112196381184, "language_loss": 0.78087038, "learning_rate": 3.991341044955719e-06, "loss": 0.80368048, "num_input_tokens_seen": 10565690, "step": 489, "time_per_iteration": 3.920098066329956 }, { "auxiliary_loss_clip": 0.01254453, "auxiliary_loss_mlp": 0.00768141, "balance_loss_clip": 1.06582403, "balance_loss_mlp": 1.00043344, "epoch": 0.05891901641315457, "flos": 20157485880960.0, "grad_norm": 2.179350087072872, "language_loss": 0.81960994, "learning_rate": 3.991268486371172e-06, "loss": 0.83983588, "num_input_tokens_seen": 10584245, "step": 490, "time_per_iteration": 3.7074098587036133 }, { "auxiliary_loss_clip": 0.01242201, "auxiliary_loss_mlp": 0.01048752, "balance_loss_clip": 1.06426835, "balance_loss_mlp": 1.03377962, "epoch": 0.05903925930379366, "flos": 24644200694400.0, "grad_norm": 3.1678542666405574, "language_loss": 0.87982762, "learning_rate": 3.991195625712779e-06, "loss": 0.90273714, "num_input_tokens_seen": 10601210, "step": 491, "time_per_iteration": 2.7503597736358643 }, { "auxiliary_loss_clip": 0.0127464, "auxiliary_loss_mlp": 0.01044076, "balance_loss_clip": 1.07094598, "balance_loss_mlp": 1.0291512, "epoch": 0.05915950219443276, "flos": 21250391045760.0, "grad_norm": 2.215587816392955, "language_loss": 0.81682879, "learning_rate": 3.991122462991592e-06, "loss": 0.84001595, "num_input_tokens_seen": 10620730, "step": 492, "time_per_iteration": 2.694131851196289 }, { "auxiliary_loss_clip": 0.0128078, "auxiliary_loss_mlp": 0.01050445, "balance_loss_clip": 1.0711875, "balance_loss_mlp": 1.03534126, "epoch": 0.05927974508507185, "flos": 9902727319680.0, "grad_norm": 3.6912335622416554, "language_loss": 0.81623566, "learning_rate": 3.991048998218712e-06, "loss": 0.83954793, "num_input_tokens_seen": 10634035, "step": 493, "time_per_iteration": 2.549713373184204 }, { "auxiliary_loss_clip": 0.01258553, "auxiliary_loss_mlp": 0.01049885, "balance_loss_clip": 1.06489217, "balance_loss_mlp": 1.03441119, "epoch": 0.05939998797571094, "flos": 18259499232000.0, "grad_norm": 2.472098871333579, "language_loss": 0.76432514, "learning_rate": 3.990975231405281e-06, "loss": 0.78740948, "num_input_tokens_seen": 10652485, "step": 494, "time_per_iteration": 2.6785836219787598 }, { "auxiliary_loss_clip": 0.01255903, "auxiliary_loss_mlp": 0.01051895, "balance_loss_clip": 1.06590652, "balance_loss_mlp": 1.03771532, "epoch": 0.05952023086635003, "flos": 28256598558720.0, "grad_norm": 7.228829903976104, "language_loss": 0.78898716, "learning_rate": 3.990901162562491e-06, "loss": 0.81206506, "num_input_tokens_seen": 10673175, "step": 495, "time_per_iteration": 2.6979448795318604 }, { "auxiliary_loss_clip": 0.01225617, "auxiliary_loss_mlp": 0.00767687, "balance_loss_clip": 1.0589391, "balance_loss_mlp": 1.00032973, "epoch": 0.05964047375698912, "flos": 14902498045440.0, "grad_norm": 2.6481135021640774, "language_loss": 0.90667564, "learning_rate": 3.9908267917015765e-06, "loss": 0.92660868, "num_input_tokens_seen": 10691235, "step": 496, "time_per_iteration": 2.7400946617126465 }, { "auxiliary_loss_clip": 0.01247088, "auxiliary_loss_mlp": 0.01055202, "balance_loss_clip": 1.06414557, "balance_loss_mlp": 1.04073548, "epoch": 0.059760716647628206, "flos": 23185581206400.0, "grad_norm": 2.1674049968123783, "language_loss": 0.93090594, "learning_rate": 3.990752118833821e-06, "loss": 0.95392895, "num_input_tokens_seen": 10708675, "step": 497, "time_per_iteration": 2.6797261238098145 }, { "auxiliary_loss_clip": 0.01275609, "auxiliary_loss_mlp": 0.01056638, "balance_loss_clip": 1.07051015, "balance_loss_mlp": 1.0419569, "epoch": 0.0598809595382673, "flos": 22746968231040.0, "grad_norm": 2.216177291441323, "language_loss": 0.77842438, "learning_rate": 3.990677143970553e-06, "loss": 0.80174685, "num_input_tokens_seen": 10729485, "step": 498, "time_per_iteration": 2.6700661182403564 }, { "auxiliary_loss_clip": 0.01226063, "auxiliary_loss_mlp": 0.0104723, "balance_loss_clip": 1.06796455, "balance_loss_mlp": 1.03323472, "epoch": 0.06000120242890639, "flos": 22127221946880.0, "grad_norm": 2.0947280020378205, "language_loss": 0.81575596, "learning_rate": 3.990601867123144e-06, "loss": 0.83848882, "num_input_tokens_seen": 10749210, "step": 499, "time_per_iteration": 2.748843193054199 }, { "auxiliary_loss_clip": 0.01209316, "auxiliary_loss_mlp": 0.01048629, "balance_loss_clip": 1.06244588, "balance_loss_mlp": 1.03459811, "epoch": 0.06012144531954548, "flos": 19171773878400.0, "grad_norm": 2.8240671772503063, "language_loss": 0.85292506, "learning_rate": 3.990526288303014e-06, "loss": 0.87550449, "num_input_tokens_seen": 10768000, "step": 500, "time_per_iteration": 2.7673041820526123 }, { "auxiliary_loss_clip": 0.01240141, "auxiliary_loss_mlp": 0.00766985, "balance_loss_clip": 1.06591153, "balance_loss_mlp": 1.00040317, "epoch": 0.06024168821018457, "flos": 22783345729920.0, "grad_norm": 2.0749212144003173, "language_loss": 0.90921527, "learning_rate": 3.9904504075216295e-06, "loss": 0.92928654, "num_input_tokens_seen": 10788760, "step": 501, "time_per_iteration": 2.70145583152771 }, { "auxiliary_loss_clip": 0.01225496, "auxiliary_loss_mlp": 0.01045285, "balance_loss_clip": 1.06028736, "balance_loss_mlp": 1.03080082, "epoch": 0.06036193110082366, "flos": 18770687637120.0, "grad_norm": 2.259991164034686, "language_loss": 0.94295013, "learning_rate": 3.990374224790501e-06, "loss": 0.96565795, "num_input_tokens_seen": 10806965, "step": 502, "time_per_iteration": 2.8935465812683105 }, { "auxiliary_loss_clip": 0.01245966, "auxiliary_loss_mlp": 0.01053143, "balance_loss_clip": 1.0682739, "balance_loss_mlp": 1.03862309, "epoch": 0.06048217399146275, "flos": 17201570935680.0, "grad_norm": 2.313371838230714, "language_loss": 0.71181738, "learning_rate": 3.990297740121185e-06, "loss": 0.7348085, "num_input_tokens_seen": 10824900, "step": 503, "time_per_iteration": 2.769035577774048 }, { "auxiliary_loss_clip": 0.01258025, "auxiliary_loss_mlp": 0.00766571, "balance_loss_clip": 1.0665257, "balance_loss_mlp": 1.00045276, "epoch": 0.06060241688210185, "flos": 24024131187840.0, "grad_norm": 2.040325261108514, "language_loss": 0.78292251, "learning_rate": 3.990220953525284e-06, "loss": 0.80316848, "num_input_tokens_seen": 10842010, "step": 504, "time_per_iteration": 2.7679576873779297 }, { "auxiliary_loss_clip": 0.01226785, "auxiliary_loss_mlp": 0.0104379, "balance_loss_clip": 1.06022692, "balance_loss_mlp": 1.02940106, "epoch": 0.06072265977274094, "flos": 14611190745600.0, "grad_norm": 2.695157711280703, "language_loss": 0.74182594, "learning_rate": 3.9901438650144465e-06, "loss": 0.76453167, "num_input_tokens_seen": 10858260, "step": 505, "time_per_iteration": 2.728879928588867 }, { "auxiliary_loss_clip": 0.01249233, "auxiliary_loss_mlp": 0.01049029, "balance_loss_clip": 1.06654334, "balance_loss_mlp": 1.03416371, "epoch": 0.06084290266338003, "flos": 20558284813440.0, "grad_norm": 2.416505483927437, "language_loss": 0.91950572, "learning_rate": 3.990066474600367e-06, "loss": 0.94248831, "num_input_tokens_seen": 10876230, "step": 506, "time_per_iteration": 2.8097784519195557 }, { "auxiliary_loss_clip": 0.01240689, "auxiliary_loss_mlp": 0.01039262, "balance_loss_clip": 1.06100965, "balance_loss_mlp": 1.02555323, "epoch": 0.06096314555401912, "flos": 22309217182080.0, "grad_norm": 1.8419810860514019, "language_loss": 0.67904162, "learning_rate": 3.989988782294786e-06, "loss": 0.70184112, "num_input_tokens_seen": 10896320, "step": 507, "time_per_iteration": 2.724440097808838 }, { "auxiliary_loss_clip": 0.01208248, "auxiliary_loss_mlp": 0.01053328, "balance_loss_clip": 1.05954123, "balance_loss_mlp": 1.03902268, "epoch": 0.06108338844465821, "flos": 19131374056320.0, "grad_norm": 1.750057442343561, "language_loss": 0.95109737, "learning_rate": 3.989910788109489e-06, "loss": 0.97371316, "num_input_tokens_seen": 10912970, "step": 508, "time_per_iteration": 2.712320327758789 }, { "auxiliary_loss_clip": 0.01225518, "auxiliary_loss_mlp": 0.01054832, "balance_loss_clip": 1.06422436, "balance_loss_mlp": 1.04016888, "epoch": 0.0612036313352973, "flos": 33584018169600.0, "grad_norm": 3.2343305592553415, "language_loss": 0.74750209, "learning_rate": 3.989832492056307e-06, "loss": 0.77030563, "num_input_tokens_seen": 10933995, "step": 509, "time_per_iteration": 2.8885419368743896 }, { "auxiliary_loss_clip": 0.01261598, "auxiliary_loss_mlp": 0.01045326, "balance_loss_clip": 1.07048535, "balance_loss_mlp": 1.02906609, "epoch": 0.06132387422593639, "flos": 27490552179840.0, "grad_norm": 2.231000134122838, "language_loss": 0.8104229, "learning_rate": 3.989753894147119e-06, "loss": 0.83349204, "num_input_tokens_seen": 10954120, "step": 510, "time_per_iteration": 2.694120168685913 }, { "auxiliary_loss_clip": 0.01255507, "auxiliary_loss_mlp": 0.01053669, "balance_loss_clip": 1.07117128, "balance_loss_mlp": 1.03963196, "epoch": 0.061444117116575485, "flos": 25885057979520.0, "grad_norm": 2.5728429134818493, "language_loss": 0.80061519, "learning_rate": 3.989674994393846e-06, "loss": 0.82370698, "num_input_tokens_seen": 10973595, "step": 511, "time_per_iteration": 3.660067558288574 }, { "auxiliary_loss_clip": 0.01257031, "auxiliary_loss_mlp": 0.01044658, "balance_loss_clip": 1.06993628, "balance_loss_mlp": 1.02997136, "epoch": 0.061564360007214575, "flos": 28512031150080.0, "grad_norm": 4.286099785148859, "language_loss": 0.94174403, "learning_rate": 3.98959579280846e-06, "loss": 0.96476096, "num_input_tokens_seen": 10991995, "step": 512, "time_per_iteration": 2.8348708152770996 }, { "auxiliary_loss_clip": 0.01197894, "auxiliary_loss_mlp": 0.01049659, "balance_loss_clip": 1.06280112, "balance_loss_mlp": 1.03482962, "epoch": 0.061684602897853665, "flos": 12094355652480.0, "grad_norm": 2.0043007164421436, "language_loss": 0.82988131, "learning_rate": 3.989516289402973e-06, "loss": 0.85235685, "num_input_tokens_seen": 11007625, "step": 513, "time_per_iteration": 2.980318546295166 }, { "auxiliary_loss_clip": 0.01170921, "auxiliary_loss_mlp": 0.0105, "balance_loss_clip": 1.05143929, "balance_loss_mlp": 1.03497946, "epoch": 0.061804845788492754, "flos": 19532639865600.0, "grad_norm": 3.063410894067564, "language_loss": 0.8056106, "learning_rate": 3.989436484189447e-06, "loss": 0.82781982, "num_input_tokens_seen": 11025570, "step": 514, "time_per_iteration": 4.8572142124176025 }, { "auxiliary_loss_clip": 0.01259078, "auxiliary_loss_mlp": 0.01043137, "balance_loss_clip": 1.06657028, "balance_loss_mlp": 1.02834296, "epoch": 0.061925088679131844, "flos": 15341111020800.0, "grad_norm": 2.5145404893792445, "language_loss": 0.81089461, "learning_rate": 3.9893563771799885e-06, "loss": 0.83391678, "num_input_tokens_seen": 11042045, "step": 515, "time_per_iteration": 2.8499398231506348 }, { "auxiliary_loss_clip": 0.01275873, "auxiliary_loss_mlp": 0.01048766, "balance_loss_clip": 1.07002401, "balance_loss_mlp": 1.03359032, "epoch": 0.062045331569770934, "flos": 25919927107200.0, "grad_norm": 2.36639266706255, "language_loss": 0.86121714, "learning_rate": 3.989275968386749e-06, "loss": 0.88446349, "num_input_tokens_seen": 11059955, "step": 516, "time_per_iteration": 3.676647663116455 }, { "auxiliary_loss_clip": 0.01228246, "auxiliary_loss_mlp": 0.01053011, "balance_loss_clip": 1.06115413, "balance_loss_mlp": 1.03791952, "epoch": 0.06216557446041003, "flos": 28110621686400.0, "grad_norm": 1.9739139529991114, "language_loss": 0.76928186, "learning_rate": 3.989195257821926e-06, "loss": 0.79209447, "num_input_tokens_seen": 11078440, "step": 517, "time_per_iteration": 2.7835304737091064 }, { "auxiliary_loss_clip": 0.01245917, "auxiliary_loss_mlp": 0.01049132, "balance_loss_clip": 1.07345212, "balance_loss_mlp": 1.03488648, "epoch": 0.06228581735104912, "flos": 23478181395840.0, "grad_norm": 2.4143862240817393, "language_loss": 0.84428084, "learning_rate": 3.989114245497765e-06, "loss": 0.86723131, "num_input_tokens_seen": 11098240, "step": 518, "time_per_iteration": 2.771448850631714 }, { "auxiliary_loss_clip": 0.0125904, "auxiliary_loss_mlp": 0.01043045, "balance_loss_clip": 1.06549549, "balance_loss_mlp": 1.02870452, "epoch": 0.06240606024168821, "flos": 15195205975680.0, "grad_norm": 2.257281658869347, "language_loss": 0.9506821, "learning_rate": 3.989032931426554e-06, "loss": 0.97370303, "num_input_tokens_seen": 11115395, "step": 519, "time_per_iteration": 2.6714537143707275 }, { "auxiliary_loss_clip": 0.01242529, "auxiliary_loss_mlp": 0.01044203, "balance_loss_clip": 1.06873524, "balance_loss_mlp": 1.02979088, "epoch": 0.06252630313232731, "flos": 20631829910400.0, "grad_norm": 2.535284038704183, "language_loss": 0.86963487, "learning_rate": 3.9889513156206295e-06, "loss": 0.89250207, "num_input_tokens_seen": 11134835, "step": 520, "time_per_iteration": 2.763765335083008 }, { "auxiliary_loss_clip": 0.01230604, "auxiliary_loss_mlp": 0.01053405, "balance_loss_clip": 1.06579816, "balance_loss_mlp": 1.03905785, "epoch": 0.06264654602296639, "flos": 20778058177920.0, "grad_norm": 2.8743371383942735, "language_loss": 0.73683721, "learning_rate": 3.988869398092371e-06, "loss": 0.75967729, "num_input_tokens_seen": 11154745, "step": 521, "time_per_iteration": 2.7117271423339844 }, { "auxiliary_loss_clip": 0.01244974, "auxiliary_loss_mlp": 0.01049164, "balance_loss_clip": 1.06826365, "balance_loss_mlp": 1.0340488, "epoch": 0.06276678891360549, "flos": 29605798241280.0, "grad_norm": 2.3423722175490744, "language_loss": 0.78474742, "learning_rate": 3.988787178854206e-06, "loss": 0.80768883, "num_input_tokens_seen": 11174280, "step": 522, "time_per_iteration": 2.7223896980285645 }, { "auxiliary_loss_clip": 0.01276394, "auxiliary_loss_mlp": 0.01048299, "balance_loss_clip": 1.07242155, "balance_loss_mlp": 1.03295708, "epoch": 0.06288703180424457, "flos": 22126288193280.0, "grad_norm": 2.322000712726739, "language_loss": 0.8777746, "learning_rate": 3.988704657918608e-06, "loss": 0.9010216, "num_input_tokens_seen": 11193340, "step": 523, "time_per_iteration": 2.669917106628418 }, { "auxiliary_loss_clip": 0.01257446, "auxiliary_loss_mlp": 0.01048018, "balance_loss_clip": 1.07014477, "balance_loss_mlp": 1.03426719, "epoch": 0.06300727469488367, "flos": 14976689587200.0, "grad_norm": 2.5131830499663623, "language_loss": 0.79819208, "learning_rate": 3.988621835298094e-06, "loss": 0.82124674, "num_input_tokens_seen": 11210555, "step": 524, "time_per_iteration": 2.6599583625793457 }, { "auxiliary_loss_clip": 0.0127327, "auxiliary_loss_mlp": 0.01039213, "balance_loss_clip": 1.07193589, "balance_loss_mlp": 1.02505636, "epoch": 0.06312751758552275, "flos": 24535391420160.0, "grad_norm": 2.06004887946143, "language_loss": 0.91888976, "learning_rate": 3.988538711005229e-06, "loss": 0.94201458, "num_input_tokens_seen": 11230010, "step": 525, "time_per_iteration": 2.729264259338379 }, { "auxiliary_loss_clip": 0.0125538, "auxiliary_loss_mlp": 0.01053775, "balance_loss_clip": 1.07118225, "balance_loss_mlp": 1.04026866, "epoch": 0.06324776047616185, "flos": 21507008785920.0, "grad_norm": 3.5750501163421586, "language_loss": 0.88528144, "learning_rate": 3.988455285052622e-06, "loss": 0.908373, "num_input_tokens_seen": 11246190, "step": 526, "time_per_iteration": 2.643002986907959 }, { "auxiliary_loss_clip": 0.01252495, "auxiliary_loss_mlp": 0.01049781, "balance_loss_clip": 1.06780243, "balance_loss_mlp": 1.03482044, "epoch": 0.06336800336680094, "flos": 21688034353920.0, "grad_norm": 2.428830019846273, "language_loss": 0.83990163, "learning_rate": 3.98837155745293e-06, "loss": 0.86292446, "num_input_tokens_seen": 11264230, "step": 527, "time_per_iteration": 2.7995424270629883 }, { "auxiliary_loss_clip": 0.01257398, "auxiliary_loss_mlp": 0.01039289, "balance_loss_clip": 1.07203686, "balance_loss_mlp": 1.0245434, "epoch": 0.06348824625744003, "flos": 19500895221120.0, "grad_norm": 3.4494860404568577, "language_loss": 0.75985074, "learning_rate": 3.988287528218854e-06, "loss": 0.7828176, "num_input_tokens_seen": 11283015, "step": 528, "time_per_iteration": 2.6696109771728516 }, { "auxiliary_loss_clip": 0.01256566, "auxiliary_loss_mlp": 0.01048261, "balance_loss_clip": 1.07279873, "balance_loss_mlp": 1.03397417, "epoch": 0.06360848914807912, "flos": 15481233976320.0, "grad_norm": 2.4149909919898866, "language_loss": 0.90539092, "learning_rate": 3.98820319736314e-06, "loss": 0.92843926, "num_input_tokens_seen": 11299630, "step": 529, "time_per_iteration": 2.6579387187957764 }, { "auxiliary_loss_clip": 0.01226875, "auxiliary_loss_mlp": 0.01051584, "balance_loss_clip": 1.0654254, "balance_loss_mlp": 1.03806591, "epoch": 0.0637287320387182, "flos": 20593369422720.0, "grad_norm": 1.8238754252008533, "language_loss": 0.85424072, "learning_rate": 3.988118564898582e-06, "loss": 0.87702525, "num_input_tokens_seen": 11319170, "step": 530, "time_per_iteration": 2.654066324234009 }, { "auxiliary_loss_clip": 0.01214974, "auxiliary_loss_mlp": 0.00767196, "balance_loss_clip": 1.06378567, "balance_loss_mlp": 1.00054157, "epoch": 0.0638489749293573, "flos": 17412222245760.0, "grad_norm": 2.9178790893381708, "language_loss": 0.89050734, "learning_rate": 3.988033630838019e-06, "loss": 0.9103291, "num_input_tokens_seen": 11333210, "step": 531, "time_per_iteration": 2.7737925052642822 }, { "auxiliary_loss_clip": 0.0125666, "auxiliary_loss_mlp": 0.0105158, "balance_loss_clip": 1.0688653, "balance_loss_mlp": 1.03716755, "epoch": 0.0639692178199964, "flos": 23807661874560.0, "grad_norm": 1.8265616736489187, "language_loss": 0.88043255, "learning_rate": 3.987948395194334e-06, "loss": 0.90351498, "num_input_tokens_seen": 11355590, "step": 532, "time_per_iteration": 2.766798257827759 }, { "auxiliary_loss_clip": 0.0124825, "auxiliary_loss_mlp": 0.01048904, "balance_loss_clip": 1.06594467, "balance_loss_mlp": 1.03421712, "epoch": 0.06408946071063548, "flos": 18477225521280.0, "grad_norm": 10.20076848838827, "language_loss": 0.76806599, "learning_rate": 3.987862857980458e-06, "loss": 0.79103756, "num_input_tokens_seen": 11371535, "step": 533, "time_per_iteration": 2.8007936477661133 }, { "auxiliary_loss_clip": 0.01230543, "auxiliary_loss_mlp": 0.01045137, "balance_loss_clip": 1.06867361, "balance_loss_mlp": 1.02974737, "epoch": 0.06420970360127458, "flos": 27162220936320.0, "grad_norm": 6.3713772372351825, "language_loss": 0.76764715, "learning_rate": 3.987777019209368e-06, "loss": 0.79040396, "num_input_tokens_seen": 11392050, "step": 534, "time_per_iteration": 2.83030366897583 }, { "auxiliary_loss_clip": 0.01271308, "auxiliary_loss_mlp": 0.01054732, "balance_loss_clip": 1.07112694, "balance_loss_mlp": 1.04096949, "epoch": 0.06432994649191366, "flos": 23659673840640.0, "grad_norm": 2.05288008398172, "language_loss": 0.81301761, "learning_rate": 3.987690878894084e-06, "loss": 0.83627808, "num_input_tokens_seen": 11411765, "step": 535, "time_per_iteration": 2.722294330596924 }, { "auxiliary_loss_clip": 0.01241391, "auxiliary_loss_mlp": 0.01051371, "balance_loss_clip": 1.06501627, "balance_loss_mlp": 1.03693461, "epoch": 0.06445018938255276, "flos": 23403953940480.0, "grad_norm": 2.447577008159149, "language_loss": 0.85060394, "learning_rate": 3.987604437047673e-06, "loss": 0.87353158, "num_input_tokens_seen": 11431565, "step": 536, "time_per_iteration": 2.749311685562134 }, { "auxiliary_loss_clip": 0.01255792, "auxiliary_loss_mlp": 0.01042214, "balance_loss_clip": 1.06940413, "balance_loss_mlp": 1.02815282, "epoch": 0.06457043227319184, "flos": 19646692525440.0, "grad_norm": 2.3287565981211746, "language_loss": 0.77363759, "learning_rate": 3.987517693683251e-06, "loss": 0.79661763, "num_input_tokens_seen": 11450140, "step": 537, "time_per_iteration": 3.509599447250366 }, { "auxiliary_loss_clip": 0.0123756, "auxiliary_loss_mlp": 0.01051341, "balance_loss_clip": 1.06958568, "balance_loss_mlp": 1.0375843, "epoch": 0.06469067516383094, "flos": 16978744915200.0, "grad_norm": 2.936730091552885, "language_loss": 0.96156335, "learning_rate": 3.9874306488139745e-06, "loss": 0.98445237, "num_input_tokens_seen": 11465400, "step": 538, "time_per_iteration": 2.7631750106811523 }, { "auxiliary_loss_clip": 0.0121615, "auxiliary_loss_mlp": 0.01043107, "balance_loss_clip": 1.06392276, "balance_loss_mlp": 1.02815771, "epoch": 0.06481091805447003, "flos": 23296401642240.0, "grad_norm": 2.0120076715757427, "language_loss": 0.87921613, "learning_rate": 3.987343302453049e-06, "loss": 0.90180874, "num_input_tokens_seen": 11486675, "step": 539, "time_per_iteration": 2.7747802734375 }, { "auxiliary_loss_clip": 0.01239816, "auxiliary_loss_mlp": 0.01042236, "balance_loss_clip": 1.06955314, "balance_loss_mlp": 1.02789545, "epoch": 0.06493116094510912, "flos": 29172356824320.0, "grad_norm": 1.6999519124180849, "language_loss": 0.82468295, "learning_rate": 3.987255654613724e-06, "loss": 0.84750342, "num_input_tokens_seen": 11510440, "step": 540, "time_per_iteration": 3.764968156814575 }, { "auxiliary_loss_clip": 0.01219453, "auxiliary_loss_mlp": 0.01047656, "balance_loss_clip": 1.0619483, "balance_loss_mlp": 1.03324389, "epoch": 0.06505140383574821, "flos": 19865065259520.0, "grad_norm": 4.00320770289931, "language_loss": 0.7058996, "learning_rate": 3.987167705309296e-06, "loss": 0.72857082, "num_input_tokens_seen": 11529715, "step": 541, "time_per_iteration": 2.823775291442871 }, { "auxiliary_loss_clip": 0.0125731, "auxiliary_loss_mlp": 0.00766216, "balance_loss_clip": 1.0697664, "balance_loss_mlp": 1.00043917, "epoch": 0.0651716467263873, "flos": 17924703540480.0, "grad_norm": 2.1050409643189774, "language_loss": 0.95358723, "learning_rate": 3.987079454553108e-06, "loss": 0.97382247, "num_input_tokens_seen": 11547665, "step": 542, "time_per_iteration": 3.5835654735565186 }, { "auxiliary_loss_clip": 0.0121912, "auxiliary_loss_mlp": 0.01045673, "balance_loss_clip": 1.06745076, "balance_loss_mlp": 1.03213048, "epoch": 0.0652918896170264, "flos": 20842840356480.0, "grad_norm": 1.9559651223684358, "language_loss": 0.91122413, "learning_rate": 3.986990902358546e-06, "loss": 0.93387204, "num_input_tokens_seen": 11564605, "step": 543, "time_per_iteration": 2.7830164432525635 }, { "auxiliary_loss_clip": 0.0125283, "auxiliary_loss_mlp": 0.01047175, "balance_loss_clip": 1.06628931, "balance_loss_mlp": 1.03380573, "epoch": 0.06541213250766549, "flos": 21872507627520.0, "grad_norm": 2.2198598916031447, "language_loss": 0.93593681, "learning_rate": 3.986902048739045e-06, "loss": 0.95893681, "num_input_tokens_seen": 11584550, "step": 544, "time_per_iteration": 2.7090060710906982 }, { "auxiliary_loss_clip": 0.01241851, "auxiliary_loss_mlp": 0.01052658, "balance_loss_clip": 1.06984937, "balance_loss_mlp": 1.03856778, "epoch": 0.06553237539830457, "flos": 23110743219840.0, "grad_norm": 6.239805815128727, "language_loss": 0.80086696, "learning_rate": 3.986812893708082e-06, "loss": 0.82381201, "num_input_tokens_seen": 11600740, "step": 545, "time_per_iteration": 2.6782820224761963 }, { "auxiliary_loss_clip": 0.01241062, "auxiliary_loss_mlp": 0.01046016, "balance_loss_clip": 1.06542981, "balance_loss_mlp": 1.03146052, "epoch": 0.06565261828894367, "flos": 17923769786880.0, "grad_norm": 2.2774721940902776, "language_loss": 0.81477523, "learning_rate": 3.9867234372791826e-06, "loss": 0.83764601, "num_input_tokens_seen": 11618695, "step": 546, "time_per_iteration": 2.653667688369751 }, { "auxiliary_loss_clip": 0.01249971, "auxiliary_loss_mlp": 0.01054942, "balance_loss_clip": 1.06697071, "balance_loss_mlp": 1.04181671, "epoch": 0.06577286117958275, "flos": 22783058421120.0, "grad_norm": 1.59376221803739, "language_loss": 0.87219083, "learning_rate": 3.986633679465918e-06, "loss": 0.89523995, "num_input_tokens_seen": 11638850, "step": 547, "time_per_iteration": 2.567714214324951 }, { "auxiliary_loss_clip": 0.01208839, "auxiliary_loss_mlp": 0.01051653, "balance_loss_clip": 1.06465578, "balance_loss_mlp": 1.03799164, "epoch": 0.06589310407022185, "flos": 23696194993920.0, "grad_norm": 2.4668450168390654, "language_loss": 0.80736482, "learning_rate": 3.986543620281904e-06, "loss": 0.8299697, "num_input_tokens_seen": 11658500, "step": 548, "time_per_iteration": 2.6978933811187744 }, { "auxiliary_loss_clip": 0.01223552, "auxiliary_loss_mlp": 0.01048668, "balance_loss_clip": 1.06436563, "balance_loss_mlp": 1.0342561, "epoch": 0.06601334696086093, "flos": 26864772410880.0, "grad_norm": 2.1551949584263013, "language_loss": 0.91550529, "learning_rate": 3.986453259740802e-06, "loss": 0.93822742, "num_input_tokens_seen": 11676670, "step": 549, "time_per_iteration": 2.6462385654449463 }, { "auxiliary_loss_clip": 0.01234891, "auxiliary_loss_mlp": 0.01047171, "balance_loss_clip": 1.06887388, "balance_loss_mlp": 1.03359878, "epoch": 0.06613358985150003, "flos": 12567694101120.0, "grad_norm": 2.754400949292195, "language_loss": 0.78949249, "learning_rate": 3.986362597856319e-06, "loss": 0.81231308, "num_input_tokens_seen": 11693170, "step": 550, "time_per_iteration": 2.6591908931732178 }, { "auxiliary_loss_clip": 0.01230178, "auxiliary_loss_mlp": 0.00767288, "balance_loss_clip": 1.06253052, "balance_loss_mlp": 1.00037336, "epoch": 0.06625383274213913, "flos": 18332505624960.0, "grad_norm": 2.659444106771651, "language_loss": 0.81692833, "learning_rate": 3.986271634642211e-06, "loss": 0.83690298, "num_input_tokens_seen": 11710150, "step": 551, "time_per_iteration": 2.6699206829071045 }, { "auxiliary_loss_clip": 0.01268998, "auxiliary_loss_mlp": 0.01040801, "balance_loss_clip": 1.07280934, "balance_loss_mlp": 1.02634048, "epoch": 0.06637407563277821, "flos": 15375585098880.0, "grad_norm": 1.9851056983928699, "language_loss": 0.81252748, "learning_rate": 3.986180370112274e-06, "loss": 0.83562547, "num_input_tokens_seen": 11726670, "step": 552, "time_per_iteration": 2.7054550647735596 }, { "auxiliary_loss_clip": 0.01259226, "auxiliary_loss_mlp": 0.00766912, "balance_loss_clip": 1.07126045, "balance_loss_mlp": 1.00041723, "epoch": 0.0664943185234173, "flos": 24025244509440.0, "grad_norm": 2.2758320065447264, "language_loss": 0.74467236, "learning_rate": 3.986088804280354e-06, "loss": 0.76493376, "num_input_tokens_seen": 11746400, "step": 553, "time_per_iteration": 2.719813108444214 }, { "auxiliary_loss_clip": 0.01236337, "auxiliary_loss_mlp": 0.01056458, "balance_loss_clip": 1.06397545, "balance_loss_mlp": 1.04248714, "epoch": 0.06661456141405639, "flos": 20957503547520.0, "grad_norm": 2.5990400507914204, "language_loss": 0.93894434, "learning_rate": 3.985996937160342e-06, "loss": 0.96187228, "num_input_tokens_seen": 11765590, "step": 554, "time_per_iteration": 2.6903107166290283 }, { "auxiliary_loss_clip": 0.01253625, "auxiliary_loss_mlp": 0.01050007, "balance_loss_clip": 1.07058775, "balance_loss_mlp": 1.03611946, "epoch": 0.06673480430469549, "flos": 52223953322880.0, "grad_norm": 2.14986177124641, "language_loss": 0.68706322, "learning_rate": 3.985904768766173e-06, "loss": 0.71009958, "num_input_tokens_seen": 11788365, "step": 555, "time_per_iteration": 2.8759732246398926 }, { "auxiliary_loss_clip": 0.01222389, "auxiliary_loss_mlp": 0.01046574, "balance_loss_clip": 1.0646708, "balance_loss_mlp": 1.03210235, "epoch": 0.06685504719533458, "flos": 16217079995520.0, "grad_norm": 3.657305334973761, "language_loss": 0.76252913, "learning_rate": 3.98581229911183e-06, "loss": 0.78521883, "num_input_tokens_seen": 11807285, "step": 556, "time_per_iteration": 2.747011184692383 }, { "auxiliary_loss_clip": 0.01249546, "auxiliary_loss_mlp": 0.01048557, "balance_loss_clip": 1.06304729, "balance_loss_mlp": 1.03464532, "epoch": 0.06697529008597367, "flos": 22491535639680.0, "grad_norm": 2.380812906226626, "language_loss": 0.92250198, "learning_rate": 3.985719528211341e-06, "loss": 0.94548297, "num_input_tokens_seen": 11826655, "step": 557, "time_per_iteration": 2.7364466190338135 }, { "auxiliary_loss_clip": 0.011654, "auxiliary_loss_mlp": 0.01007206, "balance_loss_clip": 1.05401134, "balance_loss_mlp": 1.0015316, "epoch": 0.06709553297661276, "flos": 62688216936960.0, "grad_norm": 0.8487815222614941, "language_loss": 0.63040733, "learning_rate": 3.985626456078777e-06, "loss": 0.65213341, "num_input_tokens_seen": 11891310, "step": 558, "time_per_iteration": 3.384471893310547 }, { "auxiliary_loss_clip": 0.01222713, "auxiliary_loss_mlp": 0.01047848, "balance_loss_clip": 1.0660733, "balance_loss_mlp": 1.03278005, "epoch": 0.06721577586725185, "flos": 11216590997760.0, "grad_norm": 2.7199048749086, "language_loss": 0.86275542, "learning_rate": 3.985533082728259e-06, "loss": 0.88546103, "num_input_tokens_seen": 11906965, "step": 559, "time_per_iteration": 2.749204397201538 }, { "auxiliary_loss_clip": 0.01272863, "auxiliary_loss_mlp": 0.01051082, "balance_loss_clip": 1.07133543, "balance_loss_mlp": 1.03695583, "epoch": 0.06733601875789094, "flos": 25922189664000.0, "grad_norm": 2.2750028965365403, "language_loss": 0.74877608, "learning_rate": 3.985439408173951e-06, "loss": 0.77201557, "num_input_tokens_seen": 11927190, "step": 560, "time_per_iteration": 2.6557657718658447 }, { "auxiliary_loss_clip": 0.01272439, "auxiliary_loss_mlp": 0.01049129, "balance_loss_clip": 1.07173944, "balance_loss_mlp": 1.03497291, "epoch": 0.06745626164853002, "flos": 20813645577600.0, "grad_norm": 1.9945421253271232, "language_loss": 0.71002114, "learning_rate": 3.9853454324300634e-06, "loss": 0.73323691, "num_input_tokens_seen": 11946400, "step": 561, "time_per_iteration": 2.7306649684906006 }, { "auxiliary_loss_clip": 0.01194991, "auxiliary_loss_mlp": 0.01044293, "balance_loss_clip": 1.05986178, "balance_loss_mlp": 1.03100133, "epoch": 0.06757650453916912, "flos": 19829262378240.0, "grad_norm": 2.011743113616089, "language_loss": 0.78157258, "learning_rate": 3.985251155510852e-06, "loss": 0.80396539, "num_input_tokens_seen": 11965430, "step": 562, "time_per_iteration": 2.837284564971924 }, { "auxiliary_loss_clip": 0.01207426, "auxiliary_loss_mlp": 0.01046646, "balance_loss_clip": 1.06768811, "balance_loss_mlp": 1.03248966, "epoch": 0.06769674742980822, "flos": 25739224761600.0, "grad_norm": 2.1370400284696727, "language_loss": 0.80088758, "learning_rate": 3.98515657743062e-06, "loss": 0.82342833, "num_input_tokens_seen": 11984895, "step": 563, "time_per_iteration": 3.6496822834014893 }, { "auxiliary_loss_clip": 0.01243193, "auxiliary_loss_mlp": 0.01046531, "balance_loss_clip": 1.06788337, "balance_loss_mlp": 1.03338194, "epoch": 0.0678169903204473, "flos": 13074788355840.0, "grad_norm": 3.0052785581367867, "language_loss": 0.77968907, "learning_rate": 3.985061698203711e-06, "loss": 0.80258638, "num_input_tokens_seen": 12002010, "step": 564, "time_per_iteration": 2.7362117767333984 }, { "auxiliary_loss_clip": 0.0118718, "auxiliary_loss_mlp": 0.01006799, "balance_loss_clip": 1.05417049, "balance_loss_mlp": 1.00141037, "epoch": 0.0679372332110864, "flos": 70865830788480.0, "grad_norm": 0.8955883533641473, "language_loss": 0.63854265, "learning_rate": 3.984966517844523e-06, "loss": 0.66048247, "num_input_tokens_seen": 12057255, "step": 565, "time_per_iteration": 3.1373822689056396 }, { "auxiliary_loss_clip": 0.01273452, "auxiliary_loss_mlp": 0.01048702, "balance_loss_clip": 1.07199895, "balance_loss_mlp": 1.03394413, "epoch": 0.06805747610172548, "flos": 28256418990720.0, "grad_norm": 2.2968524713936165, "language_loss": 0.80726737, "learning_rate": 3.984871036367492e-06, "loss": 0.83048898, "num_input_tokens_seen": 12077280, "step": 566, "time_per_iteration": 3.5436887741088867 }, { "auxiliary_loss_clip": 0.01257163, "auxiliary_loss_mlp": 0.0076734, "balance_loss_clip": 1.07057774, "balance_loss_mlp": 1.0002656, "epoch": 0.06817771899236458, "flos": 20120533764480.0, "grad_norm": 2.1935862722085675, "language_loss": 0.83063781, "learning_rate": 3.984775253787102e-06, "loss": 0.85088289, "num_input_tokens_seen": 12095570, "step": 567, "time_per_iteration": 3.795020818710327 }, { "auxiliary_loss_clip": 0.01250827, "auxiliary_loss_mlp": 0.01048465, "balance_loss_clip": 1.0637486, "balance_loss_mlp": 1.03501213, "epoch": 0.06829796188300366, "flos": 17930629284480.0, "grad_norm": 2.8327262826445816, "language_loss": 0.88006186, "learning_rate": 3.984679170117885e-06, "loss": 0.90305477, "num_input_tokens_seen": 12111775, "step": 568, "time_per_iteration": 3.6362342834472656 }, { "auxiliary_loss_clip": 0.01250128, "auxiliary_loss_mlp": 0.01043643, "balance_loss_clip": 1.06749773, "balance_loss_mlp": 1.02919483, "epoch": 0.06841820477364276, "flos": 14501627285760.0, "grad_norm": 3.2440940418556896, "language_loss": 0.78774881, "learning_rate": 3.984582785374415e-06, "loss": 0.81068653, "num_input_tokens_seen": 12129215, "step": 569, "time_per_iteration": 2.6611690521240234 }, { "auxiliary_loss_clip": 0.01238, "auxiliary_loss_mlp": 0.00766378, "balance_loss_clip": 1.06898904, "balance_loss_mlp": 1.00023174, "epoch": 0.06853844766428185, "flos": 21938474954880.0, "grad_norm": 30.441531417228404, "language_loss": 0.80686831, "learning_rate": 3.9844860995713155e-06, "loss": 0.82691216, "num_input_tokens_seen": 12148755, "step": 570, "time_per_iteration": 2.7592830657958984 }, { "auxiliary_loss_clip": 0.01257795, "auxiliary_loss_mlp": 0.01046896, "balance_loss_clip": 1.07308006, "balance_loss_mlp": 1.03371167, "epoch": 0.06865869055492094, "flos": 16800628348800.0, "grad_norm": 3.7910060663192895, "language_loss": 0.83214062, "learning_rate": 3.9843891127232524e-06, "loss": 0.85518754, "num_input_tokens_seen": 12166290, "step": 571, "time_per_iteration": 2.7251033782958984 }, { "auxiliary_loss_clip": 0.01199549, "auxiliary_loss_mlp": 0.01059317, "balance_loss_clip": 1.06237507, "balance_loss_mlp": 1.04658532, "epoch": 0.06877893344556003, "flos": 19937281553280.0, "grad_norm": 2.3592642325040116, "language_loss": 0.66511369, "learning_rate": 3.984291824844938e-06, "loss": 0.6877023, "num_input_tokens_seen": 12181385, "step": 572, "time_per_iteration": 2.9484503269195557 }, { "auxiliary_loss_clip": 0.01270819, "auxiliary_loss_mlp": 0.01043832, "balance_loss_clip": 1.07211494, "balance_loss_mlp": 1.02952135, "epoch": 0.06889917633619912, "flos": 23039388852480.0, "grad_norm": 4.7443234943812955, "language_loss": 0.8524062, "learning_rate": 3.984194235951132e-06, "loss": 0.87555271, "num_input_tokens_seen": 12197530, "step": 573, "time_per_iteration": 2.6607229709625244 }, { "auxiliary_loss_clip": 0.01271617, "auxiliary_loss_mlp": 0.01039962, "balance_loss_clip": 1.07430792, "balance_loss_mlp": 1.02653956, "epoch": 0.06901941922683821, "flos": 20960556203520.0, "grad_norm": 8.899396618097825, "language_loss": 0.84563845, "learning_rate": 3.9840963460566375e-06, "loss": 0.86875427, "num_input_tokens_seen": 12216310, "step": 574, "time_per_iteration": 2.711838960647583 }, { "auxiliary_loss_clip": 0.01174776, "auxiliary_loss_mlp": 0.0104771, "balance_loss_clip": 1.06174684, "balance_loss_mlp": 1.03395319, "epoch": 0.06913966211747731, "flos": 24821850384000.0, "grad_norm": 1.5805904695135182, "language_loss": 0.89651769, "learning_rate": 3.983998155176305e-06, "loss": 0.91874254, "num_input_tokens_seen": 12236670, "step": 575, "time_per_iteration": 2.8479719161987305 }, { "auxiliary_loss_clip": 0.01180911, "auxiliary_loss_mlp": 0.01007486, "balance_loss_clip": 1.04950523, "balance_loss_mlp": 1.00235963, "epoch": 0.06925990500811639, "flos": 58367446957440.0, "grad_norm": 0.8196954944579226, "language_loss": 0.57096517, "learning_rate": 3.9838996633250305e-06, "loss": 0.59284914, "num_input_tokens_seen": 12297185, "step": 576, "time_per_iteration": 3.144300699234009 }, { "auxiliary_loss_clip": 0.01252735, "auxiliary_loss_mlp": 0.01046969, "balance_loss_clip": 1.06681657, "balance_loss_mlp": 1.0339154, "epoch": 0.06938014789875549, "flos": 12749940731520.0, "grad_norm": 2.5485545576455357, "language_loss": 0.88342685, "learning_rate": 3.983800870517753e-06, "loss": 0.90642381, "num_input_tokens_seen": 12313975, "step": 577, "time_per_iteration": 2.702695846557617 }, { "auxiliary_loss_clip": 0.01251556, "auxiliary_loss_mlp": 0.01048593, "balance_loss_clip": 1.07170403, "balance_loss_mlp": 1.03561711, "epoch": 0.06950039078939457, "flos": 22820226019200.0, "grad_norm": 9.549613743961567, "language_loss": 0.79185432, "learning_rate": 3.983701776769463e-06, "loss": 0.81485581, "num_input_tokens_seen": 12331385, "step": 578, "time_per_iteration": 2.7667999267578125 }, { "auxiliary_loss_clip": 0.01242359, "auxiliary_loss_mlp": 0.01047474, "balance_loss_clip": 1.06755733, "balance_loss_mlp": 1.03471851, "epoch": 0.06962063368003367, "flos": 21941348042880.0, "grad_norm": 2.2733352728811007, "language_loss": 0.85532874, "learning_rate": 3.9836023820951885e-06, "loss": 0.87822711, "num_input_tokens_seen": 12350600, "step": 579, "time_per_iteration": 2.6785950660705566 }, { "auxiliary_loss_clip": 0.0121925, "auxiliary_loss_mlp": 0.01043829, "balance_loss_clip": 1.06424427, "balance_loss_mlp": 1.03070974, "epoch": 0.06974087657067275, "flos": 20706021452160.0, "grad_norm": 2.2620166790646943, "language_loss": 0.68504947, "learning_rate": 3.983502686510011e-06, "loss": 0.70768028, "num_input_tokens_seen": 12371430, "step": 580, "time_per_iteration": 2.7175381183624268 }, { "auxiliary_loss_clip": 0.01252935, "auxiliary_loss_mlp": 0.00766217, "balance_loss_clip": 1.06700361, "balance_loss_mlp": 1.00024271, "epoch": 0.06986111946131185, "flos": 22638230784000.0, "grad_norm": 1.7979739131655443, "language_loss": 0.7366519, "learning_rate": 3.9834026900290525e-06, "loss": 0.75684345, "num_input_tokens_seen": 12390825, "step": 581, "time_per_iteration": 2.679097890853882 }, { "auxiliary_loss_clip": 0.01267995, "auxiliary_loss_mlp": 0.01046965, "balance_loss_clip": 1.07138169, "balance_loss_mlp": 1.03474557, "epoch": 0.06998136235195095, "flos": 26943453152640.0, "grad_norm": 2.5813490428616768, "language_loss": 1.00304878, "learning_rate": 3.983302392667482e-06, "loss": 1.02619839, "num_input_tokens_seen": 12411670, "step": 582, "time_per_iteration": 2.6406190395355225 }, { "auxiliary_loss_clip": 0.01254476, "auxiliary_loss_mlp": 0.01047893, "balance_loss_clip": 1.07412779, "balance_loss_mlp": 1.03493476, "epoch": 0.07010160524259003, "flos": 22492505306880.0, "grad_norm": 1.77888315778107, "language_loss": 0.93458587, "learning_rate": 3.983201794440517e-06, "loss": 0.95760953, "num_input_tokens_seen": 12431245, "step": 583, "time_per_iteration": 2.694089412689209 }, { "auxiliary_loss_clip": 0.01221923, "auxiliary_loss_mlp": 0.01046933, "balance_loss_clip": 1.06499863, "balance_loss_mlp": 1.03376055, "epoch": 0.07022184813322913, "flos": 18332541538560.0, "grad_norm": 1.8831338267792685, "language_loss": 0.67743397, "learning_rate": 3.9831008953634165e-06, "loss": 0.70012254, "num_input_tokens_seen": 12450535, "step": 584, "time_per_iteration": 2.699342727661133 }, { "auxiliary_loss_clip": 0.01184686, "auxiliary_loss_mlp": 0.01053351, "balance_loss_clip": 1.05762947, "balance_loss_mlp": 1.03982711, "epoch": 0.07034209102386821, "flos": 24675550289280.0, "grad_norm": 3.5120915460034565, "language_loss": 0.81365317, "learning_rate": 3.9829996954514864e-06, "loss": 0.83603358, "num_input_tokens_seen": 12469675, "step": 585, "time_per_iteration": 2.8128342628479004 }, { "auxiliary_loss_clip": 0.01236455, "auxiliary_loss_mlp": 0.0104549, "balance_loss_clip": 1.06577754, "balance_loss_mlp": 1.03222191, "epoch": 0.0704623339145073, "flos": 25995878415360.0, "grad_norm": 2.0319309724450023, "language_loss": 0.84288979, "learning_rate": 3.982898194720079e-06, "loss": 0.8657093, "num_input_tokens_seen": 12490405, "step": 586, "time_per_iteration": 2.7475976943969727 }, { "auxiliary_loss_clip": 0.01230416, "auxiliary_loss_mlp": 0.00766669, "balance_loss_clip": 1.06834865, "balance_loss_mlp": 1.00027406, "epoch": 0.0705825768051464, "flos": 25338318088320.0, "grad_norm": 2.1926900433233367, "language_loss": 0.82552886, "learning_rate": 3.982796393184592e-06, "loss": 0.84549975, "num_input_tokens_seen": 12509485, "step": 587, "time_per_iteration": 2.812030076980591 }, { "auxiliary_loss_clip": 0.01157836, "auxiliary_loss_mlp": 0.01010665, "balance_loss_clip": 1.04122329, "balance_loss_mlp": 1.00534868, "epoch": 0.07070281969578548, "flos": 66047552507520.0, "grad_norm": 0.8190602304442607, "language_loss": 0.62651038, "learning_rate": 3.98269429086047e-06, "loss": 0.64819539, "num_input_tokens_seen": 12567325, "step": 588, "time_per_iteration": 3.1467673778533936 }, { "auxiliary_loss_clip": 0.01231877, "auxiliary_loss_mlp": 0.01043038, "balance_loss_clip": 1.06761074, "balance_loss_mlp": 1.02875113, "epoch": 0.07082306258642458, "flos": 23653568528640.0, "grad_norm": 3.48101706130193, "language_loss": 0.86017406, "learning_rate": 3.982591887763199e-06, "loss": 0.88292325, "num_input_tokens_seen": 12584785, "step": 589, "time_per_iteration": 3.6911938190460205 }, { "auxiliary_loss_clip": 0.01191512, "auxiliary_loss_mlp": 0.01045387, "balance_loss_clip": 1.05399489, "balance_loss_mlp": 1.0323931, "epoch": 0.07094330547706366, "flos": 13880049408000.0, "grad_norm": 7.329092078268937, "language_loss": 0.81627172, "learning_rate": 3.982489183908316e-06, "loss": 0.83864069, "num_input_tokens_seen": 12601205, "step": 590, "time_per_iteration": 2.7544798851013184 }, { "auxiliary_loss_clip": 0.01160077, "auxiliary_loss_mlp": 0.010471, "balance_loss_clip": 1.05072844, "balance_loss_mlp": 1.03422546, "epoch": 0.07106354836770276, "flos": 24645098534400.0, "grad_norm": 1.9239549997030303, "language_loss": 0.84582204, "learning_rate": 3.982386179311399e-06, "loss": 0.86789382, "num_input_tokens_seen": 12621725, "step": 591, "time_per_iteration": 2.8666436672210693 }, { "auxiliary_loss_clip": 0.01255452, "auxiliary_loss_mlp": 0.01041887, "balance_loss_clip": 1.07002628, "balance_loss_mlp": 1.02780867, "epoch": 0.07118379125834184, "flos": 16217223649920.0, "grad_norm": 2.458339054808149, "language_loss": 0.87708449, "learning_rate": 3.982282873988075e-06, "loss": 0.90005791, "num_input_tokens_seen": 12639600, "step": 592, "time_per_iteration": 3.6496973037719727 }, { "auxiliary_loss_clip": 0.01227703, "auxiliary_loss_mlp": 0.01044866, "balance_loss_clip": 1.06587934, "balance_loss_mlp": 1.03203344, "epoch": 0.07130403414898094, "flos": 19719986227200.0, "grad_norm": 1.6552514988420344, "language_loss": 0.87064189, "learning_rate": 3.982179267954016e-06, "loss": 0.89336753, "num_input_tokens_seen": 12660030, "step": 593, "time_per_iteration": 3.7004263401031494 }, { "auxiliary_loss_clip": 0.01264504, "auxiliary_loss_mlp": 0.01038995, "balance_loss_clip": 1.0689106, "balance_loss_mlp": 1.02568567, "epoch": 0.07142427703962004, "flos": 21871933009920.0, "grad_norm": 2.209868235734169, "language_loss": 0.96041369, "learning_rate": 3.982075361224937e-06, "loss": 0.98344874, "num_input_tokens_seen": 12678395, "step": 594, "time_per_iteration": 3.6022818088531494 }, { "auxiliary_loss_clip": 0.01247762, "auxiliary_loss_mlp": 0.00766417, "balance_loss_clip": 1.06928706, "balance_loss_mlp": 1.00025761, "epoch": 0.07154451993025912, "flos": 18296595002880.0, "grad_norm": 1.8683091469593507, "language_loss": 0.88101614, "learning_rate": 3.981971153816602e-06, "loss": 0.90115798, "num_input_tokens_seen": 12696000, "step": 595, "time_per_iteration": 2.675199508666992 }, { "auxiliary_loss_clip": 0.01264722, "auxiliary_loss_mlp": 0.01041331, "balance_loss_clip": 1.07266831, "balance_loss_mlp": 1.0293808, "epoch": 0.07166476282089822, "flos": 22160690444160.0, "grad_norm": 1.629961884715407, "language_loss": 0.96228844, "learning_rate": 3.981866645744819e-06, "loss": 0.98534894, "num_input_tokens_seen": 12716715, "step": 596, "time_per_iteration": 2.757694959640503 }, { "auxiliary_loss_clip": 0.01270674, "auxiliary_loss_mlp": 0.00766009, "balance_loss_clip": 1.07304859, "balance_loss_mlp": 1.00021815, "epoch": 0.0717850057115373, "flos": 14136343925760.0, "grad_norm": 3.0376376238313934, "language_loss": 0.8172909, "learning_rate": 3.9817618370254416e-06, "loss": 0.83765775, "num_input_tokens_seen": 12733370, "step": 597, "time_per_iteration": 2.6664061546325684 }, { "auxiliary_loss_clip": 0.01269022, "auxiliary_loss_mlp": 0.0104643, "balance_loss_clip": 1.07080579, "balance_loss_mlp": 1.03358519, "epoch": 0.0719052486021764, "flos": 30917794412160.0, "grad_norm": 2.434008856111168, "language_loss": 0.87579149, "learning_rate": 3.9816567276743684e-06, "loss": 0.89894605, "num_input_tokens_seen": 12753235, "step": 598, "time_per_iteration": 2.774862289428711 }, { "auxiliary_loss_clip": 0.01227757, "auxiliary_loss_mlp": 0.0104794, "balance_loss_clip": 1.06666911, "balance_loss_mlp": 1.03462386, "epoch": 0.0720254914928155, "flos": 21287019939840.0, "grad_norm": 2.056732012514836, "language_loss": 0.77598363, "learning_rate": 3.9815513177075466e-06, "loss": 0.79874063, "num_input_tokens_seen": 12772020, "step": 599, "time_per_iteration": 2.7556369304656982 }, { "auxiliary_loss_clip": 0.01244839, "auxiliary_loss_mlp": 0.01051119, "balance_loss_clip": 1.06821334, "balance_loss_mlp": 1.03895974, "epoch": 0.07214573438345458, "flos": 27819170732160.0, "grad_norm": 1.7020740694118823, "language_loss": 0.70175874, "learning_rate": 3.9814456071409646e-06, "loss": 0.72471833, "num_input_tokens_seen": 12792555, "step": 600, "time_per_iteration": 2.7655434608459473 }, { "auxiliary_loss_clip": 0.01209077, "auxiliary_loss_mlp": 0.01046935, "balance_loss_clip": 1.06405997, "balance_loss_mlp": 1.03300488, "epoch": 0.07226597727409367, "flos": 25483576688640.0, "grad_norm": 3.6105507838394804, "language_loss": 0.85834688, "learning_rate": 3.981339595990659e-06, "loss": 0.880907, "num_input_tokens_seen": 12811085, "step": 601, "time_per_iteration": 2.8356716632843018 }, { "auxiliary_loss_clip": 0.01252855, "auxiliary_loss_mlp": 0.01041463, "balance_loss_clip": 1.07107198, "balance_loss_mlp": 1.02890992, "epoch": 0.07238622016473276, "flos": 23513840622720.0, "grad_norm": 2.0299282391403906, "language_loss": 0.81401658, "learning_rate": 3.981233284272713e-06, "loss": 0.83695978, "num_input_tokens_seen": 12830830, "step": 602, "time_per_iteration": 2.6941051483154297 }, { "auxiliary_loss_clip": 0.0121604, "auxiliary_loss_mlp": 0.01052257, "balance_loss_clip": 1.06212783, "balance_loss_mlp": 1.03900087, "epoch": 0.07250646305537185, "flos": 25453519983360.0, "grad_norm": 1.8075552276703384, "language_loss": 0.90148711, "learning_rate": 3.981126672003253e-06, "loss": 0.92417008, "num_input_tokens_seen": 12853505, "step": 603, "time_per_iteration": 2.889545202255249 }, { "auxiliary_loss_clip": 0.01237688, "auxiliary_loss_mlp": 0.01043326, "balance_loss_clip": 1.06099033, "balance_loss_mlp": 1.03092861, "epoch": 0.07262670594601094, "flos": 27155038216320.0, "grad_norm": 2.6393995751501595, "language_loss": 0.78409088, "learning_rate": 3.981019759198451e-06, "loss": 0.80690104, "num_input_tokens_seen": 12872455, "step": 604, "time_per_iteration": 2.789447069168091 }, { "auxiliary_loss_clip": 0.01232467, "auxiliary_loss_mlp": 0.01042948, "balance_loss_clip": 1.06223249, "balance_loss_mlp": 1.0302527, "epoch": 0.07274694883665003, "flos": 26651607148800.0, "grad_norm": 2.4519764470448218, "language_loss": 0.84386587, "learning_rate": 3.980912545874528e-06, "loss": 0.86662006, "num_input_tokens_seen": 12892620, "step": 605, "time_per_iteration": 2.774458885192871 }, { "auxiliary_loss_clip": 0.01243484, "auxiliary_loss_mlp": 0.00766968, "balance_loss_clip": 1.06741714, "balance_loss_mlp": 1.00031376, "epoch": 0.07286719172728913, "flos": 29862344154240.0, "grad_norm": 2.481239914483947, "language_loss": 0.85620475, "learning_rate": 3.980805032047746e-06, "loss": 0.87630928, "num_input_tokens_seen": 12914090, "step": 606, "time_per_iteration": 2.7429800033569336 }, { "auxiliary_loss_clip": 0.01234129, "auxiliary_loss_mlp": 0.01042824, "balance_loss_clip": 1.06574273, "balance_loss_mlp": 1.02864969, "epoch": 0.07298743461792821, "flos": 17382057799680.0, "grad_norm": 1.8972346640635336, "language_loss": 0.81001484, "learning_rate": 3.980697217734415e-06, "loss": 0.83278441, "num_input_tokens_seen": 12931830, "step": 607, "time_per_iteration": 2.731691837310791 }, { "auxiliary_loss_clip": 0.01205776, "auxiliary_loss_mlp": 0.00767112, "balance_loss_clip": 1.06333268, "balance_loss_mlp": 1.00031447, "epoch": 0.07310767750856731, "flos": 19498201701120.0, "grad_norm": 1.8378206354242137, "language_loss": 0.91722327, "learning_rate": 3.980589102950891e-06, "loss": 0.93695217, "num_input_tokens_seen": 12949995, "step": 608, "time_per_iteration": 2.836153030395508 }, { "auxiliary_loss_clip": 0.01233678, "auxiliary_loss_mlp": 0.01054547, "balance_loss_clip": 1.06867993, "balance_loss_mlp": 1.04028976, "epoch": 0.07322792039920639, "flos": 29168693637120.0, "grad_norm": 2.531920049188976, "language_loss": 0.7605654, "learning_rate": 3.9804806877135755e-06, "loss": 0.78344762, "num_input_tokens_seen": 12968040, "step": 609, "time_per_iteration": 2.8863418102264404 }, { "auxiliary_loss_clip": 0.01249549, "auxiliary_loss_mlp": 0.00766881, "balance_loss_clip": 1.06228352, "balance_loss_mlp": 1.00036931, "epoch": 0.07334816328984549, "flos": 23477822259840.0, "grad_norm": 5.25238257178374, "language_loss": 0.86292863, "learning_rate": 3.980371972038915e-06, "loss": 0.88309294, "num_input_tokens_seen": 12988530, "step": 610, "time_per_iteration": 2.688387155532837 }, { "auxiliary_loss_clip": 0.01270096, "auxiliary_loss_mlp": 0.0105437, "balance_loss_clip": 1.07211399, "balance_loss_mlp": 1.041049, "epoch": 0.07346840618048459, "flos": 22962467877120.0, "grad_norm": 1.7129578481044332, "language_loss": 0.84240663, "learning_rate": 3.980262955943399e-06, "loss": 0.86565137, "num_input_tokens_seen": 13008195, "step": 611, "time_per_iteration": 2.7256715297698975 }, { "auxiliary_loss_clip": 0.01228872, "auxiliary_loss_mlp": 0.01047901, "balance_loss_clip": 1.06818843, "balance_loss_mlp": 1.03454971, "epoch": 0.07358864907112367, "flos": 17673903803520.0, "grad_norm": 2.4424740787434844, "language_loss": 0.86838496, "learning_rate": 3.980153639443569e-06, "loss": 0.89115262, "num_input_tokens_seen": 13024180, "step": 612, "time_per_iteration": 2.695604085922241 }, { "auxiliary_loss_clip": 0.01242194, "auxiliary_loss_mlp": 0.01050704, "balance_loss_clip": 1.06687391, "balance_loss_mlp": 1.03685212, "epoch": 0.07370889196176277, "flos": 24097029840000.0, "grad_norm": 2.921389844023767, "language_loss": 0.80493581, "learning_rate": 3.980044022556005e-06, "loss": 0.82786471, "num_input_tokens_seen": 13043865, "step": 613, "time_per_iteration": 2.6714742183685303 }, { "auxiliary_loss_clip": 0.01252289, "auxiliary_loss_mlp": 0.0104948, "balance_loss_clip": 1.07033384, "balance_loss_mlp": 1.03592598, "epoch": 0.07382913485240185, "flos": 25885919905920.0, "grad_norm": 2.3485167481016016, "language_loss": 0.73033941, "learning_rate": 3.9799341052973375e-06, "loss": 0.75335717, "num_input_tokens_seen": 13063700, "step": 614, "time_per_iteration": 2.7138421535491943 }, { "auxiliary_loss_clip": 0.01235436, "auxiliary_loss_mlp": 0.01044639, "balance_loss_clip": 1.06867981, "balance_loss_mlp": 1.03130555, "epoch": 0.07394937774304094, "flos": 16873850223360.0, "grad_norm": 2.680992258556214, "language_loss": 0.75166273, "learning_rate": 3.979823887684241e-06, "loss": 0.77446347, "num_input_tokens_seen": 13082640, "step": 615, "time_per_iteration": 3.618635416030884 }, { "auxiliary_loss_clip": 0.01266576, "auxiliary_loss_mlp": 0.01045982, "balance_loss_clip": 1.07110822, "balance_loss_mlp": 1.03242159, "epoch": 0.07406962063368003, "flos": 20703471586560.0, "grad_norm": 2.633816256754086, "language_loss": 0.85084724, "learning_rate": 3.979713369733434e-06, "loss": 0.87397283, "num_input_tokens_seen": 13100505, "step": 616, "time_per_iteration": 2.662513494491577 }, { "auxiliary_loss_clip": 0.012454, "auxiliary_loss_mlp": 0.01047147, "balance_loss_clip": 1.07087803, "balance_loss_mlp": 1.03309226, "epoch": 0.07418986352431912, "flos": 21430985650560.0, "grad_norm": 2.7952238589016423, "language_loss": 0.84869671, "learning_rate": 3.979602551461683e-06, "loss": 0.8716222, "num_input_tokens_seen": 13121285, "step": 617, "time_per_iteration": 2.720712184906006 }, { "auxiliary_loss_clip": 0.01235157, "auxiliary_loss_mlp": 0.01048245, "balance_loss_clip": 1.06905508, "balance_loss_mlp": 1.03458369, "epoch": 0.07431010641495822, "flos": 12021133777920.0, "grad_norm": 2.435347664266673, "language_loss": 0.91493261, "learning_rate": 3.979491432885799e-06, "loss": 0.93776667, "num_input_tokens_seen": 13137550, "step": 618, "time_per_iteration": 3.547816514968872 }, { "auxiliary_loss_clip": 0.01199864, "auxiliary_loss_mlp": 0.00766417, "balance_loss_clip": 1.05987644, "balance_loss_mlp": 1.0004369, "epoch": 0.0744303493055973, "flos": 20957575374720.0, "grad_norm": 2.067191568152436, "language_loss": 0.82908398, "learning_rate": 3.97938001402264e-06, "loss": 0.84874678, "num_input_tokens_seen": 13156675, "step": 619, "time_per_iteration": 3.6499555110931396 }, { "auxiliary_loss_clip": 0.01211546, "auxiliary_loss_mlp": 0.01054885, "balance_loss_clip": 1.06240642, "balance_loss_mlp": 1.04005504, "epoch": 0.0745505921962364, "flos": 16253134272000.0, "grad_norm": 3.0093697918777047, "language_loss": 0.79752105, "learning_rate": 3.979268294889105e-06, "loss": 0.82018542, "num_input_tokens_seen": 13172225, "step": 620, "time_per_iteration": 3.6573145389556885 }, { "auxiliary_loss_clip": 0.01266458, "auxiliary_loss_mlp": 0.01041317, "balance_loss_clip": 1.07057202, "balance_loss_mlp": 1.02773285, "epoch": 0.07467083508687548, "flos": 50944635550080.0, "grad_norm": 2.0206801149425124, "language_loss": 0.74122834, "learning_rate": 3.979156275502143e-06, "loss": 0.76430607, "num_input_tokens_seen": 13195885, "step": 621, "time_per_iteration": 2.954556941986084 }, { "auxiliary_loss_clip": 0.01215603, "auxiliary_loss_mlp": 0.01041908, "balance_loss_clip": 1.06341398, "balance_loss_mlp": 1.02816343, "epoch": 0.07479107797751458, "flos": 17529686697600.0, "grad_norm": 3.8145206079769207, "language_loss": 0.91734397, "learning_rate": 3.979043955878749e-06, "loss": 0.93991911, "num_input_tokens_seen": 13213730, "step": 622, "time_per_iteration": 2.7942793369293213 }, { "auxiliary_loss_clip": 0.01230689, "auxiliary_loss_mlp": 0.01050822, "balance_loss_clip": 1.06696057, "balance_loss_mlp": 1.03687501, "epoch": 0.07491132086815366, "flos": 23473943591040.0, "grad_norm": 2.2196476407419707, "language_loss": 0.8316986, "learning_rate": 3.978931336035959e-06, "loss": 0.85451376, "num_input_tokens_seen": 13232540, "step": 623, "time_per_iteration": 2.7696945667266846 }, { "auxiliary_loss_clip": 0.0125169, "auxiliary_loss_mlp": 0.01048543, "balance_loss_clip": 1.07109416, "balance_loss_mlp": 1.03614533, "epoch": 0.07503156375879276, "flos": 20157557708160.0, "grad_norm": 2.363437918970197, "language_loss": 0.82621062, "learning_rate": 3.9788184159908595e-06, "loss": 0.84921288, "num_input_tokens_seen": 13249670, "step": 624, "time_per_iteration": 2.696173906326294 }, { "auxiliary_loss_clip": 0.01222074, "auxiliary_loss_mlp": 0.01045803, "balance_loss_clip": 1.06092668, "balance_loss_mlp": 1.03303576, "epoch": 0.07515180664943186, "flos": 15115519653120.0, "grad_norm": 6.995444982549287, "language_loss": 0.82714766, "learning_rate": 3.97870519576058e-06, "loss": 0.84982646, "num_input_tokens_seen": 13266095, "step": 625, "time_per_iteration": 2.6936166286468506 }, { "auxiliary_loss_clip": 0.01216905, "auxiliary_loss_mlp": 0.00766601, "balance_loss_clip": 1.06291795, "balance_loss_mlp": 1.00058293, "epoch": 0.07527204954007094, "flos": 21287702298240.0, "grad_norm": 2.5252897124888647, "language_loss": 0.80981338, "learning_rate": 3.978591675362295e-06, "loss": 0.82964844, "num_input_tokens_seen": 13284810, "step": 626, "time_per_iteration": 2.8920786380767822 }, { "auxiliary_loss_clip": 0.01204825, "auxiliary_loss_mlp": 0.01045569, "balance_loss_clip": 1.06814778, "balance_loss_mlp": 1.03229499, "epoch": 0.07539229243071004, "flos": 21324187537920.0, "grad_norm": 1.7718974068641749, "language_loss": 0.87736368, "learning_rate": 3.978477854813226e-06, "loss": 0.89986765, "num_input_tokens_seen": 13304150, "step": 627, "time_per_iteration": 2.8174307346343994 }, { "auxiliary_loss_clip": 0.01249195, "auxiliary_loss_mlp": 0.01042275, "balance_loss_clip": 1.06741107, "balance_loss_mlp": 1.02927566, "epoch": 0.07551253532134912, "flos": 13042540920960.0, "grad_norm": 1.9194130712628374, "language_loss": 0.8265726, "learning_rate": 3.97836373413064e-06, "loss": 0.8494873, "num_input_tokens_seen": 13322205, "step": 628, "time_per_iteration": 2.634899139404297 }, { "auxiliary_loss_clip": 0.0126112, "auxiliary_loss_mlp": 0.01050038, "balance_loss_clip": 1.06530023, "balance_loss_mlp": 1.03659666, "epoch": 0.07563277821198822, "flos": 19208761908480.0, "grad_norm": 2.0542968144716673, "language_loss": 0.7451176, "learning_rate": 3.978249313331848e-06, "loss": 0.76822925, "num_input_tokens_seen": 13340435, "step": 629, "time_per_iteration": 2.6369986534118652 }, { "auxiliary_loss_clip": 0.01251578, "auxiliary_loss_mlp": 0.00766174, "balance_loss_clip": 1.06532121, "balance_loss_mlp": 1.00058961, "epoch": 0.07575302110262731, "flos": 19537200892800.0, "grad_norm": 2.7936885758862715, "language_loss": 0.62184578, "learning_rate": 3.978134592434208e-06, "loss": 0.64202332, "num_input_tokens_seen": 13358185, "step": 630, "time_per_iteration": 2.7721550464630127 }, { "auxiliary_loss_clip": 0.0111171, "auxiliary_loss_mlp": 0.01005466, "balance_loss_clip": 1.04026532, "balance_loss_mlp": 1.00026894, "epoch": 0.0758732639932664, "flos": 67961808017280.0, "grad_norm": 1.0017116308081735, "language_loss": 0.59355319, "learning_rate": 3.978019571455123e-06, "loss": 0.61472493, "num_input_tokens_seen": 13410130, "step": 631, "time_per_iteration": 3.3719329833984375 }, { "auxiliary_loss_clip": 0.01263147, "auxiliary_loss_mlp": 0.01050222, "balance_loss_clip": 1.07086647, "balance_loss_mlp": 1.03799105, "epoch": 0.07599350688390549, "flos": 18989204025600.0, "grad_norm": 2.1757507872781843, "language_loss": 0.83882713, "learning_rate": 3.977904250412042e-06, "loss": 0.86196089, "num_input_tokens_seen": 13429085, "step": 632, "time_per_iteration": 2.732921600341797 }, { "auxiliary_loss_clip": 0.01238489, "auxiliary_loss_mlp": 0.01045879, "balance_loss_clip": 1.06654298, "balance_loss_mlp": 1.03339183, "epoch": 0.07611374977454458, "flos": 21069006341760.0, "grad_norm": 2.6352270921505627, "language_loss": 0.8567484, "learning_rate": 3.97778862932246e-06, "loss": 0.87959212, "num_input_tokens_seen": 13446250, "step": 633, "time_per_iteration": 2.758497953414917 }, { "auxiliary_loss_clip": 0.01131955, "auxiliary_loss_mlp": 0.01043017, "balance_loss_clip": 1.04479384, "balance_loss_mlp": 1.03061938, "epoch": 0.07623399266518367, "flos": 18514536773760.0, "grad_norm": 2.8179001248391984, "language_loss": 0.94381821, "learning_rate": 3.9776727082039144e-06, "loss": 0.96556795, "num_input_tokens_seen": 13463220, "step": 634, "time_per_iteration": 3.2206778526306152 }, { "auxiliary_loss_clip": 0.01169657, "auxiliary_loss_mlp": 0.0100561, "balance_loss_clip": 1.04181182, "balance_loss_mlp": 1.0006268, "epoch": 0.07635423555582276, "flos": 44663036077440.0, "grad_norm": 0.8108504480942063, "language_loss": 0.55459434, "learning_rate": 3.977556487073991e-06, "loss": 0.57634699, "num_input_tokens_seen": 13517775, "step": 635, "time_per_iteration": 3.588414430618286 }, { "auxiliary_loss_clip": 0.01218399, "auxiliary_loss_mlp": 0.01050786, "balance_loss_clip": 1.05915666, "balance_loss_mlp": 1.03822184, "epoch": 0.07647447844646185, "flos": 21761148487680.0, "grad_norm": 1.7114184728097788, "language_loss": 0.81469089, "learning_rate": 3.97743996595032e-06, "loss": 0.83738279, "num_input_tokens_seen": 13537815, "step": 636, "time_per_iteration": 2.7424802780151367 }, { "auxiliary_loss_clip": 0.01264466, "auxiliary_loss_mlp": 0.01041865, "balance_loss_clip": 1.06963754, "balance_loss_mlp": 1.02801847, "epoch": 0.07659472133710095, "flos": 23806799948160.0, "grad_norm": 1.5562689182378755, "language_loss": 0.81803429, "learning_rate": 3.9773231448505804e-06, "loss": 0.84109759, "num_input_tokens_seen": 13559605, "step": 637, "time_per_iteration": 2.6903228759765625 }, { "auxiliary_loss_clip": 0.01227134, "auxiliary_loss_mlp": 0.00767283, "balance_loss_clip": 1.06557965, "balance_loss_mlp": 1.00058293, "epoch": 0.07671496422774003, "flos": 21469984842240.0, "grad_norm": 13.499698074999108, "language_loss": 0.78413266, "learning_rate": 3.977206023792491e-06, "loss": 0.80407679, "num_input_tokens_seen": 13579495, "step": 638, "time_per_iteration": 2.7648534774780273 }, { "auxiliary_loss_clip": 0.01243559, "auxiliary_loss_mlp": 0.01044072, "balance_loss_clip": 1.06733739, "balance_loss_mlp": 1.0308218, "epoch": 0.07683520711837913, "flos": 16980971558400.0, "grad_norm": 2.6399330717275893, "language_loss": 0.81288517, "learning_rate": 3.97708860279382e-06, "loss": 0.83576143, "num_input_tokens_seen": 13597605, "step": 639, "time_per_iteration": 2.8613946437835693 }, { "auxiliary_loss_clip": 0.01205985, "auxiliary_loss_mlp": 0.01045563, "balance_loss_clip": 1.06039095, "balance_loss_mlp": 1.03230143, "epoch": 0.07695545000901821, "flos": 23476744851840.0, "grad_norm": 1.7793490510944252, "language_loss": 0.78112751, "learning_rate": 3.97697088187238e-06, "loss": 0.80364299, "num_input_tokens_seen": 13618120, "step": 640, "time_per_iteration": 2.7413299083709717 }, { "auxiliary_loss_clip": 0.01229562, "auxiliary_loss_mlp": 0.01047609, "balance_loss_clip": 1.06778765, "balance_loss_mlp": 1.03506196, "epoch": 0.07707569289965731, "flos": 17634258167040.0, "grad_norm": 2.1095951714488668, "language_loss": 0.92218769, "learning_rate": 3.976852861046029e-06, "loss": 0.9449594, "num_input_tokens_seen": 13634735, "step": 641, "time_per_iteration": 3.655611515045166 }, { "auxiliary_loss_clip": 0.01196123, "auxiliary_loss_mlp": 0.01051685, "balance_loss_clip": 1.06461263, "balance_loss_mlp": 1.03862596, "epoch": 0.0771959357902964, "flos": 25775674087680.0, "grad_norm": 1.5462469115400466, "language_loss": 0.80450118, "learning_rate": 3.97673454033267e-06, "loss": 0.82697928, "num_input_tokens_seen": 13656835, "step": 642, "time_per_iteration": 2.8135969638824463 }, { "auxiliary_loss_clip": 0.0122969, "auxiliary_loss_mlp": 0.01046167, "balance_loss_clip": 1.06342769, "balance_loss_mlp": 1.03313136, "epoch": 0.07731617868093549, "flos": 19828651847040.0, "grad_norm": 2.020864726259297, "language_loss": 0.82506478, "learning_rate": 3.976615919750254e-06, "loss": 0.84782338, "num_input_tokens_seen": 13674535, "step": 643, "time_per_iteration": 2.7075448036193848 }, { "auxiliary_loss_clip": 0.01240993, "auxiliary_loss_mlp": 0.01050252, "balance_loss_clip": 1.06772649, "balance_loss_mlp": 1.03712726, "epoch": 0.07743642157157458, "flos": 21324654414720.0, "grad_norm": 1.8523831467072889, "language_loss": 0.8700943, "learning_rate": 3.976496999316775e-06, "loss": 0.89300674, "num_input_tokens_seen": 13693290, "step": 644, "time_per_iteration": 3.634830951690674 }, { "auxiliary_loss_clip": 0.01233671, "auxiliary_loss_mlp": 0.01046, "balance_loss_clip": 1.07120335, "balance_loss_mlp": 1.03345287, "epoch": 0.07755666446221367, "flos": 19969133938560.0, "grad_norm": 2.0231060207296725, "language_loss": 0.84091461, "learning_rate": 3.976377779050271e-06, "loss": 0.86371136, "num_input_tokens_seen": 13711420, "step": 645, "time_per_iteration": 3.560788154602051 }, { "auxiliary_loss_clip": 0.01235197, "auxiliary_loss_mlp": 0.01045366, "balance_loss_clip": 1.06389594, "balance_loss_mlp": 1.03195548, "epoch": 0.07767690735285276, "flos": 23623224514560.0, "grad_norm": 2.8003029888870454, "language_loss": 0.84315658, "learning_rate": 3.976258258968831e-06, "loss": 0.86596215, "num_input_tokens_seen": 13729965, "step": 646, "time_per_iteration": 3.552372455596924 }, { "auxiliary_loss_clip": 0.01216262, "auxiliary_loss_mlp": 0.01043758, "balance_loss_clip": 1.06652737, "balance_loss_mlp": 1.03090096, "epoch": 0.07779715024349185, "flos": 22236246702720.0, "grad_norm": 2.3474605646442477, "language_loss": 0.74300957, "learning_rate": 3.976138439090583e-06, "loss": 0.76560974, "num_input_tokens_seen": 13748045, "step": 647, "time_per_iteration": 2.750065326690674 }, { "auxiliary_loss_clip": 0.01212134, "auxiliary_loss_mlp": 0.01043747, "balance_loss_clip": 1.06496298, "balance_loss_mlp": 1.03089595, "epoch": 0.07791739313413094, "flos": 20955097336320.0, "grad_norm": 2.754996001697786, "language_loss": 0.84877717, "learning_rate": 3.976018319433706e-06, "loss": 0.87133604, "num_input_tokens_seen": 13765590, "step": 648, "time_per_iteration": 2.650344133377075 }, { "auxiliary_loss_clip": 0.01245154, "auxiliary_loss_mlp": 0.01045853, "balance_loss_clip": 1.06705499, "balance_loss_mlp": 1.03294241, "epoch": 0.07803763602477004, "flos": 19312327797120.0, "grad_norm": 2.383402959691909, "language_loss": 0.91423553, "learning_rate": 3.9758979000164205e-06, "loss": 0.93714559, "num_input_tokens_seen": 13782410, "step": 649, "time_per_iteration": 2.746569871902466 }, { "auxiliary_loss_clip": 0.01218129, "auxiliary_loss_mlp": 0.01046679, "balance_loss_clip": 1.06154442, "balance_loss_mlp": 1.03339958, "epoch": 0.07815787891540912, "flos": 22710806213760.0, "grad_norm": 2.7258674707255675, "language_loss": 0.72311056, "learning_rate": 3.975777180856995e-06, "loss": 0.74575853, "num_input_tokens_seen": 13801530, "step": 650, "time_per_iteration": 2.7542121410369873 }, { "auxiliary_loss_clip": 0.01263896, "auxiliary_loss_mlp": 0.01051339, "balance_loss_clip": 1.07055414, "balance_loss_mlp": 1.03813648, "epoch": 0.07827812180604822, "flos": 22711129436160.0, "grad_norm": 2.7953816954262187, "language_loss": 0.86331278, "learning_rate": 3.975656161973742e-06, "loss": 0.88646507, "num_input_tokens_seen": 13820615, "step": 651, "time_per_iteration": 2.7876672744750977 }, { "auxiliary_loss_clip": 0.01258149, "auxiliary_loss_mlp": 0.01046523, "balance_loss_clip": 1.06506479, "balance_loss_mlp": 1.03304625, "epoch": 0.0783983646966873, "flos": 21725597001600.0, "grad_norm": 2.479438875058371, "language_loss": 0.88980746, "learning_rate": 3.9755348433850194e-06, "loss": 0.91285419, "num_input_tokens_seen": 13835955, "step": 652, "time_per_iteration": 2.611888885498047 }, { "auxiliary_loss_clip": 0.01133718, "auxiliary_loss_mlp": 0.01018571, "balance_loss_clip": 1.03433418, "balance_loss_mlp": 1.01301575, "epoch": 0.0785186075873264, "flos": 60640877537280.0, "grad_norm": 0.9681590403985557, "language_loss": 0.63662952, "learning_rate": 3.975413225109232e-06, "loss": 0.6581524, "num_input_tokens_seen": 13896505, "step": 653, "time_per_iteration": 3.267427682876587 }, { "auxiliary_loss_clip": 0.01249411, "auxiliary_loss_mlp": 0.01046588, "balance_loss_clip": 1.0695951, "balance_loss_mlp": 1.03254485, "epoch": 0.0786388504779655, "flos": 23877902920320.0, "grad_norm": 4.551158836279269, "language_loss": 0.93552053, "learning_rate": 3.975291307164829e-06, "loss": 0.95848054, "num_input_tokens_seen": 13915150, "step": 654, "time_per_iteration": 2.7345218658447266 }, { "auxiliary_loss_clip": 0.01203775, "auxiliary_loss_mlp": 0.01036142, "balance_loss_clip": 1.06079221, "balance_loss_mlp": 1.02434051, "epoch": 0.07875909336860458, "flos": 15158684822400.0, "grad_norm": 1.9010958421918984, "language_loss": 0.85193247, "learning_rate": 3.975169089570306e-06, "loss": 0.87433165, "num_input_tokens_seen": 13933525, "step": 655, "time_per_iteration": 2.6870152950286865 }, { "auxiliary_loss_clip": 0.01235759, "auxiliary_loss_mlp": 0.01047479, "balance_loss_clip": 1.06658268, "balance_loss_mlp": 1.03468156, "epoch": 0.07887933625924368, "flos": 22236857233920.0, "grad_norm": 2.016431709667311, "language_loss": 0.91472542, "learning_rate": 3.975046572344202e-06, "loss": 0.93755782, "num_input_tokens_seen": 13949985, "step": 656, "time_per_iteration": 2.739366054534912 }, { "auxiliary_loss_clip": 0.01208627, "auxiliary_loss_mlp": 0.01041292, "balance_loss_clip": 1.05930781, "balance_loss_mlp": 1.02878094, "epoch": 0.07899957914988276, "flos": 20777734955520.0, "grad_norm": 1.8479765497574434, "language_loss": 0.7103464, "learning_rate": 3.974923755505103e-06, "loss": 0.73284554, "num_input_tokens_seen": 13969215, "step": 657, "time_per_iteration": 2.723175048828125 }, { "auxiliary_loss_clip": 0.0120665, "auxiliary_loss_mlp": 0.01050382, "balance_loss_clip": 1.06406927, "balance_loss_mlp": 1.03729939, "epoch": 0.07911982204052186, "flos": 23003047267200.0, "grad_norm": 1.7950925914752998, "language_loss": 0.91232836, "learning_rate": 3.974800639071641e-06, "loss": 0.93489867, "num_input_tokens_seen": 13989935, "step": 658, "time_per_iteration": 2.7492823600769043 }, { "auxiliary_loss_clip": 0.01166288, "auxiliary_loss_mlp": 0.00766439, "balance_loss_clip": 1.05419707, "balance_loss_mlp": 1.00031745, "epoch": 0.07924006493116094, "flos": 23111389664640.0, "grad_norm": 3.079121098956225, "language_loss": 1.00707841, "learning_rate": 3.974677223062492e-06, "loss": 1.02640557, "num_input_tokens_seen": 14007150, "step": 659, "time_per_iteration": 2.7649238109588623 }, { "auxiliary_loss_clip": 0.01229075, "auxiliary_loss_mlp": 0.01044235, "balance_loss_clip": 1.06776404, "balance_loss_mlp": 1.03085399, "epoch": 0.07936030782180004, "flos": 16472153450880.0, "grad_norm": 2.8233558904336706, "language_loss": 0.74490088, "learning_rate": 3.974553507496378e-06, "loss": 0.76763403, "num_input_tokens_seen": 14025725, "step": 660, "time_per_iteration": 2.6774539947509766 }, { "auxiliary_loss_clip": 0.01221873, "auxiliary_loss_mlp": 0.01048467, "balance_loss_clip": 1.06459475, "balance_loss_mlp": 1.0343231, "epoch": 0.07948055071243913, "flos": 23733290764800.0, "grad_norm": 2.3436207497808885, "language_loss": 0.89206803, "learning_rate": 3.974429492392068e-06, "loss": 0.91477144, "num_input_tokens_seen": 14045750, "step": 661, "time_per_iteration": 2.7273857593536377 }, { "auxiliary_loss_clip": 0.01260816, "auxiliary_loss_mlp": 0.00766743, "balance_loss_clip": 1.07056594, "balance_loss_mlp": 1.0003624, "epoch": 0.07960079360307822, "flos": 19573326996480.0, "grad_norm": 2.5342840785427834, "language_loss": 0.91150105, "learning_rate": 3.974305177768373e-06, "loss": 0.93177664, "num_input_tokens_seen": 14063960, "step": 662, "time_per_iteration": 2.7140934467315674 }, { "auxiliary_loss_clip": 0.01201469, "auxiliary_loss_mlp": 0.01044995, "balance_loss_clip": 1.06305552, "balance_loss_mlp": 1.03167295, "epoch": 0.07972103649371731, "flos": 23513409659520.0, "grad_norm": 2.2830597569813103, "language_loss": 0.86300647, "learning_rate": 3.974180563644152e-06, "loss": 0.88547111, "num_input_tokens_seen": 14082525, "step": 663, "time_per_iteration": 2.6993300914764404 }, { "auxiliary_loss_clip": 0.0123263, "auxiliary_loss_mlp": 0.01047235, "balance_loss_clip": 1.06595027, "balance_loss_mlp": 1.03529608, "epoch": 0.0798412793843564, "flos": 16726867770240.0, "grad_norm": 2.155790611079928, "language_loss": 0.89190286, "learning_rate": 3.97405565003831e-06, "loss": 0.91470146, "num_input_tokens_seen": 14098610, "step": 664, "time_per_iteration": 2.747103452682495 }, { "auxiliary_loss_clip": 0.01215416, "auxiliary_loss_mlp": 0.0104512, "balance_loss_clip": 1.06268322, "balance_loss_mlp": 1.0325911, "epoch": 0.07996152227499549, "flos": 18223337214720.0, "grad_norm": 2.2648918898753427, "language_loss": 0.78409982, "learning_rate": 3.973930436969794e-06, "loss": 0.80670524, "num_input_tokens_seen": 14117065, "step": 665, "time_per_iteration": 2.710326671600342 }, { "auxiliary_loss_clip": 0.01219266, "auxiliary_loss_mlp": 0.01053722, "balance_loss_clip": 1.06150174, "balance_loss_mlp": 1.03995359, "epoch": 0.08008176516563459, "flos": 20594877793920.0, "grad_norm": 1.7950458805067477, "language_loss": 0.85717058, "learning_rate": 3.973804924457602e-06, "loss": 0.87990052, "num_input_tokens_seen": 14135145, "step": 666, "time_per_iteration": 2.6944193840026855 }, { "auxiliary_loss_clip": 0.01221342, "auxiliary_loss_mlp": 0.01046128, "balance_loss_clip": 1.06180918, "balance_loss_mlp": 1.03135192, "epoch": 0.08020200805627367, "flos": 31834306863360.0, "grad_norm": 1.7753152305464721, "language_loss": 0.8553949, "learning_rate": 3.973679112520771e-06, "loss": 0.87806964, "num_input_tokens_seen": 14156860, "step": 667, "time_per_iteration": 3.761136531829834 }, { "auxiliary_loss_clip": 0.01199509, "auxiliary_loss_mlp": 0.01046045, "balance_loss_clip": 1.056916, "balance_loss_mlp": 1.03316426, "epoch": 0.08032225094691277, "flos": 17783503176960.0, "grad_norm": 2.0171366247085034, "language_loss": 0.98920327, "learning_rate": 3.973553001178389e-06, "loss": 1.01165879, "num_input_tokens_seen": 14174365, "step": 668, "time_per_iteration": 2.744783878326416 }, { "auxiliary_loss_clip": 0.01210438, "auxiliary_loss_mlp": 0.01037566, "balance_loss_clip": 1.06546521, "balance_loss_mlp": 1.02421427, "epoch": 0.08044249383755185, "flos": 24061693835520.0, "grad_norm": 2.724550520006448, "language_loss": 0.757469, "learning_rate": 3.973426590449585e-06, "loss": 0.77994907, "num_input_tokens_seen": 14192320, "step": 669, "time_per_iteration": 2.766023635864258 }, { "auxiliary_loss_clip": 0.01197543, "auxiliary_loss_mlp": 0.01050949, "balance_loss_clip": 1.06214976, "balance_loss_mlp": 1.03898025, "epoch": 0.08056273672819095, "flos": 18223624523520.0, "grad_norm": 2.2438785365717155, "language_loss": 0.7539289, "learning_rate": 3.9732998803535364e-06, "loss": 0.77641386, "num_input_tokens_seen": 14210380, "step": 670, "time_per_iteration": 3.617702007293701 }, { "auxiliary_loss_clip": 0.0126021, "auxiliary_loss_mlp": 0.01045846, "balance_loss_clip": 1.06901252, "balance_loss_mlp": 1.03272688, "epoch": 0.08068297961883003, "flos": 19676856971520.0, "grad_norm": 2.4803702379559733, "language_loss": 0.85466373, "learning_rate": 3.973172870909465e-06, "loss": 0.87772429, "num_input_tokens_seen": 14225145, "step": 671, "time_per_iteration": 3.518620252609253 }, { "auxiliary_loss_clip": 0.01230826, "auxiliary_loss_mlp": 0.01056695, "balance_loss_clip": 1.06166792, "balance_loss_mlp": 1.042974, "epoch": 0.08080322250946913, "flos": 23148736830720.0, "grad_norm": 2.5574045988186542, "language_loss": 0.80810928, "learning_rate": 3.973045562136638e-06, "loss": 0.83098453, "num_input_tokens_seen": 14241960, "step": 672, "time_per_iteration": 3.5936081409454346 }, { "auxiliary_loss_clip": 0.01247713, "auxiliary_loss_mlp": 0.01047933, "balance_loss_clip": 1.06666267, "balance_loss_mlp": 1.03614914, "epoch": 0.08092346540010822, "flos": 21763626526080.0, "grad_norm": 2.14255213459971, "language_loss": 0.91484416, "learning_rate": 3.972917954054368e-06, "loss": 0.93780059, "num_input_tokens_seen": 14260515, "step": 673, "time_per_iteration": 2.6793365478515625 }, { "auxiliary_loss_clip": 0.01229853, "auxiliary_loss_mlp": 0.01052898, "balance_loss_clip": 1.07022917, "balance_loss_mlp": 1.03851557, "epoch": 0.08104370829074731, "flos": 21032485188480.0, "grad_norm": 2.659748712115016, "language_loss": 0.81934547, "learning_rate": 3.972790046682013e-06, "loss": 0.84217286, "num_input_tokens_seen": 14279190, "step": 674, "time_per_iteration": 2.7026286125183105 }, { "auxiliary_loss_clip": 0.01212019, "auxiliary_loss_mlp": 0.01042176, "balance_loss_clip": 1.06014276, "balance_loss_mlp": 1.02945662, "epoch": 0.0811639511813864, "flos": 20083186598400.0, "grad_norm": 1.9966303642096719, "language_loss": 0.79499924, "learning_rate": 3.972661840038977e-06, "loss": 0.81754112, "num_input_tokens_seen": 14299480, "step": 675, "time_per_iteration": 2.7462522983551025 }, { "auxiliary_loss_clip": 0.01249442, "auxiliary_loss_mlp": 0.01052621, "balance_loss_clip": 1.07153201, "balance_loss_mlp": 1.03936505, "epoch": 0.08128419407202549, "flos": 16836718538880.0, "grad_norm": 2.7810527676196397, "language_loss": 0.83501029, "learning_rate": 3.972533334144707e-06, "loss": 0.85803092, "num_input_tokens_seen": 14316405, "step": 676, "time_per_iteration": 2.6242363452911377 }, { "auxiliary_loss_clip": 0.01247163, "auxiliary_loss_mlp": 0.01044762, "balance_loss_clip": 1.06664872, "balance_loss_mlp": 1.03051043, "epoch": 0.08140443696266458, "flos": 23769273214080.0, "grad_norm": 2.1328014802487836, "language_loss": 0.78496337, "learning_rate": 3.972404529018699e-06, "loss": 0.80788261, "num_input_tokens_seen": 14336265, "step": 677, "time_per_iteration": 2.7077319622039795 }, { "auxiliary_loss_clip": 0.01222862, "auxiliary_loss_mlp": 0.01047054, "balance_loss_clip": 1.06021166, "balance_loss_mlp": 1.03386927, "epoch": 0.08152467985330367, "flos": 24390132819840.0, "grad_norm": 1.9413844099161806, "language_loss": 0.85340655, "learning_rate": 3.972275424680493e-06, "loss": 0.87610567, "num_input_tokens_seen": 14356375, "step": 678, "time_per_iteration": 2.7961320877075195 }, { "auxiliary_loss_clip": 0.01259722, "auxiliary_loss_mlp": 0.01042584, "balance_loss_clip": 1.06969404, "balance_loss_mlp": 1.02979279, "epoch": 0.08164492274394276, "flos": 19317750750720.0, "grad_norm": 2.7103477167499763, "language_loss": 0.920012, "learning_rate": 3.972146021149673e-06, "loss": 0.94303501, "num_input_tokens_seen": 14374650, "step": 679, "time_per_iteration": 2.6265804767608643 }, { "auxiliary_loss_clip": 0.0121503, "auxiliary_loss_mlp": 0.01037906, "balance_loss_clip": 1.06470227, "balance_loss_mlp": 1.02546704, "epoch": 0.08176516563458186, "flos": 14830461319680.0, "grad_norm": 5.743056017428087, "language_loss": 0.78892112, "learning_rate": 3.972016318445868e-06, "loss": 0.81145048, "num_input_tokens_seen": 14392650, "step": 680, "time_per_iteration": 2.748793840408325 }, { "auxiliary_loss_clip": 0.01251021, "auxiliary_loss_mlp": 0.01045014, "balance_loss_clip": 1.06926346, "balance_loss_mlp": 1.03152561, "epoch": 0.08188540852522094, "flos": 22602320161920.0, "grad_norm": 2.282037419579402, "language_loss": 0.92086124, "learning_rate": 3.971886316588757e-06, "loss": 0.94382167, "num_input_tokens_seen": 14413155, "step": 681, "time_per_iteration": 2.6780483722686768 }, { "auxiliary_loss_clip": 0.01203733, "auxiliary_loss_mlp": 0.01047748, "balance_loss_clip": 1.06427979, "balance_loss_mlp": 1.03464651, "epoch": 0.08200565141586004, "flos": 19463727623040.0, "grad_norm": 3.9623611601020374, "language_loss": 0.73622012, "learning_rate": 3.9717560155980595e-06, "loss": 0.75873494, "num_input_tokens_seen": 14428805, "step": 682, "time_per_iteration": 2.6980581283569336 }, { "auxiliary_loss_clip": 0.01246413, "auxiliary_loss_mlp": 0.01051862, "balance_loss_clip": 1.06792903, "balance_loss_mlp": 1.0387553, "epoch": 0.08212589430649912, "flos": 20594662312320.0, "grad_norm": 2.0517749349244028, "language_loss": 0.91840953, "learning_rate": 3.971625415493542e-06, "loss": 0.9413923, "num_input_tokens_seen": 14447125, "step": 683, "time_per_iteration": 2.656672477722168 }, { "auxiliary_loss_clip": 0.01212506, "auxiliary_loss_mlp": 0.01040253, "balance_loss_clip": 1.06385517, "balance_loss_mlp": 1.02676463, "epoch": 0.08224613719713822, "flos": 25953611086080.0, "grad_norm": 2.0398826269649515, "language_loss": 0.87763906, "learning_rate": 3.971494516295017e-06, "loss": 0.90016663, "num_input_tokens_seen": 14466575, "step": 684, "time_per_iteration": 2.7585256099700928 }, { "auxiliary_loss_clip": 0.01214612, "auxiliary_loss_mlp": 0.01048093, "balance_loss_clip": 1.06122828, "balance_loss_mlp": 1.03522408, "epoch": 0.08236638008777732, "flos": 23768734510080.0, "grad_norm": 2.6034344329253427, "language_loss": 0.85242552, "learning_rate": 3.971363318022341e-06, "loss": 0.87505251, "num_input_tokens_seen": 14487915, "step": 685, "time_per_iteration": 2.760298013687134 }, { "auxiliary_loss_clip": 0.0122892, "auxiliary_loss_mlp": 0.01047175, "balance_loss_clip": 1.06367302, "balance_loss_mlp": 1.03446734, "epoch": 0.0824866229784164, "flos": 38799144887040.0, "grad_norm": 1.9590745625048704, "language_loss": 0.68354154, "learning_rate": 3.971231820695417e-06, "loss": 0.70630252, "num_input_tokens_seen": 14511530, "step": 686, "time_per_iteration": 2.845672845840454 }, { "auxiliary_loss_clip": 0.01232359, "auxiliary_loss_mlp": 0.01050199, "balance_loss_clip": 1.06686616, "balance_loss_mlp": 1.03744912, "epoch": 0.0826068658690555, "flos": 23107762391040.0, "grad_norm": 2.7992248273957006, "language_loss": 0.8119337, "learning_rate": 3.971100024334193e-06, "loss": 0.83475929, "num_input_tokens_seen": 14529050, "step": 687, "time_per_iteration": 2.6569132804870605 }, { "auxiliary_loss_clip": 0.01193925, "auxiliary_loss_mlp": 0.01042343, "balance_loss_clip": 1.05751133, "balance_loss_mlp": 1.02958739, "epoch": 0.08272710875969458, "flos": 21136374299520.0, "grad_norm": 2.127174707314435, "language_loss": 0.86258209, "learning_rate": 3.970967928958663e-06, "loss": 0.8849448, "num_input_tokens_seen": 14546165, "step": 688, "time_per_iteration": 2.7673656940460205 }, { "auxiliary_loss_clip": 0.01203311, "auxiliary_loss_mlp": 0.01045625, "balance_loss_clip": 1.06268549, "balance_loss_mlp": 1.03302431, "epoch": 0.08284735165033368, "flos": 19063000517760.0, "grad_norm": 1.8616118731682587, "language_loss": 0.83487213, "learning_rate": 3.970835534588865e-06, "loss": 0.85736144, "num_input_tokens_seen": 14563660, "step": 689, "time_per_iteration": 2.7525806427001953 }, { "auxiliary_loss_clip": 0.01233901, "auxiliary_loss_mlp": 0.01045432, "balance_loss_clip": 1.07063866, "balance_loss_mlp": 1.03281987, "epoch": 0.08296759454097276, "flos": 16727442387840.0, "grad_norm": 1.8247371081941353, "language_loss": 0.85330856, "learning_rate": 3.970702841244883e-06, "loss": 0.87610185, "num_input_tokens_seen": 14581980, "step": 690, "time_per_iteration": 2.672231435775757 }, { "auxiliary_loss_clip": 0.01248408, "auxiliary_loss_mlp": 0.01050024, "balance_loss_clip": 1.07248449, "balance_loss_mlp": 1.03710198, "epoch": 0.08308783743161186, "flos": 18004928567040.0, "grad_norm": 2.073013076449816, "language_loss": 0.82841212, "learning_rate": 3.970569848946847e-06, "loss": 0.85139644, "num_input_tokens_seen": 14601795, "step": 691, "time_per_iteration": 2.5989747047424316 }, { "auxiliary_loss_clip": 0.01231282, "auxiliary_loss_mlp": 0.01045423, "balance_loss_clip": 1.06554413, "balance_loss_mlp": 1.03340673, "epoch": 0.08320808032225095, "flos": 15079788599040.0, "grad_norm": 2.181946994990189, "language_loss": 0.82509315, "learning_rate": 3.970436557714932e-06, "loss": 0.84786022, "num_input_tokens_seen": 14618315, "step": 692, "time_per_iteration": 2.7314164638519287 }, { "auxiliary_loss_clip": 0.01221709, "auxiliary_loss_mlp": 0.01044254, "balance_loss_clip": 1.06309927, "balance_loss_mlp": 1.03008056, "epoch": 0.08332832321289003, "flos": 22383085501440.0, "grad_norm": 2.948274446222267, "language_loss": 0.86432374, "learning_rate": 3.970302967569358e-06, "loss": 0.88698339, "num_input_tokens_seen": 14636905, "step": 693, "time_per_iteration": 2.750959634780884 }, { "auxiliary_loss_clip": 0.01242371, "auxiliary_loss_mlp": 0.01053536, "balance_loss_clip": 1.06880677, "balance_loss_mlp": 1.03972578, "epoch": 0.08344856610352913, "flos": 24717386655360.0, "grad_norm": 10.584755954454883, "language_loss": 0.68796128, "learning_rate": 3.9701690785303896e-06, "loss": 0.71092033, "num_input_tokens_seen": 14656100, "step": 694, "time_per_iteration": 3.595456838607788 }, { "auxiliary_loss_clip": 0.01246837, "auxiliary_loss_mlp": 0.01046018, "balance_loss_clip": 1.06837487, "balance_loss_mlp": 1.03300595, "epoch": 0.08356880899416821, "flos": 25370206387200.0, "grad_norm": 2.2797396386524014, "language_loss": 0.88473594, "learning_rate": 3.970034890618339e-06, "loss": 0.90766442, "num_input_tokens_seen": 14675790, "step": 695, "time_per_iteration": 2.8468432426452637 }, { "auxiliary_loss_clip": 0.01228479, "auxiliary_loss_mlp": 0.0104265, "balance_loss_clip": 1.06473327, "balance_loss_mlp": 1.03079474, "epoch": 0.08368905188480731, "flos": 24353072962560.0, "grad_norm": 2.5071696388115656, "language_loss": 0.88209796, "learning_rate": 3.969900403853562e-06, "loss": 0.9048093, "num_input_tokens_seen": 14694830, "step": 696, "time_per_iteration": 3.7635116577148438 }, { "auxiliary_loss_clip": 0.01262162, "auxiliary_loss_mlp": 0.01045396, "balance_loss_clip": 1.0730418, "balance_loss_mlp": 1.03315878, "epoch": 0.08380929477544641, "flos": 18037319656320.0, "grad_norm": 1.769552686175701, "language_loss": 0.78107512, "learning_rate": 3.96976561825646e-06, "loss": 0.80415064, "num_input_tokens_seen": 14711920, "step": 697, "time_per_iteration": 3.4891507625579834 }, { "auxiliary_loss_clip": 0.01202907, "auxiliary_loss_mlp": 0.01048161, "balance_loss_clip": 1.06612742, "balance_loss_mlp": 1.03479159, "epoch": 0.08392953766608549, "flos": 26286287875200.0, "grad_norm": 2.0194426955691314, "language_loss": 0.87234247, "learning_rate": 3.969630533847479e-06, "loss": 0.89485317, "num_input_tokens_seen": 14730880, "step": 698, "time_per_iteration": 3.704092502593994 }, { "auxiliary_loss_clip": 0.01243063, "auxiliary_loss_mlp": 0.01036836, "balance_loss_clip": 1.06575418, "balance_loss_mlp": 1.02510595, "epoch": 0.08404978055672459, "flos": 22492146170880.0, "grad_norm": 2.777024362057275, "language_loss": 0.84283358, "learning_rate": 3.969495150647113e-06, "loss": 0.86563253, "num_input_tokens_seen": 14749050, "step": 699, "time_per_iteration": 2.719754457473755 }, { "auxiliary_loss_clip": 0.01209372, "auxiliary_loss_mlp": 0.01043586, "balance_loss_clip": 1.06598949, "balance_loss_mlp": 1.03032398, "epoch": 0.08417002344736367, "flos": 24826878288000.0, "grad_norm": 1.7877805385368393, "language_loss": 0.76581144, "learning_rate": 3.969359468675899e-06, "loss": 0.78834099, "num_input_tokens_seen": 14769180, "step": 700, "time_per_iteration": 2.732675075531006 }, { "auxiliary_loss_clip": 0.01238464, "auxiliary_loss_mlp": 0.01044252, "balance_loss_clip": 1.06728113, "balance_loss_mlp": 1.03196156, "epoch": 0.08429026633800277, "flos": 16945922862720.0, "grad_norm": 2.1459497001476158, "language_loss": 0.89544439, "learning_rate": 3.969223487954418e-06, "loss": 0.91827154, "num_input_tokens_seen": 14786640, "step": 701, "time_per_iteration": 2.6603808403015137 }, { "auxiliary_loss_clip": 0.01194966, "auxiliary_loss_mlp": 0.01038167, "balance_loss_clip": 1.06360745, "balance_loss_mlp": 1.0261209, "epoch": 0.08441050922864185, "flos": 23841920471040.0, "grad_norm": 2.586219021030874, "language_loss": 0.83047974, "learning_rate": 3.969087208503301e-06, "loss": 0.85281104, "num_input_tokens_seen": 14806720, "step": 702, "time_per_iteration": 2.750436544418335 }, { "auxiliary_loss_clip": 0.01196682, "auxiliary_loss_mlp": 0.01043609, "balance_loss_clip": 1.06262255, "balance_loss_mlp": 1.03034067, "epoch": 0.08453075211928095, "flos": 25520205582720.0, "grad_norm": 2.4177860894404404, "language_loss": 0.84816563, "learning_rate": 3.968950630343219e-06, "loss": 0.87056857, "num_input_tokens_seen": 14823705, "step": 703, "time_per_iteration": 2.7762889862060547 }, { "auxiliary_loss_clip": 0.01220587, "auxiliary_loss_mlp": 0.01039826, "balance_loss_clip": 1.06103706, "balance_loss_mlp": 1.02772021, "epoch": 0.08465099500992004, "flos": 19532496211200.0, "grad_norm": 2.1464520325466285, "language_loss": 0.93510211, "learning_rate": 3.968813753494892e-06, "loss": 0.95770609, "num_input_tokens_seen": 14841865, "step": 704, "time_per_iteration": 2.6497962474823 }, { "auxiliary_loss_clip": 0.01196328, "auxiliary_loss_mlp": 0.00766134, "balance_loss_clip": 1.05840993, "balance_loss_mlp": 1.00028729, "epoch": 0.08477123790055913, "flos": 29351299403520.0, "grad_norm": 4.527949140847245, "language_loss": 0.75522226, "learning_rate": 3.968676577979084e-06, "loss": 0.77484685, "num_input_tokens_seen": 14861415, "step": 705, "time_per_iteration": 2.968214273452759 }, { "auxiliary_loss_clip": 0.01193664, "auxiliary_loss_mlp": 0.01041379, "balance_loss_clip": 1.06046247, "balance_loss_mlp": 1.02816451, "epoch": 0.08489148079119822, "flos": 18624495283200.0, "grad_norm": 2.353269958503046, "language_loss": 0.77990454, "learning_rate": 3.968539103816605e-06, "loss": 0.80225492, "num_input_tokens_seen": 14879215, "step": 706, "time_per_iteration": 2.7759852409362793 }, { "auxiliary_loss_clip": 0.01224104, "auxiliary_loss_mlp": 0.00765879, "balance_loss_clip": 1.06543887, "balance_loss_mlp": 1.00039184, "epoch": 0.0850117236818373, "flos": 23471393725440.0, "grad_norm": 1.8655679264177616, "language_loss": 0.89063811, "learning_rate": 3.9684013310283085e-06, "loss": 0.91053796, "num_input_tokens_seen": 14897900, "step": 707, "time_per_iteration": 2.76870059967041 }, { "auxiliary_loss_clip": 0.01228089, "auxiliary_loss_mlp": 0.01049562, "balance_loss_clip": 1.06886601, "balance_loss_mlp": 1.03786707, "epoch": 0.0851319665724764, "flos": 40625058896640.0, "grad_norm": 3.0941477794634555, "language_loss": 0.64072669, "learning_rate": 3.9682632596350956e-06, "loss": 0.66350317, "num_input_tokens_seen": 14919065, "step": 708, "time_per_iteration": 2.8337628841400146 }, { "auxiliary_loss_clip": 0.01239035, "auxiliary_loss_mlp": 0.01044301, "balance_loss_clip": 1.06920099, "balance_loss_mlp": 1.03195655, "epoch": 0.0852522094631155, "flos": 15879554870400.0, "grad_norm": 2.0227811858772586, "language_loss": 0.78342557, "learning_rate": 3.968124889657911e-06, "loss": 0.80625892, "num_input_tokens_seen": 14934165, "step": 709, "time_per_iteration": 2.6406357288360596 }, { "auxiliary_loss_clip": 0.0119353, "auxiliary_loss_mlp": 0.01045, "balance_loss_clip": 1.05971694, "balance_loss_mlp": 1.03212523, "epoch": 0.08537245235375458, "flos": 14567091822720.0, "grad_norm": 2.2968876080717573, "language_loss": 0.90613914, "learning_rate": 3.967986221117746e-06, "loss": 0.92852449, "num_input_tokens_seen": 14950105, "step": 710, "time_per_iteration": 2.7819108963012695 }, { "auxiliary_loss_clip": 0.01166738, "auxiliary_loss_mlp": 0.01051051, "balance_loss_clip": 1.05559039, "balance_loss_mlp": 1.03814077, "epoch": 0.08549269524439368, "flos": 26468929555200.0, "grad_norm": 4.729345521178511, "language_loss": 0.86564821, "learning_rate": 3.967847254035635e-06, "loss": 0.88782609, "num_input_tokens_seen": 14969490, "step": 711, "time_per_iteration": 3.0751190185546875 }, { "auxiliary_loss_clip": 0.01212795, "auxiliary_loss_mlp": 0.01047025, "balance_loss_clip": 1.06493855, "balance_loss_mlp": 1.03359568, "epoch": 0.08561293813503276, "flos": 13590214565760.0, "grad_norm": 2.0428458617370207, "language_loss": 0.86749256, "learning_rate": 3.967707988432661e-06, "loss": 0.8900907, "num_input_tokens_seen": 14987195, "step": 712, "time_per_iteration": 2.995281219482422 }, { "auxiliary_loss_clip": 0.01254467, "auxiliary_loss_mlp": 0.01045017, "balance_loss_clip": 1.06729472, "balance_loss_mlp": 1.0324049, "epoch": 0.08573318102567186, "flos": 26943524979840.0, "grad_norm": 2.491285011873415, "language_loss": 0.87602705, "learning_rate": 3.967568424329949e-06, "loss": 0.89902192, "num_input_tokens_seen": 15007620, "step": 713, "time_per_iteration": 2.659435749053955 }, { "auxiliary_loss_clip": 0.01152622, "auxiliary_loss_mlp": 0.01020732, "balance_loss_clip": 1.04647088, "balance_loss_mlp": 1.01598775, "epoch": 0.08585342391631094, "flos": 67302739319040.0, "grad_norm": 0.8351390594223864, "language_loss": 0.55526352, "learning_rate": 3.967428561748671e-06, "loss": 0.57699704, "num_input_tokens_seen": 15075590, "step": 714, "time_per_iteration": 3.45459246635437 }, { "auxiliary_loss_clip": 0.01179118, "auxiliary_loss_mlp": 0.01039305, "balance_loss_clip": 1.05474567, "balance_loss_mlp": 1.0258944, "epoch": 0.08597366680695004, "flos": 22456594684800.0, "grad_norm": 2.5559555314301963, "language_loss": 0.87391782, "learning_rate": 3.967288400710045e-06, "loss": 0.89610195, "num_input_tokens_seen": 15095055, "step": 715, "time_per_iteration": 2.7718629837036133 }, { "auxiliary_loss_clip": 0.01206893, "auxiliary_loss_mlp": 0.01043356, "balance_loss_clip": 1.06617701, "balance_loss_mlp": 1.03114271, "epoch": 0.08609390969758914, "flos": 23550505430400.0, "grad_norm": 2.358505753506581, "language_loss": 0.88715887, "learning_rate": 3.9671479412353335e-06, "loss": 0.90966135, "num_input_tokens_seen": 15113520, "step": 716, "time_per_iteration": 2.722043514251709 }, { "auxiliary_loss_clip": 0.01238073, "auxiliary_loss_mlp": 0.01048534, "balance_loss_clip": 1.06611252, "balance_loss_mlp": 1.03521276, "epoch": 0.08621415258822822, "flos": 25885848078720.0, "grad_norm": 2.8229772703437566, "language_loss": 0.74151629, "learning_rate": 3.967007183345843e-06, "loss": 0.76438236, "num_input_tokens_seen": 15133375, "step": 717, "time_per_iteration": 2.749692440032959 }, { "auxiliary_loss_clip": 0.01236393, "auxiliary_loss_mlp": 0.01040357, "balance_loss_clip": 1.06610739, "balance_loss_mlp": 1.02752423, "epoch": 0.08633439547886732, "flos": 13589568120960.0, "grad_norm": 3.2042491538703217, "language_loss": 0.89368844, "learning_rate": 3.966866127062927e-06, "loss": 0.91645586, "num_input_tokens_seen": 15150500, "step": 718, "time_per_iteration": 2.624999523162842 }, { "auxiliary_loss_clip": 0.01147985, "auxiliary_loss_mlp": 0.01004591, "balance_loss_clip": 1.04104447, "balance_loss_mlp": 0.99975073, "epoch": 0.0864546383695064, "flos": 57767342434560.0, "grad_norm": 0.8646542641634731, "language_loss": 0.62710309, "learning_rate": 3.966724772407982e-06, "loss": 0.64862883, "num_input_tokens_seen": 15208015, "step": 719, "time_per_iteration": 4.015966176986694 }, { "auxiliary_loss_clip": 0.01196814, "auxiliary_loss_mlp": 0.01050014, "balance_loss_clip": 1.06251395, "balance_loss_mlp": 1.03809869, "epoch": 0.0865748812601455, "flos": 20046952753920.0, "grad_norm": 2.0642124941767275, "language_loss": 0.88866639, "learning_rate": 3.966583119402454e-06, "loss": 0.91113466, "num_input_tokens_seen": 15224780, "step": 720, "time_per_iteration": 2.722700834274292 }, { "auxiliary_loss_clip": 0.01234984, "auxiliary_loss_mlp": 0.00766024, "balance_loss_clip": 1.06594515, "balance_loss_mlp": 1.00026035, "epoch": 0.08669512415078459, "flos": 35262446935680.0, "grad_norm": 1.8061645746717536, "language_loss": 0.82249022, "learning_rate": 3.9664411680678305e-06, "loss": 0.84250027, "num_input_tokens_seen": 15246535, "step": 721, "time_per_iteration": 2.8109726905822754 }, { "auxiliary_loss_clip": 0.01123631, "auxiliary_loss_mlp": 0.01006567, "balance_loss_clip": 1.03843272, "balance_loss_mlp": 1.00182283, "epoch": 0.08681536704142367, "flos": 65654870048640.0, "grad_norm": 0.8484583294587621, "language_loss": 0.6143558, "learning_rate": 3.966298918425644e-06, "loss": 0.63565779, "num_input_tokens_seen": 15304025, "step": 722, "time_per_iteration": 4.096959352493286 }, { "auxiliary_loss_clip": 0.01237361, "auxiliary_loss_mlp": 0.01047522, "balance_loss_clip": 1.0638063, "balance_loss_mlp": 1.03502929, "epoch": 0.08693560993206277, "flos": 34529940881280.0, "grad_norm": 3.534067229579416, "language_loss": 0.83069241, "learning_rate": 3.966156370497476e-06, "loss": 0.85354125, "num_input_tokens_seen": 15327635, "step": 723, "time_per_iteration": 3.7428741455078125 }, { "auxiliary_loss_clip": 0.01241142, "auxiliary_loss_mlp": 0.01046554, "balance_loss_clip": 1.0659306, "balance_loss_mlp": 1.03320241, "epoch": 0.08705585282270185, "flos": 23149419189120.0, "grad_norm": 1.7641467326062703, "language_loss": 0.88544208, "learning_rate": 3.96601352430495e-06, "loss": 0.908319, "num_input_tokens_seen": 15347405, "step": 724, "time_per_iteration": 3.5794427394866943 }, { "auxiliary_loss_clip": 0.01219568, "auxiliary_loss_mlp": 0.010426, "balance_loss_clip": 1.06602359, "balance_loss_mlp": 1.03014898, "epoch": 0.08717609571334095, "flos": 29497599498240.0, "grad_norm": 1.9273464001831033, "language_loss": 0.83000052, "learning_rate": 3.965870379869735e-06, "loss": 0.85262215, "num_input_tokens_seen": 15369450, "step": 725, "time_per_iteration": 2.828470468521118 }, { "auxiliary_loss_clip": 0.01235903, "auxiliary_loss_mlp": 0.01039862, "balance_loss_clip": 1.06201875, "balance_loss_mlp": 1.02736354, "epoch": 0.08729633860398003, "flos": 20667489137280.0, "grad_norm": 2.170208736759099, "language_loss": 0.87050009, "learning_rate": 3.965726937213547e-06, "loss": 0.89325768, "num_input_tokens_seen": 15388085, "step": 726, "time_per_iteration": 2.7191240787506104 }, { "auxiliary_loss_clip": 0.01234163, "auxiliary_loss_mlp": 0.01036836, "balance_loss_clip": 1.06258941, "balance_loss_mlp": 1.02457559, "epoch": 0.08741658149461913, "flos": 18369493655040.0, "grad_norm": 2.395705109954899, "language_loss": 0.81270868, "learning_rate": 3.965583196358144e-06, "loss": 0.83541864, "num_input_tokens_seen": 15407120, "step": 727, "time_per_iteration": 2.66363263130188 }, { "auxiliary_loss_clip": 0.01253028, "auxiliary_loss_mlp": 0.01040496, "balance_loss_clip": 1.06612349, "balance_loss_mlp": 1.02747297, "epoch": 0.08753682438525823, "flos": 18729677283840.0, "grad_norm": 3.674837615685462, "language_loss": 0.74490404, "learning_rate": 3.965439157325335e-06, "loss": 0.76783925, "num_input_tokens_seen": 15424485, "step": 728, "time_per_iteration": 2.6076135635375977 }, { "auxiliary_loss_clip": 0.0121188, "auxiliary_loss_mlp": 0.0104224, "balance_loss_clip": 1.05793107, "balance_loss_mlp": 1.02933562, "epoch": 0.08765706727589731, "flos": 27776113303680.0, "grad_norm": 2.3433970136250886, "language_loss": 0.75825644, "learning_rate": 3.965294820136968e-06, "loss": 0.7807976, "num_input_tokens_seen": 15446285, "step": 729, "time_per_iteration": 2.7820146083831787 }, { "auxiliary_loss_clip": 0.01218706, "auxiliary_loss_mlp": 0.01046808, "balance_loss_clip": 1.06444001, "balance_loss_mlp": 1.03421903, "epoch": 0.08777731016653641, "flos": 24389127239040.0, "grad_norm": 2.411264455360739, "language_loss": 0.86889023, "learning_rate": 3.965150184814938e-06, "loss": 0.89154536, "num_input_tokens_seen": 15465770, "step": 730, "time_per_iteration": 2.749265193939209 }, { "auxiliary_loss_clip": 0.01205231, "auxiliary_loss_mlp": 0.01044585, "balance_loss_clip": 1.05829072, "balance_loss_mlp": 1.03243732, "epoch": 0.08789755305717549, "flos": 21981855605760.0, "grad_norm": 2.0966104766328817, "language_loss": 0.76569158, "learning_rate": 3.965005251381189e-06, "loss": 0.78818977, "num_input_tokens_seen": 15483705, "step": 731, "time_per_iteration": 2.757766008377075 }, { "auxiliary_loss_clip": 0.01145601, "auxiliary_loss_mlp": 0.01004728, "balance_loss_clip": 1.0345, "balance_loss_mlp": 1.00022149, "epoch": 0.08801779594781459, "flos": 58360120583040.0, "grad_norm": 0.8995173539817212, "language_loss": 0.64620185, "learning_rate": 3.964860019857705e-06, "loss": 0.66770506, "num_input_tokens_seen": 15548620, "step": 732, "time_per_iteration": 3.2875545024871826 }, { "auxiliary_loss_clip": 0.01251721, "auxiliary_loss_mlp": 0.01043533, "balance_loss_clip": 1.06899881, "balance_loss_mlp": 1.03096807, "epoch": 0.08813803883845367, "flos": 23294785530240.0, "grad_norm": 1.8823011591025562, "language_loss": 0.84298432, "learning_rate": 3.964714490266518e-06, "loss": 0.86593688, "num_input_tokens_seen": 15569265, "step": 733, "time_per_iteration": 2.574460983276367 }, { "auxiliary_loss_clip": 0.01142264, "auxiliary_loss_mlp": 0.01005089, "balance_loss_clip": 1.03471923, "balance_loss_mlp": 1.00055873, "epoch": 0.08825828172909277, "flos": 63424924882560.0, "grad_norm": 0.9058110251577008, "language_loss": 0.64563841, "learning_rate": 3.964568662629706e-06, "loss": 0.66711193, "num_input_tokens_seen": 15630570, "step": 734, "time_per_iteration": 3.1315927505493164 }, { "auxiliary_loss_clip": 0.01230072, "auxiliary_loss_mlp": 0.01038409, "balance_loss_clip": 1.06045103, "balance_loss_mlp": 1.02548051, "epoch": 0.08837852461973186, "flos": 26720986268160.0, "grad_norm": 3.1982520278931736, "language_loss": 0.84514499, "learning_rate": 3.9644225369693895e-06, "loss": 0.8678298, "num_input_tokens_seen": 15650870, "step": 735, "time_per_iteration": 2.72280216217041 }, { "auxiliary_loss_clip": 0.01254193, "auxiliary_loss_mlp": 0.010461, "balance_loss_clip": 1.07066083, "balance_loss_mlp": 1.03453112, "epoch": 0.08849876751037095, "flos": 27265427688960.0, "grad_norm": 2.0383496174859963, "language_loss": 0.86618328, "learning_rate": 3.964276113307735e-06, "loss": 0.88918614, "num_input_tokens_seen": 15670835, "step": 736, "time_per_iteration": 2.6650760173797607 }, { "auxiliary_loss_clip": 0.01200667, "auxiliary_loss_mlp": 0.01042981, "balance_loss_clip": 1.06198967, "balance_loss_mlp": 1.03038692, "epoch": 0.08861901040101004, "flos": 19828759587840.0, "grad_norm": 1.9183812032945486, "language_loss": 0.81013262, "learning_rate": 3.9641293916669574e-06, "loss": 0.83256906, "num_input_tokens_seen": 15689795, "step": 737, "time_per_iteration": 2.755119562149048 }, { "auxiliary_loss_clip": 0.01201493, "auxiliary_loss_mlp": 0.01041637, "balance_loss_clip": 1.06092775, "balance_loss_mlp": 1.02878666, "epoch": 0.08873925329164913, "flos": 23658704173440.0, "grad_norm": 2.04386359428358, "language_loss": 0.83056951, "learning_rate": 3.9639823720693115e-06, "loss": 0.85300076, "num_input_tokens_seen": 15711650, "step": 738, "time_per_iteration": 2.812493324279785 }, { "auxiliary_loss_clip": 0.01127754, "auxiliary_loss_mlp": 0.01010225, "balance_loss_clip": 1.04175735, "balance_loss_mlp": 1.00483692, "epoch": 0.08885949618228822, "flos": 71831541893760.0, "grad_norm": 0.8473432630130484, "language_loss": 0.60009986, "learning_rate": 3.963835054537102e-06, "loss": 0.62147963, "num_input_tokens_seen": 15780615, "step": 739, "time_per_iteration": 3.373335123062134 }, { "auxiliary_loss_clip": 0.01215948, "auxiliary_loss_mlp": 0.0104004, "balance_loss_clip": 1.05923653, "balance_loss_mlp": 1.02798796, "epoch": 0.08897973907292732, "flos": 22346169298560.0, "grad_norm": 3.0956375336296422, "language_loss": 0.60901785, "learning_rate": 3.963687439092676e-06, "loss": 0.63157767, "num_input_tokens_seen": 15801300, "step": 740, "time_per_iteration": 2.6933131217956543 }, { "auxiliary_loss_clip": 0.01233963, "auxiliary_loss_mlp": 0.01041706, "balance_loss_clip": 1.0653615, "balance_loss_mlp": 1.02965403, "epoch": 0.0890999819635664, "flos": 21251827589760.0, "grad_norm": 2.0800687241841422, "language_loss": 0.80294734, "learning_rate": 3.963539525758427e-06, "loss": 0.82570398, "num_input_tokens_seen": 15820860, "step": 741, "time_per_iteration": 2.7075846195220947 }, { "auxiliary_loss_clip": 0.01224671, "auxiliary_loss_mlp": 0.01041473, "balance_loss_clip": 1.0651325, "balance_loss_mlp": 1.02945662, "epoch": 0.0892202248542055, "flos": 25370888745600.0, "grad_norm": 2.5090202074810315, "language_loss": 0.67660534, "learning_rate": 3.9633913145567925e-06, "loss": 0.69926679, "num_input_tokens_seen": 15841350, "step": 742, "time_per_iteration": 2.8041248321533203 }, { "auxiliary_loss_clip": 0.01225504, "auxiliary_loss_mlp": 0.01042978, "balance_loss_clip": 1.06542873, "balance_loss_mlp": 1.03087258, "epoch": 0.08934046774484458, "flos": 24457895827200.0, "grad_norm": 3.162040117072184, "language_loss": 0.81627935, "learning_rate": 3.9632428055102575e-06, "loss": 0.83896416, "num_input_tokens_seen": 15861360, "step": 743, "time_per_iteration": 2.7083804607391357 }, { "auxiliary_loss_clip": 0.01243309, "auxiliary_loss_mlp": 0.01034553, "balance_loss_clip": 1.06892991, "balance_loss_mlp": 1.02229202, "epoch": 0.08946071063548368, "flos": 35772773414400.0, "grad_norm": 2.4222512203101583, "language_loss": 0.66943353, "learning_rate": 3.9630939986413495e-06, "loss": 0.69221216, "num_input_tokens_seen": 15883160, "step": 744, "time_per_iteration": 2.7982137203216553 }, { "auxiliary_loss_clip": 0.01190332, "auxiliary_loss_mlp": 0.01047042, "balance_loss_clip": 1.05963683, "balance_loss_mlp": 1.03528762, "epoch": 0.08958095352612276, "flos": 14356584167040.0, "grad_norm": 1.787743573293069, "language_loss": 0.78074259, "learning_rate": 3.962944893972643e-06, "loss": 0.80311632, "num_input_tokens_seen": 15901610, "step": 745, "time_per_iteration": 3.629729747772217 }, { "auxiliary_loss_clip": 0.01217828, "auxiliary_loss_mlp": 0.01041113, "balance_loss_clip": 1.065323, "balance_loss_mlp": 1.02915633, "epoch": 0.08970119641676186, "flos": 17853277345920.0, "grad_norm": 2.8514971394322433, "language_loss": 0.90785909, "learning_rate": 3.962795491526756e-06, "loss": 0.93044853, "num_input_tokens_seen": 15918770, "step": 746, "time_per_iteration": 2.6967415809631348 }, { "auxiliary_loss_clip": 0.01255447, "auxiliary_loss_mlp": 0.0104973, "balance_loss_clip": 1.06849134, "balance_loss_mlp": 1.03721941, "epoch": 0.08982143930740095, "flos": 20811670329600.0, "grad_norm": 2.6480664192225105, "language_loss": 0.8881126, "learning_rate": 3.962645791326354e-06, "loss": 0.9111644, "num_input_tokens_seen": 15938025, "step": 747, "time_per_iteration": 3.612863063812256 }, { "auxiliary_loss_clip": 0.01233781, "auxiliary_loss_mlp": 0.01034143, "balance_loss_clip": 1.06807196, "balance_loss_mlp": 1.02262104, "epoch": 0.08994168219804004, "flos": 24097712198400.0, "grad_norm": 1.806066618536283, "language_loss": 0.83128136, "learning_rate": 3.962495793394146e-06, "loss": 0.85396057, "num_input_tokens_seen": 15957215, "step": 748, "time_per_iteration": 2.686896800994873 }, { "auxiliary_loss_clip": 0.01154345, "auxiliary_loss_mlp": 0.0100601, "balance_loss_clip": 1.03434443, "balance_loss_mlp": 1.00159967, "epoch": 0.09006192508867913, "flos": 57188893812480.0, "grad_norm": 0.7371593434349886, "language_loss": 0.61128163, "learning_rate": 3.9623454977528864e-06, "loss": 0.63288522, "num_input_tokens_seen": 16015870, "step": 749, "time_per_iteration": 4.017616271972656 }, { "auxiliary_loss_clip": 0.01206062, "auxiliary_loss_mlp": 0.01053556, "balance_loss_clip": 1.0599792, "balance_loss_mlp": 1.04044914, "epoch": 0.09018216797931822, "flos": 20487505063680.0, "grad_norm": 1.9758201657783958, "language_loss": 0.84978998, "learning_rate": 3.962194904425375e-06, "loss": 0.87238616, "num_input_tokens_seen": 16036500, "step": 750, "time_per_iteration": 3.651291847229004 }, { "auxiliary_loss_clip": 0.01232775, "auxiliary_loss_mlp": 0.01038304, "balance_loss_clip": 1.06524587, "balance_loss_mlp": 1.02711594, "epoch": 0.09030241086995731, "flos": 22638123043200.0, "grad_norm": 2.2728895278818304, "language_loss": 0.68001461, "learning_rate": 3.9620440134344566e-06, "loss": 0.70272541, "num_input_tokens_seen": 16054655, "step": 751, "time_per_iteration": 2.768510341644287 }, { "auxiliary_loss_clip": 0.0120409, "auxiliary_loss_mlp": 0.01041261, "balance_loss_clip": 1.06400156, "balance_loss_mlp": 1.02882123, "epoch": 0.09042265376059641, "flos": 21871502046720.0, "grad_norm": 2.7224802721906887, "language_loss": 0.82869315, "learning_rate": 3.9618928248030215e-06, "loss": 0.85114664, "num_input_tokens_seen": 16074165, "step": 752, "time_per_iteration": 2.7840282917022705 }, { "auxiliary_loss_clip": 0.01233282, "auxiliary_loss_mlp": 0.01039217, "balance_loss_clip": 1.06599772, "balance_loss_mlp": 1.02734351, "epoch": 0.0905428966512355, "flos": 24316192673280.0, "grad_norm": 2.744090631735611, "language_loss": 0.82770991, "learning_rate": 3.961741338554005e-06, "loss": 0.8504349, "num_input_tokens_seen": 16092505, "step": 753, "time_per_iteration": 2.681762218475342 }, { "auxiliary_loss_clip": 0.01230282, "auxiliary_loss_mlp": 0.01055359, "balance_loss_clip": 1.06675577, "balance_loss_mlp": 1.04274082, "epoch": 0.09066313954187459, "flos": 35845061535360.0, "grad_norm": 2.209333204443945, "language_loss": 0.75877368, "learning_rate": 3.9615895547103865e-06, "loss": 0.7816301, "num_input_tokens_seen": 16116150, "step": 754, "time_per_iteration": 2.815030574798584 }, { "auxiliary_loss_clip": 0.01218613, "auxiliary_loss_mlp": 0.0103846, "balance_loss_clip": 1.06370175, "balance_loss_mlp": 1.02630639, "epoch": 0.09078338243251367, "flos": 29168729550720.0, "grad_norm": 2.1719525401467727, "language_loss": 0.77766424, "learning_rate": 3.961437473295193e-06, "loss": 0.80023497, "num_input_tokens_seen": 16136295, "step": 755, "time_per_iteration": 2.8149194717407227 }, { "auxiliary_loss_clip": 0.01168856, "auxiliary_loss_mlp": 0.01047662, "balance_loss_clip": 1.05044293, "balance_loss_mlp": 1.03588414, "epoch": 0.09090362532315277, "flos": 21907699977600.0, "grad_norm": 3.0984925705992983, "language_loss": 0.72449785, "learning_rate": 3.961285094331495e-06, "loss": 0.74666309, "num_input_tokens_seen": 16154210, "step": 756, "time_per_iteration": 2.9124462604522705 }, { "auxiliary_loss_clip": 0.01247772, "auxiliary_loss_mlp": 0.01043139, "balance_loss_clip": 1.06688213, "balance_loss_mlp": 1.03135478, "epoch": 0.09102386821379185, "flos": 27344503480320.0, "grad_norm": 3.566729164137194, "language_loss": 0.85911244, "learning_rate": 3.961132417842406e-06, "loss": 0.88202155, "num_input_tokens_seen": 16173995, "step": 757, "time_per_iteration": 2.749868392944336 }, { "auxiliary_loss_clip": 0.01224946, "auxiliary_loss_mlp": 0.01037942, "balance_loss_clip": 1.06439745, "balance_loss_mlp": 1.02626491, "epoch": 0.09114411110443095, "flos": 20813501923200.0, "grad_norm": 3.0828914787017303, "language_loss": 0.75058854, "learning_rate": 3.960979443851089e-06, "loss": 0.77321744, "num_input_tokens_seen": 16191020, "step": 758, "time_per_iteration": 2.7564122676849365 }, { "auxiliary_loss_clip": 0.01215906, "auxiliary_loss_mlp": 0.01036981, "balance_loss_clip": 1.06428134, "balance_loss_mlp": 1.02511346, "epoch": 0.09126435399507005, "flos": 26145949438080.0, "grad_norm": 1.6708026147587776, "language_loss": 0.79192483, "learning_rate": 3.96082617238075e-06, "loss": 0.81445366, "num_input_tokens_seen": 16213645, "step": 759, "time_per_iteration": 2.876147985458374 }, { "auxiliary_loss_clip": 0.0121759, "auxiliary_loss_mlp": 0.01038623, "balance_loss_clip": 1.06346011, "balance_loss_mlp": 1.02710152, "epoch": 0.09138459688570913, "flos": 24388911757440.0, "grad_norm": 2.3309888049117022, "language_loss": 0.7968297, "learning_rate": 3.960672603454639e-06, "loss": 0.81939179, "num_input_tokens_seen": 16233625, "step": 760, "time_per_iteration": 2.8499250411987305 }, { "auxiliary_loss_clip": 0.01224839, "auxiliary_loss_mlp": 0.01039615, "balance_loss_clip": 1.06297278, "balance_loss_mlp": 1.02799821, "epoch": 0.09150483977634823, "flos": 21032664756480.0, "grad_norm": 4.545074128782622, "language_loss": 0.76892692, "learning_rate": 3.960518737096054e-06, "loss": 0.79157138, "num_input_tokens_seen": 16253255, "step": 761, "time_per_iteration": 2.741894006729126 }, { "auxiliary_loss_clip": 0.0123448, "auxiliary_loss_mlp": 0.01039002, "balance_loss_clip": 1.06576443, "balance_loss_mlp": 1.02778423, "epoch": 0.09162508266698731, "flos": 22856998567680.0, "grad_norm": 2.6640867552443774, "language_loss": 0.73249102, "learning_rate": 3.960364573328334e-06, "loss": 0.75522578, "num_input_tokens_seen": 16272580, "step": 762, "time_per_iteration": 2.6220500469207764 }, { "auxiliary_loss_clip": 0.01202868, "auxiliary_loss_mlp": 0.01036492, "balance_loss_clip": 1.05909216, "balance_loss_mlp": 1.02445769, "epoch": 0.0917453255576264, "flos": 21724411852800.0, "grad_norm": 1.9967282260163974, "language_loss": 0.88695788, "learning_rate": 3.9602101121748675e-06, "loss": 0.90935147, "num_input_tokens_seen": 16293075, "step": 763, "time_per_iteration": 2.7659759521484375 }, { "auxiliary_loss_clip": 0.01215631, "auxiliary_loss_mlp": 0.01037158, "balance_loss_clip": 1.06386256, "balance_loss_mlp": 1.02591634, "epoch": 0.0918655684482655, "flos": 14609215497600.0, "grad_norm": 2.093474670557306, "language_loss": 0.72849476, "learning_rate": 3.960055353659085e-06, "loss": 0.7510227, "num_input_tokens_seen": 16310185, "step": 764, "time_per_iteration": 2.725687026977539 }, { "auxiliary_loss_clip": 0.01201149, "auxiliary_loss_mlp": 0.01042913, "balance_loss_clip": 1.06054497, "balance_loss_mlp": 1.03124249, "epoch": 0.09198581133890459, "flos": 23435016226560.0, "grad_norm": 2.264598638857187, "language_loss": 0.83863842, "learning_rate": 3.959900297804465e-06, "loss": 0.86107904, "num_input_tokens_seen": 16330355, "step": 765, "time_per_iteration": 2.747556686401367 }, { "auxiliary_loss_clip": 0.01203923, "auxiliary_loss_mlp": 0.01036747, "balance_loss_clip": 1.06095529, "balance_loss_mlp": 1.02587509, "epoch": 0.09210605422954368, "flos": 16795887753600.0, "grad_norm": 2.1479671236386606, "language_loss": 0.77162468, "learning_rate": 3.9597449446345276e-06, "loss": 0.79403138, "num_input_tokens_seen": 16347600, "step": 766, "time_per_iteration": 2.710190534591675 }, { "auxiliary_loss_clip": 0.01203307, "auxiliary_loss_mlp": 0.01040228, "balance_loss_clip": 1.05886936, "balance_loss_mlp": 1.0286293, "epoch": 0.09222629712018277, "flos": 22674249146880.0, "grad_norm": 3.4427678081549273, "language_loss": 0.83120906, "learning_rate": 3.95958929417284e-06, "loss": 0.85364437, "num_input_tokens_seen": 16365755, "step": 767, "time_per_iteration": 2.697509527206421 }, { "auxiliary_loss_clip": 0.01144877, "auxiliary_loss_mlp": 0.01007611, "balance_loss_clip": 1.03667462, "balance_loss_mlp": 1.00312924, "epoch": 0.09234654001082186, "flos": 69976756327680.0, "grad_norm": 0.7338713219237929, "language_loss": 0.58783251, "learning_rate": 3.9594333464430145e-06, "loss": 0.60935742, "num_input_tokens_seen": 16435245, "step": 768, "time_per_iteration": 3.3781991004943848 }, { "auxiliary_loss_clip": 0.01143158, "auxiliary_loss_mlp": 0.01043108, "balance_loss_clip": 1.05021071, "balance_loss_mlp": 1.03204584, "epoch": 0.09246678290146094, "flos": 20011437181440.0, "grad_norm": 2.004930607317738, "language_loss": 0.88396406, "learning_rate": 3.959277101468709e-06, "loss": 0.90582669, "num_input_tokens_seen": 16454795, "step": 769, "time_per_iteration": 2.810681104660034 }, { "auxiliary_loss_clip": 0.01202836, "auxiliary_loss_mlp": 0.01044076, "balance_loss_clip": 1.05888081, "balance_loss_mlp": 1.03248894, "epoch": 0.09258702579210004, "flos": 17747448900480.0, "grad_norm": 2.9483833178776804, "language_loss": 0.78736341, "learning_rate": 3.959120559273624e-06, "loss": 0.80983251, "num_input_tokens_seen": 16472580, "step": 770, "time_per_iteration": 2.6587328910827637 }, { "auxiliary_loss_clip": 0.01203467, "auxiliary_loss_mlp": 0.01048943, "balance_loss_clip": 1.06299174, "balance_loss_mlp": 1.03713489, "epoch": 0.09270726868273914, "flos": 20886544229760.0, "grad_norm": 1.9831195196168443, "language_loss": 0.83472824, "learning_rate": 3.958963719881509e-06, "loss": 0.85725236, "num_input_tokens_seen": 16490670, "step": 771, "time_per_iteration": 3.740633010864258 }, { "auxiliary_loss_clip": 0.012337, "auxiliary_loss_mlp": 0.01046347, "balance_loss_clip": 1.06759048, "balance_loss_mlp": 1.03483748, "epoch": 0.09282751157337822, "flos": 17015697031680.0, "grad_norm": 2.1214620675136886, "language_loss": 0.93888104, "learning_rate": 3.958806583316154e-06, "loss": 0.96168149, "num_input_tokens_seen": 16508640, "step": 772, "time_per_iteration": 2.6777212619781494 }, { "auxiliary_loss_clip": 0.01251112, "auxiliary_loss_mlp": 0.01038309, "balance_loss_clip": 1.06988513, "balance_loss_mlp": 1.02678728, "epoch": 0.09294775446401732, "flos": 32523647748480.0, "grad_norm": 1.7787970787071645, "language_loss": 0.78773671, "learning_rate": 3.9586491496013985e-06, "loss": 0.81063092, "num_input_tokens_seen": 16531035, "step": 773, "time_per_iteration": 3.7450931072235107 }, { "auxiliary_loss_clip": 0.01238858, "auxiliary_loss_mlp": 0.01036293, "balance_loss_clip": 1.06909454, "balance_loss_mlp": 1.0245266, "epoch": 0.0930679973546564, "flos": 18259750627200.0, "grad_norm": 2.2994100251818277, "language_loss": 0.82940274, "learning_rate": 3.958491418761124e-06, "loss": 0.85215425, "num_input_tokens_seen": 16548605, "step": 774, "time_per_iteration": 2.754110097885132 }, { "auxiliary_loss_clip": 0.01216676, "auxiliary_loss_mlp": 0.01039747, "balance_loss_clip": 1.06003261, "balance_loss_mlp": 1.02847612, "epoch": 0.0931882402452955, "flos": 21099745405440.0, "grad_norm": 7.159060694734685, "language_loss": 0.7229321, "learning_rate": 3.958333390819258e-06, "loss": 0.74549633, "num_input_tokens_seen": 16565535, "step": 775, "time_per_iteration": 3.596853017807007 }, { "auxiliary_loss_clip": 0.01247353, "auxiliary_loss_mlp": 0.01050704, "balance_loss_clip": 1.06755412, "balance_loss_mlp": 1.03831232, "epoch": 0.0933084831359346, "flos": 24207275658240.0, "grad_norm": 2.43999179784978, "language_loss": 0.80247557, "learning_rate": 3.9581750657997754e-06, "loss": 0.8254562, "num_input_tokens_seen": 16584900, "step": 776, "time_per_iteration": 3.526524782180786 }, { "auxiliary_loss_clip": 0.01215438, "auxiliary_loss_mlp": 0.01040537, "balance_loss_clip": 1.06233335, "balance_loss_mlp": 1.0284189, "epoch": 0.09342872602657368, "flos": 25480272637440.0, "grad_norm": 1.7134016152099005, "language_loss": 0.89543742, "learning_rate": 3.95801644372669e-06, "loss": 0.91799712, "num_input_tokens_seen": 16604805, "step": 777, "time_per_iteration": 2.749793529510498 }, { "auxiliary_loss_clip": 0.01219697, "auxiliary_loss_mlp": 0.01042278, "balance_loss_clip": 1.06006765, "balance_loss_mlp": 1.03087544, "epoch": 0.09354896891721277, "flos": 23149060053120.0, "grad_norm": 2.859053736978666, "language_loss": 0.84294373, "learning_rate": 3.957857524624068e-06, "loss": 0.86556351, "num_input_tokens_seen": 16623685, "step": 778, "time_per_iteration": 2.721316337585449 }, { "auxiliary_loss_clip": 0.0121424, "auxiliary_loss_mlp": 0.01042161, "balance_loss_clip": 1.06113958, "balance_loss_mlp": 1.03113365, "epoch": 0.09366921180785186, "flos": 24279563779200.0, "grad_norm": 1.7705659548303052, "language_loss": 0.89853454, "learning_rate": 3.957698308516016e-06, "loss": 0.92109859, "num_input_tokens_seen": 16644985, "step": 779, "time_per_iteration": 2.7251243591308594 }, { "auxiliary_loss_clip": 0.01229537, "auxiliary_loss_mlp": 0.00764929, "balance_loss_clip": 1.06850314, "balance_loss_mlp": 1.00071621, "epoch": 0.09378945469849095, "flos": 18730036419840.0, "grad_norm": 2.0163891010333943, "language_loss": 0.8255415, "learning_rate": 3.957538795426688e-06, "loss": 0.84548616, "num_input_tokens_seen": 16662410, "step": 780, "time_per_iteration": 2.678981065750122 }, { "auxiliary_loss_clip": 0.01218288, "auxiliary_loss_mlp": 0.01052171, "balance_loss_clip": 1.06231105, "balance_loss_mlp": 1.04058409, "epoch": 0.09390969758913004, "flos": 23218834222080.0, "grad_norm": 7.087770472743251, "language_loss": 0.77223176, "learning_rate": 3.9573789853802804e-06, "loss": 0.79493642, "num_input_tokens_seen": 16680885, "step": 781, "time_per_iteration": 2.7364144325256348 }, { "auxiliary_loss_clip": 0.012151, "auxiliary_loss_mlp": 0.00765403, "balance_loss_clip": 1.06462812, "balance_loss_mlp": 1.00065005, "epoch": 0.09402994047976913, "flos": 19646728439040.0, "grad_norm": 2.0635957591956986, "language_loss": 0.74735188, "learning_rate": 3.957218878401037e-06, "loss": 0.76715696, "num_input_tokens_seen": 16699375, "step": 782, "time_per_iteration": 2.7537899017333984 }, { "auxiliary_loss_clip": 0.01250834, "auxiliary_loss_mlp": 0.01041313, "balance_loss_clip": 1.06962872, "balance_loss_mlp": 1.0292908, "epoch": 0.09415018337040823, "flos": 29420463041280.0, "grad_norm": 1.8494498680723792, "language_loss": 0.89577246, "learning_rate": 3.957058474513246e-06, "loss": 0.91869396, "num_input_tokens_seen": 16719230, "step": 783, "time_per_iteration": 2.7943217754364014 }, { "auxiliary_loss_clip": 0.01233295, "auxiliary_loss_mlp": 0.01041839, "balance_loss_clip": 1.06917238, "balance_loss_mlp": 1.03087759, "epoch": 0.09427042626104731, "flos": 24572092141440.0, "grad_norm": 1.898341520435001, "language_loss": 0.78466845, "learning_rate": 3.956897773741241e-06, "loss": 0.80741984, "num_input_tokens_seen": 16738220, "step": 784, "time_per_iteration": 2.7535488605499268 }, { "auxiliary_loss_clip": 0.01206111, "auxiliary_loss_mlp": 0.01036706, "balance_loss_clip": 1.0604496, "balance_loss_mlp": 1.02547002, "epoch": 0.09439066915168641, "flos": 26359581576960.0, "grad_norm": 1.789354898587762, "language_loss": 0.71714216, "learning_rate": 3.956736776109398e-06, "loss": 0.73957038, "num_input_tokens_seen": 16759395, "step": 785, "time_per_iteration": 2.810006618499756 }, { "auxiliary_loss_clip": 0.01220177, "auxiliary_loss_mlp": 0.00765402, "balance_loss_clip": 1.06088638, "balance_loss_mlp": 1.00062799, "epoch": 0.09451091204232549, "flos": 19427278296960.0, "grad_norm": 2.258281349081871, "language_loss": 0.83675635, "learning_rate": 3.956575481642143e-06, "loss": 0.85661209, "num_input_tokens_seen": 16778285, "step": 786, "time_per_iteration": 2.830056667327881 }, { "auxiliary_loss_clip": 0.01183777, "auxiliary_loss_mlp": 0.01042451, "balance_loss_clip": 1.05558836, "balance_loss_mlp": 1.03118622, "epoch": 0.09463115493296459, "flos": 25368051571200.0, "grad_norm": 2.368885794193833, "language_loss": 0.7503497, "learning_rate": 3.956413890363943e-06, "loss": 0.77261198, "num_input_tokens_seen": 16795265, "step": 787, "time_per_iteration": 2.80875563621521 }, { "auxiliary_loss_clip": 0.01235395, "auxiliary_loss_mlp": 0.01043599, "balance_loss_clip": 1.06837058, "balance_loss_mlp": 1.03161836, "epoch": 0.09475139782360369, "flos": 10123254869760.0, "grad_norm": 2.0645299508988604, "language_loss": 0.81528306, "learning_rate": 3.956252002299312e-06, "loss": 0.83807302, "num_input_tokens_seen": 16811165, "step": 788, "time_per_iteration": 2.703986644744873 }, { "auxiliary_loss_clip": 0.01245473, "auxiliary_loss_mlp": 0.01036705, "balance_loss_clip": 1.06653619, "balance_loss_mlp": 1.02476072, "epoch": 0.09487164071424277, "flos": 17231088936960.0, "grad_norm": 2.0760257031356666, "language_loss": 0.90921116, "learning_rate": 3.956089817472807e-06, "loss": 0.93203294, "num_input_tokens_seen": 16828470, "step": 789, "time_per_iteration": 2.618983507156372 }, { "auxiliary_loss_clip": 0.01215494, "auxiliary_loss_mlp": 0.01038287, "balance_loss_clip": 1.06625104, "balance_loss_mlp": 1.02655697, "epoch": 0.09499188360488187, "flos": 30849564528000.0, "grad_norm": 2.9811318860132716, "language_loss": 0.85540891, "learning_rate": 3.955927335909032e-06, "loss": 0.87794673, "num_input_tokens_seen": 16851680, "step": 790, "time_per_iteration": 2.7714617252349854 }, { "auxiliary_loss_clip": 0.01184649, "auxiliary_loss_mlp": 0.01035859, "balance_loss_clip": 1.06306624, "balance_loss_mlp": 1.02422452, "epoch": 0.09511212649552095, "flos": 29351694453120.0, "grad_norm": 2.2241378870695274, "language_loss": 0.76105446, "learning_rate": 3.955764557632634e-06, "loss": 0.78325951, "num_input_tokens_seen": 16871490, "step": 791, "time_per_iteration": 2.813549518585205 }, { "auxiliary_loss_clip": 0.01207862, "auxiliary_loss_mlp": 0.01047009, "balance_loss_clip": 1.05999959, "balance_loss_mlp": 1.03525519, "epoch": 0.09523236938616005, "flos": 10378687461120.0, "grad_norm": 4.055142254576654, "language_loss": 0.94861662, "learning_rate": 3.955601482668309e-06, "loss": 0.9711653, "num_input_tokens_seen": 16889350, "step": 792, "time_per_iteration": 2.697558641433716 }, { "auxiliary_loss_clip": 0.01178652, "auxiliary_loss_mlp": 0.0104354, "balance_loss_clip": 1.0539726, "balance_loss_mlp": 1.03188729, "epoch": 0.09535261227679913, "flos": 19061815368960.0, "grad_norm": 2.6248505363224317, "language_loss": 0.88497108, "learning_rate": 3.955438111040794e-06, "loss": 0.90719301, "num_input_tokens_seen": 16907625, "step": 793, "time_per_iteration": 2.7049691677093506 }, { "auxiliary_loss_clip": 0.01178684, "auxiliary_loss_mlp": 0.0103567, "balance_loss_clip": 1.05598283, "balance_loss_mlp": 1.02481031, "epoch": 0.09547285516743823, "flos": 20922993555840.0, "grad_norm": 2.164634627617604, "language_loss": 0.79994118, "learning_rate": 3.955274442774873e-06, "loss": 0.82208472, "num_input_tokens_seen": 16926205, "step": 794, "time_per_iteration": 2.78458833694458 }, { "auxiliary_loss_clip": 0.01230276, "auxiliary_loss_mlp": 0.01049868, "balance_loss_clip": 1.06192338, "balance_loss_mlp": 1.0377264, "epoch": 0.09559309805807732, "flos": 30154405639680.0, "grad_norm": 2.197467611193499, "language_loss": 0.70397079, "learning_rate": 3.9551104778953725e-06, "loss": 0.72677219, "num_input_tokens_seen": 16946500, "step": 795, "time_per_iteration": 2.7491538524627686 }, { "auxiliary_loss_clip": 0.01202322, "auxiliary_loss_mlp": 0.01035862, "balance_loss_clip": 1.05913424, "balance_loss_mlp": 1.02359509, "epoch": 0.0957133409487164, "flos": 21066743784960.0, "grad_norm": 1.8683531426859925, "language_loss": 0.85264462, "learning_rate": 3.954946216427167e-06, "loss": 0.87502646, "num_input_tokens_seen": 16966960, "step": 796, "time_per_iteration": 2.7400617599487305 }, { "auxiliary_loss_clip": 0.0111456, "auxiliary_loss_mlp": 0.01010103, "balance_loss_clip": 1.03376174, "balance_loss_mlp": 1.0057162, "epoch": 0.0958335838393555, "flos": 71297979315840.0, "grad_norm": 0.8876874758579827, "language_loss": 0.61579883, "learning_rate": 3.954781658395176e-06, "loss": 0.63704544, "num_input_tokens_seen": 17023215, "step": 797, "time_per_iteration": 4.097166061401367 }, { "auxiliary_loss_clip": 0.01220723, "auxiliary_loss_mlp": 0.01046202, "balance_loss_clip": 1.06174695, "balance_loss_mlp": 1.03333926, "epoch": 0.09595382672999458, "flos": 21872974504320.0, "grad_norm": 2.078387556535874, "language_loss": 0.92263746, "learning_rate": 3.95461680382436e-06, "loss": 0.94530666, "num_input_tokens_seen": 17042140, "step": 798, "time_per_iteration": 2.8010916709899902 }, { "auxiliary_loss_clip": 0.01236342, "auxiliary_loss_mlp": 0.01046756, "balance_loss_clip": 1.06696236, "balance_loss_mlp": 1.03447747, "epoch": 0.09607406962063368, "flos": 18695562341760.0, "grad_norm": 73.69063321422855, "language_loss": 0.86297911, "learning_rate": 3.9544516527397295e-06, "loss": 0.88581014, "num_input_tokens_seen": 17058490, "step": 799, "time_per_iteration": 3.8383450508117676 }, { "auxiliary_loss_clip": 0.01206766, "auxiliary_loss_mlp": 0.01043904, "balance_loss_clip": 1.06323647, "balance_loss_mlp": 1.03234673, "epoch": 0.09619431251127276, "flos": 22568456615040.0, "grad_norm": 1.8339756391731807, "language_loss": 0.80704331, "learning_rate": 3.954286205166338e-06, "loss": 0.82955003, "num_input_tokens_seen": 17079655, "step": 800, "time_per_iteration": 2.8015217781066895 }, { "auxiliary_loss_clip": 0.0123498, "auxiliary_loss_mlp": 0.01040274, "balance_loss_clip": 1.06909215, "balance_loss_mlp": 1.02797771, "epoch": 0.09631455540191186, "flos": 14246230608000.0, "grad_norm": 2.405197594283106, "language_loss": 0.83736694, "learning_rate": 3.954120461129282e-06, "loss": 0.86011946, "num_input_tokens_seen": 17097065, "step": 801, "time_per_iteration": 4.494358777999878 }, { "auxiliary_loss_clip": 0.01254462, "auxiliary_loss_mlp": 0.0105516, "balance_loss_clip": 1.07187188, "balance_loss_mlp": 1.04380572, "epoch": 0.09643479829255096, "flos": 20740387789440.0, "grad_norm": 2.148884601645676, "language_loss": 0.83681571, "learning_rate": 3.953954420653706e-06, "loss": 0.85991192, "num_input_tokens_seen": 17114090, "step": 802, "time_per_iteration": 2.81734561920166 }, { "auxiliary_loss_clip": 0.01230834, "auxiliary_loss_mlp": 0.010424, "balance_loss_clip": 1.06610632, "balance_loss_mlp": 1.03010333, "epoch": 0.09655504118319004, "flos": 24420476833920.0, "grad_norm": 1.985828667944063, "language_loss": 0.88169736, "learning_rate": 3.953788083764798e-06, "loss": 0.90442973, "num_input_tokens_seen": 17133325, "step": 803, "time_per_iteration": 2.73512864112854 }, { "auxiliary_loss_clip": 0.01181709, "auxiliary_loss_mlp": 0.01042086, "balance_loss_clip": 1.05767536, "balance_loss_mlp": 1.03055251, "epoch": 0.09667528407382914, "flos": 18441961344000.0, "grad_norm": 2.4755695405416045, "language_loss": 0.92424572, "learning_rate": 3.953621450487792e-06, "loss": 0.94648367, "num_input_tokens_seen": 17151945, "step": 804, "time_per_iteration": 2.696316719055176 }, { "auxiliary_loss_clip": 0.01149461, "auxiliary_loss_mlp": 0.0100719, "balance_loss_clip": 1.03321505, "balance_loss_mlp": 1.0029223, "epoch": 0.09679552696446822, "flos": 70816455544320.0, "grad_norm": 0.8432428610399461, "language_loss": 0.61220598, "learning_rate": 3.953454520847964e-06, "loss": 0.63377249, "num_input_tokens_seen": 17216790, "step": 805, "time_per_iteration": 3.343320846557617 }, { "auxiliary_loss_clip": 0.0120862, "auxiliary_loss_mlp": 0.01044766, "balance_loss_clip": 1.06190431, "balance_loss_mlp": 1.03257108, "epoch": 0.09691576985510732, "flos": 21945514020480.0, "grad_norm": 3.0302582024622073, "language_loss": 0.73809743, "learning_rate": 3.9532872948706395e-06, "loss": 0.76063132, "num_input_tokens_seen": 17236285, "step": 806, "time_per_iteration": 2.727023124694824 }, { "auxiliary_loss_clip": 0.01217072, "auxiliary_loss_mlp": 0.01044108, "balance_loss_clip": 1.06198895, "balance_loss_mlp": 1.03218126, "epoch": 0.09703601274574641, "flos": 17965211103360.0, "grad_norm": 5.335472179863121, "language_loss": 0.82614923, "learning_rate": 3.9531197725811845e-06, "loss": 0.84876096, "num_input_tokens_seen": 17251670, "step": 807, "time_per_iteration": 2.717808246612549 }, { "auxiliary_loss_clip": 0.01246409, "auxiliary_loss_mlp": 0.01041001, "balance_loss_clip": 1.06781459, "balance_loss_mlp": 1.02967644, "epoch": 0.0971562556363855, "flos": 22162162901760.0, "grad_norm": 2.208728774567375, "language_loss": 0.87726474, "learning_rate": 3.952951954005013e-06, "loss": 0.90013891, "num_input_tokens_seen": 17271355, "step": 808, "time_per_iteration": 2.6597845554351807 }, { "auxiliary_loss_clip": 0.01213956, "auxiliary_loss_mlp": 0.01040226, "balance_loss_clip": 1.05864966, "balance_loss_mlp": 1.02823305, "epoch": 0.0972764985270246, "flos": 25848716394240.0, "grad_norm": 1.76077687542616, "language_loss": 0.84736747, "learning_rate": 3.952783839167584e-06, "loss": 0.86990929, "num_input_tokens_seen": 17291400, "step": 809, "time_per_iteration": 2.811009168624878 }, { "auxiliary_loss_clip": 0.01229025, "auxiliary_loss_mlp": 0.01041037, "balance_loss_clip": 1.06275785, "balance_loss_mlp": 1.02952099, "epoch": 0.09739674141766368, "flos": 20339373375360.0, "grad_norm": 2.9811076377233054, "language_loss": 0.74930573, "learning_rate": 3.952615428094398e-06, "loss": 0.77200639, "num_input_tokens_seen": 17310920, "step": 810, "time_per_iteration": 2.726468801498413 }, { "auxiliary_loss_clip": 0.0117841, "auxiliary_loss_mlp": 0.01047794, "balance_loss_clip": 1.05529451, "balance_loss_mlp": 1.03624308, "epoch": 0.09751698430830277, "flos": 15743059188480.0, "grad_norm": 1.9658268896634314, "language_loss": 0.73651379, "learning_rate": 3.952446720811004e-06, "loss": 0.75877583, "num_input_tokens_seen": 17329245, "step": 811, "time_per_iteration": 2.7308080196380615 }, { "auxiliary_loss_clip": 0.01102781, "auxiliary_loss_mlp": 0.01007787, "balance_loss_clip": 1.02677464, "balance_loss_mlp": 1.0030663, "epoch": 0.09763722719894186, "flos": 63716806800000.0, "grad_norm": 0.8385906141165206, "language_loss": 0.63648403, "learning_rate": 3.952277717342995e-06, "loss": 0.65758967, "num_input_tokens_seen": 17395680, "step": 812, "time_per_iteration": 3.4346439838409424 }, { "auxiliary_loss_clip": 0.01221149, "auxiliary_loss_mlp": 0.01045174, "balance_loss_clip": 1.06221056, "balance_loss_mlp": 1.03172755, "epoch": 0.09775747008958095, "flos": 22090916275200.0, "grad_norm": 1.9810125951662734, "language_loss": 0.85455412, "learning_rate": 3.952108417716009e-06, "loss": 0.87721735, "num_input_tokens_seen": 17415135, "step": 813, "time_per_iteration": 2.6561295986175537 }, { "auxiliary_loss_clip": 0.01237565, "auxiliary_loss_mlp": 0.01049593, "balance_loss_clip": 1.06900501, "balance_loss_mlp": 1.03667629, "epoch": 0.09787771298022005, "flos": 21286050272640.0, "grad_norm": 1.8005346821377377, "language_loss": 0.84861517, "learning_rate": 3.951938821955727e-06, "loss": 0.87148678, "num_input_tokens_seen": 17434535, "step": 814, "time_per_iteration": 2.6773321628570557 }, { "auxiliary_loss_clip": 0.01217216, "auxiliary_loss_mlp": 0.01038581, "balance_loss_clip": 1.06507885, "balance_loss_mlp": 1.0263685, "epoch": 0.09799795587085913, "flos": 22054574689920.0, "grad_norm": 1.6538343901511334, "language_loss": 0.76672119, "learning_rate": 3.9517689300878786e-06, "loss": 0.78927916, "num_input_tokens_seen": 17454270, "step": 815, "time_per_iteration": 2.755664825439453 }, { "auxiliary_loss_clip": 0.01248974, "auxiliary_loss_mlp": 0.0104046, "balance_loss_clip": 1.06616068, "balance_loss_mlp": 1.02866387, "epoch": 0.09811819876149823, "flos": 22163743100160.0, "grad_norm": 2.8304792893161532, "language_loss": 0.78690398, "learning_rate": 3.951598742138236e-06, "loss": 0.8097983, "num_input_tokens_seen": 17472995, "step": 816, "time_per_iteration": 2.638741970062256 }, { "auxiliary_loss_clip": 0.0122118, "auxiliary_loss_mlp": 0.01040295, "balance_loss_clip": 1.05907452, "balance_loss_mlp": 1.02880347, "epoch": 0.09823844165213731, "flos": 22231111057920.0, "grad_norm": 2.3505121621351783, "language_loss": 0.7980029, "learning_rate": 3.951428258132615e-06, "loss": 0.82061768, "num_input_tokens_seen": 17491115, "step": 817, "time_per_iteration": 2.700972318649292 }, { "auxiliary_loss_clip": 0.01219376, "auxiliary_loss_mlp": 0.01046193, "balance_loss_clip": 1.06356478, "balance_loss_mlp": 1.03339612, "epoch": 0.09835868454277641, "flos": 22487728798080.0, "grad_norm": 2.1583629528398163, "language_loss": 0.84446859, "learning_rate": 3.951257478096879e-06, "loss": 0.86712432, "num_input_tokens_seen": 17509480, "step": 818, "time_per_iteration": 2.696350336074829 }, { "auxiliary_loss_clip": 0.01216348, "auxiliary_loss_mlp": 0.00766817, "balance_loss_clip": 1.06199932, "balance_loss_mlp": 1.00056291, "epoch": 0.0984789274334155, "flos": 16362554077440.0, "grad_norm": 3.0322655562052123, "language_loss": 0.68962365, "learning_rate": 3.951086402056936e-06, "loss": 0.70945525, "num_input_tokens_seen": 17524080, "step": 819, "time_per_iteration": 2.6997785568237305 }, { "auxiliary_loss_clip": 0.01156669, "auxiliary_loss_mlp": 0.00766672, "balance_loss_clip": 1.06087875, "balance_loss_mlp": 1.0006578, "epoch": 0.09859917032405459, "flos": 24243545416320.0, "grad_norm": 1.6867696278171282, "language_loss": 0.83701909, "learning_rate": 3.950915030038735e-06, "loss": 0.85625255, "num_input_tokens_seen": 17543875, "step": 820, "time_per_iteration": 2.8319664001464844 }, { "auxiliary_loss_clip": 0.01231705, "auxiliary_loss_mlp": 0.01042469, "balance_loss_clip": 1.06715667, "balance_loss_mlp": 1.03054237, "epoch": 0.09871941321469369, "flos": 17420195064960.0, "grad_norm": 2.57055421419102, "language_loss": 0.83507842, "learning_rate": 3.9507433620682765e-06, "loss": 0.85782015, "num_input_tokens_seen": 17560810, "step": 821, "time_per_iteration": 2.699984073638916 }, { "auxiliary_loss_clip": 0.01198573, "auxiliary_loss_mlp": 0.01051772, "balance_loss_clip": 1.0578922, "balance_loss_mlp": 1.0386529, "epoch": 0.09883965610533277, "flos": 28477341590400.0, "grad_norm": 1.7482413551045455, "language_loss": 0.88257635, "learning_rate": 3.9505713981716e-06, "loss": 0.90507984, "num_input_tokens_seen": 17583640, "step": 822, "time_per_iteration": 2.754671096801758 }, { "auxiliary_loss_clip": 0.01216927, "auxiliary_loss_mlp": 0.01040434, "balance_loss_clip": 1.06484795, "balance_loss_mlp": 1.02879381, "epoch": 0.09895989899597187, "flos": 23693932437120.0, "grad_norm": 1.8933121638813315, "language_loss": 0.81398875, "learning_rate": 3.950399138374795e-06, "loss": 0.8365624, "num_input_tokens_seen": 17602720, "step": 823, "time_per_iteration": 3.579130172729492 }, { "auxiliary_loss_clip": 0.01229542, "auxiliary_loss_mlp": 0.0104318, "balance_loss_clip": 1.06298971, "balance_loss_mlp": 1.03083038, "epoch": 0.09908014188661095, "flos": 24679608526080.0, "grad_norm": 1.7437643297725411, "language_loss": 0.74301416, "learning_rate": 3.95022658270399e-06, "loss": 0.76574141, "num_input_tokens_seen": 17623085, "step": 824, "time_per_iteration": 2.7581188678741455 }, { "auxiliary_loss_clip": 0.01215478, "auxiliary_loss_mlp": 0.01057243, "balance_loss_clip": 1.06428027, "balance_loss_mlp": 1.04398715, "epoch": 0.09920038477725004, "flos": 14064307200000.0, "grad_norm": 1.919964366209745, "language_loss": 0.78227514, "learning_rate": 3.9500537311853635e-06, "loss": 0.80500233, "num_input_tokens_seen": 17641040, "step": 825, "time_per_iteration": 3.624086618423462 }, { "auxiliary_loss_clip": 0.01228949, "auxiliary_loss_mlp": 0.01044834, "balance_loss_clip": 1.06006539, "balance_loss_mlp": 1.03249037, "epoch": 0.09932062766788914, "flos": 13407070095360.0, "grad_norm": 3.6825376871893805, "language_loss": 0.83098298, "learning_rate": 3.949880583845136e-06, "loss": 0.85372078, "num_input_tokens_seen": 17659115, "step": 826, "time_per_iteration": 2.7161850929260254 }, { "auxiliary_loss_clip": 0.01213039, "auxiliary_loss_mlp": 0.01046911, "balance_loss_clip": 1.05997181, "balance_loss_mlp": 1.03409052, "epoch": 0.09944087055852822, "flos": 19500751566720.0, "grad_norm": 2.0513737128805816, "language_loss": 0.81416333, "learning_rate": 3.949707140709575e-06, "loss": 0.83676291, "num_input_tokens_seen": 17678845, "step": 827, "time_per_iteration": 4.519668102264404 }, { "auxiliary_loss_clip": 0.01232799, "auxiliary_loss_mlp": 0.01049576, "balance_loss_clip": 1.06239641, "balance_loss_mlp": 1.03704143, "epoch": 0.09956111344916732, "flos": 17749100926080.0, "grad_norm": 2.4230774788766247, "language_loss": 0.83359945, "learning_rate": 3.949533401804991e-06, "loss": 0.85642326, "num_input_tokens_seen": 17695750, "step": 828, "time_per_iteration": 2.673691749572754 }, { "auxiliary_loss_clip": 0.0123112, "auxiliary_loss_mlp": 0.00766235, "balance_loss_clip": 1.06533718, "balance_loss_mlp": 1.00062501, "epoch": 0.0996813563398064, "flos": 17967581400960.0, "grad_norm": 4.1966609025211445, "language_loss": 0.91040891, "learning_rate": 3.949359367157739e-06, "loss": 0.93038237, "num_input_tokens_seen": 17714445, "step": 829, "time_per_iteration": 2.806229591369629 }, { "auxiliary_loss_clip": 0.01236456, "auxiliary_loss_mlp": 0.01045063, "balance_loss_clip": 1.06555796, "balance_loss_mlp": 1.03184843, "epoch": 0.0998015992304455, "flos": 17457039440640.0, "grad_norm": 3.4249463196305547, "language_loss": 0.75415516, "learning_rate": 3.949185036794222e-06, "loss": 0.77697027, "num_input_tokens_seen": 17732455, "step": 830, "time_per_iteration": 2.819215774536133 }, { "auxiliary_loss_clip": 0.01245195, "auxiliary_loss_mlp": 0.01039655, "balance_loss_clip": 1.06576025, "balance_loss_mlp": 1.02757907, "epoch": 0.0999218421210846, "flos": 25888757080320.0, "grad_norm": 1.804329408124379, "language_loss": 0.78951228, "learning_rate": 3.949010410740884e-06, "loss": 0.81236076, "num_input_tokens_seen": 17755280, "step": 831, "time_per_iteration": 2.6853208541870117 }, { "auxiliary_loss_clip": 0.01206823, "auxiliary_loss_mlp": 0.00766965, "balance_loss_clip": 1.05958438, "balance_loss_mlp": 1.00052989, "epoch": 0.10004208501172368, "flos": 21215916967680.0, "grad_norm": 1.8601490531013611, "language_loss": 0.86316192, "learning_rate": 3.948835489024216e-06, "loss": 0.88289976, "num_input_tokens_seen": 17775015, "step": 832, "time_per_iteration": 2.9020628929138184 }, { "auxiliary_loss_clip": 0.01234539, "auxiliary_loss_mlp": 0.01036898, "balance_loss_clip": 1.06402206, "balance_loss_mlp": 1.02522159, "epoch": 0.10016232790236278, "flos": 17348409734400.0, "grad_norm": 2.2162637357167942, "language_loss": 0.90873241, "learning_rate": 3.948660271670755e-06, "loss": 0.93144679, "num_input_tokens_seen": 17792165, "step": 833, "time_per_iteration": 2.6379408836364746 }, { "auxiliary_loss_clip": 0.01212123, "auxiliary_loss_mlp": 0.010404, "balance_loss_clip": 1.06154728, "balance_loss_mlp": 1.02763271, "epoch": 0.10028257079300186, "flos": 25666541591040.0, "grad_norm": 2.4131961333562812, "language_loss": 0.84368348, "learning_rate": 3.948484758707079e-06, "loss": 0.86620867, "num_input_tokens_seen": 17811765, "step": 834, "time_per_iteration": 2.772630453109741 }, { "auxiliary_loss_clip": 0.01192461, "auxiliary_loss_mlp": 0.01042359, "balance_loss_clip": 1.05893183, "balance_loss_mlp": 1.03000855, "epoch": 0.10040281368364096, "flos": 25156035544320.0, "grad_norm": 2.3999315447902525, "language_loss": 0.83499879, "learning_rate": 3.948308950159815e-06, "loss": 0.85734701, "num_input_tokens_seen": 17830445, "step": 835, "time_per_iteration": 2.7213056087493896 }, { "auxiliary_loss_clip": 0.01194534, "auxiliary_loss_mlp": 0.0103827, "balance_loss_clip": 1.05565333, "balance_loss_mlp": 1.0252409, "epoch": 0.10052305657428004, "flos": 17603303621760.0, "grad_norm": 2.6071503651608645, "language_loss": 0.76302922, "learning_rate": 3.9481328460556326e-06, "loss": 0.7853573, "num_input_tokens_seen": 17847665, "step": 836, "time_per_iteration": 2.7633168697357178 }, { "auxiliary_loss_clip": 0.01207646, "auxiliary_loss_mlp": 0.01045346, "balance_loss_clip": 1.0585171, "balance_loss_mlp": 1.0316782, "epoch": 0.10064329946491914, "flos": 18660154510080.0, "grad_norm": 2.180934301004454, "language_loss": 0.8981961, "learning_rate": 3.9479564464212455e-06, "loss": 0.92072606, "num_input_tokens_seen": 17866825, "step": 837, "time_per_iteration": 2.7888665199279785 }, { "auxiliary_loss_clip": 0.0125124, "auxiliary_loss_mlp": 0.01043464, "balance_loss_clip": 1.06414366, "balance_loss_mlp": 1.02960014, "epoch": 0.10076354235555823, "flos": 17199056983680.0, "grad_norm": 3.0295685894645743, "language_loss": 0.76489699, "learning_rate": 3.947779751283414e-06, "loss": 0.78784394, "num_input_tokens_seen": 17883995, "step": 838, "time_per_iteration": 2.672478199005127 }, { "auxiliary_loss_clip": 0.01235008, "auxiliary_loss_mlp": 0.00766695, "balance_loss_clip": 1.07028317, "balance_loss_mlp": 1.0005393, "epoch": 0.10088378524619732, "flos": 22962252395520.0, "grad_norm": 2.0621387202742034, "language_loss": 0.76234341, "learning_rate": 3.947602760668944e-06, "loss": 0.78236043, "num_input_tokens_seen": 17903785, "step": 839, "time_per_iteration": 2.6979777812957764 }, { "auxiliary_loss_clip": 0.01235178, "auxiliary_loss_mlp": 0.01047862, "balance_loss_clip": 1.06732488, "balance_loss_mlp": 1.03448641, "epoch": 0.10100402813683641, "flos": 37885828746240.0, "grad_norm": 2.235870548688462, "language_loss": 0.71517932, "learning_rate": 3.947425474604684e-06, "loss": 0.73800969, "num_input_tokens_seen": 17927720, "step": 840, "time_per_iteration": 2.867412567138672 }, { "auxiliary_loss_clip": 0.01218493, "auxiliary_loss_mlp": 0.0104513, "balance_loss_clip": 1.06271315, "balance_loss_mlp": 1.03249359, "epoch": 0.1011242710274755, "flos": 21543458112000.0, "grad_norm": 2.01283865293059, "language_loss": 0.92273867, "learning_rate": 3.947247893117528e-06, "loss": 0.94537497, "num_input_tokens_seen": 17946225, "step": 841, "time_per_iteration": 2.6880884170532227 }, { "auxiliary_loss_clip": 0.01225586, "auxiliary_loss_mlp": 0.01051025, "balance_loss_clip": 1.0623318, "balance_loss_mlp": 1.03804302, "epoch": 0.10124451391811459, "flos": 13621456419840.0, "grad_norm": 3.650204804012978, "language_loss": 0.69128406, "learning_rate": 3.947070016234413e-06, "loss": 0.71405011, "num_input_tokens_seen": 17962015, "step": 842, "time_per_iteration": 2.7309603691101074 }, { "auxiliary_loss_clip": 0.01228004, "auxiliary_loss_mlp": 0.01049791, "balance_loss_clip": 1.06637359, "balance_loss_mlp": 1.03687525, "epoch": 0.10136475680875369, "flos": 16649228522880.0, "grad_norm": 2.5382925418729325, "language_loss": 0.75212044, "learning_rate": 3.946891843982326e-06, "loss": 0.77489841, "num_input_tokens_seen": 17979680, "step": 843, "time_per_iteration": 2.7245867252349854 }, { "auxiliary_loss_clip": 0.01233496, "auxiliary_loss_mlp": 0.01040563, "balance_loss_clip": 1.06793272, "balance_loss_mlp": 1.02733064, "epoch": 0.10148499969939277, "flos": 19461034103040.0, "grad_norm": 2.0780711712283986, "language_loss": 0.745902, "learning_rate": 3.9467133763882935e-06, "loss": 0.76864254, "num_input_tokens_seen": 17998145, "step": 844, "time_per_iteration": 2.7084572315216064 }, { "auxiliary_loss_clip": 0.01221951, "auxiliary_loss_mlp": 0.01047245, "balance_loss_clip": 1.0620538, "balance_loss_mlp": 1.03374505, "epoch": 0.10160524259003187, "flos": 21104988791040.0, "grad_norm": 2.0014232400570613, "language_loss": 0.85912716, "learning_rate": 3.9465346134793905e-06, "loss": 0.88181913, "num_input_tokens_seen": 18017955, "step": 845, "time_per_iteration": 2.690216064453125 }, { "auxiliary_loss_clip": 0.01200027, "auxiliary_loss_mlp": 0.01036313, "balance_loss_clip": 1.06252456, "balance_loss_mlp": 1.02404058, "epoch": 0.10172548548067095, "flos": 17712687513600.0, "grad_norm": 2.0335756715779207, "language_loss": 0.79773676, "learning_rate": 3.9463555552827335e-06, "loss": 0.82010019, "num_input_tokens_seen": 18035125, "step": 846, "time_per_iteration": 2.775200605392456 }, { "auxiliary_loss_clip": 0.01222065, "auxiliary_loss_mlp": 0.01046855, "balance_loss_clip": 1.06207514, "balance_loss_mlp": 1.03443956, "epoch": 0.10184572837131005, "flos": 21104845136640.0, "grad_norm": 5.183985209479954, "language_loss": 0.86436945, "learning_rate": 3.946176201825487e-06, "loss": 0.88705868, "num_input_tokens_seen": 18053160, "step": 847, "time_per_iteration": 2.7158937454223633 }, { "auxiliary_loss_clip": 0.01217622, "auxiliary_loss_mlp": 0.0104694, "balance_loss_clip": 1.06430387, "balance_loss_mlp": 1.03382099, "epoch": 0.10196597126194913, "flos": 26067591918720.0, "grad_norm": 2.226895493962201, "language_loss": 0.83666468, "learning_rate": 3.9459965531348575e-06, "loss": 0.85931039, "num_input_tokens_seen": 18072815, "step": 848, "time_per_iteration": 2.797443151473999 }, { "auxiliary_loss_clip": 0.01214277, "auxiliary_loss_mlp": 0.00766284, "balance_loss_clip": 1.0604893, "balance_loss_mlp": 1.00050855, "epoch": 0.10208621415258823, "flos": 29314634595840.0, "grad_norm": 2.1051017775835636, "language_loss": 0.85618114, "learning_rate": 3.945816609238098e-06, "loss": 0.87598681, "num_input_tokens_seen": 18092225, "step": 849, "time_per_iteration": 3.7540879249572754 }, { "auxiliary_loss_clip": 0.01183873, "auxiliary_loss_mlp": 0.01043565, "balance_loss_clip": 1.05978727, "balance_loss_mlp": 1.02994514, "epoch": 0.10220645704322733, "flos": 23805794367360.0, "grad_norm": 1.8746519610004226, "language_loss": 0.85068768, "learning_rate": 3.945636370162507e-06, "loss": 0.872962, "num_input_tokens_seen": 18112335, "step": 850, "time_per_iteration": 2.7986133098602295 }, { "auxiliary_loss_clip": 0.01231125, "auxiliary_loss_mlp": 0.01046044, "balance_loss_clip": 1.06457877, "balance_loss_mlp": 1.03362894, "epoch": 0.10232669993386641, "flos": 23218546913280.0, "grad_norm": 1.9147581704216472, "language_loss": 0.79162002, "learning_rate": 3.945455835935425e-06, "loss": 0.81439173, "num_input_tokens_seen": 18131520, "step": 851, "time_per_iteration": 2.6855945587158203 }, { "auxiliary_loss_clip": 0.01215669, "auxiliary_loss_mlp": 0.01052345, "balance_loss_clip": 1.05991721, "balance_loss_mlp": 1.0390408, "epoch": 0.1024469428245055, "flos": 22922929981440.0, "grad_norm": 2.3633033551218015, "language_loss": 0.75384748, "learning_rate": 3.94527500658424e-06, "loss": 0.77652764, "num_input_tokens_seen": 18149185, "step": 852, "time_per_iteration": 3.6297285556793213 }, { "auxiliary_loss_clip": 0.01186232, "auxiliary_loss_mlp": 0.01038383, "balance_loss_clip": 1.06101906, "balance_loss_mlp": 1.02588415, "epoch": 0.10256718571514459, "flos": 31359495957120.0, "grad_norm": 2.1535591545344155, "language_loss": 0.81323493, "learning_rate": 3.945093882136382e-06, "loss": 0.83548105, "num_input_tokens_seen": 18172960, "step": 853, "time_per_iteration": 3.778043270111084 }, { "auxiliary_loss_clip": 0.01221343, "auxiliary_loss_mlp": 0.00766406, "balance_loss_clip": 1.06723404, "balance_loss_mlp": 1.00042975, "epoch": 0.10268742860578368, "flos": 23474877344640.0, "grad_norm": 1.966549455973609, "language_loss": 0.84810567, "learning_rate": 3.944912462619329e-06, "loss": 0.8679831, "num_input_tokens_seen": 18191925, "step": 854, "time_per_iteration": 2.752828359603882 }, { "auxiliary_loss_clip": 0.01221739, "auxiliary_loss_mlp": 0.01057265, "balance_loss_clip": 1.06446409, "balance_loss_mlp": 1.04326415, "epoch": 0.10280767149642277, "flos": 25520313323520.0, "grad_norm": 4.54777152088384, "language_loss": 0.80602741, "learning_rate": 3.9447307480606025e-06, "loss": 0.82881743, "num_input_tokens_seen": 18212010, "step": 855, "time_per_iteration": 2.7974417209625244 }, { "auxiliary_loss_clip": 0.01207324, "auxiliary_loss_mlp": 0.01050072, "balance_loss_clip": 1.06220841, "balance_loss_mlp": 1.03693485, "epoch": 0.10292791438706186, "flos": 17347691462400.0, "grad_norm": 2.1737772003421774, "language_loss": 0.90353853, "learning_rate": 3.944548738487767e-06, "loss": 0.92611241, "num_input_tokens_seen": 18229525, "step": 856, "time_per_iteration": 2.7191638946533203 }, { "auxiliary_loss_clip": 0.01256104, "auxiliary_loss_mlp": 0.01051583, "balance_loss_clip": 1.07107818, "balance_loss_mlp": 1.0380888, "epoch": 0.10304815727770096, "flos": 27052693390080.0, "grad_norm": 2.652260681960219, "language_loss": 0.90757865, "learning_rate": 3.944366433928434e-06, "loss": 0.93065554, "num_input_tokens_seen": 18249505, "step": 857, "time_per_iteration": 2.7114450931549072 }, { "auxiliary_loss_clip": 0.01208388, "auxiliary_loss_mlp": 0.01038074, "balance_loss_clip": 1.05919623, "balance_loss_mlp": 1.02617133, "epoch": 0.10316840016834004, "flos": 22782591544320.0, "grad_norm": 2.224065329937135, "language_loss": 0.83720666, "learning_rate": 3.9441838344102594e-06, "loss": 0.85967129, "num_input_tokens_seen": 18269230, "step": 858, "time_per_iteration": 2.75365948677063 }, { "auxiliary_loss_clip": 0.01223887, "auxiliary_loss_mlp": 0.01035486, "balance_loss_clip": 1.06539249, "balance_loss_mlp": 1.02287972, "epoch": 0.10328864305897914, "flos": 20704584908160.0, "grad_norm": 3.03049128011679, "language_loss": 0.67695588, "learning_rate": 3.944000939960943e-06, "loss": 0.69954956, "num_input_tokens_seen": 18287955, "step": 859, "time_per_iteration": 2.8460476398468018 }, { "auxiliary_loss_clip": 0.01237835, "auxiliary_loss_mlp": 0.01051466, "balance_loss_clip": 1.06538081, "balance_loss_mlp": 1.0390209, "epoch": 0.10340888594961822, "flos": 28478814048000.0, "grad_norm": 1.6342849119935081, "language_loss": 0.79977047, "learning_rate": 3.943817750608229e-06, "loss": 0.82266343, "num_input_tokens_seen": 18310505, "step": 860, "time_per_iteration": 2.7784371376037598 }, { "auxiliary_loss_clip": 0.01237584, "auxiliary_loss_mlp": 0.01046214, "balance_loss_clip": 1.06813419, "balance_loss_mlp": 1.03424001, "epoch": 0.10352912884025732, "flos": 13370333460480.0, "grad_norm": 2.870927249681166, "language_loss": 0.81979513, "learning_rate": 3.943634266379908e-06, "loss": 0.84263313, "num_input_tokens_seen": 18327400, "step": 861, "time_per_iteration": 2.6743931770324707 }, { "auxiliary_loss_clip": 0.01234963, "auxiliary_loss_mlp": 0.0104742, "balance_loss_clip": 1.06387401, "balance_loss_mlp": 1.03484905, "epoch": 0.10364937173089642, "flos": 25558558329600.0, "grad_norm": 8.70984526721795, "language_loss": 0.8492893, "learning_rate": 3.943450487303815e-06, "loss": 0.87211311, "num_input_tokens_seen": 18347895, "step": 862, "time_per_iteration": 2.7945656776428223 }, { "auxiliary_loss_clip": 0.01232137, "auxiliary_loss_mlp": 0.01042822, "balance_loss_clip": 1.06670702, "balance_loss_mlp": 1.02935171, "epoch": 0.1037696146215355, "flos": 21215486004480.0, "grad_norm": 2.724199041010767, "language_loss": 0.85389996, "learning_rate": 3.943266413407827e-06, "loss": 0.87664956, "num_input_tokens_seen": 18367170, "step": 863, "time_per_iteration": 2.6405787467956543 }, { "auxiliary_loss_clip": 0.01236893, "auxiliary_loss_mlp": 0.01041858, "balance_loss_clip": 1.0665375, "balance_loss_mlp": 1.03008652, "epoch": 0.1038898575121746, "flos": 25807382818560.0, "grad_norm": 2.0212765000930055, "language_loss": 0.85204262, "learning_rate": 3.94308204471987e-06, "loss": 0.87483019, "num_input_tokens_seen": 18386185, "step": 864, "time_per_iteration": 2.736572504043579 }, { "auxiliary_loss_clip": 0.01202718, "auxiliary_loss_mlp": 0.01045815, "balance_loss_clip": 1.06030226, "balance_loss_mlp": 1.03366148, "epoch": 0.10401010040281368, "flos": 19062425900160.0, "grad_norm": 2.38618981585464, "language_loss": 0.74333978, "learning_rate": 3.942897381267912e-06, "loss": 0.76582509, "num_input_tokens_seen": 18402550, "step": 865, "time_per_iteration": 2.6789400577545166 }, { "auxiliary_loss_clip": 0.0123074, "auxiliary_loss_mlp": 0.01047364, "balance_loss_clip": 1.06444407, "balance_loss_mlp": 1.03462625, "epoch": 0.10413034329345278, "flos": 16355119962240.0, "grad_norm": 2.4417871316530664, "language_loss": 0.66440886, "learning_rate": 3.942712423079965e-06, "loss": 0.68718994, "num_input_tokens_seen": 18418940, "step": 866, "time_per_iteration": 2.648188829421997 }, { "auxiliary_loss_clip": 0.01183109, "auxiliary_loss_mlp": 0.01040896, "balance_loss_clip": 1.05025387, "balance_loss_mlp": 1.02853405, "epoch": 0.10425058618409186, "flos": 17236511890560.0, "grad_norm": 2.4365805615118528, "language_loss": 0.90076077, "learning_rate": 3.942527170184088e-06, "loss": 0.92300081, "num_input_tokens_seen": 18435560, "step": 867, "time_per_iteration": 2.6601991653442383 }, { "auxiliary_loss_clip": 0.01247976, "auxiliary_loss_mlp": 0.01045774, "balance_loss_clip": 1.06698036, "balance_loss_mlp": 1.03282166, "epoch": 0.10437082907473096, "flos": 17967365919360.0, "grad_norm": 2.6423603573592445, "language_loss": 0.78080183, "learning_rate": 3.942341622608385e-06, "loss": 0.80373931, "num_input_tokens_seen": 18452590, "step": 868, "time_per_iteration": 2.705995559692383 }, { "auxiliary_loss_clip": 0.01218664, "auxiliary_loss_mlp": 0.01041669, "balance_loss_clip": 1.06767201, "balance_loss_mlp": 1.02925324, "epoch": 0.10449107196537005, "flos": 36283315374720.0, "grad_norm": 1.6496802367269785, "language_loss": 0.77794516, "learning_rate": 3.942155780381001e-06, "loss": 0.80054843, "num_input_tokens_seen": 18476325, "step": 869, "time_per_iteration": 2.814772367477417 }, { "auxiliary_loss_clip": 0.01217448, "auxiliary_loss_mlp": 0.01043533, "balance_loss_clip": 1.06173706, "balance_loss_mlp": 1.03024697, "epoch": 0.10461131485600914, "flos": 23802095266560.0, "grad_norm": 1.9026002993777102, "language_loss": 0.75815982, "learning_rate": 3.94196964353013e-06, "loss": 0.78076965, "num_input_tokens_seen": 18495775, "step": 870, "time_per_iteration": 2.7935009002685547 }, { "auxiliary_loss_clip": 0.01211194, "auxiliary_loss_mlp": 0.00766155, "balance_loss_clip": 1.05718327, "balance_loss_mlp": 1.00043547, "epoch": 0.10473155774664823, "flos": 18405476104320.0, "grad_norm": 2.2179909265834206, "language_loss": 0.80854946, "learning_rate": 3.941783212084008e-06, "loss": 0.82832295, "num_input_tokens_seen": 18513530, "step": 871, "time_per_iteration": 2.635150194168091 }, { "auxiliary_loss_clip": 0.01204605, "auxiliary_loss_mlp": 0.01041876, "balance_loss_clip": 1.06213641, "balance_loss_mlp": 1.02898359, "epoch": 0.10485180063728732, "flos": 25592637358080.0, "grad_norm": 2.801892732330398, "language_loss": 0.78991276, "learning_rate": 3.941596486070916e-06, "loss": 0.81237757, "num_input_tokens_seen": 18531575, "step": 872, "time_per_iteration": 2.809086561203003 }, { "auxiliary_loss_clip": 0.01179668, "auxiliary_loss_mlp": 0.01038859, "balance_loss_clip": 1.06103826, "balance_loss_mlp": 1.02588856, "epoch": 0.10497204352792641, "flos": 27088747666560.0, "grad_norm": 3.041006339393339, "language_loss": 0.58689111, "learning_rate": 3.941409465519182e-06, "loss": 0.60907638, "num_input_tokens_seen": 18552100, "step": 873, "time_per_iteration": 2.810176372528076 }, { "auxiliary_loss_clip": 0.01222089, "auxiliary_loss_mlp": 0.01038217, "balance_loss_clip": 1.06166553, "balance_loss_mlp": 1.02528894, "epoch": 0.10509228641856551, "flos": 32858479353600.0, "grad_norm": 1.6296496474046385, "language_loss": 0.8528837, "learning_rate": 3.941222150457176e-06, "loss": 0.87548673, "num_input_tokens_seen": 18575355, "step": 874, "time_per_iteration": 2.801281213760376 }, { "auxiliary_loss_clip": 0.01236626, "auxiliary_loss_mlp": 0.01045921, "balance_loss_clip": 1.06380415, "balance_loss_mlp": 1.03333282, "epoch": 0.10521252930920459, "flos": 14319165173760.0, "grad_norm": 2.7069906480962205, "language_loss": 0.72073054, "learning_rate": 3.941034540913311e-06, "loss": 0.74355596, "num_input_tokens_seen": 18592885, "step": 875, "time_per_iteration": 3.513859510421753 }, { "auxiliary_loss_clip": 0.01236293, "auxiliary_loss_mlp": 0.00767022, "balance_loss_clip": 1.06554723, "balance_loss_mlp": 1.0003649, "epoch": 0.10533277219984369, "flos": 21687028773120.0, "grad_norm": 1.6527549340217902, "language_loss": 0.8231498, "learning_rate": 3.940846636916051e-06, "loss": 0.84318292, "num_input_tokens_seen": 18612920, "step": 876, "time_per_iteration": 2.6228067874908447 }, { "auxiliary_loss_clip": 0.01218215, "auxiliary_loss_mlp": 0.01043144, "balance_loss_clip": 1.06558788, "balance_loss_mlp": 1.03044868, "epoch": 0.10545301509048277, "flos": 22269787027200.0, "grad_norm": 2.1469769928113696, "language_loss": 0.86653167, "learning_rate": 3.940658438493899e-06, "loss": 0.88914526, "num_input_tokens_seen": 18630765, "step": 877, "time_per_iteration": 2.7319178581237793 }, { "auxiliary_loss_clip": 0.01249226, "auxiliary_loss_mlp": 0.01042402, "balance_loss_clip": 1.06460428, "balance_loss_mlp": 1.02933705, "epoch": 0.10557325798112187, "flos": 22199725549440.0, "grad_norm": 2.866454860164799, "language_loss": 0.75677502, "learning_rate": 3.940469945675405e-06, "loss": 0.77969128, "num_input_tokens_seen": 18649150, "step": 878, "time_per_iteration": 3.5418338775634766 }, { "auxiliary_loss_clip": 0.01166809, "auxiliary_loss_mlp": 0.01048354, "balance_loss_clip": 1.05872774, "balance_loss_mlp": 1.03643882, "epoch": 0.10569350087176095, "flos": 25775889569280.0, "grad_norm": 2.18972083473854, "language_loss": 0.91538298, "learning_rate": 3.940281158489163e-06, "loss": 0.93753463, "num_input_tokens_seen": 18668380, "step": 879, "time_per_iteration": 4.611927032470703 }, { "auxiliary_loss_clip": 0.01165435, "auxiliary_loss_mlp": 0.01043414, "balance_loss_clip": 1.05344748, "balance_loss_mlp": 1.03068197, "epoch": 0.10581374376240005, "flos": 17311385790720.0, "grad_norm": 1.8625438836957642, "language_loss": 0.82948434, "learning_rate": 3.940092076963812e-06, "loss": 0.85157275, "num_input_tokens_seen": 18685875, "step": 880, "time_per_iteration": 2.7155847549438477 }, { "auxiliary_loss_clip": 0.01215662, "auxiliary_loss_mlp": 0.01047218, "balance_loss_clip": 1.06050062, "balance_loss_mlp": 1.03408098, "epoch": 0.10593398665303914, "flos": 34349454017280.0, "grad_norm": 2.1203893087907835, "language_loss": 0.79066795, "learning_rate": 3.9399027011280355e-06, "loss": 0.81329674, "num_input_tokens_seen": 18707970, "step": 881, "time_per_iteration": 2.8114402294158936 }, { "auxiliary_loss_clip": 0.01220132, "auxiliary_loss_mlp": 0.01040243, "balance_loss_clip": 1.06882739, "balance_loss_mlp": 1.02670681, "epoch": 0.10605422954367823, "flos": 23257977068160.0, "grad_norm": 2.013315060915039, "language_loss": 0.77519238, "learning_rate": 3.939713031010561e-06, "loss": 0.79779613, "num_input_tokens_seen": 18726335, "step": 882, "time_per_iteration": 2.6769256591796875 }, { "auxiliary_loss_clip": 0.01200588, "auxiliary_loss_mlp": 0.01040384, "balance_loss_clip": 1.06201875, "balance_loss_mlp": 1.02829576, "epoch": 0.10617447243431732, "flos": 22820118278400.0, "grad_norm": 1.958399565876181, "language_loss": 0.77936989, "learning_rate": 3.939523066640163e-06, "loss": 0.80177963, "num_input_tokens_seen": 18745230, "step": 883, "time_per_iteration": 2.7463083267211914 }, { "auxiliary_loss_clip": 0.01236378, "auxiliary_loss_mlp": 0.01035566, "balance_loss_clip": 1.06654358, "balance_loss_mlp": 1.02310896, "epoch": 0.10629471532495641, "flos": 24386577373440.0, "grad_norm": 1.9980486137166447, "language_loss": 0.8122673, "learning_rate": 3.939332808045657e-06, "loss": 0.83498675, "num_input_tokens_seen": 18764880, "step": 884, "time_per_iteration": 2.7339131832122803 }, { "auxiliary_loss_clip": 0.01201727, "auxiliary_loss_mlp": 0.01036475, "balance_loss_clip": 1.06278765, "balance_loss_mlp": 1.02330828, "epoch": 0.1064149582155955, "flos": 21105491581440.0, "grad_norm": 1.808275246480227, "language_loss": 0.84684676, "learning_rate": 3.939142255255906e-06, "loss": 0.86922872, "num_input_tokens_seen": 18785765, "step": 885, "time_per_iteration": 2.7635855674743652 }, { "auxiliary_loss_clip": 0.01229297, "auxiliary_loss_mlp": 0.0105358, "balance_loss_clip": 1.06376374, "balance_loss_mlp": 1.04108059, "epoch": 0.1065352011062346, "flos": 20702035042560.0, "grad_norm": 2.1588754332130313, "language_loss": 0.86845803, "learning_rate": 3.938951408299817e-06, "loss": 0.89128685, "num_input_tokens_seen": 18804605, "step": 886, "time_per_iteration": 2.6421570777893066 }, { "auxiliary_loss_clip": 0.0111838, "auxiliary_loss_mlp": 0.01033337, "balance_loss_clip": 1.05633283, "balance_loss_mlp": 1.02909327, "epoch": 0.10665544399687368, "flos": 62659632689280.0, "grad_norm": 0.8080971349885397, "language_loss": 0.54477358, "learning_rate": 3.938760267206342e-06, "loss": 0.56629074, "num_input_tokens_seen": 18866425, "step": 887, "time_per_iteration": 3.50337290763855 }, { "auxiliary_loss_clip": 0.01246517, "auxiliary_loss_mlp": 0.0104142, "balance_loss_clip": 1.06672513, "balance_loss_mlp": 1.02877152, "epoch": 0.10677568688751278, "flos": 26140382830080.0, "grad_norm": 2.286074217334298, "language_loss": 0.78923786, "learning_rate": 3.938568832004475e-06, "loss": 0.81211722, "num_input_tokens_seen": 18885130, "step": 888, "time_per_iteration": 2.8928821086883545 }, { "auxiliary_loss_clip": 0.01205005, "auxiliary_loss_mlp": 0.0103782, "balance_loss_clip": 1.06015253, "balance_loss_mlp": 1.02467132, "epoch": 0.10689592977815186, "flos": 12786533712000.0, "grad_norm": 1.9285321127176778, "language_loss": 0.75394833, "learning_rate": 3.938377102723257e-06, "loss": 0.77637655, "num_input_tokens_seen": 18902265, "step": 889, "time_per_iteration": 2.669562578201294 }, { "auxiliary_loss_clip": 0.01169904, "auxiliary_loss_mlp": 0.01051975, "balance_loss_clip": 1.05648482, "balance_loss_mlp": 1.03873694, "epoch": 0.10701617266879096, "flos": 22126683242880.0, "grad_norm": 4.175362228161861, "language_loss": 0.83613646, "learning_rate": 3.938185079391774e-06, "loss": 0.85835522, "num_input_tokens_seen": 18919310, "step": 890, "time_per_iteration": 2.758146286010742 }, { "auxiliary_loss_clip": 0.01247529, "auxiliary_loss_mlp": 0.01044105, "balance_loss_clip": 1.06683469, "balance_loss_mlp": 1.03139114, "epoch": 0.10713641555943004, "flos": 19745625559680.0, "grad_norm": 2.4461098603454707, "language_loss": 1.06419182, "learning_rate": 3.937992762039157e-06, "loss": 1.08710814, "num_input_tokens_seen": 18932635, "step": 891, "time_per_iteration": 2.5805094242095947 }, { "auxiliary_loss_clip": 0.01233391, "auxiliary_loss_mlp": 0.01046433, "balance_loss_clip": 1.06718135, "balance_loss_mlp": 1.03498936, "epoch": 0.10725665845006914, "flos": 23952992302080.0, "grad_norm": 1.6299187110477094, "language_loss": 0.80522287, "learning_rate": 3.937800150694577e-06, "loss": 0.82802105, "num_input_tokens_seen": 18953810, "step": 892, "time_per_iteration": 2.689061403274536 }, { "auxiliary_loss_clip": 0.01189344, "auxiliary_loss_mlp": 0.01039296, "balance_loss_clip": 1.06128716, "balance_loss_mlp": 1.02690446, "epoch": 0.10737690134070824, "flos": 18551704371840.0, "grad_norm": 2.304122150003733, "language_loss": 0.76276076, "learning_rate": 3.937607245387255e-06, "loss": 0.78504717, "num_input_tokens_seen": 18973175, "step": 893, "time_per_iteration": 2.816016435623169 }, { "auxiliary_loss_clip": 0.01217855, "auxiliary_loss_mlp": 0.01040004, "balance_loss_clip": 1.06032777, "balance_loss_mlp": 1.02763629, "epoch": 0.10749714423134732, "flos": 22707609903360.0, "grad_norm": 2.104445150080955, "language_loss": 0.72159636, "learning_rate": 3.937414046146455e-06, "loss": 0.74417496, "num_input_tokens_seen": 18991130, "step": 894, "time_per_iteration": 2.7783203125 }, { "auxiliary_loss_clip": 0.01250052, "auxiliary_loss_mlp": 0.01051367, "balance_loss_clip": 1.06977296, "balance_loss_mlp": 1.03799152, "epoch": 0.10761738712198642, "flos": 21106066199040.0, "grad_norm": 3.602496869310269, "language_loss": 0.75437546, "learning_rate": 3.9372205530014845e-06, "loss": 0.77738965, "num_input_tokens_seen": 19009610, "step": 895, "time_per_iteration": 2.649681568145752 }, { "auxiliary_loss_clip": 0.01248769, "auxiliary_loss_mlp": 0.01041265, "balance_loss_clip": 1.06555653, "balance_loss_mlp": 1.02809262, "epoch": 0.1077376300126255, "flos": 23766723348480.0, "grad_norm": 2.6405859602332677, "language_loss": 0.7135148, "learning_rate": 3.937026765981696e-06, "loss": 0.73641521, "num_input_tokens_seen": 19029680, "step": 896, "time_per_iteration": 2.71081280708313 }, { "auxiliary_loss_clip": 0.01202369, "auxiliary_loss_mlp": 0.01045481, "balance_loss_clip": 1.06374383, "balance_loss_mlp": 1.0323379, "epoch": 0.1078578729032646, "flos": 20919581763840.0, "grad_norm": 1.7956908413202568, "language_loss": 0.79649657, "learning_rate": 3.936832685116488e-06, "loss": 0.81897503, "num_input_tokens_seen": 19047775, "step": 897, "time_per_iteration": 2.727457284927368 }, { "auxiliary_loss_clip": 0.01247253, "auxiliary_loss_mlp": 0.01040304, "balance_loss_clip": 1.06677735, "balance_loss_mlp": 1.02733409, "epoch": 0.10797811579390369, "flos": 14829886702080.0, "grad_norm": 2.400150538612429, "language_loss": 0.9014008, "learning_rate": 3.936638310435301e-06, "loss": 0.92427635, "num_input_tokens_seen": 19065640, "step": 898, "time_per_iteration": 2.640552282333374 }, { "auxiliary_loss_clip": 0.0123803, "auxiliary_loss_mlp": 0.01048388, "balance_loss_clip": 1.06714976, "balance_loss_mlp": 1.03563881, "epoch": 0.10809835868454278, "flos": 19536985411200.0, "grad_norm": 1.9963498496713716, "language_loss": 0.81481671, "learning_rate": 3.936443641967623e-06, "loss": 0.83768094, "num_input_tokens_seen": 19084470, "step": 899, "time_per_iteration": 2.6696507930755615 }, { "auxiliary_loss_clip": 0.01224742, "auxiliary_loss_mlp": 0.01041858, "balance_loss_clip": 1.06626463, "balance_loss_mlp": 1.02828622, "epoch": 0.10821860157518187, "flos": 18442320480000.0, "grad_norm": 2.057852574468916, "language_loss": 0.83245605, "learning_rate": 3.936248679742983e-06, "loss": 0.85512209, "num_input_tokens_seen": 19102965, "step": 900, "time_per_iteration": 2.6768975257873535 }, { "auxiliary_loss_clip": 0.01115453, "auxiliary_loss_mlp": 0.01015895, "balance_loss_clip": 1.04207575, "balance_loss_mlp": 1.01126981, "epoch": 0.10833884446582095, "flos": 49359468447360.0, "grad_norm": 1.0533330000125987, "language_loss": 0.70152789, "learning_rate": 3.936053423790959e-06, "loss": 0.72284138, "num_input_tokens_seen": 19151285, "step": 901, "time_per_iteration": 3.991771697998047 }, { "auxiliary_loss_clip": 0.01249106, "auxiliary_loss_mlp": 0.01038083, "balance_loss_clip": 1.0698365, "balance_loss_mlp": 1.02579248, "epoch": 0.10845908735646005, "flos": 20411912891520.0, "grad_norm": 2.7753995416901582, "language_loss": 0.7761265, "learning_rate": 3.935857874141168e-06, "loss": 0.79899836, "num_input_tokens_seen": 19170120, "step": 902, "time_per_iteration": 2.6928069591522217 }, { "auxiliary_loss_clip": 0.01216789, "auxiliary_loss_mlp": 0.0104358, "balance_loss_clip": 1.06579399, "balance_loss_mlp": 1.03042555, "epoch": 0.10857933024709913, "flos": 14027750133120.0, "grad_norm": 2.212637860243819, "language_loss": 0.8404665, "learning_rate": 3.935662030823279e-06, "loss": 0.86307013, "num_input_tokens_seen": 19186305, "step": 903, "time_per_iteration": 2.7537600994110107 }, { "auxiliary_loss_clip": 0.01232429, "auxiliary_loss_mlp": 0.0105149, "balance_loss_clip": 1.06293535, "balance_loss_mlp": 1.03882456, "epoch": 0.10869957313773823, "flos": 13369004657280.0, "grad_norm": 3.4176534045519698, "language_loss": 0.7266472, "learning_rate": 3.935465893866998e-06, "loss": 0.74948639, "num_input_tokens_seen": 19204530, "step": 904, "time_per_iteration": 3.567657232284546 }, { "auxiliary_loss_clip": 0.01214739, "auxiliary_loss_mlp": 0.01052053, "balance_loss_clip": 1.06431055, "balance_loss_mlp": 1.03884459, "epoch": 0.10881981602837733, "flos": 25807095509760.0, "grad_norm": 2.092687207648666, "language_loss": 0.80228299, "learning_rate": 3.935269463302079e-06, "loss": 0.82495093, "num_input_tokens_seen": 19222735, "step": 905, "time_per_iteration": 3.640103340148926 }, { "auxiliary_loss_clip": 0.01239282, "auxiliary_loss_mlp": 0.01051159, "balance_loss_clip": 1.06821966, "balance_loss_mlp": 1.03746235, "epoch": 0.10894005891901641, "flos": 20777555387520.0, "grad_norm": 1.7890963574337118, "language_loss": 0.77071738, "learning_rate": 3.935072739158322e-06, "loss": 0.79362178, "num_input_tokens_seen": 19242445, "step": 906, "time_per_iteration": 2.691774606704712 }, { "auxiliary_loss_clip": 0.01215313, "auxiliary_loss_mlp": 0.01048702, "balance_loss_clip": 1.06044984, "balance_loss_mlp": 1.03625035, "epoch": 0.10906030180965551, "flos": 26649883296000.0, "grad_norm": 2.024298022238146, "language_loss": 0.80061764, "learning_rate": 3.934875721465569e-06, "loss": 0.8232578, "num_input_tokens_seen": 19262865, "step": 907, "time_per_iteration": 2.7293708324432373 }, { "auxiliary_loss_clip": 0.01213286, "auxiliary_loss_mlp": 0.01045216, "balance_loss_clip": 1.06000066, "balance_loss_mlp": 1.03142905, "epoch": 0.10918054470029459, "flos": 36534402420480.0, "grad_norm": 2.1699830991752207, "language_loss": 0.71643782, "learning_rate": 3.9346784102537076e-06, "loss": 0.73902285, "num_input_tokens_seen": 19285000, "step": 908, "time_per_iteration": 2.8297483921051025 }, { "auxiliary_loss_clip": 0.01248014, "auxiliary_loss_mlp": 0.01041137, "balance_loss_clip": 1.06628454, "balance_loss_mlp": 1.02910304, "epoch": 0.10930078759093369, "flos": 21762549118080.0, "grad_norm": 1.8101754864628627, "language_loss": 0.78425217, "learning_rate": 3.934480805552669e-06, "loss": 0.80714369, "num_input_tokens_seen": 19306010, "step": 909, "time_per_iteration": 2.6531052589416504 }, { "auxiliary_loss_clip": 0.01243554, "auxiliary_loss_mlp": 0.00766678, "balance_loss_clip": 1.06356514, "balance_loss_mlp": 1.00029337, "epoch": 0.10942103048157277, "flos": 22601781457920.0, "grad_norm": 2.0949817134062494, "language_loss": 0.88251209, "learning_rate": 3.93428290739243e-06, "loss": 0.90261441, "num_input_tokens_seen": 19325380, "step": 910, "time_per_iteration": 2.61702823638916 }, { "auxiliary_loss_clip": 0.01219529, "auxiliary_loss_mlp": 0.01041906, "balance_loss_clip": 1.06306159, "balance_loss_mlp": 1.02987766, "epoch": 0.10954127337221187, "flos": 15045781397760.0, "grad_norm": 2.4045280418354027, "language_loss": 0.80006766, "learning_rate": 3.9340847158030125e-06, "loss": 0.82268202, "num_input_tokens_seen": 19338960, "step": 911, "time_per_iteration": 2.666435480117798 }, { "auxiliary_loss_clip": 0.01237052, "auxiliary_loss_mlp": 0.01043158, "balance_loss_clip": 1.06685925, "balance_loss_mlp": 1.03102255, "epoch": 0.10966151626285096, "flos": 21650974496640.0, "grad_norm": 1.804493007429792, "language_loss": 0.75487876, "learning_rate": 3.9338862308144814e-06, "loss": 0.77768087, "num_input_tokens_seen": 19357780, "step": 912, "time_per_iteration": 2.6516332626342773 }, { "auxiliary_loss_clip": 0.01249054, "auxiliary_loss_mlp": 0.01037409, "balance_loss_clip": 1.06906915, "balance_loss_mlp": 1.025756, "epoch": 0.10978175915349005, "flos": 20121359777280.0, "grad_norm": 1.6696219718298633, "language_loss": 0.84225106, "learning_rate": 3.933687452456946e-06, "loss": 0.86511564, "num_input_tokens_seen": 19377680, "step": 913, "time_per_iteration": 2.65940523147583 }, { "auxiliary_loss_clip": 0.01200463, "auxiliary_loss_mlp": 0.01040995, "balance_loss_clip": 1.0598619, "balance_loss_mlp": 1.02810299, "epoch": 0.10990200204412914, "flos": 20412667077120.0, "grad_norm": 2.7558152884865494, "language_loss": 0.86751771, "learning_rate": 3.933488380760562e-06, "loss": 0.88993227, "num_input_tokens_seen": 19397040, "step": 914, "time_per_iteration": 2.735445022583008 }, { "auxiliary_loss_clip": 0.0124561, "auxiliary_loss_mlp": 0.00767108, "balance_loss_clip": 1.06500518, "balance_loss_mlp": 1.0002178, "epoch": 0.11002224493476823, "flos": 17530117660800.0, "grad_norm": 1.976580054584242, "language_loss": 0.87228578, "learning_rate": 3.9332890157555286e-06, "loss": 0.89241302, "num_input_tokens_seen": 19413975, "step": 915, "time_per_iteration": 2.6587424278259277 }, { "auxiliary_loss_clip": 0.01218549, "auxiliary_loss_mlp": 0.01041519, "balance_loss_clip": 1.06381798, "balance_loss_mlp": 1.02860856, "epoch": 0.11014248782540732, "flos": 12203093099520.0, "grad_norm": 1.8954252434749164, "language_loss": 0.7646786, "learning_rate": 3.933089357472088e-06, "loss": 0.78727925, "num_input_tokens_seen": 19432005, "step": 916, "time_per_iteration": 2.6733617782592773 }, { "auxiliary_loss_clip": 0.01246705, "auxiliary_loss_mlp": 0.01042062, "balance_loss_clip": 1.06915808, "balance_loss_mlp": 1.02972412, "epoch": 0.11026273071604642, "flos": 22382977760640.0, "grad_norm": 1.8198640056544872, "language_loss": 0.86086285, "learning_rate": 3.932889405940529e-06, "loss": 0.88375056, "num_input_tokens_seen": 19450100, "step": 917, "time_per_iteration": 2.6456074714660645 }, { "auxiliary_loss_clip": 0.01216563, "auxiliary_loss_mlp": 0.01044116, "balance_loss_clip": 1.06689, "balance_loss_mlp": 1.03125954, "epoch": 0.1103829736066855, "flos": 19829046896640.0, "grad_norm": 2.207391567752706, "language_loss": 0.80012709, "learning_rate": 3.932689161191184e-06, "loss": 0.82273388, "num_input_tokens_seen": 19467805, "step": 918, "time_per_iteration": 2.759584665298462 }, { "auxiliary_loss_clip": 0.01229231, "auxiliary_loss_mlp": 0.01041287, "balance_loss_clip": 1.06298268, "balance_loss_mlp": 1.02829289, "epoch": 0.1105032164973246, "flos": 22669616292480.0, "grad_norm": 2.089991588550204, "language_loss": 0.88282669, "learning_rate": 3.93248862325443e-06, "loss": 0.90553188, "num_input_tokens_seen": 19486710, "step": 919, "time_per_iteration": 2.6891262531280518 }, { "auxiliary_loss_clip": 0.01143171, "auxiliary_loss_mlp": 0.01005942, "balance_loss_clip": 1.04195523, "balance_loss_mlp": 1.00153112, "epoch": 0.11062345938796368, "flos": 66483507876480.0, "grad_norm": 0.9356238967383186, "language_loss": 0.6436373, "learning_rate": 3.932287792160688e-06, "loss": 0.66512835, "num_input_tokens_seen": 19545170, "step": 920, "time_per_iteration": 3.168898344039917 }, { "auxiliary_loss_clip": 0.01232107, "auxiliary_loss_mlp": 0.01039851, "balance_loss_clip": 1.06333661, "balance_loss_mlp": 1.02704203, "epoch": 0.11074370227860278, "flos": 21907771804800.0, "grad_norm": 2.5904337070281085, "language_loss": 0.80551583, "learning_rate": 3.932086667940424e-06, "loss": 0.82823539, "num_input_tokens_seen": 19561875, "step": 921, "time_per_iteration": 2.661914825439453 }, { "auxiliary_loss_clip": 0.01225628, "auxiliary_loss_mlp": 0.00766505, "balance_loss_clip": 1.06499016, "balance_loss_mlp": 1.00023973, "epoch": 0.11086394516924186, "flos": 28658115763200.0, "grad_norm": 2.0189925312360812, "language_loss": 0.82029617, "learning_rate": 3.93188525062415e-06, "loss": 0.84021753, "num_input_tokens_seen": 19582340, "step": 922, "time_per_iteration": 2.7445971965789795 }, { "auxiliary_loss_clip": 0.01230313, "auxiliary_loss_mlp": 0.01040618, "balance_loss_clip": 1.06485081, "balance_loss_mlp": 1.02708733, "epoch": 0.11098418805988096, "flos": 24535247765760.0, "grad_norm": 2.4779742144676358, "language_loss": 0.86209947, "learning_rate": 3.931683540242418e-06, "loss": 0.88480878, "num_input_tokens_seen": 19603405, "step": 923, "time_per_iteration": 2.666851282119751 }, { "auxiliary_loss_clip": 0.01220921, "auxiliary_loss_mlp": 0.01047538, "balance_loss_clip": 1.06030118, "balance_loss_mlp": 1.03468776, "epoch": 0.11110443095052006, "flos": 22960384888320.0, "grad_norm": 2.9617159815421537, "language_loss": 0.91108, "learning_rate": 3.9314815368258295e-06, "loss": 0.93376458, "num_input_tokens_seen": 19619885, "step": 924, "time_per_iteration": 2.738511085510254 }, { "auxiliary_loss_clip": 0.01233552, "auxiliary_loss_mlp": 0.01039946, "balance_loss_clip": 1.06717813, "balance_loss_mlp": 1.0283525, "epoch": 0.11122467384115914, "flos": 18950025265920.0, "grad_norm": 1.744489379875936, "language_loss": 0.78978252, "learning_rate": 3.9312792404050275e-06, "loss": 0.81251746, "num_input_tokens_seen": 19637940, "step": 925, "time_per_iteration": 2.615124464035034 }, { "auxiliary_loss_clip": 0.01241466, "auxiliary_loss_mlp": 0.0104485, "balance_loss_clip": 1.06591105, "balance_loss_mlp": 1.03347111, "epoch": 0.11134491673179824, "flos": 25082957324160.0, "grad_norm": 2.0333945926790253, "language_loss": 0.77662396, "learning_rate": 3.9310766510107e-06, "loss": 0.79948711, "num_input_tokens_seen": 19657115, "step": 926, "time_per_iteration": 2.6691548824310303 }, { "auxiliary_loss_clip": 0.01204878, "auxiliary_loss_mlp": 0.01044171, "balance_loss_clip": 1.06116569, "balance_loss_mlp": 1.03167772, "epoch": 0.11146515962243732, "flos": 24499121662080.0, "grad_norm": 2.0510050306855874, "language_loss": 0.9190051, "learning_rate": 3.9308737686735806e-06, "loss": 0.94149554, "num_input_tokens_seen": 19677075, "step": 927, "time_per_iteration": 3.668632984161377 }, { "auxiliary_loss_clip": 0.01245128, "auxiliary_loss_mlp": 0.01046252, "balance_loss_clip": 1.06597066, "balance_loss_mlp": 1.03475976, "epoch": 0.11158540251307641, "flos": 22343763087360.0, "grad_norm": 2.314189862331838, "language_loss": 0.82781088, "learning_rate": 3.9306705934244455e-06, "loss": 0.85072464, "num_input_tokens_seen": 19697155, "step": 928, "time_per_iteration": 2.637141466140747 }, { "auxiliary_loss_clip": 0.01196225, "auxiliary_loss_mlp": 0.01036468, "balance_loss_clip": 1.05799985, "balance_loss_mlp": 1.02480948, "epoch": 0.11170564540371551, "flos": 19902304684800.0, "grad_norm": 2.0344790231946113, "language_loss": 0.88410747, "learning_rate": 3.930467125294116e-06, "loss": 0.90643442, "num_input_tokens_seen": 19716705, "step": 929, "time_per_iteration": 2.741306781768799 }, { "auxiliary_loss_clip": 0.01075586, "auxiliary_loss_mlp": 0.0101398, "balance_loss_clip": 1.02908206, "balance_loss_mlp": 1.00954556, "epoch": 0.1118258882943546, "flos": 64586239499520.0, "grad_norm": 0.9619706542878402, "language_loss": 0.6049825, "learning_rate": 3.930263364313458e-06, "loss": 0.62587816, "num_input_tokens_seen": 19767275, "step": 930, "time_per_iteration": 4.053828716278076 }, { "auxiliary_loss_clip": 0.01195583, "auxiliary_loss_mlp": 0.01048109, "balance_loss_clip": 1.05755901, "balance_loss_mlp": 1.03541338, "epoch": 0.11194613118499369, "flos": 17201965985280.0, "grad_norm": 1.9678652485269101, "language_loss": 0.83169824, "learning_rate": 3.930059310513384e-06, "loss": 0.85413522, "num_input_tokens_seen": 19786315, "step": 931, "time_per_iteration": 3.678701877593994 }, { "auxiliary_loss_clip": 0.0118279, "auxiliary_loss_mlp": 0.00766164, "balance_loss_clip": 1.05664051, "balance_loss_mlp": 1.00023556, "epoch": 0.11206637407563277, "flos": 31863465728640.0, "grad_norm": 1.7894914356257638, "language_loss": 0.8412717, "learning_rate": 3.929854963924846e-06, "loss": 0.86076123, "num_input_tokens_seen": 19806580, "step": 932, "time_per_iteration": 3.7827935218811035 }, { "auxiliary_loss_clip": 0.01197657, "auxiliary_loss_mlp": 0.01043243, "balance_loss_clip": 1.05710959, "balance_loss_mlp": 1.03179884, "epoch": 0.11218661696627187, "flos": 21945621761280.0, "grad_norm": 2.442619687463289, "language_loss": 0.77553833, "learning_rate": 3.929650324578845e-06, "loss": 0.79794729, "num_input_tokens_seen": 19826045, "step": 933, "time_per_iteration": 2.7438747882843018 }, { "auxiliary_loss_clip": 0.0121345, "auxiliary_loss_mlp": 0.01047755, "balance_loss_clip": 1.06033564, "balance_loss_mlp": 1.03448749, "epoch": 0.11230685985691095, "flos": 25878198481920.0, "grad_norm": 2.4748029237980855, "language_loss": 0.82257754, "learning_rate": 3.929445392506423e-06, "loss": 0.84518957, "num_input_tokens_seen": 19843985, "step": 934, "time_per_iteration": 2.769374132156372 }, { "auxiliary_loss_clip": 0.01226924, "auxiliary_loss_mlp": 0.01042588, "balance_loss_clip": 1.0663228, "balance_loss_mlp": 1.03107274, "epoch": 0.11242710274755005, "flos": 22231506107520.0, "grad_norm": 1.8630698329724482, "language_loss": 0.75717986, "learning_rate": 3.92924016773867e-06, "loss": 0.77987498, "num_input_tokens_seen": 19860480, "step": 935, "time_per_iteration": 2.684752941131592 }, { "auxiliary_loss_clip": 0.01208588, "auxiliary_loss_mlp": 0.0076555, "balance_loss_clip": 1.05764651, "balance_loss_mlp": 1.00021172, "epoch": 0.11254734563818915, "flos": 17712184723200.0, "grad_norm": 2.4708820206276396, "language_loss": 0.73527056, "learning_rate": 3.9290346503067175e-06, "loss": 0.75501192, "num_input_tokens_seen": 19877145, "step": 936, "time_per_iteration": 2.6843154430389404 }, { "auxiliary_loss_clip": 0.0122702, "auxiliary_loss_mlp": 0.01045479, "balance_loss_clip": 1.05984712, "balance_loss_mlp": 1.03398156, "epoch": 0.11266758852882823, "flos": 54930397334400.0, "grad_norm": 1.8824483813893804, "language_loss": 0.79062212, "learning_rate": 3.9288288402417415e-06, "loss": 0.8133471, "num_input_tokens_seen": 19903405, "step": 937, "time_per_iteration": 2.996154546737671 }, { "auxiliary_loss_clip": 0.01228938, "auxiliary_loss_mlp": 0.01048263, "balance_loss_clip": 1.06383896, "balance_loss_mlp": 1.03577638, "epoch": 0.11278783141946733, "flos": 18878132194560.0, "grad_norm": 2.418799948569045, "language_loss": 0.70665085, "learning_rate": 3.928622737574964e-06, "loss": 0.72942287, "num_input_tokens_seen": 19918740, "step": 938, "time_per_iteration": 2.589139938354492 }, { "auxiliary_loss_clip": 0.01209412, "auxiliary_loss_mlp": 0.01041237, "balance_loss_clip": 1.05730867, "balance_loss_mlp": 1.02933967, "epoch": 0.11290807431010641, "flos": 26469252777600.0, "grad_norm": 1.90500818354009, "language_loss": 0.91213286, "learning_rate": 3.928416342337652e-06, "loss": 0.93463939, "num_input_tokens_seen": 19938475, "step": 939, "time_per_iteration": 2.8485541343688965 }, { "auxiliary_loss_clip": 0.01210972, "auxiliary_loss_mlp": 0.01042714, "balance_loss_clip": 1.06076241, "balance_loss_mlp": 1.03037548, "epoch": 0.1130283172007455, "flos": 22710590732160.0, "grad_norm": 1.8356567422733645, "language_loss": 0.82697129, "learning_rate": 3.928209654561113e-06, "loss": 0.84950811, "num_input_tokens_seen": 19959310, "step": 940, "time_per_iteration": 2.6778650283813477 }, { "auxiliary_loss_clip": 0.01204254, "auxiliary_loss_mlp": 0.01040828, "balance_loss_clip": 1.06117082, "balance_loss_mlp": 1.02959824, "epoch": 0.1131485600913846, "flos": 23219911630080.0, "grad_norm": 2.7090578420998614, "language_loss": 0.81434512, "learning_rate": 3.928002674276703e-06, "loss": 0.83679593, "num_input_tokens_seen": 19978700, "step": 941, "time_per_iteration": 2.724036693572998 }, { "auxiliary_loss_clip": 0.01159386, "auxiliary_loss_mlp": 0.01038365, "balance_loss_clip": 1.05062127, "balance_loss_mlp": 1.0259254, "epoch": 0.11326880298202369, "flos": 14064271286400.0, "grad_norm": 2.5148930335213078, "language_loss": 0.76159537, "learning_rate": 3.92779540151582e-06, "loss": 0.78357291, "num_input_tokens_seen": 19995785, "step": 942, "time_per_iteration": 2.6909029483795166 }, { "auxiliary_loss_clip": 0.01214356, "auxiliary_loss_mlp": 0.01033661, "balance_loss_clip": 1.06281066, "balance_loss_mlp": 1.02162063, "epoch": 0.11338904587266278, "flos": 16325386479360.0, "grad_norm": 1.8727610944021158, "language_loss": 0.85762382, "learning_rate": 3.927587836309907e-06, "loss": 0.88010401, "num_input_tokens_seen": 20013615, "step": 943, "time_per_iteration": 2.709599733352661 }, { "auxiliary_loss_clip": 0.01205138, "auxiliary_loss_mlp": 0.01044077, "balance_loss_clip": 1.05908132, "balance_loss_mlp": 1.03213835, "epoch": 0.11350928876330187, "flos": 24426258923520.0, "grad_norm": 2.0838301047652243, "language_loss": 0.78213561, "learning_rate": 3.927379978690452e-06, "loss": 0.80462772, "num_input_tokens_seen": 20032880, "step": 944, "time_per_iteration": 2.7202539443969727 }, { "auxiliary_loss_clip": 0.01179112, "auxiliary_loss_mlp": 0.01039572, "balance_loss_clip": 1.05057192, "balance_loss_mlp": 1.027812, "epoch": 0.11362953165394096, "flos": 24497074586880.0, "grad_norm": 2.251789963989746, "language_loss": 0.87207854, "learning_rate": 3.927171828688987e-06, "loss": 0.89426535, "num_input_tokens_seen": 20052405, "step": 945, "time_per_iteration": 2.74395751953125 }, { "auxiliary_loss_clip": 0.01239403, "auxiliary_loss_mlp": 0.01044228, "balance_loss_clip": 1.06358266, "balance_loss_mlp": 1.03308821, "epoch": 0.11374977454458005, "flos": 24060831909120.0, "grad_norm": 2.713441190846801, "language_loss": 0.8200044, "learning_rate": 3.926963386337088e-06, "loss": 0.84284067, "num_input_tokens_seen": 20070635, "step": 946, "time_per_iteration": 2.6787290573120117 }, { "auxiliary_loss_clip": 0.01244004, "auxiliary_loss_mlp": 0.01039332, "balance_loss_clip": 1.0654825, "balance_loss_mlp": 1.02702355, "epoch": 0.11387001743521914, "flos": 39457638967680.0, "grad_norm": 2.66043057144668, "language_loss": 0.6982249, "learning_rate": 3.926754651666375e-06, "loss": 0.72105825, "num_input_tokens_seen": 20091195, "step": 947, "time_per_iteration": 2.7977957725524902 }, { "auxiliary_loss_clip": 0.0119403, "auxiliary_loss_mlp": 0.0103976, "balance_loss_clip": 1.05890822, "balance_loss_mlp": 1.0280838, "epoch": 0.11399026032585824, "flos": 25082454533760.0, "grad_norm": 3.1230107214901137, "language_loss": 0.78485614, "learning_rate": 3.926545624708513e-06, "loss": 0.80719399, "num_input_tokens_seen": 20110435, "step": 948, "time_per_iteration": 2.7633402347564697 }, { "auxiliary_loss_clip": 0.0118835, "auxiliary_loss_mlp": 0.01043386, "balance_loss_clip": 1.05540562, "balance_loss_mlp": 1.03183484, "epoch": 0.11411050321649732, "flos": 17961835224960.0, "grad_norm": 2.7802415678317396, "language_loss": 0.85803127, "learning_rate": 3.926336305495213e-06, "loss": 0.88034868, "num_input_tokens_seen": 20128995, "step": 949, "time_per_iteration": 2.6712121963500977 }, { "auxiliary_loss_clip": 0.01181103, "auxiliary_loss_mlp": 0.01039434, "balance_loss_clip": 1.05337596, "balance_loss_mlp": 1.02756119, "epoch": 0.11423074610713642, "flos": 22455409536000.0, "grad_norm": 3.203545537145531, "language_loss": 0.88745785, "learning_rate": 3.926126694058226e-06, "loss": 0.9096632, "num_input_tokens_seen": 20148145, "step": 950, "time_per_iteration": 2.7590508460998535 }, { "auxiliary_loss_clip": 0.01174442, "auxiliary_loss_mlp": 0.01038335, "balance_loss_clip": 1.05846429, "balance_loss_mlp": 1.02693856, "epoch": 0.1143509889977755, "flos": 19717687756800.0, "grad_norm": 1.4023193683428083, "language_loss": 0.8210789, "learning_rate": 3.92591679042935e-06, "loss": 0.8432067, "num_input_tokens_seen": 20168035, "step": 951, "time_per_iteration": 2.7586982250213623 }, { "auxiliary_loss_clip": 0.01227502, "auxiliary_loss_mlp": 0.01047092, "balance_loss_clip": 1.06656897, "balance_loss_mlp": 1.03474772, "epoch": 0.1144712318884146, "flos": 19822869757440.0, "grad_norm": 1.9381444018132117, "language_loss": 0.82432806, "learning_rate": 3.92570659464043e-06, "loss": 0.84707403, "num_input_tokens_seen": 20186095, "step": 952, "time_per_iteration": 2.6687779426574707 }, { "auxiliary_loss_clip": 0.01223601, "auxiliary_loss_mlp": 0.0076702, "balance_loss_clip": 1.06467128, "balance_loss_mlp": 1.0002538, "epoch": 0.1145914747790537, "flos": 14939198766720.0, "grad_norm": 2.02107728223693, "language_loss": 0.80192882, "learning_rate": 3.925496106723349e-06, "loss": 0.82183498, "num_input_tokens_seen": 20203535, "step": 953, "time_per_iteration": 3.585479259490967 }, { "auxiliary_loss_clip": 0.01226048, "auxiliary_loss_mlp": 0.0104189, "balance_loss_clip": 1.06188786, "balance_loss_mlp": 1.02976036, "epoch": 0.11471171766969278, "flos": 19865029345920.0, "grad_norm": 2.4363253936646863, "language_loss": 0.83883834, "learning_rate": 3.9252853267100405e-06, "loss": 0.86151773, "num_input_tokens_seen": 20222780, "step": 954, "time_per_iteration": 2.6228597164154053 }, { "auxiliary_loss_clip": 0.01186543, "auxiliary_loss_mlp": 0.01039105, "balance_loss_clip": 1.05905259, "balance_loss_mlp": 1.02729714, "epoch": 0.11483196056033187, "flos": 22526476594560.0, "grad_norm": 2.3855024245819303, "language_loss": 0.83867049, "learning_rate": 3.9250742546324786e-06, "loss": 0.86092699, "num_input_tokens_seen": 20243015, "step": 955, "time_per_iteration": 2.7445526123046875 }, { "auxiliary_loss_clip": 0.01207852, "auxiliary_loss_mlp": 0.01037793, "balance_loss_clip": 1.0602169, "balance_loss_mlp": 1.0262115, "epoch": 0.11495220345097096, "flos": 28220292887040.0, "grad_norm": 1.9286240264953656, "language_loss": 0.86832291, "learning_rate": 3.924862890522683e-06, "loss": 0.89077938, "num_input_tokens_seen": 20263025, "step": 956, "time_per_iteration": 3.6156067848205566 }, { "auxiliary_loss_clip": 0.01223611, "auxiliary_loss_mlp": 0.01042058, "balance_loss_clip": 1.06044948, "balance_loss_mlp": 1.02960014, "epoch": 0.11507244634161005, "flos": 17492267704320.0, "grad_norm": 3.8476838524434, "language_loss": 0.86394691, "learning_rate": 3.9246512344127174e-06, "loss": 0.88660359, "num_input_tokens_seen": 20280685, "step": 957, "time_per_iteration": 4.452392101287842 }, { "auxiliary_loss_clip": 0.01148771, "auxiliary_loss_mlp": 0.01039029, "balance_loss_clip": 1.05381346, "balance_loss_mlp": 1.02762115, "epoch": 0.11519268923224914, "flos": 22564937082240.0, "grad_norm": 2.357301816905252, "language_loss": 0.81979465, "learning_rate": 3.9244392863346895e-06, "loss": 0.84167266, "num_input_tokens_seen": 20300090, "step": 958, "time_per_iteration": 2.839357376098633 }, { "auxiliary_loss_clip": 0.01218342, "auxiliary_loss_mlp": 0.01046092, "balance_loss_clip": 1.06786919, "balance_loss_mlp": 1.03230548, "epoch": 0.11531293212288823, "flos": 16982839065600.0, "grad_norm": 1.921215739868516, "language_loss": 0.92433584, "learning_rate": 3.9242270463207524e-06, "loss": 0.94698018, "num_input_tokens_seen": 20318480, "step": 959, "time_per_iteration": 2.6328580379486084 }, { "auxiliary_loss_clip": 0.01169025, "auxiliary_loss_mlp": 0.01033907, "balance_loss_clip": 1.05587888, "balance_loss_mlp": 1.02188516, "epoch": 0.11543317501352733, "flos": 12422004537600.0, "grad_norm": 3.680768908868924, "language_loss": 0.85114092, "learning_rate": 3.924014514403102e-06, "loss": 0.87317026, "num_input_tokens_seen": 20334635, "step": 960, "time_per_iteration": 2.782628297805786 }, { "auxiliary_loss_clip": 0.01175066, "auxiliary_loss_mlp": 0.01036467, "balance_loss_clip": 1.05417514, "balance_loss_mlp": 1.02456963, "epoch": 0.11555341790416641, "flos": 19821648695040.0, "grad_norm": 2.280670672565852, "language_loss": 0.91109574, "learning_rate": 3.92380169061398e-06, "loss": 0.93321109, "num_input_tokens_seen": 20352415, "step": 961, "time_per_iteration": 2.796318531036377 }, { "auxiliary_loss_clip": 0.01186945, "auxiliary_loss_mlp": 0.00766728, "balance_loss_clip": 1.0534786, "balance_loss_mlp": 1.00023937, "epoch": 0.11567366079480551, "flos": 25738865625600.0, "grad_norm": 2.2182044948234854, "language_loss": 0.83695763, "learning_rate": 3.9235885749856705e-06, "loss": 0.85649431, "num_input_tokens_seen": 20371095, "step": 962, "time_per_iteration": 2.735335111618042 }, { "auxiliary_loss_clip": 0.01212266, "auxiliary_loss_mlp": 0.01048401, "balance_loss_clip": 1.06498218, "balance_loss_mlp": 1.03653395, "epoch": 0.1157939036854446, "flos": 18223301301120.0, "grad_norm": 2.1172917927199033, "language_loss": 0.82457185, "learning_rate": 3.9233751675505035e-06, "loss": 0.84717852, "num_input_tokens_seen": 20389805, "step": 963, "time_per_iteration": 2.7368810176849365 }, { "auxiliary_loss_clip": 0.01208606, "auxiliary_loss_mlp": 0.01050231, "balance_loss_clip": 1.06327796, "balance_loss_mlp": 1.03797066, "epoch": 0.11591414657608369, "flos": 23073755189760.0, "grad_norm": 2.831350012665959, "language_loss": 0.85114169, "learning_rate": 3.923161468340853e-06, "loss": 0.87373006, "num_input_tokens_seen": 20409640, "step": 964, "time_per_iteration": 2.6985890865325928 }, { "auxiliary_loss_clip": 0.01164901, "auxiliary_loss_mlp": 0.01051439, "balance_loss_clip": 1.05065608, "balance_loss_mlp": 1.03969646, "epoch": 0.11603438946672277, "flos": 19461716461440.0, "grad_norm": 1.8287065880319882, "language_loss": 0.81426215, "learning_rate": 3.9229474773891374e-06, "loss": 0.83642554, "num_input_tokens_seen": 20428180, "step": 965, "time_per_iteration": 2.735017776489258 }, { "auxiliary_loss_clip": 0.01197022, "auxiliary_loss_mlp": 0.01041269, "balance_loss_clip": 1.05257916, "balance_loss_mlp": 1.02877593, "epoch": 0.11615463235736187, "flos": 26831986272000.0, "grad_norm": 11.018900659824112, "language_loss": 0.84009349, "learning_rate": 3.922733194727818e-06, "loss": 0.86247647, "num_input_tokens_seen": 20447975, "step": 966, "time_per_iteration": 2.722153902053833 }, { "auxiliary_loss_clip": 0.01232885, "auxiliary_loss_mlp": 0.01043351, "balance_loss_clip": 1.06558871, "balance_loss_mlp": 1.03029168, "epoch": 0.11627487524800097, "flos": 18580324533120.0, "grad_norm": 2.2477593052344944, "language_loss": 0.87119257, "learning_rate": 3.922518620389402e-06, "loss": 0.89395499, "num_input_tokens_seen": 20464840, "step": 967, "time_per_iteration": 2.640848159790039 }, { "auxiliary_loss_clip": 0.01120612, "auxiliary_loss_mlp": 0.0104354, "balance_loss_clip": 1.04912531, "balance_loss_mlp": 1.03221548, "epoch": 0.11639511813864005, "flos": 18150474476160.0, "grad_norm": 2.1908432728590013, "language_loss": 0.89534372, "learning_rate": 3.922303754406439e-06, "loss": 0.91698527, "num_input_tokens_seen": 20482680, "step": 968, "time_per_iteration": 2.917951822280884 }, { "auxiliary_loss_clip": 0.01176585, "auxiliary_loss_mlp": 0.01046399, "balance_loss_clip": 1.05419278, "balance_loss_mlp": 1.03430557, "epoch": 0.11651536102927915, "flos": 20922023888640.0, "grad_norm": 2.0891838082258176, "language_loss": 0.79058111, "learning_rate": 3.922088596811526e-06, "loss": 0.81281096, "num_input_tokens_seen": 20501810, "step": 969, "time_per_iteration": 3.018881320953369 }, { "auxiliary_loss_clip": 0.01216814, "auxiliary_loss_mlp": 0.01044417, "balance_loss_clip": 1.06240034, "balance_loss_mlp": 1.0320673, "epoch": 0.11663560391991823, "flos": 16508602776960.0, "grad_norm": 2.761817269000313, "language_loss": 0.86915678, "learning_rate": 3.9218731476373e-06, "loss": 0.89176911, "num_input_tokens_seen": 20517995, "step": 970, "time_per_iteration": 2.578216552734375 }, { "auxiliary_loss_clip": 0.01230007, "auxiliary_loss_mlp": 0.01045348, "balance_loss_clip": 1.06567621, "balance_loss_mlp": 1.03213394, "epoch": 0.11675584681055733, "flos": 19865029345920.0, "grad_norm": 2.21898532467825, "language_loss": 0.84510791, "learning_rate": 3.9216574069164455e-06, "loss": 0.86786151, "num_input_tokens_seen": 20536970, "step": 971, "time_per_iteration": 2.662442207336426 }, { "auxiliary_loss_clip": 0.01236273, "auxiliary_loss_mlp": 0.01049932, "balance_loss_clip": 1.06203866, "balance_loss_mlp": 1.03840458, "epoch": 0.11687608970119642, "flos": 21944364785280.0, "grad_norm": 1.601132191449602, "language_loss": 0.8026607, "learning_rate": 3.921441374681691e-06, "loss": 0.82552278, "num_input_tokens_seen": 20557030, "step": 972, "time_per_iteration": 2.6640870571136475 }, { "auxiliary_loss_clip": 0.0120301, "auxiliary_loss_mlp": 0.01047886, "balance_loss_clip": 1.05925512, "balance_loss_mlp": 1.03559542, "epoch": 0.1169963325918355, "flos": 24061155131520.0, "grad_norm": 1.7536400200938245, "language_loss": 0.64747274, "learning_rate": 3.921225050965808e-06, "loss": 0.66998172, "num_input_tokens_seen": 20576915, "step": 973, "time_per_iteration": 2.7196614742279053 }, { "auxiliary_loss_clip": 0.01194518, "auxiliary_loss_mlp": 0.01043712, "balance_loss_clip": 1.06012273, "balance_loss_mlp": 1.03163612, "epoch": 0.1171165754824746, "flos": 23368151059200.0, "grad_norm": 2.3956614799887364, "language_loss": 0.74985856, "learning_rate": 3.921008435801612e-06, "loss": 0.77224088, "num_input_tokens_seen": 20596000, "step": 974, "time_per_iteration": 2.765462636947632 }, { "auxiliary_loss_clip": 0.01214333, "auxiliary_loss_mlp": 0.01036882, "balance_loss_clip": 1.05909729, "balance_loss_mlp": 1.02458584, "epoch": 0.11723681837311369, "flos": 18552243075840.0, "grad_norm": 2.0090630832286593, "language_loss": 0.75966966, "learning_rate": 3.920791529221963e-06, "loss": 0.78218174, "num_input_tokens_seen": 20614675, "step": 975, "time_per_iteration": 2.6394972801208496 }, { "auxiliary_loss_clip": 0.01208587, "auxiliary_loss_mlp": 0.00766352, "balance_loss_clip": 1.05880678, "balance_loss_mlp": 1.00027275, "epoch": 0.11735706126375278, "flos": 23550541344000.0, "grad_norm": 2.2279314327012347, "language_loss": 0.76726687, "learning_rate": 3.920574331259768e-06, "loss": 0.78701627, "num_input_tokens_seen": 20635875, "step": 976, "time_per_iteration": 2.747483253479004 }, { "auxiliary_loss_clip": 0.01197717, "auxiliary_loss_mlp": 0.01042687, "balance_loss_clip": 1.05875754, "balance_loss_mlp": 1.02947283, "epoch": 0.11747730415439187, "flos": 22381541216640.0, "grad_norm": 4.241316545385091, "language_loss": 0.79744089, "learning_rate": 3.9203568419479716e-06, "loss": 0.81984496, "num_input_tokens_seen": 20656430, "step": 977, "time_per_iteration": 2.6808676719665527 }, { "auxiliary_loss_clip": 0.01207752, "auxiliary_loss_mlp": 0.01042121, "balance_loss_clip": 1.05998313, "balance_loss_mlp": 1.03015256, "epoch": 0.11759754704503096, "flos": 22200731130240.0, "grad_norm": 1.8784960756479705, "language_loss": 0.75267178, "learning_rate": 3.92013906131957e-06, "loss": 0.77517051, "num_input_tokens_seen": 20675360, "step": 978, "time_per_iteration": 2.7283780574798584 }, { "auxiliary_loss_clip": 0.01194735, "auxiliary_loss_mlp": 0.01048382, "balance_loss_clip": 1.05908442, "balance_loss_mlp": 1.03676546, "epoch": 0.11771778993567006, "flos": 22309755886080.0, "grad_norm": 1.594555645243806, "language_loss": 0.82723701, "learning_rate": 3.9199209894076e-06, "loss": 0.8496682, "num_input_tokens_seen": 20695675, "step": 979, "time_per_iteration": 3.673367500305176 }, { "auxiliary_loss_clip": 0.01238869, "auxiliary_loss_mlp": 0.01039675, "balance_loss_clip": 1.06095433, "balance_loss_mlp": 1.02675843, "epoch": 0.11783803282630914, "flos": 21288169175040.0, "grad_norm": 2.028395915344561, "language_loss": 0.90016335, "learning_rate": 3.919702626245142e-06, "loss": 0.92294878, "num_input_tokens_seen": 20715330, "step": 980, "time_per_iteration": 2.627700090408325 }, { "auxiliary_loss_clip": 0.01194115, "auxiliary_loss_mlp": 0.01040816, "balance_loss_clip": 1.05565047, "balance_loss_mlp": 1.02766156, "epoch": 0.11795827571694824, "flos": 25371535190400.0, "grad_norm": 2.516723385917264, "language_loss": 0.66517174, "learning_rate": 3.919483971865322e-06, "loss": 0.68752098, "num_input_tokens_seen": 20735325, "step": 981, "time_per_iteration": 2.7805824279785156 }, { "auxiliary_loss_clip": 0.01210565, "auxiliary_loss_mlp": 0.0104357, "balance_loss_clip": 1.06297064, "balance_loss_mlp": 1.0314467, "epoch": 0.11807851860758732, "flos": 23622218933760.0, "grad_norm": 3.010496536546041, "language_loss": 0.87977231, "learning_rate": 3.91926502630131e-06, "loss": 0.90231359, "num_input_tokens_seen": 20755940, "step": 982, "time_per_iteration": 4.591506004333496 }, { "auxiliary_loss_clip": 0.01232182, "auxiliary_loss_mlp": 0.01046582, "balance_loss_clip": 1.06879413, "balance_loss_mlp": 1.03469634, "epoch": 0.11819876149822642, "flos": 24972496024320.0, "grad_norm": 1.853811005970903, "language_loss": 0.72170103, "learning_rate": 3.91904578958632e-06, "loss": 0.74448866, "num_input_tokens_seen": 20775355, "step": 983, "time_per_iteration": 3.6544196605682373 }, { "auxiliary_loss_clip": 0.01242216, "auxiliary_loss_mlp": 0.01037051, "balance_loss_clip": 1.06588817, "balance_loss_mlp": 1.02527916, "epoch": 0.11831900438886551, "flos": 23003226835200.0, "grad_norm": 2.361953154635545, "language_loss": 0.83964217, "learning_rate": 3.918826261753608e-06, "loss": 0.86243486, "num_input_tokens_seen": 20794935, "step": 984, "time_per_iteration": 2.6990838050842285 }, { "auxiliary_loss_clip": 0.01208057, "auxiliary_loss_mlp": 0.01043314, "balance_loss_clip": 1.05932236, "balance_loss_mlp": 1.03069019, "epoch": 0.1184392472795046, "flos": 27965147604480.0, "grad_norm": 3.018648225488855, "language_loss": 0.7081753, "learning_rate": 3.918606442836478e-06, "loss": 0.73068893, "num_input_tokens_seen": 20817155, "step": 985, "time_per_iteration": 2.7193636894226074 }, { "auxiliary_loss_clip": 0.01223472, "auxiliary_loss_mlp": 0.01039511, "balance_loss_clip": 1.06475222, "balance_loss_mlp": 1.02745318, "epoch": 0.1185594901701437, "flos": 19898497843200.0, "grad_norm": 1.8282481166300326, "language_loss": 0.77451992, "learning_rate": 3.918386332868277e-06, "loss": 0.79714978, "num_input_tokens_seen": 20835125, "step": 986, "time_per_iteration": 2.672311544418335 }, { "auxiliary_loss_clip": 0.01217269, "auxiliary_loss_mlp": 0.01035429, "balance_loss_clip": 1.06138325, "balance_loss_mlp": 1.023669, "epoch": 0.11867973306078278, "flos": 18912354877440.0, "grad_norm": 1.6806511150770254, "language_loss": 0.94136679, "learning_rate": 3.918165931882394e-06, "loss": 0.96389377, "num_input_tokens_seen": 20853525, "step": 987, "time_per_iteration": 2.6779942512512207 }, { "auxiliary_loss_clip": 0.01149473, "auxiliary_loss_mlp": 0.01045635, "balance_loss_clip": 1.04991853, "balance_loss_mlp": 1.03426874, "epoch": 0.11879997595142187, "flos": 16982803152000.0, "grad_norm": 2.606474351763251, "language_loss": 0.75148934, "learning_rate": 3.917945239912264e-06, "loss": 0.77344036, "num_input_tokens_seen": 20871000, "step": 988, "time_per_iteration": 2.821244955062866 }, { "auxiliary_loss_clip": 0.01178148, "auxiliary_loss_mlp": 0.01042161, "balance_loss_clip": 1.05618382, "balance_loss_mlp": 1.03101516, "epoch": 0.11892021884206096, "flos": 17530369056000.0, "grad_norm": 2.0452572511216562, "language_loss": 0.7549057, "learning_rate": 3.917724256991367e-06, "loss": 0.77710873, "num_input_tokens_seen": 20889745, "step": 989, "time_per_iteration": 2.781541347503662 }, { "auxiliary_loss_clip": 0.01196474, "auxiliary_loss_mlp": 0.01043677, "balance_loss_clip": 1.05832624, "balance_loss_mlp": 1.03133297, "epoch": 0.11904046173270005, "flos": 30955895763840.0, "grad_norm": 2.37185533473021, "language_loss": 0.81819278, "learning_rate": 3.9175029831532245e-06, "loss": 0.84059429, "num_input_tokens_seen": 20909260, "step": 990, "time_per_iteration": 2.807943344116211 }, { "auxiliary_loss_clip": 0.01194762, "auxiliary_loss_mlp": 0.01037252, "balance_loss_clip": 1.06024683, "balance_loss_mlp": 1.02527118, "epoch": 0.11916070462333915, "flos": 20157234485760.0, "grad_norm": 2.335386215766892, "language_loss": 0.88629401, "learning_rate": 3.917281418431404e-06, "loss": 0.90861416, "num_input_tokens_seen": 20928305, "step": 991, "time_per_iteration": 2.8357701301574707 }, { "auxiliary_loss_clip": 0.01209864, "auxiliary_loss_mlp": 0.01040948, "balance_loss_clip": 1.06176877, "balance_loss_mlp": 1.02884209, "epoch": 0.11928094751397823, "flos": 23551115961600.0, "grad_norm": 4.627295551758001, "language_loss": 0.76924664, "learning_rate": 3.917059562859516e-06, "loss": 0.79175472, "num_input_tokens_seen": 20947630, "step": 992, "time_per_iteration": 2.7211391925811768 }, { "auxiliary_loss_clip": 0.01196209, "auxiliary_loss_mlp": 0.01046774, "balance_loss_clip": 1.05791759, "balance_loss_mlp": 1.03545523, "epoch": 0.11940119040461733, "flos": 23908426502400.0, "grad_norm": 2.103340825448616, "language_loss": 0.88911939, "learning_rate": 3.916837416471218e-06, "loss": 0.91154921, "num_input_tokens_seen": 20964250, "step": 993, "time_per_iteration": 2.7415590286254883 }, { "auxiliary_loss_clip": 0.01212488, "auxiliary_loss_mlp": 0.01041683, "balance_loss_clip": 1.055179, "balance_loss_mlp": 1.02973223, "epoch": 0.11952143329525641, "flos": 13844533835520.0, "grad_norm": 2.7099815192463628, "language_loss": 0.72377706, "learning_rate": 3.916614979300207e-06, "loss": 0.7463187, "num_input_tokens_seen": 20979095, "step": 994, "time_per_iteration": 2.6360268592834473 }, { "auxiliary_loss_clip": 0.01169415, "auxiliary_loss_mlp": 0.01040863, "balance_loss_clip": 1.05651307, "balance_loss_mlp": 1.02904916, "epoch": 0.11964167618589551, "flos": 27015525792000.0, "grad_norm": 1.7423434347040332, "language_loss": 0.79016989, "learning_rate": 3.9163922513802274e-06, "loss": 0.81227267, "num_input_tokens_seen": 21001430, "step": 995, "time_per_iteration": 2.7856686115264893 }, { "auxiliary_loss_clip": 0.01239784, "auxiliary_loss_mlp": 0.01046106, "balance_loss_clip": 1.06280887, "balance_loss_mlp": 1.03411973, "epoch": 0.1197619190765346, "flos": 12567622273920.0, "grad_norm": 3.5008829695044756, "language_loss": 0.82648677, "learning_rate": 3.916169232745067e-06, "loss": 0.84934568, "num_input_tokens_seen": 21019105, "step": 996, "time_per_iteration": 2.5894672870635986 }, { "auxiliary_loss_clip": 0.01197132, "auxiliary_loss_mlp": 0.01041151, "balance_loss_clip": 1.05834794, "balance_loss_mlp": 1.0289979, "epoch": 0.11988216196717369, "flos": 16909437623040.0, "grad_norm": 3.463031328335616, "language_loss": 0.9199627, "learning_rate": 3.915945923428559e-06, "loss": 0.94234556, "num_input_tokens_seen": 21035630, "step": 997, "time_per_iteration": 2.6703150272369385 }, { "auxiliary_loss_clip": 0.01216926, "auxiliary_loss_mlp": 0.01036467, "balance_loss_clip": 1.05744815, "balance_loss_mlp": 1.02417016, "epoch": 0.12000240485781279, "flos": 16216577205120.0, "grad_norm": 2.223741481512645, "language_loss": 0.82723713, "learning_rate": 3.915722323464577e-06, "loss": 0.84977108, "num_input_tokens_seen": 21054235, "step": 998, "time_per_iteration": 2.5862603187561035 }, { "auxiliary_loss_clip": 0.01223153, "auxiliary_loss_mlp": 0.01045563, "balance_loss_clip": 1.06189752, "balance_loss_mlp": 1.03320646, "epoch": 0.12012264774845187, "flos": 49344887525760.0, "grad_norm": 2.654948446917869, "language_loss": 0.7049132, "learning_rate": 3.91549843288704e-06, "loss": 0.72760034, "num_input_tokens_seen": 21077915, "step": 999, "time_per_iteration": 2.8897671699523926 }, { "auxiliary_loss_clip": 0.01187739, "auxiliary_loss_mlp": 0.00766025, "balance_loss_clip": 1.05170202, "balance_loss_mlp": 1.00029778, "epoch": 0.12024289063909097, "flos": 26979435601920.0, "grad_norm": 6.836598332948917, "language_loss": 0.79306066, "learning_rate": 3.915274251729916e-06, "loss": 0.81259835, "num_input_tokens_seen": 21099205, "step": 1000, "time_per_iteration": 2.7679402828216553 }, { "auxiliary_loss_clip": 0.01199068, "auxiliary_loss_mlp": 0.01040181, "balance_loss_clip": 1.05937397, "balance_loss_mlp": 1.02802742, "epoch": 0.12036313352973005, "flos": 19537308633600.0, "grad_norm": 2.100508337181235, "language_loss": 0.90401828, "learning_rate": 3.91504978002721e-06, "loss": 0.92641079, "num_input_tokens_seen": 21118260, "step": 1001, "time_per_iteration": 2.6632964611053467 }, { "auxiliary_loss_clip": 0.01205623, "auxiliary_loss_mlp": 0.00765827, "balance_loss_clip": 1.0555104, "balance_loss_mlp": 1.00025058, "epoch": 0.12048337642036915, "flos": 17268256535040.0, "grad_norm": 2.139760483327339, "language_loss": 0.7625075, "learning_rate": 3.914825017812974e-06, "loss": 0.78222197, "num_input_tokens_seen": 21134910, "step": 1002, "time_per_iteration": 2.669376850128174 }, { "auxiliary_loss_clip": 0.01208367, "auxiliary_loss_mlp": 0.01040331, "balance_loss_clip": 1.06180906, "balance_loss_mlp": 1.02870786, "epoch": 0.12060361931100824, "flos": 22856962654080.0, "grad_norm": 3.1723476437260074, "language_loss": 0.72827929, "learning_rate": 3.9145999651213065e-06, "loss": 0.75076628, "num_input_tokens_seen": 21154150, "step": 1003, "time_per_iteration": 2.6851589679718018 }, { "auxiliary_loss_clip": 0.01219851, "auxiliary_loss_mlp": 0.01039249, "balance_loss_clip": 1.05988896, "balance_loss_mlp": 1.02698219, "epoch": 0.12072386220164733, "flos": 16726795943040.0, "grad_norm": 2.9322866067130313, "language_loss": 0.87743139, "learning_rate": 3.9143746219863465e-06, "loss": 0.90002239, "num_input_tokens_seen": 21171255, "step": 1004, "time_per_iteration": 2.6655683517456055 }, { "auxiliary_loss_clip": 0.01128134, "auxiliary_loss_mlp": 0.01015774, "balance_loss_clip": 1.03513479, "balance_loss_mlp": 1.01172054, "epoch": 0.12084410509228642, "flos": 55144176105600.0, "grad_norm": 0.9642825807684384, "language_loss": 0.64759457, "learning_rate": 3.914148988442278e-06, "loss": 0.66903365, "num_input_tokens_seen": 21227045, "step": 1005, "time_per_iteration": 4.149083375930786 }, { "auxiliary_loss_clip": 0.01199904, "auxiliary_loss_mlp": 0.01044434, "balance_loss_clip": 1.06069803, "balance_loss_mlp": 1.03080845, "epoch": 0.1209643479829255, "flos": 26760236855040.0, "grad_norm": 3.3712035042437263, "language_loss": 0.95385063, "learning_rate": 3.91392306452333e-06, "loss": 0.97629404, "num_input_tokens_seen": 21244120, "step": 1006, "time_per_iteration": 2.6977760791778564 }, { "auxiliary_loss_clip": 0.01241587, "auxiliary_loss_mlp": 0.01039246, "balance_loss_clip": 1.06577992, "balance_loss_mlp": 1.02599573, "epoch": 0.1210845908735646, "flos": 11035026725760.0, "grad_norm": 3.0433924067305123, "language_loss": 0.66414505, "learning_rate": 3.913696850263774e-06, "loss": 0.68695337, "num_input_tokens_seen": 21258485, "step": 1007, "time_per_iteration": 2.5928385257720947 }, { "auxiliary_loss_clip": 0.01224049, "auxiliary_loss_mlp": 0.01034809, "balance_loss_clip": 1.06162, "balance_loss_mlp": 1.02187455, "epoch": 0.1212048337642037, "flos": 20484631975680.0, "grad_norm": 2.714896387572321, "language_loss": 0.79220355, "learning_rate": 3.913470345697929e-06, "loss": 0.8147921, "num_input_tokens_seen": 21277115, "step": 1008, "time_per_iteration": 2.6702680587768555 }, { "auxiliary_loss_clip": 0.01180781, "auxiliary_loss_mlp": 0.01055174, "balance_loss_clip": 1.0581758, "balance_loss_mlp": 1.04278183, "epoch": 0.12132507665484278, "flos": 22346061557760.0, "grad_norm": 2.1612957399340558, "language_loss": 0.85538167, "learning_rate": 3.913243550860153e-06, "loss": 0.87774122, "num_input_tokens_seen": 21294880, "step": 1009, "time_per_iteration": 4.556337833404541 }, { "auxiliary_loss_clip": 0.01229511, "auxiliary_loss_mlp": 0.01038687, "balance_loss_clip": 1.06807673, "balance_loss_mlp": 1.02620614, "epoch": 0.12144531954548188, "flos": 29314957818240.0, "grad_norm": 1.9318389329214432, "language_loss": 0.76182044, "learning_rate": 3.913016465784852e-06, "loss": 0.78450239, "num_input_tokens_seen": 21315555, "step": 1010, "time_per_iteration": 2.760122060775757 }, { "auxiliary_loss_clip": 0.01177105, "auxiliary_loss_mlp": 0.01043451, "balance_loss_clip": 1.05449259, "balance_loss_mlp": 1.03123784, "epoch": 0.12156556243612096, "flos": 20485242506880.0, "grad_norm": 3.152718434235606, "language_loss": 0.7217055, "learning_rate": 3.912789090506474e-06, "loss": 0.74391103, "num_input_tokens_seen": 21334815, "step": 1011, "time_per_iteration": 2.7064361572265625 }, { "auxiliary_loss_clip": 0.01196555, "auxiliary_loss_mlp": 0.01042131, "balance_loss_clip": 1.05560446, "balance_loss_mlp": 1.02916718, "epoch": 0.12168580532676006, "flos": 16472009796480.0, "grad_norm": 2.2598393338023715, "language_loss": 0.71455395, "learning_rate": 3.9125614250595114e-06, "loss": 0.73694086, "num_input_tokens_seen": 21351025, "step": 1012, "time_per_iteration": 2.720088243484497 }, { "auxiliary_loss_clip": 0.01224939, "auxiliary_loss_mlp": 0.01041069, "balance_loss_clip": 1.06164598, "balance_loss_mlp": 1.02883244, "epoch": 0.12180604821739914, "flos": 15341290588800.0, "grad_norm": 3.610529489626667, "language_loss": 0.8918128, "learning_rate": 3.912333469478502e-06, "loss": 0.91447282, "num_input_tokens_seen": 21368990, "step": 1013, "time_per_iteration": 2.6277966499328613 }, { "auxiliary_loss_clip": 0.01208199, "auxiliary_loss_mlp": 0.01046439, "balance_loss_clip": 1.05979753, "balance_loss_mlp": 1.03442907, "epoch": 0.12192629110803824, "flos": 19318038059520.0, "grad_norm": 2.0282263993499448, "language_loss": 0.78075641, "learning_rate": 3.912105223798025e-06, "loss": 0.80330276, "num_input_tokens_seen": 21388410, "step": 1014, "time_per_iteration": 2.7380383014678955 }, { "auxiliary_loss_clip": 0.01111117, "auxiliary_loss_mlp": 0.01008344, "balance_loss_clip": 1.03049302, "balance_loss_mlp": 1.00438595, "epoch": 0.12204653399867733, "flos": 47725354085760.0, "grad_norm": 0.997267695907163, "language_loss": 0.67599809, "learning_rate": 3.9118766880527065e-06, "loss": 0.69719279, "num_input_tokens_seen": 21442845, "step": 1015, "time_per_iteration": 3.139171838760376 }, { "auxiliary_loss_clip": 0.01165538, "auxiliary_loss_mlp": 0.01035787, "balance_loss_clip": 1.0551095, "balance_loss_mlp": 1.02442598, "epoch": 0.12216677688931642, "flos": 18221936584320.0, "grad_norm": 2.6617455559319594, "language_loss": 0.73838609, "learning_rate": 3.9116478622772145e-06, "loss": 0.76039934, "num_input_tokens_seen": 21461420, "step": 1016, "time_per_iteration": 2.842247724533081 }, { "auxiliary_loss_clip": 0.01220746, "auxiliary_loss_mlp": 0.01041676, "balance_loss_clip": 1.06475544, "balance_loss_mlp": 1.03017187, "epoch": 0.12228701977995551, "flos": 27525636789120.0, "grad_norm": 1.6797797903490534, "language_loss": 0.87949681, "learning_rate": 3.911418746506261e-06, "loss": 0.90212107, "num_input_tokens_seen": 21481550, "step": 1017, "time_per_iteration": 2.766968011856079 }, { "auxiliary_loss_clip": 0.01231613, "auxiliary_loss_mlp": 0.01039303, "balance_loss_clip": 1.06980133, "balance_loss_mlp": 1.02760887, "epoch": 0.1224072626705946, "flos": 21798136517760.0, "grad_norm": 2.1052424541637014, "language_loss": 0.78587574, "learning_rate": 3.911189340774604e-06, "loss": 0.80858493, "num_input_tokens_seen": 21501680, "step": 1018, "time_per_iteration": 2.852184295654297 }, { "auxiliary_loss_clip": 0.01215152, "auxiliary_loss_mlp": 0.01040848, "balance_loss_clip": 1.05927789, "balance_loss_mlp": 1.02868855, "epoch": 0.1225275055612337, "flos": 20703758895360.0, "grad_norm": 1.8352759341157254, "language_loss": 0.79429638, "learning_rate": 3.910959645117043e-06, "loss": 0.81685638, "num_input_tokens_seen": 21521015, "step": 1019, "time_per_iteration": 2.707390069961548 }, { "auxiliary_loss_clip": 0.01116646, "auxiliary_loss_mlp": 0.00757178, "balance_loss_clip": 1.03242874, "balance_loss_mlp": 1.00058997, "epoch": 0.12264774845187278, "flos": 57745294462080.0, "grad_norm": 0.8192346531021951, "language_loss": 0.56712556, "learning_rate": 3.910729659568423e-06, "loss": 0.58586377, "num_input_tokens_seen": 21578200, "step": 1020, "time_per_iteration": 3.213798999786377 }, { "auxiliary_loss_clip": 0.01207365, "auxiliary_loss_mlp": 0.01038297, "balance_loss_clip": 1.06127453, "balance_loss_mlp": 1.02760363, "epoch": 0.12276799134251187, "flos": 26396282298240.0, "grad_norm": 1.8117810529576872, "language_loss": 0.82636088, "learning_rate": 3.9104993841636344e-06, "loss": 0.84881747, "num_input_tokens_seen": 21598770, "step": 1021, "time_per_iteration": 2.7447478771209717 }, { "auxiliary_loss_clip": 0.01203224, "auxiliary_loss_mlp": 0.0076578, "balance_loss_clip": 1.0625968, "balance_loss_mlp": 1.00021994, "epoch": 0.12288823423315097, "flos": 21064193919360.0, "grad_norm": 1.92550574409633, "language_loss": 0.80797172, "learning_rate": 3.910268818937608e-06, "loss": 0.82766175, "num_input_tokens_seen": 21616925, "step": 1022, "time_per_iteration": 2.702984571456909 }, { "auxiliary_loss_clip": 0.01179876, "auxiliary_loss_mlp": 0.01050083, "balance_loss_clip": 1.06073713, "balance_loss_mlp": 1.03819799, "epoch": 0.12300847712379005, "flos": 12312441077760.0, "grad_norm": 2.6334926811970325, "language_loss": 0.87809557, "learning_rate": 3.9100379639253196e-06, "loss": 0.90039515, "num_input_tokens_seen": 21633645, "step": 1023, "time_per_iteration": 2.7395734786987305 }, { "auxiliary_loss_clip": 0.01204328, "auxiliary_loss_mlp": 0.01038534, "balance_loss_clip": 1.05612648, "balance_loss_mlp": 1.02721548, "epoch": 0.12312872001442915, "flos": 16762239688320.0, "grad_norm": 2.4865386583179783, "language_loss": 0.86495757, "learning_rate": 3.909806819161791e-06, "loss": 0.8873862, "num_input_tokens_seen": 21649120, "step": 1024, "time_per_iteration": 2.6661536693573 }, { "auxiliary_loss_clip": 0.01197366, "auxiliary_loss_mlp": 0.01043399, "balance_loss_clip": 1.05812716, "balance_loss_mlp": 1.0316267, "epoch": 0.12324896290506823, "flos": 18404937400320.0, "grad_norm": 2.7099821135562743, "language_loss": 0.86673242, "learning_rate": 3.909575384682086e-06, "loss": 0.88914007, "num_input_tokens_seen": 21668000, "step": 1025, "time_per_iteration": 2.6912689208984375 }, { "auxiliary_loss_clip": 0.01222144, "auxiliary_loss_mlp": 0.01034944, "balance_loss_clip": 1.0599184, "balance_loss_mlp": 1.02367234, "epoch": 0.12336920579570733, "flos": 18915407533440.0, "grad_norm": 1.8457799696127073, "language_loss": 0.69466639, "learning_rate": 3.9093436605213144e-06, "loss": 0.71723723, "num_input_tokens_seen": 21688500, "step": 1026, "time_per_iteration": 2.6372241973876953 }, { "auxiliary_loss_clip": 0.01206877, "auxiliary_loss_mlp": 0.010489, "balance_loss_clip": 1.06031322, "balance_loss_mlp": 1.03750992, "epoch": 0.12348944868634643, "flos": 23878369797120.0, "grad_norm": 1.8186932862759035, "language_loss": 0.79729408, "learning_rate": 3.909111646714627e-06, "loss": 0.81985188, "num_input_tokens_seen": 21709345, "step": 1027, "time_per_iteration": 2.6928157806396484 }, { "auxiliary_loss_clip": 0.01237321, "auxiliary_loss_mlp": 0.01034235, "balance_loss_clip": 1.06371367, "balance_loss_mlp": 1.02283895, "epoch": 0.12360969157698551, "flos": 19026084314880.0, "grad_norm": 2.066197549482594, "language_loss": 0.72243047, "learning_rate": 3.9088793432972206e-06, "loss": 0.74514604, "num_input_tokens_seen": 21728165, "step": 1028, "time_per_iteration": 2.6527819633483887 }, { "auxiliary_loss_clip": 0.01181583, "auxiliary_loss_mlp": 0.01044987, "balance_loss_clip": 1.05958152, "balance_loss_mlp": 1.03407979, "epoch": 0.1237299344676246, "flos": 13224607983360.0, "grad_norm": 2.0138793474776984, "language_loss": 0.8236081, "learning_rate": 3.908646750304336e-06, "loss": 0.84587377, "num_input_tokens_seen": 21745850, "step": 1029, "time_per_iteration": 2.7002947330474854 }, { "auxiliary_loss_clip": 0.01211712, "auxiliary_loss_mlp": 0.01055454, "balance_loss_clip": 1.06326294, "balance_loss_mlp": 1.0434792, "epoch": 0.12385017735826369, "flos": 20485673470080.0, "grad_norm": 1.6877638081388653, "language_loss": 0.87112778, "learning_rate": 3.908413867771257e-06, "loss": 0.89379942, "num_input_tokens_seen": 21764760, "step": 1030, "time_per_iteration": 2.7035481929779053 }, { "auxiliary_loss_clip": 0.01222017, "auxiliary_loss_mlp": 0.01043375, "balance_loss_clip": 1.06317127, "balance_loss_mlp": 1.03201437, "epoch": 0.12397042024890279, "flos": 17347835116800.0, "grad_norm": 1.6668902048060372, "language_loss": 0.80655205, "learning_rate": 3.908180695733311e-06, "loss": 0.82920605, "num_input_tokens_seen": 21784250, "step": 1031, "time_per_iteration": 3.73565936088562 }, { "auxiliary_loss_clip": 0.01151903, "auxiliary_loss_mlp": 0.01039364, "balance_loss_clip": 1.05042434, "balance_loss_mlp": 1.02719843, "epoch": 0.12409066313954187, "flos": 20412343854720.0, "grad_norm": 1.9692088083761699, "language_loss": 0.82474846, "learning_rate": 3.907947234225871e-06, "loss": 0.84666115, "num_input_tokens_seen": 21803260, "step": 1032, "time_per_iteration": 2.770810842514038 }, { "auxiliary_loss_clip": 0.01154066, "auxiliary_loss_mlp": 0.01045037, "balance_loss_clip": 1.05274415, "balance_loss_mlp": 1.03344965, "epoch": 0.12421090603018096, "flos": 20736688688640.0, "grad_norm": 2.1144716535253747, "language_loss": 0.8690263, "learning_rate": 3.907713483284352e-06, "loss": 0.89101732, "num_input_tokens_seen": 21822735, "step": 1033, "time_per_iteration": 2.7912769317626953 }, { "auxiliary_loss_clip": 0.01135078, "auxiliary_loss_mlp": 0.01050138, "balance_loss_clip": 1.04941773, "balance_loss_mlp": 1.03854489, "epoch": 0.12433114892082006, "flos": 24498834353280.0, "grad_norm": 2.4920298503669795, "language_loss": 0.97180855, "learning_rate": 3.907479442944216e-06, "loss": 0.99366069, "num_input_tokens_seen": 21841140, "step": 1034, "time_per_iteration": 3.0524399280548096 }, { "auxiliary_loss_clip": 0.01222183, "auxiliary_loss_mlp": 0.01042276, "balance_loss_clip": 1.06443906, "balance_loss_mlp": 1.03123713, "epoch": 0.12445139181145914, "flos": 19682315838720.0, "grad_norm": 2.2645677634719936, "language_loss": 0.92359698, "learning_rate": 3.907245113240963e-06, "loss": 0.94624162, "num_input_tokens_seen": 21859260, "step": 1035, "time_per_iteration": 3.8346590995788574 }, { "auxiliary_loss_clip": 0.01186464, "auxiliary_loss_mlp": 0.01049417, "balance_loss_clip": 1.05460072, "balance_loss_mlp": 1.0370611, "epoch": 0.12457163470209824, "flos": 46423087522560.0, "grad_norm": 1.7959883945230293, "language_loss": 0.73786026, "learning_rate": 3.907010494210144e-06, "loss": 0.7602191, "num_input_tokens_seen": 21881920, "step": 1036, "time_per_iteration": 2.892209768295288 }, { "auxiliary_loss_clip": 0.01227789, "auxiliary_loss_mlp": 0.01035422, "balance_loss_clip": 1.06530428, "balance_loss_mlp": 1.02364993, "epoch": 0.12469187759273732, "flos": 20376289578240.0, "grad_norm": 2.2480390252644105, "language_loss": 0.91834772, "learning_rate": 3.9067755858873495e-06, "loss": 0.94097984, "num_input_tokens_seen": 21898720, "step": 1037, "time_per_iteration": 2.655383825302124 }, { "auxiliary_loss_clip": 0.01102238, "auxiliary_loss_mlp": 0.0101771, "balance_loss_clip": 1.02810752, "balance_loss_mlp": 1.01387143, "epoch": 0.12481212048337642, "flos": 69224641447680.0, "grad_norm": 0.866247275807406, "language_loss": 0.62776899, "learning_rate": 3.906540388308214e-06, "loss": 0.64896846, "num_input_tokens_seen": 21958305, "step": 1038, "time_per_iteration": 3.3002307415008545 }, { "auxiliary_loss_clip": 0.01160858, "auxiliary_loss_mlp": 0.01036866, "balance_loss_clip": 1.05657113, "balance_loss_mlp": 1.02579701, "epoch": 0.12493236337401552, "flos": 18223696350720.0, "grad_norm": 2.028309569430993, "language_loss": 0.81432915, "learning_rate": 3.906304901508417e-06, "loss": 0.83630633, "num_input_tokens_seen": 21977205, "step": 1039, "time_per_iteration": 2.698611259460449 }, { "auxiliary_loss_clip": 0.01226121, "auxiliary_loss_mlp": 0.01045218, "balance_loss_clip": 1.06595254, "balance_loss_mlp": 1.03415537, "epoch": 0.12505260626465461, "flos": 30044375303040.0, "grad_norm": 2.199682275757682, "language_loss": 0.75662541, "learning_rate": 3.9060691255236835e-06, "loss": 0.77933884, "num_input_tokens_seen": 21997770, "step": 1040, "time_per_iteration": 2.743642807006836 }, { "auxiliary_loss_clip": 0.01218266, "auxiliary_loss_mlp": 0.01050941, "balance_loss_clip": 1.0598948, "balance_loss_mlp": 1.03973496, "epoch": 0.1251728491552937, "flos": 24433980347520.0, "grad_norm": 1.7495995931455124, "language_loss": 0.80553901, "learning_rate": 3.905833060389778e-06, "loss": 0.8282311, "num_input_tokens_seen": 22021890, "step": 1041, "time_per_iteration": 2.711275815963745 }, { "auxiliary_loss_clip": 0.01240483, "auxiliary_loss_mlp": 0.00765839, "balance_loss_clip": 1.06730604, "balance_loss_mlp": 1.00025833, "epoch": 0.12529309204593278, "flos": 27119809952640.0, "grad_norm": 2.057146825837037, "language_loss": 0.78258127, "learning_rate": 3.905596706142513e-06, "loss": 0.80264449, "num_input_tokens_seen": 22043300, "step": 1042, "time_per_iteration": 2.7108614444732666 }, { "auxiliary_loss_clip": 0.01187402, "auxiliary_loss_mlp": 0.01039451, "balance_loss_clip": 1.0571909, "balance_loss_mlp": 1.02823925, "epoch": 0.12541333493657186, "flos": 30774151923840.0, "grad_norm": 2.0307786263864736, "language_loss": 0.85716456, "learning_rate": 3.9053600628177435e-06, "loss": 0.8794331, "num_input_tokens_seen": 22062910, "step": 1043, "time_per_iteration": 2.827369213104248 }, { "auxiliary_loss_clip": 0.01235281, "auxiliary_loss_mlp": 0.01035742, "balance_loss_clip": 1.06225777, "balance_loss_mlp": 1.02415442, "epoch": 0.12553357782721097, "flos": 23659566099840.0, "grad_norm": 1.9416982344498939, "language_loss": 0.84755492, "learning_rate": 3.905123130451367e-06, "loss": 0.87026525, "num_input_tokens_seen": 22084010, "step": 1044, "time_per_iteration": 2.6386501789093018 }, { "auxiliary_loss_clip": 0.01235743, "auxiliary_loss_mlp": 0.01042301, "balance_loss_clip": 1.06345475, "balance_loss_mlp": 1.03059435, "epoch": 0.12565382071785006, "flos": 24863758577280.0, "grad_norm": 2.8232633924260337, "language_loss": 0.78900093, "learning_rate": 3.904885909079326e-06, "loss": 0.81178141, "num_input_tokens_seen": 22102795, "step": 1045, "time_per_iteration": 2.615800142288208 }, { "auxiliary_loss_clip": 0.01222249, "auxiliary_loss_mlp": 0.01048381, "balance_loss_clip": 1.0627712, "balance_loss_mlp": 1.03635907, "epoch": 0.12577406360848914, "flos": 21360780518400.0, "grad_norm": 2.9764906229022428, "language_loss": 0.77890193, "learning_rate": 3.904648398737607e-06, "loss": 0.80160826, "num_input_tokens_seen": 22121360, "step": 1046, "time_per_iteration": 2.857059955596924 }, { "auxiliary_loss_clip": 0.01234725, "auxiliary_loss_mlp": 0.01036989, "balance_loss_clip": 1.06298566, "balance_loss_mlp": 1.02581882, "epoch": 0.12589430649912825, "flos": 36138056774400.0, "grad_norm": 1.8871588868468498, "language_loss": 0.78104854, "learning_rate": 3.9044105994622406e-06, "loss": 0.80376571, "num_input_tokens_seen": 22142505, "step": 1047, "time_per_iteration": 2.8072240352630615 }, { "auxiliary_loss_clip": 0.01211216, "auxiliary_loss_mlp": 0.00765871, "balance_loss_clip": 1.06199956, "balance_loss_mlp": 1.00024652, "epoch": 0.12601454938976733, "flos": 25337671643520.0, "grad_norm": 2.192955793430223, "language_loss": 0.81682098, "learning_rate": 3.9041725112893005e-06, "loss": 0.83659184, "num_input_tokens_seen": 22163730, "step": 1048, "time_per_iteration": 2.744117259979248 }, { "auxiliary_loss_clip": 0.01190946, "auxiliary_loss_mlp": 0.01043072, "balance_loss_clip": 1.06274557, "balance_loss_mlp": 1.03195572, "epoch": 0.12613479228040642, "flos": 15560094286080.0, "grad_norm": 2.6765530802726034, "language_loss": 0.75256962, "learning_rate": 3.903934134254904e-06, "loss": 0.77490985, "num_input_tokens_seen": 22181520, "step": 1049, "time_per_iteration": 2.733605146408081 }, { "auxiliary_loss_clip": 0.0122719, "auxiliary_loss_mlp": 0.01035764, "balance_loss_clip": 1.06268108, "balance_loss_mlp": 1.02390289, "epoch": 0.1262550351710455, "flos": 21470595373440.0, "grad_norm": 2.7045057625861, "language_loss": 0.85582596, "learning_rate": 3.903695468395213e-06, "loss": 0.87845552, "num_input_tokens_seen": 22199390, "step": 1050, "time_per_iteration": 2.7244279384613037 }, { "auxiliary_loss_clip": 0.01208956, "auxiliary_loss_mlp": 0.01042426, "balance_loss_clip": 1.05702412, "balance_loss_mlp": 1.0305171, "epoch": 0.1263752780616846, "flos": 31576719456000.0, "grad_norm": 3.6590693709016753, "language_loss": 0.56098348, "learning_rate": 3.903456513746434e-06, "loss": 0.58349729, "num_input_tokens_seen": 22220365, "step": 1051, "time_per_iteration": 2.882082462310791 }, { "auxiliary_loss_clip": 0.01236676, "auxiliary_loss_mlp": 0.01036846, "balance_loss_clip": 1.06408322, "balance_loss_mlp": 1.02667701, "epoch": 0.1264955209523237, "flos": 28768217927040.0, "grad_norm": 2.137058736443117, "language_loss": 0.87869394, "learning_rate": 3.903217270344815e-06, "loss": 0.90142918, "num_input_tokens_seen": 22240615, "step": 1052, "time_per_iteration": 2.7169365882873535 }, { "auxiliary_loss_clip": 0.01183918, "auxiliary_loss_mlp": 0.01041025, "balance_loss_clip": 1.05434382, "balance_loss_mlp": 1.02958727, "epoch": 0.12661576384296278, "flos": 29241125412480.0, "grad_norm": 3.2102904495582836, "language_loss": 0.82816863, "learning_rate": 3.902977738226648e-06, "loss": 0.85041803, "num_input_tokens_seen": 22261350, "step": 1053, "time_per_iteration": 2.761601686477661 }, { "auxiliary_loss_clip": 0.01222237, "auxiliary_loss_mlp": 0.01036744, "balance_loss_clip": 1.06330395, "balance_loss_mlp": 1.02572322, "epoch": 0.12673600673360189, "flos": 20850346298880.0, "grad_norm": 1.820013316374067, "language_loss": 0.91271973, "learning_rate": 3.902737917428273e-06, "loss": 0.93530953, "num_input_tokens_seen": 22279515, "step": 1054, "time_per_iteration": 2.636953592300415 }, { "auxiliary_loss_clip": 0.01236978, "auxiliary_loss_mlp": 0.01035642, "balance_loss_clip": 1.06408501, "balance_loss_mlp": 1.02413273, "epoch": 0.12685624962424097, "flos": 25263695583360.0, "grad_norm": 2.1988515504703687, "language_loss": 0.84212232, "learning_rate": 3.902497807986068e-06, "loss": 0.86484855, "num_input_tokens_seen": 22299535, "step": 1055, "time_per_iteration": 2.653538465499878 }, { "auxiliary_loss_clip": 0.011917, "auxiliary_loss_mlp": 0.01036745, "balance_loss_clip": 1.05470276, "balance_loss_mlp": 1.02516365, "epoch": 0.12697649251488005, "flos": 27527109246720.0, "grad_norm": 1.9942215181809144, "language_loss": 0.83942252, "learning_rate": 3.902257409936458e-06, "loss": 0.86170697, "num_input_tokens_seen": 22320300, "step": 1056, "time_per_iteration": 2.786736011505127 }, { "auxiliary_loss_clip": 0.01203202, "auxiliary_loss_mlp": 0.01035728, "balance_loss_clip": 1.06046379, "balance_loss_mlp": 1.02507615, "epoch": 0.12709673540551916, "flos": 21251863503360.0, "grad_norm": 2.3064929944173604, "language_loss": 0.84262311, "learning_rate": 3.902016723315912e-06, "loss": 0.86501241, "num_input_tokens_seen": 22338240, "step": 1057, "time_per_iteration": 3.560701847076416 }, { "auxiliary_loss_clip": 0.01216642, "auxiliary_loss_mlp": 0.01037329, "balance_loss_clip": 1.05772209, "balance_loss_mlp": 1.0265944, "epoch": 0.12721697829615825, "flos": 25337707557120.0, "grad_norm": 2.556425578173309, "language_loss": 0.69383103, "learning_rate": 3.901775748160941e-06, "loss": 0.71637076, "num_input_tokens_seen": 22357420, "step": 1058, "time_per_iteration": 2.6972365379333496 }, { "auxiliary_loss_clip": 0.01113968, "auxiliary_loss_mlp": 0.01016283, "balance_loss_clip": 1.03210139, "balance_loss_mlp": 1.01304078, "epoch": 0.12733722118679733, "flos": 61943287754880.0, "grad_norm": 0.798876760146273, "language_loss": 0.60854244, "learning_rate": 3.901534484508101e-06, "loss": 0.62984502, "num_input_tokens_seen": 22420095, "step": 1059, "time_per_iteration": 3.2554681301116943 }, { "auxiliary_loss_clip": 0.01192901, "auxiliary_loss_mlp": 0.01035464, "balance_loss_clip": 1.05592012, "balance_loss_mlp": 1.0240432, "epoch": 0.1274574640774364, "flos": 26976742081920.0, "grad_norm": 1.766505421912866, "language_loss": 0.74543524, "learning_rate": 3.901292932393991e-06, "loss": 0.76771891, "num_input_tokens_seen": 22438975, "step": 1060, "time_per_iteration": 2.769479990005493 }, { "auxiliary_loss_clip": 0.01237651, "auxiliary_loss_mlp": 0.01045327, "balance_loss_clip": 1.06697166, "balance_loss_mlp": 1.03396678, "epoch": 0.12757770696807552, "flos": 22236318529920.0, "grad_norm": 2.9230540707977415, "language_loss": 0.85324538, "learning_rate": 3.9010510918552555e-06, "loss": 0.87607515, "num_input_tokens_seen": 22458050, "step": 1061, "time_per_iteration": 4.85273289680481 }, { "auxiliary_loss_clip": 0.01197541, "auxiliary_loss_mlp": 0.01044727, "balance_loss_clip": 1.05429363, "balance_loss_mlp": 1.03251982, "epoch": 0.1276979498587146, "flos": 28547905858560.0, "grad_norm": 2.4842629684057704, "language_loss": 0.74904287, "learning_rate": 3.900808962928581e-06, "loss": 0.7714656, "num_input_tokens_seen": 22475665, "step": 1062, "time_per_iteration": 2.7578938007354736 }, { "auxiliary_loss_clip": 0.0123426, "auxiliary_loss_mlp": 0.01040378, "balance_loss_clip": 1.06349015, "balance_loss_mlp": 1.02972043, "epoch": 0.1278181927493537, "flos": 17420338719360.0, "grad_norm": 2.3770196167955473, "language_loss": 0.89829278, "learning_rate": 3.900566545650698e-06, "loss": 0.92103916, "num_input_tokens_seen": 22493335, "step": 1063, "time_per_iteration": 2.6138265132904053 }, { "auxiliary_loss_clip": 0.01219617, "auxiliary_loss_mlp": 0.01039962, "balance_loss_clip": 1.06359696, "balance_loss_mlp": 1.0277853, "epoch": 0.1279384356399928, "flos": 21138636856320.0, "grad_norm": 2.674173751967933, "language_loss": 0.81823325, "learning_rate": 3.900323840058381e-06, "loss": 0.84082901, "num_input_tokens_seen": 22511045, "step": 1064, "time_per_iteration": 2.707716703414917 }, { "auxiliary_loss_clip": 0.01219082, "auxiliary_loss_mlp": 0.0103505, "balance_loss_clip": 1.05900753, "balance_loss_mlp": 1.02430308, "epoch": 0.12805867853063188, "flos": 26576733248640.0, "grad_norm": 1.8350418296813011, "language_loss": 0.81643486, "learning_rate": 3.900080846188449e-06, "loss": 0.8389762, "num_input_tokens_seen": 22529635, "step": 1065, "time_per_iteration": 2.669092893600464 }, { "auxiliary_loss_clip": 0.01233326, "auxiliary_loss_mlp": 0.01040927, "balance_loss_clip": 1.06184471, "balance_loss_mlp": 1.02911937, "epoch": 0.12817892142127096, "flos": 16436206915200.0, "grad_norm": 1.9865116618456167, "language_loss": 0.81413066, "learning_rate": 3.8998375640777625e-06, "loss": 0.83687317, "num_input_tokens_seen": 22547505, "step": 1066, "time_per_iteration": 2.6514127254486084 }, { "auxiliary_loss_clip": 0.01130801, "auxiliary_loss_mlp": 0.01007337, "balance_loss_clip": 1.04696548, "balance_loss_mlp": 1.00376117, "epoch": 0.12829916431191005, "flos": 60757049099520.0, "grad_norm": 0.8071888930315945, "language_loss": 0.52612066, "learning_rate": 3.899593993763229e-06, "loss": 0.54750204, "num_input_tokens_seen": 22608465, "step": 1067, "time_per_iteration": 3.2835206985473633 }, { "auxiliary_loss_clip": 0.01181333, "auxiliary_loss_mlp": 0.01042632, "balance_loss_clip": 1.05670452, "balance_loss_mlp": 1.03140831, "epoch": 0.12841940720254916, "flos": 29786895636480.0, "grad_norm": 2.160559809724421, "language_loss": 0.81916666, "learning_rate": 3.899350135281796e-06, "loss": 0.84140635, "num_input_tokens_seen": 22629465, "step": 1068, "time_per_iteration": 2.830223560333252 }, { "auxiliary_loss_clip": 0.01193624, "auxiliary_loss_mlp": 0.01039066, "balance_loss_clip": 1.05892146, "balance_loss_mlp": 1.02828896, "epoch": 0.12853965009318824, "flos": 25951851319680.0, "grad_norm": 2.014659319365307, "language_loss": 0.79925734, "learning_rate": 3.8991059886704585e-06, "loss": 0.82158422, "num_input_tokens_seen": 22648970, "step": 1069, "time_per_iteration": 2.8045356273651123 }, { "auxiliary_loss_clip": 0.0118362, "auxiliary_loss_mlp": 0.01040102, "balance_loss_clip": 1.05872977, "balance_loss_mlp": 1.02861023, "epoch": 0.12865989298382732, "flos": 30846871008000.0, "grad_norm": 2.150975240117665, "language_loss": 0.82911611, "learning_rate": 3.898861553966252e-06, "loss": 0.85135329, "num_input_tokens_seen": 22668620, "step": 1070, "time_per_iteration": 2.8105504512786865 }, { "auxiliary_loss_clip": 0.01142522, "auxiliary_loss_mlp": 0.01039841, "balance_loss_clip": 1.05012536, "balance_loss_mlp": 1.02904654, "epoch": 0.12878013587446643, "flos": 25885776251520.0, "grad_norm": 1.620895140793095, "language_loss": 0.88055265, "learning_rate": 3.898616831206257e-06, "loss": 0.90237629, "num_input_tokens_seen": 22689045, "step": 1071, "time_per_iteration": 2.937422275543213 }, { "auxiliary_loss_clip": 0.01181227, "auxiliary_loss_mlp": 0.01045001, "balance_loss_clip": 1.05263722, "balance_loss_mlp": 1.03314543, "epoch": 0.12890037876510552, "flos": 23333138277120.0, "grad_norm": 1.9696164750286227, "language_loss": 0.76916087, "learning_rate": 3.8983718204276e-06, "loss": 0.7914232, "num_input_tokens_seen": 22711265, "step": 1072, "time_per_iteration": 2.799363613128662 }, { "auxiliary_loss_clip": 0.01204179, "auxiliary_loss_mlp": 0.01040309, "balance_loss_clip": 1.05744338, "balance_loss_mlp": 1.0291748, "epoch": 0.1290206216557446, "flos": 23587242065280.0, "grad_norm": 3.9786945919320647, "language_loss": 0.82347155, "learning_rate": 3.898126521667446e-06, "loss": 0.84591645, "num_input_tokens_seen": 22731420, "step": 1073, "time_per_iteration": 2.7921648025512695 }, { "auxiliary_loss_clip": 0.01220672, "auxiliary_loss_mlp": 0.01040077, "balance_loss_clip": 1.06165624, "balance_loss_mlp": 1.02827537, "epoch": 0.12914086454638368, "flos": 24170610850560.0, "grad_norm": 1.6770865917990314, "language_loss": 0.83236742, "learning_rate": 3.897880934963007e-06, "loss": 0.85497493, "num_input_tokens_seen": 22750970, "step": 1074, "time_per_iteration": 2.6896181106567383 }, { "auxiliary_loss_clip": 0.01198033, "auxiliary_loss_mlp": 0.01042275, "balance_loss_clip": 1.05532527, "balance_loss_mlp": 1.03106952, "epoch": 0.1292611074370228, "flos": 20267157081600.0, "grad_norm": 2.3132023180962418, "language_loss": 0.7876969, "learning_rate": 3.89763506035154e-06, "loss": 0.81009996, "num_input_tokens_seen": 22768820, "step": 1075, "time_per_iteration": 2.739361047744751 }, { "auxiliary_loss_clip": 0.01206987, "auxiliary_loss_mlp": 0.0103626, "balance_loss_clip": 1.05725479, "balance_loss_mlp": 1.02519774, "epoch": 0.12938135032766188, "flos": 27377684668800.0, "grad_norm": 1.7018010414803082, "language_loss": 0.81401211, "learning_rate": 3.897388897870343e-06, "loss": 0.83644462, "num_input_tokens_seen": 22789460, "step": 1076, "time_per_iteration": 2.751187562942505 }, { "auxiliary_loss_clip": 0.01213479, "auxiliary_loss_mlp": 0.01040322, "balance_loss_clip": 1.05825496, "balance_loss_mlp": 1.02873468, "epoch": 0.12950159321830096, "flos": 29277107861760.0, "grad_norm": 3.708839246961591, "language_loss": 0.75073683, "learning_rate": 3.89714244755676e-06, "loss": 0.77327484, "num_input_tokens_seen": 22810820, "step": 1077, "time_per_iteration": 2.834130048751831 }, { "auxiliary_loss_clip": 0.01155544, "auxiliary_loss_mlp": 0.01045835, "balance_loss_clip": 1.04715598, "balance_loss_mlp": 1.03367567, "epoch": 0.12962183610894007, "flos": 24534888629760.0, "grad_norm": 2.7983480906951224, "language_loss": 0.86467409, "learning_rate": 3.896895709448175e-06, "loss": 0.88668787, "num_input_tokens_seen": 22830570, "step": 1078, "time_per_iteration": 2.7367212772369385 }, { "auxiliary_loss_clip": 0.0115194, "auxiliary_loss_mlp": 0.01042561, "balance_loss_clip": 1.04961538, "balance_loss_mlp": 1.0310514, "epoch": 0.12974207899957915, "flos": 11215944552960.0, "grad_norm": 3.0087592756580124, "language_loss": 0.77225995, "learning_rate": 3.896648683582019e-06, "loss": 0.79420495, "num_input_tokens_seen": 22845905, "step": 1079, "time_per_iteration": 2.7789998054504395 }, { "auxiliary_loss_clip": 0.01174121, "auxiliary_loss_mlp": 0.01037101, "balance_loss_clip": 1.05867374, "balance_loss_mlp": 1.0264194, "epoch": 0.12986232189021824, "flos": 24717889445760.0, "grad_norm": 2.182101251924361, "language_loss": 0.80719298, "learning_rate": 3.896401369995766e-06, "loss": 0.82930523, "num_input_tokens_seen": 22865710, "step": 1080, "time_per_iteration": 2.873300552368164 }, { "auxiliary_loss_clip": 0.01233011, "auxiliary_loss_mlp": 0.01037571, "balance_loss_clip": 1.06226301, "balance_loss_mlp": 1.02643728, "epoch": 0.12998256478085732, "flos": 23915357827200.0, "grad_norm": 1.7951023402405335, "language_loss": 0.79566765, "learning_rate": 3.896153768726932e-06, "loss": 0.81837344, "num_input_tokens_seen": 22886020, "step": 1081, "time_per_iteration": 2.732997417449951 }, { "auxiliary_loss_clip": 0.01221374, "auxiliary_loss_mlp": 0.01039582, "balance_loss_clip": 1.06381392, "balance_loss_mlp": 1.02795959, "epoch": 0.13010280767149643, "flos": 18624207974400.0, "grad_norm": 2.280801468611434, "language_loss": 0.88251865, "learning_rate": 3.8959058798130806e-06, "loss": 0.90512824, "num_input_tokens_seen": 22903995, "step": 1082, "time_per_iteration": 2.659595251083374 }, { "auxiliary_loss_clip": 0.01201629, "auxiliary_loss_mlp": 0.00765898, "balance_loss_clip": 1.05810475, "balance_loss_mlp": 1.00033975, "epoch": 0.1302230505621355, "flos": 22783992174720.0, "grad_norm": 1.756937336143744, "language_loss": 0.74862212, "learning_rate": 3.895657703291814e-06, "loss": 0.76829737, "num_input_tokens_seen": 22924100, "step": 1083, "time_per_iteration": 3.6165482997894287 }, { "auxiliary_loss_clip": 0.01210244, "auxiliary_loss_mlp": 0.01036063, "balance_loss_clip": 1.05787182, "balance_loss_mlp": 1.02393985, "epoch": 0.1303432934527746, "flos": 21323612920320.0, "grad_norm": 3.2739882441738, "language_loss": 0.7936691, "learning_rate": 3.895409239200781e-06, "loss": 0.81613219, "num_input_tokens_seen": 22939985, "step": 1084, "time_per_iteration": 2.6997861862182617 }, { "auxiliary_loss_clip": 0.01207569, "auxiliary_loss_mlp": 0.01038799, "balance_loss_clip": 1.05651629, "balance_loss_mlp": 1.02730763, "epoch": 0.1304635363434137, "flos": 20922490765440.0, "grad_norm": 2.341261451370678, "language_loss": 0.91384739, "learning_rate": 3.895160487577673e-06, "loss": 0.93631113, "num_input_tokens_seen": 22957555, "step": 1085, "time_per_iteration": 2.704479694366455 }, { "auxiliary_loss_clip": 0.0111998, "auxiliary_loss_mlp": 0.0101245, "balance_loss_clip": 1.03094923, "balance_loss_mlp": 1.00873089, "epoch": 0.1305837792340528, "flos": 63245659080960.0, "grad_norm": 0.7894819968500649, "language_loss": 0.60918617, "learning_rate": 3.894911448460226e-06, "loss": 0.63051045, "num_input_tokens_seen": 23016870, "step": 1086, "time_per_iteration": 3.1110501289367676 }, { "auxiliary_loss_clip": 0.01118464, "auxiliary_loss_mlp": 0.01039785, "balance_loss_clip": 1.04593015, "balance_loss_mlp": 1.02813816, "epoch": 0.13070402212469187, "flos": 26428852955520.0, "grad_norm": 2.1994215901221303, "language_loss": 0.72558993, "learning_rate": 3.8946621218862195e-06, "loss": 0.74717236, "num_input_tokens_seen": 23037870, "step": 1087, "time_per_iteration": 4.6585118770599365 }, { "auxiliary_loss_clip": 0.01184638, "auxiliary_loss_mlp": 0.01034235, "balance_loss_clip": 1.05637622, "balance_loss_mlp": 1.02360749, "epoch": 0.13082426501533098, "flos": 27673409341440.0, "grad_norm": 1.9108873457378215, "language_loss": 0.88826632, "learning_rate": 3.894412507893475e-06, "loss": 0.91045499, "num_input_tokens_seen": 23058150, "step": 1088, "time_per_iteration": 2.8063507080078125 }, { "auxiliary_loss_clip": 0.01177255, "auxiliary_loss_mlp": 0.01038468, "balance_loss_clip": 1.05203068, "balance_loss_mlp": 1.02625453, "epoch": 0.13094450790597006, "flos": 24826770547200.0, "grad_norm": 4.639505596642658, "language_loss": 0.72179878, "learning_rate": 3.894162606519859e-06, "loss": 0.74395597, "num_input_tokens_seen": 23077100, "step": 1089, "time_per_iteration": 2.7616124153137207 }, { "auxiliary_loss_clip": 0.01172562, "auxiliary_loss_mlp": 0.01041618, "balance_loss_clip": 1.05457127, "balance_loss_mlp": 1.03031731, "epoch": 0.13106475079660915, "flos": 19062605468160.0, "grad_norm": 2.0353053773280845, "language_loss": 0.77093685, "learning_rate": 3.893912417803282e-06, "loss": 0.79307866, "num_input_tokens_seen": 23096815, "step": 1090, "time_per_iteration": 2.8263602256774902 }, { "auxiliary_loss_clip": 0.01165562, "auxiliary_loss_mlp": 0.01039427, "balance_loss_clip": 1.04913771, "balance_loss_mlp": 1.02798295, "epoch": 0.13118499368724823, "flos": 28913189218560.0, "grad_norm": 2.2737363490734106, "language_loss": 0.76612949, "learning_rate": 3.8936619417816975e-06, "loss": 0.7881794, "num_input_tokens_seen": 23117145, "step": 1091, "time_per_iteration": 2.8181614875793457 }, { "auxiliary_loss_clip": 0.01187025, "auxiliary_loss_mlp": 0.010471, "balance_loss_clip": 1.05856049, "balance_loss_mlp": 1.03533411, "epoch": 0.13130523657788734, "flos": 14283398206080.0, "grad_norm": 1.968124414740172, "language_loss": 0.71391261, "learning_rate": 3.8934111784931015e-06, "loss": 0.73625386, "num_input_tokens_seen": 23134595, "step": 1092, "time_per_iteration": 2.6925787925720215 }, { "auxiliary_loss_clip": 0.01109423, "auxiliary_loss_mlp": 0.01006085, "balance_loss_clip": 1.02961278, "balance_loss_mlp": 1.00234187, "epoch": 0.13142547946852642, "flos": 70174155519360.0, "grad_norm": 0.9159114935045927, "language_loss": 0.59043086, "learning_rate": 3.893160127975535e-06, "loss": 0.61158592, "num_input_tokens_seen": 23195285, "step": 1093, "time_per_iteration": 3.3218305110931396 }, { "auxiliary_loss_clip": 0.01178296, "auxiliary_loss_mlp": 0.01036219, "balance_loss_clip": 1.05390394, "balance_loss_mlp": 1.02592492, "epoch": 0.1315457223591655, "flos": 45805998844800.0, "grad_norm": 2.700905205803401, "language_loss": 0.81337857, "learning_rate": 3.8929087902670826e-06, "loss": 0.83552372, "num_input_tokens_seen": 23216915, "step": 1094, "time_per_iteration": 3.0302677154541016 }, { "auxiliary_loss_clip": 0.0112343, "auxiliary_loss_mlp": 0.01003488, "balance_loss_clip": 1.02861845, "balance_loss_mlp": 0.99988824, "epoch": 0.13166596524980462, "flos": 62881165820160.0, "grad_norm": 0.9420288168330911, "language_loss": 0.60766435, "learning_rate": 3.8926571654058715e-06, "loss": 0.62893343, "num_input_tokens_seen": 23273560, "step": 1095, "time_per_iteration": 3.179468870162964 }, { "auxiliary_loss_clip": 0.01182283, "auxiliary_loss_mlp": 0.01044188, "balance_loss_clip": 1.05502522, "balance_loss_mlp": 1.03344774, "epoch": 0.1317862081404437, "flos": 23586523793280.0, "grad_norm": 2.3637812483979315, "language_loss": 0.77033162, "learning_rate": 3.892405253430074e-06, "loss": 0.79259634, "num_input_tokens_seen": 23291080, "step": 1096, "time_per_iteration": 2.7917513847351074 }, { "auxiliary_loss_clip": 0.01198824, "auxiliary_loss_mlp": 0.00765316, "balance_loss_clip": 1.05416965, "balance_loss_mlp": 1.00035751, "epoch": 0.13190645103108278, "flos": 20260764460800.0, "grad_norm": 1.924255025083963, "language_loss": 0.82513064, "learning_rate": 3.892153054377904e-06, "loss": 0.84477204, "num_input_tokens_seen": 23308485, "step": 1097, "time_per_iteration": 2.7591538429260254 }, { "auxiliary_loss_clip": 0.0106837, "auxiliary_loss_mlp": 0.0101432, "balance_loss_clip": 1.02872324, "balance_loss_mlp": 1.01079178, "epoch": 0.13202669392172187, "flos": 53455440136320.0, "grad_norm": 0.9429081832227368, "language_loss": 0.59368134, "learning_rate": 3.891900568287619e-06, "loss": 0.61450827, "num_input_tokens_seen": 23360870, "step": 1098, "time_per_iteration": 3.1594460010528564 }, { "auxiliary_loss_clip": 0.0118799, "auxiliary_loss_mlp": 0.01040992, "balance_loss_clip": 1.05458307, "balance_loss_mlp": 1.02978635, "epoch": 0.13214693681236098, "flos": 15851293845120.0, "grad_norm": 2.6126593444008064, "language_loss": 0.72153783, "learning_rate": 3.891647795197523e-06, "loss": 0.7438277, "num_input_tokens_seen": 23376910, "step": 1099, "time_per_iteration": 2.7069716453552246 }, { "auxiliary_loss_clip": 0.01188433, "auxiliary_loss_mlp": 0.01042092, "balance_loss_clip": 1.05194664, "balance_loss_mlp": 1.03023672, "epoch": 0.13226717970300006, "flos": 19353840940800.0, "grad_norm": 8.599552645340156, "language_loss": 0.68919945, "learning_rate": 3.8913947351459605e-06, "loss": 0.7115047, "num_input_tokens_seen": 23394450, "step": 1100, "time_per_iteration": 2.7359695434570312 }, { "auxiliary_loss_clip": 0.01230747, "auxiliary_loss_mlp": 0.01035633, "balance_loss_clip": 1.06164312, "balance_loss_mlp": 1.02514899, "epoch": 0.13238742259363914, "flos": 20698084546560.0, "grad_norm": 1.8958803167784046, "language_loss": 0.6770587, "learning_rate": 3.89114138817132e-06, "loss": 0.69972253, "num_input_tokens_seen": 23411115, "step": 1101, "time_per_iteration": 2.6670451164245605 }, { "auxiliary_loss_clip": 0.01212073, "auxiliary_loss_mlp": 0.0104838, "balance_loss_clip": 1.05928802, "balance_loss_mlp": 1.03783584, "epoch": 0.13250766548427825, "flos": 21032449274880.0, "grad_norm": 2.1332691254620326, "language_loss": 0.84176129, "learning_rate": 3.890887754312035e-06, "loss": 0.86436582, "num_input_tokens_seen": 23429360, "step": 1102, "time_per_iteration": 2.7282679080963135 }, { "auxiliary_loss_clip": 0.0119349, "auxiliary_loss_mlp": 0.0103637, "balance_loss_clip": 1.05276346, "balance_loss_mlp": 1.02558792, "epoch": 0.13262790837491734, "flos": 22637871648000.0, "grad_norm": 2.1614699592731084, "language_loss": 0.87940717, "learning_rate": 3.890633833606581e-06, "loss": 0.90170586, "num_input_tokens_seen": 23449050, "step": 1103, "time_per_iteration": 2.740064859390259 }, { "auxiliary_loss_clip": 0.01219616, "auxiliary_loss_mlp": 0.01038827, "balance_loss_clip": 1.06710124, "balance_loss_mlp": 1.02788973, "epoch": 0.13274815126555642, "flos": 19683141851520.0, "grad_norm": 2.4230482170485046, "language_loss": 0.69789934, "learning_rate": 3.890379626093477e-06, "loss": 0.72048378, "num_input_tokens_seen": 23468800, "step": 1104, "time_per_iteration": 2.725937604904175 }, { "auxiliary_loss_clip": 0.01155155, "auxiliary_loss_mlp": 0.01039585, "balance_loss_clip": 1.05263281, "balance_loss_mlp": 1.02857614, "epoch": 0.1328683941561955, "flos": 21317687176320.0, "grad_norm": 5.961143744005907, "language_loss": 0.92816705, "learning_rate": 3.890125131811287e-06, "loss": 0.95011449, "num_input_tokens_seen": 23486850, "step": 1105, "time_per_iteration": 2.753812313079834 }, { "auxiliary_loss_clip": 0.01187978, "auxiliary_loss_mlp": 0.01026744, "balance_loss_clip": 1.05181253, "balance_loss_mlp": 1.01636147, "epoch": 0.1329886370468346, "flos": 13699131580800.0, "grad_norm": 1.9337355698622487, "language_loss": 0.75507557, "learning_rate": 3.889870350798618e-06, "loss": 0.77722275, "num_input_tokens_seen": 23504195, "step": 1106, "time_per_iteration": 2.675607919692993 }, { "auxiliary_loss_clip": 0.01229542, "auxiliary_loss_mlp": 0.01036862, "balance_loss_clip": 1.06031942, "balance_loss_mlp": 1.02615142, "epoch": 0.1331088799374737, "flos": 21032413361280.0, "grad_norm": 1.594137305097391, "language_loss": 0.78603721, "learning_rate": 3.889615283094119e-06, "loss": 0.80870122, "num_input_tokens_seen": 23523385, "step": 1107, "time_per_iteration": 2.5718166828155518 }, { "auxiliary_loss_clip": 0.01233561, "auxiliary_loss_mlp": 0.01044299, "balance_loss_clip": 1.06054628, "balance_loss_mlp": 1.03256917, "epoch": 0.13322912282811278, "flos": 18260432985600.0, "grad_norm": 2.727846927857355, "language_loss": 0.84934723, "learning_rate": 3.889359928736485e-06, "loss": 0.87212574, "num_input_tokens_seen": 23541330, "step": 1108, "time_per_iteration": 2.651667594909668 }, { "auxiliary_loss_clip": 0.01200675, "auxiliary_loss_mlp": 0.0076558, "balance_loss_clip": 1.06160474, "balance_loss_mlp": 1.0002625, "epoch": 0.1333493657187519, "flos": 24460876656000.0, "grad_norm": 2.412021822735133, "language_loss": 0.91338766, "learning_rate": 3.889104287764451e-06, "loss": 0.93305016, "num_input_tokens_seen": 23561705, "step": 1109, "time_per_iteration": 3.6379072666168213 }, { "auxiliary_loss_clip": 0.01194874, "auxiliary_loss_mlp": 0.01037843, "balance_loss_clip": 1.05716181, "balance_loss_mlp": 1.02661324, "epoch": 0.13346960860939097, "flos": 22158930677760.0, "grad_norm": 2.0191153906746653, "language_loss": 0.90590143, "learning_rate": 3.888848360216798e-06, "loss": 0.92822862, "num_input_tokens_seen": 23579350, "step": 1110, "time_per_iteration": 2.685408115386963 }, { "auxiliary_loss_clip": 0.01114857, "auxiliary_loss_mlp": 0.01016668, "balance_loss_clip": 1.0304563, "balance_loss_mlp": 1.01323438, "epoch": 0.13358985150003005, "flos": 67931212608000.0, "grad_norm": 0.8156939426202992, "language_loss": 0.56590152, "learning_rate": 3.888592146132351e-06, "loss": 0.58721685, "num_input_tokens_seen": 23640620, "step": 1111, "time_per_iteration": 3.3684709072113037 }, { "auxiliary_loss_clip": 0.01215355, "auxiliary_loss_mlp": 0.0103084, "balance_loss_clip": 1.06140423, "balance_loss_mlp": 1.02018881, "epoch": 0.13371009439066917, "flos": 26834284742400.0, "grad_norm": 2.239995209620398, "language_loss": 0.7872861, "learning_rate": 3.888335645549978e-06, "loss": 0.80974805, "num_input_tokens_seen": 23661040, "step": 1112, "time_per_iteration": 2.7217676639556885 }, { "auxiliary_loss_clip": 0.01234141, "auxiliary_loss_mlp": 0.01044273, "balance_loss_clip": 1.06363797, "balance_loss_mlp": 1.03347874, "epoch": 0.13383033728130825, "flos": 26322844942080.0, "grad_norm": 2.8639483862568844, "language_loss": 0.81577265, "learning_rate": 3.888078858508588e-06, "loss": 0.83855677, "num_input_tokens_seen": 23680900, "step": 1113, "time_per_iteration": 4.483829021453857 }, { "auxiliary_loss_clip": 0.01198444, "auxiliary_loss_mlp": 0.01035513, "balance_loss_clip": 1.06125247, "balance_loss_mlp": 1.02426529, "epoch": 0.13395058017194733, "flos": 22563931501440.0, "grad_norm": 2.1132454005450003, "language_loss": 0.84611714, "learning_rate": 3.8878217850471365e-06, "loss": 0.86845672, "num_input_tokens_seen": 23700815, "step": 1114, "time_per_iteration": 2.7248823642730713 }, { "auxiliary_loss_clip": 0.01238173, "auxiliary_loss_mlp": 0.01045272, "balance_loss_clip": 1.06580782, "balance_loss_mlp": 1.03391171, "epoch": 0.13407082306258641, "flos": 25810938264960.0, "grad_norm": 1.7238083315414132, "language_loss": 0.73686063, "learning_rate": 3.887564425204621e-06, "loss": 0.75969517, "num_input_tokens_seen": 23722500, "step": 1115, "time_per_iteration": 2.6231260299682617 }, { "auxiliary_loss_clip": 0.01093754, "auxiliary_loss_mlp": 0.01006334, "balance_loss_clip": 1.02726567, "balance_loss_mlp": 1.00294876, "epoch": 0.13419106595322552, "flos": 68338365269760.0, "grad_norm": 0.8383876770233804, "language_loss": 0.54665816, "learning_rate": 3.887306779020083e-06, "loss": 0.56765902, "num_input_tokens_seen": 23777155, "step": 1116, "time_per_iteration": 3.2276229858398438 }, { "auxiliary_loss_clip": 0.01219183, "auxiliary_loss_mlp": 0.01041377, "balance_loss_clip": 1.06195617, "balance_loss_mlp": 1.02999878, "epoch": 0.1343113088438646, "flos": 20449080489600.0, "grad_norm": 2.2046856556632224, "language_loss": 0.70686364, "learning_rate": 3.887048846532608e-06, "loss": 0.72946918, "num_input_tokens_seen": 23794130, "step": 1117, "time_per_iteration": 2.6227176189422607 }, { "auxiliary_loss_clip": 0.01094477, "auxiliary_loss_mlp": 0.01004296, "balance_loss_clip": 1.02672219, "balance_loss_mlp": 1.00086308, "epoch": 0.1344315517345037, "flos": 67389784951680.0, "grad_norm": 0.7909576807685992, "language_loss": 0.5811367, "learning_rate": 3.8867906277813224e-06, "loss": 0.60212445, "num_input_tokens_seen": 23852285, "step": 1118, "time_per_iteration": 3.170527696609497 }, { "auxiliary_loss_clip": 0.01220801, "auxiliary_loss_mlp": 0.00765383, "balance_loss_clip": 1.06221616, "balance_loss_mlp": 1.00063288, "epoch": 0.1345517946251428, "flos": 40734442788480.0, "grad_norm": 2.1802861608348363, "language_loss": 0.74295092, "learning_rate": 3.886532122805399e-06, "loss": 0.76281273, "num_input_tokens_seen": 23874765, "step": 1119, "time_per_iteration": 2.8791720867156982 }, { "auxiliary_loss_clip": 0.01144738, "auxiliary_loss_mlp": 0.01042451, "balance_loss_clip": 1.05097055, "balance_loss_mlp": 1.03069723, "epoch": 0.13467203751578188, "flos": 22816850140800.0, "grad_norm": 2.25042591410831, "language_loss": 0.8974508, "learning_rate": 3.886273331644053e-06, "loss": 0.91932273, "num_input_tokens_seen": 23893635, "step": 1120, "time_per_iteration": 2.7626118659973145 }, { "auxiliary_loss_clip": 0.01164678, "auxiliary_loss_mlp": 0.01033792, "balance_loss_clip": 1.05545461, "balance_loss_mlp": 1.02331352, "epoch": 0.13479228040642097, "flos": 17091576512640.0, "grad_norm": 1.9161704471879897, "language_loss": 0.82289207, "learning_rate": 3.886014254336542e-06, "loss": 0.84487683, "num_input_tokens_seen": 23910110, "step": 1121, "time_per_iteration": 2.8524169921875 }, { "auxiliary_loss_clip": 0.01211465, "auxiliary_loss_mlp": 0.01040985, "balance_loss_clip": 1.05755019, "balance_loss_mlp": 1.03028643, "epoch": 0.13491252329706005, "flos": 23730525417600.0, "grad_norm": 1.6925843210919918, "language_loss": 0.92409372, "learning_rate": 3.885754890922168e-06, "loss": 0.94661826, "num_input_tokens_seen": 23930440, "step": 1122, "time_per_iteration": 2.7263529300689697 }, { "auxiliary_loss_clip": 0.01130127, "auxiliary_loss_mlp": 0.01037053, "balance_loss_clip": 1.05130744, "balance_loss_mlp": 1.02582991, "epoch": 0.13503276618769916, "flos": 34127058960000.0, "grad_norm": 1.8272213429318391, "language_loss": 0.78646427, "learning_rate": 3.885495241440277e-06, "loss": 0.80813605, "num_input_tokens_seen": 23954535, "step": 1123, "time_per_iteration": 2.904585361480713 }, { "auxiliary_loss_clip": 0.01233631, "auxiliary_loss_mlp": 0.01036961, "balance_loss_clip": 1.0627377, "balance_loss_mlp": 1.02583325, "epoch": 0.13515300907833824, "flos": 17712328377600.0, "grad_norm": 2.3698409967968836, "language_loss": 0.74266928, "learning_rate": 3.885235305930257e-06, "loss": 0.7653752, "num_input_tokens_seen": 23972735, "step": 1124, "time_per_iteration": 2.5818779468536377 }, { "auxiliary_loss_clip": 0.01178054, "auxiliary_loss_mlp": 0.01040764, "balance_loss_clip": 1.05522275, "balance_loss_mlp": 1.03000534, "epoch": 0.13527325196897733, "flos": 20260872201600.0, "grad_norm": 4.456976906464268, "language_loss": 0.85422242, "learning_rate": 3.884975084431539e-06, "loss": 0.8764106, "num_input_tokens_seen": 23987685, "step": 1125, "time_per_iteration": 2.7775962352752686 }, { "auxiliary_loss_clip": 0.01205798, "auxiliary_loss_mlp": 0.00765478, "balance_loss_clip": 1.05913711, "balance_loss_mlp": 1.00073028, "epoch": 0.13539349485961644, "flos": 18186492839040.0, "grad_norm": 2.183887364763836, "language_loss": 0.92090023, "learning_rate": 3.8847145769836e-06, "loss": 0.94061297, "num_input_tokens_seen": 24004105, "step": 1126, "time_per_iteration": 2.8285653591156006 }, { "auxiliary_loss_clip": 0.01232596, "auxiliary_loss_mlp": 0.0103485, "balance_loss_clip": 1.06419539, "balance_loss_mlp": 1.02397203, "epoch": 0.13551373775025552, "flos": 19317463441920.0, "grad_norm": 2.3048519473988374, "language_loss": 0.66589153, "learning_rate": 3.884453783625959e-06, "loss": 0.68856609, "num_input_tokens_seen": 24021715, "step": 1127, "time_per_iteration": 2.595576286315918 }, { "auxiliary_loss_clip": 0.01198459, "auxiliary_loss_mlp": 0.01042349, "balance_loss_clip": 1.05889177, "balance_loss_mlp": 1.03148329, "epoch": 0.1356339806408946, "flos": 20850813175680.0, "grad_norm": 2.436872275808764, "language_loss": 0.84546453, "learning_rate": 3.884192704398176e-06, "loss": 0.8678726, "num_input_tokens_seen": 24038915, "step": 1128, "time_per_iteration": 2.698815107345581 }, { "auxiliary_loss_clip": 0.01216812, "auxiliary_loss_mlp": 0.01043699, "balance_loss_clip": 1.05916715, "balance_loss_mlp": 1.03323877, "epoch": 0.13575422353153369, "flos": 50476037696640.0, "grad_norm": 1.700540788388767, "language_loss": 0.74835658, "learning_rate": 3.883931339339858e-06, "loss": 0.7709617, "num_input_tokens_seen": 24063300, "step": 1129, "time_per_iteration": 2.8856260776519775 }, { "auxiliary_loss_clip": 0.01225187, "auxiliary_loss_mlp": 0.01035405, "balance_loss_clip": 1.06255329, "balance_loss_mlp": 1.02425933, "epoch": 0.1358744664221728, "flos": 18150797698560.0, "grad_norm": 2.2860937054266564, "language_loss": 0.78828573, "learning_rate": 3.883669688490654e-06, "loss": 0.81089163, "num_input_tokens_seen": 24081070, "step": 1130, "time_per_iteration": 2.690363883972168 }, { "auxiliary_loss_clip": 0.01190336, "auxiliary_loss_mlp": 0.00765934, "balance_loss_clip": 1.05718076, "balance_loss_mlp": 1.00061572, "epoch": 0.13599470931281188, "flos": 18442966924800.0, "grad_norm": 2.686854094856528, "language_loss": 0.85423708, "learning_rate": 3.883407751890256e-06, "loss": 0.8737998, "num_input_tokens_seen": 24099675, "step": 1131, "time_per_iteration": 2.7279837131500244 }, { "auxiliary_loss_clip": 0.01180831, "auxiliary_loss_mlp": 0.01036642, "balance_loss_clip": 1.05225122, "balance_loss_mlp": 1.02554321, "epoch": 0.13611495220345096, "flos": 26680766014080.0, "grad_norm": 2.087136934452564, "language_loss": 0.85873151, "learning_rate": 3.8831455295783994e-06, "loss": 0.88090622, "num_input_tokens_seen": 24118925, "step": 1132, "time_per_iteration": 2.7800850868225098 }, { "auxiliary_loss_clip": 0.0119255, "auxiliary_loss_mlp": 0.01032127, "balance_loss_clip": 1.05634964, "balance_loss_mlp": 1.02083218, "epoch": 0.13623519509409007, "flos": 21686238673920.0, "grad_norm": 1.7862779758109626, "language_loss": 0.74030447, "learning_rate": 3.882883021594864e-06, "loss": 0.76255125, "num_input_tokens_seen": 24137065, "step": 1133, "time_per_iteration": 2.650254011154175 }, { "auxiliary_loss_clip": 0.01178653, "auxiliary_loss_mlp": 0.01045186, "balance_loss_clip": 1.05707145, "balance_loss_mlp": 1.0343914, "epoch": 0.13635543798472916, "flos": 14830389492480.0, "grad_norm": 2.1664577873178548, "language_loss": 0.8674885, "learning_rate": 3.8826202279794705e-06, "loss": 0.88972694, "num_input_tokens_seen": 24154125, "step": 1134, "time_per_iteration": 2.7394473552703857 }, { "auxiliary_loss_clip": 0.01236867, "auxiliary_loss_mlp": 0.01039344, "balance_loss_clip": 1.06683183, "balance_loss_mlp": 1.02805483, "epoch": 0.13647568087536824, "flos": 22890323410560.0, "grad_norm": 1.9451314843504632, "language_loss": 0.70131779, "learning_rate": 3.882357148772085e-06, "loss": 0.72407985, "num_input_tokens_seen": 24171550, "step": 1135, "time_per_iteration": 3.5486795902252197 }, { "auxiliary_loss_clip": 0.01175206, "auxiliary_loss_mlp": 0.0104224, "balance_loss_clip": 1.05700278, "balance_loss_mlp": 1.03152943, "epoch": 0.13659592376600732, "flos": 19937927998080.0, "grad_norm": 4.231213380106844, "language_loss": 0.84945726, "learning_rate": 3.882093784012617e-06, "loss": 0.87163174, "num_input_tokens_seen": 24190190, "step": 1136, "time_per_iteration": 2.725208044052124 }, { "auxiliary_loss_clip": 0.01199978, "auxiliary_loss_mlp": 0.01049518, "balance_loss_clip": 1.05827248, "balance_loss_mlp": 1.03803182, "epoch": 0.13671616665664643, "flos": 21428579439360.0, "grad_norm": 1.8720530265014979, "language_loss": 0.8411597, "learning_rate": 3.881830133741019e-06, "loss": 0.86365467, "num_input_tokens_seen": 24209055, "step": 1137, "time_per_iteration": 2.752316951751709 }, { "auxiliary_loss_clip": 0.01191245, "auxiliary_loss_mlp": 0.01043746, "balance_loss_clip": 1.06197071, "balance_loss_mlp": 1.03258181, "epoch": 0.13683640954728551, "flos": 22778138257920.0, "grad_norm": 2.0765504247654656, "language_loss": 0.76102978, "learning_rate": 3.881566197997285e-06, "loss": 0.78337967, "num_input_tokens_seen": 24225490, "step": 1138, "time_per_iteration": 2.6883456707000732 }, { "auxiliary_loss_clip": 0.0119772, "auxiliary_loss_mlp": 0.01039232, "balance_loss_clip": 1.06064487, "balance_loss_mlp": 1.02933741, "epoch": 0.1369566524379246, "flos": 21725884310400.0, "grad_norm": 1.5279681114569097, "language_loss": 0.74543828, "learning_rate": 3.881301976821456e-06, "loss": 0.76780784, "num_input_tokens_seen": 24245520, "step": 1139, "time_per_iteration": 4.50075101852417 }, { "auxiliary_loss_clip": 0.01214294, "auxiliary_loss_mlp": 0.01036008, "balance_loss_clip": 1.06149733, "balance_loss_mlp": 1.02482069, "epoch": 0.1370768953285637, "flos": 18624459369600.0, "grad_norm": 1.8182741834650884, "language_loss": 0.90331072, "learning_rate": 3.881037470253612e-06, "loss": 0.92581379, "num_input_tokens_seen": 24265035, "step": 1140, "time_per_iteration": 2.67482590675354 }, { "auxiliary_loss_clip": 0.01168962, "auxiliary_loss_mlp": 0.01041079, "balance_loss_clip": 1.05458057, "balance_loss_mlp": 1.03022528, "epoch": 0.1371971382192028, "flos": 14939521989120.0, "grad_norm": 2.727979033283011, "language_loss": 0.79404998, "learning_rate": 3.88077267833388e-06, "loss": 0.81615037, "num_input_tokens_seen": 24281550, "step": 1141, "time_per_iteration": 2.7099382877349854 }, { "auxiliary_loss_clip": 0.01164416, "auxiliary_loss_mlp": 0.01040793, "balance_loss_clip": 1.05650043, "balance_loss_mlp": 1.02991509, "epoch": 0.13731738110984187, "flos": 19023785844480.0, "grad_norm": 2.0539160383646857, "language_loss": 0.83984089, "learning_rate": 3.880507601102427e-06, "loss": 0.861893, "num_input_tokens_seen": 24299485, "step": 1142, "time_per_iteration": 2.76448917388916 }, { "auxiliary_loss_clip": 0.01233585, "auxiliary_loss_mlp": 0.01049758, "balance_loss_clip": 1.06611609, "balance_loss_mlp": 1.03861237, "epoch": 0.13743762400048098, "flos": 18187462506240.0, "grad_norm": 2.0171180441402456, "language_loss": 0.82084405, "learning_rate": 3.880242238599467e-06, "loss": 0.84367746, "num_input_tokens_seen": 24316010, "step": 1143, "time_per_iteration": 2.577920436859131 }, { "auxiliary_loss_clip": 0.01229473, "auxiliary_loss_mlp": 0.01039626, "balance_loss_clip": 1.0626955, "balance_loss_mlp": 1.0285337, "epoch": 0.13755786689112007, "flos": 21031982398080.0, "grad_norm": 2.4309083029303786, "language_loss": 0.8335923, "learning_rate": 3.879976590865254e-06, "loss": 0.85628331, "num_input_tokens_seen": 24335465, "step": 1144, "time_per_iteration": 2.6204025745391846 }, { "auxiliary_loss_clip": 0.01200242, "auxiliary_loss_mlp": 0.01038145, "balance_loss_clip": 1.06092632, "balance_loss_mlp": 1.02701688, "epoch": 0.13767810978175915, "flos": 21360636864000.0, "grad_norm": 2.6693306466522433, "language_loss": 0.87186241, "learning_rate": 3.879710657940087e-06, "loss": 0.89424634, "num_input_tokens_seen": 24354415, "step": 1145, "time_per_iteration": 2.848679780960083 }, { "auxiliary_loss_clip": 0.01218746, "auxiliary_loss_mlp": 0.01041509, "balance_loss_clip": 1.0635097, "balance_loss_mlp": 1.03043461, "epoch": 0.13779835267239823, "flos": 30592084861440.0, "grad_norm": 1.7909629185773799, "language_loss": 0.70142758, "learning_rate": 3.879444439864308e-06, "loss": 0.72403014, "num_input_tokens_seen": 24373990, "step": 1146, "time_per_iteration": 2.7371418476104736 }, { "auxiliary_loss_clip": 0.01213803, "auxiliary_loss_mlp": 0.00765845, "balance_loss_clip": 1.05822802, "balance_loss_mlp": 1.00067079, "epoch": 0.13791859556303734, "flos": 22669867687680.0, "grad_norm": 2.352702958173118, "language_loss": 0.85997128, "learning_rate": 3.879177936678301e-06, "loss": 0.87976778, "num_input_tokens_seen": 24392995, "step": 1147, "time_per_iteration": 2.6676275730133057 }, { "auxiliary_loss_clip": 0.01208088, "auxiliary_loss_mlp": 0.01047576, "balance_loss_clip": 1.05927896, "balance_loss_mlp": 1.03579831, "epoch": 0.13803883845367643, "flos": 35224166016000.0, "grad_norm": 1.9074180614889205, "language_loss": 0.77291101, "learning_rate": 3.878911148422496e-06, "loss": 0.79546762, "num_input_tokens_seen": 24414470, "step": 1148, "time_per_iteration": 2.8286795616149902 }, { "auxiliary_loss_clip": 0.01218961, "auxiliary_loss_mlp": 0.01039196, "balance_loss_clip": 1.06102681, "balance_loss_mlp": 1.02759075, "epoch": 0.1381590813443155, "flos": 32014542332160.0, "grad_norm": 2.3257573166904, "language_loss": 0.70484996, "learning_rate": 3.878644075137364e-06, "loss": 0.72743154, "num_input_tokens_seen": 24435120, "step": 1149, "time_per_iteration": 2.8019440174102783 }, { "auxiliary_loss_clip": 0.01162127, "auxiliary_loss_mlp": 0.01035836, "balance_loss_clip": 1.05043757, "balance_loss_mlp": 1.02475595, "epoch": 0.13827932423495462, "flos": 17821855923840.0, "grad_norm": 2.3140551464775303, "language_loss": 0.79149461, "learning_rate": 3.878376716863418e-06, "loss": 0.81347418, "num_input_tokens_seen": 24451420, "step": 1150, "time_per_iteration": 2.6529364585876465 }, { "auxiliary_loss_clip": 0.01196322, "auxiliary_loss_mlp": 0.01042837, "balance_loss_clip": 1.05552399, "balance_loss_mlp": 1.03212023, "epoch": 0.1383995671255937, "flos": 19427098728960.0, "grad_norm": 2.8178864798301815, "language_loss": 0.71838844, "learning_rate": 3.878109073641219e-06, "loss": 0.74078, "num_input_tokens_seen": 24470450, "step": 1151, "time_per_iteration": 2.709325075149536 }, { "auxiliary_loss_clip": 0.01167708, "auxiliary_loss_mlp": 0.01038636, "balance_loss_clip": 1.05609381, "balance_loss_mlp": 1.02786517, "epoch": 0.13851981001623279, "flos": 28296603331200.0, "grad_norm": 1.6107904379459834, "language_loss": 0.81167781, "learning_rate": 3.877841145511366e-06, "loss": 0.83374119, "num_input_tokens_seen": 24493190, "step": 1152, "time_per_iteration": 2.815868377685547 }, { "auxiliary_loss_clip": 0.01222692, "auxiliary_loss_mlp": 0.01045542, "balance_loss_clip": 1.06283748, "balance_loss_mlp": 1.03430092, "epoch": 0.13864005290687187, "flos": 21213079793280.0, "grad_norm": 1.7222197307130964, "language_loss": 0.82926863, "learning_rate": 3.8775729325145035e-06, "loss": 0.851951, "num_input_tokens_seen": 24512425, "step": 1153, "time_per_iteration": 2.681976795196533 }, { "auxiliary_loss_clip": 0.01080075, "auxiliary_loss_mlp": 0.01006332, "balance_loss_clip": 1.02467167, "balance_loss_mlp": 1.00299442, "epoch": 0.13876029579751098, "flos": 71653389413760.0, "grad_norm": 0.7937130332311666, "language_loss": 0.64775288, "learning_rate": 3.877304434691321e-06, "loss": 0.66861701, "num_input_tokens_seen": 24579275, "step": 1154, "time_per_iteration": 3.415860891342163 }, { "auxiliary_loss_clip": 0.01186598, "auxiliary_loss_mlp": 0.01036052, "balance_loss_clip": 1.06064188, "balance_loss_mlp": 1.02568126, "epoch": 0.13888053868815006, "flos": 21941348042880.0, "grad_norm": 1.8181160544596704, "language_loss": 0.79606926, "learning_rate": 3.877035652082548e-06, "loss": 0.81829584, "num_input_tokens_seen": 24598720, "step": 1155, "time_per_iteration": 2.730027675628662 }, { "auxiliary_loss_clip": 0.01195761, "auxiliary_loss_mlp": 0.01040635, "balance_loss_clip": 1.06201911, "balance_loss_mlp": 1.02876735, "epoch": 0.13900078157878915, "flos": 19608627087360.0, "grad_norm": 1.7243531977411166, "language_loss": 0.85330909, "learning_rate": 3.87676658472896e-06, "loss": 0.87567306, "num_input_tokens_seen": 24617530, "step": 1156, "time_per_iteration": 2.717961311340332 }, { "auxiliary_loss_clip": 0.01211747, "auxiliary_loss_mlp": 0.01040129, "balance_loss_clip": 1.05571079, "balance_loss_mlp": 1.02926302, "epoch": 0.13912102446942826, "flos": 22638051216000.0, "grad_norm": 2.3832685797276123, "language_loss": 0.85097796, "learning_rate": 3.876497232671372e-06, "loss": 0.87349671, "num_input_tokens_seen": 24637485, "step": 1157, "time_per_iteration": 2.6454429626464844 }, { "auxiliary_loss_clip": 0.01170328, "auxiliary_loss_mlp": 0.0103933, "balance_loss_clip": 1.05421853, "balance_loss_mlp": 1.02819645, "epoch": 0.13924126736006734, "flos": 29643324975360.0, "grad_norm": 2.1507306919810114, "language_loss": 0.83803481, "learning_rate": 3.876227595950647e-06, "loss": 0.86013138, "num_input_tokens_seen": 24656915, "step": 1158, "time_per_iteration": 2.7870540618896484 }, { "auxiliary_loss_clip": 0.01230742, "auxiliary_loss_mlp": 0.01035152, "balance_loss_clip": 1.06337821, "balance_loss_mlp": 1.02357054, "epoch": 0.13936151025070642, "flos": 27417653527680.0, "grad_norm": 1.5555805836656236, "language_loss": 0.79169214, "learning_rate": 3.875957674607686e-06, "loss": 0.81435108, "num_input_tokens_seen": 24679190, "step": 1159, "time_per_iteration": 2.68554949760437 }, { "auxiliary_loss_clip": 0.01204482, "auxiliary_loss_mlp": 0.0076674, "balance_loss_clip": 1.05572021, "balance_loss_mlp": 1.00058579, "epoch": 0.1394817531413455, "flos": 16399326625920.0, "grad_norm": 1.7379403025435036, "language_loss": 0.87995005, "learning_rate": 3.8756874686834386e-06, "loss": 0.89966232, "num_input_tokens_seen": 24697405, "step": 1160, "time_per_iteration": 3.6171417236328125 }, { "auxiliary_loss_clip": 0.01220245, "auxiliary_loss_mlp": 0.00766102, "balance_loss_clip": 1.06036556, "balance_loss_mlp": 1.00062156, "epoch": 0.13960199603198462, "flos": 30922319525760.0, "grad_norm": 1.6647464933110165, "language_loss": 0.8043012, "learning_rate": 3.875416978218893e-06, "loss": 0.82416463, "num_input_tokens_seen": 24720600, "step": 1161, "time_per_iteration": 2.7196340560913086 }, { "auxiliary_loss_clip": 0.01193903, "auxiliary_loss_mlp": 0.01034022, "balance_loss_clip": 1.0545342, "balance_loss_mlp": 1.02288806, "epoch": 0.1397222389226237, "flos": 18113773754880.0, "grad_norm": 2.280341845333764, "language_loss": 0.82957339, "learning_rate": 3.8751462032550835e-06, "loss": 0.85185266, "num_input_tokens_seen": 24737605, "step": 1162, "time_per_iteration": 2.7208597660064697 }, { "auxiliary_loss_clip": 0.01199853, "auxiliary_loss_mlp": 0.01036763, "balance_loss_clip": 1.06250226, "balance_loss_mlp": 1.02624309, "epoch": 0.13984248181326278, "flos": 16872772815360.0, "grad_norm": 3.4816704094840296, "language_loss": 0.82980108, "learning_rate": 3.874875143833085e-06, "loss": 0.85216725, "num_input_tokens_seen": 24755845, "step": 1163, "time_per_iteration": 2.719616174697876 }, { "auxiliary_loss_clip": 0.01216468, "auxiliary_loss_mlp": 0.01037641, "balance_loss_clip": 1.06163049, "balance_loss_mlp": 1.02632189, "epoch": 0.1399627247039019, "flos": 54121401267840.0, "grad_norm": 3.286100487865083, "language_loss": 0.68820703, "learning_rate": 3.874603799994019e-06, "loss": 0.71074814, "num_input_tokens_seen": 24779380, "step": 1164, "time_per_iteration": 2.927874803543091 }, { "auxiliary_loss_clip": 0.01176223, "auxiliary_loss_mlp": 0.01045843, "balance_loss_clip": 1.05394125, "balance_loss_mlp": 1.03487015, "epoch": 0.14008296759454097, "flos": 11765521618560.0, "grad_norm": 2.9795500256942535, "language_loss": 0.87145269, "learning_rate": 3.874332171779046e-06, "loss": 0.8936733, "num_input_tokens_seen": 24794260, "step": 1165, "time_per_iteration": 4.516380071640015 }, { "auxiliary_loss_clip": 0.01183177, "auxiliary_loss_mlp": 0.01038037, "balance_loss_clip": 1.05745125, "balance_loss_mlp": 1.02730799, "epoch": 0.14020321048518006, "flos": 22017514832640.0, "grad_norm": 1.6906577585042804, "language_loss": 0.75717759, "learning_rate": 3.874060259229373e-06, "loss": 0.77938974, "num_input_tokens_seen": 24815835, "step": 1166, "time_per_iteration": 2.8095974922180176 }, { "auxiliary_loss_clip": 0.01218601, "auxiliary_loss_mlp": 0.01047861, "balance_loss_clip": 1.06300867, "balance_loss_mlp": 1.03625, "epoch": 0.14032345337581917, "flos": 23404313076480.0, "grad_norm": 2.197214655886737, "language_loss": 0.93849444, "learning_rate": 3.873788062386249e-06, "loss": 0.96115911, "num_input_tokens_seen": 24834095, "step": 1167, "time_per_iteration": 2.707846164703369 }, { "auxiliary_loss_clip": 0.01189982, "auxiliary_loss_mlp": 0.01041922, "balance_loss_clip": 1.0601542, "balance_loss_mlp": 1.03032839, "epoch": 0.14044369626645825, "flos": 29645767100160.0, "grad_norm": 2.115652955981573, "language_loss": 0.82456028, "learning_rate": 3.873515581290965e-06, "loss": 0.84687924, "num_input_tokens_seen": 24858900, "step": 1168, "time_per_iteration": 2.7815768718719482 }, { "auxiliary_loss_clip": 0.01183767, "auxiliary_loss_mlp": 0.01033851, "balance_loss_clip": 1.05891788, "balance_loss_mlp": 1.02299142, "epoch": 0.14056393915709733, "flos": 18332972501760.0, "grad_norm": 2.4220738244679985, "language_loss": 0.75516158, "learning_rate": 3.8732428159848575e-06, "loss": 0.77733773, "num_input_tokens_seen": 24877875, "step": 1169, "time_per_iteration": 2.7488629817962646 }, { "auxiliary_loss_clip": 0.01217163, "auxiliary_loss_mlp": 0.01042, "balance_loss_clip": 1.06533909, "balance_loss_mlp": 1.03046656, "epoch": 0.14068418204773642, "flos": 26687517770880.0, "grad_norm": 1.9219501981153029, "language_loss": 0.78350127, "learning_rate": 3.872969766509304e-06, "loss": 0.80609292, "num_input_tokens_seen": 24898430, "step": 1170, "time_per_iteration": 2.770106077194214 }, { "auxiliary_loss_clip": 0.01082778, "auxiliary_loss_mlp": 0.01010405, "balance_loss_clip": 1.02597535, "balance_loss_mlp": 1.00704372, "epoch": 0.14080442493837553, "flos": 65259314501760.0, "grad_norm": 0.7687104911707842, "language_loss": 0.55604547, "learning_rate": 3.872696432905726e-06, "loss": 0.57697731, "num_input_tokens_seen": 24959250, "step": 1171, "time_per_iteration": 3.2625792026519775 }, { "auxiliary_loss_clip": 0.01217254, "auxiliary_loss_mlp": 0.01037148, "balance_loss_clip": 1.05836725, "balance_loss_mlp": 1.02498317, "epoch": 0.1409246678290146, "flos": 25776715582080.0, "grad_norm": 2.784654170309308, "language_loss": 0.71696085, "learning_rate": 3.872422815215589e-06, "loss": 0.73950487, "num_input_tokens_seen": 24978330, "step": 1172, "time_per_iteration": 2.719391345977783 }, { "auxiliary_loss_clip": 0.012069, "auxiliary_loss_mlp": 0.01041764, "balance_loss_clip": 1.05529308, "balance_loss_mlp": 1.03029013, "epoch": 0.1410449107196537, "flos": 21868521217920.0, "grad_norm": 1.7473169302375895, "language_loss": 0.74482375, "learning_rate": 3.8721489134803994e-06, "loss": 0.76731038, "num_input_tokens_seen": 24997120, "step": 1173, "time_per_iteration": 2.660723924636841 }, { "auxiliary_loss_clip": 0.01215377, "auxiliary_loss_mlp": 0.01030005, "balance_loss_clip": 1.06171513, "balance_loss_mlp": 1.01867449, "epoch": 0.1411651536102928, "flos": 16684133564160.0, "grad_norm": 2.301418495531914, "language_loss": 0.72769761, "learning_rate": 3.871874727741707e-06, "loss": 0.75015146, "num_input_tokens_seen": 25014350, "step": 1174, "time_per_iteration": 2.620027780532837 }, { "auxiliary_loss_clip": 0.01209984, "auxiliary_loss_mlp": 0.01040879, "balance_loss_clip": 1.06016052, "balance_loss_mlp": 1.03018546, "epoch": 0.1412853965009319, "flos": 20992264934400.0, "grad_norm": 1.965947201254203, "language_loss": 0.96875179, "learning_rate": 3.871600258041108e-06, "loss": 0.99126047, "num_input_tokens_seen": 25033875, "step": 1175, "time_per_iteration": 2.658416986465454 }, { "auxiliary_loss_clip": 0.01190663, "auxiliary_loss_mlp": 0.01032429, "balance_loss_clip": 1.05542505, "balance_loss_mlp": 1.02121711, "epoch": 0.14140563939157097, "flos": 20335279224960.0, "grad_norm": 2.6292616194805283, "language_loss": 0.8574124, "learning_rate": 3.871325504420238e-06, "loss": 0.87964332, "num_input_tokens_seen": 25052865, "step": 1176, "time_per_iteration": 2.683713436126709 }, { "auxiliary_loss_clip": 0.01228045, "auxiliary_loss_mlp": 0.010313, "balance_loss_clip": 1.06136012, "balance_loss_mlp": 1.02043962, "epoch": 0.14152588228221005, "flos": 21068826773760.0, "grad_norm": 2.195623364277583, "language_loss": 0.81630832, "learning_rate": 3.871050466920776e-06, "loss": 0.83890182, "num_input_tokens_seen": 25072770, "step": 1177, "time_per_iteration": 2.615522623062134 }, { "auxiliary_loss_clip": 0.011715, "auxiliary_loss_mlp": 0.01039215, "balance_loss_clip": 1.05102813, "balance_loss_mlp": 1.02843833, "epoch": 0.14164612517284916, "flos": 18223157646720.0, "grad_norm": 2.139111381767639, "language_loss": 0.79918897, "learning_rate": 3.870775145584447e-06, "loss": 0.82129616, "num_input_tokens_seen": 25090550, "step": 1178, "time_per_iteration": 2.6630585193634033 }, { "auxiliary_loss_clip": 0.01203337, "auxiliary_loss_mlp": 0.01041069, "balance_loss_clip": 1.0582974, "balance_loss_mlp": 1.02753234, "epoch": 0.14176636806348825, "flos": 22744454279040.0, "grad_norm": 2.6819696655023972, "language_loss": 0.64154553, "learning_rate": 3.8704995404530145e-06, "loss": 0.66398954, "num_input_tokens_seen": 25106175, "step": 1179, "time_per_iteration": 2.688589334487915 }, { "auxiliary_loss_clip": 0.01226157, "auxiliary_loss_mlp": 0.01051918, "balance_loss_clip": 1.06128585, "balance_loss_mlp": 1.04065275, "epoch": 0.14188661095412733, "flos": 22091095843200.0, "grad_norm": 2.1022701180732386, "language_loss": 0.85243762, "learning_rate": 3.87022365156829e-06, "loss": 0.87521839, "num_input_tokens_seen": 25126890, "step": 1180, "time_per_iteration": 2.6162021160125732 }, { "auxiliary_loss_clip": 0.01137869, "auxiliary_loss_mlp": 0.01034291, "balance_loss_clip": 1.050282, "balance_loss_mlp": 1.02328217, "epoch": 0.14200685384476644, "flos": 24352390604160.0, "grad_norm": 2.1447473297802455, "language_loss": 0.81489068, "learning_rate": 3.869947478972123e-06, "loss": 0.83661234, "num_input_tokens_seen": 25147915, "step": 1181, "time_per_iteration": 2.9183812141418457 }, { "auxiliary_loss_clip": 0.01203629, "auxiliary_loss_mlp": 0.01038404, "balance_loss_clip": 1.05627489, "balance_loss_mlp": 1.02728188, "epoch": 0.14212709673540552, "flos": 24022048199040.0, "grad_norm": 2.3093325161314473, "language_loss": 0.82918704, "learning_rate": 3.869671022706412e-06, "loss": 0.85160738, "num_input_tokens_seen": 25166645, "step": 1182, "time_per_iteration": 2.7755186557769775 }, { "auxiliary_loss_clip": 0.01150239, "auxiliary_loss_mlp": 0.01040033, "balance_loss_clip": 1.05009019, "balance_loss_mlp": 1.02799892, "epoch": 0.1422473396260446, "flos": 26431797870720.0, "grad_norm": 2.2288387068433466, "language_loss": 0.64955342, "learning_rate": 3.869394282813092e-06, "loss": 0.6714561, "num_input_tokens_seen": 25185845, "step": 1183, "time_per_iteration": 2.79331111907959 }, { "auxiliary_loss_clip": 0.0119128, "auxiliary_loss_mlp": 0.01035796, "balance_loss_clip": 1.05605412, "balance_loss_mlp": 1.02455449, "epoch": 0.1423675825166837, "flos": 17055306754560.0, "grad_norm": 2.4592609011729376, "language_loss": 0.89634633, "learning_rate": 3.869117259334147e-06, "loss": 0.91861713, "num_input_tokens_seen": 25203770, "step": 1184, "time_per_iteration": 2.6829237937927246 }, { "auxiliary_loss_clip": 0.01206939, "auxiliary_loss_mlp": 0.01036988, "balance_loss_clip": 1.05743837, "balance_loss_mlp": 1.0265336, "epoch": 0.1424878254073228, "flos": 17929480049280.0, "grad_norm": 1.7359719081151015, "language_loss": 0.82280344, "learning_rate": 3.868839952311599e-06, "loss": 0.84524274, "num_input_tokens_seen": 25221725, "step": 1185, "time_per_iteration": 2.662015438079834 }, { "auxiliary_loss_clip": 0.01196969, "auxiliary_loss_mlp": 0.01039444, "balance_loss_clip": 1.06051517, "balance_loss_mlp": 1.02850044, "epoch": 0.14260806829796188, "flos": 20303606407680.0, "grad_norm": 2.797085976501659, "language_loss": 0.80312598, "learning_rate": 3.868562361787516e-06, "loss": 0.82549012, "num_input_tokens_seen": 25240855, "step": 1186, "time_per_iteration": 2.7269883155822754 }, { "auxiliary_loss_clip": 0.01131864, "auxiliary_loss_mlp": 0.01037084, "balance_loss_clip": 1.04704022, "balance_loss_mlp": 1.02575374, "epoch": 0.14272831118860096, "flos": 23185724860800.0, "grad_norm": 2.105933509796222, "language_loss": 0.68887997, "learning_rate": 3.868284487804009e-06, "loss": 0.7105695, "num_input_tokens_seen": 25260085, "step": 1187, "time_per_iteration": 3.7207744121551514 }, { "auxiliary_loss_clip": 0.01198911, "auxiliary_loss_mlp": 0.01040894, "balance_loss_clip": 1.05511713, "balance_loss_mlp": 1.02993321, "epoch": 0.14284855407924008, "flos": 27232210586880.0, "grad_norm": 1.6211412083852588, "language_loss": 0.78175557, "learning_rate": 3.86800633040323e-06, "loss": 0.80415356, "num_input_tokens_seen": 25280675, "step": 1188, "time_per_iteration": 2.7550668716430664 }, { "auxiliary_loss_clip": 0.01199021, "auxiliary_loss_mlp": 0.00765738, "balance_loss_clip": 1.06147051, "balance_loss_mlp": 1.00074446, "epoch": 0.14296879696987916, "flos": 28184202696960.0, "grad_norm": 2.2045211015118773, "language_loss": 0.78446126, "learning_rate": 3.867727889627376e-06, "loss": 0.80410886, "num_input_tokens_seen": 25300290, "step": 1189, "time_per_iteration": 2.740311861038208 }, { "auxiliary_loss_clip": 0.01171647, "auxiliary_loss_mlp": 0.01038412, "balance_loss_clip": 1.05613017, "balance_loss_mlp": 1.02647328, "epoch": 0.14308903986051824, "flos": 19390290266880.0, "grad_norm": 2.693677260777442, "language_loss": 0.78339458, "learning_rate": 3.867449165518687e-06, "loss": 0.80549526, "num_input_tokens_seen": 25316760, "step": 1190, "time_per_iteration": 2.7284646034240723 }, { "auxiliary_loss_clip": 0.01232593, "auxiliary_loss_mlp": 0.00765744, "balance_loss_clip": 1.06187487, "balance_loss_mlp": 1.00061345, "epoch": 0.14320928275115732, "flos": 17457506317440.0, "grad_norm": 1.7217594128849356, "language_loss": 0.71078998, "learning_rate": 3.867170158119444e-06, "loss": 0.73077333, "num_input_tokens_seen": 25335760, "step": 1191, "time_per_iteration": 4.493774890899658 }, { "auxiliary_loss_clip": 0.01231627, "auxiliary_loss_mlp": 0.01043424, "balance_loss_clip": 1.06233311, "balance_loss_mlp": 1.03211069, "epoch": 0.14332952564179643, "flos": 21466070259840.0, "grad_norm": 4.853829715682026, "language_loss": 0.75158399, "learning_rate": 3.866890867471972e-06, "loss": 0.77433449, "num_input_tokens_seen": 25354230, "step": 1192, "time_per_iteration": 2.6253933906555176 }, { "auxiliary_loss_clip": 0.01195578, "auxiliary_loss_mlp": 0.01033362, "balance_loss_clip": 1.05474019, "balance_loss_mlp": 1.02222824, "epoch": 0.14344976853243552, "flos": 16396992241920.0, "grad_norm": 2.5418725894431273, "language_loss": 0.89557326, "learning_rate": 3.86661129361864e-06, "loss": 0.91786265, "num_input_tokens_seen": 25368720, "step": 1193, "time_per_iteration": 2.7305965423583984 }, { "auxiliary_loss_clip": 0.01197874, "auxiliary_loss_mlp": 0.01035207, "balance_loss_clip": 1.06050396, "balance_loss_mlp": 1.0240016, "epoch": 0.1435700114230746, "flos": 18916736336640.0, "grad_norm": 1.8605033795656913, "language_loss": 0.86038327, "learning_rate": 3.866331436601859e-06, "loss": 0.88271403, "num_input_tokens_seen": 25386715, "step": 1194, "time_per_iteration": 2.6115710735321045 }, { "auxiliary_loss_clip": 0.01231446, "auxiliary_loss_mlp": 0.01037489, "balance_loss_clip": 1.06520653, "balance_loss_mlp": 1.02621222, "epoch": 0.1436902543137137, "flos": 19755394058880.0, "grad_norm": 3.7635740208777135, "language_loss": 0.73784709, "learning_rate": 3.866051296464083e-06, "loss": 0.76053643, "num_input_tokens_seen": 25405550, "step": 1195, "time_per_iteration": 2.6558682918548584 }, { "auxiliary_loss_clip": 0.012311, "auxiliary_loss_mlp": 0.00766063, "balance_loss_clip": 1.06124532, "balance_loss_mlp": 1.00073349, "epoch": 0.1438104972043528, "flos": 14684807669760.0, "grad_norm": 2.7261894411531684, "language_loss": 0.85220695, "learning_rate": 3.86577087324781e-06, "loss": 0.87217861, "num_input_tokens_seen": 25422040, "step": 1196, "time_per_iteration": 2.589552164077759 }, { "auxiliary_loss_clip": 0.01214195, "auxiliary_loss_mlp": 0.01033367, "balance_loss_clip": 1.06420279, "balance_loss_mlp": 1.0221374, "epoch": 0.14393074009499188, "flos": 17092330698240.0, "grad_norm": 4.845764736605291, "language_loss": 0.77121109, "learning_rate": 3.865490166995578e-06, "loss": 0.79368675, "num_input_tokens_seen": 25440270, "step": 1197, "time_per_iteration": 2.657001495361328 }, { "auxiliary_loss_clip": 0.01216498, "auxiliary_loss_mlp": 0.01035292, "balance_loss_clip": 1.06313467, "balance_loss_mlp": 1.02373409, "epoch": 0.144050982985631, "flos": 30476200608000.0, "grad_norm": 2.1936411710860506, "language_loss": 0.84291196, "learning_rate": 3.86520917774997e-06, "loss": 0.86542982, "num_input_tokens_seen": 25459705, "step": 1198, "time_per_iteration": 2.7346129417419434 }, { "auxiliary_loss_clip": 0.0121134, "auxiliary_loss_mlp": 0.01035997, "balance_loss_clip": 1.06165838, "balance_loss_mlp": 1.02542877, "epoch": 0.14417122587627007, "flos": 17858484817920.0, "grad_norm": 2.334827196203235, "language_loss": 0.74881321, "learning_rate": 3.864927905553614e-06, "loss": 0.77128661, "num_input_tokens_seen": 25477615, "step": 1199, "time_per_iteration": 2.621856451034546 }, { "auxiliary_loss_clip": 0.01181762, "auxiliary_loss_mlp": 0.01035753, "balance_loss_clip": 1.05647326, "balance_loss_mlp": 1.02470207, "epoch": 0.14429146876690915, "flos": 21613914639360.0, "grad_norm": 1.5863821606740192, "language_loss": 0.88727188, "learning_rate": 3.8646463504491765e-06, "loss": 0.90944707, "num_input_tokens_seen": 25497750, "step": 1200, "time_per_iteration": 2.7526328563690186 }, { "auxiliary_loss_clip": 0.01220059, "auxiliary_loss_mlp": 0.01045156, "balance_loss_clip": 1.06212604, "balance_loss_mlp": 1.03417075, "epoch": 0.14441171165754824, "flos": 23258120722560.0, "grad_norm": 2.1066813009988463, "language_loss": 0.83281273, "learning_rate": 3.8643645124793705e-06, "loss": 0.85546494, "num_input_tokens_seen": 25516650, "step": 1201, "time_per_iteration": 2.6592607498168945 }, { "auxiliary_loss_clip": 0.0120934, "auxiliary_loss_mlp": 0.01033881, "balance_loss_clip": 1.05877054, "balance_loss_mlp": 1.02354527, "epoch": 0.14453195454818735, "flos": 42854213963520.0, "grad_norm": 1.6566024758929419, "language_loss": 0.74934459, "learning_rate": 3.8640823916869515e-06, "loss": 0.7717768, "num_input_tokens_seen": 25540960, "step": 1202, "time_per_iteration": 2.8913087844848633 }, { "auxiliary_loss_clip": 0.0122843, "auxiliary_loss_mlp": 0.01046486, "balance_loss_clip": 1.06119752, "balance_loss_mlp": 1.0356375, "epoch": 0.14465219743882643, "flos": 27235873774080.0, "grad_norm": 1.5238064837978664, "language_loss": 0.78454107, "learning_rate": 3.863799988114714e-06, "loss": 0.8072902, "num_input_tokens_seen": 25562990, "step": 1203, "time_per_iteration": 2.723848819732666 }, { "auxiliary_loss_clip": 0.01231348, "auxiliary_loss_mlp": 0.01037456, "balance_loss_clip": 1.06131637, "balance_loss_mlp": 1.02622056, "epoch": 0.1447724403294655, "flos": 16690705752960.0, "grad_norm": 2.343943914485854, "language_loss": 0.70939064, "learning_rate": 3.863517301805502e-06, "loss": 0.73207867, "num_input_tokens_seen": 25581380, "step": 1204, "time_per_iteration": 2.6028659343719482 }, { "auxiliary_loss_clip": 0.01189008, "auxiliary_loss_mlp": 0.01042675, "balance_loss_clip": 1.06226504, "balance_loss_mlp": 1.03102195, "epoch": 0.14489268322010462, "flos": 20073741321600.0, "grad_norm": 2.828405867536094, "language_loss": 0.9720484, "learning_rate": 3.863234332802196e-06, "loss": 0.99436522, "num_input_tokens_seen": 25593585, "step": 1205, "time_per_iteration": 2.671290636062622 }, { "auxiliary_loss_clip": 0.01193546, "auxiliary_loss_mlp": 0.0104118, "balance_loss_clip": 1.05712402, "balance_loss_mlp": 1.03001642, "epoch": 0.1450129261107437, "flos": 27125627955840.0, "grad_norm": 3.209296282681346, "language_loss": 0.74069047, "learning_rate": 3.862951081147723e-06, "loss": 0.76303774, "num_input_tokens_seen": 25613750, "step": 1206, "time_per_iteration": 2.7038304805755615 }, { "auxiliary_loss_clip": 0.01219416, "auxiliary_loss_mlp": 0.01043425, "balance_loss_clip": 1.0648191, "balance_loss_mlp": 1.03272605, "epoch": 0.1451331690013828, "flos": 25702344472320.0, "grad_norm": 3.069748800779454, "language_loss": 0.78790188, "learning_rate": 3.862667546885053e-06, "loss": 0.81053019, "num_input_tokens_seen": 25632300, "step": 1207, "time_per_iteration": 2.6833574771881104 }, { "auxiliary_loss_clip": 0.01202918, "auxiliary_loss_mlp": 0.01040394, "balance_loss_clip": 1.05875838, "balance_loss_mlp": 1.0294503, "epoch": 0.14525341189202187, "flos": 25737393168000.0, "grad_norm": 1.912362446360852, "language_loss": 0.7377848, "learning_rate": 3.8623837300571965e-06, "loss": 0.76021796, "num_input_tokens_seen": 25651285, "step": 1208, "time_per_iteration": 2.729201078414917 }, { "auxiliary_loss_clip": 0.01230605, "auxiliary_loss_mlp": 0.01039819, "balance_loss_clip": 1.0626955, "balance_loss_mlp": 1.02857161, "epoch": 0.14537365478266098, "flos": 23073898844160.0, "grad_norm": 2.9063878517480988, "language_loss": 0.84056526, "learning_rate": 3.8620996307072085e-06, "loss": 0.86326945, "num_input_tokens_seen": 25671990, "step": 1209, "time_per_iteration": 2.649345636367798 }, { "auxiliary_loss_clip": 0.01187367, "auxiliary_loss_mlp": 0.01042372, "balance_loss_clip": 1.0552119, "balance_loss_mlp": 1.0307076, "epoch": 0.14549389767330007, "flos": 20595021448320.0, "grad_norm": 2.448998006961012, "language_loss": 0.64815927, "learning_rate": 3.861815248878188e-06, "loss": 0.67045665, "num_input_tokens_seen": 25689475, "step": 1210, "time_per_iteration": 2.6924753189086914 }, { "auxiliary_loss_clip": 0.01191914, "auxiliary_loss_mlp": 0.0103581, "balance_loss_clip": 1.05885947, "balance_loss_mlp": 1.02431774, "epoch": 0.14561414056393915, "flos": 15121804533120.0, "grad_norm": 7.0438819024340225, "language_loss": 0.80189013, "learning_rate": 3.861530584613274e-06, "loss": 0.82416731, "num_input_tokens_seen": 25707475, "step": 1211, "time_per_iteration": 2.6943118572235107 }, { "auxiliary_loss_clip": 0.01217193, "auxiliary_loss_mlp": 0.00765645, "balance_loss_clip": 1.06248999, "balance_loss_mlp": 1.00071084, "epoch": 0.14573438345457826, "flos": 19427493778560.0, "grad_norm": 2.217084007034587, "language_loss": 0.82685435, "learning_rate": 3.86124563795565e-06, "loss": 0.84668279, "num_input_tokens_seen": 25726290, "step": 1212, "time_per_iteration": 3.449018716812134 }, { "auxiliary_loss_clip": 0.01229928, "auxiliary_loss_mlp": 0.01042511, "balance_loss_clip": 1.06475008, "balance_loss_mlp": 1.03225303, "epoch": 0.14585462634521734, "flos": 24828422572800.0, "grad_norm": 1.8308858316356038, "language_loss": 0.70092964, "learning_rate": 3.860960408948543e-06, "loss": 0.72365397, "num_input_tokens_seen": 25748040, "step": 1213, "time_per_iteration": 2.797030448913574 }, { "auxiliary_loss_clip": 0.01210057, "auxiliary_loss_mlp": 0.01038355, "balance_loss_clip": 1.06341398, "balance_loss_mlp": 1.02776289, "epoch": 0.14597486923585642, "flos": 15448627405440.0, "grad_norm": 2.4749967712640317, "language_loss": 0.89233148, "learning_rate": 3.860674897635222e-06, "loss": 0.91481566, "num_input_tokens_seen": 25764525, "step": 1214, "time_per_iteration": 2.7346909046173096 }, { "auxiliary_loss_clip": 0.0121561, "auxiliary_loss_mlp": 0.0103526, "balance_loss_clip": 1.06315351, "balance_loss_mlp": 1.02474582, "epoch": 0.1460951121264955, "flos": 16655154266880.0, "grad_norm": 2.6773754531319502, "language_loss": 0.83647525, "learning_rate": 3.860389104058998e-06, "loss": 0.85898393, "num_input_tokens_seen": 25782755, "step": 1215, "time_per_iteration": 2.7007076740264893 }, { "auxiliary_loss_clip": 0.011968, "auxiliary_loss_mlp": 0.01034712, "balance_loss_clip": 1.05718493, "balance_loss_mlp": 1.02351797, "epoch": 0.14621535501713462, "flos": 24863291700480.0, "grad_norm": 1.8507080037177677, "language_loss": 0.72697031, "learning_rate": 3.860103028263227e-06, "loss": 0.74928546, "num_input_tokens_seen": 25805860, "step": 1216, "time_per_iteration": 2.741389751434326 }, { "auxiliary_loss_clip": 0.01157454, "auxiliary_loss_mlp": 0.01030721, "balance_loss_clip": 1.04766464, "balance_loss_mlp": 1.01968241, "epoch": 0.1463355979077737, "flos": 25228000442880.0, "grad_norm": 2.113300153065524, "language_loss": 0.70305192, "learning_rate": 3.859816670291304e-06, "loss": 0.72493374, "num_input_tokens_seen": 25824955, "step": 1217, "time_per_iteration": 4.647987365722656 }, { "auxiliary_loss_clip": 0.01146763, "auxiliary_loss_mlp": 0.01038952, "balance_loss_clip": 1.05488086, "balance_loss_mlp": 1.02829432, "epoch": 0.14645584079841278, "flos": 22054143726720.0, "grad_norm": 1.9008766322596444, "language_loss": 0.9021554, "learning_rate": 3.859530030186672e-06, "loss": 0.92401266, "num_input_tokens_seen": 25841965, "step": 1218, "time_per_iteration": 2.8006250858306885 }, { "auxiliary_loss_clip": 0.01201595, "auxiliary_loss_mlp": 0.01035074, "balance_loss_clip": 1.06003046, "balance_loss_mlp": 1.02304006, "epoch": 0.1465760836890519, "flos": 23623870959360.0, "grad_norm": 2.4363108320231266, "language_loss": 0.82830942, "learning_rate": 3.859243107992813e-06, "loss": 0.85067612, "num_input_tokens_seen": 25860770, "step": 1219, "time_per_iteration": 2.7219016551971436 }, { "auxiliary_loss_clip": 0.01183486, "auxiliary_loss_mlp": 0.01039444, "balance_loss_clip": 1.05102491, "balance_loss_mlp": 1.02775562, "epoch": 0.14669632657969098, "flos": 37407893356800.0, "grad_norm": 4.637679592290566, "language_loss": 0.78136623, "learning_rate": 3.858955903753252e-06, "loss": 0.80359548, "num_input_tokens_seen": 25879410, "step": 1220, "time_per_iteration": 2.8418514728546143 }, { "auxiliary_loss_clip": 0.01215223, "auxiliary_loss_mlp": 0.01048955, "balance_loss_clip": 1.0612855, "balance_loss_mlp": 1.03872085, "epoch": 0.14681656947033006, "flos": 28365910623360.0, "grad_norm": 1.5695511245293694, "language_loss": 0.83525586, "learning_rate": 3.858668417511559e-06, "loss": 0.85789764, "num_input_tokens_seen": 25902160, "step": 1221, "time_per_iteration": 2.74118709564209 }, { "auxiliary_loss_clip": 0.01202199, "auxiliary_loss_mlp": 0.01041443, "balance_loss_clip": 1.06095386, "balance_loss_mlp": 1.03038001, "epoch": 0.14693681236096917, "flos": 18479488078080.0, "grad_norm": 2.0825384376447484, "language_loss": 0.76783532, "learning_rate": 3.8583806493113445e-06, "loss": 0.7902717, "num_input_tokens_seen": 25920505, "step": 1222, "time_per_iteration": 2.758145809173584 }, { "auxiliary_loss_clip": 0.0121494, "auxiliary_loss_mlp": 0.01046133, "balance_loss_clip": 1.06453562, "balance_loss_mlp": 1.0351007, "epoch": 0.14705705525160825, "flos": 20777806782720.0, "grad_norm": 2.0887704716655238, "language_loss": 0.82391608, "learning_rate": 3.858092599196263e-06, "loss": 0.84652674, "num_input_tokens_seen": 25938460, "step": 1223, "time_per_iteration": 2.689225673675537 }, { "auxiliary_loss_clip": 0.01214397, "auxiliary_loss_mlp": 0.01041592, "balance_loss_clip": 1.06148612, "balance_loss_mlp": 1.03051138, "epoch": 0.14717729814224734, "flos": 29932944336000.0, "grad_norm": 3.23279300194928, "language_loss": 0.82237494, "learning_rate": 3.857804267210012e-06, "loss": 0.84493482, "num_input_tokens_seen": 25957760, "step": 1224, "time_per_iteration": 2.6660141944885254 }, { "auxiliary_loss_clip": 0.01172052, "auxiliary_loss_mlp": 0.01036203, "balance_loss_clip": 1.05426693, "balance_loss_mlp": 1.02502692, "epoch": 0.14729754103288642, "flos": 20047491457920.0, "grad_norm": 2.16563211812386, "language_loss": 0.88379008, "learning_rate": 3.857515653396331e-06, "loss": 0.90587264, "num_input_tokens_seen": 25974970, "step": 1225, "time_per_iteration": 2.6902647018432617 }, { "auxiliary_loss_clip": 0.01171989, "auxiliary_loss_mlp": 0.01037484, "balance_loss_clip": 1.0557642, "balance_loss_mlp": 1.02654099, "epoch": 0.14741778392352553, "flos": 19281516906240.0, "grad_norm": 2.1523590221682247, "language_loss": 0.87250215, "learning_rate": 3.857226757799002e-06, "loss": 0.89459693, "num_input_tokens_seen": 25992525, "step": 1226, "time_per_iteration": 2.755070924758911 }, { "auxiliary_loss_clip": 0.01198168, "auxiliary_loss_mlp": 0.01035144, "balance_loss_clip": 1.05614901, "balance_loss_mlp": 1.02449214, "epoch": 0.1475380268141646, "flos": 25411108999680.0, "grad_norm": 3.049291465948468, "language_loss": 0.74183846, "learning_rate": 3.85693758046185e-06, "loss": 0.7641716, "num_input_tokens_seen": 26010815, "step": 1227, "time_per_iteration": 2.7398159503936768 }, { "auxiliary_loss_clip": 0.01233688, "auxiliary_loss_mlp": 0.01039627, "balance_loss_clip": 1.06636882, "balance_loss_mlp": 1.0278852, "epoch": 0.1476582697048037, "flos": 20847652778880.0, "grad_norm": 1.719989944343744, "language_loss": 0.83042467, "learning_rate": 3.8566481214287435e-06, "loss": 0.85315788, "num_input_tokens_seen": 26028935, "step": 1228, "time_per_iteration": 2.6091153621673584 }, { "auxiliary_loss_clip": 0.01175198, "auxiliary_loss_mlp": 0.01046738, "balance_loss_clip": 1.05245531, "balance_loss_mlp": 1.03518116, "epoch": 0.1477785125954428, "flos": 14028109269120.0, "grad_norm": 2.024697828877587, "language_loss": 0.90692252, "learning_rate": 3.8563583807435935e-06, "loss": 0.92914188, "num_input_tokens_seen": 26045080, "step": 1229, "time_per_iteration": 2.8040363788604736 }, { "auxiliary_loss_clip": 0.01213082, "auxiliary_loss_mlp": 0.00765619, "balance_loss_clip": 1.05977607, "balance_loss_mlp": 1.00045073, "epoch": 0.1478987554860819, "flos": 20516699842560.0, "grad_norm": 1.7971076226353624, "language_loss": 0.77343976, "learning_rate": 3.856068358450353e-06, "loss": 0.79322672, "num_input_tokens_seen": 26065030, "step": 1230, "time_per_iteration": 2.712881088256836 }, { "auxiliary_loss_clip": 0.01196417, "auxiliary_loss_mlp": 0.01036849, "balance_loss_clip": 1.06255507, "balance_loss_mlp": 1.02533972, "epoch": 0.14801899837672097, "flos": 17857012360320.0, "grad_norm": 1.704728709384041, "language_loss": 0.86052501, "learning_rate": 3.8557780545930186e-06, "loss": 0.88285768, "num_input_tokens_seen": 26083445, "step": 1231, "time_per_iteration": 2.6880502700805664 }, { "auxiliary_loss_clip": 0.01195278, "auxiliary_loss_mlp": 0.01036512, "balance_loss_clip": 1.05822134, "balance_loss_mlp": 1.025527, "epoch": 0.14813924126736006, "flos": 20881408584960.0, "grad_norm": 1.8138645288613187, "language_loss": 0.79300463, "learning_rate": 3.855487469215628e-06, "loss": 0.81532246, "num_input_tokens_seen": 26102375, "step": 1232, "time_per_iteration": 2.676414966583252 }, { "auxiliary_loss_clip": 0.01179423, "auxiliary_loss_mlp": 0.01032504, "balance_loss_clip": 1.05518603, "balance_loss_mlp": 1.02151346, "epoch": 0.14825948415799917, "flos": 37414070496000.0, "grad_norm": 2.1431688119116306, "language_loss": 0.72569293, "learning_rate": 3.855196602362264e-06, "loss": 0.74781221, "num_input_tokens_seen": 26125295, "step": 1233, "time_per_iteration": 2.8021509647369385 }, { "auxiliary_loss_clip": 0.01213111, "auxiliary_loss_mlp": 0.01036529, "balance_loss_clip": 1.05954039, "balance_loss_mlp": 1.02527595, "epoch": 0.14837972704863825, "flos": 22014641744640.0, "grad_norm": 2.160107862357133, "language_loss": 0.94309986, "learning_rate": 3.854905454077051e-06, "loss": 0.96559632, "num_input_tokens_seen": 26142905, "step": 1234, "time_per_iteration": 2.6102452278137207 }, { "auxiliary_loss_clip": 0.01132882, "auxiliary_loss_mlp": 0.0103181, "balance_loss_clip": 1.0498848, "balance_loss_mlp": 1.02058685, "epoch": 0.14849996993927733, "flos": 20996323171200.0, "grad_norm": 2.0762463665928665, "language_loss": 0.8834399, "learning_rate": 3.854614024404155e-06, "loss": 0.90508682, "num_input_tokens_seen": 26161215, "step": 1235, "time_per_iteration": 2.753676652908325 }, { "auxiliary_loss_clip": 0.01183526, "auxiliary_loss_mlp": 0.0104003, "balance_loss_clip": 1.05339622, "balance_loss_mlp": 1.02891397, "epoch": 0.14862021282991644, "flos": 20047994248320.0, "grad_norm": 2.1900345219156496, "language_loss": 0.89382362, "learning_rate": 3.8543223133877865e-06, "loss": 0.91605914, "num_input_tokens_seen": 26179810, "step": 1236, "time_per_iteration": 2.686641216278076 }, { "auxiliary_loss_clip": 0.01178187, "auxiliary_loss_mlp": 0.01033472, "balance_loss_clip": 1.05352545, "balance_loss_mlp": 1.02150917, "epoch": 0.14874045572055553, "flos": 22712027276160.0, "grad_norm": 1.7254784548751605, "language_loss": 0.88607669, "learning_rate": 3.854030321072198e-06, "loss": 0.90819323, "num_input_tokens_seen": 26199715, "step": 1237, "time_per_iteration": 2.7118351459503174 }, { "auxiliary_loss_clip": 0.01187648, "auxiliary_loss_mlp": 0.01036741, "balance_loss_clip": 1.05731344, "balance_loss_mlp": 1.02608418, "epoch": 0.1488606986111946, "flos": 25411288567680.0, "grad_norm": 1.947605796348242, "language_loss": 0.73387992, "learning_rate": 3.853738047501682e-06, "loss": 0.75612384, "num_input_tokens_seen": 26220275, "step": 1238, "time_per_iteration": 3.721106767654419 }, { "auxiliary_loss_clip": 0.01214274, "auxiliary_loss_mlp": 0.01038241, "balance_loss_clip": 1.06100142, "balance_loss_mlp": 1.02708316, "epoch": 0.1489809415018337, "flos": 17018749687680.0, "grad_norm": 1.8854486018911207, "language_loss": 0.77631086, "learning_rate": 3.85344549272058e-06, "loss": 0.79883605, "num_input_tokens_seen": 26238255, "step": 1239, "time_per_iteration": 2.6427366733551025 }, { "auxiliary_loss_clip": 0.01206202, "auxiliary_loss_mlp": 0.01045409, "balance_loss_clip": 1.05728757, "balance_loss_mlp": 1.03465641, "epoch": 0.1491011843924728, "flos": 33659394860160.0, "grad_norm": 1.8034476970958224, "language_loss": 0.82624644, "learning_rate": 3.853152656773269e-06, "loss": 0.84876251, "num_input_tokens_seen": 26259690, "step": 1240, "time_per_iteration": 2.7432587146759033 }, { "auxiliary_loss_clip": 0.01195803, "auxiliary_loss_mlp": 0.01037, "balance_loss_clip": 1.05880284, "balance_loss_mlp": 1.02664661, "epoch": 0.14922142728311188, "flos": 21179000764800.0, "grad_norm": 1.8038287663705896, "language_loss": 0.85037917, "learning_rate": 3.852859539704174e-06, "loss": 0.87270713, "num_input_tokens_seen": 26278990, "step": 1241, "time_per_iteration": 2.663396120071411 }, { "auxiliary_loss_clip": 0.01168018, "auxiliary_loss_mlp": 0.01034375, "balance_loss_clip": 1.05496144, "balance_loss_mlp": 1.02315199, "epoch": 0.14934167017375097, "flos": 29860548474240.0, "grad_norm": 2.1693647055375997, "language_loss": 0.76379752, "learning_rate": 3.85256614155776e-06, "loss": 0.7858215, "num_input_tokens_seen": 26299120, "step": 1242, "time_per_iteration": 2.764119863510132 }, { "auxiliary_loss_clip": 0.01211666, "auxiliary_loss_mlp": 0.01037132, "balance_loss_clip": 1.05849314, "balance_loss_mlp": 1.02582479, "epoch": 0.14946191306439008, "flos": 17019216564480.0, "grad_norm": 3.867528129114803, "language_loss": 0.74389017, "learning_rate": 3.852272462378535e-06, "loss": 0.7663781, "num_input_tokens_seen": 26316995, "step": 1243, "time_per_iteration": 4.421874046325684 }, { "auxiliary_loss_clip": 0.01197605, "auxiliary_loss_mlp": 0.0104493, "balance_loss_clip": 1.0596931, "balance_loss_mlp": 1.03321171, "epoch": 0.14958215595502916, "flos": 15669047214720.0, "grad_norm": 2.2717501066451358, "language_loss": 0.77568674, "learning_rate": 3.85197850221105e-06, "loss": 0.79811215, "num_input_tokens_seen": 26333295, "step": 1244, "time_per_iteration": 2.577883243560791 }, { "auxiliary_loss_clip": 0.01213947, "auxiliary_loss_mlp": 0.01036381, "balance_loss_clip": 1.06350434, "balance_loss_mlp": 1.02564073, "epoch": 0.14970239884566824, "flos": 33108560818560.0, "grad_norm": 1.9109854236198984, "language_loss": 0.76320648, "learning_rate": 3.851684261099899e-06, "loss": 0.78570974, "num_input_tokens_seen": 26355035, "step": 1245, "time_per_iteration": 2.642407178878784 }, { "auxiliary_loss_clip": 0.01199269, "auxiliary_loss_mlp": 0.01039544, "balance_loss_clip": 1.06269789, "balance_loss_mlp": 1.02831483, "epoch": 0.14982264173630733, "flos": 17821245392640.0, "grad_norm": 1.9225540144171578, "language_loss": 0.86979043, "learning_rate": 3.851389739089718e-06, "loss": 0.89217854, "num_input_tokens_seen": 26371655, "step": 1246, "time_per_iteration": 2.55082368850708 }, { "auxiliary_loss_clip": 0.01217051, "auxiliary_loss_mlp": 0.01048271, "balance_loss_clip": 1.06520402, "balance_loss_mlp": 1.03636813, "epoch": 0.14994288462694644, "flos": 32409559175040.0, "grad_norm": 1.9765406680640472, "language_loss": 0.80425286, "learning_rate": 3.851094936225186e-06, "loss": 0.82690603, "num_input_tokens_seen": 26392540, "step": 1247, "time_per_iteration": 2.6267290115356445 }, { "auxiliary_loss_clip": 0.01193472, "auxiliary_loss_mlp": 0.01039562, "balance_loss_clip": 1.0598979, "balance_loss_mlp": 1.02893472, "epoch": 0.15006312751758552, "flos": 31794661226880.0, "grad_norm": 1.4403886486846316, "language_loss": 0.76860803, "learning_rate": 3.850799852551024e-06, "loss": 0.79093838, "num_input_tokens_seen": 26414960, "step": 1248, "time_per_iteration": 2.815251588821411 }, { "auxiliary_loss_clip": 0.01205783, "auxiliary_loss_mlp": 0.01036844, "balance_loss_clip": 1.05807889, "balance_loss_mlp": 1.0249052, "epoch": 0.1501833704082246, "flos": 16618022582400.0, "grad_norm": 2.098768869862525, "language_loss": 0.86086851, "learning_rate": 3.850504488111995e-06, "loss": 0.88329476, "num_input_tokens_seen": 26431635, "step": 1249, "time_per_iteration": 2.586219310760498 }, { "auxiliary_loss_clip": 0.01190596, "auxiliary_loss_mlp": 0.01037037, "balance_loss_clip": 1.05498421, "balance_loss_mlp": 1.02667797, "epoch": 0.15030361329886371, "flos": 23471178243840.0, "grad_norm": 1.7697391737190888, "language_loss": 0.82803303, "learning_rate": 3.850208842952907e-06, "loss": 0.85030931, "num_input_tokens_seen": 26450440, "step": 1250, "time_per_iteration": 2.7449941635131836 }, { "auxiliary_loss_clip": 0.01178829, "auxiliary_loss_mlp": 0.01038221, "balance_loss_clip": 1.05690706, "balance_loss_mlp": 1.02768242, "epoch": 0.1504238561895028, "flos": 25629409906560.0, "grad_norm": 2.6101226267658206, "language_loss": 0.79159009, "learning_rate": 3.849912917118608e-06, "loss": 0.81376064, "num_input_tokens_seen": 26471480, "step": 1251, "time_per_iteration": 2.779860258102417 }, { "auxiliary_loss_clip": 0.01122472, "auxiliary_loss_mlp": 0.0101448, "balance_loss_clip": 1.03202558, "balance_loss_mlp": 1.01111794, "epoch": 0.15054409908014188, "flos": 52095146129280.0, "grad_norm": 0.8775674771255021, "language_loss": 0.59289014, "learning_rate": 3.849616710653992e-06, "loss": 0.61425966, "num_input_tokens_seen": 26532950, "step": 1252, "time_per_iteration": 3.168269634246826 }, { "auxiliary_loss_clip": 0.01210076, "auxiliary_loss_mlp": 0.01040261, "balance_loss_clip": 1.05928135, "balance_loss_mlp": 1.02890658, "epoch": 0.150664341970781, "flos": 18880251096960.0, "grad_norm": 1.8689901611023974, "language_loss": 0.75104451, "learning_rate": 3.84932022360399e-06, "loss": 0.77354789, "num_input_tokens_seen": 26551615, "step": 1253, "time_per_iteration": 2.6280484199523926 }, { "auxiliary_loss_clip": 0.01195692, "auxiliary_loss_mlp": 0.01038377, "balance_loss_clip": 1.06087136, "balance_loss_mlp": 1.0280292, "epoch": 0.15078458486142007, "flos": 22163240309760.0, "grad_norm": 2.6089142786859885, "language_loss": 0.84899712, "learning_rate": 3.849023456013581e-06, "loss": 0.87133777, "num_input_tokens_seen": 26569175, "step": 1254, "time_per_iteration": 2.639106035232544 }, { "auxiliary_loss_clip": 0.01218913, "auxiliary_loss_mlp": 0.01039083, "balance_loss_clip": 1.06209481, "balance_loss_mlp": 1.02866983, "epoch": 0.15090482775205916, "flos": 26651894457600.0, "grad_norm": 2.120969529222285, "language_loss": 0.62871814, "learning_rate": 3.848726407927784e-06, "loss": 0.65129805, "num_input_tokens_seen": 26589560, "step": 1255, "time_per_iteration": 2.6973910331726074 }, { "auxiliary_loss_clip": 0.01202101, "auxiliary_loss_mlp": 0.01038571, "balance_loss_clip": 1.06353927, "balance_loss_mlp": 1.0270493, "epoch": 0.15102507064269824, "flos": 21798998444160.0, "grad_norm": 2.8240874565370095, "language_loss": 0.86614645, "learning_rate": 3.84842907939166e-06, "loss": 0.88855314, "num_input_tokens_seen": 26608785, "step": 1256, "time_per_iteration": 2.6589887142181396 }, { "auxiliary_loss_clip": 0.01173358, "auxiliary_loss_mlp": 0.01038454, "balance_loss_clip": 1.05466747, "balance_loss_mlp": 1.02765989, "epoch": 0.15114531353333735, "flos": 22820908377600.0, "grad_norm": 2.866018843632024, "language_loss": 0.70662761, "learning_rate": 3.8481314704503146e-06, "loss": 0.72874576, "num_input_tokens_seen": 26628615, "step": 1257, "time_per_iteration": 2.727477550506592 }, { "auxiliary_loss_clip": 0.01209913, "auxiliary_loss_mlp": 0.01044317, "balance_loss_clip": 1.06303227, "balance_loss_mlp": 1.03382051, "epoch": 0.15126555642397643, "flos": 19682674974720.0, "grad_norm": 2.314754622612884, "language_loss": 0.88067603, "learning_rate": 3.847833581148895e-06, "loss": 0.90321833, "num_input_tokens_seen": 26647525, "step": 1258, "time_per_iteration": 2.6216983795166016 }, { "auxiliary_loss_clip": 0.01224628, "auxiliary_loss_mlp": 0.01036248, "balance_loss_clip": 1.05974698, "balance_loss_mlp": 1.02529299, "epoch": 0.15138579931461552, "flos": 28726022424960.0, "grad_norm": 1.9793562406692984, "language_loss": 0.81299448, "learning_rate": 3.84753541153259e-06, "loss": 0.83560324, "num_input_tokens_seen": 26667095, "step": 1259, "time_per_iteration": 2.7240817546844482 }, { "auxiliary_loss_clip": 0.01214749, "auxiliary_loss_mlp": 0.01042208, "balance_loss_clip": 1.06173253, "balance_loss_mlp": 1.03119278, "epoch": 0.15150604220525463, "flos": 22127006465280.0, "grad_norm": 1.6380330321145067, "language_loss": 0.83293808, "learning_rate": 3.847236961646633e-06, "loss": 0.85550773, "num_input_tokens_seen": 26686075, "step": 1260, "time_per_iteration": 2.708862781524658 }, { "auxiliary_loss_clip": 0.01191439, "auxiliary_loss_mlp": 0.01043375, "balance_loss_clip": 1.05748093, "balance_loss_mlp": 1.03234172, "epoch": 0.1516262850958937, "flos": 12968708515200.0, "grad_norm": 2.715761959035085, "language_loss": 0.78000396, "learning_rate": 3.846938231536296e-06, "loss": 0.80235207, "num_input_tokens_seen": 26701695, "step": 1261, "time_per_iteration": 2.661027431488037 }, { "auxiliary_loss_clip": 0.01217566, "auxiliary_loss_mlp": 0.01040465, "balance_loss_clip": 1.06398702, "balance_loss_mlp": 1.02859211, "epoch": 0.1517465279865328, "flos": 21797130936960.0, "grad_norm": 1.8020792689945944, "language_loss": 0.81134558, "learning_rate": 3.8466392212468995e-06, "loss": 0.83392584, "num_input_tokens_seen": 26721885, "step": 1262, "time_per_iteration": 2.6364200115203857 }, { "auxiliary_loss_clip": 0.01098162, "auxiliary_loss_mlp": 0.01009657, "balance_loss_clip": 1.02627468, "balance_loss_mlp": 1.00629544, "epoch": 0.15186677087717187, "flos": 58174569901440.0, "grad_norm": 0.8279155592851141, "language_loss": 0.61927038, "learning_rate": 3.8463399308238e-06, "loss": 0.64034855, "num_input_tokens_seen": 26780990, "step": 1263, "time_per_iteration": 3.2125496864318848 }, { "auxiliary_loss_clip": 0.01213478, "auxiliary_loss_mlp": 0.01044309, "balance_loss_clip": 1.06178522, "balance_loss_mlp": 1.03349018, "epoch": 0.15198701376781099, "flos": 32669696448000.0, "grad_norm": 2.8151698714914954, "language_loss": 0.6421411, "learning_rate": 3.846040360312402e-06, "loss": 0.66471899, "num_input_tokens_seen": 26804250, "step": 1264, "time_per_iteration": 3.662544012069702 }, { "auxiliary_loss_clip": 0.01225093, "auxiliary_loss_mlp": 0.01042173, "balance_loss_clip": 1.05884719, "balance_loss_mlp": 1.0310626, "epoch": 0.15210725665845007, "flos": 28402575431040.0, "grad_norm": 2.203914782878996, "language_loss": 0.81170654, "learning_rate": 3.8457405097581485e-06, "loss": 0.83437932, "num_input_tokens_seen": 26823240, "step": 1265, "time_per_iteration": 2.6349384784698486 }, { "auxiliary_loss_clip": 0.01168788, "auxiliary_loss_mlp": 0.01036556, "balance_loss_clip": 1.0509758, "balance_loss_mlp": 1.02489734, "epoch": 0.15222749954908915, "flos": 19938179393280.0, "grad_norm": 1.7320737088909786, "language_loss": 0.78480041, "learning_rate": 3.8454403792065275e-06, "loss": 0.80685389, "num_input_tokens_seen": 26842060, "step": 1266, "time_per_iteration": 2.740194082260132 }, { "auxiliary_loss_clip": 0.01172993, "auxiliary_loss_mlp": 0.01048142, "balance_loss_clip": 1.05526578, "balance_loss_mlp": 1.0371685, "epoch": 0.15234774243972826, "flos": 21324223451520.0, "grad_norm": 2.0101612906626887, "language_loss": 0.85991836, "learning_rate": 3.845139968703068e-06, "loss": 0.88212967, "num_input_tokens_seen": 26859580, "step": 1267, "time_per_iteration": 2.6715123653411865 }, { "auxiliary_loss_clip": 0.01162759, "auxiliary_loss_mlp": 0.01038033, "balance_loss_clip": 1.05241513, "balance_loss_mlp": 1.02732229, "epoch": 0.15246798533036734, "flos": 25957812977280.0, "grad_norm": 1.9489894574471673, "language_loss": 0.8275584, "learning_rate": 3.844839278293342e-06, "loss": 0.84956634, "num_input_tokens_seen": 26880430, "step": 1268, "time_per_iteration": 2.7834231853485107 }, { "auxiliary_loss_clip": 0.01230933, "auxiliary_loss_mlp": 0.01038224, "balance_loss_clip": 1.0663023, "balance_loss_mlp": 1.02758491, "epoch": 0.15258822822100643, "flos": 25811907932160.0, "grad_norm": 2.2605617477309905, "language_loss": 0.76936567, "learning_rate": 3.8445383080229654e-06, "loss": 0.79205728, "num_input_tokens_seen": 26896445, "step": 1269, "time_per_iteration": 3.5650291442871094 }, { "auxiliary_loss_clip": 0.01187059, "auxiliary_loss_mlp": 0.01036245, "balance_loss_clip": 1.05597997, "balance_loss_mlp": 1.02438378, "epoch": 0.1527084711116455, "flos": 25265455349760.0, "grad_norm": 2.3146121832539244, "language_loss": 0.73455602, "learning_rate": 3.844237057937593e-06, "loss": 0.75678909, "num_input_tokens_seen": 26915450, "step": 1270, "time_per_iteration": 3.644118309020996 }, { "auxiliary_loss_clip": 0.01215851, "auxiliary_loss_mlp": 0.01035536, "balance_loss_clip": 1.06042027, "balance_loss_mlp": 1.02463377, "epoch": 0.15282871400228462, "flos": 29240227572480.0, "grad_norm": 2.379709745001183, "language_loss": 0.78080845, "learning_rate": 3.843935528082926e-06, "loss": 0.80332232, "num_input_tokens_seen": 26936475, "step": 1271, "time_per_iteration": 2.6751418113708496 }, { "auxiliary_loss_clip": 0.01210739, "auxiliary_loss_mlp": 0.0103945, "balance_loss_clip": 1.05877256, "balance_loss_mlp": 1.02797627, "epoch": 0.1529489568929237, "flos": 20882952869760.0, "grad_norm": 1.7119555867298408, "language_loss": 0.84927511, "learning_rate": 3.843633718504704e-06, "loss": 0.87177694, "num_input_tokens_seen": 26954920, "step": 1272, "time_per_iteration": 2.6493210792541504 }, { "auxiliary_loss_clip": 0.01182148, "auxiliary_loss_mlp": 0.01036479, "balance_loss_clip": 1.05598104, "balance_loss_mlp": 1.02492166, "epoch": 0.1530691997835628, "flos": 20083833043200.0, "grad_norm": 2.41101394043956, "language_loss": 0.90229356, "learning_rate": 3.843331629248715e-06, "loss": 0.92447984, "num_input_tokens_seen": 26972520, "step": 1273, "time_per_iteration": 2.736314296722412 }, { "auxiliary_loss_clip": 0.01227278, "auxiliary_loss_mlp": 0.01035449, "balance_loss_clip": 1.06334352, "balance_loss_mlp": 1.02500033, "epoch": 0.1531894426742019, "flos": 28759814144640.0, "grad_norm": 2.11349647091796, "language_loss": 0.76929057, "learning_rate": 3.843029260360782e-06, "loss": 0.79191786, "num_input_tokens_seen": 26990890, "step": 1274, "time_per_iteration": 2.681389331817627 }, { "auxiliary_loss_clip": 0.01209068, "auxiliary_loss_mlp": 0.01039604, "balance_loss_clip": 1.06064916, "balance_loss_mlp": 1.02941132, "epoch": 0.15330968556484098, "flos": 22236282616320.0, "grad_norm": 2.0508964171238455, "language_loss": 0.78829205, "learning_rate": 3.8427266118867755e-06, "loss": 0.8107788, "num_input_tokens_seen": 27010640, "step": 1275, "time_per_iteration": 2.7134149074554443 }, { "auxiliary_loss_clip": 0.01196377, "auxiliary_loss_mlp": 0.0103788, "balance_loss_clip": 1.05854177, "balance_loss_mlp": 1.02660239, "epoch": 0.15342992845548006, "flos": 27527504296320.0, "grad_norm": 3.9984130307014487, "language_loss": 0.83514953, "learning_rate": 3.842423683872608e-06, "loss": 0.85749209, "num_input_tokens_seen": 27031215, "step": 1276, "time_per_iteration": 2.768399953842163 }, { "auxiliary_loss_clip": 0.01212443, "auxiliary_loss_mlp": 0.01037727, "balance_loss_clip": 1.06028962, "balance_loss_mlp": 1.02625966, "epoch": 0.15355017134611917, "flos": 19609596754560.0, "grad_norm": 5.278392600736629, "language_loss": 0.77593124, "learning_rate": 3.842120476364232e-06, "loss": 0.79843295, "num_input_tokens_seen": 27049665, "step": 1277, "time_per_iteration": 2.5766642093658447 }, { "auxiliary_loss_clip": 0.01214933, "auxiliary_loss_mlp": 0.01048325, "balance_loss_clip": 1.05987394, "balance_loss_mlp": 1.03670812, "epoch": 0.15367041423675826, "flos": 18478590238080.0, "grad_norm": 2.2094644234524177, "language_loss": 0.8361268, "learning_rate": 3.841816989407644e-06, "loss": 0.8587594, "num_input_tokens_seen": 27065155, "step": 1278, "time_per_iteration": 2.6689441204071045 }, { "auxiliary_loss_clip": 0.01180367, "auxiliary_loss_mlp": 0.0103468, "balance_loss_clip": 1.05645967, "balance_loss_mlp": 1.0239929, "epoch": 0.15379065712739734, "flos": 41427662342400.0, "grad_norm": 2.026232934352127, "language_loss": 0.7673614, "learning_rate": 3.841513223048884e-06, "loss": 0.78951192, "num_input_tokens_seen": 27085840, "step": 1279, "time_per_iteration": 2.832150459289551 }, { "auxiliary_loss_clip": 0.01180082, "auxiliary_loss_mlp": 0.01041048, "balance_loss_clip": 1.05566669, "balance_loss_mlp": 1.030123, "epoch": 0.15391090001803642, "flos": 22054215553920.0, "grad_norm": 2.593592270047382, "language_loss": 0.78879118, "learning_rate": 3.841209177334031e-06, "loss": 0.81100249, "num_input_tokens_seen": 27104200, "step": 1280, "time_per_iteration": 2.7665064334869385 }, { "auxiliary_loss_clip": 0.01209514, "auxiliary_loss_mlp": 0.01041037, "balance_loss_clip": 1.0622201, "balance_loss_mlp": 1.03032613, "epoch": 0.15403114290867553, "flos": 15450351258240.0, "grad_norm": 1.8842519683169598, "language_loss": 0.7498098, "learning_rate": 3.84090485230921e-06, "loss": 0.77231526, "num_input_tokens_seen": 27122440, "step": 1281, "time_per_iteration": 2.598557949066162 }, { "auxiliary_loss_clip": 0.01226399, "auxiliary_loss_mlp": 0.01032292, "balance_loss_clip": 1.06296921, "balance_loss_mlp": 1.02124786, "epoch": 0.15415138579931462, "flos": 17929156826880.0, "grad_norm": 7.22433559557855, "language_loss": 0.76386023, "learning_rate": 3.840600248020588e-06, "loss": 0.78644717, "num_input_tokens_seen": 27139380, "step": 1282, "time_per_iteration": 2.8246712684631348 }, { "auxiliary_loss_clip": 0.01201647, "auxiliary_loss_mlp": 0.0103965, "balance_loss_clip": 1.05748272, "balance_loss_mlp": 1.02825916, "epoch": 0.1542716286899537, "flos": 11429325296640.0, "grad_norm": 3.2395284955879258, "language_loss": 0.80170673, "learning_rate": 3.840295364514371e-06, "loss": 0.82411969, "num_input_tokens_seen": 27156760, "step": 1283, "time_per_iteration": 2.645066261291504 }, { "auxiliary_loss_clip": 0.01195441, "auxiliary_loss_mlp": 0.01037822, "balance_loss_clip": 1.05800343, "balance_loss_mlp": 1.02767777, "epoch": 0.1543918715805928, "flos": 17420338719360.0, "grad_norm": 2.6566751164067064, "language_loss": 0.78854489, "learning_rate": 3.83999020183681e-06, "loss": 0.81087756, "num_input_tokens_seen": 27175455, "step": 1284, "time_per_iteration": 2.6616122722625732 }, { "auxiliary_loss_clip": 0.01147475, "auxiliary_loss_mlp": 0.0103352, "balance_loss_clip": 1.05147338, "balance_loss_mlp": 1.02289808, "epoch": 0.1545121144712319, "flos": 17786376264960.0, "grad_norm": 1.8245517485877292, "language_loss": 0.78946495, "learning_rate": 3.839684760034199e-06, "loss": 0.81127489, "num_input_tokens_seen": 27193660, "step": 1285, "time_per_iteration": 2.7298710346221924 }, { "auxiliary_loss_clip": 0.01174063, "auxiliary_loss_mlp": 0.01034765, "balance_loss_clip": 1.05754209, "balance_loss_mlp": 1.02365494, "epoch": 0.15463235736187098, "flos": 28220185146240.0, "grad_norm": 2.63103091412387, "language_loss": 0.65142268, "learning_rate": 3.8393790391528716e-06, "loss": 0.67351097, "num_input_tokens_seen": 27214355, "step": 1286, "time_per_iteration": 2.7687134742736816 }, { "auxiliary_loss_clip": 0.01195758, "auxiliary_loss_mlp": 0.01033107, "balance_loss_clip": 1.05759835, "balance_loss_mlp": 1.02236056, "epoch": 0.15475260025251006, "flos": 22856890826880.0, "grad_norm": 1.9558125119649594, "language_loss": 0.89503682, "learning_rate": 3.8390730392392075e-06, "loss": 0.9173255, "num_input_tokens_seen": 27234335, "step": 1287, "time_per_iteration": 2.7378504276275635 }, { "auxiliary_loss_clip": 0.01230329, "auxiliary_loss_mlp": 0.01030061, "balance_loss_clip": 1.06604218, "balance_loss_mlp": 1.01977909, "epoch": 0.15487284314314917, "flos": 17602872658560.0, "grad_norm": 2.393964559601533, "language_loss": 0.79352933, "learning_rate": 3.838766760339626e-06, "loss": 0.81613326, "num_input_tokens_seen": 27252860, "step": 1288, "time_per_iteration": 2.583246946334839 }, { "auxiliary_loss_clip": 0.01159915, "auxiliary_loss_mlp": 0.01035622, "balance_loss_clip": 1.05227792, "balance_loss_mlp": 1.0242312, "epoch": 0.15499308603378825, "flos": 20082037363200.0, "grad_norm": 2.5679836129045537, "language_loss": 0.792768, "learning_rate": 3.838460202500587e-06, "loss": 0.81472337, "num_input_tokens_seen": 27268650, "step": 1289, "time_per_iteration": 2.7179596424102783 }, { "auxiliary_loss_clip": 0.01181499, "auxiliary_loss_mlp": 0.01036261, "balance_loss_clip": 1.06191945, "balance_loss_mlp": 1.02535355, "epoch": 0.15511332892442733, "flos": 15918051271680.0, "grad_norm": 2.8762997256528533, "language_loss": 0.7399981, "learning_rate": 3.838153365768599e-06, "loss": 0.76217568, "num_input_tokens_seen": 27285160, "step": 1290, "time_per_iteration": 3.6000165939331055 }, { "auxiliary_loss_clip": 0.01181552, "auxiliary_loss_mlp": 0.01036968, "balance_loss_clip": 1.06176651, "balance_loss_mlp": 1.02560091, "epoch": 0.15523357181506645, "flos": 41282475569280.0, "grad_norm": 2.338496168978605, "language_loss": 0.7477892, "learning_rate": 3.837846250190206e-06, "loss": 0.76997441, "num_input_tokens_seen": 27308025, "step": 1291, "time_per_iteration": 2.849884510040283 }, { "auxiliary_loss_clip": 0.01164162, "auxiliary_loss_mlp": 0.00764859, "balance_loss_clip": 1.05550861, "balance_loss_mlp": 1.00049973, "epoch": 0.15535381470570553, "flos": 18478769806080.0, "grad_norm": 2.0433099659576057, "language_loss": 0.76871532, "learning_rate": 3.837538855811998e-06, "loss": 0.78800553, "num_input_tokens_seen": 27326200, "step": 1292, "time_per_iteration": 2.691004514694214 }, { "auxiliary_loss_clip": 0.01200183, "auxiliary_loss_mlp": 0.01047123, "balance_loss_clip": 1.05987108, "balance_loss_mlp": 1.03559542, "epoch": 0.1554740575963446, "flos": 13918150759680.0, "grad_norm": 2.3016446650573936, "language_loss": 0.70889395, "learning_rate": 3.837231182680606e-06, "loss": 0.73136699, "num_input_tokens_seen": 27344165, "step": 1293, "time_per_iteration": 2.7653911113739014 }, { "auxiliary_loss_clip": 0.01212321, "auxiliary_loss_mlp": 0.0103523, "balance_loss_clip": 1.05959964, "balance_loss_mlp": 1.02510309, "epoch": 0.1555943004869837, "flos": 20847078161280.0, "grad_norm": 1.856622557441132, "language_loss": 0.76360977, "learning_rate": 3.836923230842706e-06, "loss": 0.78608525, "num_input_tokens_seen": 27363280, "step": 1294, "time_per_iteration": 2.653813123703003 }, { "auxiliary_loss_clip": 0.01166589, "auxiliary_loss_mlp": 0.01034666, "balance_loss_clip": 1.05222058, "balance_loss_mlp": 1.02415776, "epoch": 0.1557145433776228, "flos": 22085888371200.0, "grad_norm": 2.4241691458439214, "language_loss": 0.8078742, "learning_rate": 3.836615000345011e-06, "loss": 0.82988679, "num_input_tokens_seen": 27381460, "step": 1295, "time_per_iteration": 5.447427272796631 }, { "auxiliary_loss_clip": 0.01226344, "auxiliary_loss_mlp": 0.01034741, "balance_loss_clip": 1.06285632, "balance_loss_mlp": 1.02392864, "epoch": 0.1558347862682619, "flos": 19791987039360.0, "grad_norm": 2.239363065816323, "language_loss": 0.77996665, "learning_rate": 3.836306491234282e-06, "loss": 0.8025775, "num_input_tokens_seen": 27399310, "step": 1296, "time_per_iteration": 2.57647705078125 }, { "auxiliary_loss_clip": 0.01193919, "auxiliary_loss_mlp": 0.01040276, "balance_loss_clip": 1.06381035, "balance_loss_mlp": 1.02985716, "epoch": 0.15595502915890097, "flos": 17237086508160.0, "grad_norm": 2.4698675423644625, "language_loss": 0.76080388, "learning_rate": 3.835997703557317e-06, "loss": 0.78314584, "num_input_tokens_seen": 27416050, "step": 1297, "time_per_iteration": 2.6633152961730957 }, { "auxiliary_loss_clip": 0.01164481, "auxiliary_loss_mlp": 0.01035777, "balance_loss_clip": 1.05081856, "balance_loss_mlp": 1.0254004, "epoch": 0.15607527204954008, "flos": 19719519350400.0, "grad_norm": 1.881595972596585, "language_loss": 0.80037606, "learning_rate": 3.83568863736096e-06, "loss": 0.82237864, "num_input_tokens_seen": 27434920, "step": 1298, "time_per_iteration": 2.730390787124634 }, { "auxiliary_loss_clip": 0.0117888, "auxiliary_loss_mlp": 0.01039281, "balance_loss_clip": 1.05119789, "balance_loss_mlp": 1.02861762, "epoch": 0.15619551494017916, "flos": 18515650095360.0, "grad_norm": 3.5045548507245923, "language_loss": 0.89135909, "learning_rate": 3.8353792926920975e-06, "loss": 0.91354072, "num_input_tokens_seen": 27453570, "step": 1299, "time_per_iteration": 2.6947710514068604 }, { "auxiliary_loss_clip": 0.01220166, "auxiliary_loss_mlp": 0.01036328, "balance_loss_clip": 1.06393528, "balance_loss_mlp": 1.02431214, "epoch": 0.15631575783081825, "flos": 19902125116800.0, "grad_norm": 2.4166952092060554, "language_loss": 0.81919348, "learning_rate": 3.835069669597655e-06, "loss": 0.84175843, "num_input_tokens_seen": 27471960, "step": 1300, "time_per_iteration": 2.6733474731445312 }, { "auxiliary_loss_clip": 0.01215533, "auxiliary_loss_mlp": 0.00765728, "balance_loss_clip": 1.06076205, "balance_loss_mlp": 1.0005548, "epoch": 0.15643600072145733, "flos": 20777663128320.0, "grad_norm": 2.7407490348045576, "language_loss": 0.79506087, "learning_rate": 3.834759768124603e-06, "loss": 0.81487346, "num_input_tokens_seen": 27490835, "step": 1301, "time_per_iteration": 2.6863908767700195 }, { "auxiliary_loss_clip": 0.01186797, "auxiliary_loss_mlp": 0.0103821, "balance_loss_clip": 1.06176066, "balance_loss_mlp": 1.02717161, "epoch": 0.15655624361209644, "flos": 18546389159040.0, "grad_norm": 2.166714178251868, "language_loss": 0.76257992, "learning_rate": 3.834449588319953e-06, "loss": 0.78482997, "num_input_tokens_seen": 27508870, "step": 1302, "time_per_iteration": 2.7003772258758545 }, { "auxiliary_loss_clip": 0.0121121, "auxiliary_loss_mlp": 0.0103945, "balance_loss_clip": 1.06599391, "balance_loss_mlp": 1.02938914, "epoch": 0.15667648650273552, "flos": 25229544727680.0, "grad_norm": 1.9437962396540251, "language_loss": 0.85080099, "learning_rate": 3.834139130230758e-06, "loss": 0.87330759, "num_input_tokens_seen": 27528175, "step": 1303, "time_per_iteration": 2.7817845344543457 }, { "auxiliary_loss_clip": 0.01197612, "auxiliary_loss_mlp": 0.01040529, "balance_loss_clip": 1.0567503, "balance_loss_mlp": 1.02953172, "epoch": 0.1567967293933746, "flos": 24827093769600.0, "grad_norm": 1.6468386960634478, "language_loss": 0.81439406, "learning_rate": 3.833828393904117e-06, "loss": 0.83677548, "num_input_tokens_seen": 27548455, "step": 1304, "time_per_iteration": 2.730656623840332 }, { "auxiliary_loss_clip": 0.01162226, "auxiliary_loss_mlp": 0.01047403, "balance_loss_clip": 1.05428195, "balance_loss_mlp": 1.03696668, "epoch": 0.15691697228401372, "flos": 19164555244800.0, "grad_norm": 2.263429126983323, "language_loss": 0.77465695, "learning_rate": 3.833517379387165e-06, "loss": 0.79675323, "num_input_tokens_seen": 27564910, "step": 1305, "time_per_iteration": 2.7459049224853516 }, { "auxiliary_loss_clip": 0.01212993, "auxiliary_loss_mlp": 0.01040287, "balance_loss_clip": 1.06077123, "balance_loss_mlp": 1.02987432, "epoch": 0.1570372151746528, "flos": 24790931752320.0, "grad_norm": 1.898011724123316, "language_loss": 0.88723552, "learning_rate": 3.833206086727085e-06, "loss": 0.9097684, "num_input_tokens_seen": 27584260, "step": 1306, "time_per_iteration": 2.86450457572937 }, { "auxiliary_loss_clip": 0.01181492, "auxiliary_loss_mlp": 0.01036395, "balance_loss_clip": 1.05282438, "balance_loss_mlp": 1.02578568, "epoch": 0.15715745806529188, "flos": 24863650836480.0, "grad_norm": 2.2892520885911556, "language_loss": 0.70732301, "learning_rate": 3.8328945159710994e-06, "loss": 0.7295019, "num_input_tokens_seen": 27604440, "step": 1307, "time_per_iteration": 2.7197964191436768 }, { "auxiliary_loss_clip": 0.01223028, "auxiliary_loss_mlp": 0.00765431, "balance_loss_clip": 1.06799102, "balance_loss_mlp": 1.00047064, "epoch": 0.157277700955931, "flos": 21872148491520.0, "grad_norm": 2.2442982339529753, "language_loss": 0.88882279, "learning_rate": 3.832582667166473e-06, "loss": 0.90870732, "num_input_tokens_seen": 27624250, "step": 1308, "time_per_iteration": 2.7076189517974854 }, { "auxiliary_loss_clip": 0.01195606, "auxiliary_loss_mlp": 0.01044342, "balance_loss_clip": 1.05981553, "balance_loss_mlp": 1.03311241, "epoch": 0.15739794384657008, "flos": 24533344344960.0, "grad_norm": 1.8370767738617375, "language_loss": 0.81799906, "learning_rate": 3.8322705403605125e-06, "loss": 0.84039855, "num_input_tokens_seen": 27644595, "step": 1309, "time_per_iteration": 2.751643657684326 }, { "auxiliary_loss_clip": 0.01188594, "auxiliary_loss_mlp": 0.01033042, "balance_loss_clip": 1.05698895, "balance_loss_mlp": 1.02235484, "epoch": 0.15751818673720916, "flos": 17745329998080.0, "grad_norm": 2.025354039780842, "language_loss": 0.81564081, "learning_rate": 3.831958135600568e-06, "loss": 0.83785725, "num_input_tokens_seen": 27662145, "step": 1310, "time_per_iteration": 2.747087240219116 }, { "auxiliary_loss_clip": 0.01214308, "auxiliary_loss_mlp": 0.01041787, "balance_loss_clip": 1.06346655, "balance_loss_mlp": 1.03123116, "epoch": 0.15763842962784824, "flos": 17858520731520.0, "grad_norm": 2.0888562355766638, "language_loss": 0.7936219, "learning_rate": 3.831645452934032e-06, "loss": 0.81618285, "num_input_tokens_seen": 27680575, "step": 1311, "time_per_iteration": 2.6973438262939453 }, { "auxiliary_loss_clip": 0.01230768, "auxiliary_loss_mlp": 0.01040346, "balance_loss_clip": 1.0676899, "balance_loss_mlp": 1.02847838, "epoch": 0.15775867251848735, "flos": 26980908059520.0, "grad_norm": 1.918264281530322, "language_loss": 0.80265933, "learning_rate": 3.831332492408336e-06, "loss": 0.82537049, "num_input_tokens_seen": 27701985, "step": 1312, "time_per_iteration": 2.6607227325439453 }, { "auxiliary_loss_clip": 0.01192595, "auxiliary_loss_mlp": 0.01031664, "balance_loss_clip": 1.05737209, "balance_loss_mlp": 1.02103686, "epoch": 0.15787891540912644, "flos": 19240398812160.0, "grad_norm": 2.7755379098816166, "language_loss": 0.69333172, "learning_rate": 3.831019254070957e-06, "loss": 0.71557438, "num_input_tokens_seen": 27719770, "step": 1313, "time_per_iteration": 2.6310677528381348 }, { "auxiliary_loss_clip": 0.01167983, "auxiliary_loss_mlp": 0.01039068, "balance_loss_clip": 1.05372524, "balance_loss_mlp": 1.028548, "epoch": 0.15799915829976552, "flos": 27271102037760.0, "grad_norm": 3.2528012643946744, "language_loss": 0.94788027, "learning_rate": 3.8307057379694135e-06, "loss": 0.96995068, "num_input_tokens_seen": 27739105, "step": 1314, "time_per_iteration": 2.79300594329834 }, { "auxiliary_loss_clip": 0.01228975, "auxiliary_loss_mlp": 0.01038354, "balance_loss_clip": 1.06251121, "balance_loss_mlp": 1.02802491, "epoch": 0.15811940119040463, "flos": 20405520270720.0, "grad_norm": 2.2933259017625085, "language_loss": 0.82580692, "learning_rate": 3.830391944151264e-06, "loss": 0.84848017, "num_input_tokens_seen": 27754985, "step": 1315, "time_per_iteration": 2.5895400047302246 }, { "auxiliary_loss_clip": 0.01198072, "auxiliary_loss_mlp": 0.01035952, "balance_loss_clip": 1.05833983, "balance_loss_mlp": 1.02533031, "epoch": 0.1582396440810437, "flos": 32599347661440.0, "grad_norm": 1.7421330026681197, "language_loss": 0.67478943, "learning_rate": 3.830077872664114e-06, "loss": 0.69712973, "num_input_tokens_seen": 27776110, "step": 1316, "time_per_iteration": 3.8042240142822266 }, { "auxiliary_loss_clip": 0.01154932, "auxiliary_loss_mlp": 0.01045039, "balance_loss_clip": 1.05350304, "balance_loss_mlp": 1.03472114, "epoch": 0.1583598869716828, "flos": 33800559310080.0, "grad_norm": 1.7706233568705718, "language_loss": 0.73570228, "learning_rate": 3.829763523555604e-06, "loss": 0.75770199, "num_input_tokens_seen": 27796510, "step": 1317, "time_per_iteration": 2.906458616256714 }, { "auxiliary_loss_clip": 0.01211216, "auxiliary_loss_mlp": 0.01038609, "balance_loss_clip": 1.06701887, "balance_loss_mlp": 1.02843451, "epoch": 0.15848012986232188, "flos": 24681332378880.0, "grad_norm": 2.29847518511585, "language_loss": 0.7858097, "learning_rate": 3.829448896873423e-06, "loss": 0.80830801, "num_input_tokens_seen": 27815610, "step": 1318, "time_per_iteration": 2.6642143726348877 }, { "auxiliary_loss_clip": 0.01155234, "auxiliary_loss_mlp": 0.00765891, "balance_loss_clip": 1.05721354, "balance_loss_mlp": 1.00052655, "epoch": 0.158600372752961, "flos": 22602068766720.0, "grad_norm": 2.067408346680746, "language_loss": 0.79493648, "learning_rate": 3.829133992665299e-06, "loss": 0.81414771, "num_input_tokens_seen": 27834735, "step": 1319, "time_per_iteration": 2.7497541904449463 }, { "auxiliary_loss_clip": 0.01202104, "auxiliary_loss_mlp": 0.01034725, "balance_loss_clip": 1.06061745, "balance_loss_mlp": 1.02459788, "epoch": 0.15872061564360007, "flos": 27927944092800.0, "grad_norm": 2.3398467983765707, "language_loss": 0.8901279, "learning_rate": 3.828818810979002e-06, "loss": 0.91249615, "num_input_tokens_seen": 27853065, "step": 1320, "time_per_iteration": 4.507674217224121 }, { "auxiliary_loss_clip": 0.01228672, "auxiliary_loss_mlp": 0.010434, "balance_loss_clip": 1.06649208, "balance_loss_mlp": 1.03298068, "epoch": 0.15884085853423915, "flos": 23696805525120.0, "grad_norm": 1.7204090021091543, "language_loss": 0.80384701, "learning_rate": 3.8285033518623454e-06, "loss": 0.82656777, "num_input_tokens_seen": 27873315, "step": 1321, "time_per_iteration": 3.5840232372283936 }, { "auxiliary_loss_clip": 0.01218653, "auxiliary_loss_mlp": 0.01044678, "balance_loss_clip": 1.06445539, "balance_loss_mlp": 1.03343678, "epoch": 0.15896110142487826, "flos": 23112359331840.0, "grad_norm": 2.7430607004755263, "language_loss": 0.81399989, "learning_rate": 3.8281876153631845e-06, "loss": 0.83663321, "num_input_tokens_seen": 27890070, "step": 1322, "time_per_iteration": 2.6522269248962402 }, { "auxiliary_loss_clip": 0.01163285, "auxiliary_loss_mlp": 0.01036927, "balance_loss_clip": 1.05560255, "balance_loss_mlp": 1.02592444, "epoch": 0.15908134431551735, "flos": 14685238632960.0, "grad_norm": 1.956180091575244, "language_loss": 0.65208936, "learning_rate": 3.827871601529416e-06, "loss": 0.67409152, "num_input_tokens_seen": 27908590, "step": 1323, "time_per_iteration": 2.762910842895508 }, { "auxiliary_loss_clip": 0.01180463, "auxiliary_loss_mlp": 0.01032917, "balance_loss_clip": 1.05753696, "balance_loss_mlp": 1.02264094, "epoch": 0.15920158720615643, "flos": 20193611984640.0, "grad_norm": 1.8267453161054032, "language_loss": 0.80638754, "learning_rate": 3.827555310408979e-06, "loss": 0.82852137, "num_input_tokens_seen": 27927985, "step": 1324, "time_per_iteration": 2.7244620323181152 }, { "auxiliary_loss_clip": 0.01183984, "auxiliary_loss_mlp": 0.01039571, "balance_loss_clip": 1.0630002, "balance_loss_mlp": 1.02829361, "epoch": 0.1593218300967955, "flos": 24826626892800.0, "grad_norm": 1.6246529764713744, "language_loss": 0.82331175, "learning_rate": 3.827238742049854e-06, "loss": 0.84554732, "num_input_tokens_seen": 27948280, "step": 1325, "time_per_iteration": 2.780841112136841 }, { "auxiliary_loss_clip": 0.01225537, "auxiliary_loss_mlp": 0.01038213, "balance_loss_clip": 1.06184673, "balance_loss_mlp": 1.02729344, "epoch": 0.15944207298743462, "flos": 28328707111680.0, "grad_norm": 1.8793773974062478, "language_loss": 0.52288342, "learning_rate": 3.826921896500066e-06, "loss": 0.5455209, "num_input_tokens_seen": 27969565, "step": 1326, "time_per_iteration": 2.6798951625823975 }, { "auxiliary_loss_clip": 0.01188077, "auxiliary_loss_mlp": 0.01039111, "balance_loss_clip": 1.06079531, "balance_loss_mlp": 1.02833509, "epoch": 0.1595623158780737, "flos": 22964838174720.0, "grad_norm": 1.8781466863466236, "language_loss": 0.78703606, "learning_rate": 3.826604773807678e-06, "loss": 0.80930793, "num_input_tokens_seen": 27987540, "step": 1327, "time_per_iteration": 2.744772434234619 }, { "auxiliary_loss_clip": 0.01199564, "auxiliary_loss_mlp": 0.01044591, "balance_loss_clip": 1.05766582, "balance_loss_mlp": 1.03252089, "epoch": 0.1596825587687128, "flos": 19710540950400.0, "grad_norm": 2.809090677346955, "language_loss": 0.73693991, "learning_rate": 3.826287374020798e-06, "loss": 0.75938141, "num_input_tokens_seen": 28002345, "step": 1328, "time_per_iteration": 2.6480305194854736 }, { "auxiliary_loss_clip": 0.01230119, "auxiliary_loss_mlp": 0.01039869, "balance_loss_clip": 1.06746447, "balance_loss_mlp": 1.02952111, "epoch": 0.1598028016593519, "flos": 22637727993600.0, "grad_norm": 2.0742878498039206, "language_loss": 0.82239622, "learning_rate": 3.825969697187575e-06, "loss": 0.84509611, "num_input_tokens_seen": 28021675, "step": 1329, "time_per_iteration": 2.676424264907837 }, { "auxiliary_loss_clip": 0.0118076, "auxiliary_loss_mlp": 0.0103474, "balance_loss_clip": 1.05774117, "balance_loss_mlp": 1.02410066, "epoch": 0.15992304454999098, "flos": 20482908122880.0, "grad_norm": 1.9812020783085575, "language_loss": 0.69682431, "learning_rate": 3.8256517433562015e-06, "loss": 0.71897936, "num_input_tokens_seen": 28039615, "step": 1330, "time_per_iteration": 2.699894666671753 }, { "auxiliary_loss_clip": 0.01230662, "auxiliary_loss_mlp": 0.01028853, "balance_loss_clip": 1.06663311, "balance_loss_mlp": 1.0188452, "epoch": 0.16004328744063007, "flos": 17676094533120.0, "grad_norm": 2.5596096766486403, "language_loss": 0.91969979, "learning_rate": 3.82533351257491e-06, "loss": 0.94229496, "num_input_tokens_seen": 28057565, "step": 1331, "time_per_iteration": 2.6839871406555176 }, { "auxiliary_loss_clip": 0.01207509, "auxiliary_loss_mlp": 0.01044141, "balance_loss_clip": 1.06224251, "balance_loss_mlp": 1.03287578, "epoch": 0.16016353033126918, "flos": 24098717779200.0, "grad_norm": 1.8189157520705472, "language_loss": 0.88406497, "learning_rate": 3.825015004891975e-06, "loss": 0.90658146, "num_input_tokens_seen": 28076305, "step": 1332, "time_per_iteration": 2.6576993465423584 }, { "auxiliary_loss_clip": 0.01204506, "auxiliary_loss_mlp": 0.01038365, "balance_loss_clip": 1.05759454, "balance_loss_mlp": 1.02812505, "epoch": 0.16028377322190826, "flos": 27634841112960.0, "grad_norm": 1.9436023243649394, "language_loss": 0.75770265, "learning_rate": 3.824696220355716e-06, "loss": 0.78013134, "num_input_tokens_seen": 28097895, "step": 1333, "time_per_iteration": 2.79388427734375 }, { "auxiliary_loss_clip": 0.01190389, "auxiliary_loss_mlp": 0.01036521, "balance_loss_clip": 1.05783224, "balance_loss_mlp": 1.0257442, "epoch": 0.16040401611254734, "flos": 20961202648320.0, "grad_norm": 31.501914570555847, "language_loss": 0.78889006, "learning_rate": 3.824377159014491e-06, "loss": 0.81115913, "num_input_tokens_seen": 28118790, "step": 1334, "time_per_iteration": 2.783365249633789 }, { "auxiliary_loss_clip": 0.01211275, "auxiliary_loss_mlp": 0.01037805, "balance_loss_clip": 1.06316888, "balance_loss_mlp": 1.0275588, "epoch": 0.16052425900318643, "flos": 21247051080960.0, "grad_norm": 2.067934597032914, "language_loss": 0.8481943, "learning_rate": 3.824057820916702e-06, "loss": 0.8706851, "num_input_tokens_seen": 28135995, "step": 1335, "time_per_iteration": 2.6405575275421143 }, { "auxiliary_loss_clip": 0.0119952, "auxiliary_loss_mlp": 0.01036175, "balance_loss_clip": 1.0619843, "balance_loss_mlp": 1.02478504, "epoch": 0.16064450189382554, "flos": 15524004096000.0, "grad_norm": 2.683351743257417, "language_loss": 0.71455777, "learning_rate": 3.8237382061107904e-06, "loss": 0.73691463, "num_input_tokens_seen": 28152715, "step": 1336, "time_per_iteration": 2.6827914714813232 }, { "auxiliary_loss_clip": 0.01125225, "auxiliary_loss_mlp": 0.01029131, "balance_loss_clip": 1.04608703, "balance_loss_mlp": 1.01877201, "epoch": 0.16076474478446462, "flos": 21178497974400.0, "grad_norm": 2.068463885737031, "language_loss": 0.78623801, "learning_rate": 3.823418314645243e-06, "loss": 0.80778158, "num_input_tokens_seen": 28171590, "step": 1337, "time_per_iteration": 2.7938859462738037 }, { "auxiliary_loss_clip": 0.0115092, "auxiliary_loss_mlp": 0.01033187, "balance_loss_clip": 1.05494809, "balance_loss_mlp": 1.0230726, "epoch": 0.1608849876751037, "flos": 18366476912640.0, "grad_norm": 2.014169017637328, "language_loss": 0.75178897, "learning_rate": 3.823098146568588e-06, "loss": 0.77363002, "num_input_tokens_seen": 28191295, "step": 1338, "time_per_iteration": 2.820744276046753 }, { "auxiliary_loss_clip": 0.01213494, "auxiliary_loss_mlp": 0.0103258, "balance_loss_clip": 1.06439304, "balance_loss_mlp": 1.0229001, "epoch": 0.1610052305657428, "flos": 29497024880640.0, "grad_norm": 1.7866710493257718, "language_loss": 0.71435857, "learning_rate": 3.822777701929394e-06, "loss": 0.73681933, "num_input_tokens_seen": 28213120, "step": 1339, "time_per_iteration": 2.7570526599884033 }, { "auxiliary_loss_clip": 0.01197902, "auxiliary_loss_mlp": 0.01039036, "balance_loss_clip": 1.05649436, "balance_loss_mlp": 1.02873075, "epoch": 0.1611254734563819, "flos": 26797871329920.0, "grad_norm": 1.981556774085316, "language_loss": 0.73508179, "learning_rate": 3.8224569807762714e-06, "loss": 0.75745118, "num_input_tokens_seen": 28232440, "step": 1340, "time_per_iteration": 2.7074010372161865 }, { "auxiliary_loss_clip": 0.01143583, "auxiliary_loss_mlp": 0.01039269, "balance_loss_clip": 1.04812956, "balance_loss_mlp": 1.02887416, "epoch": 0.16124571634702098, "flos": 22419570741120.0, "grad_norm": 1.9075958486341633, "language_loss": 0.76627779, "learning_rate": 3.822135983157873e-06, "loss": 0.78810632, "num_input_tokens_seen": 28251715, "step": 1341, "time_per_iteration": 2.7287495136260986 }, { "auxiliary_loss_clip": 0.01220929, "auxiliary_loss_mlp": 0.00764125, "balance_loss_clip": 1.06079459, "balance_loss_mlp": 1.00042868, "epoch": 0.16136595923766006, "flos": 10999116103680.0, "grad_norm": 2.154973411354358, "language_loss": 0.84887254, "learning_rate": 3.821814709122896e-06, "loss": 0.86872303, "num_input_tokens_seen": 28269765, "step": 1342, "time_per_iteration": 3.506011486053467 }, { "auxiliary_loss_clip": 0.01194504, "auxiliary_loss_mlp": 0.01033668, "balance_loss_clip": 1.0599649, "balance_loss_mlp": 1.02363086, "epoch": 0.16148620212829917, "flos": 21214983214080.0, "grad_norm": 2.349953692256193, "language_loss": 0.84976244, "learning_rate": 3.821493158720076e-06, "loss": 0.87204409, "num_input_tokens_seen": 28288870, "step": 1343, "time_per_iteration": 2.7098159790039062 }, { "auxiliary_loss_clip": 0.01179729, "auxiliary_loss_mlp": 0.01041791, "balance_loss_clip": 1.05304956, "balance_loss_mlp": 1.03038824, "epoch": 0.16160644501893826, "flos": 16758468760320.0, "grad_norm": 15.718647329379587, "language_loss": 0.73935485, "learning_rate": 3.821171331998191e-06, "loss": 0.7615701, "num_input_tokens_seen": 28305400, "step": 1344, "time_per_iteration": 2.6615591049194336 }, { "auxiliary_loss_clip": 0.01101678, "auxiliary_loss_mlp": 0.01007389, "balance_loss_clip": 1.03582716, "balance_loss_mlp": 1.00433719, "epoch": 0.16172668790957734, "flos": 64444967308800.0, "grad_norm": 0.706792882770157, "language_loss": 0.54470193, "learning_rate": 3.820849229006064e-06, "loss": 0.56579262, "num_input_tokens_seen": 28373150, "step": 1345, "time_per_iteration": 3.4247233867645264 }, { "auxiliary_loss_clip": 0.01226448, "auxiliary_loss_mlp": 0.01043442, "balance_loss_clip": 1.0619278, "balance_loss_mlp": 1.03297579, "epoch": 0.16184693080021645, "flos": 23257689759360.0, "grad_norm": 2.0318291914116298, "language_loss": 0.71435952, "learning_rate": 3.8205268497925564e-06, "loss": 0.7370584, "num_input_tokens_seen": 28393620, "step": 1346, "time_per_iteration": 3.6226305961608887 }, { "auxiliary_loss_clip": 0.01223749, "auxiliary_loss_mlp": 0.01034974, "balance_loss_clip": 1.06129885, "balance_loss_mlp": 1.0244782, "epoch": 0.16196717369085553, "flos": 17451113696640.0, "grad_norm": 2.146532717180153, "language_loss": 0.78589976, "learning_rate": 3.8202041944065725e-06, "loss": 0.80848694, "num_input_tokens_seen": 28409440, "step": 1347, "time_per_iteration": 4.44841742515564 }, { "auxiliary_loss_clip": 0.01226323, "auxiliary_loss_mlp": 0.01034404, "balance_loss_clip": 1.06486773, "balance_loss_mlp": 1.02392006, "epoch": 0.16208741658149461, "flos": 23873377806720.0, "grad_norm": 2.059042334785779, "language_loss": 0.74059761, "learning_rate": 3.819881262897061e-06, "loss": 0.76320493, "num_input_tokens_seen": 28427575, "step": 1348, "time_per_iteration": 2.6366240978240967 }, { "auxiliary_loss_clip": 0.01184674, "auxiliary_loss_mlp": 0.01036192, "balance_loss_clip": 1.06253815, "balance_loss_mlp": 1.02457488, "epoch": 0.1622076594721337, "flos": 25884806584320.0, "grad_norm": 2.082306287602304, "language_loss": 0.73841178, "learning_rate": 3.819558055313008e-06, "loss": 0.76062042, "num_input_tokens_seen": 28448260, "step": 1349, "time_per_iteration": 2.7356936931610107 }, { "auxiliary_loss_clip": 0.01217748, "auxiliary_loss_mlp": 0.01044163, "balance_loss_clip": 1.06385183, "balance_loss_mlp": 1.03375554, "epoch": 0.1623279023627728, "flos": 21539759011200.0, "grad_norm": 2.370254777971957, "language_loss": 0.77185345, "learning_rate": 3.819234571703444e-06, "loss": 0.79447252, "num_input_tokens_seen": 28467085, "step": 1350, "time_per_iteration": 2.6230547428131104 }, { "auxiliary_loss_clip": 0.01204067, "auxiliary_loss_mlp": 0.01041708, "balance_loss_clip": 1.05969977, "balance_loss_mlp": 1.03024626, "epoch": 0.1624481452534119, "flos": 22085421494400.0, "grad_norm": 1.90585672177158, "language_loss": 0.85788441, "learning_rate": 3.8189108121174435e-06, "loss": 0.88034213, "num_input_tokens_seen": 28486850, "step": 1351, "time_per_iteration": 2.773103713989258 }, { "auxiliary_loss_clip": 0.01180693, "auxiliary_loss_mlp": 0.01042905, "balance_loss_clip": 1.06347191, "balance_loss_mlp": 1.03198504, "epoch": 0.16256838814405097, "flos": 27087490690560.0, "grad_norm": 1.6630190077934404, "language_loss": 0.83297193, "learning_rate": 3.818586776604118e-06, "loss": 0.85520798, "num_input_tokens_seen": 28507490, "step": 1352, "time_per_iteration": 2.72802996635437 }, { "auxiliary_loss_clip": 0.01189281, "auxiliary_loss_mlp": 0.01034192, "balance_loss_clip": 1.05770707, "balance_loss_mlp": 1.02335596, "epoch": 0.16268863103469008, "flos": 20120354196480.0, "grad_norm": 2.3245025433671307, "language_loss": 0.61517918, "learning_rate": 3.818262465212625e-06, "loss": 0.63741386, "num_input_tokens_seen": 28527615, "step": 1353, "time_per_iteration": 2.6996500492095947 }, { "auxiliary_loss_clip": 0.01203111, "auxiliary_loss_mlp": 0.01037914, "balance_loss_clip": 1.06055665, "balance_loss_mlp": 1.02632642, "epoch": 0.16280887392532917, "flos": 18332792933760.0, "grad_norm": 2.2323350688685633, "language_loss": 0.77407175, "learning_rate": 3.817937877992161e-06, "loss": 0.79648197, "num_input_tokens_seen": 28544910, "step": 1354, "time_per_iteration": 2.609583854675293 }, { "auxiliary_loss_clip": 0.01181257, "auxiliary_loss_mlp": 0.00765538, "balance_loss_clip": 1.05624402, "balance_loss_mlp": 1.00053239, "epoch": 0.16292911681596825, "flos": 11874330892800.0, "grad_norm": 2.4836465973822905, "language_loss": 0.85998744, "learning_rate": 3.817613014991967e-06, "loss": 0.87945533, "num_input_tokens_seen": 28561050, "step": 1355, "time_per_iteration": 2.723642110824585 }, { "auxiliary_loss_clip": 0.01169495, "auxiliary_loss_mlp": 0.01038797, "balance_loss_clip": 1.05239177, "balance_loss_mlp": 1.02794862, "epoch": 0.16304935970660733, "flos": 26103466627200.0, "grad_norm": 2.0678664328136875, "language_loss": 0.76728565, "learning_rate": 3.817287876261323e-06, "loss": 0.78936857, "num_input_tokens_seen": 28581385, "step": 1356, "time_per_iteration": 2.7060506343841553 }, { "auxiliary_loss_clip": 0.01196749, "auxiliary_loss_mlp": 0.0103927, "balance_loss_clip": 1.06272793, "balance_loss_mlp": 1.02831519, "epoch": 0.16316960259724644, "flos": 29351945848320.0, "grad_norm": 1.8330933380018286, "language_loss": 0.80030483, "learning_rate": 3.816962461849553e-06, "loss": 0.82266498, "num_input_tokens_seen": 28603255, "step": 1357, "time_per_iteration": 2.7294507026672363 }, { "auxiliary_loss_clip": 0.01189794, "auxiliary_loss_mlp": 0.01040556, "balance_loss_clip": 1.06186652, "balance_loss_mlp": 1.02989244, "epoch": 0.16328984548788553, "flos": 20886759711360.0, "grad_norm": 1.914936601577789, "language_loss": 0.84594107, "learning_rate": 3.8166367718060235e-06, "loss": 0.86824459, "num_input_tokens_seen": 28623145, "step": 1358, "time_per_iteration": 2.6864376068115234 }, { "auxiliary_loss_clip": 0.01208589, "auxiliary_loss_mlp": 0.01033362, "balance_loss_clip": 1.06032491, "balance_loss_mlp": 1.02280593, "epoch": 0.1634100883785246, "flos": 18041090584320.0, "grad_norm": 3.044225928149676, "language_loss": 0.76611805, "learning_rate": 3.816310806180139e-06, "loss": 0.7885375, "num_input_tokens_seen": 28641555, "step": 1359, "time_per_iteration": 2.7074708938598633 }, { "auxiliary_loss_clip": 0.01197176, "auxiliary_loss_mlp": 0.01040138, "balance_loss_clip": 1.06260788, "balance_loss_mlp": 1.02882504, "epoch": 0.16353033126916372, "flos": 24572128055040.0, "grad_norm": 1.7475665467686825, "language_loss": 0.80799806, "learning_rate": 3.81598456502135e-06, "loss": 0.83037126, "num_input_tokens_seen": 28661575, "step": 1360, "time_per_iteration": 2.709463119506836 }, { "auxiliary_loss_clip": 0.01194337, "auxiliary_loss_mlp": 0.0103987, "balance_loss_clip": 1.06141019, "balance_loss_mlp": 1.02937388, "epoch": 0.1636505741598028, "flos": 19892895321600.0, "grad_norm": 1.990796875681064, "language_loss": 0.87152076, "learning_rate": 3.8156580483791455e-06, "loss": 0.89386284, "num_input_tokens_seen": 28676765, "step": 1361, "time_per_iteration": 2.6877455711364746 }, { "auxiliary_loss_clip": 0.01226143, "auxiliary_loss_mlp": 0.01038239, "balance_loss_clip": 1.06250465, "balance_loss_mlp": 1.02760518, "epoch": 0.16377081705044189, "flos": 28402611344640.0, "grad_norm": 2.368438712233047, "language_loss": 0.77093369, "learning_rate": 3.815331256303059e-06, "loss": 0.79357755, "num_input_tokens_seen": 28696795, "step": 1362, "time_per_iteration": 2.694855213165283 }, { "auxiliary_loss_clip": 0.01183394, "auxiliary_loss_mlp": 0.01037893, "balance_loss_clip": 1.06333888, "balance_loss_mlp": 1.0266571, "epoch": 0.163891059941081, "flos": 21908059113600.0, "grad_norm": 2.3845480664722722, "language_loss": 0.77255613, "learning_rate": 3.815004188842665e-06, "loss": 0.79476899, "num_input_tokens_seen": 28714835, "step": 1363, "time_per_iteration": 2.7188937664031982 }, { "auxiliary_loss_clip": 0.01188665, "auxiliary_loss_mlp": 0.01039981, "balance_loss_clip": 1.05492926, "balance_loss_mlp": 1.03021169, "epoch": 0.16401130283172008, "flos": 26797619934720.0, "grad_norm": 1.6379327618332375, "language_loss": 0.79727268, "learning_rate": 3.814676846047578e-06, "loss": 0.81955922, "num_input_tokens_seen": 28735710, "step": 1364, "time_per_iteration": 2.7413017749786377 }, { "auxiliary_loss_clip": 0.0120874, "auxiliary_loss_mlp": 0.01040139, "balance_loss_clip": 1.05991864, "balance_loss_mlp": 1.03023314, "epoch": 0.16413154572235916, "flos": 32997417160320.0, "grad_norm": 1.9972154235236679, "language_loss": 0.69828516, "learning_rate": 3.8143492279674565e-06, "loss": 0.72077399, "num_input_tokens_seen": 28758405, "step": 1365, "time_per_iteration": 2.6900131702423096 }, { "auxiliary_loss_clip": 0.01097543, "auxiliary_loss_mlp": 0.01009562, "balance_loss_clip": 1.03295505, "balance_loss_mlp": 1.00664163, "epoch": 0.16425178861299825, "flos": 40113622074240.0, "grad_norm": 0.8422414866576833, "language_loss": 0.58454561, "learning_rate": 3.8140213346519997e-06, "loss": 0.60561657, "num_input_tokens_seen": 28809000, "step": 1366, "time_per_iteration": 3.0356743335723877 }, { "auxiliary_loss_clip": 0.01173608, "auxiliary_loss_mlp": 0.01046082, "balance_loss_clip": 1.05760479, "balance_loss_mlp": 1.03444719, "epoch": 0.16437203150363736, "flos": 25447486498560.0, "grad_norm": 1.890849893652848, "language_loss": 0.76837242, "learning_rate": 3.813693166150948e-06, "loss": 0.79056931, "num_input_tokens_seen": 28829210, "step": 1367, "time_per_iteration": 2.765251636505127 }, { "auxiliary_loss_clip": 0.01174565, "auxiliary_loss_mlp": 0.01036497, "balance_loss_clip": 1.05951202, "balance_loss_mlp": 1.02507663, "epoch": 0.16449227439427644, "flos": 23476888506240.0, "grad_norm": 4.715004523600085, "language_loss": 0.8515414, "learning_rate": 3.813364722514086e-06, "loss": 0.87365198, "num_input_tokens_seen": 28847545, "step": 1368, "time_per_iteration": 3.6013832092285156 }, { "auxiliary_loss_clip": 0.01206977, "auxiliary_loss_mlp": 0.010446, "balance_loss_clip": 1.05973005, "balance_loss_mlp": 1.0339129, "epoch": 0.16461251728491552, "flos": 13545217802880.0, "grad_norm": 3.1992866244109663, "language_loss": 0.80753827, "learning_rate": 3.8130360037912368e-06, "loss": 0.83005404, "num_input_tokens_seen": 28863990, "step": 1369, "time_per_iteration": 2.659313678741455 }, { "auxiliary_loss_clip": 0.01210093, "auxiliary_loss_mlp": 0.01037859, "balance_loss_clip": 1.05998945, "balance_loss_mlp": 1.02702844, "epoch": 0.16473276017555463, "flos": 23003298662400.0, "grad_norm": 2.129268998164014, "language_loss": 0.81847328, "learning_rate": 3.812707010032268e-06, "loss": 0.84095275, "num_input_tokens_seen": 28883045, "step": 1370, "time_per_iteration": 2.676392078399658 }, { "auxiliary_loss_clip": 0.01218283, "auxiliary_loss_mlp": 0.0104467, "balance_loss_clip": 1.06622577, "balance_loss_mlp": 1.03403068, "epoch": 0.16485300306619372, "flos": 24790680357120.0, "grad_norm": 1.8149263814636039, "language_loss": 0.79139954, "learning_rate": 3.8123777412870863e-06, "loss": 0.8140291, "num_input_tokens_seen": 28902545, "step": 1371, "time_per_iteration": 2.6979081630706787 }, { "auxiliary_loss_clip": 0.01198602, "auxiliary_loss_mlp": 0.01034567, "balance_loss_clip": 1.0597682, "balance_loss_mlp": 1.02418923, "epoch": 0.1649732459568328, "flos": 21106497162240.0, "grad_norm": 2.141230323830424, "language_loss": 0.78432405, "learning_rate": 3.812048197605643e-06, "loss": 0.80665576, "num_input_tokens_seen": 28921440, "step": 1372, "time_per_iteration": 3.5658137798309326 }, { "auxiliary_loss_clip": 0.0121085, "auxiliary_loss_mlp": 0.01034353, "balance_loss_clip": 1.06176257, "balance_loss_mlp": 1.02276599, "epoch": 0.16509348884747188, "flos": 20266726118400.0, "grad_norm": 2.0091620776325665, "language_loss": 0.81723845, "learning_rate": 3.8117183790379277e-06, "loss": 0.83969051, "num_input_tokens_seen": 28939890, "step": 1373, "time_per_iteration": 4.375774621963501 }, { "auxiliary_loss_clip": 0.01227585, "auxiliary_loss_mlp": 0.01033683, "balance_loss_clip": 1.06346107, "balance_loss_mlp": 1.02310896, "epoch": 0.165213731738111, "flos": 11035493602560.0, "grad_norm": 3.3396092069705907, "language_loss": 0.93681508, "learning_rate": 3.811388285633976e-06, "loss": 0.95942777, "num_input_tokens_seen": 28955875, "step": 1374, "time_per_iteration": 2.5478992462158203 }, { "auxiliary_loss_clip": 0.01172749, "auxiliary_loss_mlp": 0.01035772, "balance_loss_clip": 1.05834711, "balance_loss_mlp": 1.02483439, "epoch": 0.16533397462875007, "flos": 29972051268480.0, "grad_norm": 2.28068910450057, "language_loss": 0.62023109, "learning_rate": 3.811057917443861e-06, "loss": 0.64231622, "num_input_tokens_seen": 28975140, "step": 1375, "time_per_iteration": 2.7869110107421875 }, { "auxiliary_loss_clip": 0.01109085, "auxiliary_loss_mlp": 0.01005692, "balance_loss_clip": 1.03020215, "balance_loss_mlp": 1.00261676, "epoch": 0.16545421751938916, "flos": 65556763027200.0, "grad_norm": 0.8569516029847498, "language_loss": 0.6825732, "learning_rate": 3.8107272745177e-06, "loss": 0.70372093, "num_input_tokens_seen": 29047470, "step": 1376, "time_per_iteration": 3.3500595092773438 }, { "auxiliary_loss_clip": 0.01185596, "auxiliary_loss_mlp": 0.010356, "balance_loss_clip": 1.06089938, "balance_loss_mlp": 1.02520537, "epoch": 0.16557446041002827, "flos": 22492361652480.0, "grad_norm": 1.8329689627213155, "language_loss": 0.78643268, "learning_rate": 3.8103963569056513e-06, "loss": 0.80864465, "num_input_tokens_seen": 29066605, "step": 1377, "time_per_iteration": 2.7369496822357178 }, { "auxiliary_loss_clip": 0.01187069, "auxiliary_loss_mlp": 0.01035902, "balance_loss_clip": 1.05401182, "balance_loss_mlp": 1.02522111, "epoch": 0.16569470330066735, "flos": 24602723464320.0, "grad_norm": 1.644939081810944, "language_loss": 0.88104236, "learning_rate": 3.8100651646579146e-06, "loss": 0.90327215, "num_input_tokens_seen": 29085815, "step": 1378, "time_per_iteration": 2.6735615730285645 }, { "auxiliary_loss_clip": 0.01187949, "auxiliary_loss_mlp": 0.01039061, "balance_loss_clip": 1.05586076, "balance_loss_mlp": 1.02861261, "epoch": 0.16581494619130643, "flos": 15006207588480.0, "grad_norm": 2.1568701111289914, "language_loss": 0.92811346, "learning_rate": 3.8097336978247317e-06, "loss": 0.95038354, "num_input_tokens_seen": 29102520, "step": 1379, "time_per_iteration": 2.6430978775024414 }, { "auxiliary_loss_clip": 0.01182745, "auxiliary_loss_mlp": 0.01041074, "balance_loss_clip": 1.05559313, "balance_loss_mlp": 1.0301249, "epoch": 0.16593518908194552, "flos": 17420338719360.0, "grad_norm": 2.2985836695641986, "language_loss": 0.88887179, "learning_rate": 3.8094019564563854e-06, "loss": 0.91110992, "num_input_tokens_seen": 29119450, "step": 1380, "time_per_iteration": 2.612462282180786 }, { "auxiliary_loss_clip": 0.01225119, "auxiliary_loss_mlp": 0.00764697, "balance_loss_clip": 1.06286478, "balance_loss_mlp": 1.00052512, "epoch": 0.16605543197258463, "flos": 20412631163520.0, "grad_norm": 2.99107971737057, "language_loss": 0.75116527, "learning_rate": 3.809069940603201e-06, "loss": 0.77106339, "num_input_tokens_seen": 29137405, "step": 1381, "time_per_iteration": 2.7121667861938477 }, { "auxiliary_loss_clip": 0.01184576, "auxiliary_loss_mlp": 0.0103806, "balance_loss_clip": 1.05897403, "balance_loss_mlp": 1.02675259, "epoch": 0.1661756748632237, "flos": 14209745368320.0, "grad_norm": 2.0515317504695503, "language_loss": 0.77973926, "learning_rate": 3.8087376503155452e-06, "loss": 0.80196559, "num_input_tokens_seen": 29154890, "step": 1382, "time_per_iteration": 2.606696128845215 }, { "auxiliary_loss_clip": 0.01101277, "auxiliary_loss_mlp": 0.01003077, "balance_loss_clip": 1.02749765, "balance_loss_mlp": 1.00032294, "epoch": 0.1662959177538628, "flos": 66080877350400.0, "grad_norm": 0.8929302535826479, "language_loss": 0.56240308, "learning_rate": 3.808405085643826e-06, "loss": 0.58344662, "num_input_tokens_seen": 29219770, "step": 1383, "time_per_iteration": 3.281708002090454 }, { "auxiliary_loss_clip": 0.01227783, "auxiliary_loss_mlp": 0.00765341, "balance_loss_clip": 1.06477785, "balance_loss_mlp": 1.00056577, "epoch": 0.1664161606445019, "flos": 20740567357440.0, "grad_norm": 2.1881124610304603, "language_loss": 0.88889068, "learning_rate": 3.8080722466384925e-06, "loss": 0.90882194, "num_input_tokens_seen": 29237620, "step": 1384, "time_per_iteration": 2.6021363735198975 }, { "auxiliary_loss_clip": 0.01229455, "auxiliary_loss_mlp": 0.01035733, "balance_loss_clip": 1.06344676, "balance_loss_mlp": 1.02518904, "epoch": 0.166536403535141, "flos": 25260930236160.0, "grad_norm": 2.4441607083293886, "language_loss": 0.71020973, "learning_rate": 3.8077391333500376e-06, "loss": 0.73286164, "num_input_tokens_seen": 29256760, "step": 1385, "time_per_iteration": 2.7025511264801025 }, { "auxiliary_loss_clip": 0.01195492, "auxiliary_loss_mlp": 0.01037769, "balance_loss_clip": 1.0614599, "balance_loss_mlp": 1.02754676, "epoch": 0.16665664642578007, "flos": 25447450584960.0, "grad_norm": 1.765808766379165, "language_loss": 0.76668763, "learning_rate": 3.8074057458289934e-06, "loss": 0.78902024, "num_input_tokens_seen": 29277450, "step": 1386, "time_per_iteration": 2.6995954513549805 }, { "auxiliary_loss_clip": 0.01193856, "auxiliary_loss_mlp": 0.01037292, "balance_loss_clip": 1.05693769, "balance_loss_mlp": 1.02701044, "epoch": 0.16677688931641918, "flos": 22200767043840.0, "grad_norm": 1.9862111102295252, "language_loss": 0.82654405, "learning_rate": 3.807072084125934e-06, "loss": 0.8488555, "num_input_tokens_seen": 29299300, "step": 1387, "time_per_iteration": 2.691457748413086 }, { "auxiliary_loss_clip": 0.0119372, "auxiliary_loss_mlp": 0.01033055, "balance_loss_clip": 1.06194925, "balance_loss_mlp": 1.02223063, "epoch": 0.16689713220705826, "flos": 16945958776320.0, "grad_norm": 2.135181117493788, "language_loss": 0.80272245, "learning_rate": 3.806738148291477e-06, "loss": 0.82499015, "num_input_tokens_seen": 29316125, "step": 1388, "time_per_iteration": 2.6920456886291504 }, { "auxiliary_loss_clip": 0.01151089, "auxiliary_loss_mlp": 0.01040489, "balance_loss_clip": 1.05243015, "balance_loss_mlp": 1.02812672, "epoch": 0.16701737509769735, "flos": 36244423923840.0, "grad_norm": 2.0353639825343373, "language_loss": 0.71211946, "learning_rate": 3.8064039383762793e-06, "loss": 0.73403525, "num_input_tokens_seen": 29338490, "step": 1389, "time_per_iteration": 2.875511407852173 }, { "auxiliary_loss_clip": 0.01213918, "auxiliary_loss_mlp": 0.01038134, "balance_loss_clip": 1.06601572, "balance_loss_mlp": 1.02751803, "epoch": 0.16713761798833643, "flos": 23258659426560.0, "grad_norm": 3.6764840929757336, "language_loss": 0.76328599, "learning_rate": 3.8060694544310396e-06, "loss": 0.78580654, "num_input_tokens_seen": 29357000, "step": 1390, "time_per_iteration": 2.682415008544922 }, { "auxiliary_loss_clip": 0.01227578, "auxiliary_loss_mlp": 0.01041056, "balance_loss_clip": 1.06478631, "balance_loss_mlp": 1.0302732, "epoch": 0.16725786087897554, "flos": 25302515207040.0, "grad_norm": 1.8325435688495901, "language_loss": 0.7877388, "learning_rate": 3.8057346965065006e-06, "loss": 0.81042516, "num_input_tokens_seen": 29378230, "step": 1391, "time_per_iteration": 2.6407477855682373 }, { "auxiliary_loss_clip": 0.01197433, "auxiliary_loss_mlp": 0.01040505, "balance_loss_clip": 1.06410694, "balance_loss_mlp": 1.02963269, "epoch": 0.16737810376961462, "flos": 31831541516160.0, "grad_norm": 1.635873384155522, "language_loss": 0.84891963, "learning_rate": 3.805399664653443e-06, "loss": 0.87129903, "num_input_tokens_seen": 29400370, "step": 1392, "time_per_iteration": 2.836308717727661 }, { "auxiliary_loss_clip": 0.01227529, "auxiliary_loss_mlp": 0.01039183, "balance_loss_clip": 1.06360567, "balance_loss_mlp": 1.02760196, "epoch": 0.1674983466602537, "flos": 27961843553280.0, "grad_norm": 2.3096463813857113, "language_loss": 0.74055016, "learning_rate": 3.805064358922692e-06, "loss": 0.76321733, "num_input_tokens_seen": 29418660, "step": 1393, "time_per_iteration": 2.6994788646698 }, { "auxiliary_loss_clip": 0.01212539, "auxiliary_loss_mlp": 0.01034818, "balance_loss_clip": 1.06056798, "balance_loss_mlp": 1.02351737, "epoch": 0.16761858955089282, "flos": 21762656858880.0, "grad_norm": 3.2287345784707058, "language_loss": 0.81118667, "learning_rate": 3.8047287793651136e-06, "loss": 0.83366024, "num_input_tokens_seen": 29440105, "step": 1394, "time_per_iteration": 3.605530023574829 }, { "auxiliary_loss_clip": 0.01183658, "auxiliary_loss_mlp": 0.01036202, "balance_loss_clip": 1.06014323, "balance_loss_mlp": 1.02571142, "epoch": 0.1677388324415319, "flos": 23805507058560.0, "grad_norm": 1.7407172212455306, "language_loss": 0.88580894, "learning_rate": 3.8043929260316137e-06, "loss": 0.9080075, "num_input_tokens_seen": 29458260, "step": 1395, "time_per_iteration": 2.751116991043091 }, { "auxiliary_loss_clip": 0.01199278, "auxiliary_loss_mlp": 0.01032647, "balance_loss_clip": 1.06507802, "balance_loss_mlp": 1.02223444, "epoch": 0.16785907533217098, "flos": 20558859431040.0, "grad_norm": 1.9246889218890606, "language_loss": 0.83547258, "learning_rate": 3.8040567989731417e-06, "loss": 0.85779184, "num_input_tokens_seen": 29476205, "step": 1396, "time_per_iteration": 2.747361660003662 }, { "auxiliary_loss_clip": 0.01202938, "auxiliary_loss_mlp": 0.0103582, "balance_loss_clip": 1.06064296, "balance_loss_mlp": 1.02606869, "epoch": 0.16797931822281006, "flos": 15669657745920.0, "grad_norm": 3.2985040043908715, "language_loss": 0.79449439, "learning_rate": 3.8037203982406876e-06, "loss": 0.81688195, "num_input_tokens_seen": 29494370, "step": 1397, "time_per_iteration": 2.649172067642212 }, { "auxiliary_loss_clip": 0.01225304, "auxiliary_loss_mlp": 0.01038749, "balance_loss_clip": 1.06486607, "balance_loss_mlp": 1.02807415, "epoch": 0.16809956111344918, "flos": 16541101607040.0, "grad_norm": 1.7310485300913174, "language_loss": 0.73128581, "learning_rate": 3.8033837238852835e-06, "loss": 0.7539264, "num_input_tokens_seen": 29511070, "step": 1398, "time_per_iteration": 3.508976936340332 }, { "auxiliary_loss_clip": 0.01181665, "auxiliary_loss_mlp": 0.01039665, "balance_loss_clip": 1.05503464, "balance_loss_mlp": 1.02956247, "epoch": 0.16821980400408826, "flos": 23258084808960.0, "grad_norm": 1.9845298170789722, "language_loss": 0.69455779, "learning_rate": 3.8030467759580017e-06, "loss": 0.71677113, "num_input_tokens_seen": 29531990, "step": 1399, "time_per_iteration": 4.420074939727783 }, { "auxiliary_loss_clip": 0.01210636, "auxiliary_loss_mlp": 0.01043083, "balance_loss_clip": 1.05882168, "balance_loss_mlp": 1.03283656, "epoch": 0.16834004689472734, "flos": 20774754126720.0, "grad_norm": 2.074630287643699, "language_loss": 0.87269884, "learning_rate": 3.802709554509958e-06, "loss": 0.89523602, "num_input_tokens_seen": 29549790, "step": 1400, "time_per_iteration": 2.6797807216644287 }, { "auxiliary_loss_clip": 0.01190603, "auxiliary_loss_mlp": 0.01036893, "balance_loss_clip": 1.05478728, "balance_loss_mlp": 1.02605104, "epoch": 0.16846028978536645, "flos": 26687302289280.0, "grad_norm": 2.160290164775485, "language_loss": 0.79165792, "learning_rate": 3.8023720595923083e-06, "loss": 0.8139329, "num_input_tokens_seen": 29569045, "step": 1401, "time_per_iteration": 2.6895766258239746 }, { "auxiliary_loss_clip": 0.011608, "auxiliary_loss_mlp": 0.01038184, "balance_loss_clip": 1.05231881, "balance_loss_mlp": 1.02722836, "epoch": 0.16858053267600553, "flos": 18843298980480.0, "grad_norm": 2.6766640808265048, "language_loss": 0.87636095, "learning_rate": 3.80203429125625e-06, "loss": 0.89835083, "num_input_tokens_seen": 29587220, "step": 1402, "time_per_iteration": 2.731353759765625 }, { "auxiliary_loss_clip": 0.01142428, "auxiliary_loss_mlp": 0.0103628, "balance_loss_clip": 1.05385792, "balance_loss_mlp": 1.0267669, "epoch": 0.16870077556664462, "flos": 27744548227200.0, "grad_norm": 2.3306238265850325, "language_loss": 0.70078373, "learning_rate": 3.8016962495530225e-06, "loss": 0.72257084, "num_input_tokens_seen": 29606410, "step": 1403, "time_per_iteration": 2.8432488441467285 }, { "auxiliary_loss_clip": 0.0122589, "auxiliary_loss_mlp": 0.01042637, "balance_loss_clip": 1.06340051, "balance_loss_mlp": 1.03131199, "epoch": 0.1688210184572837, "flos": 13730768484480.0, "grad_norm": 2.9182645956867934, "language_loss": 0.76328236, "learning_rate": 3.8013579345339063e-06, "loss": 0.78596759, "num_input_tokens_seen": 29621275, "step": 1404, "time_per_iteration": 2.558044910430908 }, { "auxiliary_loss_clip": 0.0117894, "auxiliary_loss_mlp": 0.0103307, "balance_loss_clip": 1.05557072, "balance_loss_mlp": 1.02211475, "epoch": 0.1689412613479228, "flos": 26468785900800.0, "grad_norm": 2.401140848384728, "language_loss": 0.69371951, "learning_rate": 3.801019346250224e-06, "loss": 0.71583956, "num_input_tokens_seen": 29641420, "step": 1405, "time_per_iteration": 2.7142367362976074 }, { "auxiliary_loss_clip": 0.01208297, "auxiliary_loss_mlp": 0.01036994, "balance_loss_clip": 1.06167889, "balance_loss_mlp": 1.0270822, "epoch": 0.1690615042385619, "flos": 21138852337920.0, "grad_norm": 2.400461623334689, "language_loss": 0.84560919, "learning_rate": 3.8006804847533395e-06, "loss": 0.86806208, "num_input_tokens_seen": 29660935, "step": 1406, "time_per_iteration": 2.641300916671753 }, { "auxiliary_loss_clip": 0.01221457, "auxiliary_loss_mlp": 0.01034935, "balance_loss_clip": 1.06121802, "balance_loss_mlp": 1.02442074, "epoch": 0.16918174712920098, "flos": 20849340718080.0, "grad_norm": 1.8681371461371108, "language_loss": 0.85504961, "learning_rate": 3.8003413500946556e-06, "loss": 0.87761354, "num_input_tokens_seen": 29681045, "step": 1407, "time_per_iteration": 2.667349338531494 }, { "auxiliary_loss_clip": 0.01193893, "auxiliary_loss_mlp": 0.01033331, "balance_loss_clip": 1.05993736, "balance_loss_mlp": 1.02290034, "epoch": 0.1693019900198401, "flos": 16983270028800.0, "grad_norm": 2.6274459154397167, "language_loss": 0.8316946, "learning_rate": 3.8000019423256216e-06, "loss": 0.85396683, "num_input_tokens_seen": 29698810, "step": 1408, "time_per_iteration": 2.668830156326294 }, { "auxiliary_loss_clip": 0.01180333, "auxiliary_loss_mlp": 0.01036578, "balance_loss_clip": 1.05659294, "balance_loss_mlp": 1.02693415, "epoch": 0.16942223291047917, "flos": 26796901662720.0, "grad_norm": 1.843323211139918, "language_loss": 0.88209116, "learning_rate": 3.7996622614977234e-06, "loss": 0.90426028, "num_input_tokens_seen": 29720000, "step": 1409, "time_per_iteration": 2.7244889736175537 }, { "auxiliary_loss_clip": 0.01194381, "auxiliary_loss_mlp": 0.0104115, "balance_loss_clip": 1.06346202, "balance_loss_mlp": 1.03089762, "epoch": 0.16954247580111825, "flos": 18583700411520.0, "grad_norm": 2.2052345873315344, "language_loss": 0.79239404, "learning_rate": 3.799322307662492e-06, "loss": 0.81474942, "num_input_tokens_seen": 29737820, "step": 1410, "time_per_iteration": 2.6510775089263916 }, { "auxiliary_loss_clip": 0.01168568, "auxiliary_loss_mlp": 0.01038519, "balance_loss_clip": 1.05618262, "balance_loss_mlp": 1.02807581, "epoch": 0.16966271869175734, "flos": 13983651210240.0, "grad_norm": 2.7323244795110364, "language_loss": 0.83720154, "learning_rate": 3.798982080871496e-06, "loss": 0.85927236, "num_input_tokens_seen": 29752960, "step": 1411, "time_per_iteration": 2.728882074356079 }, { "auxiliary_loss_clip": 0.01224397, "auxiliary_loss_mlp": 0.01038605, "balance_loss_clip": 1.06291604, "balance_loss_mlp": 1.02749431, "epoch": 0.16978296158239645, "flos": 37487328284160.0, "grad_norm": 2.162222253477683, "language_loss": 0.67522776, "learning_rate": 3.798641581176349e-06, "loss": 0.69785774, "num_input_tokens_seen": 29775240, "step": 1412, "time_per_iteration": 2.7433390617370605 }, { "auxiliary_loss_clip": 0.01190962, "auxiliary_loss_mlp": 0.01043901, "balance_loss_clip": 1.05623341, "balance_loss_mlp": 1.03322601, "epoch": 0.16990320447303553, "flos": 28328958506880.0, "grad_norm": 1.813039688217963, "language_loss": 0.74242407, "learning_rate": 3.7983008086287044e-06, "loss": 0.76477265, "num_input_tokens_seen": 29796560, "step": 1413, "time_per_iteration": 2.7895894050598145 }, { "auxiliary_loss_clip": 0.01191416, "auxiliary_loss_mlp": 0.01034845, "balance_loss_clip": 1.05716228, "balance_loss_mlp": 1.02407432, "epoch": 0.1700234473636746, "flos": 20188189031040.0, "grad_norm": 2.4660180421609534, "language_loss": 0.79757679, "learning_rate": 3.797959763280257e-06, "loss": 0.81983948, "num_input_tokens_seen": 29815245, "step": 1414, "time_per_iteration": 2.7089343070983887 }, { "auxiliary_loss_clip": 0.01212121, "auxiliary_loss_mlp": 0.01035697, "balance_loss_clip": 1.06163597, "balance_loss_mlp": 1.02522457, "epoch": 0.17014369025431372, "flos": 24858658846080.0, "grad_norm": 2.567424336397936, "language_loss": 0.79042554, "learning_rate": 3.797618445182743e-06, "loss": 0.8129037, "num_input_tokens_seen": 29836640, "step": 1415, "time_per_iteration": 2.7094528675079346 }, { "auxiliary_loss_clip": 0.01156095, "auxiliary_loss_mlp": 0.01040327, "balance_loss_clip": 1.05098486, "balance_loss_mlp": 1.02933562, "epoch": 0.1702639331449528, "flos": 16467233287680.0, "grad_norm": 2.087124864082911, "language_loss": 0.84926105, "learning_rate": 3.79727685438794e-06, "loss": 0.8712253, "num_input_tokens_seen": 29850830, "step": 1416, "time_per_iteration": 2.6894283294677734 }, { "auxiliary_loss_clip": 0.01118992, "auxiliary_loss_mlp": 0.01003965, "balance_loss_clip": 1.03284335, "balance_loss_mlp": 1.00122356, "epoch": 0.1703841760355919, "flos": 52508870979840.0, "grad_norm": 0.982372001611575, "language_loss": 0.61682397, "learning_rate": 3.796934990947667e-06, "loss": 0.6380536, "num_input_tokens_seen": 29912515, "step": 1417, "time_per_iteration": 3.237032413482666 }, { "auxiliary_loss_clip": 0.01117238, "auxiliary_loss_mlp": 0.01004314, "balance_loss_clip": 1.0318315, "balance_loss_mlp": 1.00147653, "epoch": 0.170504418926231, "flos": 49370637576960.0, "grad_norm": 0.8700664927492163, "language_loss": 0.6251235, "learning_rate": 3.7965928549137854e-06, "loss": 0.64633894, "num_input_tokens_seen": 29969330, "step": 1418, "time_per_iteration": 3.188380002975464 }, { "auxiliary_loss_clip": 0.01179961, "auxiliary_loss_mlp": 0.01043999, "balance_loss_clip": 1.05102825, "balance_loss_mlp": 1.03204262, "epoch": 0.17062466181687008, "flos": 25849219184640.0, "grad_norm": 3.2081277692011425, "language_loss": 0.77560937, "learning_rate": 3.7962504463381953e-06, "loss": 0.797849, "num_input_tokens_seen": 29990820, "step": 1419, "time_per_iteration": 3.668259620666504 }, { "auxiliary_loss_clip": 0.01186827, "auxiliary_loss_mlp": 0.00765818, "balance_loss_clip": 1.0609566, "balance_loss_mlp": 1.00066209, "epoch": 0.17074490470750917, "flos": 20960412549120.0, "grad_norm": 3.878299203601888, "language_loss": 0.78973007, "learning_rate": 3.7959077652728412e-06, "loss": 0.80925655, "num_input_tokens_seen": 30009275, "step": 1420, "time_per_iteration": 2.7301025390625 }, { "auxiliary_loss_clip": 0.01194278, "auxiliary_loss_mlp": 0.01033218, "balance_loss_clip": 1.05802667, "balance_loss_mlp": 1.02266157, "epoch": 0.17086514759814825, "flos": 20959766104320.0, "grad_norm": 2.2269097204284054, "language_loss": 0.77199525, "learning_rate": 3.795564811769707e-06, "loss": 0.79427022, "num_input_tokens_seen": 30027630, "step": 1421, "time_per_iteration": 2.7083685398101807 }, { "auxiliary_loss_clip": 0.01194398, "auxiliary_loss_mlp": 0.01035742, "balance_loss_clip": 1.06180477, "balance_loss_mlp": 1.02484608, "epoch": 0.17098539048878736, "flos": 28474073452800.0, "grad_norm": 2.5341188234589964, "language_loss": 0.78203297, "learning_rate": 3.795221585880818e-06, "loss": 0.8043344, "num_input_tokens_seen": 30048310, "step": 1422, "time_per_iteration": 2.7298927307128906 }, { "auxiliary_loss_clip": 0.01178549, "auxiliary_loss_mlp": 0.01037526, "balance_loss_clip": 1.06067371, "balance_loss_mlp": 1.02769709, "epoch": 0.17110563337942644, "flos": 16290014561280.0, "grad_norm": 2.072994167385411, "language_loss": 0.91689312, "learning_rate": 3.794878087658242e-06, "loss": 0.93905389, "num_input_tokens_seen": 30066080, "step": 1423, "time_per_iteration": 2.6999073028564453 }, { "auxiliary_loss_clip": 0.01211418, "auxiliary_loss_mlp": 0.01037824, "balance_loss_clip": 1.06128097, "balance_loss_mlp": 1.02709484, "epoch": 0.17122587627006552, "flos": 29674207693440.0, "grad_norm": 1.9607214443753314, "language_loss": 0.78631586, "learning_rate": 3.7945343171540873e-06, "loss": 0.80880833, "num_input_tokens_seen": 30086955, "step": 1424, "time_per_iteration": 4.568922281265259 }, { "auxiliary_loss_clip": 0.01222907, "auxiliary_loss_mlp": 0.01038125, "balance_loss_clip": 1.06172287, "balance_loss_mlp": 1.02758694, "epoch": 0.17134611916070464, "flos": 25338389915520.0, "grad_norm": 2.3551049204746812, "language_loss": 0.79356331, "learning_rate": 3.7941902744205033e-06, "loss": 0.81617361, "num_input_tokens_seen": 30107990, "step": 1425, "time_per_iteration": 3.605212926864624 }, { "auxiliary_loss_clip": 0.01197834, "auxiliary_loss_mlp": 0.01039326, "balance_loss_clip": 1.05930197, "balance_loss_mlp": 1.02856147, "epoch": 0.17146636205134372, "flos": 13953845900160.0, "grad_norm": 2.101180685369853, "language_loss": 0.83530927, "learning_rate": 3.7938459595096817e-06, "loss": 0.85768092, "num_input_tokens_seen": 30126535, "step": 1426, "time_per_iteration": 2.6255178451538086 }, { "auxiliary_loss_clip": 0.01212844, "auxiliary_loss_mlp": 0.01041164, "balance_loss_clip": 1.05943108, "balance_loss_mlp": 1.03039932, "epoch": 0.1715866049419828, "flos": 23915214172800.0, "grad_norm": 1.8441442114866078, "language_loss": 0.86305904, "learning_rate": 3.7935013724738545e-06, "loss": 0.88559908, "num_input_tokens_seen": 30147035, "step": 1427, "time_per_iteration": 2.6195647716522217 }, { "auxiliary_loss_clip": 0.01204013, "auxiliary_loss_mlp": 0.01039149, "balance_loss_clip": 1.0600667, "balance_loss_mlp": 1.0288676, "epoch": 0.17170684783262188, "flos": 22709369669760.0, "grad_norm": 1.8435586132463884, "language_loss": 0.78312886, "learning_rate": 3.7931565133652945e-06, "loss": 0.80556041, "num_input_tokens_seen": 30167110, "step": 1428, "time_per_iteration": 2.708631992340088 }, { "auxiliary_loss_clip": 0.01223486, "auxiliary_loss_mlp": 0.01042278, "balance_loss_clip": 1.06147885, "balance_loss_mlp": 1.03186464, "epoch": 0.171827090723261, "flos": 26613290315520.0, "grad_norm": 2.8771705462660306, "language_loss": 0.67884421, "learning_rate": 3.792811382236317e-06, "loss": 0.70150185, "num_input_tokens_seen": 30185620, "step": 1429, "time_per_iteration": 2.633237838745117 }, { "auxiliary_loss_clip": 0.01213982, "auxiliary_loss_mlp": 0.01041157, "balance_loss_clip": 1.06077766, "balance_loss_mlp": 1.02987969, "epoch": 0.17194733361390008, "flos": 28148507556480.0, "grad_norm": 2.079458280323583, "language_loss": 0.78143734, "learning_rate": 3.792465979139279e-06, "loss": 0.80398875, "num_input_tokens_seen": 30208225, "step": 1430, "time_per_iteration": 2.7056972980499268 }, { "auxiliary_loss_clip": 0.01088954, "auxiliary_loss_mlp": 0.01005981, "balance_loss_clip": 1.02841711, "balance_loss_mlp": 1.00326335, "epoch": 0.17206757650453916, "flos": 65530689753600.0, "grad_norm": 0.9525094892736684, "language_loss": 0.65678722, "learning_rate": 3.792120304126576e-06, "loss": 0.67773658, "num_input_tokens_seen": 30271600, "step": 1431, "time_per_iteration": 3.2865302562713623 }, { "auxiliary_loss_clip": 0.0113345, "auxiliary_loss_mlp": 0.01033877, "balance_loss_clip": 1.0499686, "balance_loss_mlp": 1.02333331, "epoch": 0.17218781939517827, "flos": 22273486128000.0, "grad_norm": 1.9197448995605524, "language_loss": 0.83603144, "learning_rate": 3.791774357250649e-06, "loss": 0.85770464, "num_input_tokens_seen": 30290430, "step": 1432, "time_per_iteration": 2.7867259979248047 }, { "auxiliary_loss_clip": 0.01187536, "auxiliary_loss_mlp": 0.01038714, "balance_loss_clip": 1.05668092, "balance_loss_mlp": 1.02852201, "epoch": 0.17230806228581735, "flos": 14137313592960.0, "grad_norm": 2.130523236315261, "language_loss": 0.78688818, "learning_rate": 3.7914281385639757e-06, "loss": 0.8091507, "num_input_tokens_seen": 30308305, "step": 1433, "time_per_iteration": 2.6444058418273926 }, { "auxiliary_loss_clip": 0.0120583, "auxiliary_loss_mlp": 0.01033073, "balance_loss_clip": 1.05666518, "balance_loss_mlp": 1.02267206, "epoch": 0.17242830517645644, "flos": 20704836303360.0, "grad_norm": 2.3925042493742197, "language_loss": 0.79963535, "learning_rate": 3.7910816481190784e-06, "loss": 0.82202435, "num_input_tokens_seen": 30328120, "step": 1434, "time_per_iteration": 2.6217660903930664 }, { "auxiliary_loss_clip": 0.01181486, "auxiliary_loss_mlp": 0.01031999, "balance_loss_clip": 1.05512881, "balance_loss_mlp": 1.02124023, "epoch": 0.17254854806709552, "flos": 30774582887040.0, "grad_norm": 2.3126469802090344, "language_loss": 0.75050175, "learning_rate": 3.7907348859685193e-06, "loss": 0.77263653, "num_input_tokens_seen": 30349825, "step": 1435, "time_per_iteration": 2.75966215133667 }, { "auxiliary_loss_clip": 0.01206303, "auxiliary_loss_mlp": 0.01032868, "balance_loss_clip": 1.06273782, "balance_loss_mlp": 1.02244961, "epoch": 0.17266879095773463, "flos": 26614726859520.0, "grad_norm": 2.043725839902142, "language_loss": 0.80907905, "learning_rate": 3.790387852164902e-06, "loss": 0.83147079, "num_input_tokens_seen": 30370555, "step": 1436, "time_per_iteration": 2.661001682281494 }, { "auxiliary_loss_clip": 0.01207801, "auxiliary_loss_mlp": 0.01036431, "balance_loss_clip": 1.06041312, "balance_loss_mlp": 1.02564812, "epoch": 0.1727890338483737, "flos": 20266295155200.0, "grad_norm": 2.014579748757546, "language_loss": 0.76593566, "learning_rate": 3.7900405467608707e-06, "loss": 0.788378, "num_input_tokens_seen": 30390100, "step": 1437, "time_per_iteration": 2.675593137741089 }, { "auxiliary_loss_clip": 0.01148599, "auxiliary_loss_mlp": 0.010357, "balance_loss_clip": 1.04925644, "balance_loss_mlp": 1.02411282, "epoch": 0.1729092767390128, "flos": 18179812909440.0, "grad_norm": 3.590798098147219, "language_loss": 0.79428625, "learning_rate": 3.7896929698091114e-06, "loss": 0.81612927, "num_input_tokens_seen": 30402915, "step": 1438, "time_per_iteration": 2.682413339614868 }, { "auxiliary_loss_clip": 0.0122842, "auxiliary_loss_mlp": 0.01051644, "balance_loss_clip": 1.06611288, "balance_loss_mlp": 1.03979445, "epoch": 0.1730295196296519, "flos": 26759518583040.0, "grad_norm": 2.768274637306085, "language_loss": 0.68118602, "learning_rate": 3.7893451213623518e-06, "loss": 0.70398664, "num_input_tokens_seen": 30420145, "step": 1439, "time_per_iteration": 2.6361167430877686 }, { "auxiliary_loss_clip": 0.01208547, "auxiliary_loss_mlp": 0.00765922, "balance_loss_clip": 1.06347561, "balance_loss_mlp": 1.00058746, "epoch": 0.173149762520291, "flos": 23842531002240.0, "grad_norm": 2.3066697891852668, "language_loss": 0.82222337, "learning_rate": 3.7889970014733606e-06, "loss": 0.84196806, "num_input_tokens_seen": 30439250, "step": 1440, "time_per_iteration": 2.7188098430633545 }, { "auxiliary_loss_clip": 0.0114482, "auxiliary_loss_mlp": 0.0103851, "balance_loss_clip": 1.04918766, "balance_loss_mlp": 1.02732229, "epoch": 0.17327000541093007, "flos": 23368186972800.0, "grad_norm": 1.694784205306711, "language_loss": 0.78321624, "learning_rate": 3.7886486101949463e-06, "loss": 0.80504954, "num_input_tokens_seen": 30460430, "step": 1441, "time_per_iteration": 2.748162269592285 }, { "auxiliary_loss_clip": 0.01149221, "auxiliary_loss_mlp": 0.01038761, "balance_loss_clip": 1.04993248, "balance_loss_mlp": 1.02721012, "epoch": 0.17339024830156918, "flos": 18221290139520.0, "grad_norm": 2.0628141629239454, "language_loss": 0.8801896, "learning_rate": 3.7882999475799594e-06, "loss": 0.90206939, "num_input_tokens_seen": 30478465, "step": 1442, "time_per_iteration": 2.7650859355926514 }, { "auxiliary_loss_clip": 0.01142461, "auxiliary_loss_mlp": 0.01041925, "balance_loss_clip": 1.05152965, "balance_loss_mlp": 1.03154206, "epoch": 0.17351049119220827, "flos": 23332024955520.0, "grad_norm": 1.8386888472608776, "language_loss": 0.81268907, "learning_rate": 3.787951013681293e-06, "loss": 0.83453298, "num_input_tokens_seen": 30496510, "step": 1443, "time_per_iteration": 2.8288912773132324 }, { "auxiliary_loss_clip": 0.01208011, "auxiliary_loss_mlp": 0.01037803, "balance_loss_clip": 1.0587374, "balance_loss_mlp": 1.02719367, "epoch": 0.17363073408284735, "flos": 23803495896960.0, "grad_norm": 2.047541340593442, "language_loss": 0.7761966, "learning_rate": 3.787601808551879e-06, "loss": 0.79865479, "num_input_tokens_seen": 30516325, "step": 1444, "time_per_iteration": 2.7429821491241455 }, { "auxiliary_loss_clip": 0.01181325, "auxiliary_loss_mlp": 0.01043104, "balance_loss_clip": 1.0566895, "balance_loss_mlp": 1.03154659, "epoch": 0.17375097697348643, "flos": 18515290959360.0, "grad_norm": 3.2698178571771495, "language_loss": 0.84157097, "learning_rate": 3.7872523322446926e-06, "loss": 0.86381531, "num_input_tokens_seen": 30535210, "step": 1445, "time_per_iteration": 2.6699202060699463 }, { "auxiliary_loss_clip": 0.01166652, "auxiliary_loss_mlp": 0.01033224, "balance_loss_clip": 1.05013978, "balance_loss_mlp": 1.02255452, "epoch": 0.17387121986412554, "flos": 38877897456000.0, "grad_norm": 1.6526688703505965, "language_loss": 0.60292137, "learning_rate": 3.7869025848127478e-06, "loss": 0.62492019, "num_input_tokens_seen": 30559405, "step": 1446, "time_per_iteration": 3.788430690765381 }, { "auxiliary_loss_clip": 0.01209474, "auxiliary_loss_mlp": 0.01036914, "balance_loss_clip": 1.05833697, "balance_loss_mlp": 1.02670348, "epoch": 0.17399146275476463, "flos": 20375714960640.0, "grad_norm": 2.6239609962461423, "language_loss": 0.80768383, "learning_rate": 3.786552566309102e-06, "loss": 0.83014774, "num_input_tokens_seen": 30577615, "step": 1447, "time_per_iteration": 2.601665735244751 }, { "auxiliary_loss_clip": 0.0119255, "auxiliary_loss_mlp": 0.0076562, "balance_loss_clip": 1.06085896, "balance_loss_mlp": 1.00049806, "epoch": 0.1741117056454037, "flos": 19164339763200.0, "grad_norm": 2.8120628978777464, "language_loss": 0.86157525, "learning_rate": 3.7862022767868517e-06, "loss": 0.88115692, "num_input_tokens_seen": 30595205, "step": 1448, "time_per_iteration": 2.703667640686035 }, { "auxiliary_loss_clip": 0.0117648, "auxiliary_loss_mlp": 0.0104016, "balance_loss_clip": 1.06227255, "balance_loss_mlp": 1.02893603, "epoch": 0.17423194853604282, "flos": 25374300537600.0, "grad_norm": 2.806840894651503, "language_loss": 0.84633338, "learning_rate": 3.7858517162991367e-06, "loss": 0.86849976, "num_input_tokens_seen": 30615280, "step": 1449, "time_per_iteration": 2.731600284576416 }, { "auxiliary_loss_clip": 0.01179586, "auxiliary_loss_mlp": 0.01037421, "balance_loss_clip": 1.05336297, "balance_loss_mlp": 1.02658534, "epoch": 0.1743521914266819, "flos": 25191874339200.0, "grad_norm": 2.6047806528481043, "language_loss": 0.61037409, "learning_rate": 3.7855008848991363e-06, "loss": 0.63254416, "num_input_tokens_seen": 30633485, "step": 1450, "time_per_iteration": 5.538543224334717 }, { "auxiliary_loss_clip": 0.01186003, "auxiliary_loss_mlp": 0.01032997, "balance_loss_clip": 1.05597258, "balance_loss_mlp": 1.02235806, "epoch": 0.17447243431732098, "flos": 25666577504640.0, "grad_norm": 1.8930947032185024, "language_loss": 0.77847755, "learning_rate": 3.7851497826400714e-06, "loss": 0.80066752, "num_input_tokens_seen": 30653625, "step": 1451, "time_per_iteration": 2.7068231105804443 }, { "auxiliary_loss_clip": 0.01225303, "auxiliary_loss_mlp": 0.01037223, "balance_loss_clip": 1.06441963, "balance_loss_mlp": 1.02571297, "epoch": 0.17459267720796007, "flos": 36281950657920.0, "grad_norm": 2.190051188089824, "language_loss": 0.76136565, "learning_rate": 3.7847984095752034e-06, "loss": 0.78399086, "num_input_tokens_seen": 30677080, "step": 1452, "time_per_iteration": 2.819262981414795 }, { "auxiliary_loss_clip": 0.01223189, "auxiliary_loss_mlp": 0.01041839, "balance_loss_clip": 1.06170464, "balance_loss_mlp": 1.03094912, "epoch": 0.17471292009859918, "flos": 20011113959040.0, "grad_norm": 2.1789644755244386, "language_loss": 0.80741262, "learning_rate": 3.784446765757836e-06, "loss": 0.83006287, "num_input_tokens_seen": 30695725, "step": 1453, "time_per_iteration": 2.548492193222046 }, { "auxiliary_loss_clip": 0.01162756, "auxiliary_loss_mlp": 0.01035257, "balance_loss_clip": 1.05346417, "balance_loss_mlp": 1.02431345, "epoch": 0.17483316298923826, "flos": 27819242559360.0, "grad_norm": 2.1830890684036377, "language_loss": 0.77905548, "learning_rate": 3.7840948512413133e-06, "loss": 0.80103564, "num_input_tokens_seen": 30713310, "step": 1454, "time_per_iteration": 2.7616875171661377 }, { "auxiliary_loss_clip": 0.0117651, "auxiliary_loss_mlp": 0.01036416, "balance_loss_clip": 1.05978966, "balance_loss_mlp": 1.02546036, "epoch": 0.17495340587987734, "flos": 44017934791680.0, "grad_norm": 2.0477412226349414, "language_loss": 0.79007757, "learning_rate": 3.7837426660790196e-06, "loss": 0.81220686, "num_input_tokens_seen": 30734725, "step": 1455, "time_per_iteration": 2.8776140213012695 }, { "auxiliary_loss_clip": 0.0122028, "auxiliary_loss_mlp": 0.01037465, "balance_loss_clip": 1.06004345, "balance_loss_mlp": 1.02677786, "epoch": 0.17507364877051645, "flos": 20885825957760.0, "grad_norm": 2.0319704277145676, "language_loss": 0.8212356, "learning_rate": 3.783390210324382e-06, "loss": 0.84381306, "num_input_tokens_seen": 30754450, "step": 1456, "time_per_iteration": 2.6259491443634033 }, { "auxiliary_loss_clip": 0.01177036, "auxiliary_loss_mlp": 0.01033826, "balance_loss_clip": 1.05782819, "balance_loss_mlp": 1.02311468, "epoch": 0.17519389166115554, "flos": 24717602136960.0, "grad_norm": 8.358997140889132, "language_loss": 0.72720116, "learning_rate": 3.7830374840308676e-06, "loss": 0.74930978, "num_input_tokens_seen": 30774605, "step": 1457, "time_per_iteration": 2.73349928855896 }, { "auxiliary_loss_clip": 0.01210574, "auxiliary_loss_mlp": 0.01036559, "balance_loss_clip": 1.06397033, "balance_loss_mlp": 1.02533507, "epoch": 0.17531413455179462, "flos": 23798144770560.0, "grad_norm": 2.944344584648095, "language_loss": 0.82482076, "learning_rate": 3.7826844872519842e-06, "loss": 0.84729207, "num_input_tokens_seen": 30792460, "step": 1458, "time_per_iteration": 2.65874981880188 }, { "auxiliary_loss_clip": 0.01193872, "auxiliary_loss_mlp": 0.0103752, "balance_loss_clip": 1.06271482, "balance_loss_mlp": 1.02718413, "epoch": 0.1754343774424337, "flos": 24572379450240.0, "grad_norm": 3.7897461846972207, "language_loss": 0.72351772, "learning_rate": 3.782331220041282e-06, "loss": 0.74583161, "num_input_tokens_seen": 30812525, "step": 1459, "time_per_iteration": 2.7126193046569824 }, { "auxiliary_loss_clip": 0.01186052, "auxiliary_loss_mlp": 0.01036455, "balance_loss_clip": 1.05572891, "balance_loss_mlp": 1.02604222, "epoch": 0.17555462033307281, "flos": 18114599767680.0, "grad_norm": 2.9777343610132925, "language_loss": 0.83277702, "learning_rate": 3.7819776824523504e-06, "loss": 0.85500205, "num_input_tokens_seen": 30830390, "step": 1460, "time_per_iteration": 2.6637237071990967 }, { "auxiliary_loss_clip": 0.01203757, "auxiliary_loss_mlp": 0.01038733, "balance_loss_clip": 1.06007195, "balance_loss_mlp": 1.02743793, "epoch": 0.1756748632237119, "flos": 28366018364160.0, "grad_norm": 1.8995387074856196, "language_loss": 0.83891654, "learning_rate": 3.7816238745388213e-06, "loss": 0.86134142, "num_input_tokens_seen": 30849935, "step": 1461, "time_per_iteration": 2.715343713760376 }, { "auxiliary_loss_clip": 0.01195339, "auxiliary_loss_mlp": 0.01039226, "balance_loss_clip": 1.05519998, "balance_loss_mlp": 1.0279603, "epoch": 0.17579510611435098, "flos": 25732939881600.0, "grad_norm": 2.06976280443509, "language_loss": 0.87413156, "learning_rate": 3.781269796354367e-06, "loss": 0.89647722, "num_input_tokens_seen": 30869555, "step": 1462, "time_per_iteration": 2.7079031467437744 }, { "auxiliary_loss_clip": 0.0120052, "auxiliary_loss_mlp": 0.01041077, "balance_loss_clip": 1.06368864, "balance_loss_mlp": 1.03033602, "epoch": 0.1759153490049901, "flos": 18588081870720.0, "grad_norm": 1.7366994528358626, "language_loss": 0.8589142, "learning_rate": 3.7809154479527006e-06, "loss": 0.88133013, "num_input_tokens_seen": 30888760, "step": 1463, "time_per_iteration": 2.664198875427246 }, { "auxiliary_loss_clip": 0.01170275, "auxiliary_loss_mlp": 0.01032458, "balance_loss_clip": 1.05361068, "balance_loss_mlp": 1.02175856, "epoch": 0.17603559189562917, "flos": 18619323724800.0, "grad_norm": 2.300537578986749, "language_loss": 0.84274334, "learning_rate": 3.780560829387577e-06, "loss": 0.86477065, "num_input_tokens_seen": 30907260, "step": 1464, "time_per_iteration": 2.6615333557128906 }, { "auxiliary_loss_clip": 0.01113461, "auxiliary_loss_mlp": 0.01008904, "balance_loss_clip": 1.03028893, "balance_loss_mlp": 1.00634134, "epoch": 0.17615583478626826, "flos": 60530775373440.0, "grad_norm": 0.856271415963087, "language_loss": 0.57956278, "learning_rate": 3.7802059407127915e-06, "loss": 0.60078645, "num_input_tokens_seen": 30965810, "step": 1465, "time_per_iteration": 3.1868064403533936 }, { "auxiliary_loss_clip": 0.01185199, "auxiliary_loss_mlp": 0.0104106, "balance_loss_clip": 1.05337477, "balance_loss_mlp": 1.02999187, "epoch": 0.17627607767690734, "flos": 23616221362560.0, "grad_norm": 2.22115736069406, "language_loss": 0.86102527, "learning_rate": 3.7798507819821797e-06, "loss": 0.88328791, "num_input_tokens_seen": 30982935, "step": 1466, "time_per_iteration": 2.671494960784912 }, { "auxiliary_loss_clip": 0.01167466, "auxiliary_loss_mlp": 0.01028823, "balance_loss_clip": 1.0556314, "balance_loss_mlp": 1.01792693, "epoch": 0.17639632056754645, "flos": 17639070589440.0, "grad_norm": 2.4798735935630187, "language_loss": 0.79383594, "learning_rate": 3.7794953532496197e-06, "loss": 0.81579888, "num_input_tokens_seen": 30998840, "step": 1467, "time_per_iteration": 2.6757991313934326 }, { "auxiliary_loss_clip": 0.01087377, "auxiliary_loss_mlp": 0.00757265, "balance_loss_clip": 1.04324389, "balance_loss_mlp": 1.00107861, "epoch": 0.17651656345818553, "flos": 57932604910080.0, "grad_norm": 0.8577574351629216, "language_loss": 0.5798983, "learning_rate": 3.7791396545690295e-06, "loss": 0.59834474, "num_input_tokens_seen": 31060075, "step": 1468, "time_per_iteration": 3.2522215843200684 }, { "auxiliary_loss_clip": 0.01208372, "auxiliary_loss_mlp": 0.01040576, "balance_loss_clip": 1.06046963, "balance_loss_mlp": 1.02988267, "epoch": 0.17663680634882462, "flos": 22929502170240.0, "grad_norm": 2.0834266025202073, "language_loss": 0.80879736, "learning_rate": 3.7787836859943685e-06, "loss": 0.83128685, "num_input_tokens_seen": 31078800, "step": 1469, "time_per_iteration": 2.6371004581451416 }, { "auxiliary_loss_clip": 0.01203308, "auxiliary_loss_mlp": 0.01033729, "balance_loss_clip": 1.05575204, "balance_loss_mlp": 1.02227914, "epoch": 0.17675704923946373, "flos": 22637979388800.0, "grad_norm": 2.4691593755137333, "language_loss": 0.78554195, "learning_rate": 3.7784274475796363e-06, "loss": 0.80791235, "num_input_tokens_seen": 31097430, "step": 1470, "time_per_iteration": 2.6697990894317627 }, { "auxiliary_loss_clip": 0.0118212, "auxiliary_loss_mlp": 0.01040622, "balance_loss_clip": 1.05588055, "balance_loss_mlp": 1.02847469, "epoch": 0.1768772921301028, "flos": 27126525795840.0, "grad_norm": 2.7366067321924734, "language_loss": 0.75790048, "learning_rate": 3.7780709393788745e-06, "loss": 0.78012794, "num_input_tokens_seen": 31117905, "step": 1471, "time_per_iteration": 2.7280287742614746 }, { "auxiliary_loss_clip": 0.01220801, "auxiliary_loss_mlp": 0.01038116, "balance_loss_clip": 1.06104171, "balance_loss_mlp": 1.02708316, "epoch": 0.1769975350207419, "flos": 19172133014400.0, "grad_norm": 2.0482150438751665, "language_loss": 0.75249445, "learning_rate": 3.777714161446165e-06, "loss": 0.77508354, "num_input_tokens_seen": 31137610, "step": 1472, "time_per_iteration": 3.607952117919922 }, { "auxiliary_loss_clip": 0.01210978, "auxiliary_loss_mlp": 0.01034546, "balance_loss_clip": 1.06048012, "balance_loss_mlp": 1.02381182, "epoch": 0.177117777911381, "flos": 36134932291200.0, "grad_norm": 3.482897521573406, "language_loss": 0.6954968, "learning_rate": 3.7773571138356304e-06, "loss": 0.71795207, "num_input_tokens_seen": 31157780, "step": 1473, "time_per_iteration": 2.8020639419555664 }, { "auxiliary_loss_clip": 0.01152826, "auxiliary_loss_mlp": 0.01042225, "balance_loss_clip": 1.0558846, "balance_loss_mlp": 1.03165662, "epoch": 0.17723802080202009, "flos": 22090593052800.0, "grad_norm": 2.819462449038129, "language_loss": 0.89291161, "learning_rate": 3.776999796601435e-06, "loss": 0.9148621, "num_input_tokens_seen": 31176540, "step": 1474, "time_per_iteration": 2.712747812271118 }, { "auxiliary_loss_clip": 0.01216427, "auxiliary_loss_mlp": 0.01040648, "balance_loss_clip": 1.06424701, "balance_loss_mlp": 1.02925122, "epoch": 0.17735826369265917, "flos": 30222671437440.0, "grad_norm": 2.2350172850720815, "language_loss": 0.72904682, "learning_rate": 3.776642209797783e-06, "loss": 0.75161755, "num_input_tokens_seen": 31198370, "step": 1475, "time_per_iteration": 2.7424850463867188 }, { "auxiliary_loss_clip": 0.01200332, "auxiliary_loss_mlp": 0.01046608, "balance_loss_clip": 1.05749202, "balance_loss_mlp": 1.03491926, "epoch": 0.17747850658329825, "flos": 21397588980480.0, "grad_norm": 2.4739380629783407, "language_loss": 0.78113347, "learning_rate": 3.7762843534789205e-06, "loss": 0.80360293, "num_input_tokens_seen": 31217120, "step": 1476, "time_per_iteration": 4.455347537994385 }, { "auxiliary_loss_clip": 0.0119609, "auxiliary_loss_mlp": 0.01036031, "balance_loss_clip": 1.05695081, "balance_loss_mlp": 1.02432466, "epoch": 0.17759874947393736, "flos": 16983341856000.0, "grad_norm": 3.9846752132317707, "language_loss": 0.88412607, "learning_rate": 3.7759262276991343e-06, "loss": 0.90644723, "num_input_tokens_seen": 31234730, "step": 1477, "time_per_iteration": 2.6873996257781982 }, { "auxiliary_loss_clip": 0.01200224, "auxiliary_loss_mlp": 0.01041776, "balance_loss_clip": 1.06019711, "balance_loss_mlp": 1.03039777, "epoch": 0.17771899236457644, "flos": 11546107390080.0, "grad_norm": 3.4652024160164223, "language_loss": 0.80727243, "learning_rate": 3.7755678325127506e-06, "loss": 0.82969242, "num_input_tokens_seen": 31252410, "step": 1478, "time_per_iteration": 2.6408019065856934 }, { "auxiliary_loss_clip": 0.01157659, "auxiliary_loss_mlp": 0.01038433, "balance_loss_clip": 1.05624568, "balance_loss_mlp": 1.02881265, "epoch": 0.17783923525521553, "flos": 18807747494400.0, "grad_norm": 1.8664464248934465, "language_loss": 0.75644612, "learning_rate": 3.7752091679741393e-06, "loss": 0.77840698, "num_input_tokens_seen": 31270200, "step": 1479, "time_per_iteration": 2.7341856956481934 }, { "auxiliary_loss_clip": 0.01205086, "auxiliary_loss_mlp": 0.01037014, "balance_loss_clip": 1.05916214, "balance_loss_mlp": 1.02570081, "epoch": 0.17795947814585464, "flos": 30408365773440.0, "grad_norm": 5.07705131741926, "language_loss": 0.77805638, "learning_rate": 3.774850234137708e-06, "loss": 0.80047739, "num_input_tokens_seen": 31287495, "step": 1480, "time_per_iteration": 2.721266269683838 }, { "auxiliary_loss_clip": 0.01204305, "auxiliary_loss_mlp": 0.01040749, "balance_loss_clip": 1.05854452, "balance_loss_mlp": 1.02960896, "epoch": 0.17807972103649372, "flos": 24389055411840.0, "grad_norm": 3.202785772095777, "language_loss": 0.83118099, "learning_rate": 3.7744910310579076e-06, "loss": 0.85363156, "num_input_tokens_seen": 31306420, "step": 1481, "time_per_iteration": 2.6455836296081543 }, { "auxiliary_loss_clip": 0.01222852, "auxiliary_loss_mlp": 0.01029552, "balance_loss_clip": 1.06354475, "balance_loss_mlp": 1.01946104, "epoch": 0.1781999639271328, "flos": 20301559332480.0, "grad_norm": 2.183064579157, "language_loss": 0.85431188, "learning_rate": 3.774131558789229e-06, "loss": 0.87683594, "num_input_tokens_seen": 31325750, "step": 1482, "time_per_iteration": 2.637542724609375 }, { "auxiliary_loss_clip": 0.01221943, "auxiliary_loss_mlp": 0.00765529, "balance_loss_clip": 1.06272209, "balance_loss_mlp": 1.00032067, "epoch": 0.1783202068177719, "flos": 15924479806080.0, "grad_norm": 2.9361214873853685, "language_loss": 0.70261681, "learning_rate": 3.773771817386203e-06, "loss": 0.7224915, "num_input_tokens_seen": 31343080, "step": 1483, "time_per_iteration": 2.561711072921753 }, { "auxiliary_loss_clip": 0.01192224, "auxiliary_loss_mlp": 0.01042113, "balance_loss_clip": 1.05820155, "balance_loss_mlp": 1.0317241, "epoch": 0.178440449708411, "flos": 20631758083200.0, "grad_norm": 1.6507507940809993, "language_loss": 0.79399836, "learning_rate": 3.773411806903403e-06, "loss": 0.81634176, "num_input_tokens_seen": 31362160, "step": 1484, "time_per_iteration": 2.7249412536621094 }, { "auxiliary_loss_clip": 0.01156171, "auxiliary_loss_mlp": 0.01044214, "balance_loss_clip": 1.05597425, "balance_loss_mlp": 1.03334165, "epoch": 0.17856069259905008, "flos": 21686059105920.0, "grad_norm": 1.8652448150744427, "language_loss": 0.94738078, "learning_rate": 3.7730515273954415e-06, "loss": 0.96938455, "num_input_tokens_seen": 31380770, "step": 1485, "time_per_iteration": 2.755969524383545 }, { "auxiliary_loss_clip": 0.01220911, "auxiliary_loss_mlp": 0.01036151, "balance_loss_clip": 1.06308913, "balance_loss_mlp": 1.02505898, "epoch": 0.17868093548968916, "flos": 26572962320640.0, "grad_norm": 2.4590614620788918, "language_loss": 0.85089135, "learning_rate": 3.772690978916973e-06, "loss": 0.87346196, "num_input_tokens_seen": 31400525, "step": 1486, "time_per_iteration": 2.6572811603546143 }, { "auxiliary_loss_clip": 0.01206889, "auxiliary_loss_mlp": 0.01041186, "balance_loss_clip": 1.06037462, "balance_loss_mlp": 1.02993226, "epoch": 0.17880117838032827, "flos": 18581006891520.0, "grad_norm": 2.0196853401237127, "language_loss": 0.86559904, "learning_rate": 3.772330161522693e-06, "loss": 0.88807976, "num_input_tokens_seen": 31418435, "step": 1487, "time_per_iteration": 2.62556529045105 }, { "auxiliary_loss_clip": 0.01195675, "auxiliary_loss_mlp": 0.01033323, "balance_loss_clip": 1.06579161, "balance_loss_mlp": 1.02231443, "epoch": 0.17892142127096736, "flos": 26541217676160.0, "grad_norm": 2.22740113039234, "language_loss": 0.79661804, "learning_rate": 3.7719690752673365e-06, "loss": 0.81890798, "num_input_tokens_seen": 31439230, "step": 1488, "time_per_iteration": 2.6927714347839355 }, { "auxiliary_loss_clip": 0.01181992, "auxiliary_loss_mlp": 0.01038525, "balance_loss_clip": 1.06125891, "balance_loss_mlp": 1.02842784, "epoch": 0.17904166416160644, "flos": 23872623621120.0, "grad_norm": 2.1764372132252636, "language_loss": 0.78681922, "learning_rate": 3.7716077202056796e-06, "loss": 0.80902445, "num_input_tokens_seen": 31457705, "step": 1489, "time_per_iteration": 2.76432204246521 }, { "auxiliary_loss_clip": 0.01180209, "auxiliary_loss_mlp": 0.01036492, "balance_loss_clip": 1.05742669, "balance_loss_mlp": 1.02562642, "epoch": 0.17916190705224552, "flos": 19134426712320.0, "grad_norm": 2.7845375015612728, "language_loss": 0.93584228, "learning_rate": 3.7712460963925404e-06, "loss": 0.95800936, "num_input_tokens_seen": 31473645, "step": 1490, "time_per_iteration": 2.6645596027374268 }, { "auxiliary_loss_clip": 0.01180206, "auxiliary_loss_mlp": 0.01032008, "balance_loss_clip": 1.05305052, "balance_loss_mlp": 1.02136254, "epoch": 0.17928214994288463, "flos": 25152120961920.0, "grad_norm": 2.104899015224723, "language_loss": 0.75887668, "learning_rate": 3.7708842038827775e-06, "loss": 0.78099883, "num_input_tokens_seen": 31492605, "step": 1491, "time_per_iteration": 2.6988956928253174 }, { "auxiliary_loss_clip": 0.01206247, "auxiliary_loss_mlp": 0.01043209, "balance_loss_clip": 1.05870342, "balance_loss_mlp": 1.03233719, "epoch": 0.17940239283352372, "flos": 22384629786240.0, "grad_norm": 1.837977430375674, "language_loss": 0.85780579, "learning_rate": 3.770522042731288e-06, "loss": 0.88030034, "num_input_tokens_seen": 31514500, "step": 1492, "time_per_iteration": 2.6633012294769287 }, { "auxiliary_loss_clip": 0.01153884, "auxiliary_loss_mlp": 0.01034693, "balance_loss_clip": 1.05431116, "balance_loss_mlp": 1.02268255, "epoch": 0.1795226357241628, "flos": 23178685795200.0, "grad_norm": 3.4223667252981294, "language_loss": 0.88099819, "learning_rate": 3.7701596129930122e-06, "loss": 0.90288401, "num_input_tokens_seen": 31533225, "step": 1493, "time_per_iteration": 2.7647504806518555 }, { "auxiliary_loss_clip": 0.01185472, "auxiliary_loss_mlp": 0.01035223, "balance_loss_clip": 1.05919743, "balance_loss_mlp": 1.0241785, "epoch": 0.1796428786148019, "flos": 22090413484800.0, "grad_norm": 1.9985352966411212, "language_loss": 0.73371691, "learning_rate": 3.7697969147229315e-06, "loss": 0.75592387, "num_input_tokens_seen": 31551385, "step": 1494, "time_per_iteration": 2.6852943897247314 }, { "auxiliary_loss_clip": 0.01203908, "auxiliary_loss_mlp": 0.01034932, "balance_loss_clip": 1.05812299, "balance_loss_mlp": 1.0242691, "epoch": 0.179763121505441, "flos": 21324618501120.0, "grad_norm": 3.422576437416537, "language_loss": 0.85315645, "learning_rate": 3.7694339479760647e-06, "loss": 0.87554485, "num_input_tokens_seen": 31570415, "step": 1495, "time_per_iteration": 2.730390787124634 }, { "auxiliary_loss_clip": 0.01102571, "auxiliary_loss_mlp": 0.01005075, "balance_loss_clip": 1.03228998, "balance_loss_mlp": 1.00238132, "epoch": 0.17988336439608008, "flos": 68161864815360.0, "grad_norm": 0.8011476154917179, "language_loss": 0.5735482, "learning_rate": 3.769070712807476e-06, "loss": 0.59462464, "num_input_tokens_seen": 31632445, "step": 1496, "time_per_iteration": 3.2969655990600586 }, { "auxiliary_loss_clip": 0.01137501, "auxiliary_loss_mlp": 0.01037596, "balance_loss_clip": 1.0543561, "balance_loss_mlp": 1.02743888, "epoch": 0.18000360728671919, "flos": 21945047143680.0, "grad_norm": 2.2346381953774044, "language_loss": 0.78763902, "learning_rate": 3.768707209272266e-06, "loss": 0.80939001, "num_input_tokens_seen": 31652575, "step": 1497, "time_per_iteration": 3.7097339630126953 }, { "auxiliary_loss_clip": 0.01185112, "auxiliary_loss_mlp": 0.01034492, "balance_loss_clip": 1.05497885, "balance_loss_mlp": 1.02333403, "epoch": 0.18012385017735827, "flos": 18986330937600.0, "grad_norm": 2.30493783304258, "language_loss": 0.7696833, "learning_rate": 3.768343437425579e-06, "loss": 0.7918793, "num_input_tokens_seen": 31671145, "step": 1498, "time_per_iteration": 2.685978651046753 }, { "auxiliary_loss_clip": 0.01125444, "auxiliary_loss_mlp": 0.01040795, "balance_loss_clip": 1.05189931, "balance_loss_mlp": 1.02929139, "epoch": 0.18024409306799735, "flos": 19748103598080.0, "grad_norm": 2.785100245837132, "language_loss": 0.86319327, "learning_rate": 3.7679793973225987e-06, "loss": 0.88485563, "num_input_tokens_seen": 31686955, "step": 1499, "time_per_iteration": 2.784003496170044 }, { "auxiliary_loss_clip": 0.01071333, "auxiliary_loss_mlp": 0.01007019, "balance_loss_clip": 1.02887845, "balance_loss_mlp": 1.00417006, "epoch": 0.18036433595863643, "flos": 67227183060480.0, "grad_norm": 0.8802582100401489, "language_loss": 0.61638987, "learning_rate": 3.767615089018549e-06, "loss": 0.63717341, "num_input_tokens_seen": 31749300, "step": 1500, "time_per_iteration": 3.3119330406188965 }, { "auxiliary_loss_clip": 0.01184901, "auxiliary_loss_mlp": 0.01045123, "balance_loss_clip": 1.05675173, "balance_loss_mlp": 1.03363132, "epoch": 0.18048457884927555, "flos": 18181464935040.0, "grad_norm": 4.720955942267611, "language_loss": 0.85996592, "learning_rate": 3.7672505125686966e-06, "loss": 0.88226616, "num_input_tokens_seen": 31765665, "step": 1501, "time_per_iteration": 3.5245425701141357 }, { "auxiliary_loss_clip": 0.01159389, "auxiliary_loss_mlp": 0.01041649, "balance_loss_clip": 1.05420756, "balance_loss_mlp": 1.03018737, "epoch": 0.18060482173991463, "flos": 15813767111040.0, "grad_norm": 4.09909658600406, "language_loss": 0.84476846, "learning_rate": 3.7668856680283455e-06, "loss": 0.86677885, "num_input_tokens_seen": 31782690, "step": 1502, "time_per_iteration": 3.6115450859069824 }, { "auxiliary_loss_clip": 0.01199486, "auxiliary_loss_mlp": 0.01035956, "balance_loss_clip": 1.06021976, "balance_loss_mlp": 1.02517378, "epoch": 0.1807250646305537, "flos": 18587399512320.0, "grad_norm": 1.9679725395845071, "language_loss": 0.82734519, "learning_rate": 3.7665205554528437e-06, "loss": 0.84969962, "num_input_tokens_seen": 31802045, "step": 1503, "time_per_iteration": 3.576866388320923 }, { "auxiliary_loss_clip": 0.01190657, "auxiliary_loss_mlp": 0.01038424, "balance_loss_clip": 1.06180382, "balance_loss_mlp": 1.02703989, "epoch": 0.18084530752119282, "flos": 23149131880320.0, "grad_norm": 2.1429393656805775, "language_loss": 0.74432653, "learning_rate": 3.7661551748975782e-06, "loss": 0.76661736, "num_input_tokens_seen": 31820220, "step": 1504, "time_per_iteration": 2.7332825660705566 }, { "auxiliary_loss_clip": 0.01095017, "auxiliary_loss_mlp": 0.01004178, "balance_loss_clip": 1.02560365, "balance_loss_mlp": 1.00150764, "epoch": 0.1809655504118319, "flos": 59803153568640.0, "grad_norm": 0.8113945235702145, "language_loss": 0.60501766, "learning_rate": 3.7657895264179772e-06, "loss": 0.62600958, "num_input_tokens_seen": 31876195, "step": 1505, "time_per_iteration": 3.258424758911133 }, { "auxiliary_loss_clip": 0.01187805, "auxiliary_loss_mlp": 0.01039831, "balance_loss_clip": 1.05833924, "balance_loss_mlp": 1.02833903, "epoch": 0.181085793302471, "flos": 44201941188480.0, "grad_norm": 1.9671106367414704, "language_loss": 0.74751341, "learning_rate": 3.765423610069509e-06, "loss": 0.76978976, "num_input_tokens_seen": 31901585, "step": 1506, "time_per_iteration": 2.9830660820007324 }, { "auxiliary_loss_clip": 0.01195583, "auxiliary_loss_mlp": 0.01039337, "balance_loss_clip": 1.06159282, "balance_loss_mlp": 1.02775586, "epoch": 0.18120603619311007, "flos": 34898384638080.0, "grad_norm": 1.8856338638125494, "language_loss": 0.73137265, "learning_rate": 3.765057425907683e-06, "loss": 0.75372183, "num_input_tokens_seen": 31923045, "step": 1507, "time_per_iteration": 2.8265762329101562 }, { "auxiliary_loss_clip": 0.01210152, "auxiliary_loss_mlp": 0.01035303, "balance_loss_clip": 1.05947614, "balance_loss_mlp": 1.02478266, "epoch": 0.18132627908374918, "flos": 21506757390720.0, "grad_norm": 1.8928962531001756, "language_loss": 0.78825259, "learning_rate": 3.764690973988048e-06, "loss": 0.81070709, "num_input_tokens_seen": 31943385, "step": 1508, "time_per_iteration": 2.6553759574890137 }, { "auxiliary_loss_clip": 0.01180872, "auxiliary_loss_mlp": 0.01040105, "balance_loss_clip": 1.05789423, "balance_loss_mlp": 1.02956736, "epoch": 0.18144652197438826, "flos": 29057693633280.0, "grad_norm": 2.0271499772919004, "language_loss": 0.74016738, "learning_rate": 3.7643242543661967e-06, "loss": 0.76237714, "num_input_tokens_seen": 31966045, "step": 1509, "time_per_iteration": 2.767719030380249 }, { "auxiliary_loss_clip": 0.01092667, "auxiliary_loss_mlp": 0.0100254, "balance_loss_clip": 1.02933884, "balance_loss_mlp": 0.99984568, "epoch": 0.18156676486502735, "flos": 68675064382080.0, "grad_norm": 0.815735386778952, "language_loss": 0.60423523, "learning_rate": 3.7639572670977573e-06, "loss": 0.62518728, "num_input_tokens_seen": 32021540, "step": 1510, "time_per_iteration": 3.145909547805786 }, { "auxiliary_loss_clip": 0.01179225, "auxiliary_loss_mlp": 0.01032884, "balance_loss_clip": 1.05631661, "balance_loss_mlp": 1.02303767, "epoch": 0.18168700775566646, "flos": 26471515334400.0, "grad_norm": 2.269369433723992, "language_loss": 0.76507246, "learning_rate": 3.7635900122384042e-06, "loss": 0.78719354, "num_input_tokens_seen": 32044535, "step": 1511, "time_per_iteration": 2.770695447921753 }, { "auxiliary_loss_clip": 0.01199179, "auxiliary_loss_mlp": 0.01043718, "balance_loss_clip": 1.06047273, "balance_loss_mlp": 1.03192186, "epoch": 0.18180725064630554, "flos": 15005668884480.0, "grad_norm": 2.2065914506343343, "language_loss": 0.86825395, "learning_rate": 3.7632224898438477e-06, "loss": 0.89068294, "num_input_tokens_seen": 32061010, "step": 1512, "time_per_iteration": 2.642801523208618 }, { "auxiliary_loss_clip": 0.01177396, "auxiliary_loss_mlp": 0.01036003, "balance_loss_clip": 1.0540669, "balance_loss_mlp": 1.02597713, "epoch": 0.18192749353694462, "flos": 19682387665920.0, "grad_norm": 3.6415886196113703, "language_loss": 0.79210877, "learning_rate": 3.762854699969842e-06, "loss": 0.81424278, "num_input_tokens_seen": 32081520, "step": 1513, "time_per_iteration": 2.7427663803100586 }, { "auxiliary_loss_clip": 0.01205837, "auxiliary_loss_mlp": 0.01034402, "balance_loss_clip": 1.06225657, "balance_loss_mlp": 1.02408433, "epoch": 0.1820477364275837, "flos": 20702717400960.0, "grad_norm": 1.9737186271089855, "language_loss": 0.73094189, "learning_rate": 3.762486642672179e-06, "loss": 0.7533443, "num_input_tokens_seen": 32098460, "step": 1514, "time_per_iteration": 2.6408891677856445 }, { "auxiliary_loss_clip": 0.01197735, "auxiliary_loss_mlp": 0.01032479, "balance_loss_clip": 1.06167412, "balance_loss_mlp": 1.02184582, "epoch": 0.18216797931822282, "flos": 17128708197120.0, "grad_norm": 2.1596216626296076, "language_loss": 0.87096071, "learning_rate": 3.7621183180066946e-06, "loss": 0.89326286, "num_input_tokens_seen": 32116420, "step": 1515, "time_per_iteration": 2.679025888442993 }, { "auxiliary_loss_clip": 0.01186124, "auxiliary_loss_mlp": 0.01035362, "balance_loss_clip": 1.05201066, "balance_loss_mlp": 1.02447271, "epoch": 0.1822882222088619, "flos": 29242561956480.0, "grad_norm": 1.8549359114350574, "language_loss": 0.74037498, "learning_rate": 3.7617497260292625e-06, "loss": 0.76258981, "num_input_tokens_seen": 32138475, "step": 1516, "time_per_iteration": 2.7294561862945557 }, { "auxiliary_loss_clip": 0.01188572, "auxiliary_loss_mlp": 0.0104525, "balance_loss_clip": 1.05934072, "balance_loss_mlp": 1.03350782, "epoch": 0.18240846509950098, "flos": 17702739446400.0, "grad_norm": 3.2758223001493367, "language_loss": 0.7857042, "learning_rate": 3.7613808667957967e-06, "loss": 0.80804235, "num_input_tokens_seen": 32151165, "step": 1517, "time_per_iteration": 2.6694765090942383 }, { "auxiliary_loss_clip": 0.01194802, "auxiliary_loss_mlp": 0.01037697, "balance_loss_clip": 1.05845201, "balance_loss_mlp": 1.02621698, "epoch": 0.1825287079901401, "flos": 14790025584000.0, "grad_norm": 2.0204416511851178, "language_loss": 0.91021287, "learning_rate": 3.7610117403622547e-06, "loss": 0.93253791, "num_input_tokens_seen": 32167725, "step": 1518, "time_per_iteration": 2.6257219314575195 }, { "auxiliary_loss_clip": 0.01165623, "auxiliary_loss_mlp": 0.01033741, "balance_loss_clip": 1.05126762, "balance_loss_mlp": 1.02327466, "epoch": 0.18264895088077918, "flos": 21946232292480.0, "grad_norm": 1.6522070271233864, "language_loss": 0.90037608, "learning_rate": 3.7606423467846313e-06, "loss": 0.92236972, "num_input_tokens_seen": 32187330, "step": 1519, "time_per_iteration": 2.7517755031585693 }, { "auxiliary_loss_clip": 0.01185996, "auxiliary_loss_mlp": 0.01038386, "balance_loss_clip": 1.06087494, "balance_loss_mlp": 1.02791953, "epoch": 0.18276919377141826, "flos": 20886759711360.0, "grad_norm": 2.0708404567846137, "language_loss": 0.79767936, "learning_rate": 3.760272686118964e-06, "loss": 0.81992316, "num_input_tokens_seen": 32205550, "step": 1520, "time_per_iteration": 2.7002108097076416 }, { "auxiliary_loss_clip": 0.01192408, "auxiliary_loss_mlp": 0.01037946, "balance_loss_clip": 1.05708814, "balance_loss_mlp": 1.02709842, "epoch": 0.18288943666205737, "flos": 21469877101440.0, "grad_norm": 2.24835767019247, "language_loss": 0.92644173, "learning_rate": 3.7599027584213297e-06, "loss": 0.94874531, "num_input_tokens_seen": 32224430, "step": 1521, "time_per_iteration": 2.7223899364471436 }, { "auxiliary_loss_clip": 0.01210897, "auxiliary_loss_mlp": 0.01040177, "balance_loss_clip": 1.05953503, "balance_loss_mlp": 1.02923357, "epoch": 0.18300967955269645, "flos": 21539363961600.0, "grad_norm": 2.1166431969046267, "language_loss": 0.78154182, "learning_rate": 3.7595325637478465e-06, "loss": 0.80405259, "num_input_tokens_seen": 32242455, "step": 1522, "time_per_iteration": 2.650888204574585 }, { "auxiliary_loss_clip": 0.01185476, "auxiliary_loss_mlp": 0.01045204, "balance_loss_clip": 1.06077313, "balance_loss_mlp": 1.03399825, "epoch": 0.18312992244333554, "flos": 28876237102080.0, "grad_norm": 1.8451627073144232, "language_loss": 0.81564701, "learning_rate": 3.7591621021546723e-06, "loss": 0.83795381, "num_input_tokens_seen": 32264450, "step": 1523, "time_per_iteration": 2.71543025970459 }, { "auxiliary_loss_clip": 0.01198556, "auxiliary_loss_mlp": 0.01042405, "balance_loss_clip": 1.05639613, "balance_loss_mlp": 1.03150368, "epoch": 0.18325016533397462, "flos": 20120102801280.0, "grad_norm": 2.006656340185675, "language_loss": 0.81693959, "learning_rate": 3.7587913736980062e-06, "loss": 0.83934915, "num_input_tokens_seen": 32284090, "step": 1524, "time_per_iteration": 3.5757951736450195 }, { "auxiliary_loss_clip": 0.0113251, "auxiliary_loss_mlp": 0.01043693, "balance_loss_clip": 1.04866862, "balance_loss_mlp": 1.03220153, "epoch": 0.18337040822461373, "flos": 23329187781120.0, "grad_norm": 1.628198307359135, "language_loss": 0.84508044, "learning_rate": 3.7584203784340865e-06, "loss": 0.86684251, "num_input_tokens_seen": 32303260, "step": 1525, "time_per_iteration": 2.761080265045166 }, { "auxiliary_loss_clip": 0.01187347, "auxiliary_loss_mlp": 0.01035232, "balance_loss_clip": 1.05584908, "balance_loss_mlp": 1.0243777, "epoch": 0.1834906511152528, "flos": 25009555881600.0, "grad_norm": 2.305725598596235, "language_loss": 0.85479438, "learning_rate": 3.7580491164191938e-06, "loss": 0.87702012, "num_input_tokens_seen": 32321570, "step": 1526, "time_per_iteration": 2.6950886249542236 }, { "auxiliary_loss_clip": 0.01108378, "auxiliary_loss_mlp": 0.01003146, "balance_loss_clip": 1.02621531, "balance_loss_mlp": 1.00057113, "epoch": 0.1836108940058919, "flos": 67251493589760.0, "grad_norm": 0.753394429682065, "language_loss": 0.61300087, "learning_rate": 3.757677587709648e-06, "loss": 0.63411605, "num_input_tokens_seen": 32384835, "step": 1527, "time_per_iteration": 4.208168029785156 }, { "auxiliary_loss_clip": 0.01171193, "auxiliary_loss_mlp": 0.0105292, "balance_loss_clip": 1.05639935, "balance_loss_mlp": 1.04199457, "epoch": 0.183731136896531, "flos": 25738721971200.0, "grad_norm": 2.7629933578772383, "language_loss": 0.75684947, "learning_rate": 3.7573057923618095e-06, "loss": 0.77909058, "num_input_tokens_seen": 32404930, "step": 1528, "time_per_iteration": 4.504167318344116 }, { "auxiliary_loss_clip": 0.01158715, "auxiliary_loss_mlp": 0.01032511, "balance_loss_clip": 1.05071008, "balance_loss_mlp": 1.02214611, "epoch": 0.1838513797871701, "flos": 20449403712000.0, "grad_norm": 2.2030850904400006, "language_loss": 0.74380654, "learning_rate": 3.7569337304320793e-06, "loss": 0.76571882, "num_input_tokens_seen": 32424515, "step": 1529, "time_per_iteration": 2.7050156593322754 }, { "auxiliary_loss_clip": 0.01090254, "auxiliary_loss_mlp": 0.0100435, "balance_loss_clip": 1.02225065, "balance_loss_mlp": 1.0017159, "epoch": 0.18397162267780917, "flos": 68565141786240.0, "grad_norm": 0.834205629065132, "language_loss": 0.64479637, "learning_rate": 3.756561401976899e-06, "loss": 0.6657424, "num_input_tokens_seen": 32484220, "step": 1530, "time_per_iteration": 3.1571104526519775 }, { "auxiliary_loss_clip": 0.01224498, "auxiliary_loss_mlp": 0.0103767, "balance_loss_clip": 1.06284559, "balance_loss_mlp": 1.02692962, "epoch": 0.18409186556844825, "flos": 31941104976000.0, "grad_norm": 2.370360550615901, "language_loss": 0.82795393, "learning_rate": 3.7561888070527514e-06, "loss": 0.85057563, "num_input_tokens_seen": 32506260, "step": 1531, "time_per_iteration": 2.6579456329345703 }, { "auxiliary_loss_clip": 0.01163816, "auxiliary_loss_mlp": 0.00765957, "balance_loss_clip": 1.05538225, "balance_loss_mlp": 1.00040245, "epoch": 0.18421210845908736, "flos": 20120533764480.0, "grad_norm": 2.2651025913241933, "language_loss": 0.80405438, "learning_rate": 3.7558159457161577e-06, "loss": 0.8233521, "num_input_tokens_seen": 32524225, "step": 1532, "time_per_iteration": 2.703951120376587 }, { "auxiliary_loss_clip": 0.01189495, "auxiliary_loss_mlp": 0.00765644, "balance_loss_clip": 1.05694175, "balance_loss_mlp": 1.00048661, "epoch": 0.18433235134972645, "flos": 23110491824640.0, "grad_norm": 3.1661083724776273, "language_loss": 0.78293765, "learning_rate": 3.755442818023681e-06, "loss": 0.80248904, "num_input_tokens_seen": 32543850, "step": 1533, "time_per_iteration": 2.690779685974121 }, { "auxiliary_loss_clip": 0.01173701, "auxiliary_loss_mlp": 0.01033258, "balance_loss_clip": 1.05628777, "balance_loss_mlp": 1.02308321, "epoch": 0.18445259424036553, "flos": 18291351617280.0, "grad_norm": 1.8765192831108266, "language_loss": 0.75920582, "learning_rate": 3.7550694240319246e-06, "loss": 0.78127539, "num_input_tokens_seen": 32561725, "step": 1534, "time_per_iteration": 2.7708969116210938 }, { "auxiliary_loss_clip": 0.01208527, "auxiliary_loss_mlp": 0.01034327, "balance_loss_clip": 1.05859756, "balance_loss_mlp": 1.02318668, "epoch": 0.18457283713100464, "flos": 21324079797120.0, "grad_norm": 2.031560077333653, "language_loss": 0.76692456, "learning_rate": 3.7546957637975326e-06, "loss": 0.78935307, "num_input_tokens_seen": 32579135, "step": 1535, "time_per_iteration": 2.6517465114593506 }, { "auxiliary_loss_clip": 0.01132797, "auxiliary_loss_mlp": 0.01035702, "balance_loss_clip": 1.04492617, "balance_loss_mlp": 1.02594435, "epoch": 0.18469308002164372, "flos": 20375679047040.0, "grad_norm": 1.4989304484593549, "language_loss": 0.74119532, "learning_rate": 3.7543218373771873e-06, "loss": 0.76288033, "num_input_tokens_seen": 32598460, "step": 1536, "time_per_iteration": 2.7783422470092773 }, { "auxiliary_loss_clip": 0.01136601, "auxiliary_loss_mlp": 0.00765261, "balance_loss_clip": 1.049263, "balance_loss_mlp": 1.0003624, "epoch": 0.1848133229122828, "flos": 26435892021120.0, "grad_norm": 1.405667770186295, "language_loss": 0.78065598, "learning_rate": 3.753947644827615e-06, "loss": 0.79967463, "num_input_tokens_seen": 32621920, "step": 1537, "time_per_iteration": 2.8490183353424072 }, { "auxiliary_loss_clip": 0.01092945, "auxiliary_loss_mlp": 0.01004962, "balance_loss_clip": 1.02077401, "balance_loss_mlp": 1.0023396, "epoch": 0.1849335658029219, "flos": 70547447612160.0, "grad_norm": 0.9478098100613206, "language_loss": 0.5716722, "learning_rate": 3.753573186205579e-06, "loss": 0.59265125, "num_input_tokens_seen": 32690040, "step": 1538, "time_per_iteration": 3.357631206512451 }, { "auxiliary_loss_clip": 0.01179199, "auxiliary_loss_mlp": 0.00765653, "balance_loss_clip": 1.05256522, "balance_loss_mlp": 1.00034702, "epoch": 0.185053808693561, "flos": 17384140788480.0, "grad_norm": 2.293041159548234, "language_loss": 0.7808972, "learning_rate": 3.753198461567885e-06, "loss": 0.80034578, "num_input_tokens_seen": 32707285, "step": 1539, "time_per_iteration": 2.7164037227630615 }, { "auxiliary_loss_clip": 0.01168278, "auxiliary_loss_mlp": 0.01035678, "balance_loss_clip": 1.05676031, "balance_loss_mlp": 1.02593303, "epoch": 0.18517405158420008, "flos": 28986159697920.0, "grad_norm": 2.3343906622686115, "language_loss": 0.92079973, "learning_rate": 3.7528234709713783e-06, "loss": 0.94283932, "num_input_tokens_seen": 32730030, "step": 1540, "time_per_iteration": 2.754969835281372 }, { "auxiliary_loss_clip": 0.01207359, "auxiliary_loss_mlp": 0.01042898, "balance_loss_clip": 1.05953884, "balance_loss_mlp": 1.03229988, "epoch": 0.18529429447483917, "flos": 26794962328320.0, "grad_norm": 2.2285643729722455, "language_loss": 0.8446945, "learning_rate": 3.7524482144729447e-06, "loss": 0.86719698, "num_input_tokens_seen": 32749485, "step": 1541, "time_per_iteration": 2.727708339691162 }, { "auxiliary_loss_clip": 0.01166277, "auxiliary_loss_mlp": 0.01044505, "balance_loss_clip": 1.05021858, "balance_loss_mlp": 1.03355527, "epoch": 0.18541453736547828, "flos": 13581595301760.0, "grad_norm": 2.0523139261393357, "language_loss": 0.83921075, "learning_rate": 3.7520726921295106e-06, "loss": 0.86131859, "num_input_tokens_seen": 32766205, "step": 1542, "time_per_iteration": 2.719616174697876 }, { "auxiliary_loss_clip": 0.01200944, "auxiliary_loss_mlp": 0.01039634, "balance_loss_clip": 1.05758786, "balance_loss_mlp": 1.02935207, "epoch": 0.18553478025611736, "flos": 24025424077440.0, "grad_norm": 2.7878874223455066, "language_loss": 0.72619563, "learning_rate": 3.751696903998042e-06, "loss": 0.74860144, "num_input_tokens_seen": 32784840, "step": 1543, "time_per_iteration": 2.6694929599761963 }, { "auxiliary_loss_clip": 0.01203687, "auxiliary_loss_mlp": 0.01045195, "balance_loss_clip": 1.06130791, "balance_loss_mlp": 1.03419209, "epoch": 0.18565502314675644, "flos": 25885165720320.0, "grad_norm": 1.8064544834521932, "language_loss": 0.7034775, "learning_rate": 3.7513208501355456e-06, "loss": 0.72596633, "num_input_tokens_seen": 32805945, "step": 1544, "time_per_iteration": 2.730578660964966 }, { "auxiliary_loss_clip": 0.01186007, "auxiliary_loss_mlp": 0.01036003, "balance_loss_clip": 1.05677509, "balance_loss_mlp": 1.02549517, "epoch": 0.18577526603739553, "flos": 19610063631360.0, "grad_norm": 1.8913219813158655, "language_loss": 0.83862758, "learning_rate": 3.750944530599069e-06, "loss": 0.86084765, "num_input_tokens_seen": 32825515, "step": 1545, "time_per_iteration": 2.6332736015319824 }, { "auxiliary_loss_clip": 0.0121367, "auxiliary_loss_mlp": 0.01038006, "balance_loss_clip": 1.06261134, "balance_loss_mlp": 1.02702105, "epoch": 0.18589550892803464, "flos": 18474891137280.0, "grad_norm": 2.638061196974815, "language_loss": 0.81366014, "learning_rate": 3.7505679454456992e-06, "loss": 0.83617693, "num_input_tokens_seen": 32842125, "step": 1546, "time_per_iteration": 2.647264003753662 }, { "auxiliary_loss_clip": 0.01125197, "auxiliary_loss_mlp": 0.01038666, "balance_loss_clip": 1.04923642, "balance_loss_mlp": 1.02783608, "epoch": 0.18601575181867372, "flos": 23549966726400.0, "grad_norm": 1.863593080330065, "language_loss": 0.70353127, "learning_rate": 3.750191094732564e-06, "loss": 0.72516984, "num_input_tokens_seen": 32862990, "step": 1547, "time_per_iteration": 2.80033278465271 }, { "auxiliary_loss_clip": 0.01124016, "auxiliary_loss_mlp": 0.00765497, "balance_loss_clip": 1.04614568, "balance_loss_mlp": 1.0004034, "epoch": 0.1861359947093128, "flos": 26360192108160.0, "grad_norm": 2.0613546795768576, "language_loss": 0.75225663, "learning_rate": 3.7498139785168313e-06, "loss": 0.77115178, "num_input_tokens_seen": 32883595, "step": 1548, "time_per_iteration": 2.8836395740509033 }, { "auxiliary_loss_clip": 0.01203837, "auxiliary_loss_mlp": 0.01036111, "balance_loss_clip": 1.05984747, "balance_loss_mlp": 1.02578807, "epoch": 0.1862562375999519, "flos": 23331198942720.0, "grad_norm": 1.6620364607766418, "language_loss": 0.775769, "learning_rate": 3.749436596855709e-06, "loss": 0.79816842, "num_input_tokens_seen": 32902895, "step": 1549, "time_per_iteration": 2.713435411453247 }, { "auxiliary_loss_clip": 0.01199839, "auxiliary_loss_mlp": 0.01043537, "balance_loss_clip": 1.05910087, "balance_loss_mlp": 1.03218269, "epoch": 0.186376480490591, "flos": 16648222942080.0, "grad_norm": 2.1044541381393436, "language_loss": 0.90375984, "learning_rate": 3.749058949806446e-06, "loss": 0.92619359, "num_input_tokens_seen": 32919620, "step": 1550, "time_per_iteration": 3.5943896770477295 }, { "auxiliary_loss_clip": 0.01204935, "auxiliary_loss_mlp": 0.01043959, "balance_loss_clip": 1.05878937, "balance_loss_mlp": 1.03376043, "epoch": 0.18649672338123008, "flos": 21468656039040.0, "grad_norm": 1.8869258175938082, "language_loss": 0.84575939, "learning_rate": 3.748681037426331e-06, "loss": 0.86824834, "num_input_tokens_seen": 32938830, "step": 1551, "time_per_iteration": 2.675835609436035 }, { "auxiliary_loss_clip": 0.01222203, "auxiliary_loss_mlp": 0.0103791, "balance_loss_clip": 1.06256175, "balance_loss_mlp": 1.02635837, "epoch": 0.1866169662718692, "flos": 12312728386560.0, "grad_norm": 2.138547454651232, "language_loss": 0.9206028, "learning_rate": 3.7483028597726936e-06, "loss": 0.94320393, "num_input_tokens_seen": 32955600, "step": 1552, "time_per_iteration": 2.5795977115631104 }, { "auxiliary_loss_clip": 0.01177532, "auxiliary_loss_mlp": 0.0103912, "balance_loss_clip": 1.05959272, "balance_loss_mlp": 1.02797341, "epoch": 0.18673720916250827, "flos": 23581280407680.0, "grad_norm": 1.7485933997125052, "language_loss": 0.62804615, "learning_rate": 3.7479244169029017e-06, "loss": 0.65021276, "num_input_tokens_seen": 32975390, "step": 1553, "time_per_iteration": 3.667445421218872 }, { "auxiliary_loss_clip": 0.01205208, "auxiliary_loss_mlp": 0.01024101, "balance_loss_clip": 1.05507171, "balance_loss_mlp": 1.01315725, "epoch": 0.18685745205314735, "flos": 19718370115200.0, "grad_norm": 2.2766063790769855, "language_loss": 0.73617238, "learning_rate": 3.7475457088743658e-06, "loss": 0.75846547, "num_input_tokens_seen": 32992640, "step": 1554, "time_per_iteration": 4.511739492416382 }, { "auxiliary_loss_clip": 0.01181066, "auxiliary_loss_mlp": 0.01032778, "balance_loss_clip": 1.05635643, "balance_loss_mlp": 1.0224309, "epoch": 0.18697769494378644, "flos": 34204123589760.0, "grad_norm": 2.1464264030806066, "language_loss": 0.74841517, "learning_rate": 3.7471667357445348e-06, "loss": 0.77055365, "num_input_tokens_seen": 33012470, "step": 1555, "time_per_iteration": 2.8026514053344727 }, { "auxiliary_loss_clip": 0.01149765, "auxiliary_loss_mlp": 0.01037529, "balance_loss_clip": 1.05454826, "balance_loss_mlp": 1.02707982, "epoch": 0.18709793783442555, "flos": 34241327101440.0, "grad_norm": 2.1731735162998476, "language_loss": 0.72486848, "learning_rate": 3.7467874975709e-06, "loss": 0.74674141, "num_input_tokens_seen": 33033275, "step": 1556, "time_per_iteration": 2.9177565574645996 }, { "auxiliary_loss_clip": 0.01211216, "auxiliary_loss_mlp": 0.01041531, "balance_loss_clip": 1.06325042, "balance_loss_mlp": 1.03126121, "epoch": 0.18721818072506463, "flos": 40734550529280.0, "grad_norm": 2.049736313733444, "language_loss": 0.7841152, "learning_rate": 3.7464079944109904e-06, "loss": 0.80664271, "num_input_tokens_seen": 33055135, "step": 1557, "time_per_iteration": 2.792910575866699 }, { "auxiliary_loss_clip": 0.01177893, "auxiliary_loss_mlp": 0.01035192, "balance_loss_clip": 1.0555048, "balance_loss_mlp": 1.0245347, "epoch": 0.18733842361570371, "flos": 22157386392960.0, "grad_norm": 1.9534016767966933, "language_loss": 0.772452, "learning_rate": 3.746028226322376e-06, "loss": 0.79458284, "num_input_tokens_seen": 33071015, "step": 1558, "time_per_iteration": 2.7582898139953613 }, { "auxiliary_loss_clip": 0.01187852, "auxiliary_loss_mlp": 0.01034534, "balance_loss_clip": 1.05721664, "balance_loss_mlp": 1.0240612, "epoch": 0.18745866650634282, "flos": 18914940656640.0, "grad_norm": 2.0698175452531005, "language_loss": 0.75213742, "learning_rate": 3.745648193362669e-06, "loss": 0.77436125, "num_input_tokens_seen": 33090370, "step": 1559, "time_per_iteration": 2.752578020095825 }, { "auxiliary_loss_clip": 0.01187528, "auxiliary_loss_mlp": 0.01038606, "balance_loss_clip": 1.05856013, "balance_loss_mlp": 1.02921855, "epoch": 0.1875789093969819, "flos": 19314626267520.0, "grad_norm": 1.9585756823152098, "language_loss": 0.7227543, "learning_rate": 3.745267895589518e-06, "loss": 0.74501562, "num_input_tokens_seen": 33108910, "step": 1560, "time_per_iteration": 2.6389782428741455 }, { "auxiliary_loss_clip": 0.01190118, "auxiliary_loss_mlp": 0.01038619, "balance_loss_clip": 1.05855727, "balance_loss_mlp": 1.0280925, "epoch": 0.187699152287621, "flos": 17018965169280.0, "grad_norm": 2.4000054849976897, "language_loss": 0.8227638, "learning_rate": 3.7448873330606154e-06, "loss": 0.84505117, "num_input_tokens_seen": 33126680, "step": 1561, "time_per_iteration": 2.676196575164795 }, { "auxiliary_loss_clip": 0.01165676, "auxiliary_loss_mlp": 0.01040402, "balance_loss_clip": 1.05524015, "balance_loss_mlp": 1.02898169, "epoch": 0.18781939517826007, "flos": 22346384780160.0, "grad_norm": 1.9902395232447103, "language_loss": 0.87343127, "learning_rate": 3.7445065058336914e-06, "loss": 0.89549208, "num_input_tokens_seen": 33145550, "step": 1562, "time_per_iteration": 2.705169439315796 }, { "auxiliary_loss_clip": 0.01147191, "auxiliary_loss_mlp": 0.01036849, "balance_loss_clip": 1.05046642, "balance_loss_mlp": 1.02679372, "epoch": 0.18793963806889918, "flos": 14611478054400.0, "grad_norm": 1.909371913225208, "language_loss": 0.86465353, "learning_rate": 3.7441254139665176e-06, "loss": 0.88649392, "num_input_tokens_seen": 33161735, "step": 1563, "time_per_iteration": 2.7222046852111816 }, { "auxiliary_loss_clip": 0.01223611, "auxiliary_loss_mlp": 0.01032982, "balance_loss_clip": 1.06577659, "balance_loss_mlp": 1.02301657, "epoch": 0.18805988095953827, "flos": 17457075354240.0, "grad_norm": 1.6763679024808564, "language_loss": 0.82419741, "learning_rate": 3.743744057516905e-06, "loss": 0.84676337, "num_input_tokens_seen": 33179795, "step": 1564, "time_per_iteration": 2.580401659011841 }, { "auxiliary_loss_clip": 0.01160257, "auxiliary_loss_mlp": 0.01037641, "balance_loss_clip": 1.05504799, "balance_loss_mlp": 1.02694821, "epoch": 0.18818012385017735, "flos": 15043877976960.0, "grad_norm": 5.793394314832816, "language_loss": 0.87681663, "learning_rate": 3.743362436542706e-06, "loss": 0.8987956, "num_input_tokens_seen": 33194485, "step": 1565, "time_per_iteration": 2.7685439586639404 }, { "auxiliary_loss_clip": 0.012181, "auxiliary_loss_mlp": 0.01035666, "balance_loss_clip": 1.06113791, "balance_loss_mlp": 1.02571225, "epoch": 0.18830036674081646, "flos": 47551975136640.0, "grad_norm": 2.1582880945463363, "language_loss": 0.76736325, "learning_rate": 3.7429805511018115e-06, "loss": 0.7899009, "num_input_tokens_seen": 33216145, "step": 1566, "time_per_iteration": 2.832289695739746 }, { "auxiliary_loss_clip": 0.01173628, "auxiliary_loss_mlp": 0.00764729, "balance_loss_clip": 1.0582372, "balance_loss_mlp": 1.00038528, "epoch": 0.18842060963145554, "flos": 30044626698240.0, "grad_norm": 2.216693082363248, "language_loss": 0.78352576, "learning_rate": 3.7425984012521524e-06, "loss": 0.80290937, "num_input_tokens_seen": 33236345, "step": 1567, "time_per_iteration": 2.8060951232910156 }, { "auxiliary_loss_clip": 0.01073331, "auxiliary_loss_mlp": 0.00756162, "balance_loss_clip": 1.02222025, "balance_loss_mlp": 1.00105119, "epoch": 0.18854085252209463, "flos": 70318372625280.0, "grad_norm": 0.7425552400340839, "language_loss": 0.60485101, "learning_rate": 3.7422159870517025e-06, "loss": 0.62314594, "num_input_tokens_seen": 33301600, "step": 1568, "time_per_iteration": 3.280465602874756 }, { "auxiliary_loss_clip": 0.01185657, "auxiliary_loss_mlp": 0.01035624, "balance_loss_clip": 1.05711293, "balance_loss_mlp": 1.02600336, "epoch": 0.1886610954127337, "flos": 21289318410240.0, "grad_norm": 1.8437193005744055, "language_loss": 0.78954947, "learning_rate": 3.7418333085584717e-06, "loss": 0.81176227, "num_input_tokens_seen": 33322785, "step": 1569, "time_per_iteration": 2.7407495975494385 }, { "auxiliary_loss_clip": 0.0117684, "auxiliary_loss_mlp": 0.01035543, "balance_loss_clip": 1.05848289, "balance_loss_mlp": 1.02524924, "epoch": 0.18878133830337282, "flos": 17266819991040.0, "grad_norm": 2.1849065105825303, "language_loss": 0.90973103, "learning_rate": 3.7414503658305128e-06, "loss": 0.93185484, "num_input_tokens_seen": 33340020, "step": 1570, "time_per_iteration": 2.7343006134033203 }, { "auxiliary_loss_clip": 0.01166993, "auxiliary_loss_mlp": 0.01038624, "balance_loss_clip": 1.05254006, "balance_loss_mlp": 1.02765059, "epoch": 0.1889015811940119, "flos": 25775207210880.0, "grad_norm": 2.3411829510861315, "language_loss": 0.77812481, "learning_rate": 3.7410671589259185e-06, "loss": 0.80018103, "num_input_tokens_seen": 33358620, "step": 1571, "time_per_iteration": 2.7734787464141846 }, { "auxiliary_loss_clip": 0.01220217, "auxiliary_loss_mlp": 0.01038748, "balance_loss_clip": 1.06310534, "balance_loss_mlp": 1.02805507, "epoch": 0.18902182408465099, "flos": 21032197879680.0, "grad_norm": 2.1706876567441715, "language_loss": 0.79648364, "learning_rate": 3.7406836879028205e-06, "loss": 0.81907332, "num_input_tokens_seen": 33378845, "step": 1572, "time_per_iteration": 2.6460819244384766 }, { "auxiliary_loss_clip": 0.01199143, "auxiliary_loss_mlp": 0.01034739, "balance_loss_clip": 1.05855775, "balance_loss_mlp": 1.02451706, "epoch": 0.1891420669752901, "flos": 22272121411200.0, "grad_norm": 4.345703891062653, "language_loss": 0.76719952, "learning_rate": 3.7402999528193907e-06, "loss": 0.78953826, "num_input_tokens_seen": 33398345, "step": 1573, "time_per_iteration": 2.629659414291382 }, { "auxiliary_loss_clip": 0.01162275, "auxiliary_loss_mlp": 0.00764961, "balance_loss_clip": 1.05411887, "balance_loss_mlp": 1.00051212, "epoch": 0.18926230986592918, "flos": 22017802141440.0, "grad_norm": 5.380342744314285, "language_loss": 0.8582077, "learning_rate": 3.739915953733842e-06, "loss": 0.87748003, "num_input_tokens_seen": 33416390, "step": 1574, "time_per_iteration": 2.768580198287964 }, { "auxiliary_loss_clip": 0.01217959, "auxiliary_loss_mlp": 0.01037479, "balance_loss_clip": 1.05993533, "balance_loss_mlp": 1.02704859, "epoch": 0.18938255275656826, "flos": 24462672336000.0, "grad_norm": 1.7624967846574133, "language_loss": 0.82347304, "learning_rate": 3.7395316907044264e-06, "loss": 0.84602743, "num_input_tokens_seen": 33437175, "step": 1575, "time_per_iteration": 3.5580410957336426 }, { "auxiliary_loss_clip": 0.01204102, "auxiliary_loss_mlp": 0.01035411, "balance_loss_clip": 1.06009257, "balance_loss_mlp": 1.02553463, "epoch": 0.18950279564720737, "flos": 24427049022720.0, "grad_norm": 1.6713884459230004, "language_loss": 0.79664993, "learning_rate": 3.7391471637894364e-06, "loss": 0.81904507, "num_input_tokens_seen": 33459440, "step": 1576, "time_per_iteration": 2.676105260848999 }, { "auxiliary_loss_clip": 0.01172574, "auxiliary_loss_mlp": 0.01032558, "balance_loss_clip": 1.05108213, "balance_loss_mlp": 1.02276516, "epoch": 0.18962303853784646, "flos": 19756291898880.0, "grad_norm": 2.007904152228609, "language_loss": 0.85011685, "learning_rate": 3.738762373047205e-06, "loss": 0.87216818, "num_input_tokens_seen": 33479360, "step": 1577, "time_per_iteration": 2.739616632461548 }, { "auxiliary_loss_clip": 0.01178191, "auxiliary_loss_mlp": 0.01038831, "balance_loss_clip": 1.05817544, "balance_loss_mlp": 1.02928281, "epoch": 0.18974328142848554, "flos": 21032054225280.0, "grad_norm": 1.874922164716666, "language_loss": 0.8353647, "learning_rate": 3.738377318536103e-06, "loss": 0.85753495, "num_input_tokens_seen": 33499245, "step": 1578, "time_per_iteration": 2.706434726715088 }, { "auxiliary_loss_clip": 0.01215787, "auxiliary_loss_mlp": 0.01030252, "balance_loss_clip": 1.06143451, "balance_loss_mlp": 1.0209949, "epoch": 0.18986352431912462, "flos": 12966122736000.0, "grad_norm": 2.0384051017475584, "language_loss": 0.71717292, "learning_rate": 3.7379920003145447e-06, "loss": 0.73963332, "num_input_tokens_seen": 33513520, "step": 1579, "time_per_iteration": 3.548917531967163 }, { "auxiliary_loss_clip": 0.01182055, "auxiliary_loss_mlp": 0.01041239, "balance_loss_clip": 1.05852056, "balance_loss_mlp": 1.03079617, "epoch": 0.18998376720976373, "flos": 23767908497280.0, "grad_norm": 1.6697420886148606, "language_loss": 0.83789343, "learning_rate": 3.7376064184409817e-06, "loss": 0.86012638, "num_input_tokens_seen": 33533100, "step": 1580, "time_per_iteration": 3.5903079509735107 }, { "auxiliary_loss_clip": 0.01186505, "auxiliary_loss_mlp": 0.01036881, "balance_loss_clip": 1.05727291, "balance_loss_mlp": 1.02682018, "epoch": 0.19010401010040281, "flos": 22966023323520.0, "grad_norm": 1.498791558434531, "language_loss": 0.8705529, "learning_rate": 3.7372205729739063e-06, "loss": 0.89278674, "num_input_tokens_seen": 33554915, "step": 1581, "time_per_iteration": 2.695162773132324 }, { "auxiliary_loss_clip": 0.01209502, "auxiliary_loss_mlp": 0.01042128, "balance_loss_clip": 1.06037307, "balance_loss_mlp": 1.03178096, "epoch": 0.1902242529910419, "flos": 19135647774720.0, "grad_norm": 2.115984785384188, "language_loss": 0.71743214, "learning_rate": 3.7368344639718514e-06, "loss": 0.73994851, "num_input_tokens_seen": 33572850, "step": 1582, "time_per_iteration": 2.7316370010375977 }, { "auxiliary_loss_clip": 0.01203752, "auxiliary_loss_mlp": 0.01038249, "balance_loss_clip": 1.05649948, "balance_loss_mlp": 1.02812862, "epoch": 0.190344495881681, "flos": 25483935824640.0, "grad_norm": 1.5873846863794812, "language_loss": 0.80469447, "learning_rate": 3.7364480914933895e-06, "loss": 0.82711452, "num_input_tokens_seen": 33593090, "step": 1583, "time_per_iteration": 2.6865155696868896 }, { "auxiliary_loss_clip": 0.01155495, "auxiliary_loss_mlp": 0.00764898, "balance_loss_clip": 1.05501914, "balance_loss_mlp": 1.0003345, "epoch": 0.1904647387723201, "flos": 26792843425920.0, "grad_norm": 2.0179144475329407, "language_loss": 0.80900228, "learning_rate": 3.7360614555971325e-06, "loss": 0.82820618, "num_input_tokens_seen": 33612745, "step": 1584, "time_per_iteration": 2.7985048294067383 }, { "auxiliary_loss_clip": 0.01204705, "auxiliary_loss_mlp": 0.00764631, "balance_loss_clip": 1.06078291, "balance_loss_mlp": 1.0003469, "epoch": 0.19058498166295917, "flos": 23987753688960.0, "grad_norm": 2.141482612512167, "language_loss": 0.85261488, "learning_rate": 3.735674556341733e-06, "loss": 0.87230819, "num_input_tokens_seen": 33632360, "step": 1585, "time_per_iteration": 2.681871175765991 }, { "auxiliary_loss_clip": 0.01187675, "auxiliary_loss_mlp": 0.01035638, "balance_loss_clip": 1.05938864, "balance_loss_mlp": 1.02486181, "epoch": 0.19070522455359826, "flos": 28293299280000.0, "grad_norm": 4.068689563343202, "language_loss": 0.8290869, "learning_rate": 3.7352873937858835e-06, "loss": 0.85132003, "num_input_tokens_seen": 33653895, "step": 1586, "time_per_iteration": 2.7686574459075928 }, { "auxiliary_loss_clip": 0.01170021, "auxiliary_loss_mlp": 0.00764413, "balance_loss_clip": 1.0569011, "balance_loss_mlp": 1.00036395, "epoch": 0.19082546744423737, "flos": 25660220797440.0, "grad_norm": 1.974835517063537, "language_loss": 0.72060329, "learning_rate": 3.734899967988316e-06, "loss": 0.73994762, "num_input_tokens_seen": 33672075, "step": 1587, "time_per_iteration": 2.745598554611206 }, { "auxiliary_loss_clip": 0.01167828, "auxiliary_loss_mlp": 0.01035026, "balance_loss_clip": 1.05303645, "balance_loss_mlp": 1.02517295, "epoch": 0.19094571033487645, "flos": 19719483436800.0, "grad_norm": 1.8850345534381805, "language_loss": 0.8374157, "learning_rate": 3.7345122790078026e-06, "loss": 0.85944414, "num_input_tokens_seen": 33689640, "step": 1588, "time_per_iteration": 2.72975492477417 }, { "auxiliary_loss_clip": 0.01202308, "auxiliary_loss_mlp": 0.01036479, "balance_loss_clip": 1.05875206, "balance_loss_mlp": 1.02590537, "epoch": 0.19106595322551553, "flos": 21616320850560.0, "grad_norm": 2.803734634021549, "language_loss": 0.93232375, "learning_rate": 3.7341243269031556e-06, "loss": 0.95471156, "num_input_tokens_seen": 33708630, "step": 1589, "time_per_iteration": 2.628833055496216 }, { "auxiliary_loss_clip": 0.01181731, "auxiliary_loss_mlp": 0.01033928, "balance_loss_clip": 1.05804682, "balance_loss_mlp": 1.02414083, "epoch": 0.19118619611615464, "flos": 29896890059520.0, "grad_norm": 2.6499761573150264, "language_loss": 0.77434117, "learning_rate": 3.7337361117332275e-06, "loss": 0.7964977, "num_input_tokens_seen": 33730370, "step": 1590, "time_per_iteration": 2.7652671337127686 }, { "auxiliary_loss_clip": 0.01177619, "auxiliary_loss_mlp": 0.01039383, "balance_loss_clip": 1.0554738, "balance_loss_mlp": 1.02883291, "epoch": 0.19130643900679373, "flos": 17273428093440.0, "grad_norm": 2.02621404764273, "language_loss": 0.76929939, "learning_rate": 3.7333476335569087e-06, "loss": 0.7914694, "num_input_tokens_seen": 33748370, "step": 1591, "time_per_iteration": 2.7320308685302734 }, { "auxiliary_loss_clip": 0.01189601, "auxiliary_loss_mlp": 0.01035784, "balance_loss_clip": 1.05841565, "balance_loss_mlp": 1.02534735, "epoch": 0.1914266818974328, "flos": 24826339584000.0, "grad_norm": 6.864705202975076, "language_loss": 0.66954768, "learning_rate": 3.7329588924331325e-06, "loss": 0.69180149, "num_input_tokens_seen": 33769575, "step": 1592, "time_per_iteration": 2.6901040077209473 }, { "auxiliary_loss_clip": 0.01167635, "auxiliary_loss_mlp": 0.01033345, "balance_loss_clip": 1.0541029, "balance_loss_mlp": 1.02349818, "epoch": 0.1915469247880719, "flos": 18952467390720.0, "grad_norm": 1.8818494644286625, "language_loss": 0.8277123, "learning_rate": 3.732569888420871e-06, "loss": 0.84972215, "num_input_tokens_seen": 33789110, "step": 1593, "time_per_iteration": 2.828733444213867 }, { "auxiliary_loss_clip": 0.01223582, "auxiliary_loss_mlp": 0.01038814, "balance_loss_clip": 1.05992627, "balance_loss_mlp": 1.02851427, "epoch": 0.191667167678711, "flos": 21032952065280.0, "grad_norm": 3.0410573350660997, "language_loss": 0.8227663, "learning_rate": 3.732180621579134e-06, "loss": 0.8453902, "num_input_tokens_seen": 33808325, "step": 1594, "time_per_iteration": 2.6834306716918945 }, { "auxiliary_loss_clip": 0.01191656, "auxiliary_loss_mlp": 0.01044208, "balance_loss_clip": 1.0617007, "balance_loss_mlp": 1.03338969, "epoch": 0.1917874105693501, "flos": 34237663914240.0, "grad_norm": 1.951970570191408, "language_loss": 0.81202137, "learning_rate": 3.7317910919669745e-06, "loss": 0.83437997, "num_input_tokens_seen": 33829520, "step": 1595, "time_per_iteration": 2.8777410984039307 }, { "auxiliary_loss_clip": 0.01205437, "auxiliary_loss_mlp": 0.01046728, "balance_loss_clip": 1.06299031, "balance_loss_mlp": 1.03521812, "epoch": 0.19190765345998917, "flos": 23550613171200.0, "grad_norm": 2.3180453556524134, "language_loss": 0.75839764, "learning_rate": 3.7314012996434826e-06, "loss": 0.78091931, "num_input_tokens_seen": 33848250, "step": 1596, "time_per_iteration": 2.6536946296691895 }, { "auxiliary_loss_clip": 0.01191873, "auxiliary_loss_mlp": 0.01037822, "balance_loss_clip": 1.05964255, "balance_loss_mlp": 1.02804041, "epoch": 0.19202789635062828, "flos": 19861330245120.0, "grad_norm": 1.870282944765852, "language_loss": 0.81197864, "learning_rate": 3.7310112446677907e-06, "loss": 0.8342756, "num_input_tokens_seen": 33866160, "step": 1597, "time_per_iteration": 2.710824489593506 }, { "auxiliary_loss_clip": 0.012236, "auxiliary_loss_mlp": 0.01041152, "balance_loss_clip": 1.064816, "balance_loss_mlp": 1.03011894, "epoch": 0.19214813924126736, "flos": 20922957642240.0, "grad_norm": 3.753689376995639, "language_loss": 0.69051981, "learning_rate": 3.7306209270990695e-06, "loss": 0.71316731, "num_input_tokens_seen": 33884165, "step": 1598, "time_per_iteration": 2.621387004852295 }, { "auxiliary_loss_clip": 0.01193857, "auxiliary_loss_mlp": 0.01043922, "balance_loss_clip": 1.06226587, "balance_loss_mlp": 1.03323483, "epoch": 0.19226838213190645, "flos": 26359725231360.0, "grad_norm": 2.035298772948818, "language_loss": 0.86845267, "learning_rate": 3.7302303469965292e-06, "loss": 0.8908304, "num_input_tokens_seen": 33903705, "step": 1599, "time_per_iteration": 2.737128734588623 }, { "auxiliary_loss_clip": 0.0120864, "auxiliary_loss_mlp": 0.01035745, "balance_loss_clip": 1.06500137, "balance_loss_mlp": 1.02584434, "epoch": 0.19238862502254553, "flos": 20850525866880.0, "grad_norm": 1.762670257314, "language_loss": 0.70751715, "learning_rate": 3.7298395044194206e-06, "loss": 0.72996098, "num_input_tokens_seen": 33922515, "step": 1600, "time_per_iteration": 2.620255470275879 }, { "auxiliary_loss_clip": 0.01222166, "auxiliary_loss_mlp": 0.01034266, "balance_loss_clip": 1.06487823, "balance_loss_mlp": 1.02424037, "epoch": 0.19250886791318464, "flos": 21726063878400.0, "grad_norm": 2.7357127338932568, "language_loss": 0.94362319, "learning_rate": 3.7294483994270356e-06, "loss": 0.96618748, "num_input_tokens_seen": 33940840, "step": 1601, "time_per_iteration": 2.6158266067504883 }, { "auxiliary_loss_clip": 0.01152619, "auxiliary_loss_mlp": 0.01039455, "balance_loss_clip": 1.05232263, "balance_loss_mlp": 1.02971578, "epoch": 0.19262911080382372, "flos": 23367827836800.0, "grad_norm": 2.7854940335818568, "language_loss": 0.7802276, "learning_rate": 3.7290570320787033e-06, "loss": 0.80214834, "num_input_tokens_seen": 33960420, "step": 1602, "time_per_iteration": 3.7063117027282715 }, { "auxiliary_loss_clip": 0.0120658, "auxiliary_loss_mlp": 0.01036808, "balance_loss_clip": 1.06370831, "balance_loss_mlp": 1.02691936, "epoch": 0.1927493536944628, "flos": 21943502858880.0, "grad_norm": 2.0727468896605106, "language_loss": 0.71819353, "learning_rate": 3.728665402433793e-06, "loss": 0.74062741, "num_input_tokens_seen": 33978990, "step": 1603, "time_per_iteration": 2.6120762825012207 }, { "auxiliary_loss_clip": 0.01188657, "auxiliary_loss_mlp": 0.01031847, "balance_loss_clip": 1.0603888, "balance_loss_mlp": 1.02200651, "epoch": 0.19286959658510192, "flos": 16545590807040.0, "grad_norm": 2.4437673649739304, "language_loss": 0.86346799, "learning_rate": 3.7282735105517164e-06, "loss": 0.88567299, "num_input_tokens_seen": 33997115, "step": 1604, "time_per_iteration": 2.690920114517212 }, { "auxiliary_loss_clip": 0.01165931, "auxiliary_loss_mlp": 0.01043488, "balance_loss_clip": 1.05529737, "balance_loss_mlp": 1.03249681, "epoch": 0.192989839475741, "flos": 21616967295360.0, "grad_norm": 2.1402701626119898, "language_loss": 0.6792804, "learning_rate": 3.727881356491922e-06, "loss": 0.70137459, "num_input_tokens_seen": 34015525, "step": 1605, "time_per_iteration": 3.6823296546936035 }, { "auxiliary_loss_clip": 0.01225588, "auxiliary_loss_mlp": 0.01042151, "balance_loss_clip": 1.06856585, "balance_loss_mlp": 1.03245914, "epoch": 0.19311008236638008, "flos": 19281516906240.0, "grad_norm": 1.7567586223534826, "language_loss": 0.75727755, "learning_rate": 3.7274889403139002e-06, "loss": 0.77995497, "num_input_tokens_seen": 34033150, "step": 1606, "time_per_iteration": 4.3374645709991455 }, { "auxiliary_loss_clip": 0.01156219, "auxiliary_loss_mlp": 0.01030953, "balance_loss_clip": 1.05780292, "balance_loss_mlp": 1.02104712, "epoch": 0.1932303252570192, "flos": 28652369587200.0, "grad_norm": 2.1572398440936387, "language_loss": 0.78492785, "learning_rate": 3.727096262077179e-06, "loss": 0.80679953, "num_input_tokens_seen": 34052145, "step": 1607, "time_per_iteration": 2.8018698692321777 }, { "auxiliary_loss_clip": 0.01206899, "auxiliary_loss_mlp": 0.01036742, "balance_loss_clip": 1.06037724, "balance_loss_mlp": 1.02663291, "epoch": 0.19335056814765827, "flos": 18368990864640.0, "grad_norm": 1.7816225885615478, "language_loss": 0.85400224, "learning_rate": 3.7267033218413285e-06, "loss": 0.87643868, "num_input_tokens_seen": 34069940, "step": 1608, "time_per_iteration": 2.617530345916748 }, { "auxiliary_loss_clip": 0.01144398, "auxiliary_loss_mlp": 0.01040793, "balance_loss_clip": 1.04822755, "balance_loss_mlp": 1.03029084, "epoch": 0.19347081103829736, "flos": 13260877741440.0, "grad_norm": 2.0671221688213035, "language_loss": 0.81318182, "learning_rate": 3.726310119665957e-06, "loss": 0.83503377, "num_input_tokens_seen": 34086275, "step": 1609, "time_per_iteration": 2.738497734069824 }, { "auxiliary_loss_clip": 0.01204686, "auxiliary_loss_mlp": 0.0104103, "balance_loss_clip": 1.06112373, "balance_loss_mlp": 1.03017569, "epoch": 0.19359105392893644, "flos": 20300122788480.0, "grad_norm": 1.8434858420124096, "language_loss": 0.85479486, "learning_rate": 3.725916655610713e-06, "loss": 0.8772521, "num_input_tokens_seen": 34105605, "step": 1610, "time_per_iteration": 2.6593339443206787 }, { "auxiliary_loss_clip": 0.01183212, "auxiliary_loss_mlp": 0.01037507, "balance_loss_clip": 1.05775452, "balance_loss_mlp": 1.02583039, "epoch": 0.19371129681957555, "flos": 20484596062080.0, "grad_norm": 6.557986718030366, "language_loss": 0.75276494, "learning_rate": 3.725522929735284e-06, "loss": 0.77497208, "num_input_tokens_seen": 34122540, "step": 1611, "time_per_iteration": 2.6925272941589355 }, { "auxiliary_loss_clip": 0.0119619, "auxiliary_loss_mlp": 0.01035871, "balance_loss_clip": 1.05735564, "balance_loss_mlp": 1.0257262, "epoch": 0.19383153971021463, "flos": 30445497457920.0, "grad_norm": 1.963077797995845, "language_loss": 0.74259394, "learning_rate": 3.725128942099399e-06, "loss": 0.76491457, "num_input_tokens_seen": 34142940, "step": 1612, "time_per_iteration": 2.819523334503174 }, { "auxiliary_loss_clip": 0.01181183, "auxiliary_loss_mlp": 0.01035892, "balance_loss_clip": 1.05540347, "balance_loss_mlp": 1.0256815, "epoch": 0.19395178260085372, "flos": 24569937325440.0, "grad_norm": 2.1590773267294163, "language_loss": 0.79879105, "learning_rate": 3.7247346927628245e-06, "loss": 0.82096183, "num_input_tokens_seen": 34162875, "step": 1613, "time_per_iteration": 2.7619969844818115 }, { "auxiliary_loss_clip": 0.0118884, "auxiliary_loss_mlp": 0.0076609, "balance_loss_clip": 1.05869246, "balance_loss_mlp": 1.00037909, "epoch": 0.19407202549149283, "flos": 28950608211840.0, "grad_norm": 1.8465221731833261, "language_loss": 0.79274416, "learning_rate": 3.7243401817853694e-06, "loss": 0.81229347, "num_input_tokens_seen": 34183565, "step": 1614, "time_per_iteration": 2.769099235534668 }, { "auxiliary_loss_clip": 0.01194108, "auxiliary_loss_mlp": 0.01037405, "balance_loss_clip": 1.05786669, "balance_loss_mlp": 1.02812481, "epoch": 0.1941922683821319, "flos": 18004497603840.0, "grad_norm": 1.8103876307679714, "language_loss": 0.71763432, "learning_rate": 3.723945409226879e-06, "loss": 0.73994946, "num_input_tokens_seen": 34202055, "step": 1615, "time_per_iteration": 2.701265573501587 }, { "auxiliary_loss_clip": 0.01202262, "auxiliary_loss_mlp": 0.01043159, "balance_loss_clip": 1.0570035, "balance_loss_mlp": 1.03273416, "epoch": 0.194312511272771, "flos": 9720337034880.0, "grad_norm": 2.2771167668968153, "language_loss": 0.80291224, "learning_rate": 3.723550375147241e-06, "loss": 0.8253665, "num_input_tokens_seen": 34216830, "step": 1616, "time_per_iteration": 2.653162956237793 }, { "auxiliary_loss_clip": 0.01165251, "auxiliary_loss_mlp": 0.01038034, "balance_loss_clip": 1.05426133, "balance_loss_mlp": 1.0279727, "epoch": 0.19443275416341008, "flos": 27016208150400.0, "grad_norm": 2.165658358875777, "language_loss": 0.80520159, "learning_rate": 3.7231550796063816e-06, "loss": 0.82723451, "num_input_tokens_seen": 34236840, "step": 1617, "time_per_iteration": 2.7206828594207764 }, { "auxiliary_loss_clip": 0.01196668, "auxiliary_loss_mlp": 0.01037117, "balance_loss_clip": 1.06060588, "balance_loss_mlp": 1.02574456, "epoch": 0.1945529970540492, "flos": 15846625077120.0, "grad_norm": 1.9546624966010528, "language_loss": 0.64659065, "learning_rate": 3.722759522664266e-06, "loss": 0.6689285, "num_input_tokens_seen": 34254140, "step": 1618, "time_per_iteration": 2.6727190017700195 }, { "auxiliary_loss_clip": 0.01160696, "auxiliary_loss_mlp": 0.01035625, "balance_loss_clip": 1.05421662, "balance_loss_mlp": 1.02559912, "epoch": 0.19467323994468827, "flos": 19314985403520.0, "grad_norm": 3.1102726666328198, "language_loss": 0.82070327, "learning_rate": 3.7223637043809016e-06, "loss": 0.84266645, "num_input_tokens_seen": 34273120, "step": 1619, "time_per_iteration": 2.6848127841949463 }, { "auxiliary_loss_clip": 0.01180828, "auxiliary_loss_mlp": 0.01033686, "balance_loss_clip": 1.06012321, "balance_loss_mlp": 1.02308261, "epoch": 0.19479348283532735, "flos": 24133227770880.0, "grad_norm": 1.8725404678165212, "language_loss": 0.86233091, "learning_rate": 3.7219676248163322e-06, "loss": 0.88447607, "num_input_tokens_seen": 34290285, "step": 1620, "time_per_iteration": 2.7272744178771973 }, { "auxiliary_loss_clip": 0.01208096, "auxiliary_loss_mlp": 0.01033054, "balance_loss_clip": 1.06157351, "balance_loss_mlp": 1.02289128, "epoch": 0.19491372572596646, "flos": 25775638174080.0, "grad_norm": 1.9724666738993923, "language_loss": 0.93326533, "learning_rate": 3.721571284030643e-06, "loss": 0.95567691, "num_input_tokens_seen": 34310095, "step": 1621, "time_per_iteration": 2.671029806137085 }, { "auxiliary_loss_clip": 0.01207851, "auxiliary_loss_mlp": 0.0103608, "balance_loss_clip": 1.06112516, "balance_loss_mlp": 1.02604258, "epoch": 0.19503396861660555, "flos": 19645220067840.0, "grad_norm": 2.11034263706293, "language_loss": 0.7865411, "learning_rate": 3.7211746820839587e-06, "loss": 0.80898041, "num_input_tokens_seen": 34327190, "step": 1622, "time_per_iteration": 2.646682024002075 }, { "auxiliary_loss_clip": 0.01115296, "auxiliary_loss_mlp": 0.0103223, "balance_loss_clip": 1.04758763, "balance_loss_mlp": 1.02154839, "epoch": 0.19515421150724463, "flos": 21033023892480.0, "grad_norm": 1.7202495622638343, "language_loss": 0.81000853, "learning_rate": 3.7207778190364437e-06, "loss": 0.83148378, "num_input_tokens_seen": 34345615, "step": 1623, "time_per_iteration": 2.862776279449463 }, { "auxiliary_loss_clip": 0.01125254, "auxiliary_loss_mlp": 0.01038618, "balance_loss_clip": 1.04838061, "balance_loss_mlp": 1.02908158, "epoch": 0.1952744543978837, "flos": 32961255143040.0, "grad_norm": 1.7146883622886562, "language_loss": 0.74363679, "learning_rate": 3.720380694948302e-06, "loss": 0.76527548, "num_input_tokens_seen": 34368500, "step": 1624, "time_per_iteration": 2.845585823059082 }, { "auxiliary_loss_clip": 0.0107259, "auxiliary_loss_mlp": 0.01002951, "balance_loss_clip": 1.02277088, "balance_loss_mlp": 1.0002923, "epoch": 0.19539469728852282, "flos": 64044312030720.0, "grad_norm": 1.0376579821521623, "language_loss": 0.71220857, "learning_rate": 3.719983309879777e-06, "loss": 0.73296392, "num_input_tokens_seen": 34428280, "step": 1625, "time_per_iteration": 3.3515465259552 }, { "auxiliary_loss_clip": 0.01162774, "auxiliary_loss_mlp": 0.01030777, "balance_loss_clip": 1.0530616, "balance_loss_mlp": 1.02048278, "epoch": 0.1955149401791619, "flos": 13370908078080.0, "grad_norm": 2.1474978154627813, "language_loss": 0.77619386, "learning_rate": 3.719585663891151e-06, "loss": 0.79812938, "num_input_tokens_seen": 34445815, "step": 1626, "time_per_iteration": 2.7452781200408936 }, { "auxiliary_loss_clip": 0.01148647, "auxiliary_loss_mlp": 0.01040302, "balance_loss_clip": 1.05319595, "balance_loss_mlp": 1.02975178, "epoch": 0.195635183069801, "flos": 18728887184640.0, "grad_norm": 2.0570827351973087, "language_loss": 0.78805733, "learning_rate": 3.719187757042747e-06, "loss": 0.80994684, "num_input_tokens_seen": 34463635, "step": 1627, "time_per_iteration": 3.778257369995117 }, { "auxiliary_loss_clip": 0.01086681, "auxiliary_loss_mlp": 0.01003099, "balance_loss_clip": 1.02014399, "balance_loss_mlp": 1.00058353, "epoch": 0.1957554259604401, "flos": 69313952615040.0, "grad_norm": 0.7270530549696459, "language_loss": 0.54912388, "learning_rate": 3.7187895893949275e-06, "loss": 0.57002163, "num_input_tokens_seen": 34530105, "step": 1628, "time_per_iteration": 3.36309552192688 }, { "auxiliary_loss_clip": 0.01145026, "auxiliary_loss_mlp": 0.0103547, "balance_loss_clip": 1.04988635, "balance_loss_mlp": 1.02501535, "epoch": 0.19587566885107918, "flos": 21069257736960.0, "grad_norm": 2.5348951175679564, "language_loss": 0.75784409, "learning_rate": 3.7183911610080937e-06, "loss": 0.77964914, "num_input_tokens_seen": 34546970, "step": 1629, "time_per_iteration": 2.732863664627075 }, { "auxiliary_loss_clip": 0.01176398, "auxiliary_loss_mlp": 0.01038151, "balance_loss_clip": 1.05638313, "balance_loss_mlp": 1.02742171, "epoch": 0.19599591174171827, "flos": 22194661731840.0, "grad_norm": 3.3058366412703197, "language_loss": 0.75441158, "learning_rate": 3.7179924719426872e-06, "loss": 0.77655709, "num_input_tokens_seen": 34564865, "step": 1630, "time_per_iteration": 3.665860891342163 }, { "auxiliary_loss_clip": 0.01210037, "auxiliary_loss_mlp": 0.01036393, "balance_loss_clip": 1.06387007, "balance_loss_mlp": 1.02538955, "epoch": 0.19611615463235738, "flos": 23768375374080.0, "grad_norm": 2.9257658447004586, "language_loss": 0.76209515, "learning_rate": 3.7175935222591885e-06, "loss": 0.78455943, "num_input_tokens_seen": 34584165, "step": 1631, "time_per_iteration": 3.63474178314209 }, { "auxiliary_loss_clip": 0.01196353, "auxiliary_loss_mlp": 0.01039639, "balance_loss_clip": 1.06337619, "balance_loss_mlp": 1.02954149, "epoch": 0.19623639752299646, "flos": 28618218731520.0, "grad_norm": 2.0664447591213055, "language_loss": 0.7441802, "learning_rate": 3.717194312018118e-06, "loss": 0.76654017, "num_input_tokens_seen": 34603150, "step": 1632, "time_per_iteration": 3.656517744064331 }, { "auxiliary_loss_clip": 0.01205469, "auxiliary_loss_mlp": 0.01032213, "balance_loss_clip": 1.05960906, "balance_loss_mlp": 1.02162683, "epoch": 0.19635664041363554, "flos": 21032700670080.0, "grad_norm": 2.2734319481801646, "language_loss": 0.76245272, "learning_rate": 3.716794841280036e-06, "loss": 0.7848295, "num_input_tokens_seen": 34621855, "step": 1633, "time_per_iteration": 2.607369899749756 }, { "auxiliary_loss_clip": 0.01212144, "auxiliary_loss_mlp": 0.01040858, "balance_loss_clip": 1.0621357, "balance_loss_mlp": 1.02940226, "epoch": 0.19647688330427462, "flos": 18879748306560.0, "grad_norm": 3.5203457896433736, "language_loss": 0.77359742, "learning_rate": 3.7163951101055407e-06, "loss": 0.79612744, "num_input_tokens_seen": 34639915, "step": 1634, "time_per_iteration": 2.6446104049682617 }, { "auxiliary_loss_clip": 0.01186148, "auxiliary_loss_mlp": 0.01041353, "balance_loss_clip": 1.06057644, "balance_loss_mlp": 1.03054643, "epoch": 0.19659712619491373, "flos": 24242503921920.0, "grad_norm": 2.008789667085443, "language_loss": 0.79184604, "learning_rate": 3.715995118555273e-06, "loss": 0.81412101, "num_input_tokens_seen": 34659890, "step": 1635, "time_per_iteration": 2.670377492904663 }, { "auxiliary_loss_clip": 0.01158876, "auxiliary_loss_mlp": 0.0104062, "balance_loss_clip": 1.05378532, "balance_loss_mlp": 1.02953315, "epoch": 0.19671736908555282, "flos": 24717422568960.0, "grad_norm": 2.1760719837035603, "language_loss": 0.85765707, "learning_rate": 3.71559486668991e-06, "loss": 0.87965202, "num_input_tokens_seen": 34678750, "step": 1636, "time_per_iteration": 2.76328444480896 }, { "auxiliary_loss_clip": 0.01212096, "auxiliary_loss_mlp": 0.00764534, "balance_loss_clip": 1.06325912, "balance_loss_mlp": 1.00035894, "epoch": 0.1968376119761919, "flos": 23842279607040.0, "grad_norm": 1.911921921568929, "language_loss": 0.77551758, "learning_rate": 3.715194354570169e-06, "loss": 0.79528391, "num_input_tokens_seen": 34698755, "step": 1637, "time_per_iteration": 2.6655447483062744 }, { "auxiliary_loss_clip": 0.01201765, "auxiliary_loss_mlp": 0.0103861, "balance_loss_clip": 1.06042945, "balance_loss_mlp": 1.02773762, "epoch": 0.196957854866831, "flos": 18113917409280.0, "grad_norm": 2.5200842005866777, "language_loss": 0.83330292, "learning_rate": 3.714793582256809e-06, "loss": 0.85570669, "num_input_tokens_seen": 34715820, "step": 1638, "time_per_iteration": 2.6650736331939697 }, { "auxiliary_loss_clip": 0.01219675, "auxiliary_loss_mlp": 0.01034701, "balance_loss_clip": 1.06224465, "balance_loss_mlp": 1.02437758, "epoch": 0.1970780977574701, "flos": 21653129312640.0, "grad_norm": 2.4702991948462785, "language_loss": 0.84924835, "learning_rate": 3.7143925498106253e-06, "loss": 0.87179208, "num_input_tokens_seen": 34734360, "step": 1639, "time_per_iteration": 2.603489637374878 }, { "auxiliary_loss_clip": 0.01187866, "auxiliary_loss_mlp": 0.01045339, "balance_loss_clip": 1.05353928, "balance_loss_mlp": 1.03477144, "epoch": 0.19719834064810918, "flos": 20811813984000.0, "grad_norm": 4.130725524126125, "language_loss": 0.79318523, "learning_rate": 3.7139912572924558e-06, "loss": 0.81551731, "num_input_tokens_seen": 34753390, "step": 1640, "time_per_iteration": 2.755589723587036 }, { "auxiliary_loss_clip": 0.01198583, "auxiliary_loss_mlp": 0.01036873, "balance_loss_clip": 1.05454624, "balance_loss_mlp": 1.0266633, "epoch": 0.19731858353874826, "flos": 23434800744960.0, "grad_norm": 3.1789331643547816, "language_loss": 0.80617511, "learning_rate": 3.7135897047631744e-06, "loss": 0.82852972, "num_input_tokens_seen": 34771275, "step": 1641, "time_per_iteration": 2.6487386226654053 }, { "auxiliary_loss_clip": 0.0119064, "auxiliary_loss_mlp": 0.01035115, "balance_loss_clip": 1.0570358, "balance_loss_mlp": 1.02526796, "epoch": 0.19743882642938737, "flos": 23988184652160.0, "grad_norm": 1.8979611486045749, "language_loss": 0.75852454, "learning_rate": 3.713187892283698e-06, "loss": 0.78078204, "num_input_tokens_seen": 34790885, "step": 1642, "time_per_iteration": 2.7182199954986572 }, { "auxiliary_loss_clip": 0.0115662, "auxiliary_loss_mlp": 0.01034802, "balance_loss_clip": 1.05070972, "balance_loss_mlp": 1.02413881, "epoch": 0.19755906932002645, "flos": 15004340081280.0, "grad_norm": 2.242851995114338, "language_loss": 0.8746227, "learning_rate": 3.71278581991498e-06, "loss": 0.89653695, "num_input_tokens_seen": 34806745, "step": 1643, "time_per_iteration": 2.7273337841033936 }, { "auxiliary_loss_clip": 0.01177831, "auxiliary_loss_mlp": 0.00766101, "balance_loss_clip": 1.06085277, "balance_loss_mlp": 1.00039816, "epoch": 0.19767931221066554, "flos": 19494466686720.0, "grad_norm": 1.820360104805376, "language_loss": 0.78936523, "learning_rate": 3.712383487718015e-06, "loss": 0.80880451, "num_input_tokens_seen": 34824985, "step": 1644, "time_per_iteration": 2.7188034057617188 }, { "auxiliary_loss_clip": 0.01136896, "auxiliary_loss_mlp": 0.01037118, "balance_loss_clip": 1.05050242, "balance_loss_mlp": 1.026896, "epoch": 0.19779955510130465, "flos": 25737895958400.0, "grad_norm": 2.0979741703832544, "language_loss": 0.86931986, "learning_rate": 3.7119808957538365e-06, "loss": 0.89105999, "num_input_tokens_seen": 34843980, "step": 1645, "time_per_iteration": 2.822990894317627 }, { "auxiliary_loss_clip": 0.01179385, "auxiliary_loss_mlp": 0.0103612, "balance_loss_clip": 1.05174506, "balance_loss_mlp": 1.02539754, "epoch": 0.19791979799194373, "flos": 20777699041920.0, "grad_norm": 1.9802365886251696, "language_loss": 0.80001855, "learning_rate": 3.711578044083517e-06, "loss": 0.8221736, "num_input_tokens_seen": 34860780, "step": 1646, "time_per_iteration": 2.6890830993652344 }, { "auxiliary_loss_clip": 0.01186781, "auxiliary_loss_mlp": 0.01037959, "balance_loss_clip": 1.05419421, "balance_loss_mlp": 1.02755785, "epoch": 0.1980400408825828, "flos": 25589010084480.0, "grad_norm": 2.1566918454533126, "language_loss": 0.7448898, "learning_rate": 3.7111749327681698e-06, "loss": 0.76713723, "num_input_tokens_seen": 34880815, "step": 1647, "time_per_iteration": 2.747211217880249 }, { "auxiliary_loss_clip": 0.01208941, "auxiliary_loss_mlp": 0.01036124, "balance_loss_clip": 1.06238294, "balance_loss_mlp": 1.02525854, "epoch": 0.1981602837732219, "flos": 23513840622720.0, "grad_norm": 2.3378797838211516, "language_loss": 0.86383677, "learning_rate": 3.7107715618689455e-06, "loss": 0.88628745, "num_input_tokens_seen": 34899790, "step": 1648, "time_per_iteration": 2.665029764175415 }, { "auxiliary_loss_clip": 0.01201609, "auxiliary_loss_mlp": 0.01035316, "balance_loss_clip": 1.06036925, "balance_loss_mlp": 1.02458668, "epoch": 0.198280526663861, "flos": 23185365724800.0, "grad_norm": 1.4542661382610191, "language_loss": 0.83546567, "learning_rate": 3.710367931447035e-06, "loss": 0.85783488, "num_input_tokens_seen": 34921570, "step": 1649, "time_per_iteration": 2.666713237762451 }, { "auxiliary_loss_clip": 0.01210826, "auxiliary_loss_mlp": 0.01040442, "balance_loss_clip": 1.06071258, "balance_loss_mlp": 1.02879477, "epoch": 0.1984007695545001, "flos": 21689470897920.0, "grad_norm": 2.5420637162641144, "language_loss": 0.86742747, "learning_rate": 3.70996404156367e-06, "loss": 0.88994014, "num_input_tokens_seen": 34941205, "step": 1650, "time_per_iteration": 2.631938934326172 }, { "auxiliary_loss_clip": 0.01146724, "auxiliary_loss_mlp": 0.01037288, "balance_loss_clip": 1.05157137, "balance_loss_mlp": 1.02735245, "epoch": 0.19852101244513917, "flos": 36064008887040.0, "grad_norm": 2.204740948371221, "language_loss": 0.72861457, "learning_rate": 3.7095598922801187e-06, "loss": 0.75045466, "num_input_tokens_seen": 34963280, "step": 1651, "time_per_iteration": 2.9008846282958984 }, { "auxiliary_loss_clip": 0.01221802, "auxiliary_loss_mlp": 0.01041053, "balance_loss_clip": 1.06314492, "balance_loss_mlp": 1.03073561, "epoch": 0.19864125533577828, "flos": 23105894883840.0, "grad_norm": 3.8111210650229514, "language_loss": 0.75859523, "learning_rate": 3.7091554836576914e-06, "loss": 0.78122377, "num_input_tokens_seen": 34979955, "step": 1652, "time_per_iteration": 2.6117119789123535 }, { "auxiliary_loss_clip": 0.01201637, "auxiliary_loss_mlp": 0.00764428, "balance_loss_clip": 1.06084156, "balance_loss_mlp": 1.00034595, "epoch": 0.19876149822641737, "flos": 24608505553920.0, "grad_norm": 3.1427705602844607, "language_loss": 0.82674503, "learning_rate": 3.708750815757736e-06, "loss": 0.84640574, "num_input_tokens_seen": 35000725, "step": 1653, "time_per_iteration": 3.607034206390381 }, { "auxiliary_loss_clip": 0.01206087, "auxiliary_loss_mlp": 0.01037251, "balance_loss_clip": 1.05963492, "balance_loss_mlp": 1.02586007, "epoch": 0.19888174111705645, "flos": 32196645308160.0, "grad_norm": 2.283091313267767, "language_loss": 0.73141235, "learning_rate": 3.7083458886416407e-06, "loss": 0.75384569, "num_input_tokens_seen": 35019920, "step": 1654, "time_per_iteration": 2.7444164752960205 }, { "auxiliary_loss_clip": 0.01144953, "auxiliary_loss_mlp": 0.01037923, "balance_loss_clip": 1.05416512, "balance_loss_mlp": 1.02794504, "epoch": 0.19900198400769553, "flos": 24608469640320.0, "grad_norm": 2.399440414759682, "language_loss": 0.88255692, "learning_rate": 3.707940702370832e-06, "loss": 0.90438569, "num_input_tokens_seen": 35040765, "step": 1655, "time_per_iteration": 2.810795783996582 }, { "auxiliary_loss_clip": 0.01100475, "auxiliary_loss_mlp": 0.01006355, "balance_loss_clip": 1.02480221, "balance_loss_mlp": 1.00398278, "epoch": 0.19912222689833464, "flos": 67915805673600.0, "grad_norm": 0.8792203477426301, "language_loss": 0.58308947, "learning_rate": 3.707535257006777e-06, "loss": 0.60415781, "num_input_tokens_seen": 35106390, "step": 1656, "time_per_iteration": 4.201031923294067 }, { "auxiliary_loss_clip": 0.01189415, "auxiliary_loss_mlp": 0.0103821, "balance_loss_clip": 1.05763602, "balance_loss_mlp": 1.02688503, "epoch": 0.19924246978897373, "flos": 15742340916480.0, "grad_norm": 2.4644453506577033, "language_loss": 0.8831085, "learning_rate": 3.707129552610981e-06, "loss": 0.90538478, "num_input_tokens_seen": 35125040, "step": 1657, "time_per_iteration": 3.5399820804595947 }, { "auxiliary_loss_clip": 0.01179449, "auxiliary_loss_mlp": 0.01036465, "balance_loss_clip": 1.05714822, "balance_loss_mlp": 1.02568245, "epoch": 0.1993627126796128, "flos": 17566566986880.0, "grad_norm": 1.9039598106391453, "language_loss": 0.73251021, "learning_rate": 3.70672358924499e-06, "loss": 0.75466931, "num_input_tokens_seen": 35144280, "step": 1658, "time_per_iteration": 3.5388295650482178 }, { "auxiliary_loss_clip": 0.01176945, "auxiliary_loss_mlp": 0.0104295, "balance_loss_clip": 1.06166828, "balance_loss_mlp": 1.03319836, "epoch": 0.19948295557025192, "flos": 40843826680320.0, "grad_norm": 1.9177668510356336, "language_loss": 0.78614283, "learning_rate": 3.706317366970386e-06, "loss": 0.80834174, "num_input_tokens_seen": 35165280, "step": 1659, "time_per_iteration": 2.958683490753174 }, { "auxiliary_loss_clip": 0.01218795, "auxiliary_loss_mlp": 0.00765849, "balance_loss_clip": 1.05849886, "balance_loss_mlp": 1.00038064, "epoch": 0.199603198460891, "flos": 25082418620160.0, "grad_norm": 4.0338024809059965, "language_loss": 0.83964419, "learning_rate": 3.705910885848795e-06, "loss": 0.85949063, "num_input_tokens_seen": 35183655, "step": 1660, "time_per_iteration": 2.6872732639312744 }, { "auxiliary_loss_clip": 0.01200486, "auxiliary_loss_mlp": 0.01032206, "balance_loss_clip": 1.05850911, "balance_loss_mlp": 1.0219481, "epoch": 0.19972344135153008, "flos": 20084120352000.0, "grad_norm": 2.0604614335470437, "language_loss": 0.8458153, "learning_rate": 3.705504145941879e-06, "loss": 0.86814225, "num_input_tokens_seen": 35201825, "step": 1661, "time_per_iteration": 2.6485090255737305 }, { "auxiliary_loss_clip": 0.01216679, "auxiliary_loss_mlp": 0.01036696, "balance_loss_clip": 1.05956721, "balance_loss_mlp": 1.02624726, "epoch": 0.1998436842421692, "flos": 23727472761600.0, "grad_norm": 6.054430083791726, "language_loss": 0.78466845, "learning_rate": 3.7050971473113403e-06, "loss": 0.80720222, "num_input_tokens_seen": 35221600, "step": 1662, "time_per_iteration": 2.666290760040283 }, { "auxiliary_loss_clip": 0.01197174, "auxiliary_loss_mlp": 0.00765121, "balance_loss_clip": 1.0552845, "balance_loss_mlp": 1.00030494, "epoch": 0.19996392713280828, "flos": 36102361633920.0, "grad_norm": 1.9929382229563757, "language_loss": 0.79739201, "learning_rate": 3.7046898900189196e-06, "loss": 0.81701493, "num_input_tokens_seen": 35245935, "step": 1663, "time_per_iteration": 2.741389513015747 }, { "auxiliary_loss_clip": 0.01177984, "auxiliary_loss_mlp": 0.0103851, "balance_loss_clip": 1.05891132, "balance_loss_mlp": 1.02779341, "epoch": 0.20008417002344736, "flos": 23657662679040.0, "grad_norm": 1.84739961850067, "language_loss": 0.82973504, "learning_rate": 3.704282374126398e-06, "loss": 0.85190004, "num_input_tokens_seen": 35265615, "step": 1664, "time_per_iteration": 2.6868438720703125 }, { "auxiliary_loss_clip": 0.01168726, "auxiliary_loss_mlp": 0.01037255, "balance_loss_clip": 1.05378294, "balance_loss_mlp": 1.02631795, "epoch": 0.20020441291408644, "flos": 21872076664320.0, "grad_norm": 1.7463519401263667, "language_loss": 0.87579846, "learning_rate": 3.7038745996955954e-06, "loss": 0.89785826, "num_input_tokens_seen": 35284960, "step": 1665, "time_per_iteration": 2.809835433959961 }, { "auxiliary_loss_clip": 0.01179234, "auxiliary_loss_mlp": 0.01035143, "balance_loss_clip": 1.05551934, "balance_loss_mlp": 1.02387178, "epoch": 0.20032465580472555, "flos": 23179691376000.0, "grad_norm": 4.442985304959047, "language_loss": 0.71608877, "learning_rate": 3.703466566788371e-06, "loss": 0.73823249, "num_input_tokens_seen": 35304090, "step": 1666, "time_per_iteration": 2.766836404800415 }, { "auxiliary_loss_clip": 0.01177856, "auxiliary_loss_mlp": 0.01038855, "balance_loss_clip": 1.05594087, "balance_loss_mlp": 1.02832317, "epoch": 0.20044489869536464, "flos": 23873521461120.0, "grad_norm": 1.9135495055218326, "language_loss": 0.74500132, "learning_rate": 3.703058275466622e-06, "loss": 0.7671684, "num_input_tokens_seen": 35323325, "step": 1667, "time_per_iteration": 2.7509090900421143 }, { "auxiliary_loss_clip": 0.01187117, "auxiliary_loss_mlp": 0.01029837, "balance_loss_clip": 1.057423, "balance_loss_mlp": 1.02010381, "epoch": 0.20056514158600372, "flos": 21945226711680.0, "grad_norm": 2.3913991522243183, "language_loss": 0.77819586, "learning_rate": 3.7026497257922877e-06, "loss": 0.80036533, "num_input_tokens_seen": 35343635, "step": 1668, "time_per_iteration": 2.7524971961975098 }, { "auxiliary_loss_clip": 0.01151494, "auxiliary_loss_mlp": 0.01029604, "balance_loss_clip": 1.04945195, "balance_loss_mlp": 1.01946521, "epoch": 0.20068538447664283, "flos": 23879159896320.0, "grad_norm": 9.70644699769287, "language_loss": 0.85245621, "learning_rate": 3.7022409178273436e-06, "loss": 0.87426722, "num_input_tokens_seen": 35364615, "step": 1669, "time_per_iteration": 2.8346471786499023 }, { "auxiliary_loss_clip": 0.01198425, "auxiliary_loss_mlp": 0.01036047, "balance_loss_clip": 1.05878019, "balance_loss_mlp": 1.02513337, "epoch": 0.2008056273672819, "flos": 18442823270400.0, "grad_norm": 2.100290395354796, "language_loss": 0.78446567, "learning_rate": 3.7018318516338054e-06, "loss": 0.80681038, "num_input_tokens_seen": 35383775, "step": 1670, "time_per_iteration": 2.726123094558716 }, { "auxiliary_loss_clip": 0.01208043, "auxiliary_loss_mlp": 0.01038799, "balance_loss_clip": 1.06002617, "balance_loss_mlp": 1.02818298, "epoch": 0.200925870257921, "flos": 23659530186240.0, "grad_norm": 3.967887376780115, "language_loss": 0.82039654, "learning_rate": 3.7014225272737284e-06, "loss": 0.84286499, "num_input_tokens_seen": 35403000, "step": 1671, "time_per_iteration": 2.789266347885132 }, { "auxiliary_loss_clip": 0.01195852, "auxiliary_loss_mlp": 0.0103807, "balance_loss_clip": 1.05797982, "balance_loss_mlp": 1.02740717, "epoch": 0.20104611314856008, "flos": 16217115909120.0, "grad_norm": 2.7160509788082843, "language_loss": 0.74009037, "learning_rate": 3.701012944809207e-06, "loss": 0.76242965, "num_input_tokens_seen": 35420115, "step": 1672, "time_per_iteration": 2.6169464588165283 }, { "auxiliary_loss_clip": 0.01188796, "auxiliary_loss_mlp": 0.00765066, "balance_loss_clip": 1.06103432, "balance_loss_mlp": 1.0003618, "epoch": 0.2011663560391992, "flos": 21397373498880.0, "grad_norm": 1.9073008496628021, "language_loss": 0.784289, "learning_rate": 3.700603104302374e-06, "loss": 0.80382764, "num_input_tokens_seen": 35439925, "step": 1673, "time_per_iteration": 2.7431938648223877 }, { "auxiliary_loss_clip": 0.01062004, "auxiliary_loss_mlp": 0.01009294, "balance_loss_clip": 1.01992321, "balance_loss_mlp": 1.00677896, "epoch": 0.20128659892983827, "flos": 62229459409920.0, "grad_norm": 0.9028779273303366, "language_loss": 0.5594275, "learning_rate": 3.7001930058154027e-06, "loss": 0.58014047, "num_input_tokens_seen": 35504885, "step": 1674, "time_per_iteration": 3.295372486114502 }, { "auxiliary_loss_clip": 0.01170699, "auxiliary_loss_mlp": 0.01044722, "balance_loss_clip": 1.05583417, "balance_loss_mlp": 1.03348613, "epoch": 0.20140684182047736, "flos": 28438737448320.0, "grad_norm": 2.590227625655642, "language_loss": 0.79631025, "learning_rate": 3.6997826494105037e-06, "loss": 0.81846446, "num_input_tokens_seen": 35525330, "step": 1675, "time_per_iteration": 2.783301591873169 }, { "auxiliary_loss_clip": 0.01184746, "auxiliary_loss_mlp": 0.01040066, "balance_loss_clip": 1.05573487, "balance_loss_mlp": 1.02946877, "epoch": 0.20152708471111647, "flos": 28074064619520.0, "grad_norm": 2.4710316702413055, "language_loss": 0.69307631, "learning_rate": 3.6993720351499286e-06, "loss": 0.71532446, "num_input_tokens_seen": 35546455, "step": 1676, "time_per_iteration": 2.7147305011749268 }, { "auxiliary_loss_clip": 0.01187529, "auxiliary_loss_mlp": 0.01034813, "balance_loss_clip": 1.06359935, "balance_loss_mlp": 1.02510321, "epoch": 0.20164732760175555, "flos": 23549751244800.0, "grad_norm": 1.7207198602815565, "language_loss": 0.77161872, "learning_rate": 3.6989611630959666e-06, "loss": 0.79384208, "num_input_tokens_seen": 35565010, "step": 1677, "time_per_iteration": 2.723766803741455 }, { "auxiliary_loss_clip": 0.01100808, "auxiliary_loss_mlp": 0.0100347, "balance_loss_clip": 1.02627802, "balance_loss_mlp": 1.00114572, "epoch": 0.20176757049239463, "flos": 71100616037760.0, "grad_norm": 0.6888361340675919, "language_loss": 0.58290714, "learning_rate": 3.6985500333109474e-06, "loss": 0.6039499, "num_input_tokens_seen": 35633340, "step": 1678, "time_per_iteration": 3.3094279766082764 }, { "auxiliary_loss_clip": 0.01164952, "auxiliary_loss_mlp": 0.01029008, "balance_loss_clip": 1.05326629, "balance_loss_mlp": 1.01895845, "epoch": 0.20188781338303372, "flos": 21430159637760.0, "grad_norm": 3.254991614295124, "language_loss": 0.76923764, "learning_rate": 3.6981386458572385e-06, "loss": 0.79117727, "num_input_tokens_seen": 35651315, "step": 1679, "time_per_iteration": 3.813613176345825 }, { "auxiliary_loss_clip": 0.01162272, "auxiliary_loss_mlp": 0.01045037, "balance_loss_clip": 1.05218637, "balance_loss_mlp": 1.03391504, "epoch": 0.20200805627367283, "flos": 11546215130880.0, "grad_norm": 2.4749328846522283, "language_loss": 0.75854427, "learning_rate": 3.6977270007972468e-06, "loss": 0.78061736, "num_input_tokens_seen": 35668850, "step": 1680, "time_per_iteration": 2.853621482849121 }, { "auxiliary_loss_clip": 0.0119381, "auxiliary_loss_mlp": 0.01033535, "balance_loss_clip": 1.06105661, "balance_loss_mlp": 1.02347946, "epoch": 0.2021282991643119, "flos": 28545391906560.0, "grad_norm": 2.8403465665444285, "language_loss": 0.72534204, "learning_rate": 3.6973150981934196e-06, "loss": 0.74761546, "num_input_tokens_seen": 35690080, "step": 1681, "time_per_iteration": 3.7295126914978027 }, { "auxiliary_loss_clip": 0.01220908, "auxiliary_loss_mlp": 0.01040401, "balance_loss_clip": 1.06258249, "balance_loss_mlp": 1.02992892, "epoch": 0.202248542054951, "flos": 17923446564480.0, "grad_norm": 2.5666566693481507, "language_loss": 0.83531266, "learning_rate": 3.6969029381082415e-06, "loss": 0.85792577, "num_input_tokens_seen": 35706075, "step": 1682, "time_per_iteration": 2.6280176639556885 }, { "auxiliary_loss_clip": 0.0118254, "auxiliary_loss_mlp": 0.01032295, "balance_loss_clip": 1.05650663, "balance_loss_mlp": 1.02269852, "epoch": 0.2023687849455901, "flos": 19864634296320.0, "grad_norm": 2.3371718072948053, "language_loss": 0.79532433, "learning_rate": 3.696490520604237e-06, "loss": 0.81747264, "num_input_tokens_seen": 35724765, "step": 1683, "time_per_iteration": 3.653432846069336 }, { "auxiliary_loss_clip": 0.01195, "auxiliary_loss_mlp": 0.01035665, "balance_loss_clip": 1.05813158, "balance_loss_mlp": 1.02583575, "epoch": 0.20248902783622919, "flos": 22564721600640.0, "grad_norm": 1.8466424481199166, "language_loss": 0.80921042, "learning_rate": 3.696077845743968e-06, "loss": 0.8315171, "num_input_tokens_seen": 35744355, "step": 1684, "time_per_iteration": 3.5204105377197266 }, { "auxiliary_loss_clip": 0.01215021, "auxiliary_loss_mlp": 0.01037451, "balance_loss_clip": 1.05734468, "balance_loss_mlp": 1.02710903, "epoch": 0.20260927072686827, "flos": 22709728805760.0, "grad_norm": 3.409134916094016, "language_loss": 0.73203027, "learning_rate": 3.69566491359004e-06, "loss": 0.75455499, "num_input_tokens_seen": 35761000, "step": 1685, "time_per_iteration": 2.602459192276001 }, { "auxiliary_loss_clip": 0.01184047, "auxiliary_loss_mlp": 0.01041367, "balance_loss_clip": 1.05686069, "balance_loss_mlp": 1.03007793, "epoch": 0.20272951361750738, "flos": 51023998650240.0, "grad_norm": 2.3044442012510777, "language_loss": 0.69552159, "learning_rate": 3.695251724205092e-06, "loss": 0.7177757, "num_input_tokens_seen": 35785360, "step": 1686, "time_per_iteration": 2.928880214691162 }, { "auxiliary_loss_clip": 0.01216673, "auxiliary_loss_mlp": 0.01035234, "balance_loss_clip": 1.06263804, "balance_loss_mlp": 1.02505374, "epoch": 0.20284975650814646, "flos": 26578133879040.0, "grad_norm": 1.6454935353654447, "language_loss": 0.86735886, "learning_rate": 3.6948382776518054e-06, "loss": 0.88987797, "num_input_tokens_seen": 35806065, "step": 1687, "time_per_iteration": 2.675960063934326 }, { "auxiliary_loss_clip": 0.01176942, "auxiliary_loss_mlp": 0.01043648, "balance_loss_clip": 1.05519176, "balance_loss_mlp": 1.03368163, "epoch": 0.20296999939878554, "flos": 16034222833920.0, "grad_norm": 3.422564855315413, "language_loss": 0.7927435, "learning_rate": 3.6944245739929e-06, "loss": 0.81494939, "num_input_tokens_seen": 35822225, "step": 1688, "time_per_iteration": 2.710784673690796 }, { "auxiliary_loss_clip": 0.01203204, "auxiliary_loss_mlp": 0.01043801, "balance_loss_clip": 1.06107306, "balance_loss_mlp": 1.03254163, "epoch": 0.20309024228942463, "flos": 19203374868480.0, "grad_norm": 7.121614655897142, "language_loss": 0.7185654, "learning_rate": 3.6940106132911332e-06, "loss": 0.74103546, "num_input_tokens_seen": 35839410, "step": 1689, "time_per_iteration": 2.6182799339294434 }, { "auxiliary_loss_clip": 0.01204453, "auxiliary_loss_mlp": 0.01037627, "balance_loss_clip": 1.06064963, "balance_loss_mlp": 1.02676725, "epoch": 0.20321048518006374, "flos": 22821087945600.0, "grad_norm": 3.0127473001887743, "language_loss": 0.88788921, "learning_rate": 3.6935963956093037e-06, "loss": 0.91031003, "num_input_tokens_seen": 35859495, "step": 1690, "time_per_iteration": 2.720550775527954 }, { "auxiliary_loss_clip": 0.01192746, "auxiliary_loss_mlp": 0.01038113, "balance_loss_clip": 1.05827355, "balance_loss_mlp": 1.02748561, "epoch": 0.20333072807070282, "flos": 19096397187840.0, "grad_norm": 1.889931560689134, "language_loss": 0.69490588, "learning_rate": 3.6931819210102474e-06, "loss": 0.71721447, "num_input_tokens_seen": 35878890, "step": 1691, "time_per_iteration": 2.8321774005889893 }, { "auxiliary_loss_clip": 0.01216064, "auxiliary_loss_mlp": 0.01029998, "balance_loss_clip": 1.05856526, "balance_loss_mlp": 1.01913226, "epoch": 0.2034509709613419, "flos": 18180962144640.0, "grad_norm": 5.4572864688174185, "language_loss": 0.8422972, "learning_rate": 3.6927671895568402e-06, "loss": 0.8647579, "num_input_tokens_seen": 35897950, "step": 1692, "time_per_iteration": 2.588818311691284 }, { "auxiliary_loss_clip": 0.0121611, "auxiliary_loss_mlp": 0.01036414, "balance_loss_clip": 1.06152439, "balance_loss_mlp": 1.02574492, "epoch": 0.20357121385198101, "flos": 22923899648640.0, "grad_norm": 3.3141171458972605, "language_loss": 0.86597788, "learning_rate": 3.692352201311996e-06, "loss": 0.88850307, "num_input_tokens_seen": 35916800, "step": 1693, "time_per_iteration": 2.8246896266937256 }, { "auxiliary_loss_clip": 0.01168714, "auxiliary_loss_mlp": 0.0103537, "balance_loss_clip": 1.05582273, "balance_loss_mlp": 1.02456975, "epoch": 0.2036914567426201, "flos": 20922131629440.0, "grad_norm": 10.031485493633443, "language_loss": 0.76880246, "learning_rate": 3.6919369563386687e-06, "loss": 0.79084331, "num_input_tokens_seen": 35936600, "step": 1694, "time_per_iteration": 2.7632648944854736 }, { "auxiliary_loss_clip": 0.01180218, "auxiliary_loss_mlp": 0.01029641, "balance_loss_clip": 1.05718493, "balance_loss_mlp": 1.01984191, "epoch": 0.20381169963325918, "flos": 15519155760000.0, "grad_norm": 3.695182405151826, "language_loss": 0.79577863, "learning_rate": 3.69152145469985e-06, "loss": 0.81787717, "num_input_tokens_seen": 35953645, "step": 1695, "time_per_iteration": 2.7080001831054688 }, { "auxiliary_loss_clip": 0.01160321, "auxiliary_loss_mlp": 0.01037944, "balance_loss_clip": 1.05185795, "balance_loss_mlp": 1.02747738, "epoch": 0.20393194252389826, "flos": 28833143760000.0, "grad_norm": 2.136562814459686, "language_loss": 0.81943333, "learning_rate": 3.691105696458572e-06, "loss": 0.841416, "num_input_tokens_seen": 35970940, "step": 1696, "time_per_iteration": 2.7780191898345947 }, { "auxiliary_loss_clip": 0.01215379, "auxiliary_loss_mlp": 0.01034074, "balance_loss_clip": 1.0616461, "balance_loss_mlp": 1.02426267, "epoch": 0.20405218541453737, "flos": 22488554810880.0, "grad_norm": 2.9553705179796386, "language_loss": 0.67877775, "learning_rate": 3.690689681677904e-06, "loss": 0.70127225, "num_input_tokens_seen": 35989410, "step": 1697, "time_per_iteration": 2.6194007396698 }, { "auxiliary_loss_clip": 0.01186905, "auxiliary_loss_mlp": 0.0103543, "balance_loss_clip": 1.05554366, "balance_loss_mlp": 1.02510667, "epoch": 0.20417242830517646, "flos": 25374408278400.0, "grad_norm": 1.9613400301558965, "language_loss": 0.88823247, "learning_rate": 3.690273410420956e-06, "loss": 0.91045582, "num_input_tokens_seen": 36009175, "step": 1698, "time_per_iteration": 2.7287704944610596 }, { "auxiliary_loss_clip": 0.01197688, "auxiliary_loss_mlp": 0.01030533, "balance_loss_clip": 1.05717802, "balance_loss_mlp": 1.01998258, "epoch": 0.20429267119581554, "flos": 14793078240000.0, "grad_norm": 2.752520543902167, "language_loss": 0.77102751, "learning_rate": 3.689856882750875e-06, "loss": 0.79330969, "num_input_tokens_seen": 36024375, "step": 1699, "time_per_iteration": 2.7311878204345703 }, { "auxiliary_loss_clip": 0.01193948, "auxiliary_loss_mlp": 0.01032458, "balance_loss_clip": 1.05663562, "balance_loss_mlp": 1.02276039, "epoch": 0.20441291408645465, "flos": 17781851151360.0, "grad_norm": 1.832459255344264, "language_loss": 0.785676, "learning_rate": 3.6894400987308486e-06, "loss": 0.80794007, "num_input_tokens_seen": 36041895, "step": 1700, "time_per_iteration": 2.592207670211792 }, { "auxiliary_loss_clip": 0.01202257, "auxiliary_loss_mlp": 0.01039268, "balance_loss_clip": 1.05735421, "balance_loss_mlp": 1.02747202, "epoch": 0.20453315697709373, "flos": 16435668211200.0, "grad_norm": 4.505765965966685, "language_loss": 0.85194415, "learning_rate": 3.6890230584241024e-06, "loss": 0.87435937, "num_input_tokens_seen": 36058825, "step": 1701, "time_per_iteration": 2.653635263442993 }, { "auxiliary_loss_clip": 0.01120877, "auxiliary_loss_mlp": 0.01013901, "balance_loss_clip": 1.03207219, "balance_loss_mlp": 1.01164782, "epoch": 0.20465339986773282, "flos": 66713085653760.0, "grad_norm": 1.0746719082313454, "language_loss": 0.66490388, "learning_rate": 3.6886057618939016e-06, "loss": 0.68625164, "num_input_tokens_seen": 36121645, "step": 1702, "time_per_iteration": 3.22568678855896 }, { "auxiliary_loss_clip": 0.01164045, "auxiliary_loss_mlp": 0.01035159, "balance_loss_clip": 1.05295658, "balance_loss_mlp": 1.02488267, "epoch": 0.2047736427583719, "flos": 41974114924800.0, "grad_norm": 2.25800134615194, "language_loss": 0.69488227, "learning_rate": 3.6881882092035492e-06, "loss": 0.7168743, "num_input_tokens_seen": 36143030, "step": 1703, "time_per_iteration": 2.8601865768432617 }, { "auxiliary_loss_clip": 0.01092428, "auxiliary_loss_mlp": 0.00755837, "balance_loss_clip": 1.03160381, "balance_loss_mlp": 1.00131416, "epoch": 0.204893885649011, "flos": 69940878641280.0, "grad_norm": 0.9337215248100351, "language_loss": 0.61198246, "learning_rate": 3.6877704004163873e-06, "loss": 0.63046515, "num_input_tokens_seen": 36203435, "step": 1704, "time_per_iteration": 3.44199538230896 }, { "auxiliary_loss_clip": 0.0121476, "auxiliary_loss_mlp": 0.01035604, "balance_loss_clip": 1.05945885, "balance_loss_mlp": 1.02559614, "epoch": 0.2050141285396501, "flos": 22200012858240.0, "grad_norm": 4.583968677116409, "language_loss": 0.77592146, "learning_rate": 3.6873523355957984e-06, "loss": 0.79842508, "num_input_tokens_seen": 36222435, "step": 1705, "time_per_iteration": 3.5690698623657227 }, { "auxiliary_loss_clip": 0.01119163, "auxiliary_loss_mlp": 0.01002587, "balance_loss_clip": 1.03022861, "balance_loss_mlp": 1.0002625, "epoch": 0.20513437143028918, "flos": 46283721730560.0, "grad_norm": 0.9892376734725632, "language_loss": 0.64065397, "learning_rate": 3.686934014805201e-06, "loss": 0.66187143, "num_input_tokens_seen": 36273065, "step": 1706, "time_per_iteration": 3.0296056270599365 }, { "auxiliary_loss_clip": 0.01198525, "auxiliary_loss_mlp": 0.0103426, "balance_loss_clip": 1.05843735, "balance_loss_mlp": 1.02348971, "epoch": 0.20525461432092829, "flos": 21904324099200.0, "grad_norm": 1.859275298104681, "language_loss": 0.81331307, "learning_rate": 3.6865154381080552e-06, "loss": 0.83564091, "num_input_tokens_seen": 36293750, "step": 1707, "time_per_iteration": 2.730319023132324 }, { "auxiliary_loss_clip": 0.01125023, "auxiliary_loss_mlp": 0.01036949, "balance_loss_clip": 1.04747486, "balance_loss_mlp": 1.02692986, "epoch": 0.20537485721156737, "flos": 21214264942080.0, "grad_norm": 2.039175482377376, "language_loss": 0.82293898, "learning_rate": 3.6860966055678585e-06, "loss": 0.84455872, "num_input_tokens_seen": 36310105, "step": 1708, "time_per_iteration": 3.699664354324341 }, { "auxiliary_loss_clip": 0.01198678, "auxiliary_loss_mlp": 0.0102908, "balance_loss_clip": 1.05847085, "balance_loss_mlp": 1.0183686, "epoch": 0.20549510010220645, "flos": 20191205773440.0, "grad_norm": 1.8475223525157303, "language_loss": 0.86486328, "learning_rate": 3.685677517248147e-06, "loss": 0.88714087, "num_input_tokens_seen": 36328995, "step": 1709, "time_per_iteration": 4.453352212905884 }, { "auxiliary_loss_clip": 0.01184796, "auxiliary_loss_mlp": 0.00764922, "balance_loss_clip": 1.06096375, "balance_loss_mlp": 1.00054121, "epoch": 0.20561534299284553, "flos": 17016702612480.0, "grad_norm": 3.2977189738829145, "language_loss": 0.8049196, "learning_rate": 3.6852581732124967e-06, "loss": 0.82441688, "num_input_tokens_seen": 36346340, "step": 1710, "time_per_iteration": 2.7561628818511963 }, { "auxiliary_loss_clip": 0.01197343, "auxiliary_loss_mlp": 0.01032608, "balance_loss_clip": 1.05760574, "balance_loss_mlp": 1.02190292, "epoch": 0.20573558588348465, "flos": 22890467064960.0, "grad_norm": 2.276397483175922, "language_loss": 0.76492828, "learning_rate": 3.6848385735245213e-06, "loss": 0.78722781, "num_input_tokens_seen": 36365430, "step": 1711, "time_per_iteration": 2.6317827701568604 }, { "auxiliary_loss_clip": 0.01188072, "auxiliary_loss_mlp": 0.01027688, "balance_loss_clip": 1.05526495, "balance_loss_mlp": 1.01803827, "epoch": 0.20585582877412373, "flos": 24643123286400.0, "grad_norm": 2.190492230157755, "language_loss": 0.86416459, "learning_rate": 3.6844187182478734e-06, "loss": 0.8863222, "num_input_tokens_seen": 36386285, "step": 1712, "time_per_iteration": 2.729926586151123 }, { "auxiliary_loss_clip": 0.01171509, "auxiliary_loss_mlp": 0.01035112, "balance_loss_clip": 1.05101156, "balance_loss_mlp": 1.02492523, "epoch": 0.2059760716647628, "flos": 24206952435840.0, "grad_norm": 2.4523242675344354, "language_loss": 0.75339741, "learning_rate": 3.683998607446246e-06, "loss": 0.77546358, "num_input_tokens_seen": 36404935, "step": 1713, "time_per_iteration": 2.759671688079834 }, { "auxiliary_loss_clip": 0.01199785, "auxiliary_loss_mlp": 0.01035975, "balance_loss_clip": 1.05978942, "balance_loss_mlp": 1.02626586, "epoch": 0.20609631455540192, "flos": 20229522606720.0, "grad_norm": 2.265295381206127, "language_loss": 0.74892426, "learning_rate": 3.6835782411833686e-06, "loss": 0.7712819, "num_input_tokens_seen": 36424455, "step": 1714, "time_per_iteration": 2.640052556991577 }, { "auxiliary_loss_clip": 0.01155996, "auxiliary_loss_mlp": 0.0103664, "balance_loss_clip": 1.05100727, "balance_loss_mlp": 1.0264771, "epoch": 0.206216557446041, "flos": 19864957518720.0, "grad_norm": 1.8692162648044905, "language_loss": 0.7410692, "learning_rate": 3.68315761952301e-06, "loss": 0.7629956, "num_input_tokens_seen": 36441685, "step": 1715, "time_per_iteration": 2.7175722122192383 }, { "auxiliary_loss_clip": 0.01212391, "auxiliary_loss_mlp": 0.01039047, "balance_loss_clip": 1.05879593, "balance_loss_mlp": 1.02861595, "epoch": 0.2063368003366801, "flos": 24096311568000.0, "grad_norm": 2.301196399137744, "language_loss": 0.83216989, "learning_rate": 3.6827367425289797e-06, "loss": 0.85468429, "num_input_tokens_seen": 36461460, "step": 1716, "time_per_iteration": 2.624420404434204 }, { "auxiliary_loss_clip": 0.01184779, "auxiliary_loss_mlp": 0.01039491, "balance_loss_clip": 1.0575155, "balance_loss_mlp": 1.02918565, "epoch": 0.2064570432273192, "flos": 20340163474560.0, "grad_norm": 3.0858014415244703, "language_loss": 0.72680068, "learning_rate": 3.6823156102651225e-06, "loss": 0.74904341, "num_input_tokens_seen": 36479615, "step": 1717, "time_per_iteration": 2.7593390941619873 }, { "auxiliary_loss_clip": 0.01130672, "auxiliary_loss_mlp": 0.01031376, "balance_loss_clip": 1.05279279, "balance_loss_mlp": 1.02186918, "epoch": 0.20657728611795828, "flos": 20520363029760.0, "grad_norm": 3.408372898553658, "language_loss": 0.70911294, "learning_rate": 3.6818942227953257e-06, "loss": 0.73073345, "num_input_tokens_seen": 36500160, "step": 1718, "time_per_iteration": 2.809767723083496 }, { "auxiliary_loss_clip": 0.0116924, "auxiliary_loss_mlp": 0.01035104, "balance_loss_clip": 1.05308485, "balance_loss_mlp": 1.02509046, "epoch": 0.20669752900859736, "flos": 21799285752960.0, "grad_norm": 1.9109493014703354, "language_loss": 0.69148213, "learning_rate": 3.681472580183512e-06, "loss": 0.71352559, "num_input_tokens_seen": 36518810, "step": 1719, "time_per_iteration": 2.717984914779663 }, { "auxiliary_loss_clip": 0.0119411, "auxiliary_loss_mlp": 0.01032479, "balance_loss_clip": 1.05733609, "balance_loss_mlp": 1.02352047, "epoch": 0.20681777189923645, "flos": 15122020014720.0, "grad_norm": 2.3698632069511874, "language_loss": 0.8635602, "learning_rate": 3.6810506824936455e-06, "loss": 0.88582611, "num_input_tokens_seen": 36536890, "step": 1720, "time_per_iteration": 2.652461528778076 }, { "auxiliary_loss_clip": 0.01089195, "auxiliary_loss_mlp": 0.01006138, "balance_loss_clip": 1.02389336, "balance_loss_mlp": 1.00369406, "epoch": 0.20693801478987556, "flos": 56481021509760.0, "grad_norm": 1.1881034499513454, "language_loss": 0.62472129, "learning_rate": 3.680628529789726e-06, "loss": 0.64567459, "num_input_tokens_seen": 36589300, "step": 1721, "time_per_iteration": 3.097033977508545 }, { "auxiliary_loss_clip": 0.01220505, "auxiliary_loss_mlp": 0.0103573, "balance_loss_clip": 1.06276131, "balance_loss_mlp": 1.02420259, "epoch": 0.20705825768051464, "flos": 21614201948160.0, "grad_norm": 1.8158738435769777, "language_loss": 0.86948621, "learning_rate": 3.680206122135796e-06, "loss": 0.8920486, "num_input_tokens_seen": 36609905, "step": 1722, "time_per_iteration": 2.63529634475708 }, { "auxiliary_loss_clip": 0.0116041, "auxiliary_loss_mlp": 0.01039126, "balance_loss_clip": 1.05478978, "balance_loss_mlp": 1.0291481, "epoch": 0.20717850057115372, "flos": 25848895962240.0, "grad_norm": 6.789407037999269, "language_loss": 0.78595388, "learning_rate": 3.6797834595959323e-06, "loss": 0.8079493, "num_input_tokens_seen": 36629805, "step": 1723, "time_per_iteration": 2.755037307739258 }, { "auxiliary_loss_clip": 0.01142468, "auxiliary_loss_mlp": 0.01033577, "balance_loss_clip": 1.04797077, "balance_loss_mlp": 1.02264571, "epoch": 0.20729874346179283, "flos": 29130807767040.0, "grad_norm": 4.367857733743932, "language_loss": 0.78193772, "learning_rate": 3.679360542234254e-06, "loss": 0.80369818, "num_input_tokens_seen": 36649150, "step": 1724, "time_per_iteration": 2.886676788330078 }, { "auxiliary_loss_clip": 0.01175821, "auxiliary_loss_mlp": 0.0076487, "balance_loss_clip": 1.05178928, "balance_loss_mlp": 1.00048518, "epoch": 0.20741898635243192, "flos": 29023363209600.0, "grad_norm": 1.9048844373106568, "language_loss": 0.72284812, "learning_rate": 3.678937370114916e-06, "loss": 0.74225509, "num_input_tokens_seen": 36668955, "step": 1725, "time_per_iteration": 2.7701542377471924 }, { "auxiliary_loss_clip": 0.01179496, "auxiliary_loss_mlp": 0.01037124, "balance_loss_clip": 1.05552888, "balance_loss_mlp": 1.02787375, "epoch": 0.207539229243071, "flos": 15559447841280.0, "grad_norm": 1.938857728205364, "language_loss": 0.78481722, "learning_rate": 3.678513943302114e-06, "loss": 0.80698341, "num_input_tokens_seen": 36685730, "step": 1726, "time_per_iteration": 2.700315237045288 }, { "auxiliary_loss_clip": 0.01212311, "auxiliary_loss_mlp": 0.01037629, "balance_loss_clip": 1.06042004, "balance_loss_mlp": 1.02818167, "epoch": 0.20765947213371008, "flos": 20521081301760.0, "grad_norm": 1.8202160737902255, "language_loss": 0.85487711, "learning_rate": 3.678090261860082e-06, "loss": 0.8773765, "num_input_tokens_seen": 36705460, "step": 1727, "time_per_iteration": 2.643371820449829 }, { "auxiliary_loss_clip": 0.01168509, "auxiliary_loss_mlp": 0.010306, "balance_loss_clip": 1.04963374, "balance_loss_mlp": 1.02073526, "epoch": 0.2077797150243492, "flos": 19354415558400.0, "grad_norm": 2.0320129293750253, "language_loss": 0.77687228, "learning_rate": 3.6776663258530906e-06, "loss": 0.79886341, "num_input_tokens_seen": 36724110, "step": 1728, "time_per_iteration": 2.7693707942962646 }, { "auxiliary_loss_clip": 0.01198529, "auxiliary_loss_mlp": 0.01032354, "balance_loss_clip": 1.05635071, "balance_loss_mlp": 1.02203071, "epoch": 0.20789995791498828, "flos": 21829952989440.0, "grad_norm": 1.9668632081977668, "language_loss": 0.71253777, "learning_rate": 3.6772421353454516e-06, "loss": 0.73484665, "num_input_tokens_seen": 36742705, "step": 1729, "time_per_iteration": 2.6397814750671387 }, { "auxiliary_loss_clip": 0.01197347, "auxiliary_loss_mlp": 0.01037826, "balance_loss_clip": 1.05975199, "balance_loss_mlp": 1.02698362, "epoch": 0.20802020080562736, "flos": 23148844571520.0, "grad_norm": 1.8654733944421404, "language_loss": 0.8829729, "learning_rate": 3.6768176904015153e-06, "loss": 0.90532464, "num_input_tokens_seen": 36762510, "step": 1730, "time_per_iteration": 2.6546359062194824 }, { "auxiliary_loss_clip": 0.01198077, "auxiliary_loss_mlp": 0.01034587, "balance_loss_clip": 1.05658317, "balance_loss_mlp": 1.02432895, "epoch": 0.20814044369626647, "flos": 23072677781760.0, "grad_norm": 1.919141280248307, "language_loss": 0.59853429, "learning_rate": 3.6763929910856674e-06, "loss": 0.62086093, "num_input_tokens_seen": 36780960, "step": 1731, "time_per_iteration": 3.6072425842285156 }, { "auxiliary_loss_clip": 0.01198715, "auxiliary_loss_mlp": 0.01031528, "balance_loss_clip": 1.05895197, "balance_loss_mlp": 1.02169299, "epoch": 0.20826068658690555, "flos": 19608016556160.0, "grad_norm": 2.34189815347847, "language_loss": 0.77571517, "learning_rate": 3.6759680374623365e-06, "loss": 0.79801762, "num_input_tokens_seen": 36798875, "step": 1732, "time_per_iteration": 2.633657932281494 }, { "auxiliary_loss_clip": 0.01214125, "auxiliary_loss_mlp": 0.01031684, "balance_loss_clip": 1.06242466, "balance_loss_mlp": 1.02196276, "epoch": 0.20838092947754464, "flos": 25374049142400.0, "grad_norm": 2.3433147656913462, "language_loss": 0.75293887, "learning_rate": 3.675542829595986e-06, "loss": 0.77539694, "num_input_tokens_seen": 36818540, "step": 1733, "time_per_iteration": 3.5939900875091553 }, { "auxiliary_loss_clip": 0.01185841, "auxiliary_loss_mlp": 0.01033637, "balance_loss_clip": 1.05744135, "balance_loss_mlp": 1.0234741, "epoch": 0.20850117236818372, "flos": 24061729749120.0, "grad_norm": 1.4719628875557313, "language_loss": 0.7935015, "learning_rate": 3.6751173675511213e-06, "loss": 0.81569636, "num_input_tokens_seen": 36840585, "step": 1734, "time_per_iteration": 2.6695034503936768 }, { "auxiliary_loss_clip": 0.01179629, "auxiliary_loss_mlp": 0.01027226, "balance_loss_clip": 1.05138862, "balance_loss_mlp": 1.01733112, "epoch": 0.20862141525882283, "flos": 20077799558400.0, "grad_norm": 2.1789433624444032, "language_loss": 0.87397063, "learning_rate": 3.674691651392283e-06, "loss": 0.89603925, "num_input_tokens_seen": 36858255, "step": 1735, "time_per_iteration": 3.539163112640381 }, { "auxiliary_loss_clip": 0.01188342, "auxiliary_loss_mlp": 0.01030565, "balance_loss_clip": 1.05802441, "balance_loss_mlp": 1.01995552, "epoch": 0.2087416581494619, "flos": 39015183237120.0, "grad_norm": 2.1752788467259085, "language_loss": 0.75683272, "learning_rate": 3.674265681184053e-06, "loss": 0.7790218, "num_input_tokens_seen": 36881515, "step": 1736, "time_per_iteration": 3.7494421005249023 }, { "auxiliary_loss_clip": 0.01184734, "auxiliary_loss_mlp": 0.01032751, "balance_loss_clip": 1.05622435, "balance_loss_mlp": 1.02240384, "epoch": 0.208861901040101, "flos": 26101994169600.0, "grad_norm": 2.251027785148863, "language_loss": 0.86255765, "learning_rate": 3.6738394569910504e-06, "loss": 0.88473248, "num_input_tokens_seen": 36902055, "step": 1737, "time_per_iteration": 2.6958632469177246 }, { "auxiliary_loss_clip": 0.0120149, "auxiliary_loss_mlp": 0.01034242, "balance_loss_clip": 1.06069207, "balance_loss_mlp": 1.02358425, "epoch": 0.2089821439307401, "flos": 28398732675840.0, "grad_norm": 5.195665248401253, "language_loss": 0.83122218, "learning_rate": 3.6734129788779333e-06, "loss": 0.85357952, "num_input_tokens_seen": 36921230, "step": 1738, "time_per_iteration": 2.7096691131591797 }, { "auxiliary_loss_clip": 0.01167545, "auxiliary_loss_mlp": 0.01034392, "balance_loss_clip": 1.05646694, "balance_loss_mlp": 1.02545702, "epoch": 0.2091023868213792, "flos": 21069616872960.0, "grad_norm": 1.7629681885007642, "language_loss": 0.90293068, "learning_rate": 3.6729862469093976e-06, "loss": 0.92495, "num_input_tokens_seen": 36940325, "step": 1739, "time_per_iteration": 2.7805280685424805 }, { "auxiliary_loss_clip": 0.01171152, "auxiliary_loss_mlp": 0.01031732, "balance_loss_clip": 1.05247748, "balance_loss_mlp": 1.02193928, "epoch": 0.20922262971201827, "flos": 22455481363200.0, "grad_norm": 2.127832818135102, "language_loss": 0.82559264, "learning_rate": 3.6725592611501782e-06, "loss": 0.84762144, "num_input_tokens_seen": 36959000, "step": 1740, "time_per_iteration": 2.7038371562957764 }, { "auxiliary_loss_clip": 0.01197738, "auxiliary_loss_mlp": 0.01037388, "balance_loss_clip": 1.05664849, "balance_loss_mlp": 1.02729082, "epoch": 0.20934287260265738, "flos": 27852244179840.0, "grad_norm": 1.7592717649007117, "language_loss": 0.76531589, "learning_rate": 3.6721320216650496e-06, "loss": 0.7876671, "num_input_tokens_seen": 36979615, "step": 1741, "time_per_iteration": 2.7016961574554443 }, { "auxiliary_loss_clip": 0.0118254, "auxiliary_loss_mlp": 0.01034075, "balance_loss_clip": 1.05787027, "balance_loss_mlp": 1.02376366, "epoch": 0.20946311549329646, "flos": 16435309075200.0, "grad_norm": 1.8820783358753295, "language_loss": 0.83724016, "learning_rate": 3.6717045285188215e-06, "loss": 0.85940635, "num_input_tokens_seen": 36997310, "step": 1742, "time_per_iteration": 2.648798942565918 }, { "auxiliary_loss_clip": 0.01143482, "auxiliary_loss_mlp": 0.01036963, "balance_loss_clip": 1.04817343, "balance_loss_mlp": 1.02705038, "epoch": 0.20958335838393555, "flos": 22492720788480.0, "grad_norm": 2.358981630364755, "language_loss": 0.86743903, "learning_rate": 3.671276781776346e-06, "loss": 0.88924348, "num_input_tokens_seen": 37015965, "step": 1743, "time_per_iteration": 2.781121015548706 }, { "auxiliary_loss_clip": 0.01177577, "auxiliary_loss_mlp": 0.01030095, "balance_loss_clip": 1.05394745, "balance_loss_mlp": 1.01983714, "epoch": 0.20970360127457463, "flos": 25224768218880.0, "grad_norm": 3.6156926356358157, "language_loss": 0.67300093, "learning_rate": 3.6708487815025128e-06, "loss": 0.69507772, "num_input_tokens_seen": 37036545, "step": 1744, "time_per_iteration": 2.7319209575653076 }, { "auxiliary_loss_clip": 0.01166208, "auxiliary_loss_mlp": 0.01037588, "balance_loss_clip": 1.05317521, "balance_loss_mlp": 1.02743173, "epoch": 0.20982384416521374, "flos": 18479164855680.0, "grad_norm": 2.3701408808397373, "language_loss": 0.74224412, "learning_rate": 3.6704205277622463e-06, "loss": 0.76428211, "num_input_tokens_seen": 37054985, "step": 1745, "time_per_iteration": 2.7446377277374268 }, { "auxiliary_loss_clip": 0.01185169, "auxiliary_loss_mlp": 0.01034407, "balance_loss_clip": 1.05410028, "balance_loss_mlp": 1.02463818, "epoch": 0.20994408705585282, "flos": 25373546352000.0, "grad_norm": 2.020786924539658, "language_loss": 0.80790091, "learning_rate": 3.6699920206205146e-06, "loss": 0.8300966, "num_input_tokens_seen": 37075725, "step": 1746, "time_per_iteration": 2.7242655754089355 }, { "auxiliary_loss_clip": 0.01197023, "auxiliary_loss_mlp": 0.01035682, "balance_loss_clip": 1.05551553, "balance_loss_mlp": 1.02551413, "epoch": 0.2100643299464919, "flos": 21320955313920.0, "grad_norm": 4.152939383314306, "language_loss": 0.82030648, "learning_rate": 3.669563260142321e-06, "loss": 0.84263355, "num_input_tokens_seen": 37094615, "step": 1747, "time_per_iteration": 2.7027008533477783 }, { "auxiliary_loss_clip": 0.01181268, "auxiliary_loss_mlp": 0.01033783, "balance_loss_clip": 1.05859828, "balance_loss_mlp": 1.0233227, "epoch": 0.21018457283713102, "flos": 19354379644800.0, "grad_norm": 2.299356492945342, "language_loss": 0.84258187, "learning_rate": 3.6691342463927083e-06, "loss": 0.86473238, "num_input_tokens_seen": 37113610, "step": 1748, "time_per_iteration": 2.6373085975646973 }, { "auxiliary_loss_clip": 0.01173526, "auxiliary_loss_mlp": 0.01035403, "balance_loss_clip": 1.05430388, "balance_loss_mlp": 1.02457297, "epoch": 0.2103048157277701, "flos": 28330035914880.0, "grad_norm": 1.7652732312417194, "language_loss": 0.82074702, "learning_rate": 3.668704979436758e-06, "loss": 0.84283632, "num_input_tokens_seen": 37133705, "step": 1749, "time_per_iteration": 2.741953134536743 }, { "auxiliary_loss_clip": 0.01178513, "auxiliary_loss_mlp": 0.01033948, "balance_loss_clip": 1.05456066, "balance_loss_mlp": 1.02454281, "epoch": 0.21042505861840918, "flos": 17457290835840.0, "grad_norm": 2.5127510584124977, "language_loss": 0.78787661, "learning_rate": 3.668275459339588e-06, "loss": 0.81000119, "num_input_tokens_seen": 37152185, "step": 1750, "time_per_iteration": 2.6339824199676514 }, { "auxiliary_loss_clip": 0.01214409, "auxiliary_loss_mlp": 0.01037802, "balance_loss_clip": 1.06183457, "balance_loss_mlp": 1.02819419, "epoch": 0.21054530150904827, "flos": 14209817195520.0, "grad_norm": 2.016458935429845, "language_loss": 0.79947603, "learning_rate": 3.667845686166358e-06, "loss": 0.82199812, "num_input_tokens_seen": 37169110, "step": 1751, "time_per_iteration": 2.621464252471924 }, { "auxiliary_loss_clip": 0.01153449, "auxiliary_loss_mlp": 0.01042917, "balance_loss_clip": 1.05179548, "balance_loss_mlp": 1.03197348, "epoch": 0.21066554439968738, "flos": 18618210403200.0, "grad_norm": 1.9573107428860472, "language_loss": 0.86366576, "learning_rate": 3.6674156599822634e-06, "loss": 0.88562942, "num_input_tokens_seen": 37184905, "step": 1752, "time_per_iteration": 2.6789586544036865 }, { "auxiliary_loss_clip": 0.01158539, "auxiliary_loss_mlp": 0.01038968, "balance_loss_clip": 1.05080712, "balance_loss_mlp": 1.02892423, "epoch": 0.21078578729032646, "flos": 23658883741440.0, "grad_norm": 1.8460760120628468, "language_loss": 0.81502587, "learning_rate": 3.666985380852539e-06, "loss": 0.83700097, "num_input_tokens_seen": 37203910, "step": 1753, "time_per_iteration": 2.8073105812072754 }, { "auxiliary_loss_clip": 0.01182176, "auxiliary_loss_mlp": 0.01034425, "balance_loss_clip": 1.05574274, "balance_loss_mlp": 1.02417254, "epoch": 0.21090603018096554, "flos": 29346379240320.0, "grad_norm": 3.300729141016106, "language_loss": 0.7408824, "learning_rate": 3.6665548488424576e-06, "loss": 0.76304841, "num_input_tokens_seen": 37222670, "step": 1754, "time_per_iteration": 2.734341859817505 }, { "auxiliary_loss_clip": 0.01219819, "auxiliary_loss_mlp": 0.01038548, "balance_loss_clip": 1.06279731, "balance_loss_mlp": 1.02700293, "epoch": 0.21102627307160465, "flos": 23261245205760.0, "grad_norm": 1.7371224417138964, "language_loss": 0.87847388, "learning_rate": 3.6661240640173307e-06, "loss": 0.9010576, "num_input_tokens_seen": 37244140, "step": 1755, "time_per_iteration": 2.610875368118286 }, { "auxiliary_loss_clip": 0.01083327, "auxiliary_loss_mlp": 0.01005329, "balance_loss_clip": 1.02822673, "balance_loss_mlp": 1.0028857, "epoch": 0.21114651596224374, "flos": 54633454577280.0, "grad_norm": 0.8591838570941044, "language_loss": 0.57903075, "learning_rate": 3.6656930264425085e-06, "loss": 0.59991729, "num_input_tokens_seen": 37308185, "step": 1756, "time_per_iteration": 3.296066999435425 }, { "auxiliary_loss_clip": 0.01214418, "auxiliary_loss_mlp": 0.01034107, "balance_loss_clip": 1.06040132, "balance_loss_mlp": 1.02350914, "epoch": 0.21126675885288282, "flos": 21543314457600.0, "grad_norm": 2.161858495790297, "language_loss": 0.75665987, "learning_rate": 3.665261736183378e-06, "loss": 0.77914518, "num_input_tokens_seen": 37328220, "step": 1757, "time_per_iteration": 3.4858505725860596 }, { "auxiliary_loss_clip": 0.01177423, "auxiliary_loss_mlp": 0.010378, "balance_loss_clip": 1.06085145, "balance_loss_mlp": 1.02665997, "epoch": 0.2113870017435219, "flos": 10961876678400.0, "grad_norm": 2.5292912417616877, "language_loss": 0.887061, "learning_rate": 3.664830193305366e-06, "loss": 0.9092133, "num_input_tokens_seen": 37345995, "step": 1758, "time_per_iteration": 2.6865692138671875 }, { "auxiliary_loss_clip": 0.01163439, "auxiliary_loss_mlp": 0.01037385, "balance_loss_clip": 1.05148101, "balance_loss_mlp": 1.0252136, "epoch": 0.211507244634161, "flos": 16653825463680.0, "grad_norm": 2.4519806190875686, "language_loss": 0.77273959, "learning_rate": 3.6643983978739373e-06, "loss": 0.79474783, "num_input_tokens_seen": 37362610, "step": 1759, "time_per_iteration": 3.671708345413208 }, { "auxiliary_loss_clip": 0.01180031, "auxiliary_loss_mlp": 0.01037246, "balance_loss_clip": 1.05958152, "balance_loss_mlp": 1.02655911, "epoch": 0.2116274875248001, "flos": 20954091755520.0, "grad_norm": 2.141732757669399, "language_loss": 0.8240115, "learning_rate": 3.663966349954596e-06, "loss": 0.84618425, "num_input_tokens_seen": 37382790, "step": 1760, "time_per_iteration": 2.6791062355041504 }, { "auxiliary_loss_clip": 0.01108013, "auxiliary_loss_mlp": 0.01002311, "balance_loss_clip": 1.02802515, "balance_loss_mlp": 0.99974787, "epoch": 0.21174773041543918, "flos": 68196949424640.0, "grad_norm": 0.7923717963250839, "language_loss": 0.59683514, "learning_rate": 3.6635340496128816e-06, "loss": 0.6179384, "num_input_tokens_seen": 37439720, "step": 1761, "time_per_iteration": 4.90225625038147 }, { "auxiliary_loss_clip": 0.01151439, "auxiliary_loss_mlp": 0.01034277, "balance_loss_clip": 1.05331182, "balance_loss_mlp": 1.02493107, "epoch": 0.2118679733060783, "flos": 20668315150080.0, "grad_norm": 2.625622024097744, "language_loss": 0.92715335, "learning_rate": 3.6631014969143747e-06, "loss": 0.94901049, "num_input_tokens_seen": 37459410, "step": 1762, "time_per_iteration": 2.727818727493286 }, { "auxiliary_loss_clip": 0.01200936, "auxiliary_loss_mlp": 0.01032751, "balance_loss_clip": 1.0620116, "balance_loss_mlp": 1.02204633, "epoch": 0.21198821619671737, "flos": 23223431162880.0, "grad_norm": 1.8860442645353335, "language_loss": 0.88795447, "learning_rate": 3.662668691924693e-06, "loss": 0.91029125, "num_input_tokens_seen": 37480460, "step": 1763, "time_per_iteration": 2.6984100341796875 }, { "auxiliary_loss_clip": 0.01169212, "auxiliary_loss_mlp": 0.01038787, "balance_loss_clip": 1.05488932, "balance_loss_mlp": 1.02728963, "epoch": 0.21210845908735645, "flos": 24498547044480.0, "grad_norm": 1.9561302674551908, "language_loss": 0.70958638, "learning_rate": 3.6622356347094927e-06, "loss": 0.73166645, "num_input_tokens_seen": 37502025, "step": 1764, "time_per_iteration": 2.7697904109954834 }, { "auxiliary_loss_clip": 0.01171342, "auxiliary_loss_mlp": 0.01041509, "balance_loss_clip": 1.05306935, "balance_loss_mlp": 1.02965963, "epoch": 0.21222870197799554, "flos": 27089789160960.0, "grad_norm": 1.9054738172142327, "language_loss": 0.78225625, "learning_rate": 3.6618023253344684e-06, "loss": 0.80438471, "num_input_tokens_seen": 37520885, "step": 1765, "time_per_iteration": 2.73400616645813 }, { "auxiliary_loss_clip": 0.01198875, "auxiliary_loss_mlp": 0.01038019, "balance_loss_clip": 1.05706179, "balance_loss_mlp": 1.02733743, "epoch": 0.21234894486863465, "flos": 16873850223360.0, "grad_norm": 1.775025470344949, "language_loss": 0.83533955, "learning_rate": 3.6613687638653527e-06, "loss": 0.85770845, "num_input_tokens_seen": 37539055, "step": 1766, "time_per_iteration": 2.6351661682128906 }, { "auxiliary_loss_clip": 0.01183042, "auxiliary_loss_mlp": 0.01030063, "balance_loss_clip": 1.06089258, "balance_loss_mlp": 1.01999593, "epoch": 0.21246918775927373, "flos": 23474949171840.0, "grad_norm": 2.4108681728164245, "language_loss": 0.77835119, "learning_rate": 3.660934950367916e-06, "loss": 0.80048227, "num_input_tokens_seen": 37558300, "step": 1767, "time_per_iteration": 2.714970588684082 }, { "auxiliary_loss_clip": 0.01199211, "auxiliary_loss_mlp": 0.01034358, "balance_loss_clip": 1.05767524, "balance_loss_mlp": 1.02420735, "epoch": 0.21258943064991281, "flos": 22382295402240.0, "grad_norm": 2.6204232658594635, "language_loss": 0.83423209, "learning_rate": 3.660500884907968e-06, "loss": 0.85656774, "num_input_tokens_seen": 37579040, "step": 1768, "time_per_iteration": 2.6279985904693604 }, { "auxiliary_loss_clip": 0.01065435, "auxiliary_loss_mlp": 0.01011638, "balance_loss_clip": 1.02263594, "balance_loss_mlp": 1.00930142, "epoch": 0.21270967354055192, "flos": 59440168679040.0, "grad_norm": 0.8262394958997549, "language_loss": 0.60001528, "learning_rate": 3.660066567551356e-06, "loss": 0.62078607, "num_input_tokens_seen": 37639185, "step": 1769, "time_per_iteration": 3.184523344039917 }, { "auxiliary_loss_clip": 0.0119885, "auxiliary_loss_mlp": 0.0076563, "balance_loss_clip": 1.05792427, "balance_loss_mlp": 1.00061297, "epoch": 0.212829916431191, "flos": 21544032729600.0, "grad_norm": 4.115122315992354, "language_loss": 0.84261167, "learning_rate": 3.6596319983639657e-06, "loss": 0.86225641, "num_input_tokens_seen": 37657765, "step": 1770, "time_per_iteration": 2.737048625946045 }, { "auxiliary_loss_clip": 0.01171767, "auxiliary_loss_mlp": 0.00765331, "balance_loss_clip": 1.05763376, "balance_loss_mlp": 1.00067925, "epoch": 0.2129501593218301, "flos": 28987739896320.0, "grad_norm": 1.5651411987687167, "language_loss": 0.86200804, "learning_rate": 3.6591971774117214e-06, "loss": 0.88137901, "num_input_tokens_seen": 37680740, "step": 1771, "time_per_iteration": 2.820146322250366 }, { "auxiliary_loss_clip": 0.01205188, "auxiliary_loss_mlp": 0.01039564, "balance_loss_clip": 1.06016827, "balance_loss_mlp": 1.02938986, "epoch": 0.2130704022124692, "flos": 18806993308800.0, "grad_norm": 2.2522321738925877, "language_loss": 0.80182433, "learning_rate": 3.6587621047605833e-06, "loss": 0.82427192, "num_input_tokens_seen": 37697910, "step": 1772, "time_per_iteration": 2.6427388191223145 }, { "auxiliary_loss_clip": 0.01200058, "auxiliary_loss_mlp": 0.01038599, "balance_loss_clip": 1.05875921, "balance_loss_mlp": 1.0283891, "epoch": 0.21319064510310828, "flos": 13918150759680.0, "grad_norm": 2.160865934651858, "language_loss": 0.86939013, "learning_rate": 3.6583267804765542e-06, "loss": 0.89177668, "num_input_tokens_seen": 37712245, "step": 1773, "time_per_iteration": 2.627722978591919 }, { "auxiliary_loss_clip": 0.01200507, "auxiliary_loss_mlp": 0.01037061, "balance_loss_clip": 1.05978048, "balance_loss_mlp": 1.02560461, "epoch": 0.21331088799374737, "flos": 20959694277120.0, "grad_norm": 2.4920913340649045, "language_loss": 0.85753095, "learning_rate": 3.6578912046256702e-06, "loss": 0.87990665, "num_input_tokens_seen": 37730765, "step": 1774, "time_per_iteration": 2.6075780391693115 }, { "auxiliary_loss_clip": 0.01163427, "auxiliary_loss_mlp": 0.01040636, "balance_loss_clip": 1.05191159, "balance_loss_mlp": 1.02983594, "epoch": 0.21343113088438645, "flos": 18624638937600.0, "grad_norm": 2.828633540206646, "language_loss": 0.76166302, "learning_rate": 3.6574553772740083e-06, "loss": 0.78370363, "num_input_tokens_seen": 37748695, "step": 1775, "time_per_iteration": 2.780250072479248 }, { "auxiliary_loss_clip": 0.01126524, "auxiliary_loss_mlp": 0.01004296, "balance_loss_clip": 1.05086899, "balance_loss_mlp": 1.00160217, "epoch": 0.21355137377502556, "flos": 67413128791680.0, "grad_norm": 0.8818462000973489, "language_loss": 0.6185379, "learning_rate": 3.657019298487684e-06, "loss": 0.63984609, "num_input_tokens_seen": 37813705, "step": 1776, "time_per_iteration": 3.2535343170166016 }, { "auxiliary_loss_clip": 0.01205738, "auxiliary_loss_mlp": 0.00764951, "balance_loss_clip": 1.05784142, "balance_loss_mlp": 1.00067043, "epoch": 0.21367161666566464, "flos": 34532095697280.0, "grad_norm": 1.9899987500529634, "language_loss": 0.83646441, "learning_rate": 3.6565829683328495e-06, "loss": 0.85617125, "num_input_tokens_seen": 37836330, "step": 1777, "time_per_iteration": 2.736564874649048 }, { "auxiliary_loss_clip": 0.01198493, "auxiliary_loss_mlp": 0.01045615, "balance_loss_clip": 1.05993319, "balance_loss_mlp": 1.03530324, "epoch": 0.21379185955630373, "flos": 18989347680000.0, "grad_norm": 1.9991778811840666, "language_loss": 0.86167383, "learning_rate": 3.6561463868756965e-06, "loss": 0.88411492, "num_input_tokens_seen": 37855030, "step": 1778, "time_per_iteration": 2.649639129638672 }, { "auxiliary_loss_clip": 0.01201537, "auxiliary_loss_mlp": 0.01034728, "balance_loss_clip": 1.06206787, "balance_loss_mlp": 1.02404094, "epoch": 0.21391210244694284, "flos": 28218497207040.0, "grad_norm": 1.5542390934085208, "language_loss": 0.77976918, "learning_rate": 3.655709554182452e-06, "loss": 0.80213183, "num_input_tokens_seen": 37875370, "step": 1779, "time_per_iteration": 2.692957639694214 }, { "auxiliary_loss_clip": 0.01205198, "auxiliary_loss_mlp": 0.01036121, "balance_loss_clip": 1.0592016, "balance_loss_mlp": 1.02571368, "epoch": 0.21403234533758192, "flos": 17455064192640.0, "grad_norm": 1.8699029661439395, "language_loss": 0.84632224, "learning_rate": 3.6552724703193855e-06, "loss": 0.86873543, "num_input_tokens_seen": 37892560, "step": 1780, "time_per_iteration": 2.6412971019744873 }, { "auxiliary_loss_clip": 0.01065691, "auxiliary_loss_mlp": 0.01006816, "balance_loss_clip": 1.02148199, "balance_loss_mlp": 1.00420499, "epoch": 0.214152588228221, "flos": 51637606686720.0, "grad_norm": 0.7951587210304365, "language_loss": 0.55895865, "learning_rate": 3.654835135352801e-06, "loss": 0.57968372, "num_input_tokens_seen": 37947370, "step": 1781, "time_per_iteration": 3.189809799194336 }, { "auxiliary_loss_clip": 0.01155183, "auxiliary_loss_mlp": 0.01034452, "balance_loss_clip": 1.04953837, "balance_loss_mlp": 1.02385986, "epoch": 0.21427283111886009, "flos": 19496154625920.0, "grad_norm": 1.994049338517953, "language_loss": 0.87541991, "learning_rate": 3.654397549349043e-06, "loss": 0.89731628, "num_input_tokens_seen": 37964745, "step": 1782, "time_per_iteration": 2.7551605701446533 }, { "auxiliary_loss_clip": 0.01182525, "auxiliary_loss_mlp": 0.01043308, "balance_loss_clip": 1.05849504, "balance_loss_mlp": 1.03213215, "epoch": 0.2143930740094992, "flos": 20084802710400.0, "grad_norm": 2.423530107588807, "language_loss": 0.75310594, "learning_rate": 3.653959712374491e-06, "loss": 0.77536428, "num_input_tokens_seen": 37982850, "step": 1783, "time_per_iteration": 3.6156795024871826 }, { "auxiliary_loss_clip": 0.01168226, "auxiliary_loss_mlp": 0.01039742, "balance_loss_clip": 1.05939507, "balance_loss_mlp": 1.0295918, "epoch": 0.21451331690013828, "flos": 21798603394560.0, "grad_norm": 1.64401731165414, "language_loss": 0.82592952, "learning_rate": 3.6535216244955663e-06, "loss": 0.84800923, "num_input_tokens_seen": 38002745, "step": 1784, "time_per_iteration": 2.692180633544922 }, { "auxiliary_loss_clip": 0.01185031, "auxiliary_loss_mlp": 0.01035144, "balance_loss_clip": 1.05687666, "balance_loss_mlp": 1.02440894, "epoch": 0.21463355979077736, "flos": 32853882412800.0, "grad_norm": 7.034347853325655, "language_loss": 0.71055663, "learning_rate": 3.653083285778726e-06, "loss": 0.7327584, "num_input_tokens_seen": 38024115, "step": 1785, "time_per_iteration": 2.826132297515869 }, { "auxiliary_loss_clip": 0.01202823, "auxiliary_loss_mlp": 0.01035837, "balance_loss_clip": 1.05730963, "balance_loss_mlp": 1.02522182, "epoch": 0.21475380268141647, "flos": 21543817248000.0, "grad_norm": 2.501163561153789, "language_loss": 0.81216162, "learning_rate": 3.6526446962904653e-06, "loss": 0.83454823, "num_input_tokens_seen": 38042830, "step": 1786, "time_per_iteration": 3.622807502746582 }, { "auxiliary_loss_clip": 0.01198403, "auxiliary_loss_mlp": 0.01033478, "balance_loss_clip": 1.06112254, "balance_loss_mlp": 1.02324426, "epoch": 0.21487404557205556, "flos": 32159082660480.0, "grad_norm": 1.7009124734797032, "language_loss": 0.74526447, "learning_rate": 3.652205856097318e-06, "loss": 0.76758331, "num_input_tokens_seen": 38066015, "step": 1787, "time_per_iteration": 3.643477201461792 }, { "auxiliary_loss_clip": 0.01182967, "auxiliary_loss_mlp": 0.00765432, "balance_loss_clip": 1.05683017, "balance_loss_mlp": 1.00067806, "epoch": 0.21499428846269464, "flos": 12673091583360.0, "grad_norm": 1.9815090072320514, "language_loss": 0.79203022, "learning_rate": 3.651766765265856e-06, "loss": 0.81151414, "num_input_tokens_seen": 38083025, "step": 1788, "time_per_iteration": 3.6836748123168945 }, { "auxiliary_loss_clip": 0.0118162, "auxiliary_loss_mlp": 0.01037419, "balance_loss_clip": 1.05590022, "balance_loss_mlp": 1.02655959, "epoch": 0.21511453135333372, "flos": 23471573293440.0, "grad_norm": 2.260235776268623, "language_loss": 0.81325787, "learning_rate": 3.65132742386269e-06, "loss": 0.83544827, "num_input_tokens_seen": 38098245, "step": 1789, "time_per_iteration": 2.6759603023529053 }, { "auxiliary_loss_clip": 0.01217335, "auxiliary_loss_mlp": 0.01036549, "balance_loss_clip": 1.06120515, "balance_loss_mlp": 1.02639842, "epoch": 0.21523477424397283, "flos": 26943560893440.0, "grad_norm": 1.7301039237285534, "language_loss": 0.8429172, "learning_rate": 3.6508878319544656e-06, "loss": 0.8654561, "num_input_tokens_seen": 38118460, "step": 1790, "time_per_iteration": 2.6598570346832275 }, { "auxiliary_loss_clip": 0.01182792, "auxiliary_loss_mlp": 0.01034613, "balance_loss_clip": 1.0588696, "balance_loss_mlp": 1.02382469, "epoch": 0.21535501713461191, "flos": 18916161719040.0, "grad_norm": 3.633145880823816, "language_loss": 0.82070619, "learning_rate": 3.65044798960787e-06, "loss": 0.84288025, "num_input_tokens_seen": 38136800, "step": 1791, "time_per_iteration": 2.723137617111206 }, { "auxiliary_loss_clip": 0.01160693, "auxiliary_loss_mlp": 0.01037635, "balance_loss_clip": 1.05145836, "balance_loss_mlp": 1.02770519, "epoch": 0.215475260025251, "flos": 17895113712000.0, "grad_norm": 2.441683093458962, "language_loss": 0.78109038, "learning_rate": 3.650007896889627e-06, "loss": 0.80307364, "num_input_tokens_seen": 38155380, "step": 1792, "time_per_iteration": 2.72836971282959 }, { "auxiliary_loss_clip": 0.01213585, "auxiliary_loss_mlp": 0.01036118, "balance_loss_clip": 1.06141329, "balance_loss_mlp": 1.02560985, "epoch": 0.2155955029158901, "flos": 16654292340480.0, "grad_norm": 1.800681664534201, "language_loss": 0.8063162, "learning_rate": 3.6495675538664974e-06, "loss": 0.8288132, "num_input_tokens_seen": 38174395, "step": 1793, "time_per_iteration": 2.6715281009674072 }, { "auxiliary_loss_clip": 0.01184846, "auxiliary_loss_mlp": 0.01034792, "balance_loss_clip": 1.0541935, "balance_loss_mlp": 1.0249567, "epoch": 0.2157157458065292, "flos": 23621213352960.0, "grad_norm": 2.6696899005338732, "language_loss": 0.82647794, "learning_rate": 3.649126960605282e-06, "loss": 0.8486743, "num_input_tokens_seen": 38195380, "step": 1794, "time_per_iteration": 2.708662748336792 }, { "auxiliary_loss_clip": 0.01184817, "auxiliary_loss_mlp": 0.01036875, "balance_loss_clip": 1.05817842, "balance_loss_mlp": 1.02605665, "epoch": 0.21583598869716827, "flos": 22127078292480.0, "grad_norm": 3.750168991725045, "language_loss": 0.83363801, "learning_rate": 3.6486861171728174e-06, "loss": 0.85585493, "num_input_tokens_seen": 38213775, "step": 1795, "time_per_iteration": 2.8233165740966797 }, { "auxiliary_loss_clip": 0.01172224, "auxiliary_loss_mlp": 0.01032972, "balance_loss_clip": 1.05344927, "balance_loss_mlp": 1.02187908, "epoch": 0.21595623158780738, "flos": 23441229279360.0, "grad_norm": 2.7320547901854404, "language_loss": 0.78621161, "learning_rate": 3.6482450236359803e-06, "loss": 0.80826354, "num_input_tokens_seen": 38235630, "step": 1796, "time_per_iteration": 2.6944985389709473 }, { "auxiliary_loss_clip": 0.01197818, "auxiliary_loss_mlp": 0.01030602, "balance_loss_clip": 1.06018043, "balance_loss_mlp": 1.02004659, "epoch": 0.21607647447844647, "flos": 26906501036160.0, "grad_norm": 2.5024859229128062, "language_loss": 0.78098518, "learning_rate": 3.647803680061683e-06, "loss": 0.80326939, "num_input_tokens_seen": 38256045, "step": 1797, "time_per_iteration": 2.7337169647216797 }, { "auxiliary_loss_clip": 0.0118774, "auxiliary_loss_mlp": 0.01033692, "balance_loss_clip": 1.05802274, "balance_loss_mlp": 1.02299309, "epoch": 0.21619671736908555, "flos": 14495378319360.0, "grad_norm": 2.558051470818116, "language_loss": 0.74255008, "learning_rate": 3.6473620865168776e-06, "loss": 0.76476437, "num_input_tokens_seen": 38272915, "step": 1798, "time_per_iteration": 2.627615213394165 }, { "auxiliary_loss_clip": 0.01183354, "auxiliary_loss_mlp": 0.01031239, "balance_loss_clip": 1.05707574, "balance_loss_mlp": 1.02084398, "epoch": 0.21631696025972463, "flos": 17931096161280.0, "grad_norm": 2.2240906913083527, "language_loss": 0.81940365, "learning_rate": 3.646920243068554e-06, "loss": 0.84154958, "num_input_tokens_seen": 38290810, "step": 1799, "time_per_iteration": 2.692553758621216 }, { "auxiliary_loss_clip": 0.01176273, "auxiliary_loss_mlp": 0.01042717, "balance_loss_clip": 1.05753756, "balance_loss_mlp": 1.03141618, "epoch": 0.21643720315036374, "flos": 24462385027200.0, "grad_norm": 2.0922270667759757, "language_loss": 0.74491096, "learning_rate": 3.6464781497837384e-06, "loss": 0.76710081, "num_input_tokens_seen": 38312785, "step": 1800, "time_per_iteration": 2.691991090774536 }, { "auxiliary_loss_clip": 0.01187807, "auxiliary_loss_mlp": 0.01040894, "balance_loss_clip": 1.05440569, "balance_loss_mlp": 1.03027844, "epoch": 0.21655744604100283, "flos": 28474432588800.0, "grad_norm": 3.022631748535448, "language_loss": 0.73073852, "learning_rate": 3.6460358067294965e-06, "loss": 0.75302553, "num_input_tokens_seen": 38334015, "step": 1801, "time_per_iteration": 2.7114710807800293 }, { "auxiliary_loss_clip": 0.01218217, "auxiliary_loss_mlp": 0.0103646, "balance_loss_clip": 1.05979252, "balance_loss_mlp": 1.02537394, "epoch": 0.2166776889316419, "flos": 20152960767360.0, "grad_norm": 2.2965163126684023, "language_loss": 0.78137761, "learning_rate": 3.645593213972932e-06, "loss": 0.80392432, "num_input_tokens_seen": 38352920, "step": 1802, "time_per_iteration": 2.603483200073242 }, { "auxiliary_loss_clip": 0.0119552, "auxiliary_loss_mlp": 0.01037419, "balance_loss_clip": 1.0588398, "balance_loss_mlp": 1.02690482, "epoch": 0.21679793182228102, "flos": 15193482122880.0, "grad_norm": 2.484984148965709, "language_loss": 0.80256057, "learning_rate": 3.6451503715811852e-06, "loss": 0.82489002, "num_input_tokens_seen": 38371230, "step": 1803, "time_per_iteration": 2.6477551460266113 }, { "auxiliary_loss_clip": 0.01188069, "auxiliary_loss_mlp": 0.01037016, "balance_loss_clip": 1.06076527, "balance_loss_mlp": 1.02724123, "epoch": 0.2169181747129201, "flos": 17384464010880.0, "grad_norm": 2.2386802417569163, "language_loss": 0.80138993, "learning_rate": 3.6447072796214345e-06, "loss": 0.82364082, "num_input_tokens_seen": 38389795, "step": 1804, "time_per_iteration": 2.671994209289551 }, { "auxiliary_loss_clip": 0.01064049, "auxiliary_loss_mlp": 0.01019514, "balance_loss_clip": 1.02499199, "balance_loss_mlp": 1.01699913, "epoch": 0.21703841760355919, "flos": 58760955429120.0, "grad_norm": 1.0807393904872657, "language_loss": 0.63187885, "learning_rate": 3.644263938160898e-06, "loss": 0.65271449, "num_input_tokens_seen": 38445760, "step": 1805, "time_per_iteration": 3.1721842288970947 }, { "auxiliary_loss_clip": 0.01167083, "auxiliary_loss_mlp": 0.01036652, "balance_loss_clip": 1.05634451, "balance_loss_mlp": 1.0255897, "epoch": 0.21715866049419827, "flos": 22418457419520.0, "grad_norm": 1.9907323830796033, "language_loss": 0.7196517, "learning_rate": 3.6438203472668293e-06, "loss": 0.74168909, "num_input_tokens_seen": 38465405, "step": 1806, "time_per_iteration": 2.7546093463897705 }, { "auxiliary_loss_clip": 0.01190949, "auxiliary_loss_mlp": 0.01032793, "balance_loss_clip": 1.06079054, "balance_loss_mlp": 1.0232507, "epoch": 0.21727890338483738, "flos": 17237732952960.0, "grad_norm": 2.146204592793586, "language_loss": 0.81514049, "learning_rate": 3.6433765070065206e-06, "loss": 0.83737791, "num_input_tokens_seen": 38483195, "step": 1807, "time_per_iteration": 2.6538589000701904 }, { "auxiliary_loss_clip": 0.0121787, "auxiliary_loss_mlp": 0.01034274, "balance_loss_clip": 1.06219339, "balance_loss_mlp": 1.02356303, "epoch": 0.21739914627547646, "flos": 13434792416640.0, "grad_norm": 4.626795731050862, "language_loss": 0.87786114, "learning_rate": 3.6429324174473025e-06, "loss": 0.90038252, "num_input_tokens_seen": 38496735, "step": 1808, "time_per_iteration": 2.6576898097991943 }, { "auxiliary_loss_clip": 0.01201369, "auxiliary_loss_mlp": 0.01038042, "balance_loss_clip": 1.05789018, "balance_loss_mlp": 1.02799869, "epoch": 0.21751938916611555, "flos": 20959514709120.0, "grad_norm": 2.5281077565170125, "language_loss": 0.8477757, "learning_rate": 3.6424880786565425e-06, "loss": 0.87016976, "num_input_tokens_seen": 38512880, "step": 1809, "time_per_iteration": 3.558098554611206 }, { "auxiliary_loss_clip": 0.01155044, "auxiliary_loss_mlp": 0.01035655, "balance_loss_clip": 1.05775809, "balance_loss_mlp": 1.02371073, "epoch": 0.21763963205675466, "flos": 27599936071680.0, "grad_norm": 2.1673442240153644, "language_loss": 0.79798645, "learning_rate": 3.6420434907016482e-06, "loss": 0.81989342, "num_input_tokens_seen": 38532570, "step": 1810, "time_per_iteration": 2.8068535327911377 }, { "auxiliary_loss_clip": 0.01203231, "auxiliary_loss_mlp": 0.0103937, "balance_loss_clip": 1.0630368, "balance_loss_mlp": 1.02945769, "epoch": 0.21775987494739374, "flos": 21430411032960.0, "grad_norm": 4.843516819571492, "language_loss": 0.81219363, "learning_rate": 3.6415986536500606e-06, "loss": 0.83461964, "num_input_tokens_seen": 38550900, "step": 1811, "time_per_iteration": 2.800466775894165 }, { "auxiliary_loss_clip": 0.01153202, "auxiliary_loss_mlp": 0.01035284, "balance_loss_clip": 1.0614264, "balance_loss_mlp": 1.02558589, "epoch": 0.21788011783803282, "flos": 18332972501760.0, "grad_norm": 1.7362865144594841, "language_loss": 0.8086834, "learning_rate": 3.641153567569263e-06, "loss": 0.83056831, "num_input_tokens_seen": 38569215, "step": 1812, "time_per_iteration": 3.638063430786133 }, { "auxiliary_loss_clip": 0.01196622, "auxiliary_loss_mlp": 0.01036012, "balance_loss_clip": 1.05865788, "balance_loss_mlp": 1.0262723, "epoch": 0.2180003607286719, "flos": 30262748037120.0, "grad_norm": 1.9978458423470355, "language_loss": 0.9555766, "learning_rate": 3.640708232526774e-06, "loss": 0.97790295, "num_input_tokens_seen": 38587870, "step": 1813, "time_per_iteration": 3.5974247455596924 }, { "auxiliary_loss_clip": 0.01139455, "auxiliary_loss_mlp": 0.01037194, "balance_loss_clip": 1.04905963, "balance_loss_mlp": 1.02642906, "epoch": 0.21812060361931102, "flos": 25480272637440.0, "grad_norm": 1.8828328384306876, "language_loss": 0.78319716, "learning_rate": 3.6402626485901504e-06, "loss": 0.80496359, "num_input_tokens_seen": 38606965, "step": 1814, "time_per_iteration": 3.70356822013855 }, { "auxiliary_loss_clip": 0.01202218, "auxiliary_loss_mlp": 0.01035535, "balance_loss_clip": 1.06582224, "balance_loss_mlp": 1.02595043, "epoch": 0.2182408465099501, "flos": 21908166854400.0, "grad_norm": 2.6568164766562585, "language_loss": 0.78020823, "learning_rate": 3.639816815826988e-06, "loss": 0.80258572, "num_input_tokens_seen": 38626290, "step": 1815, "time_per_iteration": 2.6984000205993652 }, { "auxiliary_loss_clip": 0.01184383, "auxiliary_loss_mlp": 0.01031229, "balance_loss_clip": 1.05909717, "balance_loss_mlp": 1.02156734, "epoch": 0.21836108940058918, "flos": 23657339456640.0, "grad_norm": 2.4899170986942156, "language_loss": 0.78029096, "learning_rate": 3.6393707343049176e-06, "loss": 0.80244708, "num_input_tokens_seen": 38646620, "step": 1816, "time_per_iteration": 2.7015702724456787 }, { "auxiliary_loss_clip": 0.0120664, "auxiliary_loss_mlp": 0.01041559, "balance_loss_clip": 1.06096768, "balance_loss_mlp": 1.03080606, "epoch": 0.2184813322912283, "flos": 24681009156480.0, "grad_norm": 4.017228392060421, "language_loss": 0.73605734, "learning_rate": 3.6389244040916104e-06, "loss": 0.75853932, "num_input_tokens_seen": 38665695, "step": 1817, "time_per_iteration": 2.6307883262634277 }, { "auxiliary_loss_clip": 0.01181048, "auxiliary_loss_mlp": 0.00765581, "balance_loss_clip": 1.06063044, "balance_loss_mlp": 1.00070786, "epoch": 0.21860157518186737, "flos": 26574650259840.0, "grad_norm": 3.114167499833823, "language_loss": 0.79104865, "learning_rate": 3.6384778252547747e-06, "loss": 0.81051493, "num_input_tokens_seen": 38681575, "step": 1818, "time_per_iteration": 2.775829315185547 }, { "auxiliary_loss_clip": 0.01189007, "auxiliary_loss_mlp": 0.00765429, "balance_loss_clip": 1.06454086, "balance_loss_mlp": 1.00079155, "epoch": 0.21872181807250646, "flos": 20886292834560.0, "grad_norm": 3.6375560423266053, "language_loss": 0.78562129, "learning_rate": 3.638030997862155e-06, "loss": 0.80516565, "num_input_tokens_seen": 38700510, "step": 1819, "time_per_iteration": 2.690314292907715 }, { "auxiliary_loss_clip": 0.01085016, "auxiliary_loss_mlp": 0.01004354, "balance_loss_clip": 1.02834165, "balance_loss_mlp": 1.00193429, "epoch": 0.21884206096314554, "flos": 61209452897280.0, "grad_norm": 0.7593087947221429, "language_loss": 0.59430015, "learning_rate": 3.6375839219815356e-06, "loss": 0.61519384, "num_input_tokens_seen": 38758310, "step": 1820, "time_per_iteration": 3.1798441410064697 }, { "auxiliary_loss_clip": 0.01220578, "auxiliary_loss_mlp": 0.01041838, "balance_loss_clip": 1.06383872, "balance_loss_mlp": 1.031389, "epoch": 0.21896230385378465, "flos": 23473835850240.0, "grad_norm": 2.1259965951760345, "language_loss": 0.82796824, "learning_rate": 3.6371365976807375e-06, "loss": 0.85059237, "num_input_tokens_seen": 38778705, "step": 1821, "time_per_iteration": 2.6662960052490234 }, { "auxiliary_loss_clip": 0.01149639, "auxiliary_loss_mlp": 0.0103986, "balance_loss_clip": 1.05478394, "balance_loss_mlp": 1.02978063, "epoch": 0.21908254674442373, "flos": 25081915829760.0, "grad_norm": 1.8358495279171585, "language_loss": 0.83423477, "learning_rate": 3.6366890250276185e-06, "loss": 0.85612977, "num_input_tokens_seen": 38799660, "step": 1822, "time_per_iteration": 2.80729341506958 }, { "auxiliary_loss_clip": 0.01218015, "auxiliary_loss_mlp": 0.01034632, "balance_loss_clip": 1.06483722, "balance_loss_mlp": 1.02491689, "epoch": 0.21920278963506282, "flos": 23513768795520.0, "grad_norm": 2.8727458937532027, "language_loss": 0.90055668, "learning_rate": 3.6362412040900764e-06, "loss": 0.92308313, "num_input_tokens_seen": 38819450, "step": 1823, "time_per_iteration": 2.6766951084136963 }, { "auxiliary_loss_clip": 0.01205425, "auxiliary_loss_mlp": 0.0103608, "balance_loss_clip": 1.06215954, "balance_loss_mlp": 1.025828, "epoch": 0.21932303252570193, "flos": 29242238734080.0, "grad_norm": 2.24888374340237, "language_loss": 0.80171275, "learning_rate": 3.635793134936044e-06, "loss": 0.82412779, "num_input_tokens_seen": 38840460, "step": 1824, "time_per_iteration": 2.71860671043396 }, { "auxiliary_loss_clip": 0.01204379, "auxiliary_loss_mlp": 0.01039996, "balance_loss_clip": 1.06342423, "balance_loss_mlp": 1.0302093, "epoch": 0.219443275416341, "flos": 20806857907200.0, "grad_norm": 1.9938930529238397, "language_loss": 0.73317885, "learning_rate": 3.635344817633494e-06, "loss": 0.75562263, "num_input_tokens_seen": 38859775, "step": 1825, "time_per_iteration": 2.67822265625 }, { "auxiliary_loss_clip": 0.01197397, "auxiliary_loss_mlp": 0.01031107, "balance_loss_clip": 1.06078017, "balance_loss_mlp": 1.02095604, "epoch": 0.2195635183069801, "flos": 14501555458560.0, "grad_norm": 2.8461982774270522, "language_loss": 0.75604427, "learning_rate": 3.634896252250436e-06, "loss": 0.77832925, "num_input_tokens_seen": 38876540, "step": 1826, "time_per_iteration": 2.6497950553894043 }, { "auxiliary_loss_clip": 0.01223444, "auxiliary_loss_mlp": 0.01038942, "balance_loss_clip": 1.06663823, "balance_loss_mlp": 1.02822495, "epoch": 0.2196837611976192, "flos": 24243473589120.0, "grad_norm": 2.058806167910548, "language_loss": 0.82046247, "learning_rate": 3.6344474388549157e-06, "loss": 0.84308636, "num_input_tokens_seen": 38896195, "step": 1827, "time_per_iteration": 2.672501564025879 }, { "auxiliary_loss_clip": 0.01207842, "auxiliary_loss_mlp": 0.01034433, "balance_loss_clip": 1.06610966, "balance_loss_mlp": 1.02369857, "epoch": 0.2198040040882583, "flos": 18074523168000.0, "grad_norm": 2.8822020888149535, "language_loss": 0.80662358, "learning_rate": 3.6339983775150183e-06, "loss": 0.82904631, "num_input_tokens_seen": 38912755, "step": 1828, "time_per_iteration": 2.6545498371124268 }, { "auxiliary_loss_clip": 0.01203008, "auxiliary_loss_mlp": 0.01034707, "balance_loss_clip": 1.06423676, "balance_loss_mlp": 1.02472353, "epoch": 0.21992424697889737, "flos": 17784185535360.0, "grad_norm": 2.665058451746, "language_loss": 0.84304738, "learning_rate": 3.6335490682988664e-06, "loss": 0.86542451, "num_input_tokens_seen": 38928365, "step": 1829, "time_per_iteration": 2.6310906410217285 }, { "auxiliary_loss_clip": 0.01133752, "auxiliary_loss_mlp": 0.01035736, "balance_loss_clip": 1.05035019, "balance_loss_mlp": 1.02500141, "epoch": 0.22004448986953645, "flos": 17638495971840.0, "grad_norm": 2.3781619191960974, "language_loss": 0.82762015, "learning_rate": 3.63309951127462e-06, "loss": 0.84931505, "num_input_tokens_seen": 38945275, "step": 1830, "time_per_iteration": 2.841913938522339 }, { "auxiliary_loss_clip": 0.01173402, "auxiliary_loss_mlp": 0.01040144, "balance_loss_clip": 1.05869031, "balance_loss_mlp": 1.02973652, "epoch": 0.22016473276017556, "flos": 22275533203200.0, "grad_norm": 2.963626019553092, "language_loss": 0.75262737, "learning_rate": 3.6326497065104757e-06, "loss": 0.77476287, "num_input_tokens_seen": 38965740, "step": 1831, "time_per_iteration": 2.75930118560791 }, { "auxiliary_loss_clip": 0.01205529, "auxiliary_loss_mlp": 0.0103985, "balance_loss_clip": 1.06212187, "balance_loss_mlp": 1.02921653, "epoch": 0.22028497565081465, "flos": 25556259859200.0, "grad_norm": 2.3870122101065574, "language_loss": 0.7829532, "learning_rate": 3.6321996540746697e-06, "loss": 0.80540693, "num_input_tokens_seen": 38984815, "step": 1832, "time_per_iteration": 2.737990617752075 }, { "auxiliary_loss_clip": 0.01169517, "auxiliary_loss_mlp": 0.01045849, "balance_loss_clip": 1.05675757, "balance_loss_mlp": 1.03504848, "epoch": 0.22040521854145373, "flos": 36247332925440.0, "grad_norm": 1.9085573774356985, "language_loss": 0.79970318, "learning_rate": 3.6317493540354733e-06, "loss": 0.82185686, "num_input_tokens_seen": 39008230, "step": 1833, "time_per_iteration": 2.8235697746276855 }, { "auxiliary_loss_clip": 0.01194353, "auxiliary_loss_mlp": 0.01040195, "balance_loss_clip": 1.05787289, "balance_loss_mlp": 1.02921546, "epoch": 0.22052546143209284, "flos": 11838420270720.0, "grad_norm": 2.0020641396983048, "language_loss": 0.7672444, "learning_rate": 3.6312988064611976e-06, "loss": 0.78958988, "num_input_tokens_seen": 39026540, "step": 1834, "time_per_iteration": 2.677777051925659 }, { "auxiliary_loss_clip": 0.011702, "auxiliary_loss_mlp": 0.0103056, "balance_loss_clip": 1.05270791, "balance_loss_mlp": 1.02067721, "epoch": 0.22064570432273192, "flos": 24209250906240.0, "grad_norm": 2.7130221828710943, "language_loss": 0.81256092, "learning_rate": 3.6308480114201896e-06, "loss": 0.8345685, "num_input_tokens_seen": 39048460, "step": 1835, "time_per_iteration": 3.659214496612549 }, { "auxiliary_loss_clip": 0.01219847, "auxiliary_loss_mlp": 0.01036724, "balance_loss_clip": 1.06618309, "balance_loss_mlp": 1.02655578, "epoch": 0.220765947213371, "flos": 17931347556480.0, "grad_norm": 1.9767578058702866, "language_loss": 0.76200628, "learning_rate": 3.630396968980835e-06, "loss": 0.78457201, "num_input_tokens_seen": 39066335, "step": 1836, "time_per_iteration": 2.6587564945220947 }, { "auxiliary_loss_clip": 0.0118689, "auxiliary_loss_mlp": 0.01031672, "balance_loss_clip": 1.05949306, "balance_loss_mlp": 1.02174211, "epoch": 0.2208861901040101, "flos": 26757040544640.0, "grad_norm": 2.8360199717239745, "language_loss": 0.83389807, "learning_rate": 3.6299456792115575e-06, "loss": 0.85608375, "num_input_tokens_seen": 39087590, "step": 1837, "time_per_iteration": 2.735236406326294 }, { "auxiliary_loss_clip": 0.01109269, "auxiliary_loss_mlp": 0.01040808, "balance_loss_clip": 1.0451498, "balance_loss_mlp": 1.03068089, "epoch": 0.2210064329946492, "flos": 17817977255040.0, "grad_norm": 2.1433344494949043, "language_loss": 0.80947232, "learning_rate": 3.629494142180815e-06, "loss": 0.83097315, "num_input_tokens_seen": 39106335, "step": 1838, "time_per_iteration": 3.750500202178955 }, { "auxiliary_loss_clip": 0.01218689, "auxiliary_loss_mlp": 0.01037488, "balance_loss_clip": 1.065696, "balance_loss_mlp": 1.02802825, "epoch": 0.22112667588528828, "flos": 17967401832960.0, "grad_norm": 2.5282357540140232, "language_loss": 0.85351014, "learning_rate": 3.6290423579571075e-06, "loss": 0.87607193, "num_input_tokens_seen": 39122875, "step": 1839, "time_per_iteration": 4.407745599746704 }, { "auxiliary_loss_clip": 0.01201212, "auxiliary_loss_mlp": 0.01034112, "balance_loss_clip": 1.06338716, "balance_loss_mlp": 1.02421212, "epoch": 0.22124691877592736, "flos": 18369206346240.0, "grad_norm": 1.740999549317465, "language_loss": 0.80091316, "learning_rate": 3.6285903266089694e-06, "loss": 0.82326639, "num_input_tokens_seen": 39142150, "step": 1840, "time_per_iteration": 2.6872785091400146 }, { "auxiliary_loss_clip": 0.0118739, "auxiliary_loss_mlp": 0.01038179, "balance_loss_clip": 1.05883408, "balance_loss_mlp": 1.02672875, "epoch": 0.22136716166656648, "flos": 20813286441600.0, "grad_norm": 2.024390396079131, "language_loss": 0.77091789, "learning_rate": 3.628138048204974e-06, "loss": 0.79317361, "num_input_tokens_seen": 39162835, "step": 1841, "time_per_iteration": 2.673769950866699 }, { "auxiliary_loss_clip": 0.01152607, "auxiliary_loss_mlp": 0.01036568, "balance_loss_clip": 1.05890179, "balance_loss_mlp": 1.02497458, "epoch": 0.22148740455720556, "flos": 17675699483520.0, "grad_norm": 2.694904066026041, "language_loss": 0.76247817, "learning_rate": 3.6276855228137304e-06, "loss": 0.78436995, "num_input_tokens_seen": 39181040, "step": 1842, "time_per_iteration": 2.707284927368164 }, { "auxiliary_loss_clip": 0.01217645, "auxiliary_loss_mlp": 0.00765144, "balance_loss_clip": 1.064026, "balance_loss_mlp": 1.0009166, "epoch": 0.22160764744784464, "flos": 21726710323200.0, "grad_norm": 2.2314190607423607, "language_loss": 0.81759357, "learning_rate": 3.6272327505038874e-06, "loss": 0.83742148, "num_input_tokens_seen": 39197505, "step": 1843, "time_per_iteration": 2.5897231101989746 }, { "auxiliary_loss_clip": 0.01165169, "auxiliary_loss_mlp": 0.01035488, "balance_loss_clip": 1.05719507, "balance_loss_mlp": 1.026124, "epoch": 0.22172789033848372, "flos": 23764712186880.0, "grad_norm": 2.447148086686586, "language_loss": 0.78499901, "learning_rate": 3.626779731344131e-06, "loss": 0.80700558, "num_input_tokens_seen": 39217295, "step": 1844, "time_per_iteration": 2.7728829383850098 }, { "auxiliary_loss_clip": 0.01216139, "auxiliary_loss_mlp": 0.01036084, "balance_loss_clip": 1.0642761, "balance_loss_mlp": 1.02646387, "epoch": 0.22184813322912283, "flos": 16982300361600.0, "grad_norm": 2.587831653155976, "language_loss": 0.85215747, "learning_rate": 3.6263264654031814e-06, "loss": 0.87467968, "num_input_tokens_seen": 39234195, "step": 1845, "time_per_iteration": 2.5795907974243164 }, { "auxiliary_loss_clip": 0.0107862, "auxiliary_loss_mlp": 0.01003736, "balance_loss_clip": 1.02934444, "balance_loss_mlp": 1.00135159, "epoch": 0.22196837611976192, "flos": 61823740314240.0, "grad_norm": 0.7006750785663416, "language_loss": 0.59167969, "learning_rate": 3.6258729527498008e-06, "loss": 0.61250329, "num_input_tokens_seen": 39295040, "step": 1846, "time_per_iteration": 3.2299644947052 }, { "auxiliary_loss_clip": 0.01194544, "auxiliary_loss_mlp": 0.01027727, "balance_loss_clip": 1.06169391, "balance_loss_mlp": 1.01777351, "epoch": 0.222088619010401, "flos": 25558019625600.0, "grad_norm": 5.008858775445267, "language_loss": 0.65485823, "learning_rate": 3.6254191934527854e-06, "loss": 0.67708093, "num_input_tokens_seen": 39314395, "step": 1847, "time_per_iteration": 2.78926157951355 }, { "auxiliary_loss_clip": 0.01173902, "auxiliary_loss_mlp": 0.01035288, "balance_loss_clip": 1.06272066, "balance_loss_mlp": 1.02503622, "epoch": 0.2222088619010401, "flos": 19318612677120.0, "grad_norm": 2.7523368927283065, "language_loss": 0.65042222, "learning_rate": 3.6249651875809715e-06, "loss": 0.67251408, "num_input_tokens_seen": 39334275, "step": 1848, "time_per_iteration": 2.6862504482269287 }, { "auxiliary_loss_clip": 0.011845, "auxiliary_loss_mlp": 0.01034707, "balance_loss_clip": 1.06268108, "balance_loss_mlp": 1.02468181, "epoch": 0.2223291047916792, "flos": 19099342103040.0, "grad_norm": 2.2794279333336545, "language_loss": 0.88872385, "learning_rate": 3.62451093520323e-06, "loss": 0.91091597, "num_input_tokens_seen": 39352180, "step": 1849, "time_per_iteration": 2.774221181869507 }, { "auxiliary_loss_clip": 0.0115026, "auxiliary_loss_mlp": 0.01030364, "balance_loss_clip": 1.05094564, "balance_loss_mlp": 1.02051711, "epoch": 0.22244934768231828, "flos": 20850418126080.0, "grad_norm": 7.686921950621746, "language_loss": 0.90544045, "learning_rate": 3.6240564363884714e-06, "loss": 0.92724669, "num_input_tokens_seen": 39372125, "step": 1850, "time_per_iteration": 2.8026583194732666 }, { "auxiliary_loss_clip": 0.01203429, "auxiliary_loss_mlp": 0.01034959, "balance_loss_clip": 1.05842233, "balance_loss_mlp": 1.02496958, "epoch": 0.2225695905729574, "flos": 15632921111040.0, "grad_norm": 1.9323874957189782, "language_loss": 0.70435619, "learning_rate": 3.623601691205643e-06, "loss": 0.72674012, "num_input_tokens_seen": 39391200, "step": 1851, "time_per_iteration": 2.735692024230957 }, { "auxiliary_loss_clip": 0.01196673, "auxiliary_loss_mlp": 0.01034722, "balance_loss_clip": 1.05701089, "balance_loss_mlp": 1.02477419, "epoch": 0.22268983346359647, "flos": 25373582265600.0, "grad_norm": 2.0804402693740895, "language_loss": 0.81471896, "learning_rate": 3.623146699723729e-06, "loss": 0.83703291, "num_input_tokens_seen": 39410660, "step": 1852, "time_per_iteration": 2.65704607963562 }, { "auxiliary_loss_clip": 0.01186343, "auxiliary_loss_mlp": 0.010314, "balance_loss_clip": 1.06373024, "balance_loss_mlp": 1.02103436, "epoch": 0.22281007635423555, "flos": 13261452359040.0, "grad_norm": 3.489718648786214, "language_loss": 0.77485317, "learning_rate": 3.6226914620117507e-06, "loss": 0.79703057, "num_input_tokens_seen": 39429280, "step": 1853, "time_per_iteration": 2.7184698581695557 }, { "auxiliary_loss_clip": 0.01169804, "auxiliary_loss_mlp": 0.01031101, "balance_loss_clip": 1.05184221, "balance_loss_mlp": 1.02156997, "epoch": 0.22293031924487464, "flos": 15340536403200.0, "grad_norm": 2.147676607591994, "language_loss": 0.80770695, "learning_rate": 3.622235978138768e-06, "loss": 0.82971597, "num_input_tokens_seen": 39446905, "step": 1854, "time_per_iteration": 2.706463098526001 }, { "auxiliary_loss_clip": 0.01200723, "auxiliary_loss_mlp": 0.01031829, "balance_loss_clip": 1.06280386, "balance_loss_mlp": 1.02174425, "epoch": 0.22305056213551375, "flos": 22564649773440.0, "grad_norm": 2.334516205402604, "language_loss": 0.8117429, "learning_rate": 3.621780248173877e-06, "loss": 0.83406842, "num_input_tokens_seen": 39465105, "step": 1855, "time_per_iteration": 2.677907705307007 }, { "auxiliary_loss_clip": 0.01104823, "auxiliary_loss_mlp": 0.01003545, "balance_loss_clip": 1.02771127, "balance_loss_mlp": 1.00123239, "epoch": 0.22317080502615283, "flos": 64880419887360.0, "grad_norm": 0.9163670325223952, "language_loss": 0.61016548, "learning_rate": 3.6213242721862125e-06, "loss": 0.63124919, "num_input_tokens_seen": 39523560, "step": 1856, "time_per_iteration": 3.2198057174682617 }, { "auxiliary_loss_clip": 0.01171627, "auxiliary_loss_mlp": 0.01028821, "balance_loss_clip": 1.05338693, "balance_loss_mlp": 1.01907015, "epoch": 0.2232910479167919, "flos": 25775997310080.0, "grad_norm": 1.8419840607744538, "language_loss": 0.75310349, "learning_rate": 3.620868050244945e-06, "loss": 0.77510798, "num_input_tokens_seen": 39544040, "step": 1857, "time_per_iteration": 2.747532606124878 }, { "auxiliary_loss_clip": 0.01176845, "auxiliary_loss_mlp": 0.01035845, "balance_loss_clip": 1.0538528, "balance_loss_mlp": 1.02461529, "epoch": 0.22341129080743102, "flos": 23251799928960.0, "grad_norm": 2.4527914060757507, "language_loss": 0.77715278, "learning_rate": 3.6204115824192817e-06, "loss": 0.79927969, "num_input_tokens_seen": 39561515, "step": 1858, "time_per_iteration": 2.7165987491607666 }, { "auxiliary_loss_clip": 0.01172828, "auxiliary_loss_mlp": 0.01035266, "balance_loss_clip": 1.05156064, "balance_loss_mlp": 1.02524602, "epoch": 0.2235315336980701, "flos": 21214552250880.0, "grad_norm": 2.488365459165997, "language_loss": 0.76699555, "learning_rate": 3.619954868778471e-06, "loss": 0.78907645, "num_input_tokens_seen": 39578210, "step": 1859, "time_per_iteration": 2.6350719928741455 }, { "auxiliary_loss_clip": 0.0118452, "auxiliary_loss_mlp": 0.01029586, "balance_loss_clip": 1.056862, "balance_loss_mlp": 1.02050209, "epoch": 0.2236517765887092, "flos": 19901945548800.0, "grad_norm": 2.8355746386640215, "language_loss": 0.83080876, "learning_rate": 3.6194979093917944e-06, "loss": 0.8529498, "num_input_tokens_seen": 39597625, "step": 1860, "time_per_iteration": 2.7272799015045166 }, { "auxiliary_loss_clip": 0.01177612, "auxiliary_loss_mlp": 0.01040456, "balance_loss_clip": 1.05411828, "balance_loss_mlp": 1.03009701, "epoch": 0.22377201947934827, "flos": 23214847812480.0, "grad_norm": 2.389515494373004, "language_loss": 0.87036407, "learning_rate": 3.6190407043285724e-06, "loss": 0.89254475, "num_input_tokens_seen": 39615360, "step": 1861, "time_per_iteration": 3.5463955402374268 }, { "auxiliary_loss_clip": 0.01216468, "auxiliary_loss_mlp": 0.01035878, "balance_loss_clip": 1.06103301, "balance_loss_mlp": 1.02616835, "epoch": 0.22389226236998738, "flos": 26794244056320.0, "grad_norm": 2.2519289821310915, "language_loss": 0.75847542, "learning_rate": 3.618583253658163e-06, "loss": 0.78099883, "num_input_tokens_seen": 39635460, "step": 1862, "time_per_iteration": 2.7567789554595947 }, { "auxiliary_loss_clip": 0.01162558, "auxiliary_loss_mlp": 0.00765307, "balance_loss_clip": 1.06085968, "balance_loss_mlp": 1.00090528, "epoch": 0.22401250526062647, "flos": 24170359455360.0, "grad_norm": 2.460246262400244, "language_loss": 0.86613905, "learning_rate": 3.618125557449961e-06, "loss": 0.88541776, "num_input_tokens_seen": 39653515, "step": 1863, "time_per_iteration": 2.7878665924072266 }, { "auxiliary_loss_clip": 0.01193377, "auxiliary_loss_mlp": 0.01034204, "balance_loss_clip": 1.05683053, "balance_loss_mlp": 1.02443445, "epoch": 0.22413274815126555, "flos": 16759761649920.0, "grad_norm": 2.1245305673400137, "language_loss": 0.831541, "learning_rate": 3.6176676157733983e-06, "loss": 0.85381687, "num_input_tokens_seen": 39668525, "step": 1864, "time_per_iteration": 3.54984974861145 }, { "auxiliary_loss_clip": 0.01159462, "auxiliary_loss_mlp": 0.01033801, "balance_loss_clip": 1.05221152, "balance_loss_mlp": 1.02334666, "epoch": 0.22425299104190466, "flos": 21360205900800.0, "grad_norm": 2.315493971244824, "language_loss": 0.75841653, "learning_rate": 3.6172094286979443e-06, "loss": 0.78034914, "num_input_tokens_seen": 39685895, "step": 1865, "time_per_iteration": 3.613912343978882 }, { "auxiliary_loss_clip": 0.0118125, "auxiliary_loss_mlp": 0.01031841, "balance_loss_clip": 1.05418837, "balance_loss_mlp": 1.02232838, "epoch": 0.22437323393254374, "flos": 32165547108480.0, "grad_norm": 1.512771859955036, "language_loss": 0.81430918, "learning_rate": 3.6167509962931064e-06, "loss": 0.83644009, "num_input_tokens_seen": 39711595, "step": 1866, "time_per_iteration": 3.733426809310913 }, { "auxiliary_loss_clip": 0.01162787, "auxiliary_loss_mlp": 0.01041233, "balance_loss_clip": 1.05748868, "balance_loss_mlp": 1.03066528, "epoch": 0.22449347682318282, "flos": 18002809664640.0, "grad_norm": 2.4153824926256, "language_loss": 0.76907849, "learning_rate": 3.6162923186284276e-06, "loss": 0.79111874, "num_input_tokens_seen": 39727555, "step": 1867, "time_per_iteration": 2.7301242351531982 }, { "auxiliary_loss_clip": 0.01182094, "auxiliary_loss_mlp": 0.01038268, "balance_loss_clip": 1.05478024, "balance_loss_mlp": 1.02814758, "epoch": 0.2246137197138219, "flos": 18697286194560.0, "grad_norm": 3.3831917687446946, "language_loss": 0.86203659, "learning_rate": 3.6158333957734888e-06, "loss": 0.88424015, "num_input_tokens_seen": 39746145, "step": 1868, "time_per_iteration": 2.6613547801971436 }, { "auxiliary_loss_clip": 0.01174235, "auxiliary_loss_mlp": 0.0103871, "balance_loss_clip": 1.05351102, "balance_loss_mlp": 1.02916789, "epoch": 0.22473396260446102, "flos": 15590653781760.0, "grad_norm": 3.6077309753674895, "language_loss": 0.83134723, "learning_rate": 3.6153742277979088e-06, "loss": 0.85347664, "num_input_tokens_seen": 39763575, "step": 1869, "time_per_iteration": 2.6783289909362793 }, { "auxiliary_loss_clip": 0.01180976, "auxiliary_loss_mlp": 0.01036406, "balance_loss_clip": 1.05407047, "balance_loss_mlp": 1.02682745, "epoch": 0.2248542054951001, "flos": 14465501182080.0, "grad_norm": 2.5027507586476974, "language_loss": 0.77990782, "learning_rate": 3.6149148147713434e-06, "loss": 0.8020817, "num_input_tokens_seen": 39781810, "step": 1870, "time_per_iteration": 2.7274792194366455 }, { "auxiliary_loss_clip": 0.01207009, "auxiliary_loss_mlp": 0.01032616, "balance_loss_clip": 1.06420708, "balance_loss_mlp": 1.02324629, "epoch": 0.22497444838573918, "flos": 19243882431360.0, "grad_norm": 2.64242613976117, "language_loss": 0.86630714, "learning_rate": 3.614455156763484e-06, "loss": 0.88870347, "num_input_tokens_seen": 39800115, "step": 1871, "time_per_iteration": 2.6262431144714355 }, { "auxiliary_loss_clip": 0.0114948, "auxiliary_loss_mlp": 0.01040982, "balance_loss_clip": 1.04890788, "balance_loss_mlp": 1.03049207, "epoch": 0.2250946912763783, "flos": 16910299549440.0, "grad_norm": 3.4972664763473094, "language_loss": 0.71705008, "learning_rate": 3.613995253844061e-06, "loss": 0.73895472, "num_input_tokens_seen": 39817795, "step": 1872, "time_per_iteration": 2.781992197036743 }, { "auxiliary_loss_clip": 0.01195595, "auxiliary_loss_mlp": 0.01035194, "balance_loss_clip": 1.0580616, "balance_loss_mlp": 1.02476907, "epoch": 0.22521493416701738, "flos": 24681368292480.0, "grad_norm": 3.515640940144834, "language_loss": 0.80787235, "learning_rate": 3.6135351060828414e-06, "loss": 0.83018023, "num_input_tokens_seen": 39838270, "step": 1873, "time_per_iteration": 2.705841064453125 }, { "auxiliary_loss_clip": 0.01221324, "auxiliary_loss_mlp": 0.0104308, "balance_loss_clip": 1.06415665, "balance_loss_mlp": 1.03233957, "epoch": 0.22533517705765646, "flos": 17821963664640.0, "grad_norm": 3.1593450817507494, "language_loss": 0.69453454, "learning_rate": 3.6130747135496285e-06, "loss": 0.71717858, "num_input_tokens_seen": 39857270, "step": 1874, "time_per_iteration": 2.5820014476776123 }, { "auxiliary_loss_clip": 0.01210112, "auxiliary_loss_mlp": 0.01036649, "balance_loss_clip": 1.0584662, "balance_loss_mlp": 1.02617633, "epoch": 0.22545541994829554, "flos": 33691390899840.0, "grad_norm": 2.003505718888647, "language_loss": 0.65842372, "learning_rate": 3.6126140763142646e-06, "loss": 0.68089128, "num_input_tokens_seen": 39882300, "step": 1875, "time_per_iteration": 2.746408224105835 }, { "auxiliary_loss_clip": 0.0121467, "auxiliary_loss_mlp": 0.0103354, "balance_loss_clip": 1.06067753, "balance_loss_mlp": 1.02403331, "epoch": 0.22557566283893465, "flos": 19171594310400.0, "grad_norm": 2.5537282761514963, "language_loss": 0.85908711, "learning_rate": 3.6121531944466275e-06, "loss": 0.88156915, "num_input_tokens_seen": 39899625, "step": 1876, "time_per_iteration": 2.6696348190307617 }, { "auxiliary_loss_clip": 0.01197416, "auxiliary_loss_mlp": 0.01036412, "balance_loss_clip": 1.05891728, "balance_loss_mlp": 1.02608836, "epoch": 0.22569590572957374, "flos": 20773281669120.0, "grad_norm": 2.347460720481523, "language_loss": 0.78125638, "learning_rate": 3.611692068016633e-06, "loss": 0.80359465, "num_input_tokens_seen": 39915955, "step": 1877, "time_per_iteration": 2.658524513244629 }, { "auxiliary_loss_clip": 0.01163147, "auxiliary_loss_mlp": 0.01039386, "balance_loss_clip": 1.05112553, "balance_loss_mlp": 1.02875221, "epoch": 0.22581614862021282, "flos": 18442715529600.0, "grad_norm": 2.319259921439485, "language_loss": 0.75214529, "learning_rate": 3.611230697094233e-06, "loss": 0.77417064, "num_input_tokens_seen": 39932655, "step": 1878, "time_per_iteration": 2.679168701171875 }, { "auxiliary_loss_clip": 0.01186276, "auxiliary_loss_mlp": 0.01041113, "balance_loss_clip": 1.05641305, "balance_loss_mlp": 1.03212488, "epoch": 0.22593639151085193, "flos": 20048389297920.0, "grad_norm": 2.490542031087404, "language_loss": 0.87155211, "learning_rate": 3.6107690817494173e-06, "loss": 0.89382601, "num_input_tokens_seen": 39952875, "step": 1879, "time_per_iteration": 2.715020179748535 }, { "auxiliary_loss_clip": 0.01144344, "auxiliary_loss_mlp": 0.01037718, "balance_loss_clip": 1.04955077, "balance_loss_mlp": 1.02779937, "epoch": 0.226056634401491, "flos": 13115116350720.0, "grad_norm": 3.8495490775116017, "language_loss": 0.70705295, "learning_rate": 3.6103072220522117e-06, "loss": 0.72887361, "num_input_tokens_seen": 39968405, "step": 1880, "time_per_iteration": 2.6595053672790527 }, { "auxiliary_loss_clip": 0.01171926, "auxiliary_loss_mlp": 0.01033147, "balance_loss_clip": 1.05405843, "balance_loss_mlp": 1.0239203, "epoch": 0.2261768772921301, "flos": 18988378012800.0, "grad_norm": 1.8700366559525883, "language_loss": 0.91759664, "learning_rate": 3.609845118072682e-06, "loss": 0.93964738, "num_input_tokens_seen": 39987075, "step": 1881, "time_per_iteration": 2.7835686206817627 }, { "auxiliary_loss_clip": 0.01202775, "auxiliary_loss_mlp": 0.00765717, "balance_loss_clip": 1.05743432, "balance_loss_mlp": 1.00106931, "epoch": 0.2262971201827692, "flos": 19974054101760.0, "grad_norm": 1.9431831628162566, "language_loss": 0.80088961, "learning_rate": 3.6093827698809276e-06, "loss": 0.82057452, "num_input_tokens_seen": 40006175, "step": 1882, "time_per_iteration": 2.6244235038757324 }, { "auxiliary_loss_clip": 0.01190868, "auxiliary_loss_mlp": 0.01027632, "balance_loss_clip": 1.05210817, "balance_loss_mlp": 1.0184108, "epoch": 0.2264173630734083, "flos": 16654543735680.0, "grad_norm": 3.7540043296123797, "language_loss": 0.84287113, "learning_rate": 3.6089201775470864e-06, "loss": 0.86505616, "num_input_tokens_seen": 40021630, "step": 1883, "time_per_iteration": 2.6386256217956543 }, { "auxiliary_loss_clip": 0.01153287, "auxiliary_loss_mlp": 0.01033291, "balance_loss_clip": 1.04914474, "balance_loss_mlp": 1.02370048, "epoch": 0.22653760596404737, "flos": 24389809597440.0, "grad_norm": 1.6267726635396331, "language_loss": 0.77647734, "learning_rate": 3.6084573411413334e-06, "loss": 0.79834318, "num_input_tokens_seen": 40041025, "step": 1884, "time_per_iteration": 2.7672119140625 }, { "auxiliary_loss_clip": 0.0116633, "auxiliary_loss_mlp": 0.01042169, "balance_loss_clip": 1.05354762, "balance_loss_mlp": 1.03142858, "epoch": 0.22665784885468646, "flos": 18332541538560.0, "grad_norm": 2.252267114944765, "language_loss": 0.81393075, "learning_rate": 3.607994260733881e-06, "loss": 0.8360157, "num_input_tokens_seen": 40060265, "step": 1885, "time_per_iteration": 2.702138900756836 }, { "auxiliary_loss_clip": 0.01185967, "auxiliary_loss_mlp": 0.01030831, "balance_loss_clip": 1.05306232, "balance_loss_mlp": 1.02193809, "epoch": 0.22677809174532557, "flos": 24058102475520.0, "grad_norm": 2.7480258433000384, "language_loss": 0.74656773, "learning_rate": 3.6075309363949776e-06, "loss": 0.76873571, "num_input_tokens_seen": 40079435, "step": 1886, "time_per_iteration": 2.7162277698516846 }, { "auxiliary_loss_clip": 0.01210632, "auxiliary_loss_mlp": 0.01036099, "balance_loss_clip": 1.05871522, "balance_loss_mlp": 1.0262171, "epoch": 0.22689833463596465, "flos": 20374242503040.0, "grad_norm": 3.9211199745360585, "language_loss": 0.81431985, "learning_rate": 3.6070673681949094e-06, "loss": 0.83678722, "num_input_tokens_seen": 40097800, "step": 1887, "time_per_iteration": 3.5330729484558105 }, { "auxiliary_loss_clip": 0.01181156, "auxiliary_loss_mlp": 0.00764507, "balance_loss_clip": 1.05450344, "balance_loss_mlp": 1.00097179, "epoch": 0.22701857752660373, "flos": 30120398438400.0, "grad_norm": 1.8265983648189121, "language_loss": 0.81671679, "learning_rate": 3.606603556203999e-06, "loss": 0.83617342, "num_input_tokens_seen": 40122745, "step": 1888, "time_per_iteration": 2.828099250793457 }, { "auxiliary_loss_clip": 0.01196873, "auxiliary_loss_mlp": 0.01035888, "balance_loss_clip": 1.0531106, "balance_loss_mlp": 1.02622008, "epoch": 0.22713882041724284, "flos": 22492182084480.0, "grad_norm": 2.219981082596709, "language_loss": 0.83695358, "learning_rate": 3.6061395004926066e-06, "loss": 0.85928118, "num_input_tokens_seen": 40141680, "step": 1889, "time_per_iteration": 2.659557819366455 }, { "auxiliary_loss_clip": 0.01211834, "auxiliary_loss_mlp": 0.01034849, "balance_loss_clip": 1.05811119, "balance_loss_mlp": 1.02430511, "epoch": 0.22725906330788193, "flos": 20521548178560.0, "grad_norm": 7.594772763661466, "language_loss": 0.84833384, "learning_rate": 3.605675201131129e-06, "loss": 0.87080061, "num_input_tokens_seen": 40160140, "step": 1890, "time_per_iteration": 3.572787046432495 }, { "auxiliary_loss_clip": 0.0120277, "auxiliary_loss_mlp": 0.01032483, "balance_loss_clip": 1.0593431, "balance_loss_mlp": 1.02245748, "epoch": 0.227379306198521, "flos": 18989922297600.0, "grad_norm": 3.1918408291837994, "language_loss": 0.79583478, "learning_rate": 3.60521065819e-06, "loss": 0.81818724, "num_input_tokens_seen": 40177450, "step": 1891, "time_per_iteration": 3.5321075916290283 }, { "auxiliary_loss_clip": 0.01183301, "auxiliary_loss_mlp": 0.01035765, "balance_loss_clip": 1.05224764, "balance_loss_mlp": 1.02665722, "epoch": 0.2274995490891601, "flos": 21798351999360.0, "grad_norm": 2.0380865161189696, "language_loss": 0.88032722, "learning_rate": 3.60474587173969e-06, "loss": 0.90251791, "num_input_tokens_seen": 40195935, "step": 1892, "time_per_iteration": 3.6196961402893066 }, { "auxiliary_loss_clip": 0.01195752, "auxiliary_loss_mlp": 0.01034534, "balance_loss_clip": 1.06022549, "balance_loss_mlp": 1.02542651, "epoch": 0.2276197919797992, "flos": 19058654972160.0, "grad_norm": 2.3209669309471512, "language_loss": 0.84464121, "learning_rate": 3.6042808418507084e-06, "loss": 0.86694407, "num_input_tokens_seen": 40213620, "step": 1893, "time_per_iteration": 2.640303134918213 }, { "auxiliary_loss_clip": 0.01199554, "auxiliary_loss_mlp": 0.01040366, "balance_loss_clip": 1.05936956, "balance_loss_mlp": 1.03030443, "epoch": 0.22774003487043828, "flos": 18806777827200.0, "grad_norm": 2.3848659221837933, "language_loss": 0.7719717, "learning_rate": 3.6038155685935976e-06, "loss": 0.79437089, "num_input_tokens_seen": 40230190, "step": 1894, "time_per_iteration": 2.658017873764038 }, { "auxiliary_loss_clip": 0.01196357, "auxiliary_loss_mlp": 0.01034151, "balance_loss_clip": 1.05805969, "balance_loss_mlp": 1.02463245, "epoch": 0.22786027776107737, "flos": 23002544476800.0, "grad_norm": 7.368234507231393, "language_loss": 0.70834446, "learning_rate": 3.6033500520389404e-06, "loss": 0.73064947, "num_input_tokens_seen": 40246860, "step": 1895, "time_per_iteration": 2.7004048824310303 }, { "auxiliary_loss_clip": 0.01075728, "auxiliary_loss_mlp": 0.01004341, "balance_loss_clip": 1.03040862, "balance_loss_mlp": 1.00199306, "epoch": 0.22798052065171648, "flos": 66706872600960.0, "grad_norm": 0.8115595839022945, "language_loss": 0.64770842, "learning_rate": 3.6028842922573553e-06, "loss": 0.66850913, "num_input_tokens_seen": 40311005, "step": 1896, "time_per_iteration": 3.3901093006134033 }, { "auxiliary_loss_clip": 0.01088772, "auxiliary_loss_mlp": 0.00755887, "balance_loss_clip": 1.03008151, "balance_loss_mlp": 1.00178456, "epoch": 0.22810076354235556, "flos": 62080896758400.0, "grad_norm": 0.8587956336581836, "language_loss": 0.62911236, "learning_rate": 3.602418289319497e-06, "loss": 0.64755893, "num_input_tokens_seen": 40369560, "step": 1897, "time_per_iteration": 3.2240233421325684 }, { "auxiliary_loss_clip": 0.01150895, "auxiliary_loss_mlp": 0.01036985, "balance_loss_clip": 1.05127764, "balance_loss_mlp": 1.02713811, "epoch": 0.22822100643299464, "flos": 23876358635520.0, "grad_norm": 2.6377734043383363, "language_loss": 0.73038548, "learning_rate": 3.601952043296059e-06, "loss": 0.75226426, "num_input_tokens_seen": 40389555, "step": 1898, "time_per_iteration": 2.7809786796569824 }, { "auxiliary_loss_clip": 0.01184732, "auxiliary_loss_mlp": 0.01036367, "balance_loss_clip": 1.05259967, "balance_loss_mlp": 1.0273428, "epoch": 0.22834124932363373, "flos": 20991331180800.0, "grad_norm": 2.10855413442385, "language_loss": 0.80807167, "learning_rate": 3.6014855542577696e-06, "loss": 0.83028263, "num_input_tokens_seen": 40406765, "step": 1899, "time_per_iteration": 2.675363779067993 }, { "auxiliary_loss_clip": 0.01179808, "auxiliary_loss_mlp": 0.01035399, "balance_loss_clip": 1.05440485, "balance_loss_mlp": 1.02573121, "epoch": 0.22846149221427284, "flos": 24901572620160.0, "grad_norm": 1.9905290881186342, "language_loss": 0.84516394, "learning_rate": 3.6010188222753943e-06, "loss": 0.86731595, "num_input_tokens_seen": 40427535, "step": 1900, "time_per_iteration": 2.724919557571411 }, { "auxiliary_loss_clip": 0.01092268, "auxiliary_loss_mlp": 0.01003625, "balance_loss_clip": 1.02971458, "balance_loss_mlp": 1.00152671, "epoch": 0.22858173510491192, "flos": 56132294319360.0, "grad_norm": 0.8971585760056745, "language_loss": 0.64069796, "learning_rate": 3.6005518474197372e-06, "loss": 0.66165692, "num_input_tokens_seen": 40479580, "step": 1901, "time_per_iteration": 3.145097494125366 }, { "auxiliary_loss_clip": 0.01198708, "auxiliary_loss_mlp": 0.01037253, "balance_loss_clip": 1.06089973, "balance_loss_mlp": 1.02719784, "epoch": 0.228701977995551, "flos": 24170826332160.0, "grad_norm": 2.376937990197744, "language_loss": 0.77814448, "learning_rate": 3.6000846297616373e-06, "loss": 0.80050421, "num_input_tokens_seen": 40497880, "step": 1902, "time_per_iteration": 2.685399293899536 }, { "auxiliary_loss_clip": 0.01220041, "auxiliary_loss_mlp": 0.01034085, "balance_loss_clip": 1.06521249, "balance_loss_mlp": 1.0235765, "epoch": 0.22882222088619011, "flos": 21387892308480.0, "grad_norm": 2.9158543007723132, "language_loss": 0.72659683, "learning_rate": 3.5996171693719717e-06, "loss": 0.74913812, "num_input_tokens_seen": 40513975, "step": 1903, "time_per_iteration": 2.606912851333618 }, { "auxiliary_loss_clip": 0.01107056, "auxiliary_loss_mlp": 0.01002813, "balance_loss_clip": 1.02971005, "balance_loss_mlp": 1.00072694, "epoch": 0.2289424637768292, "flos": 64589615377920.0, "grad_norm": 0.8402830213619247, "language_loss": 0.64757788, "learning_rate": 3.5991494663216528e-06, "loss": 0.6686765, "num_input_tokens_seen": 40576960, "step": 1904, "time_per_iteration": 3.2863552570343018 }, { "auxiliary_loss_clip": 0.01213358, "auxiliary_loss_mlp": 0.01026189, "balance_loss_clip": 1.06058931, "balance_loss_mlp": 1.0162828, "epoch": 0.22906270666746828, "flos": 22163419877760.0, "grad_norm": 1.9723511631818114, "language_loss": 0.87554455, "learning_rate": 3.5986815206816314e-06, "loss": 0.89793998, "num_input_tokens_seen": 40595780, "step": 1905, "time_per_iteration": 2.5956368446350098 }, { "auxiliary_loss_clip": 0.01210286, "auxiliary_loss_mlp": 0.01031377, "balance_loss_clip": 1.05881751, "balance_loss_mlp": 1.02174437, "epoch": 0.2291829495581074, "flos": 25772334122880.0, "grad_norm": 2.5509605689945407, "language_loss": 0.7457096, "learning_rate": 3.598213332522895e-06, "loss": 0.76812625, "num_input_tokens_seen": 40615810, "step": 1906, "time_per_iteration": 2.687082529067993 }, { "auxiliary_loss_clip": 0.01197247, "auxiliary_loss_mlp": 0.0103449, "balance_loss_clip": 1.05723798, "balance_loss_mlp": 1.02444696, "epoch": 0.22930319244874647, "flos": 31172760126720.0, "grad_norm": 1.8577766388843853, "language_loss": 0.77467829, "learning_rate": 3.597744901916466e-06, "loss": 0.79699564, "num_input_tokens_seen": 40637095, "step": 1907, "time_per_iteration": 2.685657024383545 }, { "auxiliary_loss_clip": 0.01217108, "auxiliary_loss_mlp": 0.01036086, "balance_loss_clip": 1.06028461, "balance_loss_mlp": 1.02623916, "epoch": 0.22942343533938556, "flos": 23254098399360.0, "grad_norm": 2.185938560597546, "language_loss": 0.76440191, "learning_rate": 3.5972762289334058e-06, "loss": 0.7869339, "num_input_tokens_seen": 40656725, "step": 1908, "time_per_iteration": 2.62160587310791 }, { "auxiliary_loss_clip": 0.01135027, "auxiliary_loss_mlp": 0.01040068, "balance_loss_clip": 1.05355132, "balance_loss_mlp": 1.02912402, "epoch": 0.22954367823002464, "flos": 14610903436800.0, "grad_norm": 10.292343208175309, "language_loss": 0.85200208, "learning_rate": 3.5968073136448116e-06, "loss": 0.87375307, "num_input_tokens_seen": 40674745, "step": 1909, "time_per_iteration": 2.665807008743286 }, { "auxiliary_loss_clip": 0.0120058, "auxiliary_loss_mlp": 0.01028513, "balance_loss_clip": 1.05749691, "balance_loss_mlp": 1.01921439, "epoch": 0.22966392112066375, "flos": 16763604405120.0, "grad_norm": 5.567898357289049, "language_loss": 0.91310298, "learning_rate": 3.596338156121818e-06, "loss": 0.93539393, "num_input_tokens_seen": 40693630, "step": 1910, "time_per_iteration": 2.5899336338043213 }, { "auxiliary_loss_clip": 0.01088361, "auxiliary_loss_mlp": 0.01008881, "balance_loss_clip": 1.0258944, "balance_loss_mlp": 1.00684214, "epoch": 0.22978416401130283, "flos": 67474247783040.0, "grad_norm": 0.827635615357902, "language_loss": 0.59336019, "learning_rate": 3.595868756435595e-06, "loss": 0.61433262, "num_input_tokens_seen": 40761310, "step": 1911, "time_per_iteration": 3.2890682220458984 }, { "auxiliary_loss_clip": 0.01174272, "auxiliary_loss_mlp": 0.01036892, "balance_loss_clip": 1.05849361, "balance_loss_mlp": 1.02583528, "epoch": 0.22990440690194192, "flos": 19865137086720.0, "grad_norm": 2.3446043564954606, "language_loss": 0.80218375, "learning_rate": 3.5953991146573504e-06, "loss": 0.8242954, "num_input_tokens_seen": 40779955, "step": 1912, "time_per_iteration": 2.6832809448242188 }, { "auxiliary_loss_clip": 0.01200041, "auxiliary_loss_mlp": 0.0103906, "balance_loss_clip": 1.0551542, "balance_loss_mlp": 1.02824759, "epoch": 0.23002464979258103, "flos": 13289246507520.0, "grad_norm": 3.810678026521688, "language_loss": 0.83133256, "learning_rate": 3.5949292308583294e-06, "loss": 0.85372359, "num_input_tokens_seen": 40793200, "step": 1913, "time_per_iteration": 3.449061632156372 }, { "auxiliary_loss_clip": 0.01213864, "auxiliary_loss_mlp": 0.01040536, "balance_loss_clip": 1.06106162, "balance_loss_mlp": 1.02989066, "epoch": 0.2301448926832201, "flos": 22163779013760.0, "grad_norm": 2.312903276174975, "language_loss": 0.80481696, "learning_rate": 3.594459105109811e-06, "loss": 0.82736099, "num_input_tokens_seen": 40812380, "step": 1914, "time_per_iteration": 2.5734827518463135 }, { "auxiliary_loss_clip": 0.01202703, "auxiliary_loss_mlp": 0.01038215, "balance_loss_clip": 1.05928516, "balance_loss_mlp": 1.02820802, "epoch": 0.2302651355738592, "flos": 20704477167360.0, "grad_norm": 2.0535223142972967, "language_loss": 0.81305301, "learning_rate": 3.593988737483115e-06, "loss": 0.83546221, "num_input_tokens_seen": 40832320, "step": 1915, "time_per_iteration": 2.6523566246032715 }, { "auxiliary_loss_clip": 0.01184269, "auxiliary_loss_mlp": 0.01035808, "balance_loss_clip": 1.05791116, "balance_loss_mlp": 1.02538323, "epoch": 0.23038537846449827, "flos": 18588943797120.0, "grad_norm": 2.612209647421837, "language_loss": 0.78474045, "learning_rate": 3.5935181280495947e-06, "loss": 0.80694115, "num_input_tokens_seen": 40850900, "step": 1916, "time_per_iteration": 3.6101200580596924 }, { "auxiliary_loss_clip": 0.01081584, "auxiliary_loss_mlp": 0.01003799, "balance_loss_clip": 1.0212599, "balance_loss_mlp": 1.00158203, "epoch": 0.23050562135513739, "flos": 64224260190720.0, "grad_norm": 0.8002380842083977, "language_loss": 0.54299456, "learning_rate": 3.5930472768806412e-06, "loss": 0.56384844, "num_input_tokens_seen": 40909570, "step": 1917, "time_per_iteration": 4.175180196762085 }, { "auxiliary_loss_clip": 0.01213102, "auxiliary_loss_mlp": 0.01032487, "balance_loss_clip": 1.06085348, "balance_loss_mlp": 1.02248502, "epoch": 0.23062586424577647, "flos": 17313396952320.0, "grad_norm": 2.9830732562478968, "language_loss": 0.77248573, "learning_rate": 3.5925761840476826e-06, "loss": 0.7949416, "num_input_tokens_seen": 40928180, "step": 1918, "time_per_iteration": 3.5137860774993896 }, { "auxiliary_loss_clip": 0.01180747, "auxiliary_loss_mlp": 0.01032016, "balance_loss_clip": 1.05970621, "balance_loss_mlp": 1.02236021, "epoch": 0.23074610713641555, "flos": 27855979194240.0, "grad_norm": 2.3059921643812813, "language_loss": 0.8144871, "learning_rate": 3.592104849622183e-06, "loss": 0.83661479, "num_input_tokens_seen": 40950435, "step": 1919, "time_per_iteration": 2.6971824169158936 }, { "auxiliary_loss_clip": 0.01145746, "auxiliary_loss_mlp": 0.01039485, "balance_loss_clip": 1.05359972, "balance_loss_mlp": 1.02997208, "epoch": 0.23086635002705466, "flos": 28841798937600.0, "grad_norm": 1.648637048108304, "language_loss": 0.72957635, "learning_rate": 3.591633273675644e-06, "loss": 0.7514286, "num_input_tokens_seen": 40972670, "step": 1920, "time_per_iteration": 2.8208978176116943 }, { "auxiliary_loss_clip": 0.0108051, "auxiliary_loss_mlp": 0.01009847, "balance_loss_clip": 1.03736222, "balance_loss_mlp": 1.00721216, "epoch": 0.23098659291769374, "flos": 62923681566720.0, "grad_norm": 0.9089346813454777, "language_loss": 0.58154774, "learning_rate": 3.591161456279602e-06, "loss": 0.60245132, "num_input_tokens_seen": 41018215, "step": 1921, "time_per_iteration": 3.171978235244751 }, { "auxiliary_loss_clip": 0.01191631, "auxiliary_loss_mlp": 0.01036939, "balance_loss_clip": 1.05791402, "balance_loss_mlp": 1.02693141, "epoch": 0.23110683580833283, "flos": 23476816679040.0, "grad_norm": 4.172921017366292, "language_loss": 0.80441523, "learning_rate": 3.590689397505633e-06, "loss": 0.82670093, "num_input_tokens_seen": 41039125, "step": 1922, "time_per_iteration": 2.7086033821105957 }, { "auxiliary_loss_clip": 0.01209856, "auxiliary_loss_mlp": 0.0103794, "balance_loss_clip": 1.05955362, "balance_loss_mlp": 1.02856374, "epoch": 0.2312270786989719, "flos": 27271066124160.0, "grad_norm": 1.9984050822606354, "language_loss": 0.86853862, "learning_rate": 3.590217097425347e-06, "loss": 0.89101654, "num_input_tokens_seen": 41059025, "step": 1923, "time_per_iteration": 2.6914286613464355 }, { "auxiliary_loss_clip": 0.01217749, "auxiliary_loss_mlp": 0.01038885, "balance_loss_clip": 1.06441355, "balance_loss_mlp": 1.02849615, "epoch": 0.23134732158961102, "flos": 13261344618240.0, "grad_norm": 3.180738552801169, "language_loss": 0.70619231, "learning_rate": 3.589744556110391e-06, "loss": 0.72875869, "num_input_tokens_seen": 41077015, "step": 1924, "time_per_iteration": 2.579301595687866 }, { "auxiliary_loss_clip": 0.01177917, "auxiliary_loss_mlp": 0.01036419, "balance_loss_clip": 1.05336607, "balance_loss_mlp": 1.02614331, "epoch": 0.2314675644802501, "flos": 36977648250240.0, "grad_norm": 2.0494474299839585, "language_loss": 0.84518307, "learning_rate": 3.58927177363245e-06, "loss": 0.8673265, "num_input_tokens_seen": 41099840, "step": 1925, "time_per_iteration": 2.826348304748535 }, { "auxiliary_loss_clip": 0.01161051, "auxiliary_loss_mlp": 0.01034788, "balance_loss_clip": 1.05109394, "balance_loss_mlp": 1.02326059, "epoch": 0.2315878073708892, "flos": 23842207779840.0, "grad_norm": 4.2063729783792825, "language_loss": 0.72526681, "learning_rate": 3.5887987500632447e-06, "loss": 0.74722517, "num_input_tokens_seen": 41117845, "step": 1926, "time_per_iteration": 2.6821634769439697 }, { "auxiliary_loss_clip": 0.01174421, "auxiliary_loss_mlp": 0.01028904, "balance_loss_clip": 1.05630469, "balance_loss_mlp": 1.01932585, "epoch": 0.2317080502615283, "flos": 23039424766080.0, "grad_norm": 2.9251378631771043, "language_loss": 0.8425622, "learning_rate": 3.5883254854745325e-06, "loss": 0.86459547, "num_input_tokens_seen": 41136235, "step": 1927, "time_per_iteration": 2.7368109226226807 }, { "auxiliary_loss_clip": 0.01203568, "auxiliary_loss_mlp": 0.01032833, "balance_loss_clip": 1.05825496, "balance_loss_mlp": 1.02227163, "epoch": 0.23182829315216738, "flos": 11254656435840.0, "grad_norm": 2.8369511866931068, "language_loss": 0.75266194, "learning_rate": 3.587851979938107e-06, "loss": 0.77502596, "num_input_tokens_seen": 41153125, "step": 1928, "time_per_iteration": 2.6279940605163574 }, { "auxiliary_loss_clip": 0.01199699, "auxiliary_loss_mlp": 0.01044479, "balance_loss_clip": 1.05915332, "balance_loss_mlp": 1.0340662, "epoch": 0.23194853604280646, "flos": 19828939155840.0, "grad_norm": 2.5173860282966154, "language_loss": 0.77202064, "learning_rate": 3.5873782335257985e-06, "loss": 0.79446244, "num_input_tokens_seen": 41171290, "step": 1929, "time_per_iteration": 2.651021957397461 }, { "auxiliary_loss_clip": 0.01169085, "auxiliary_loss_mlp": 0.01043019, "balance_loss_clip": 1.057814, "balance_loss_mlp": 1.03212285, "epoch": 0.23206877893344555, "flos": 15305020830720.0, "grad_norm": 2.2277469218688153, "language_loss": 0.78823781, "learning_rate": 3.5869042463094744e-06, "loss": 0.81035888, "num_input_tokens_seen": 41189005, "step": 1930, "time_per_iteration": 2.6858103275299072 }, { "auxiliary_loss_clip": 0.01136646, "auxiliary_loss_mlp": 0.01034124, "balance_loss_clip": 1.04988813, "balance_loss_mlp": 1.02356768, "epoch": 0.23218902182408466, "flos": 22711488572160.0, "grad_norm": 1.869988668749133, "language_loss": 0.7731728, "learning_rate": 3.586430018361038e-06, "loss": 0.79488045, "num_input_tokens_seen": 41208775, "step": 1931, "time_per_iteration": 2.7825987339019775 }, { "auxiliary_loss_clip": 0.01169141, "auxiliary_loss_mlp": 0.01040569, "balance_loss_clip": 1.05244637, "balance_loss_mlp": 1.03009629, "epoch": 0.23230926471472374, "flos": 22710734386560.0, "grad_norm": 1.9608534513653555, "language_loss": 0.75923729, "learning_rate": 3.5859555497524283e-06, "loss": 0.7813344, "num_input_tokens_seen": 41226010, "step": 1932, "time_per_iteration": 2.673743963241577 }, { "auxiliary_loss_clip": 0.01199449, "auxiliary_loss_mlp": 0.0103734, "balance_loss_clip": 1.06053948, "balance_loss_mlp": 1.02780342, "epoch": 0.23242950760536282, "flos": 20375499479040.0, "grad_norm": 2.302545750135597, "language_loss": 0.9195286, "learning_rate": 3.5854808405556237e-06, "loss": 0.9418965, "num_input_tokens_seen": 41245245, "step": 1933, "time_per_iteration": 2.720630645751953 }, { "auxiliary_loss_clip": 0.01169323, "auxiliary_loss_mlp": 0.01033053, "balance_loss_clip": 1.05411923, "balance_loss_mlp": 1.02361107, "epoch": 0.23254975049600193, "flos": 16908324301440.0, "grad_norm": 2.6518617984897443, "language_loss": 0.7640788, "learning_rate": 3.5850058908426355e-06, "loss": 0.78610253, "num_input_tokens_seen": 41263795, "step": 1934, "time_per_iteration": 2.7200772762298584 }, { "auxiliary_loss_clip": 0.01183608, "auxiliary_loss_mlp": 0.01036463, "balance_loss_clip": 1.053267, "balance_loss_mlp": 1.02701616, "epoch": 0.23266999338664102, "flos": 23294821443840.0, "grad_norm": 2.010392403267564, "language_loss": 0.85563564, "learning_rate": 3.584530700685514e-06, "loss": 0.87783635, "num_input_tokens_seen": 41284055, "step": 1935, "time_per_iteration": 2.721214771270752 }, { "auxiliary_loss_clip": 0.01181151, "auxiliary_loss_mlp": 0.0103207, "balance_loss_clip": 1.0590781, "balance_loss_mlp": 1.022259, "epoch": 0.2327902362772801, "flos": 19569987031680.0, "grad_norm": 2.215274319574889, "language_loss": 0.8867116, "learning_rate": 3.5840552701563448e-06, "loss": 0.90884387, "num_input_tokens_seen": 41300255, "step": 1936, "time_per_iteration": 2.6790263652801514 }, { "auxiliary_loss_clip": 0.01210016, "auxiliary_loss_mlp": 0.01038871, "balance_loss_clip": 1.05832052, "balance_loss_mlp": 1.02956665, "epoch": 0.2329104791679192, "flos": 16727514215040.0, "grad_norm": 2.561820210003978, "language_loss": 0.82029152, "learning_rate": 3.5835795993272513e-06, "loss": 0.84278047, "num_input_tokens_seen": 41318540, "step": 1937, "time_per_iteration": 2.5738909244537354 }, { "auxiliary_loss_clip": 0.01105222, "auxiliary_loss_mlp": 0.01036501, "balance_loss_clip": 1.04756975, "balance_loss_mlp": 1.02619481, "epoch": 0.2330307220585583, "flos": 22163743100160.0, "grad_norm": 1.9887716808686196, "language_loss": 0.71402603, "learning_rate": 3.583103688270391e-06, "loss": 0.73544323, "num_input_tokens_seen": 41338320, "step": 1938, "time_per_iteration": 3.004837989807129 }, { "auxiliary_loss_clip": 0.01168089, "auxiliary_loss_mlp": 0.01032118, "balance_loss_clip": 1.05370998, "balance_loss_mlp": 1.0220325, "epoch": 0.23315096494919738, "flos": 19317319787520.0, "grad_norm": 3.1969866990678324, "language_loss": 0.89302576, "learning_rate": 3.58262753705796e-06, "loss": 0.9150278, "num_input_tokens_seen": 41353210, "step": 1939, "time_per_iteration": 4.1691484451293945 }, { "auxiliary_loss_clip": 0.01083228, "auxiliary_loss_mlp": 0.01008097, "balance_loss_clip": 1.02596927, "balance_loss_mlp": 1.00587928, "epoch": 0.23327120783983646, "flos": 53031048946560.0, "grad_norm": 0.7869695707959454, "language_loss": 0.55556172, "learning_rate": 3.5821511457621902e-06, "loss": 0.57647502, "num_input_tokens_seen": 41410510, "step": 1940, "time_per_iteration": 3.253009557723999 }, { "auxiliary_loss_clip": 0.01176109, "auxiliary_loss_mlp": 0.01047007, "balance_loss_clip": 1.0559504, "balance_loss_mlp": 1.03583133, "epoch": 0.23339145073047557, "flos": 17126984344320.0, "grad_norm": 16.358084656869632, "language_loss": 0.81418169, "learning_rate": 3.5816745144553497e-06, "loss": 0.83641285, "num_input_tokens_seen": 41425830, "step": 1941, "time_per_iteration": 2.668553352355957 }, { "auxiliary_loss_clip": 0.01147844, "auxiliary_loss_mlp": 0.01030666, "balance_loss_clip": 1.05245125, "balance_loss_mlp": 1.02021778, "epoch": 0.23351169362111465, "flos": 13078918419840.0, "grad_norm": 1.8874232734363083, "language_loss": 0.75640577, "learning_rate": 3.5811976432097424e-06, "loss": 0.77819085, "num_input_tokens_seen": 41443500, "step": 1942, "time_per_iteration": 3.7219457626342773 }, { "auxiliary_loss_clip": 0.01197136, "auxiliary_loss_mlp": 0.00765487, "balance_loss_clip": 1.0590204, "balance_loss_mlp": 1.0008384, "epoch": 0.23363193651175373, "flos": 15851257931520.0, "grad_norm": 2.652551843517523, "language_loss": 0.84654796, "learning_rate": 3.58072053209771e-06, "loss": 0.86617416, "num_input_tokens_seen": 41460055, "step": 1943, "time_per_iteration": 3.5429184436798096 }, { "auxiliary_loss_clip": 0.01175832, "auxiliary_loss_mlp": 0.01041339, "balance_loss_clip": 1.05335808, "balance_loss_mlp": 1.03051448, "epoch": 0.23375217940239285, "flos": 21025769345280.0, "grad_norm": 2.0992521587035395, "language_loss": 0.7889539, "learning_rate": 3.5802431811916296e-06, "loss": 0.81112564, "num_input_tokens_seen": 41476665, "step": 1944, "time_per_iteration": 3.6838607788085938 }, { "auxiliary_loss_clip": 0.01177648, "auxiliary_loss_mlp": 0.0103747, "balance_loss_clip": 1.05498183, "balance_loss_mlp": 1.02807057, "epoch": 0.23387242229303193, "flos": 20594698225920.0, "grad_norm": 2.935185755685507, "language_loss": 0.80402327, "learning_rate": 3.579765590563916e-06, "loss": 0.82617438, "num_input_tokens_seen": 41496065, "step": 1945, "time_per_iteration": 2.706803321838379 }, { "auxiliary_loss_clip": 0.0118592, "auxiliary_loss_mlp": 0.01038104, "balance_loss_clip": 1.0556953, "balance_loss_mlp": 1.02832901, "epoch": 0.233992665183671, "flos": 24279491952000.0, "grad_norm": 3.686965369359839, "language_loss": 0.82125109, "learning_rate": 3.579287760287017e-06, "loss": 0.84349132, "num_input_tokens_seen": 41516815, "step": 1946, "time_per_iteration": 2.7241063117980957 }, { "auxiliary_loss_clip": 0.01193413, "auxiliary_loss_mlp": 0.01033345, "balance_loss_clip": 1.0570004, "balance_loss_mlp": 1.023844, "epoch": 0.2341129080743101, "flos": 30154621121280.0, "grad_norm": 2.3239861432827067, "language_loss": 0.72890013, "learning_rate": 3.578809690433421e-06, "loss": 0.75116777, "num_input_tokens_seen": 41538525, "step": 1947, "time_per_iteration": 2.6950912475585938 }, { "auxiliary_loss_clip": 0.01218651, "auxiliary_loss_mlp": 0.01042904, "balance_loss_clip": 1.06217957, "balance_loss_mlp": 1.03251505, "epoch": 0.2342331509649492, "flos": 22784135829120.0, "grad_norm": 2.3303986744999197, "language_loss": 0.81401575, "learning_rate": 3.578331381075651e-06, "loss": 0.8366313, "num_input_tokens_seen": 41559025, "step": 1948, "time_per_iteration": 2.5736050605773926 }, { "auxiliary_loss_clip": 0.01198886, "auxiliary_loss_mlp": 0.01043065, "balance_loss_clip": 1.05799699, "balance_loss_mlp": 1.03338504, "epoch": 0.2343533938555883, "flos": 23623152687360.0, "grad_norm": 3.179421382306876, "language_loss": 0.69912469, "learning_rate": 3.5778528322862646e-06, "loss": 0.72154427, "num_input_tokens_seen": 41577845, "step": 1949, "time_per_iteration": 2.662689447402954 }, { "auxiliary_loss_clip": 0.01200461, "auxiliary_loss_mlp": 0.01032384, "balance_loss_clip": 1.05766606, "balance_loss_mlp": 1.02241182, "epoch": 0.23447363674622737, "flos": 24570332375040.0, "grad_norm": 1.6018091473868232, "language_loss": 0.86570346, "learning_rate": 3.5773740441378585e-06, "loss": 0.88803196, "num_input_tokens_seen": 41598600, "step": 1950, "time_per_iteration": 2.632403612136841 }, { "auxiliary_loss_clip": 0.01197038, "auxiliary_loss_mlp": 0.01030992, "balance_loss_clip": 1.05846024, "balance_loss_mlp": 1.02168155, "epoch": 0.23459387963686648, "flos": 53140322119680.0, "grad_norm": 1.9233195385520032, "language_loss": 0.73819208, "learning_rate": 3.5768950167030633e-06, "loss": 0.76047242, "num_input_tokens_seen": 41623300, "step": 1951, "time_per_iteration": 2.9693715572357178 }, { "auxiliary_loss_clip": 0.01173176, "auxiliary_loss_mlp": 0.01040782, "balance_loss_clip": 1.05385244, "balance_loss_mlp": 1.03060162, "epoch": 0.23471412252750556, "flos": 23951412103680.0, "grad_norm": 1.8057207786555303, "language_loss": 0.78311634, "learning_rate": 3.576415750054548e-06, "loss": 0.80525595, "num_input_tokens_seen": 41643420, "step": 1952, "time_per_iteration": 2.7148361206054688 }, { "auxiliary_loss_clip": 0.0117159, "auxiliary_loss_mlp": 0.01035473, "balance_loss_clip": 1.05407071, "balance_loss_mlp": 1.02624643, "epoch": 0.23483436541814465, "flos": 15706573948800.0, "grad_norm": 1.9006016815197615, "language_loss": 0.85836512, "learning_rate": 3.5759362442650172e-06, "loss": 0.88043571, "num_input_tokens_seen": 41660170, "step": 1953, "time_per_iteration": 2.6864397525787354 }, { "auxiliary_loss_clip": 0.01201631, "auxiliary_loss_mlp": 0.01035244, "balance_loss_clip": 1.06293678, "balance_loss_mlp": 1.02468765, "epoch": 0.23495460830878373, "flos": 24936262179840.0, "grad_norm": 2.451700401787181, "language_loss": 0.85213423, "learning_rate": 3.5754564994072113e-06, "loss": 0.87450302, "num_input_tokens_seen": 41679010, "step": 1954, "time_per_iteration": 2.631483554840088 }, { "auxiliary_loss_clip": 0.01179444, "auxiliary_loss_mlp": 0.01040768, "balance_loss_clip": 1.05448854, "balance_loss_mlp": 1.03027129, "epoch": 0.23507485119942284, "flos": 30482665056000.0, "grad_norm": 2.281646406729147, "language_loss": 0.60124522, "learning_rate": 3.5749765155539067e-06, "loss": 0.62344736, "num_input_tokens_seen": 41699495, "step": 1955, "time_per_iteration": 2.7441000938415527 }, { "auxiliary_loss_clip": 0.0116821, "auxiliary_loss_mlp": 0.01037683, "balance_loss_clip": 1.05503154, "balance_loss_mlp": 1.02620959, "epoch": 0.23519509409006192, "flos": 18329129746560.0, "grad_norm": 2.2563439850321845, "language_loss": 0.92266214, "learning_rate": 3.574496292777917e-06, "loss": 0.9447211, "num_input_tokens_seen": 41717705, "step": 1956, "time_per_iteration": 2.713909149169922 }, { "auxiliary_loss_clip": 0.01190576, "auxiliary_loss_mlp": 0.01036396, "balance_loss_clip": 1.05750108, "balance_loss_mlp": 1.02549386, "epoch": 0.235315336980701, "flos": 29643217234560.0, "grad_norm": 4.416154661613202, "language_loss": 0.71510023, "learning_rate": 3.574015831152092e-06, "loss": 0.73736995, "num_input_tokens_seen": 41738120, "step": 1957, "time_per_iteration": 2.7646992206573486 }, { "auxiliary_loss_clip": 0.01171297, "auxiliary_loss_mlp": 0.01037641, "balance_loss_clip": 1.05550623, "balance_loss_mlp": 1.02797902, "epoch": 0.23543557987134012, "flos": 18551704371840.0, "grad_norm": 2.1657030861780986, "language_loss": 0.83356643, "learning_rate": 3.573535130749316e-06, "loss": 0.85565585, "num_input_tokens_seen": 41756070, "step": 1958, "time_per_iteration": 2.7113566398620605 }, { "auxiliary_loss_clip": 0.01173824, "auxiliary_loss_mlp": 0.01040674, "balance_loss_clip": 1.05674732, "balance_loss_mlp": 1.03058875, "epoch": 0.2355558227619792, "flos": 24679033908480.0, "grad_norm": 2.32715424022086, "language_loss": 0.73966348, "learning_rate": 3.5730541916425127e-06, "loss": 0.76180845, "num_input_tokens_seen": 41777550, "step": 1959, "time_per_iteration": 2.760509490966797 }, { "auxiliary_loss_clip": 0.011676, "auxiliary_loss_mlp": 0.01036635, "balance_loss_clip": 1.05380201, "balance_loss_mlp": 1.0263834, "epoch": 0.23567606565261828, "flos": 21944795748480.0, "grad_norm": 4.497737178257931, "language_loss": 0.86270058, "learning_rate": 3.572573013904639e-06, "loss": 0.88474298, "num_input_tokens_seen": 41797460, "step": 1960, "time_per_iteration": 2.7796976566314697 }, { "auxiliary_loss_clip": 0.01209561, "auxiliary_loss_mlp": 0.01031886, "balance_loss_clip": 1.05805063, "balance_loss_mlp": 1.02212906, "epoch": 0.2357963085432574, "flos": 13589352639360.0, "grad_norm": 2.144069170185652, "language_loss": 0.91961336, "learning_rate": 3.572091597608689e-06, "loss": 0.94202781, "num_input_tokens_seen": 41815585, "step": 1961, "time_per_iteration": 2.7263734340667725 }, { "auxiliary_loss_clip": 0.01188851, "auxiliary_loss_mlp": 0.0103846, "balance_loss_clip": 1.05745411, "balance_loss_mlp": 1.02827907, "epoch": 0.23591655143389648, "flos": 22088689632000.0, "grad_norm": 2.2484658261004653, "language_loss": 0.73312354, "learning_rate": 3.571609942827694e-06, "loss": 0.7553966, "num_input_tokens_seen": 41834700, "step": 1962, "time_per_iteration": 2.7929601669311523 }, { "auxiliary_loss_clip": 0.01179431, "auxiliary_loss_mlp": 0.01038521, "balance_loss_clip": 1.05456209, "balance_loss_mlp": 1.0288527, "epoch": 0.23603679432453556, "flos": 17017349057280.0, "grad_norm": 1.7712787463125517, "language_loss": 0.88594162, "learning_rate": 3.57112804963472e-06, "loss": 0.90812111, "num_input_tokens_seen": 41852915, "step": 1963, "time_per_iteration": 2.728073835372925 }, { "auxiliary_loss_clip": 0.01161727, "auxiliary_loss_mlp": 0.01039478, "balance_loss_clip": 1.05680025, "balance_loss_mlp": 1.03001904, "epoch": 0.23615703721517464, "flos": 19171307001600.0, "grad_norm": 1.7904824707714606, "language_loss": 0.76605976, "learning_rate": 3.57064591810287e-06, "loss": 0.78807181, "num_input_tokens_seen": 41870415, "step": 1964, "time_per_iteration": 2.818753719329834 }, { "auxiliary_loss_clip": 0.01212821, "auxiliary_loss_mlp": 0.00764456, "balance_loss_clip": 1.0614841, "balance_loss_mlp": 1.00066268, "epoch": 0.23627728010581375, "flos": 19098803399040.0, "grad_norm": 2.4922525355660943, "language_loss": 0.80490601, "learning_rate": 3.570163548305284e-06, "loss": 0.82467878, "num_input_tokens_seen": 41889345, "step": 1965, "time_per_iteration": 3.607412576675415 }, { "auxiliary_loss_clip": 0.01183061, "auxiliary_loss_mlp": 0.01038946, "balance_loss_clip": 1.05592847, "balance_loss_mlp": 1.02848554, "epoch": 0.23639752299645284, "flos": 14282213057280.0, "grad_norm": 3.483136336137784, "language_loss": 0.70155245, "learning_rate": 3.569680940315135e-06, "loss": 0.72377253, "num_input_tokens_seen": 41905745, "step": 1966, "time_per_iteration": 2.725879669189453 }, { "auxiliary_loss_clip": 0.01173689, "auxiliary_loss_mlp": 0.01035078, "balance_loss_clip": 1.05448198, "balance_loss_mlp": 1.02400923, "epoch": 0.23651776588709192, "flos": 22893411980160.0, "grad_norm": 2.008246096625574, "language_loss": 0.81737256, "learning_rate": 3.5691980942056356e-06, "loss": 0.83946025, "num_input_tokens_seen": 41925115, "step": 1967, "time_per_iteration": 3.782923936843872 }, { "auxiliary_loss_clip": 0.01198319, "auxiliary_loss_mlp": 0.01034648, "balance_loss_clip": 1.05576921, "balance_loss_mlp": 1.02404404, "epoch": 0.23663800877773103, "flos": 18624531196800.0, "grad_norm": 2.1170206880090614, "language_loss": 0.79525512, "learning_rate": 3.5687150100500332e-06, "loss": 0.81758481, "num_input_tokens_seen": 41944815, "step": 1968, "time_per_iteration": 2.651975631713867 }, { "auxiliary_loss_clip": 0.01198145, "auxiliary_loss_mlp": 0.01040294, "balance_loss_clip": 1.05793345, "balance_loss_mlp": 1.0303638, "epoch": 0.2367582516683701, "flos": 25555828896000.0, "grad_norm": 1.733356478797489, "language_loss": 0.7453838, "learning_rate": 3.568231687921611e-06, "loss": 0.76776826, "num_input_tokens_seen": 41964990, "step": 1969, "time_per_iteration": 4.4237449169158936 }, { "auxiliary_loss_clip": 0.01207899, "auxiliary_loss_mlp": 0.01039275, "balance_loss_clip": 1.05738711, "balance_loss_mlp": 1.02999496, "epoch": 0.2368784945590092, "flos": 23295072839040.0, "grad_norm": 1.5623991225259253, "language_loss": 0.80483359, "learning_rate": 3.5677481278936883e-06, "loss": 0.82730532, "num_input_tokens_seen": 41984570, "step": 1970, "time_per_iteration": 2.6627728939056396 }, { "auxiliary_loss_clip": 0.01095315, "auxiliary_loss_mlp": 0.01006133, "balance_loss_clip": 1.03529024, "balance_loss_mlp": 1.00414217, "epoch": 0.23699873744964828, "flos": 69859291875840.0, "grad_norm": 0.8268789166211841, "language_loss": 0.57798064, "learning_rate": 3.5672643300396214e-06, "loss": 0.59899509, "num_input_tokens_seen": 42053715, "step": 1971, "time_per_iteration": 3.4624624252319336 }, { "auxiliary_loss_clip": 0.01167828, "auxiliary_loss_mlp": 0.01032423, "balance_loss_clip": 1.05527377, "balance_loss_mlp": 1.02261806, "epoch": 0.2371189803402874, "flos": 21835052720640.0, "grad_norm": 2.987003397972215, "language_loss": 0.67722964, "learning_rate": 3.566780294432802e-06, "loss": 0.6992321, "num_input_tokens_seen": 42070890, "step": 1972, "time_per_iteration": 2.728698968887329 }, { "auxiliary_loss_clip": 0.01211538, "auxiliary_loss_mlp": 0.01040336, "balance_loss_clip": 1.05868077, "balance_loss_mlp": 1.03015542, "epoch": 0.23723922323092647, "flos": 21908490076800.0, "grad_norm": 4.99851176282828, "language_loss": 0.75097215, "learning_rate": 3.566296021146657e-06, "loss": 0.77349079, "num_input_tokens_seen": 42090270, "step": 1973, "time_per_iteration": 2.6775996685028076 }, { "auxiliary_loss_clip": 0.01213711, "auxiliary_loss_mlp": 0.01033602, "balance_loss_clip": 1.06004465, "balance_loss_mlp": 1.02391672, "epoch": 0.23735946612156555, "flos": 32708803380480.0, "grad_norm": 2.0770141046460258, "language_loss": 0.73362607, "learning_rate": 3.565811510254652e-06, "loss": 0.75609922, "num_input_tokens_seen": 42111150, "step": 1974, "time_per_iteration": 2.7020983695983887 }, { "auxiliary_loss_clip": 0.01117449, "auxiliary_loss_mlp": 0.01002799, "balance_loss_clip": 1.04451942, "balance_loss_mlp": 1.0004859, "epoch": 0.23747970901220466, "flos": 70546944821760.0, "grad_norm": 0.8340068779108762, "language_loss": 0.58233798, "learning_rate": 3.5653267618302845e-06, "loss": 0.60354054, "num_input_tokens_seen": 42178730, "step": 1975, "time_per_iteration": 3.3298583030700684 }, { "auxiliary_loss_clip": 0.01210754, "auxiliary_loss_mlp": 0.01033942, "balance_loss_clip": 1.05854416, "balance_loss_mlp": 1.02411914, "epoch": 0.23759995190284375, "flos": 20849807594880.0, "grad_norm": 1.846372160079544, "language_loss": 0.85704225, "learning_rate": 3.564841775947093e-06, "loss": 0.87948918, "num_input_tokens_seen": 42199620, "step": 1976, "time_per_iteration": 2.6351826190948486 }, { "auxiliary_loss_clip": 0.0116454, "auxiliary_loss_mlp": 0.01037242, "balance_loss_clip": 1.05238295, "balance_loss_mlp": 1.02718019, "epoch": 0.23772019479348283, "flos": 32921645420160.0, "grad_norm": 2.7526769269689577, "language_loss": 0.76480389, "learning_rate": 3.5643565526786475e-06, "loss": 0.78682166, "num_input_tokens_seen": 42219560, "step": 1977, "time_per_iteration": 2.7980806827545166 }, { "auxiliary_loss_clip": 0.01214946, "auxiliary_loss_mlp": 0.01030868, "balance_loss_clip": 1.06131196, "balance_loss_mlp": 1.02121854, "epoch": 0.2378404376841219, "flos": 32342765834880.0, "grad_norm": 12.886705168617794, "language_loss": 0.77325833, "learning_rate": 3.5638710920985574e-06, "loss": 0.79571646, "num_input_tokens_seen": 42241020, "step": 1978, "time_per_iteration": 2.687873601913452 }, { "auxiliary_loss_clip": 0.01203027, "auxiliary_loss_mlp": 0.00765548, "balance_loss_clip": 1.0567832, "balance_loss_mlp": 1.00069046, "epoch": 0.23796068057476102, "flos": 22997624313600.0, "grad_norm": 2.840450634526938, "language_loss": 0.82379967, "learning_rate": 3.5633853942804655e-06, "loss": 0.84348541, "num_input_tokens_seen": 42259345, "step": 1979, "time_per_iteration": 2.655470371246338 }, { "auxiliary_loss_clip": 0.01164534, "auxiliary_loss_mlp": 0.01039653, "balance_loss_clip": 1.05068958, "balance_loss_mlp": 1.02867389, "epoch": 0.2380809234654001, "flos": 13480938414720.0, "grad_norm": 3.6781863135157122, "language_loss": 0.7705214, "learning_rate": 3.5628994592980527e-06, "loss": 0.7925632, "num_input_tokens_seen": 42277250, "step": 1980, "time_per_iteration": 2.6891067028045654 }, { "auxiliary_loss_clip": 0.01212986, "auxiliary_loss_mlp": 0.01035701, "balance_loss_clip": 1.05905545, "balance_loss_mlp": 1.02575898, "epoch": 0.2382011663560392, "flos": 16871803148160.0, "grad_norm": 1.7277145467076493, "language_loss": 0.70288521, "learning_rate": 3.562413287225034e-06, "loss": 0.72537202, "num_input_tokens_seen": 42295360, "step": 1981, "time_per_iteration": 2.556370735168457 }, { "auxiliary_loss_clip": 0.01193464, "auxiliary_loss_mlp": 0.01044769, "balance_loss_clip": 1.0596025, "balance_loss_mlp": 1.03451085, "epoch": 0.2383214092466783, "flos": 18441135331200.0, "grad_norm": 2.83827075708479, "language_loss": 0.89315999, "learning_rate": 3.5619268781351623e-06, "loss": 0.91554224, "num_input_tokens_seen": 42313430, "step": 1982, "time_per_iteration": 2.704265832901001 }, { "auxiliary_loss_clip": 0.01179016, "auxiliary_loss_mlp": 0.01037739, "balance_loss_clip": 1.05936408, "balance_loss_mlp": 1.02866709, "epoch": 0.23844165213731738, "flos": 19755717281280.0, "grad_norm": 2.009027121618499, "language_loss": 0.77016032, "learning_rate": 3.5614402321022256e-06, "loss": 0.79232788, "num_input_tokens_seen": 42331260, "step": 1983, "time_per_iteration": 2.645494222640991 }, { "auxiliary_loss_clip": 0.01142343, "auxiliary_loss_mlp": 0.01041973, "balance_loss_clip": 1.05187809, "balance_loss_mlp": 1.03243041, "epoch": 0.23856189502795647, "flos": 23367360960000.0, "grad_norm": 1.7686829448512968, "language_loss": 0.87083721, "learning_rate": 3.5609533492000463e-06, "loss": 0.89268041, "num_input_tokens_seen": 42350150, "step": 1984, "time_per_iteration": 2.8393704891204834 }, { "auxiliary_loss_clip": 0.01180976, "auxiliary_loss_mlp": 0.01036005, "balance_loss_clip": 1.05966246, "balance_loss_mlp": 1.02577066, "epoch": 0.23868213791859555, "flos": 23475056912640.0, "grad_norm": 2.3741227029999137, "language_loss": 0.78769839, "learning_rate": 3.560466229502485e-06, "loss": 0.80986816, "num_input_tokens_seen": 42369495, "step": 1985, "time_per_iteration": 2.8316457271575928 }, { "auxiliary_loss_clip": 0.01179492, "auxiliary_loss_mlp": 0.00765, "balance_loss_clip": 1.05901265, "balance_loss_mlp": 1.00072527, "epoch": 0.23880238080923466, "flos": 16617340224000.0, "grad_norm": 2.060615563519021, "language_loss": 0.90056872, "learning_rate": 3.5599788730834384e-06, "loss": 0.92001373, "num_input_tokens_seen": 42387455, "step": 1986, "time_per_iteration": 2.68450927734375 }, { "auxiliary_loss_clip": 0.01202622, "auxiliary_loss_mlp": 0.01036115, "balance_loss_clip": 1.0595938, "balance_loss_mlp": 1.02629769, "epoch": 0.23892262369987374, "flos": 17348409734400.0, "grad_norm": 6.983394046605042, "language_loss": 0.7906993, "learning_rate": 3.559491280016836e-06, "loss": 0.81308669, "num_input_tokens_seen": 42405400, "step": 1987, "time_per_iteration": 2.639719247817993 }, { "auxiliary_loss_clip": 0.01182768, "auxiliary_loss_mlp": 0.01036348, "balance_loss_clip": 1.05700672, "balance_loss_mlp": 1.02637661, "epoch": 0.23904286659051283, "flos": 22309899540480.0, "grad_norm": 1.7701738136566376, "language_loss": 0.71124113, "learning_rate": 3.5590034503766465e-06, "loss": 0.73343229, "num_input_tokens_seen": 42425065, "step": 1988, "time_per_iteration": 2.745944023132324 }, { "auxiliary_loss_clip": 0.01215618, "auxiliary_loss_mlp": 0.01042243, "balance_loss_clip": 1.06205511, "balance_loss_mlp": 1.03225315, "epoch": 0.23916310948115194, "flos": 21178246579200.0, "grad_norm": 2.2573178453710097, "language_loss": 0.81116229, "learning_rate": 3.558515384236874e-06, "loss": 0.83374095, "num_input_tokens_seen": 42442495, "step": 1989, "time_per_iteration": 2.6128361225128174 }, { "auxiliary_loss_clip": 0.01158033, "auxiliary_loss_mlp": 0.00765471, "balance_loss_clip": 1.05472374, "balance_loss_mlp": 1.00064957, "epoch": 0.23928335237179102, "flos": 14137349506560.0, "grad_norm": 1.9087884241875184, "language_loss": 0.84081233, "learning_rate": 3.558027081671556e-06, "loss": 0.86004734, "num_input_tokens_seen": 42459480, "step": 1990, "time_per_iteration": 2.7520062923431396 }, { "auxiliary_loss_clip": 0.012036, "auxiliary_loss_mlp": 0.0103712, "balance_loss_clip": 1.05837548, "balance_loss_mlp": 1.02696395, "epoch": 0.2394035952624301, "flos": 23769596436480.0, "grad_norm": 3.171122643729757, "language_loss": 0.68608105, "learning_rate": 3.557538542754769e-06, "loss": 0.70848823, "num_input_tokens_seen": 42479175, "step": 1991, "time_per_iteration": 3.6641550064086914 }, { "auxiliary_loss_clip": 0.01215343, "auxiliary_loss_mlp": 0.01042179, "balance_loss_clip": 1.06241548, "balance_loss_mlp": 1.03102136, "epoch": 0.2395238381530692, "flos": 24206198250240.0, "grad_norm": 2.200211060809397, "language_loss": 0.67143953, "learning_rate": 3.557049767560623e-06, "loss": 0.69401479, "num_input_tokens_seen": 42498090, "step": 1992, "time_per_iteration": 2.6709232330322266 }, { "auxiliary_loss_clip": 0.01159363, "auxiliary_loss_mlp": 0.01032797, "balance_loss_clip": 1.05816722, "balance_loss_mlp": 1.02317691, "epoch": 0.2396440810437083, "flos": 25295763450240.0, "grad_norm": 2.1047936524406317, "language_loss": 0.86007529, "learning_rate": 3.5565607561632655e-06, "loss": 0.88199693, "num_input_tokens_seen": 42516930, "step": 1993, "time_per_iteration": 2.8814609050750732 }, { "auxiliary_loss_clip": 0.01179853, "auxiliary_loss_mlp": 0.0103404, "balance_loss_clip": 1.05660939, "balance_loss_mlp": 1.02337682, "epoch": 0.23976432393434738, "flos": 28543093436160.0, "grad_norm": 2.992230044159644, "language_loss": 0.79044586, "learning_rate": 3.5560715086368787e-06, "loss": 0.81258476, "num_input_tokens_seen": 42534800, "step": 1994, "time_per_iteration": 3.6999244689941406 }, { "auxiliary_loss_clip": 0.01178053, "auxiliary_loss_mlp": 0.01038751, "balance_loss_clip": 1.05819428, "balance_loss_mlp": 1.02849901, "epoch": 0.23988456682498646, "flos": 19494358945920.0, "grad_norm": 2.7324409232418683, "language_loss": 0.8253864, "learning_rate": 3.5555820250556816e-06, "loss": 0.84755445, "num_input_tokens_seen": 42552000, "step": 1995, "time_per_iteration": 3.614504337310791 }, { "auxiliary_loss_clip": 0.01190818, "auxiliary_loss_mlp": 0.01036306, "balance_loss_clip": 1.06128597, "balance_loss_mlp": 1.02568424, "epoch": 0.24000480971562557, "flos": 20266331068800.0, "grad_norm": 4.030747429049923, "language_loss": 0.69356394, "learning_rate": 3.5550923054939278e-06, "loss": 0.71583521, "num_input_tokens_seen": 42571455, "step": 1996, "time_per_iteration": 3.694998025894165 }, { "auxiliary_loss_clip": 0.01149292, "auxiliary_loss_mlp": 0.01037034, "balance_loss_clip": 1.05255103, "balance_loss_mlp": 1.02713358, "epoch": 0.24012505260626466, "flos": 25443176866560.0, "grad_norm": 2.251061970684212, "language_loss": 0.74747276, "learning_rate": 3.5546023500259083e-06, "loss": 0.76933599, "num_input_tokens_seen": 42592550, "step": 1997, "time_per_iteration": 2.8813302516937256 }, { "auxiliary_loss_clip": 0.01160887, "auxiliary_loss_mlp": 0.01033658, "balance_loss_clip": 1.05597234, "balance_loss_mlp": 1.02413344, "epoch": 0.24024529549690374, "flos": 15553342529280.0, "grad_norm": 2.2278629049654937, "language_loss": 0.81059968, "learning_rate": 3.5541121587259477e-06, "loss": 0.83254516, "num_input_tokens_seen": 42610385, "step": 1998, "time_per_iteration": 2.7959396839141846 }, { "auxiliary_loss_clip": 0.01105963, "auxiliary_loss_mlp": 0.01004751, "balance_loss_clip": 1.0379827, "balance_loss_mlp": 1.00273609, "epoch": 0.24036553838754285, "flos": 57122351867520.0, "grad_norm": 0.8483933557061812, "language_loss": 0.57813954, "learning_rate": 3.553621731668408e-06, "loss": 0.59924668, "num_input_tokens_seen": 42673595, "step": 1999, "time_per_iteration": 3.2449729442596436 }, { "auxiliary_loss_clip": 0.0119422, "auxiliary_loss_mlp": 0.01039136, "balance_loss_clip": 1.05755472, "balance_loss_mlp": 1.02872288, "epoch": 0.24048578127818193, "flos": 24969946158720.0, "grad_norm": 1.7383209746081076, "language_loss": 0.83310312, "learning_rate": 3.553131068927688e-06, "loss": 0.85543668, "num_input_tokens_seen": 42692000, "step": 2000, "time_per_iteration": 2.7133445739746094 }, { "auxiliary_loss_clip": 0.01170899, "auxiliary_loss_mlp": 0.0103501, "balance_loss_clip": 1.05794787, "balance_loss_mlp": 1.02474046, "epoch": 0.24060602416882101, "flos": 23330947547520.0, "grad_norm": 1.716602722049479, "language_loss": 0.80298728, "learning_rate": 3.552640170578219e-06, "loss": 0.82504636, "num_input_tokens_seen": 42712250, "step": 2001, "time_per_iteration": 2.8069820404052734 }, { "auxiliary_loss_clip": 0.0118589, "auxiliary_loss_mlp": 0.01037171, "balance_loss_clip": 1.05877948, "balance_loss_mlp": 1.02719927, "epoch": 0.2407262670594601, "flos": 14173260128640.0, "grad_norm": 2.3431572687297706, "language_loss": 0.78004354, "learning_rate": 3.5521490366944703e-06, "loss": 0.80227411, "num_input_tokens_seen": 42729900, "step": 2002, "time_per_iteration": 2.6220438480377197 }, { "auxiliary_loss_clip": 0.01168054, "auxiliary_loss_mlp": 0.01039688, "balance_loss_clip": 1.05651522, "balance_loss_mlp": 1.02975774, "epoch": 0.2408465099500992, "flos": 13663113217920.0, "grad_norm": 2.9677083367355737, "language_loss": 0.80454993, "learning_rate": 3.5516576673509474e-06, "loss": 0.82662731, "num_input_tokens_seen": 42747900, "step": 2003, "time_per_iteration": 2.7236740589141846 }, { "auxiliary_loss_clip": 0.01214283, "auxiliary_loss_mlp": 0.01037657, "balance_loss_clip": 1.06021154, "balance_loss_mlp": 1.02735734, "epoch": 0.2409667528407383, "flos": 31248029076480.0, "grad_norm": 1.9551561629891352, "language_loss": 0.85969341, "learning_rate": 3.5511660626221896e-06, "loss": 0.88221282, "num_input_tokens_seen": 42768540, "step": 2004, "time_per_iteration": 2.6839094161987305 }, { "auxiliary_loss_clip": 0.01183742, "auxiliary_loss_mlp": 0.00765447, "balance_loss_clip": 1.05851483, "balance_loss_mlp": 1.00070953, "epoch": 0.24108699573137737, "flos": 22199941031040.0, "grad_norm": 3.8103128675627143, "language_loss": 0.89175749, "learning_rate": 3.5506742225827744e-06, "loss": 0.9112494, "num_input_tokens_seen": 42785395, "step": 2005, "time_per_iteration": 2.658069372177124 }, { "auxiliary_loss_clip": 0.01165989, "auxiliary_loss_mlp": 0.01038791, "balance_loss_clip": 1.05478621, "balance_loss_mlp": 1.02816963, "epoch": 0.24120723862201648, "flos": 26103035664000.0, "grad_norm": 2.319572626270833, "language_loss": 0.90465385, "learning_rate": 3.5501821473073116e-06, "loss": 0.92670166, "num_input_tokens_seen": 42801980, "step": 2006, "time_per_iteration": 2.7185373306274414 }, { "auxiliary_loss_clip": 0.01166932, "auxiliary_loss_mlp": 0.01037401, "balance_loss_clip": 1.05856574, "balance_loss_mlp": 1.028234, "epoch": 0.24132748151265557, "flos": 18624926246400.0, "grad_norm": 2.5675914254246432, "language_loss": 0.86949396, "learning_rate": 3.54968983687045e-06, "loss": 0.89153731, "num_input_tokens_seen": 42818850, "step": 2007, "time_per_iteration": 2.6639606952667236 }, { "auxiliary_loss_clip": 0.01186992, "auxiliary_loss_mlp": 0.01034617, "balance_loss_clip": 1.05817759, "balance_loss_mlp": 1.02428782, "epoch": 0.24144772440329465, "flos": 15267673664640.0, "grad_norm": 5.599681791832032, "language_loss": 0.89493906, "learning_rate": 3.549197291346872e-06, "loss": 0.91715515, "num_input_tokens_seen": 42835375, "step": 2008, "time_per_iteration": 2.6706700325012207 }, { "auxiliary_loss_clip": 0.01200275, "auxiliary_loss_mlp": 0.0103439, "balance_loss_clip": 1.0591352, "balance_loss_mlp": 1.02411413, "epoch": 0.24156796729393373, "flos": 24024274842240.0, "grad_norm": 2.4559462413325366, "language_loss": 0.79653823, "learning_rate": 3.548704510811297e-06, "loss": 0.81888485, "num_input_tokens_seen": 42854570, "step": 2009, "time_per_iteration": 2.6534206867218018 }, { "auxiliary_loss_clip": 0.01157506, "auxiliary_loss_mlp": 0.01030234, "balance_loss_clip": 1.05218899, "balance_loss_mlp": 1.0199635, "epoch": 0.24168821018457284, "flos": 26286790665600.0, "grad_norm": 3.9723532804591235, "language_loss": 0.74644762, "learning_rate": 3.5482114953384787e-06, "loss": 0.76832497, "num_input_tokens_seen": 42873800, "step": 2010, "time_per_iteration": 2.7932276725769043 }, { "auxiliary_loss_clip": 0.0120553, "auxiliary_loss_mlp": 0.01039524, "balance_loss_clip": 1.06070685, "balance_loss_mlp": 1.02986801, "epoch": 0.24180845307521193, "flos": 18223193560320.0, "grad_norm": 2.115904867805552, "language_loss": 0.84445441, "learning_rate": 3.5477182450032077e-06, "loss": 0.86690491, "num_input_tokens_seen": 42892400, "step": 2011, "time_per_iteration": 2.6598806381225586 }, { "auxiliary_loss_clip": 0.01198819, "auxiliary_loss_mlp": 0.01039634, "balance_loss_clip": 1.06034636, "balance_loss_mlp": 1.02917361, "epoch": 0.241928695965851, "flos": 20449260057600.0, "grad_norm": 2.2011405519957794, "language_loss": 0.83051968, "learning_rate": 3.5472247598803097e-06, "loss": 0.8529042, "num_input_tokens_seen": 42911745, "step": 2012, "time_per_iteration": 2.73429012298584 }, { "auxiliary_loss_clip": 0.01212486, "auxiliary_loss_mlp": 0.01034037, "balance_loss_clip": 1.05932772, "balance_loss_mlp": 1.0235827, "epoch": 0.24204893885649012, "flos": 25556475340800.0, "grad_norm": 14.414151707812543, "language_loss": 0.85294187, "learning_rate": 3.546731040044645e-06, "loss": 0.87540716, "num_input_tokens_seen": 42926915, "step": 2013, "time_per_iteration": 2.6350269317626953 }, { "auxiliary_loss_clip": 0.01214582, "auxiliary_loss_mlp": 0.01040463, "balance_loss_clip": 1.06210208, "balance_loss_mlp": 1.03022909, "epoch": 0.2421691817471292, "flos": 30660207004800.0, "grad_norm": 1.9019973223995599, "language_loss": 0.75359774, "learning_rate": 3.546237085571112e-06, "loss": 0.77614814, "num_input_tokens_seen": 42945350, "step": 2014, "time_per_iteration": 2.7070388793945312 }, { "auxiliary_loss_clip": 0.01200731, "auxiliary_loss_mlp": 0.01040306, "balance_loss_clip": 1.05989289, "balance_loss_mlp": 1.03019106, "epoch": 0.24228942463776829, "flos": 21945011230080.0, "grad_norm": 2.1927855688270417, "language_loss": 0.72535086, "learning_rate": 3.5457428965346425e-06, "loss": 0.74776125, "num_input_tokens_seen": 42964290, "step": 2015, "time_per_iteration": 2.707904815673828 }, { "auxiliary_loss_clip": 0.01139492, "auxiliary_loss_mlp": 0.01038266, "balance_loss_clip": 1.05338597, "balance_loss_mlp": 1.02840114, "epoch": 0.2424096675284074, "flos": 33984493879680.0, "grad_norm": 1.738795974554511, "language_loss": 0.74727052, "learning_rate": 3.545248473010205e-06, "loss": 0.76904809, "num_input_tokens_seen": 42987095, "step": 2016, "time_per_iteration": 3.775074005126953 }, { "auxiliary_loss_clip": 0.01217829, "auxiliary_loss_mlp": 0.00765687, "balance_loss_clip": 1.06156278, "balance_loss_mlp": 1.00059283, "epoch": 0.24252991041904648, "flos": 21653416621440.0, "grad_norm": 1.8967531969010005, "language_loss": 0.87898171, "learning_rate": 3.544753815072802e-06, "loss": 0.89881694, "num_input_tokens_seen": 43005750, "step": 2017, "time_per_iteration": 2.653320550918579 }, { "auxiliary_loss_clip": 0.01114328, "auxiliary_loss_mlp": 0.0103577, "balance_loss_clip": 1.04510069, "balance_loss_mlp": 1.02597117, "epoch": 0.24265015330968556, "flos": 21870065502720.0, "grad_norm": 2.1156823808454788, "language_loss": 0.88167858, "learning_rate": 3.544258922797474e-06, "loss": 0.90317953, "num_input_tokens_seen": 43023870, "step": 2018, "time_per_iteration": 2.762349843978882 }, { "auxiliary_loss_clip": 0.01213472, "auxiliary_loss_mlp": 0.01035509, "balance_loss_clip": 1.06151104, "balance_loss_mlp": 1.02542341, "epoch": 0.24277039620032465, "flos": 25628260671360.0, "grad_norm": 2.1663152406064627, "language_loss": 0.78361022, "learning_rate": 3.543763796259295e-06, "loss": 0.80610001, "num_input_tokens_seen": 43043825, "step": 2019, "time_per_iteration": 2.674020290374756 }, { "auxiliary_loss_clip": 0.01198249, "auxiliary_loss_mlp": 0.01035427, "balance_loss_clip": 1.05914366, "balance_loss_mlp": 1.02547884, "epoch": 0.24289063909096376, "flos": 26286575184000.0, "grad_norm": 1.8976977203169956, "language_loss": 0.91214162, "learning_rate": 3.5432684355333754e-06, "loss": 0.93447834, "num_input_tokens_seen": 43062480, "step": 2020, "time_per_iteration": 3.630615711212158 }, { "auxiliary_loss_clip": 0.01197658, "auxiliary_loss_mlp": 0.01032436, "balance_loss_clip": 1.05699801, "balance_loss_mlp": 1.02187455, "epoch": 0.24301088198160284, "flos": 25075056332160.0, "grad_norm": 2.459321933808246, "language_loss": 0.76646554, "learning_rate": 3.5427728406948613e-06, "loss": 0.78876644, "num_input_tokens_seen": 43081595, "step": 2021, "time_per_iteration": 3.604193925857544 }, { "auxiliary_loss_clip": 0.01089574, "auxiliary_loss_mlp": 0.01002889, "balance_loss_clip": 1.02868152, "balance_loss_mlp": 1.00080287, "epoch": 0.24313112487224192, "flos": 69900948673920.0, "grad_norm": 0.7958063599227025, "language_loss": 0.57874006, "learning_rate": 3.542277011818934e-06, "loss": 0.59966475, "num_input_tokens_seen": 43145430, "step": 2022, "time_per_iteration": 4.293999195098877 }, { "auxiliary_loss_clip": 0.01183391, "auxiliary_loss_mlp": 0.01031875, "balance_loss_clip": 1.05768824, "balance_loss_mlp": 1.02304184, "epoch": 0.24325136776288103, "flos": 40662334235520.0, "grad_norm": 2.38061674499193, "language_loss": 0.74062324, "learning_rate": 3.5417809489808104e-06, "loss": 0.76277596, "num_input_tokens_seen": 43167040, "step": 2023, "time_per_iteration": 2.8267881870269775 }, { "auxiliary_loss_clip": 0.01202115, "auxiliary_loss_mlp": 0.01037558, "balance_loss_clip": 1.06207561, "balance_loss_mlp": 1.02760983, "epoch": 0.24337161065352012, "flos": 25046400257280.0, "grad_norm": 1.8290291536382646, "language_loss": 0.72434151, "learning_rate": 3.5412846522557422e-06, "loss": 0.74673826, "num_input_tokens_seen": 43187930, "step": 2024, "time_per_iteration": 2.7770321369171143 }, { "auxiliary_loss_clip": 0.01215433, "auxiliary_loss_mlp": 0.01038503, "balance_loss_clip": 1.06268084, "balance_loss_mlp": 1.02860844, "epoch": 0.2434918535441592, "flos": 18661160090880.0, "grad_norm": 3.5759426026039076, "language_loss": 0.74037898, "learning_rate": 3.540788121719018e-06, "loss": 0.76291829, "num_input_tokens_seen": 43206350, "step": 2025, "time_per_iteration": 2.6739232540130615 }, { "auxiliary_loss_clip": 0.01162018, "auxiliary_loss_mlp": 0.01029825, "balance_loss_clip": 1.05833817, "balance_loss_mlp": 1.02040708, "epoch": 0.24361209643479828, "flos": 23915142345600.0, "grad_norm": 1.9525908821329123, "language_loss": 0.81684649, "learning_rate": 3.5402913574459604e-06, "loss": 0.83876497, "num_input_tokens_seen": 43226255, "step": 2026, "time_per_iteration": 2.8382441997528076 }, { "auxiliary_loss_clip": 0.01134507, "auxiliary_loss_mlp": 0.01032234, "balance_loss_clip": 1.04900861, "balance_loss_mlp": 1.02280998, "epoch": 0.2437323393254374, "flos": 28657505232000.0, "grad_norm": 2.1791462552988636, "language_loss": 0.85920799, "learning_rate": 3.5397943595119297e-06, "loss": 0.88087535, "num_input_tokens_seen": 43247675, "step": 2027, "time_per_iteration": 3.0320804119110107 }, { "auxiliary_loss_clip": 0.01181685, "auxiliary_loss_mlp": 0.01036022, "balance_loss_clip": 1.06036174, "balance_loss_mlp": 1.02629483, "epoch": 0.24385258221607647, "flos": 23550325862400.0, "grad_norm": 4.475027494552107, "language_loss": 0.77100861, "learning_rate": 3.5392971279923177e-06, "loss": 0.79318571, "num_input_tokens_seen": 43265895, "step": 2028, "time_per_iteration": 2.90582537651062 }, { "auxiliary_loss_clip": 0.01161985, "auxiliary_loss_mlp": 0.01038543, "balance_loss_clip": 1.05227256, "balance_loss_mlp": 1.0284282, "epoch": 0.24397282510671556, "flos": 25336091445120.0, "grad_norm": 2.735913459405839, "language_loss": 0.83022892, "learning_rate": 3.5387996629625557e-06, "loss": 0.85223418, "num_input_tokens_seen": 43283485, "step": 2029, "time_per_iteration": 2.7067718505859375 }, { "auxiliary_loss_clip": 0.01115913, "auxiliary_loss_mlp": 0.01003068, "balance_loss_clip": 1.03254104, "balance_loss_mlp": 1.0011605, "epoch": 0.24409306799735467, "flos": 65187421430400.0, "grad_norm": 0.8116011816495262, "language_loss": 0.54968417, "learning_rate": 3.5383019644981083e-06, "loss": 0.57087398, "num_input_tokens_seen": 43347180, "step": 2030, "time_per_iteration": 3.1867733001708984 }, { "auxiliary_loss_clip": 0.0118372, "auxiliary_loss_mlp": 0.01037124, "balance_loss_clip": 1.05943441, "balance_loss_mlp": 1.02673507, "epoch": 0.24421331088799375, "flos": 19537093152000.0, "grad_norm": 2.570066423171972, "language_loss": 0.73133808, "learning_rate": 3.5378040326744763e-06, "loss": 0.75354648, "num_input_tokens_seen": 43366665, "step": 2031, "time_per_iteration": 2.6384527683258057 }, { "auxiliary_loss_clip": 0.01168581, "auxiliary_loss_mlp": 0.01036645, "balance_loss_clip": 1.06007528, "balance_loss_mlp": 1.02700162, "epoch": 0.24433355377863283, "flos": 21068575378560.0, "grad_norm": 2.2096005779334797, "language_loss": 0.857867, "learning_rate": 3.5373058675671946e-06, "loss": 0.87991923, "num_input_tokens_seen": 43384670, "step": 2032, "time_per_iteration": 2.7296409606933594 }, { "auxiliary_loss_clip": 0.01144473, "auxiliary_loss_mlp": 0.0103767, "balance_loss_clip": 1.05142188, "balance_loss_mlp": 1.02750754, "epoch": 0.24445379666927192, "flos": 22637189289600.0, "grad_norm": 2.514126869431076, "language_loss": 0.72859848, "learning_rate": 3.536807469251836e-06, "loss": 0.75041997, "num_input_tokens_seen": 43403825, "step": 2033, "time_per_iteration": 2.7600364685058594 }, { "auxiliary_loss_clip": 0.01172068, "auxiliary_loss_mlp": 0.01037039, "balance_loss_clip": 1.05254388, "balance_loss_mlp": 1.02722239, "epoch": 0.24457403955991103, "flos": 21251612108160.0, "grad_norm": 3.079338675454707, "language_loss": 0.82810831, "learning_rate": 3.5363088378040055e-06, "loss": 0.85019946, "num_input_tokens_seen": 43422715, "step": 2034, "time_per_iteration": 2.8573055267333984 }, { "auxiliary_loss_clip": 0.01110879, "auxiliary_loss_mlp": 0.00755255, "balance_loss_clip": 1.02828383, "balance_loss_mlp": 1.00055492, "epoch": 0.2446942824505501, "flos": 66997820764800.0, "grad_norm": 0.7591812012724692, "language_loss": 0.64284837, "learning_rate": 3.5358099732993463e-06, "loss": 0.66150975, "num_input_tokens_seen": 43481825, "step": 2035, "time_per_iteration": 3.114967107772827 }, { "auxiliary_loss_clip": 0.01191251, "auxiliary_loss_mlp": 0.01038105, "balance_loss_clip": 1.05793977, "balance_loss_mlp": 1.02794862, "epoch": 0.2448145253411892, "flos": 20411122792320.0, "grad_norm": 2.125605031312977, "language_loss": 0.89667505, "learning_rate": 3.535310875813535e-06, "loss": 0.91896862, "num_input_tokens_seen": 43500220, "step": 2036, "time_per_iteration": 2.6819303035736084 }, { "auxiliary_loss_clip": 0.01196903, "auxiliary_loss_mlp": 0.01039999, "balance_loss_clip": 1.05845189, "balance_loss_mlp": 1.0304029, "epoch": 0.2449347682318283, "flos": 28804739080320.0, "grad_norm": 1.8293719527655232, "language_loss": 0.81798041, "learning_rate": 3.5348115454222843e-06, "loss": 0.84034944, "num_input_tokens_seen": 43522805, "step": 2037, "time_per_iteration": 2.683457851409912 }, { "auxiliary_loss_clip": 0.01176982, "auxiliary_loss_mlp": 0.01040947, "balance_loss_clip": 1.05285549, "balance_loss_mlp": 1.03039694, "epoch": 0.2450550111224674, "flos": 22528990546560.0, "grad_norm": 2.1568862054604265, "language_loss": 0.86730862, "learning_rate": 3.5343119822013425e-06, "loss": 0.88948798, "num_input_tokens_seen": 43541915, "step": 2038, "time_per_iteration": 2.6563141345977783 }, { "auxiliary_loss_clip": 0.01207191, "auxiliary_loss_mlp": 0.01041554, "balance_loss_clip": 1.06160426, "balance_loss_mlp": 1.03167772, "epoch": 0.24517525401310647, "flos": 21759137326080.0, "grad_norm": 1.9221797844441748, "language_loss": 0.77858287, "learning_rate": 3.533812186226493e-06, "loss": 0.80107033, "num_input_tokens_seen": 43562625, "step": 2039, "time_per_iteration": 2.7181053161621094 }, { "auxiliary_loss_clip": 0.01208335, "auxiliary_loss_mlp": 0.01033017, "balance_loss_clip": 1.05885792, "balance_loss_mlp": 1.02349186, "epoch": 0.24529549690374555, "flos": 25043311687680.0, "grad_norm": 1.6905698385586518, "language_loss": 0.75919688, "learning_rate": 3.5333121575735545e-06, "loss": 0.78161037, "num_input_tokens_seen": 43582265, "step": 2040, "time_per_iteration": 2.6123206615448 }, { "auxiliary_loss_clip": 0.01181923, "auxiliary_loss_mlp": 0.01032315, "balance_loss_clip": 1.05770659, "balance_loss_mlp": 1.02264762, "epoch": 0.24541573979438466, "flos": 32123638915200.0, "grad_norm": 2.651263188535779, "language_loss": 0.75390327, "learning_rate": 3.532811896318381e-06, "loss": 0.77604562, "num_input_tokens_seen": 43604335, "step": 2041, "time_per_iteration": 2.7912018299102783 }, { "auxiliary_loss_clip": 0.01173154, "auxiliary_loss_mlp": 0.01033091, "balance_loss_clip": 1.05653989, "balance_loss_mlp": 1.02288628, "epoch": 0.24553598268502375, "flos": 31357556622720.0, "grad_norm": 13.075809416925239, "language_loss": 0.82063532, "learning_rate": 3.5323114025368615e-06, "loss": 0.84269774, "num_input_tokens_seen": 43619400, "step": 2042, "time_per_iteration": 3.7641818523406982 }, { "auxiliary_loss_clip": 0.01190509, "auxiliary_loss_mlp": 0.01040051, "balance_loss_clip": 1.05386281, "balance_loss_mlp": 1.03016281, "epoch": 0.24565622557566283, "flos": 14027462824320.0, "grad_norm": 6.431210233954517, "language_loss": 0.82230502, "learning_rate": 3.53181067630492e-06, "loss": 0.84461063, "num_input_tokens_seen": 43636870, "step": 2043, "time_per_iteration": 2.6107115745544434 }, { "auxiliary_loss_clip": 0.01171942, "auxiliary_loss_mlp": 0.01041521, "balance_loss_clip": 1.05349135, "balance_loss_mlp": 1.03201985, "epoch": 0.24577646846630194, "flos": 16581465515520.0, "grad_norm": 1.8556758533871944, "language_loss": 0.76223731, "learning_rate": 3.5313097176985175e-06, "loss": 0.78437191, "num_input_tokens_seen": 43655180, "step": 2044, "time_per_iteration": 2.7360823154449463 }, { "auxiliary_loss_clip": 0.01197874, "auxiliary_loss_mlp": 0.01038578, "balance_loss_clip": 1.05821276, "balance_loss_mlp": 1.02924967, "epoch": 0.24589671135694102, "flos": 18807424272000.0, "grad_norm": 1.8388424858560708, "language_loss": 0.81585276, "learning_rate": 3.5308085267936482e-06, "loss": 0.83821726, "num_input_tokens_seen": 43672895, "step": 2045, "time_per_iteration": 2.6189157962799072 }, { "auxiliary_loss_clip": 0.01138885, "auxiliary_loss_mlp": 0.00765004, "balance_loss_clip": 1.05370128, "balance_loss_mlp": 1.0005424, "epoch": 0.2460169542475801, "flos": 19938538529280.0, "grad_norm": 1.8155050058829276, "language_loss": 0.89688802, "learning_rate": 3.530307103666342e-06, "loss": 0.91592687, "num_input_tokens_seen": 43691975, "step": 2046, "time_per_iteration": 3.8107922077178955 }, { "auxiliary_loss_clip": 0.01174868, "auxiliary_loss_mlp": 0.01036657, "balance_loss_clip": 1.057127, "balance_loss_mlp": 1.02672052, "epoch": 0.24613719713821922, "flos": 24171221381760.0, "grad_norm": 4.107715126636233, "language_loss": 0.80340886, "learning_rate": 3.5298054483926658e-06, "loss": 0.82552409, "num_input_tokens_seen": 43712670, "step": 2047, "time_per_iteration": 3.0098817348480225 }, { "auxiliary_loss_clip": 0.01205234, "auxiliary_loss_mlp": 0.01038292, "balance_loss_clip": 1.06002212, "balance_loss_mlp": 1.02753973, "epoch": 0.2462574400288583, "flos": 30221055325440.0, "grad_norm": 2.5493713921016283, "language_loss": 0.82666504, "learning_rate": 3.5293035610487187e-06, "loss": 0.84910029, "num_input_tokens_seen": 43732035, "step": 2048, "time_per_iteration": 4.182825326919556 }, { "auxiliary_loss_clip": 0.01083281, "auxiliary_loss_mlp": 0.01008407, "balance_loss_clip": 1.02531004, "balance_loss_mlp": 1.00627315, "epoch": 0.24637768291949738, "flos": 68943030819840.0, "grad_norm": 0.7239360895710083, "language_loss": 0.61987591, "learning_rate": 3.5288014417106374e-06, "loss": 0.64079273, "num_input_tokens_seen": 43798055, "step": 2049, "time_per_iteration": 3.337618827819824 }, { "auxiliary_loss_clip": 0.01164081, "auxiliary_loss_mlp": 0.01032834, "balance_loss_clip": 1.05405986, "balance_loss_mlp": 1.02296972, "epoch": 0.24649792581013646, "flos": 34383999922560.0, "grad_norm": 2.3595246376297543, "language_loss": 0.75594622, "learning_rate": 3.528299090454593e-06, "loss": 0.77791536, "num_input_tokens_seen": 43818590, "step": 2050, "time_per_iteration": 2.8811540603637695 }, { "auxiliary_loss_clip": 0.01202195, "auxiliary_loss_mlp": 0.0103596, "balance_loss_clip": 1.06064606, "balance_loss_mlp": 1.02532089, "epoch": 0.24661816870077558, "flos": 19680448331520.0, "grad_norm": 2.390247633135295, "language_loss": 0.82734758, "learning_rate": 3.527796507356792e-06, "loss": 0.84972918, "num_input_tokens_seen": 43832480, "step": 2051, "time_per_iteration": 2.7242166996002197 }, { "auxiliary_loss_clip": 0.01201358, "auxiliary_loss_mlp": 0.01037948, "balance_loss_clip": 1.05755496, "balance_loss_mlp": 1.02820277, "epoch": 0.24673841159141466, "flos": 20002279213440.0, "grad_norm": 2.7180965374643273, "language_loss": 0.89651322, "learning_rate": 3.527293692493475e-06, "loss": 0.91890627, "num_input_tokens_seen": 43848345, "step": 2052, "time_per_iteration": 2.739546060562134 }, { "auxiliary_loss_clip": 0.01202183, "auxiliary_loss_mlp": 0.01040646, "balance_loss_clip": 1.0584991, "balance_loss_mlp": 1.02997673, "epoch": 0.24685865448205374, "flos": 21646593037440.0, "grad_norm": 3.0559759164521534, "language_loss": 0.73077273, "learning_rate": 3.52679064594092e-06, "loss": 0.75320101, "num_input_tokens_seen": 43865685, "step": 2053, "time_per_iteration": 2.742281675338745 }, { "auxiliary_loss_clip": 0.01133657, "auxiliary_loss_mlp": 0.01030971, "balance_loss_clip": 1.04106259, "balance_loss_mlp": 1.02148771, "epoch": 0.24697889737269285, "flos": 17960470508160.0, "grad_norm": 3.394304796272737, "language_loss": 0.74966234, "learning_rate": 3.5262873677754375e-06, "loss": 0.77130866, "num_input_tokens_seen": 43883690, "step": 2054, "time_per_iteration": 2.890406370162964 }, { "auxiliary_loss_clip": 0.01210177, "auxiliary_loss_mlp": 0.01035012, "balance_loss_clip": 1.0610702, "balance_loss_mlp": 1.02555227, "epoch": 0.24709914026333193, "flos": 27344611221120.0, "grad_norm": 1.7321416245176058, "language_loss": 0.80612814, "learning_rate": 3.5257838580733745e-06, "loss": 0.82858002, "num_input_tokens_seen": 43903295, "step": 2055, "time_per_iteration": 2.8042657375335693 }, { "auxiliary_loss_clip": 0.01200856, "auxiliary_loss_mlp": 0.01033341, "balance_loss_clip": 1.05878901, "balance_loss_mlp": 1.02304721, "epoch": 0.24721938315397102, "flos": 19275519335040.0, "grad_norm": 2.1448791513439796, "language_loss": 0.87206191, "learning_rate": 3.5252801169111138e-06, "loss": 0.89440387, "num_input_tokens_seen": 43920960, "step": 2056, "time_per_iteration": 2.7577953338623047 }, { "auxiliary_loss_clip": 0.01181285, "auxiliary_loss_mlp": 0.01039343, "balance_loss_clip": 1.05972767, "balance_loss_mlp": 1.02974069, "epoch": 0.2473396260446101, "flos": 23185796688000.0, "grad_norm": 1.8257254478481604, "language_loss": 0.79878592, "learning_rate": 3.524776144365072e-06, "loss": 0.82099223, "num_input_tokens_seen": 43939415, "step": 2057, "time_per_iteration": 2.9318113327026367 }, { "auxiliary_loss_clip": 0.01179008, "auxiliary_loss_mlp": 0.01045851, "balance_loss_clip": 1.06057215, "balance_loss_mlp": 1.03620148, "epoch": 0.2474598689352492, "flos": 21142443697920.0, "grad_norm": 1.7296872760580633, "language_loss": 0.79152167, "learning_rate": 3.5242719405117016e-06, "loss": 0.81377023, "num_input_tokens_seen": 43959220, "step": 2058, "time_per_iteration": 2.8375179767608643 }, { "auxiliary_loss_clip": 0.01185682, "auxiliary_loss_mlp": 0.0076495, "balance_loss_clip": 1.05809426, "balance_loss_mlp": 1.00054789, "epoch": 0.2475801118258883, "flos": 21648352803840.0, "grad_norm": 2.5084681589595856, "language_loss": 0.74840117, "learning_rate": 3.5237675054274893e-06, "loss": 0.76790744, "num_input_tokens_seen": 43978420, "step": 2059, "time_per_iteration": 2.824300765991211 }, { "auxiliary_loss_clip": 0.01195481, "auxiliary_loss_mlp": 0.01032347, "balance_loss_clip": 1.05616498, "balance_loss_mlp": 1.02267289, "epoch": 0.24770035471652738, "flos": 22674500542080.0, "grad_norm": 1.8879911199354242, "language_loss": 0.80312002, "learning_rate": 3.5232628391889584e-06, "loss": 0.82539827, "num_input_tokens_seen": 43996710, "step": 2060, "time_per_iteration": 2.7522695064544678 }, { "auxiliary_loss_clip": 0.01150928, "auxiliary_loss_mlp": 0.01032884, "balance_loss_clip": 1.05439234, "balance_loss_mlp": 1.02371728, "epoch": 0.2478205976071665, "flos": 22163814927360.0, "grad_norm": 3.7823550577878327, "language_loss": 0.64433658, "learning_rate": 3.522757941872666e-06, "loss": 0.66617477, "num_input_tokens_seen": 44014865, "step": 2061, "time_per_iteration": 2.836489677429199 }, { "auxiliary_loss_clip": 0.01216368, "auxiliary_loss_mlp": 0.00763843, "balance_loss_clip": 1.06544161, "balance_loss_mlp": 1.0004611, "epoch": 0.24794084049780557, "flos": 24973106555520.0, "grad_norm": 1.9213396395174953, "language_loss": 0.82350236, "learning_rate": 3.5222528135552042e-06, "loss": 0.8433044, "num_input_tokens_seen": 44036325, "step": 2062, "time_per_iteration": 2.7551403045654297 }, { "auxiliary_loss_clip": 0.01196508, "auxiliary_loss_mlp": 0.01033459, "balance_loss_clip": 1.06041312, "balance_loss_mlp": 1.02456605, "epoch": 0.24806108338844465, "flos": 18296379521280.0, "grad_norm": 2.6364570522645643, "language_loss": 0.80666083, "learning_rate": 3.521747454313201e-06, "loss": 0.82896054, "num_input_tokens_seen": 44055005, "step": 2063, "time_per_iteration": 2.7217555046081543 }, { "auxiliary_loss_clip": 0.0115845, "auxiliary_loss_mlp": 0.01031237, "balance_loss_clip": 1.0514884, "balance_loss_mlp": 1.02066302, "epoch": 0.24818132627908374, "flos": 19282163351040.0, "grad_norm": 2.8138088439362523, "language_loss": 0.67231488, "learning_rate": 3.521241864223319e-06, "loss": 0.69421178, "num_input_tokens_seen": 44073965, "step": 2064, "time_per_iteration": 2.811619281768799 }, { "auxiliary_loss_clip": 0.01089748, "auxiliary_loss_mlp": 0.0100276, "balance_loss_clip": 1.02581513, "balance_loss_mlp": 1.00045907, "epoch": 0.24830156916972285, "flos": 70285837881600.0, "grad_norm": 0.7950553177015903, "language_loss": 0.61991215, "learning_rate": 3.5207360433622552e-06, "loss": 0.64083719, "num_input_tokens_seen": 44135965, "step": 2065, "time_per_iteration": 3.3014681339263916 }, { "auxiliary_loss_clip": 0.01176094, "auxiliary_loss_mlp": 0.0103343, "balance_loss_clip": 1.05637121, "balance_loss_mlp": 1.02448893, "epoch": 0.24842181206036193, "flos": 40409128287360.0, "grad_norm": 1.9767388972482969, "language_loss": 0.74661988, "learning_rate": 3.5202299918067437e-06, "loss": 0.76871514, "num_input_tokens_seen": 44159560, "step": 2066, "time_per_iteration": 2.952139139175415 }, { "auxiliary_loss_clip": 0.01191661, "auxiliary_loss_mlp": 0.01029962, "balance_loss_clip": 1.05683839, "balance_loss_mlp": 1.02039611, "epoch": 0.248542054951001, "flos": 20082432412800.0, "grad_norm": 2.4115896185735, "language_loss": 0.69690764, "learning_rate": 3.519723709633551e-06, "loss": 0.71912384, "num_input_tokens_seen": 44178320, "step": 2067, "time_per_iteration": 2.6985466480255127 }, { "auxiliary_loss_clip": 0.0117005, "auxiliary_loss_mlp": 0.0103987, "balance_loss_clip": 1.05371046, "balance_loss_mlp": 1.03062487, "epoch": 0.24866229784164012, "flos": 23513948363520.0, "grad_norm": 2.5456413462936425, "language_loss": 0.83950841, "learning_rate": 3.519217196919479e-06, "loss": 0.86160761, "num_input_tokens_seen": 44197305, "step": 2068, "time_per_iteration": 3.9210190773010254 }, { "auxiliary_loss_clip": 0.01188038, "auxiliary_loss_mlp": 0.01028826, "balance_loss_clip": 1.06088519, "balance_loss_mlp": 1.01893163, "epoch": 0.2487825407322792, "flos": 19865101173120.0, "grad_norm": 1.9082019589335015, "language_loss": 0.73112613, "learning_rate": 3.518710453741367e-06, "loss": 0.75329477, "num_input_tokens_seen": 44216505, "step": 2069, "time_per_iteration": 2.921764612197876 }, { "auxiliary_loss_clip": 0.01171122, "auxiliary_loss_mlp": 0.00764668, "balance_loss_clip": 1.05412054, "balance_loss_mlp": 1.00037456, "epoch": 0.2489027836229183, "flos": 22017622573440.0, "grad_norm": 2.2877345526597606, "language_loss": 0.68410498, "learning_rate": 3.518203480176086e-06, "loss": 0.70346284, "num_input_tokens_seen": 44235435, "step": 2070, "time_per_iteration": 2.764524459838867 }, { "auxiliary_loss_clip": 0.01121801, "auxiliary_loss_mlp": 0.01034896, "balance_loss_clip": 1.04631329, "balance_loss_mlp": 1.02523971, "epoch": 0.2490230265135574, "flos": 23294354567040.0, "grad_norm": 2.451052085552238, "language_loss": 0.8049444, "learning_rate": 3.517696276300545e-06, "loss": 0.82651132, "num_input_tokens_seen": 44256975, "step": 2071, "time_per_iteration": 3.1783180236816406 }, { "auxiliary_loss_clip": 0.01201793, "auxiliary_loss_mlp": 0.0103751, "balance_loss_clip": 1.06605566, "balance_loss_mlp": 1.02757394, "epoch": 0.24914326940419648, "flos": 19826784339840.0, "grad_norm": 3.2207487926735845, "language_loss": 0.69791341, "learning_rate": 3.517188842191685e-06, "loss": 0.72030646, "num_input_tokens_seen": 44275125, "step": 2072, "time_per_iteration": 3.7400591373443604 }, { "auxiliary_loss_clip": 0.01191927, "auxiliary_loss_mlp": 0.01039257, "balance_loss_clip": 1.05583453, "balance_loss_mlp": 1.0300777, "epoch": 0.24926351229483557, "flos": 20229271211520.0, "grad_norm": 1.6837915052542278, "language_loss": 0.73876727, "learning_rate": 3.5166811779264837e-06, "loss": 0.76107913, "num_input_tokens_seen": 44295445, "step": 2073, "time_per_iteration": 2.767338275909424 }, { "auxiliary_loss_clip": 0.01209892, "auxiliary_loss_mlp": 0.01036649, "balance_loss_clip": 1.05863369, "balance_loss_mlp": 1.02677846, "epoch": 0.24938375518547465, "flos": 23294570048640.0, "grad_norm": 2.1669018670170836, "language_loss": 0.77729058, "learning_rate": 3.5161732835819545e-06, "loss": 0.79975605, "num_input_tokens_seen": 44314755, "step": 2074, "time_per_iteration": 3.701648712158203 }, { "auxiliary_loss_clip": 0.0120973, "auxiliary_loss_mlp": 0.01045866, "balance_loss_clip": 1.05990684, "balance_loss_mlp": 1.03623414, "epoch": 0.24950399807611376, "flos": 17311673099520.0, "grad_norm": 2.373086213798395, "language_loss": 0.83161569, "learning_rate": 3.515665159235143e-06, "loss": 0.85417163, "num_input_tokens_seen": 44333640, "step": 2075, "time_per_iteration": 2.6778178215026855 }, { "auxiliary_loss_clip": 0.01177445, "auxiliary_loss_mlp": 0.01030846, "balance_loss_clip": 1.05240023, "balance_loss_mlp": 1.02214956, "epoch": 0.24962424096675284, "flos": 19024863252480.0, "grad_norm": 1.720002874889821, "language_loss": 0.75437963, "learning_rate": 3.5151568049631318e-06, "loss": 0.77646255, "num_input_tokens_seen": 44352355, "step": 2076, "time_per_iteration": 2.7661502361297607 }, { "auxiliary_loss_clip": 0.01212187, "auxiliary_loss_mlp": 0.010338, "balance_loss_clip": 1.06087756, "balance_loss_mlp": 1.02441263, "epoch": 0.24974448385739192, "flos": 33398790710400.0, "grad_norm": 2.33962585203716, "language_loss": 0.80274928, "learning_rate": 3.5146482208430385e-06, "loss": 0.82520908, "num_input_tokens_seen": 44374185, "step": 2077, "time_per_iteration": 2.7240095138549805 }, { "auxiliary_loss_clip": 0.01119686, "auxiliary_loss_mlp": 0.01036704, "balance_loss_clip": 1.04357481, "balance_loss_mlp": 1.02598703, "epoch": 0.24986472674803104, "flos": 30007279532160.0, "grad_norm": 1.9876625207351892, "language_loss": 0.67939067, "learning_rate": 3.514139406952014e-06, "loss": 0.7009545, "num_input_tokens_seen": 44396210, "step": 2078, "time_per_iteration": 2.969722270965576 }, { "auxiliary_loss_clip": 0.01196874, "auxiliary_loss_mlp": 0.01036294, "balance_loss_clip": 1.05887973, "balance_loss_mlp": 1.0267272, "epoch": 0.24998496963867012, "flos": 26613074833920.0, "grad_norm": 1.9053723760923638, "language_loss": 0.83095759, "learning_rate": 3.5136303633672454e-06, "loss": 0.85328925, "num_input_tokens_seen": 44416340, "step": 2079, "time_per_iteration": 2.78934383392334 }, { "auxiliary_loss_clip": 0.01175117, "auxiliary_loss_mlp": 0.00764958, "balance_loss_clip": 1.05699205, "balance_loss_mlp": 1.00041342, "epoch": 0.25010521252930923, "flos": 23553989049600.0, "grad_norm": 1.916182810327473, "language_loss": 0.74538177, "learning_rate": 3.5131210901659544e-06, "loss": 0.76478255, "num_input_tokens_seen": 44438095, "step": 2080, "time_per_iteration": 2.9293713569641113 }, { "auxiliary_loss_clip": 0.01160653, "auxiliary_loss_mlp": 0.01032507, "balance_loss_clip": 1.05276704, "balance_loss_mlp": 1.02312541, "epoch": 0.2502254554199483, "flos": 23441193365760.0, "grad_norm": 2.7974632088318345, "language_loss": 0.82008088, "learning_rate": 3.5126115874253967e-06, "loss": 0.84201252, "num_input_tokens_seen": 44457650, "step": 2081, "time_per_iteration": 2.87642502784729 }, { "auxiliary_loss_clip": 0.01165725, "auxiliary_loss_mlp": 0.01038326, "balance_loss_clip": 1.05714345, "balance_loss_mlp": 1.02878892, "epoch": 0.2503456983105874, "flos": 28761681651840.0, "grad_norm": 2.3144013342467757, "language_loss": 0.81058764, "learning_rate": 3.5121018552228644e-06, "loss": 0.83262813, "num_input_tokens_seen": 44476155, "step": 2082, "time_per_iteration": 2.855018138885498 }, { "auxiliary_loss_clip": 0.01166169, "auxiliary_loss_mlp": 0.01032248, "balance_loss_clip": 1.05354679, "balance_loss_mlp": 1.02302718, "epoch": 0.2504659412012265, "flos": 18770256673920.0, "grad_norm": 2.1202660526467185, "language_loss": 0.7656942, "learning_rate": 3.5115918936356827e-06, "loss": 0.78767836, "num_input_tokens_seen": 44492910, "step": 2083, "time_per_iteration": 2.850407600402832 }, { "auxiliary_loss_clip": 0.01143838, "auxiliary_loss_mlp": 0.01030193, "balance_loss_clip": 1.04978132, "balance_loss_mlp": 1.02064383, "epoch": 0.25058618409186556, "flos": 16873383346560.0, "grad_norm": 2.3254730007314115, "language_loss": 0.78762031, "learning_rate": 3.5110817027412123e-06, "loss": 0.80936062, "num_input_tokens_seen": 44512000, "step": 2084, "time_per_iteration": 2.801792860031128 }, { "auxiliary_loss_clip": 0.01160109, "auxiliary_loss_mlp": 0.01038458, "balance_loss_clip": 1.05161953, "balance_loss_mlp": 1.0293144, "epoch": 0.25070642698250467, "flos": 24425540651520.0, "grad_norm": 1.992033321140226, "language_loss": 0.68939644, "learning_rate": 3.5105712826168493e-06, "loss": 0.71138215, "num_input_tokens_seen": 44531650, "step": 2085, "time_per_iteration": 2.808412790298462 }, { "auxiliary_loss_clip": 0.01193775, "auxiliary_loss_mlp": 0.00764503, "balance_loss_clip": 1.05737984, "balance_loss_mlp": 1.00033522, "epoch": 0.2508266698731437, "flos": 20260944028800.0, "grad_norm": 2.054083386523751, "language_loss": 0.7089113, "learning_rate": 3.5100606333400235e-06, "loss": 0.72849405, "num_input_tokens_seen": 44548785, "step": 2086, "time_per_iteration": 2.778442144393921 }, { "auxiliary_loss_clip": 0.01193523, "auxiliary_loss_mlp": 0.01036873, "balance_loss_clip": 1.06006527, "balance_loss_mlp": 1.02658474, "epoch": 0.25094691276378284, "flos": 19245318975360.0, "grad_norm": 2.455778940500041, "language_loss": 0.76703489, "learning_rate": 3.5095497549882006e-06, "loss": 0.78933883, "num_input_tokens_seen": 44567230, "step": 2087, "time_per_iteration": 2.6960651874542236 }, { "auxiliary_loss_clip": 0.01196978, "auxiliary_loss_mlp": 0.01030883, "balance_loss_clip": 1.0614233, "balance_loss_mlp": 1.02098846, "epoch": 0.25106715565442195, "flos": 26943237671040.0, "grad_norm": 2.6274202721294753, "language_loss": 0.72131842, "learning_rate": 3.50903864763888e-06, "loss": 0.74359703, "num_input_tokens_seen": 44588020, "step": 2088, "time_per_iteration": 2.8209304809570312 }, { "auxiliary_loss_clip": 0.01197862, "auxiliary_loss_mlp": 0.01034692, "balance_loss_clip": 1.05658317, "balance_loss_mlp": 1.0245589, "epoch": 0.251187398545061, "flos": 48359570572800.0, "grad_norm": 1.979996228397373, "language_loss": 0.76201183, "learning_rate": 3.5085273113695965e-06, "loss": 0.78433734, "num_input_tokens_seen": 44612590, "step": 2089, "time_per_iteration": 2.9548180103302 }, { "auxiliary_loss_clip": 0.01210386, "auxiliary_loss_mlp": 0.01035909, "balance_loss_clip": 1.06060505, "balance_loss_mlp": 1.02615726, "epoch": 0.2513076414357001, "flos": 27016100409600.0, "grad_norm": 2.4255743283953586, "language_loss": 0.78970957, "learning_rate": 3.508015746257919e-06, "loss": 0.81217253, "num_input_tokens_seen": 44631630, "step": 2090, "time_per_iteration": 2.7150540351867676 }, { "auxiliary_loss_clip": 0.0116894, "auxiliary_loss_mlp": 0.01037783, "balance_loss_clip": 1.05553675, "balance_loss_mlp": 1.02754307, "epoch": 0.2514278843263392, "flos": 19463619882240.0, "grad_norm": 2.0982594889736257, "language_loss": 0.83303434, "learning_rate": 3.5075039523814518e-06, "loss": 0.85510159, "num_input_tokens_seen": 44650820, "step": 2091, "time_per_iteration": 2.772280216217041 }, { "auxiliary_loss_clip": 0.01198772, "auxiliary_loss_mlp": 0.01037313, "balance_loss_clip": 1.05619061, "balance_loss_mlp": 1.02739477, "epoch": 0.2515481272169783, "flos": 16866092885760.0, "grad_norm": 2.193088283659089, "language_loss": 0.82297373, "learning_rate": 3.506991929817834e-06, "loss": 0.84533453, "num_input_tokens_seen": 44667540, "step": 2092, "time_per_iteration": 2.7446253299713135 }, { "auxiliary_loss_clip": 0.01210816, "auxiliary_loss_mlp": 0.01042636, "balance_loss_clip": 1.06287575, "balance_loss_mlp": 1.03322458, "epoch": 0.2516683701076174, "flos": 23732464752000.0, "grad_norm": 1.9132180173084559, "language_loss": 0.82404816, "learning_rate": 3.506479678644738e-06, "loss": 0.84658265, "num_input_tokens_seen": 44687935, "step": 2093, "time_per_iteration": 2.7368557453155518 }, { "auxiliary_loss_clip": 0.01141322, "auxiliary_loss_mlp": 0.01033733, "balance_loss_clip": 1.05021346, "balance_loss_mlp": 1.02388644, "epoch": 0.2517886129982565, "flos": 27635954434560.0, "grad_norm": 3.787529160621812, "language_loss": 0.74330527, "learning_rate": 3.505967198939873e-06, "loss": 0.76505584, "num_input_tokens_seen": 44704975, "step": 2094, "time_per_iteration": 3.78482985496521 }, { "auxiliary_loss_clip": 0.01175368, "auxiliary_loss_mlp": 0.01027657, "balance_loss_clip": 1.05153322, "balance_loss_mlp": 1.01847243, "epoch": 0.25190885588889556, "flos": 38104596529920.0, "grad_norm": 2.289844789887299, "language_loss": 0.7836237, "learning_rate": 3.5054544907809813e-06, "loss": 0.80565393, "num_input_tokens_seen": 44725475, "step": 2095, "time_per_iteration": 2.9173803329467773 }, { "auxiliary_loss_clip": 0.01179268, "auxiliary_loss_mlp": 0.00765011, "balance_loss_clip": 1.05823636, "balance_loss_mlp": 1.00048053, "epoch": 0.25202909877953467, "flos": 22269894768000.0, "grad_norm": 2.3562729946411687, "language_loss": 0.80759048, "learning_rate": 3.50494155424584e-06, "loss": 0.82703328, "num_input_tokens_seen": 44744380, "step": 2096, "time_per_iteration": 2.813070774078369 }, { "auxiliary_loss_clip": 0.01199673, "auxiliary_loss_mlp": 0.01036545, "balance_loss_clip": 1.05877781, "balance_loss_mlp": 1.02688909, "epoch": 0.2521493416701738, "flos": 21761759018880.0, "grad_norm": 1.538458745889626, "language_loss": 0.82841027, "learning_rate": 3.504428389412262e-06, "loss": 0.8507725, "num_input_tokens_seen": 44765190, "step": 2097, "time_per_iteration": 2.71197509765625 }, { "auxiliary_loss_clip": 0.01192604, "auxiliary_loss_mlp": 0.01034004, "balance_loss_clip": 1.05614793, "balance_loss_mlp": 1.02360332, "epoch": 0.25226958456081283, "flos": 27746738956800.0, "grad_norm": 2.206747127047161, "language_loss": 0.73337299, "learning_rate": 3.5039149963580927e-06, "loss": 0.75563908, "num_input_tokens_seen": 44785210, "step": 2098, "time_per_iteration": 3.728706121444702 }, { "auxiliary_loss_clip": 0.01175896, "auxiliary_loss_mlp": 0.01036115, "balance_loss_clip": 1.05799174, "balance_loss_mlp": 1.0262562, "epoch": 0.25238982745145194, "flos": 30732171903360.0, "grad_norm": 2.9626831965411182, "language_loss": 0.70408916, "learning_rate": 3.503401375161215e-06, "loss": 0.72620928, "num_input_tokens_seen": 44804955, "step": 2099, "time_per_iteration": 3.8294928073883057 }, { "auxiliary_loss_clip": 0.01206507, "auxiliary_loss_mlp": 0.01032362, "balance_loss_clip": 1.05835581, "balance_loss_mlp": 1.02217519, "epoch": 0.252510070342091, "flos": 20266331068800.0, "grad_norm": 1.558947084924156, "language_loss": 0.83935559, "learning_rate": 3.502887525899544e-06, "loss": 0.86174434, "num_input_tokens_seen": 44823935, "step": 2100, "time_per_iteration": 3.644076347351074 }, { "auxiliary_loss_clip": 0.01181774, "auxiliary_loss_mlp": 0.01043802, "balance_loss_clip": 1.05500484, "balance_loss_mlp": 1.03396153, "epoch": 0.2526303132327301, "flos": 22747399194240.0, "grad_norm": 3.010336497512438, "language_loss": 0.83238769, "learning_rate": 3.50237344865103e-06, "loss": 0.85464346, "num_input_tokens_seen": 44844935, "step": 2101, "time_per_iteration": 2.90242075920105 }, { "auxiliary_loss_clip": 0.0120977, "auxiliary_loss_mlp": 0.01034468, "balance_loss_clip": 1.05916834, "balance_loss_mlp": 1.02459788, "epoch": 0.2527505561233692, "flos": 30263466309120.0, "grad_norm": 2.166633424514748, "language_loss": 0.76218939, "learning_rate": 3.501859143493658e-06, "loss": 0.78463173, "num_input_tokens_seen": 44865565, "step": 2102, "time_per_iteration": 2.692326545715332 }, { "auxiliary_loss_clip": 0.01113325, "auxiliary_loss_mlp": 0.01002113, "balance_loss_clip": 1.03145957, "balance_loss_mlp": 0.99987143, "epoch": 0.2528707990140083, "flos": 58492917164160.0, "grad_norm": 0.9944362767761876, "language_loss": 0.60552663, "learning_rate": 3.5013446105054488e-06, "loss": 0.62668103, "num_input_tokens_seen": 44918485, "step": 2103, "time_per_iteration": 2.9874167442321777 }, { "auxiliary_loss_clip": 0.0115198, "auxiliary_loss_mlp": 0.01030251, "balance_loss_clip": 1.05325305, "balance_loss_mlp": 1.02055883, "epoch": 0.2529910419046474, "flos": 24645134448000.0, "grad_norm": 1.826116255193779, "language_loss": 0.75234997, "learning_rate": 3.5008298497644555e-06, "loss": 0.77417231, "num_input_tokens_seen": 44937530, "step": 2104, "time_per_iteration": 2.7872188091278076 }, { "auxiliary_loss_clip": 0.01171006, "auxiliary_loss_mlp": 0.01033557, "balance_loss_clip": 1.05623889, "balance_loss_mlp": 1.02370977, "epoch": 0.2531112847952865, "flos": 23842135952640.0, "grad_norm": 1.5824702435074918, "language_loss": 0.8812499, "learning_rate": 3.500314861348767e-06, "loss": 0.90329546, "num_input_tokens_seen": 44958165, "step": 2105, "time_per_iteration": 2.8249404430389404 }, { "auxiliary_loss_clip": 0.01156062, "auxiliary_loss_mlp": 0.01036635, "balance_loss_clip": 1.05506158, "balance_loss_mlp": 1.02699125, "epoch": 0.25323152768592555, "flos": 16143822207360.0, "grad_norm": 2.088055278211382, "language_loss": 0.77033854, "learning_rate": 3.499799645336507e-06, "loss": 0.79226553, "num_input_tokens_seen": 44975060, "step": 2106, "time_per_iteration": 2.9788529872894287 }, { "auxiliary_loss_clip": 0.01196619, "auxiliary_loss_mlp": 0.01035717, "balance_loss_clip": 1.05902719, "balance_loss_mlp": 1.02653813, "epoch": 0.25335177057656466, "flos": 28405161210240.0, "grad_norm": 1.9487282598925275, "language_loss": 0.87118274, "learning_rate": 3.4992842018058336e-06, "loss": 0.89350611, "num_input_tokens_seen": 44997960, "step": 2107, "time_per_iteration": 2.7902863025665283 }, { "auxiliary_loss_clip": 0.0117032, "auxiliary_loss_mlp": 0.01032964, "balance_loss_clip": 1.05404294, "balance_loss_mlp": 1.02351642, "epoch": 0.25347201346720377, "flos": 18799666934400.0, "grad_norm": 2.079012602543159, "language_loss": 0.88438392, "learning_rate": 3.4987685308349384e-06, "loss": 0.90641677, "num_input_tokens_seen": 45015690, "step": 2108, "time_per_iteration": 2.8269617557525635 }, { "auxiliary_loss_clip": 0.01162452, "auxiliary_loss_mlp": 0.01032585, "balance_loss_clip": 1.05016184, "balance_loss_mlp": 1.02244604, "epoch": 0.2535922563578428, "flos": 15815490963840.0, "grad_norm": 2.3815593466259215, "language_loss": 0.61252975, "learning_rate": 3.4982526325020497e-06, "loss": 0.63448012, "num_input_tokens_seen": 45032660, "step": 2109, "time_per_iteration": 2.7323598861694336 }, { "auxiliary_loss_clip": 0.01183323, "auxiliary_loss_mlp": 0.01039757, "balance_loss_clip": 1.05629325, "balance_loss_mlp": 1.03007686, "epoch": 0.25371249924848194, "flos": 16318922031360.0, "grad_norm": 2.3535751722183953, "language_loss": 0.81786191, "learning_rate": 3.4977365068854273e-06, "loss": 0.84009272, "num_input_tokens_seen": 45048280, "step": 2110, "time_per_iteration": 2.7319705486297607 }, { "auxiliary_loss_clip": 0.01170902, "auxiliary_loss_mlp": 0.01039338, "balance_loss_clip": 1.0531168, "balance_loss_mlp": 1.02937233, "epoch": 0.25383274213912105, "flos": 21761615364480.0, "grad_norm": 2.7380751622515964, "language_loss": 0.7373144, "learning_rate": 3.4972201540633676e-06, "loss": 0.75941676, "num_input_tokens_seen": 45067635, "step": 2111, "time_per_iteration": 2.772465944290161 }, { "auxiliary_loss_clip": 0.01171114, "auxiliary_loss_mlp": 0.01040407, "balance_loss_clip": 1.05370092, "balance_loss_mlp": 1.0305903, "epoch": 0.2539529850297601, "flos": 21396870708480.0, "grad_norm": 2.644865874237856, "language_loss": 0.85352665, "learning_rate": 3.4967035741142008e-06, "loss": 0.87564188, "num_input_tokens_seen": 45086455, "step": 2112, "time_per_iteration": 2.734591007232666 }, { "auxiliary_loss_clip": 0.0117725, "auxiliary_loss_mlp": 0.01038565, "balance_loss_clip": 1.06290865, "balance_loss_mlp": 1.02912927, "epoch": 0.2540732279203992, "flos": 25228467319680.0, "grad_norm": 1.945353072080155, "language_loss": 0.81861377, "learning_rate": 3.4961867671162917e-06, "loss": 0.84077191, "num_input_tokens_seen": 45106385, "step": 2113, "time_per_iteration": 2.7956430912017822 }, { "auxiliary_loss_clip": 0.01211733, "auxiliary_loss_mlp": 0.01035047, "balance_loss_clip": 1.05980217, "balance_loss_mlp": 1.02539742, "epoch": 0.2541934708110383, "flos": 19427386037760.0, "grad_norm": 24.17308036490114, "language_loss": 0.77476478, "learning_rate": 3.4956697331480402e-06, "loss": 0.79723257, "num_input_tokens_seen": 45124955, "step": 2114, "time_per_iteration": 2.6350395679473877 }, { "auxiliary_loss_clip": 0.01169144, "auxiliary_loss_mlp": 0.01033151, "balance_loss_clip": 1.05109441, "balance_loss_mlp": 1.02302432, "epoch": 0.2543137137016774, "flos": 23949436855680.0, "grad_norm": 1.7929217185537303, "language_loss": 0.80137265, "learning_rate": 3.495152472287879e-06, "loss": 0.82339561, "num_input_tokens_seen": 45145665, "step": 2115, "time_per_iteration": 2.8250572681427 }, { "auxiliary_loss_clip": 0.01170133, "auxiliary_loss_mlp": 0.01040135, "balance_loss_clip": 1.05603552, "balance_loss_mlp": 1.03024101, "epoch": 0.2544339565923165, "flos": 25593283802880.0, "grad_norm": 2.2045217641814268, "language_loss": 0.7365135, "learning_rate": 3.4946349846142766e-06, "loss": 0.75861615, "num_input_tokens_seen": 45164805, "step": 2116, "time_per_iteration": 2.7879958152770996 }, { "auxiliary_loss_clip": 0.01206955, "auxiliary_loss_mlp": 0.01031452, "balance_loss_clip": 1.05637228, "balance_loss_mlp": 1.02161705, "epoch": 0.25455419948295555, "flos": 21689470897920.0, "grad_norm": 2.6228960204533265, "language_loss": 0.75870514, "learning_rate": 3.4941172702057353e-06, "loss": 0.78108925, "num_input_tokens_seen": 45184865, "step": 2117, "time_per_iteration": 2.684295415878296 }, { "auxiliary_loss_clip": 0.01178897, "auxiliary_loss_mlp": 0.01028739, "balance_loss_clip": 1.05703068, "balance_loss_mlp": 1.01932168, "epoch": 0.25467444237359466, "flos": 26250341339520.0, "grad_norm": 2.095004872376581, "language_loss": 0.80710781, "learning_rate": 3.4935993291407924e-06, "loss": 0.82918417, "num_input_tokens_seen": 45203690, "step": 2118, "time_per_iteration": 2.715148687362671 }, { "auxiliary_loss_clip": 0.01173171, "auxiliary_loss_mlp": 0.01037342, "balance_loss_clip": 1.05348051, "balance_loss_mlp": 1.02786446, "epoch": 0.25479468526423377, "flos": 26979686997120.0, "grad_norm": 2.330762367787955, "language_loss": 0.7109521, "learning_rate": 3.4930811614980183e-06, "loss": 0.73305726, "num_input_tokens_seen": 45225385, "step": 2119, "time_per_iteration": 2.790679931640625 }, { "auxiliary_loss_clip": 0.01184125, "auxiliary_loss_mlp": 0.01039836, "balance_loss_clip": 1.05327165, "balance_loss_mlp": 1.03040099, "epoch": 0.2549149281548728, "flos": 23475811098240.0, "grad_norm": 1.8511334546024447, "language_loss": 0.79416692, "learning_rate": 3.4925627673560198e-06, "loss": 0.81640649, "num_input_tokens_seen": 45246045, "step": 2120, "time_per_iteration": 3.616147041320801 }, { "auxiliary_loss_clip": 0.01166544, "auxiliary_loss_mlp": 0.01032999, "balance_loss_clip": 1.05476046, "balance_loss_mlp": 1.02347386, "epoch": 0.25503517104551193, "flos": 25812302981760.0, "grad_norm": 2.0014682201325864, "language_loss": 0.88637519, "learning_rate": 3.4920441467934357e-06, "loss": 0.90837061, "num_input_tokens_seen": 45266560, "step": 2121, "time_per_iteration": 2.879087209701538 }, { "auxiliary_loss_clip": 0.01154851, "auxiliary_loss_mlp": 0.01032899, "balance_loss_clip": 1.05275333, "balance_loss_mlp": 1.02312422, "epoch": 0.25515541393615104, "flos": 26645106787200.0, "grad_norm": 2.662889723242326, "language_loss": 0.82796359, "learning_rate": 3.491525299888941e-06, "loss": 0.84984112, "num_input_tokens_seen": 45285405, "step": 2122, "time_per_iteration": 2.838909149169922 }, { "auxiliary_loss_clip": 0.01076056, "auxiliary_loss_mlp": 0.00754994, "balance_loss_clip": 1.02702904, "balance_loss_mlp": 0.99997914, "epoch": 0.2552756568267901, "flos": 65955945847680.0, "grad_norm": 0.8856006731170495, "language_loss": 0.62598377, "learning_rate": 3.491006226721244e-06, "loss": 0.64429426, "num_input_tokens_seen": 45349615, "step": 2123, "time_per_iteration": 3.4008078575134277 }, { "auxiliary_loss_clip": 0.01182109, "auxiliary_loss_mlp": 0.00765308, "balance_loss_clip": 1.05752134, "balance_loss_mlp": 1.00032997, "epoch": 0.2553958997174292, "flos": 17931096161280.0, "grad_norm": 2.0940899259548558, "language_loss": 0.7790463, "learning_rate": 3.4904869273690882e-06, "loss": 0.79852057, "num_input_tokens_seen": 45367505, "step": 2124, "time_per_iteration": 3.8781068325042725 }, { "auxiliary_loss_clip": 0.01196375, "auxiliary_loss_mlp": 0.01038066, "balance_loss_clip": 1.05640233, "balance_loss_mlp": 1.02888119, "epoch": 0.2555161426080683, "flos": 23367791923200.0, "grad_norm": 7.451055772982942, "language_loss": 0.89110541, "learning_rate": 3.489967401911251e-06, "loss": 0.91344976, "num_input_tokens_seen": 45386805, "step": 2125, "time_per_iteration": 2.731290102005005 }, { "auxiliary_loss_clip": 0.01213874, "auxiliary_loss_mlp": 0.01041714, "balance_loss_clip": 1.06037855, "balance_loss_mlp": 1.03123546, "epoch": 0.2556363854987074, "flos": 40625130723840.0, "grad_norm": 1.8677728351604943, "language_loss": 0.69655752, "learning_rate": 3.4894476504265428e-06, "loss": 0.71911347, "num_input_tokens_seen": 45411045, "step": 2126, "time_per_iteration": 4.784552335739136 }, { "auxiliary_loss_clip": 0.01091001, "auxiliary_loss_mlp": 0.0100738, "balance_loss_clip": 1.0253849, "balance_loss_mlp": 1.00504398, "epoch": 0.2557566283893465, "flos": 68019443389440.0, "grad_norm": 0.7361090218422459, "language_loss": 0.54396695, "learning_rate": 3.4889276729938104e-06, "loss": 0.56495076, "num_input_tokens_seen": 45469575, "step": 2127, "time_per_iteration": 3.181098461151123 }, { "auxiliary_loss_clip": 0.01178412, "auxiliary_loss_mlp": 0.01034596, "balance_loss_clip": 1.056705, "balance_loss_mlp": 1.02494597, "epoch": 0.2558768712799856, "flos": 22635645004800.0, "grad_norm": 2.61794494991567, "language_loss": 0.80379659, "learning_rate": 3.488407469691934e-06, "loss": 0.82592666, "num_input_tokens_seen": 45490270, "step": 2128, "time_per_iteration": 2.7905967235565186 }, { "auxiliary_loss_clip": 0.0117545, "auxiliary_loss_mlp": 0.01038346, "balance_loss_clip": 1.05155885, "balance_loss_mlp": 1.02751637, "epoch": 0.25599711417062465, "flos": 26396354125440.0, "grad_norm": 6.184548946773142, "language_loss": 0.80730665, "learning_rate": 3.487887040599828e-06, "loss": 0.82944459, "num_input_tokens_seen": 45510070, "step": 2129, "time_per_iteration": 2.7658798694610596 }, { "auxiliary_loss_clip": 0.01209172, "auxiliary_loss_mlp": 0.0103584, "balance_loss_clip": 1.05855966, "balance_loss_mlp": 1.0254693, "epoch": 0.25611735706126376, "flos": 22852042490880.0, "grad_norm": 3.0497883968533217, "language_loss": 0.76680368, "learning_rate": 3.4873663857964407e-06, "loss": 0.78925383, "num_input_tokens_seen": 45527285, "step": 2130, "time_per_iteration": 2.7820546627044678 }, { "auxiliary_loss_clip": 0.01150067, "auxiliary_loss_mlp": 0.01037488, "balance_loss_clip": 1.0517019, "balance_loss_mlp": 1.02768862, "epoch": 0.2562375999519028, "flos": 23367863750400.0, "grad_norm": 1.9981819365431217, "language_loss": 0.66441989, "learning_rate": 3.4868455053607556e-06, "loss": 0.68629539, "num_input_tokens_seen": 45546900, "step": 2131, "time_per_iteration": 2.795693874359131 }, { "auxiliary_loss_clip": 0.01198031, "auxiliary_loss_mlp": 0.01040856, "balance_loss_clip": 1.0555222, "balance_loss_mlp": 1.02991891, "epoch": 0.2563578428425419, "flos": 22856962654080.0, "grad_norm": 2.321354515367387, "language_loss": 0.71469575, "learning_rate": 3.486324399371789e-06, "loss": 0.73708463, "num_input_tokens_seen": 45566200, "step": 2132, "time_per_iteration": 2.706897258758545 }, { "auxiliary_loss_clip": 0.01157375, "auxiliary_loss_mlp": 0.0103432, "balance_loss_clip": 1.05261362, "balance_loss_mlp": 1.02515292, "epoch": 0.25647808573318104, "flos": 21653883498240.0, "grad_norm": 2.3529984929513006, "language_loss": 0.78497601, "learning_rate": 3.485803067908593e-06, "loss": 0.80689299, "num_input_tokens_seen": 45585710, "step": 2133, "time_per_iteration": 2.81225323677063 }, { "auxiliary_loss_clip": 0.01109165, "auxiliary_loss_mlp": 0.01035258, "balance_loss_clip": 1.04120779, "balance_loss_mlp": 1.02474403, "epoch": 0.2565983286238201, "flos": 33730569659520.0, "grad_norm": 1.7131906706540836, "language_loss": 0.79711282, "learning_rate": 3.485281511050253e-06, "loss": 0.81855714, "num_input_tokens_seen": 45607845, "step": 2134, "time_per_iteration": 2.927603244781494 }, { "auxiliary_loss_clip": 0.01194759, "auxiliary_loss_mlp": 0.01031059, "balance_loss_clip": 1.05515313, "balance_loss_mlp": 1.02156401, "epoch": 0.2567185715144592, "flos": 16216002587520.0, "grad_norm": 2.7731304703744524, "language_loss": 0.9006232, "learning_rate": 3.484759728875889e-06, "loss": 0.92288136, "num_input_tokens_seen": 45623210, "step": 2135, "time_per_iteration": 2.775146245956421 }, { "auxiliary_loss_clip": 0.01138723, "auxiliary_loss_mlp": 0.01041339, "balance_loss_clip": 1.0508976, "balance_loss_mlp": 1.03093755, "epoch": 0.2568388144050983, "flos": 17458475984640.0, "grad_norm": 1.8329001645156806, "language_loss": 0.80755979, "learning_rate": 3.4842377214646543e-06, "loss": 0.82936043, "num_input_tokens_seen": 45641505, "step": 2136, "time_per_iteration": 2.832365036010742 }, { "auxiliary_loss_clip": 0.01209816, "auxiliary_loss_mlp": 0.01032661, "balance_loss_clip": 1.06009054, "balance_loss_mlp": 1.02370846, "epoch": 0.25695905729573737, "flos": 20887442069760.0, "grad_norm": 1.9241005057382108, "language_loss": 0.66732454, "learning_rate": 3.483715488895737e-06, "loss": 0.6897493, "num_input_tokens_seen": 45661835, "step": 2137, "time_per_iteration": 2.663938045501709 }, { "auxiliary_loss_clip": 0.01145476, "auxiliary_loss_mlp": 0.01035919, "balance_loss_clip": 1.04854965, "balance_loss_mlp": 1.02618527, "epoch": 0.2570793001863765, "flos": 24717278914560.0, "grad_norm": 1.9509059256255346, "language_loss": 0.78783184, "learning_rate": 3.48319303124836e-06, "loss": 0.80964577, "num_input_tokens_seen": 45682215, "step": 2138, "time_per_iteration": 2.8674726486206055 }, { "auxiliary_loss_clip": 0.01177483, "auxiliary_loss_mlp": 0.01041974, "balance_loss_clip": 1.05764079, "balance_loss_mlp": 1.031883, "epoch": 0.2571995430770156, "flos": 26906896085760.0, "grad_norm": 2.5561622939841393, "language_loss": 0.66996789, "learning_rate": 3.4826703486017798e-06, "loss": 0.69216251, "num_input_tokens_seen": 45701840, "step": 2139, "time_per_iteration": 2.931387424468994 }, { "auxiliary_loss_clip": 0.01192368, "auxiliary_loss_mlp": 0.01035344, "balance_loss_clip": 1.05831504, "balance_loss_mlp": 1.0252769, "epoch": 0.25731978596765465, "flos": 19792561656960.0, "grad_norm": 1.8913248310369348, "language_loss": 0.76746511, "learning_rate": 3.4821474410352867e-06, "loss": 0.78974223, "num_input_tokens_seen": 45720500, "step": 2140, "time_per_iteration": 2.725515604019165 }, { "auxiliary_loss_clip": 0.01080166, "auxiliary_loss_mlp": 0.010066, "balance_loss_clip": 1.02787566, "balance_loss_mlp": 1.00424004, "epoch": 0.25744002885829376, "flos": 70564970471040.0, "grad_norm": 1.0690847285607001, "language_loss": 0.62592328, "learning_rate": 3.481624308628205e-06, "loss": 0.64679098, "num_input_tokens_seen": 45781870, "step": 2141, "time_per_iteration": 3.4509148597717285 }, { "auxiliary_loss_clip": 0.01179578, "auxiliary_loss_mlp": 0.01042891, "balance_loss_clip": 1.05512834, "balance_loss_mlp": 1.03240037, "epoch": 0.25756027174893287, "flos": 18038181582720.0, "grad_norm": 3.5636346313760625, "language_loss": 1.00075114, "learning_rate": 3.481100951459893e-06, "loss": 1.0229758, "num_input_tokens_seen": 45794890, "step": 2142, "time_per_iteration": 2.765876531600952 }, { "auxiliary_loss_clip": 0.01190534, "auxiliary_loss_mlp": 0.01032798, "balance_loss_clip": 1.05724955, "balance_loss_mlp": 1.02243304, "epoch": 0.2576805146395719, "flos": 22674069578880.0, "grad_norm": 1.7108477102943007, "language_loss": 0.78869939, "learning_rate": 3.4805773696097453e-06, "loss": 0.81093276, "num_input_tokens_seen": 45815780, "step": 2143, "time_per_iteration": 2.75534725189209 }, { "auxiliary_loss_clip": 0.01177468, "auxiliary_loss_mlp": 0.01038022, "balance_loss_clip": 1.05837083, "balance_loss_mlp": 1.02891445, "epoch": 0.25780075753021103, "flos": 16472225278080.0, "grad_norm": 2.092290207792063, "language_loss": 0.88063502, "learning_rate": 3.4800535631571874e-06, "loss": 0.90278983, "num_input_tokens_seen": 45831310, "step": 2144, "time_per_iteration": 2.8079986572265625 }, { "auxiliary_loss_clip": 0.01184843, "auxiliary_loss_mlp": 0.01037538, "balance_loss_clip": 1.05578279, "balance_loss_mlp": 1.02692258, "epoch": 0.25792100042085014, "flos": 22820297846400.0, "grad_norm": 2.4049017093568725, "language_loss": 0.76524204, "learning_rate": 3.4795295321816804e-06, "loss": 0.78746581, "num_input_tokens_seen": 45850135, "step": 2145, "time_per_iteration": 2.7470638751983643 }, { "auxiliary_loss_clip": 0.01166792, "auxiliary_loss_mlp": 0.01039792, "balance_loss_clip": 1.05238223, "balance_loss_mlp": 1.02982593, "epoch": 0.2580412433114892, "flos": 18697286194560.0, "grad_norm": 3.314422632594769, "language_loss": 0.91288388, "learning_rate": 3.47900527676272e-06, "loss": 0.9349497, "num_input_tokens_seen": 45868470, "step": 2146, "time_per_iteration": 3.6622936725616455 }, { "auxiliary_loss_clip": 0.01213274, "auxiliary_loss_mlp": 0.01044486, "balance_loss_clip": 1.06343484, "balance_loss_mlp": 1.03472877, "epoch": 0.2581614862021283, "flos": 14283146810880.0, "grad_norm": 2.274901630807928, "language_loss": 0.88746941, "learning_rate": 3.478480796979835e-06, "loss": 0.91004705, "num_input_tokens_seen": 45886355, "step": 2147, "time_per_iteration": 2.8576083183288574 }, { "auxiliary_loss_clip": 0.01175965, "auxiliary_loss_mlp": 0.01034403, "balance_loss_clip": 1.05579114, "balance_loss_mlp": 1.02409148, "epoch": 0.25828172909276736, "flos": 29498281856640.0, "grad_norm": 2.7903651755737644, "language_loss": 0.77905107, "learning_rate": 3.4779560929125894e-06, "loss": 0.80115473, "num_input_tokens_seen": 45907900, "step": 2148, "time_per_iteration": 2.7434988021850586 }, { "auxiliary_loss_clip": 0.01064798, "auxiliary_loss_mlp": 0.01006299, "balance_loss_clip": 1.02136183, "balance_loss_mlp": 1.00416517, "epoch": 0.2584019719834065, "flos": 67114387376640.0, "grad_norm": 0.6843763675756703, "language_loss": 0.56946301, "learning_rate": 3.4774311646405783e-06, "loss": 0.59017396, "num_input_tokens_seen": 45977805, "step": 2149, "time_per_iteration": 3.47347354888916 }, { "auxiliary_loss_clip": 0.01150933, "auxiliary_loss_mlp": 0.01031857, "balance_loss_clip": 1.04926896, "balance_loss_mlp": 1.0214144, "epoch": 0.2585222148740456, "flos": 22893555634560.0, "grad_norm": 5.004169448472716, "language_loss": 0.83967167, "learning_rate": 3.476906012243435e-06, "loss": 0.86149955, "num_input_tokens_seen": 45996715, "step": 2150, "time_per_iteration": 3.713287353515625 }, { "auxiliary_loss_clip": 0.01184103, "auxiliary_loss_mlp": 0.01038721, "balance_loss_clip": 1.05645382, "balance_loss_mlp": 1.02877927, "epoch": 0.25864245776468464, "flos": 28909202808960.0, "grad_norm": 1.7704039038727888, "language_loss": 0.81496239, "learning_rate": 3.476380635800824e-06, "loss": 0.83719063, "num_input_tokens_seen": 46017915, "step": 2151, "time_per_iteration": 3.8134374618530273 }, { "auxiliary_loss_clip": 0.01174656, "auxiliary_loss_mlp": 0.01034654, "balance_loss_clip": 1.0569787, "balance_loss_mlp": 1.02542126, "epoch": 0.25876270065532375, "flos": 14793185980800.0, "grad_norm": 2.194096105092921, "language_loss": 0.86397839, "learning_rate": 3.475855035392444e-06, "loss": 0.88607144, "num_input_tokens_seen": 46033235, "step": 2152, "time_per_iteration": 3.6637401580810547 }, { "auxiliary_loss_clip": 0.01134491, "auxiliary_loss_mlp": 0.01027614, "balance_loss_clip": 1.05275917, "balance_loss_mlp": 1.01827955, "epoch": 0.25888294354596286, "flos": 60467821810560.0, "grad_norm": 2.1239627315234615, "language_loss": 0.7138356, "learning_rate": 3.475329211098029e-06, "loss": 0.73545665, "num_input_tokens_seen": 46056390, "step": 2153, "time_per_iteration": 3.238281488418579 }, { "auxiliary_loss_clip": 0.01155336, "auxiliary_loss_mlp": 0.0103493, "balance_loss_clip": 1.05355394, "balance_loss_mlp": 1.02564371, "epoch": 0.2590031864366019, "flos": 27851166771840.0, "grad_norm": 1.7417133918443026, "language_loss": 0.82448679, "learning_rate": 3.4748031629973453e-06, "loss": 0.84638941, "num_input_tokens_seen": 46077120, "step": 2154, "time_per_iteration": 2.975493907928467 }, { "auxiliary_loss_clip": 0.01051174, "auxiliary_loss_mlp": 0.01002258, "balance_loss_clip": 1.02084756, "balance_loss_mlp": 1.00006473, "epoch": 0.25912342932724103, "flos": 62422444206720.0, "grad_norm": 0.9112428620346538, "language_loss": 0.56599599, "learning_rate": 3.4742768911701944e-06, "loss": 0.58653033, "num_input_tokens_seen": 46139815, "step": 2155, "time_per_iteration": 3.4819154739379883 }, { "auxiliary_loss_clip": 0.01199918, "auxiliary_loss_mlp": 0.01042272, "balance_loss_clip": 1.06041312, "balance_loss_mlp": 1.03215694, "epoch": 0.25924367221788014, "flos": 12378839368320.0, "grad_norm": 3.1095609490669505, "language_loss": 0.70518059, "learning_rate": 3.4737503956964113e-06, "loss": 0.72760248, "num_input_tokens_seen": 46152120, "step": 2156, "time_per_iteration": 2.7426395416259766 }, { "auxiliary_loss_clip": 0.0116977, "auxiliary_loss_mlp": 0.01033402, "balance_loss_clip": 1.05192232, "balance_loss_mlp": 1.02307296, "epoch": 0.2593639151085192, "flos": 14575208296320.0, "grad_norm": 2.1910608382395202, "language_loss": 0.6756202, "learning_rate": 3.473223676655865e-06, "loss": 0.69765192, "num_input_tokens_seen": 46170120, "step": 2157, "time_per_iteration": 2.7701849937438965 }, { "auxiliary_loss_clip": 0.01173262, "auxiliary_loss_mlp": 0.01040302, "balance_loss_clip": 1.05379856, "balance_loss_mlp": 1.02979946, "epoch": 0.2594841579991583, "flos": 15230937029760.0, "grad_norm": 2.4930472947072735, "language_loss": 0.80002081, "learning_rate": 3.472696734128459e-06, "loss": 0.82215643, "num_input_tokens_seen": 46187985, "step": 2158, "time_per_iteration": 2.7413103580474854 }, { "auxiliary_loss_clip": 0.01194436, "auxiliary_loss_mlp": 0.01038129, "balance_loss_clip": 1.05688751, "balance_loss_mlp": 1.02812159, "epoch": 0.2596044008897974, "flos": 23623583650560.0, "grad_norm": 2.0899533275350226, "language_loss": 0.75534683, "learning_rate": 3.4721695681941286e-06, "loss": 0.77767247, "num_input_tokens_seen": 46207025, "step": 2159, "time_per_iteration": 2.8154211044311523 }, { "auxiliary_loss_clip": 0.0117781, "auxiliary_loss_mlp": 0.00764978, "balance_loss_clip": 1.05486274, "balance_loss_mlp": 1.00054526, "epoch": 0.25972464378043647, "flos": 13772281628160.0, "grad_norm": 2.1129042815780026, "language_loss": 0.82275307, "learning_rate": 3.471642178932845e-06, "loss": 0.84218097, "num_input_tokens_seen": 46225670, "step": 2160, "time_per_iteration": 2.7799184322357178 }, { "auxiliary_loss_clip": 0.01177577, "auxiliary_loss_mlp": 0.01038868, "balance_loss_clip": 1.05226755, "balance_loss_mlp": 1.02931297, "epoch": 0.2598448866710756, "flos": 19573578391680.0, "grad_norm": 1.936514405578926, "language_loss": 0.89751923, "learning_rate": 3.471114566424613e-06, "loss": 0.91968369, "num_input_tokens_seen": 46244130, "step": 2161, "time_per_iteration": 2.901782989501953 }, { "auxiliary_loss_clip": 0.01175666, "auxiliary_loss_mlp": 0.01029722, "balance_loss_clip": 1.05627453, "balance_loss_mlp": 1.02007186, "epoch": 0.25996512956171464, "flos": 21653237053440.0, "grad_norm": 2.2103103150702235, "language_loss": 0.7623933, "learning_rate": 3.4705867307494715e-06, "loss": 0.78444713, "num_input_tokens_seen": 46263200, "step": 2162, "time_per_iteration": 2.8005387783050537 }, { "auxiliary_loss_clip": 0.01193319, "auxiliary_loss_mlp": 0.01037348, "balance_loss_clip": 1.05383658, "balance_loss_mlp": 1.02706075, "epoch": 0.26008537245235375, "flos": 18223480869120.0, "grad_norm": 4.416136885711264, "language_loss": 0.8449707, "learning_rate": 3.470058671987492e-06, "loss": 0.86727738, "num_input_tokens_seen": 46281465, "step": 2163, "time_per_iteration": 2.743229627609253 }, { "auxiliary_loss_clip": 0.0119742, "auxiliary_loss_mlp": 0.01033849, "balance_loss_clip": 1.05565071, "balance_loss_mlp": 1.02339423, "epoch": 0.26020561534299286, "flos": 24645385843200.0, "grad_norm": 1.860966638672939, "language_loss": 0.84184545, "learning_rate": 3.4695303902187805e-06, "loss": 0.86415815, "num_input_tokens_seen": 46301020, "step": 2164, "time_per_iteration": 2.7318592071533203 }, { "auxiliary_loss_clip": 0.01158897, "auxiliary_loss_mlp": 0.01043429, "balance_loss_clip": 1.0489018, "balance_loss_mlp": 1.03354073, "epoch": 0.2603258582336319, "flos": 25773662926080.0, "grad_norm": 2.087856966405713, "language_loss": 0.78345549, "learning_rate": 3.469001885523478e-06, "loss": 0.80547881, "num_input_tokens_seen": 46321740, "step": 2165, "time_per_iteration": 2.8161497116088867 }, { "auxiliary_loss_clip": 0.01207479, "auxiliary_loss_mlp": 0.01028191, "balance_loss_clip": 1.05798817, "balance_loss_mlp": 1.01883256, "epoch": 0.260446101124271, "flos": 28766314506240.0, "grad_norm": 1.8186532244428, "language_loss": 0.8131355, "learning_rate": 3.4684731579817568e-06, "loss": 0.83549225, "num_input_tokens_seen": 46342730, "step": 2166, "time_per_iteration": 2.7604758739471436 }, { "auxiliary_loss_clip": 0.01133991, "auxiliary_loss_mlp": 0.01038768, "balance_loss_clip": 1.0512712, "balance_loss_mlp": 1.02888536, "epoch": 0.26056634401491013, "flos": 25666757072640.0, "grad_norm": 1.7063066253593668, "language_loss": 0.76923245, "learning_rate": 3.4679442076738247e-06, "loss": 0.79096007, "num_input_tokens_seen": 46362445, "step": 2167, "time_per_iteration": 2.871920585632324 }, { "auxiliary_loss_clip": 0.01211173, "auxiliary_loss_mlp": 0.01035963, "balance_loss_clip": 1.05940676, "balance_loss_mlp": 1.02558017, "epoch": 0.2606865869055492, "flos": 27052765217280.0, "grad_norm": 1.860217774491937, "language_loss": 0.83401871, "learning_rate": 3.4674150346799245e-06, "loss": 0.85649002, "num_input_tokens_seen": 46382145, "step": 2168, "time_per_iteration": 2.69301700592041 }, { "auxiliary_loss_clip": 0.01180665, "auxiliary_loss_mlp": 0.01030314, "balance_loss_clip": 1.05592287, "balance_loss_mlp": 1.02005029, "epoch": 0.2608068297961883, "flos": 17712615686400.0, "grad_norm": 2.1296615444067273, "language_loss": 0.80305302, "learning_rate": 3.4668856390803295e-06, "loss": 0.82516289, "num_input_tokens_seen": 46400025, "step": 2169, "time_per_iteration": 2.7036795616149902 }, { "auxiliary_loss_clip": 0.01182633, "auxiliary_loss_mlp": 0.01040415, "balance_loss_clip": 1.05450833, "balance_loss_mlp": 1.0300976, "epoch": 0.2609270726868274, "flos": 18551632544640.0, "grad_norm": 2.101467351468322, "language_loss": 0.90076733, "learning_rate": 3.4663560209553495e-06, "loss": 0.92299783, "num_input_tokens_seen": 46418090, "step": 2170, "time_per_iteration": 2.741187334060669 }, { "auxiliary_loss_clip": 0.01167436, "auxiliary_loss_mlp": 0.01035671, "balance_loss_clip": 1.05249321, "balance_loss_mlp": 1.02555037, "epoch": 0.26104731557746647, "flos": 21835699165440.0, "grad_norm": 1.5820962747393674, "language_loss": 0.79232526, "learning_rate": 3.4658261803853267e-06, "loss": 0.81435627, "num_input_tokens_seen": 46436015, "step": 2171, "time_per_iteration": 2.7068581581115723 }, { "auxiliary_loss_clip": 0.01175742, "auxiliary_loss_mlp": 0.01028674, "balance_loss_clip": 1.05617809, "balance_loss_mlp": 1.01877928, "epoch": 0.2611675584681056, "flos": 21689650465920.0, "grad_norm": 2.139791024455814, "language_loss": 0.80061936, "learning_rate": 3.4652961174506383e-06, "loss": 0.82266349, "num_input_tokens_seen": 46455885, "step": 2172, "time_per_iteration": 3.5975639820098877 }, { "auxiliary_loss_clip": 0.0108444, "auxiliary_loss_mlp": 0.01005786, "balance_loss_clip": 1.01999354, "balance_loss_mlp": 1.00323474, "epoch": 0.2612878013587447, "flos": 71862101389440.0, "grad_norm": 0.9615243342852774, "language_loss": 0.58069944, "learning_rate": 3.464765832231694e-06, "loss": 0.60160172, "num_input_tokens_seen": 46510050, "step": 2173, "time_per_iteration": 3.298933982849121 }, { "auxiliary_loss_clip": 0.01195969, "auxiliary_loss_mlp": 0.01036208, "balance_loss_clip": 1.05890679, "balance_loss_mlp": 1.02623582, "epoch": 0.26140804424938374, "flos": 20227511445120.0, "grad_norm": 2.2990585542003266, "language_loss": 0.70660454, "learning_rate": 3.4642353248089373e-06, "loss": 0.7289263, "num_input_tokens_seen": 46528810, "step": 2174, "time_per_iteration": 2.8091073036193848 }, { "auxiliary_loss_clip": 0.01171861, "auxiliary_loss_mlp": 0.01036182, "balance_loss_clip": 1.05329525, "balance_loss_mlp": 1.02585888, "epoch": 0.26152828714002285, "flos": 25557085872000.0, "grad_norm": 1.8034301111642723, "language_loss": 0.80496556, "learning_rate": 3.463704595262846e-06, "loss": 0.82704604, "num_input_tokens_seen": 46549690, "step": 2175, "time_per_iteration": 3.8178837299346924 }, { "auxiliary_loss_clip": 0.01164182, "auxiliary_loss_mlp": 0.01030556, "balance_loss_clip": 1.05607224, "balance_loss_mlp": 1.02093029, "epoch": 0.26164853003066196, "flos": 25446516831360.0, "grad_norm": 1.8763549566028066, "language_loss": 0.70642006, "learning_rate": 3.463173643673931e-06, "loss": 0.72836745, "num_input_tokens_seen": 46572215, "step": 2176, "time_per_iteration": 2.7861552238464355 }, { "auxiliary_loss_clip": 0.01094338, "auxiliary_loss_mlp": 0.01004648, "balance_loss_clip": 1.02289259, "balance_loss_mlp": 1.00233555, "epoch": 0.261768772921301, "flos": 53944580568960.0, "grad_norm": 0.8987036025135968, "language_loss": 0.63464659, "learning_rate": 3.4626424701227387e-06, "loss": 0.65563643, "num_input_tokens_seen": 46627275, "step": 2177, "time_per_iteration": 4.102824926376343 }, { "auxiliary_loss_clip": 0.0110583, "auxiliary_loss_mlp": 0.01009487, "balance_loss_clip": 1.02470589, "balance_loss_mlp": 1.00707853, "epoch": 0.26188901581194013, "flos": 70687606481280.0, "grad_norm": 0.8334593467847138, "language_loss": 0.55782431, "learning_rate": 3.4621110746898452e-06, "loss": 0.57897747, "num_input_tokens_seen": 46695135, "step": 2178, "time_per_iteration": 4.3226542472839355 }, { "auxiliary_loss_clip": 0.01198421, "auxiliary_loss_mlp": 0.01035405, "balance_loss_clip": 1.05922937, "balance_loss_mlp": 1.02536762, "epoch": 0.2620092587025792, "flos": 21069580959360.0, "grad_norm": 1.840232647129895, "language_loss": 0.75102186, "learning_rate": 3.4615794574558654e-06, "loss": 0.77336013, "num_input_tokens_seen": 46714145, "step": 2179, "time_per_iteration": 2.7355494499206543 }, { "auxiliary_loss_clip": 0.01175348, "auxiliary_loss_mlp": 0.01030893, "balance_loss_clip": 1.05278587, "balance_loss_mlp": 1.02123666, "epoch": 0.2621295015932183, "flos": 18369601395840.0, "grad_norm": 2.4706412017102948, "language_loss": 0.84754932, "learning_rate": 3.4610476185014436e-06, "loss": 0.86961174, "num_input_tokens_seen": 46731405, "step": 2180, "time_per_iteration": 2.6961328983306885 }, { "auxiliary_loss_clip": 0.01208733, "auxiliary_loss_mlp": 0.01041557, "balance_loss_clip": 1.05797112, "balance_loss_mlp": 1.0313046, "epoch": 0.2622497444838574, "flos": 23659997063040.0, "grad_norm": 1.9022042559792318, "language_loss": 0.79593945, "learning_rate": 3.4605155579072597e-06, "loss": 0.81844229, "num_input_tokens_seen": 46751260, "step": 2181, "time_per_iteration": 2.720080614089966 }, { "auxiliary_loss_clip": 0.0114067, "auxiliary_loss_mlp": 0.01036271, "balance_loss_clip": 1.04921651, "balance_loss_mlp": 1.02683568, "epoch": 0.26236998737449646, "flos": 22123810154880.0, "grad_norm": 1.7972152796016299, "language_loss": 0.7117247, "learning_rate": 3.459983275754027e-06, "loss": 0.7334941, "num_input_tokens_seen": 46770155, "step": 2182, "time_per_iteration": 2.837202787399292 }, { "auxiliary_loss_clip": 0.01207463, "auxiliary_loss_mlp": 0.01032889, "balance_loss_clip": 1.0573355, "balance_loss_mlp": 1.02303624, "epoch": 0.26249023026513557, "flos": 17895185539200.0, "grad_norm": 3.974735705417144, "language_loss": 0.79529798, "learning_rate": 3.4594507721224918e-06, "loss": 0.81770152, "num_input_tokens_seen": 46788805, "step": 2183, "time_per_iteration": 2.668518304824829 }, { "auxiliary_loss_clip": 0.01181747, "auxiliary_loss_mlp": 0.01041515, "balance_loss_clip": 1.05528641, "balance_loss_mlp": 1.03237212, "epoch": 0.2626104731557747, "flos": 18332936588160.0, "grad_norm": 1.6536594359770505, "language_loss": 0.82277644, "learning_rate": 3.4589180470934353e-06, "loss": 0.84500903, "num_input_tokens_seen": 46808670, "step": 2184, "time_per_iteration": 2.7255167961120605 }, { "auxiliary_loss_clip": 0.01197688, "auxiliary_loss_mlp": 0.0103631, "balance_loss_clip": 1.05397189, "balance_loss_mlp": 1.02615309, "epoch": 0.26273071604641374, "flos": 19317714837120.0, "grad_norm": 2.1611291128342334, "language_loss": 0.7702406, "learning_rate": 3.4583851007476713e-06, "loss": 0.7925806, "num_input_tokens_seen": 46827140, "step": 2185, "time_per_iteration": 2.7311878204345703 }, { "auxiliary_loss_clip": 0.01165102, "auxiliary_loss_mlp": 0.01040107, "balance_loss_clip": 1.05444121, "balance_loss_mlp": 1.0299207, "epoch": 0.26285095893705285, "flos": 18327477720960.0, "grad_norm": 2.135956953688338, "language_loss": 0.68645656, "learning_rate": 3.4578519331660464e-06, "loss": 0.70850861, "num_input_tokens_seen": 46844135, "step": 2186, "time_per_iteration": 2.7207775115966797 }, { "auxiliary_loss_clip": 0.01190914, "auxiliary_loss_mlp": 0.01034134, "balance_loss_clip": 1.05875897, "balance_loss_mlp": 1.0247643, "epoch": 0.26297120182769196, "flos": 20193827466240.0, "grad_norm": 17.809857089328183, "language_loss": 0.82067966, "learning_rate": 3.4573185444294426e-06, "loss": 0.84293014, "num_input_tokens_seen": 46862500, "step": 2187, "time_per_iteration": 2.7988717555999756 }, { "auxiliary_loss_clip": 0.01176632, "auxiliary_loss_mlp": 0.00765028, "balance_loss_clip": 1.0542202, "balance_loss_mlp": 1.00057769, "epoch": 0.263091444718331, "flos": 22418421505920.0, "grad_norm": 2.0510791474024535, "language_loss": 0.7874338, "learning_rate": 3.456784934618774e-06, "loss": 0.80685037, "num_input_tokens_seen": 46883665, "step": 2188, "time_per_iteration": 2.795130968093872 }, { "auxiliary_loss_clip": 0.01174268, "auxiliary_loss_mlp": 0.01039173, "balance_loss_clip": 1.05261922, "balance_loss_mlp": 1.02937949, "epoch": 0.2632116876089701, "flos": 19024827338880.0, "grad_norm": 2.102418552233927, "language_loss": 0.796381, "learning_rate": 3.4562511038149897e-06, "loss": 0.8185153, "num_input_tokens_seen": 46899160, "step": 2189, "time_per_iteration": 2.693253993988037 }, { "auxiliary_loss_clip": 0.01052286, "auxiliary_loss_mlp": 0.01008738, "balance_loss_clip": 1.02056551, "balance_loss_mlp": 1.0066514, "epoch": 0.26333193049960923, "flos": 67308054531840.0, "grad_norm": 0.8593881775456508, "language_loss": 0.57728714, "learning_rate": 3.4557170520990705e-06, "loss": 0.59789735, "num_input_tokens_seen": 46959835, "step": 2190, "time_per_iteration": 3.4310836791992188 }, { "auxiliary_loss_clip": 0.01188511, "auxiliary_loss_mlp": 0.01038746, "balance_loss_clip": 1.05677354, "balance_loss_mlp": 1.02988839, "epoch": 0.2634521733902483, "flos": 25048806468480.0, "grad_norm": 2.3401930449806394, "language_loss": 0.86477405, "learning_rate": 3.4551827795520324e-06, "loss": 0.88704669, "num_input_tokens_seen": 46982720, "step": 2191, "time_per_iteration": 2.7360470294952393 }, { "auxiliary_loss_clip": 0.01190251, "auxiliary_loss_mlp": 0.0103506, "balance_loss_clip": 1.05276489, "balance_loss_mlp": 1.02530289, "epoch": 0.2635724162808874, "flos": 20594985534720.0, "grad_norm": 1.7063172322199287, "language_loss": 0.85233325, "learning_rate": 3.4546482862549226e-06, "loss": 0.87458634, "num_input_tokens_seen": 47003035, "step": 2192, "time_per_iteration": 2.7484350204467773 }, { "auxiliary_loss_clip": 0.01159714, "auxiliary_loss_mlp": 0.0103439, "balance_loss_clip": 1.05284488, "balance_loss_mlp": 1.02428746, "epoch": 0.2636926591715265, "flos": 19244636616960.0, "grad_norm": 2.231681787812652, "language_loss": 0.79020196, "learning_rate": 3.4541135722888253e-06, "loss": 0.81214297, "num_input_tokens_seen": 47019625, "step": 2193, "time_per_iteration": 2.7736289501190186 }, { "auxiliary_loss_clip": 0.01208724, "auxiliary_loss_mlp": 0.01037253, "balance_loss_clip": 1.05882609, "balance_loss_mlp": 1.02780557, "epoch": 0.26381290206216557, "flos": 28804882734720.0, "grad_norm": 2.1465003968563545, "language_loss": 0.80461538, "learning_rate": 3.453578637734854e-06, "loss": 0.82707512, "num_input_tokens_seen": 47040815, "step": 2194, "time_per_iteration": 2.681612014770508 }, { "auxiliary_loss_clip": 0.01213133, "auxiliary_loss_mlp": 0.01038825, "balance_loss_clip": 1.06305838, "balance_loss_mlp": 1.02930617, "epoch": 0.2639331449528047, "flos": 25008909436800.0, "grad_norm": 1.6745718268148027, "language_loss": 0.78559887, "learning_rate": 3.4530434826741605e-06, "loss": 0.80811846, "num_input_tokens_seen": 47061755, "step": 2195, "time_per_iteration": 2.6600494384765625 }, { "auxiliary_loss_clip": 0.01173468, "auxiliary_loss_mlp": 0.01033287, "balance_loss_clip": 1.05574429, "balance_loss_mlp": 1.02404785, "epoch": 0.26405338784344373, "flos": 46535775465600.0, "grad_norm": 1.620102685890706, "language_loss": 0.69317663, "learning_rate": 3.452508107187926e-06, "loss": 0.71524417, "num_input_tokens_seen": 47085130, "step": 2196, "time_per_iteration": 3.0667669773101807 }, { "auxiliary_loss_clip": 0.01131302, "auxiliary_loss_mlp": 0.01035448, "balance_loss_clip": 1.04663563, "balance_loss_mlp": 1.02534461, "epoch": 0.26417363073408284, "flos": 21179467641600.0, "grad_norm": 1.9505966582222527, "language_loss": 0.77490026, "learning_rate": 3.451972511357366e-06, "loss": 0.7965678, "num_input_tokens_seen": 47104675, "step": 2197, "time_per_iteration": 3.7696046829223633 }, { "auxiliary_loss_clip": 0.01191808, "auxiliary_loss_mlp": 0.01039134, "balance_loss_clip": 1.0587039, "balance_loss_mlp": 1.02941227, "epoch": 0.26429387362472195, "flos": 22674751937280.0, "grad_norm": 1.7553847475644808, "language_loss": 0.85581696, "learning_rate": 3.45143669526373e-06, "loss": 0.87812638, "num_input_tokens_seen": 47124435, "step": 2198, "time_per_iteration": 2.949803352355957 }, { "auxiliary_loss_clip": 0.01083148, "auxiliary_loss_mlp": 0.01003335, "balance_loss_clip": 1.02117181, "balance_loss_mlp": 1.00087929, "epoch": 0.264414116515361, "flos": 67180534272000.0, "grad_norm": 0.7858887255874223, "language_loss": 0.63203478, "learning_rate": 3.450900658988302e-06, "loss": 0.65289956, "num_input_tokens_seen": 47185985, "step": 2199, "time_per_iteration": 3.2081148624420166 }, { "auxiliary_loss_clip": 0.0116877, "auxiliary_loss_mlp": 0.01030559, "balance_loss_clip": 1.05416107, "balance_loss_mlp": 1.02042007, "epoch": 0.2645343594060001, "flos": 25664709997440.0, "grad_norm": 1.925851027586999, "language_loss": 0.77978539, "learning_rate": 3.450364402612397e-06, "loss": 0.80177867, "num_input_tokens_seen": 47203140, "step": 2200, "time_per_iteration": 2.9211459159851074 }, { "auxiliary_loss_clip": 0.0117188, "auxiliary_loss_mlp": 0.01034775, "balance_loss_clip": 1.05399227, "balance_loss_mlp": 1.02505374, "epoch": 0.26465460229663923, "flos": 22491822948480.0, "grad_norm": 2.068813562591618, "language_loss": 0.84078729, "learning_rate": 3.449827926217366e-06, "loss": 0.86285383, "num_input_tokens_seen": 47222575, "step": 2201, "time_per_iteration": 3.695934295654297 }, { "auxiliary_loss_clip": 0.01178164, "auxiliary_loss_mlp": 0.01031537, "balance_loss_clip": 1.05020833, "balance_loss_mlp": 1.02198243, "epoch": 0.2647748451872783, "flos": 29388036038400.0, "grad_norm": 3.046940668421117, "language_loss": 0.80871421, "learning_rate": 3.449291229884591e-06, "loss": 0.83081126, "num_input_tokens_seen": 47243815, "step": 2202, "time_per_iteration": 2.7967514991760254 }, { "auxiliary_loss_clip": 0.01166177, "auxiliary_loss_mlp": 0.01039522, "balance_loss_clip": 1.0522182, "balance_loss_mlp": 1.02996111, "epoch": 0.2648950880779174, "flos": 26797799502720.0, "grad_norm": 2.1918455144218814, "language_loss": 0.86615229, "learning_rate": 3.4487543136954887e-06, "loss": 0.88820922, "num_input_tokens_seen": 47263435, "step": 2203, "time_per_iteration": 4.118595838546753 }, { "auxiliary_loss_clip": 0.01161024, "auxiliary_loss_mlp": 0.01038383, "balance_loss_clip": 1.05245149, "balance_loss_mlp": 1.02973485, "epoch": 0.2650153309685565, "flos": 28841008838400.0, "grad_norm": 3.1364960919281066, "language_loss": 0.90942639, "learning_rate": 3.448217177731509e-06, "loss": 0.93142051, "num_input_tokens_seen": 47283920, "step": 2204, "time_per_iteration": 3.7646875381469727 }, { "auxiliary_loss_clip": 0.01173943, "auxiliary_loss_mlp": 0.01034572, "balance_loss_clip": 1.05686116, "balance_loss_mlp": 1.02533329, "epoch": 0.26513557385919556, "flos": 20303247271680.0, "grad_norm": 2.3170343535227738, "language_loss": 0.77963054, "learning_rate": 3.4476798220741348e-06, "loss": 0.80171561, "num_input_tokens_seen": 47302800, "step": 2205, "time_per_iteration": 2.6515469551086426 }, { "auxiliary_loss_clip": 0.0120555, "auxiliary_loss_mlp": 0.01030253, "balance_loss_clip": 1.05951381, "balance_loss_mlp": 1.02199793, "epoch": 0.26525581674983467, "flos": 17676274101120.0, "grad_norm": 1.8979961268120034, "language_loss": 0.78407425, "learning_rate": 3.4471422468048826e-06, "loss": 0.80643231, "num_input_tokens_seen": 47321525, "step": 2206, "time_per_iteration": 2.7334678173065186 }, { "auxiliary_loss_clip": 0.01187018, "auxiliary_loss_mlp": 0.01029155, "balance_loss_clip": 1.05805933, "balance_loss_mlp": 1.0194037, "epoch": 0.2653760596404738, "flos": 26833746038400.0, "grad_norm": 2.277201840629732, "language_loss": 0.72842371, "learning_rate": 3.4466044520053022e-06, "loss": 0.75058544, "num_input_tokens_seen": 47340530, "step": 2207, "time_per_iteration": 2.6941213607788086 }, { "auxiliary_loss_clip": 0.01162428, "auxiliary_loss_mlp": 0.01029874, "balance_loss_clip": 1.0503912, "balance_loss_mlp": 1.02028346, "epoch": 0.26549630253111284, "flos": 22782160581120.0, "grad_norm": 1.8891307797456605, "language_loss": 0.60347766, "learning_rate": 3.446066437756977e-06, "loss": 0.62540066, "num_input_tokens_seen": 47359735, "step": 2208, "time_per_iteration": 2.639209508895874 }, { "auxiliary_loss_clip": 0.01174516, "auxiliary_loss_mlp": 0.01028589, "balance_loss_clip": 1.05377853, "balance_loss_mlp": 1.01922512, "epoch": 0.26561654542175195, "flos": 23550002640000.0, "grad_norm": 3.651381689863993, "language_loss": 0.75389469, "learning_rate": 3.4455282041415224e-06, "loss": 0.7759257, "num_input_tokens_seen": 47378945, "step": 2209, "time_per_iteration": 2.74253249168396 }, { "auxiliary_loss_clip": 0.01165353, "auxiliary_loss_mlp": 0.01025909, "balance_loss_clip": 1.05466533, "balance_loss_mlp": 1.01708794, "epoch": 0.265736788312391, "flos": 26906680604160.0, "grad_norm": 2.66029368421958, "language_loss": 0.87474716, "learning_rate": 3.4449897512405894e-06, "loss": 0.89665973, "num_input_tokens_seen": 47398095, "step": 2210, "time_per_iteration": 2.8266594409942627 }, { "auxiliary_loss_clip": 0.0112563, "auxiliary_loss_mlp": 0.00764771, "balance_loss_clip": 1.04431343, "balance_loss_mlp": 1.00049996, "epoch": 0.2658570312030301, "flos": 23477139901440.0, "grad_norm": 4.062567572063346, "language_loss": 0.75333297, "learning_rate": 3.444451079135859e-06, "loss": 0.77223694, "num_input_tokens_seen": 47417605, "step": 2211, "time_per_iteration": 2.7900304794311523 }, { "auxiliary_loss_clip": 0.0113752, "auxiliary_loss_mlp": 0.00765521, "balance_loss_clip": 1.04547262, "balance_loss_mlp": 1.00056267, "epoch": 0.2659772740936692, "flos": 21866402315520.0, "grad_norm": 2.343494770971865, "language_loss": 0.74437428, "learning_rate": 3.4439121879090493e-06, "loss": 0.76340473, "num_input_tokens_seen": 47435385, "step": 2212, "time_per_iteration": 2.788841724395752 }, { "auxiliary_loss_clip": 0.01180068, "auxiliary_loss_mlp": 0.01029973, "balance_loss_clip": 1.05415487, "balance_loss_mlp": 1.01999485, "epoch": 0.2660975169843083, "flos": 19793100360960.0, "grad_norm": 2.26005472447892, "language_loss": 0.83260524, "learning_rate": 3.4433730776419082e-06, "loss": 0.85470569, "num_input_tokens_seen": 47454310, "step": 2213, "time_per_iteration": 2.7120585441589355 }, { "auxiliary_loss_clip": 0.01194425, "auxiliary_loss_mlp": 0.00765094, "balance_loss_clip": 1.05515933, "balance_loss_mlp": 1.00060618, "epoch": 0.2662177598749474, "flos": 29018981750400.0, "grad_norm": 3.7683472434301657, "language_loss": 0.8070997, "learning_rate": 3.4428337484162183e-06, "loss": 0.82669485, "num_input_tokens_seen": 47475120, "step": 2214, "time_per_iteration": 2.6899020671844482 }, { "auxiliary_loss_clip": 0.01169915, "auxiliary_loss_mlp": 0.01033955, "balance_loss_clip": 1.05018306, "balance_loss_mlp": 1.02436447, "epoch": 0.2663380027655865, "flos": 21762549118080.0, "grad_norm": 2.4714374393625023, "language_loss": 0.84697616, "learning_rate": 3.442294200313797e-06, "loss": 0.86901486, "num_input_tokens_seen": 47493150, "step": 2215, "time_per_iteration": 2.771312713623047 }, { "auxiliary_loss_clip": 0.0110376, "auxiliary_loss_mlp": 0.01002996, "balance_loss_clip": 1.02397573, "balance_loss_mlp": 1.00074315, "epoch": 0.26645824565622556, "flos": 66980333819520.0, "grad_norm": 0.7580198241322796, "language_loss": 0.5273037, "learning_rate": 3.4417544334164916e-06, "loss": 0.54837132, "num_input_tokens_seen": 47557295, "step": 2216, "time_per_iteration": 3.213793992996216 }, { "auxiliary_loss_clip": 0.01163105, "auxiliary_loss_mlp": 0.01026035, "balance_loss_clip": 1.05635834, "balance_loss_mlp": 1.01627803, "epoch": 0.26657848854686467, "flos": 25264198373760.0, "grad_norm": 1.6001771199829078, "language_loss": 0.77367234, "learning_rate": 3.4412144478061854e-06, "loss": 0.7955637, "num_input_tokens_seen": 47579705, "step": 2217, "time_per_iteration": 2.823359251022339 }, { "auxiliary_loss_clip": 0.01099554, "auxiliary_loss_mlp": 0.01045631, "balance_loss_clip": 1.04512, "balance_loss_mlp": 1.03568304, "epoch": 0.2666987314375038, "flos": 23696769611520.0, "grad_norm": 2.318847722805783, "language_loss": 0.75470817, "learning_rate": 3.4406742435647925e-06, "loss": 0.77616, "num_input_tokens_seen": 47599770, "step": 2218, "time_per_iteration": 3.0209999084472656 }, { "auxiliary_loss_clip": 0.01191854, "auxiliary_loss_mlp": 0.01033939, "balance_loss_clip": 1.05799127, "balance_loss_mlp": 1.02515936, "epoch": 0.26681897432814283, "flos": 27048958375680.0, "grad_norm": 2.203887392903152, "language_loss": 0.7875753, "learning_rate": 3.440133820774263e-06, "loss": 0.80983323, "num_input_tokens_seen": 47619580, "step": 2219, "time_per_iteration": 2.919261932373047 }, { "auxiliary_loss_clip": 0.01180266, "auxiliary_loss_mlp": 0.01037246, "balance_loss_clip": 1.05545056, "balance_loss_mlp": 1.02709556, "epoch": 0.26693921721878194, "flos": 28985944216320.0, "grad_norm": 2.1575144226540366, "language_loss": 0.82079363, "learning_rate": 3.439593179516578e-06, "loss": 0.84296876, "num_input_tokens_seen": 47639490, "step": 2220, "time_per_iteration": 2.717928886413574 }, { "auxiliary_loss_clip": 0.01183341, "auxiliary_loss_mlp": 0.01032399, "balance_loss_clip": 1.05661511, "balance_loss_mlp": 1.02225447, "epoch": 0.26705946010942105, "flos": 21507834798720.0, "grad_norm": 1.8409073249817467, "language_loss": 0.81362921, "learning_rate": 3.4390523198737524e-06, "loss": 0.83578658, "num_input_tokens_seen": 47658650, "step": 2221, "time_per_iteration": 2.6761701107025146 }, { "auxiliary_loss_clip": 0.01208861, "auxiliary_loss_mlp": 0.00764551, "balance_loss_clip": 1.05828381, "balance_loss_mlp": 1.00053918, "epoch": 0.2671797030000601, "flos": 21471277731840.0, "grad_norm": 2.208974590899092, "language_loss": 0.73598623, "learning_rate": 3.4385112419278333e-06, "loss": 0.75572038, "num_input_tokens_seen": 47679875, "step": 2222, "time_per_iteration": 2.6392171382904053 }, { "auxiliary_loss_clip": 0.01094371, "auxiliary_loss_mlp": 0.01003475, "balance_loss_clip": 1.0234437, "balance_loss_mlp": 1.00135362, "epoch": 0.2672999458906992, "flos": 64189929767040.0, "grad_norm": 0.7903585930415805, "language_loss": 0.64762592, "learning_rate": 3.4379699457609033e-06, "loss": 0.66860437, "num_input_tokens_seen": 47737700, "step": 2223, "time_per_iteration": 4.2064690589904785 }, { "auxiliary_loss_clip": 0.01167187, "auxiliary_loss_mlp": 0.01038381, "balance_loss_clip": 1.05111909, "balance_loss_mlp": 1.02864802, "epoch": 0.26742018878133833, "flos": 16909042573440.0, "grad_norm": 3.5092957980360606, "language_loss": 0.90148485, "learning_rate": 3.4374284314550755e-06, "loss": 0.92354047, "num_input_tokens_seen": 47756740, "step": 2224, "time_per_iteration": 2.7145373821258545 }, { "auxiliary_loss_clip": 0.01206341, "auxiliary_loss_mlp": 0.01033135, "balance_loss_clip": 1.0579654, "balance_loss_mlp": 1.02369392, "epoch": 0.2675404316719774, "flos": 20667560964480.0, "grad_norm": 5.600253131259644, "language_loss": 0.81072879, "learning_rate": 3.436886699092498e-06, "loss": 0.83312356, "num_input_tokens_seen": 47775255, "step": 2225, "time_per_iteration": 2.6467885971069336 }, { "auxiliary_loss_clip": 0.01209723, "auxiliary_loss_mlp": 0.01033582, "balance_loss_clip": 1.05911064, "balance_loss_mlp": 1.02330661, "epoch": 0.2676606745626165, "flos": 17485013157120.0, "grad_norm": 3.6126822767378806, "language_loss": 0.71566981, "learning_rate": 3.4363447487553502e-06, "loss": 0.73810285, "num_input_tokens_seen": 47788570, "step": 2226, "time_per_iteration": 2.5871429443359375 }, { "auxiliary_loss_clip": 0.01174831, "auxiliary_loss_mlp": 0.01030288, "balance_loss_clip": 1.05610907, "balance_loss_mlp": 1.02025664, "epoch": 0.26778091745325555, "flos": 27852675143040.0, "grad_norm": 2.9344914465544645, "language_loss": 0.7838732, "learning_rate": 3.4358025805258455e-06, "loss": 0.80592442, "num_input_tokens_seen": 47808275, "step": 2227, "time_per_iteration": 3.6969895362854004 }, { "auxiliary_loss_clip": 0.01151781, "auxiliary_loss_mlp": 0.01033916, "balance_loss_clip": 1.05118525, "balance_loss_mlp": 1.02461147, "epoch": 0.26790116034389466, "flos": 20955995176320.0, "grad_norm": 2.074283011053654, "language_loss": 0.83585846, "learning_rate": 3.435260194486232e-06, "loss": 0.85771543, "num_input_tokens_seen": 47826245, "step": 2228, "time_per_iteration": 2.8312504291534424 }, { "auxiliary_loss_clip": 0.01175381, "auxiliary_loss_mlp": 0.01030128, "balance_loss_clip": 1.05546379, "balance_loss_mlp": 1.01988184, "epoch": 0.2680214032345338, "flos": 18040659621120.0, "grad_norm": 2.623568291515022, "language_loss": 0.82777762, "learning_rate": 3.4347175907187875e-06, "loss": 0.84983277, "num_input_tokens_seen": 47843235, "step": 2229, "time_per_iteration": 4.031090497970581 }, { "auxiliary_loss_clip": 0.01187474, "auxiliary_loss_mlp": 0.01040285, "balance_loss_clip": 1.05483842, "balance_loss_mlp": 1.03117788, "epoch": 0.26814164612517283, "flos": 22419427086720.0, "grad_norm": 2.2295506647598398, "language_loss": 0.87935054, "learning_rate": 3.4341747693058254e-06, "loss": 0.90162814, "num_input_tokens_seen": 47861710, "step": 2230, "time_per_iteration": 3.563784599304199 }, { "auxiliary_loss_clip": 0.01096257, "auxiliary_loss_mlp": 0.01027787, "balance_loss_clip": 1.0443368, "balance_loss_mlp": 1.01838779, "epoch": 0.26826188901581194, "flos": 35627371159680.0, "grad_norm": 1.8285461580452447, "language_loss": 0.77447498, "learning_rate": 3.4336317303296916e-06, "loss": 0.79571545, "num_input_tokens_seen": 47882685, "step": 2231, "time_per_iteration": 3.4041006565093994 }, { "auxiliary_loss_clip": 0.01185438, "auxiliary_loss_mlp": 0.01039052, "balance_loss_clip": 1.05485821, "balance_loss_mlp": 1.0299325, "epoch": 0.26838213190645105, "flos": 17639788861440.0, "grad_norm": 2.28244769017708, "language_loss": 0.75351381, "learning_rate": 3.4330884738727635e-06, "loss": 0.77575874, "num_input_tokens_seen": 47900860, "step": 2232, "time_per_iteration": 2.847151517868042 }, { "auxiliary_loss_clip": 0.01135259, "auxiliary_loss_mlp": 0.01030107, "balance_loss_clip": 1.05008101, "balance_loss_mlp": 1.02101183, "epoch": 0.2685023747970901, "flos": 22674823764480.0, "grad_norm": 2.4463649638151583, "language_loss": 0.70890176, "learning_rate": 3.4325450000174535e-06, "loss": 0.73055542, "num_input_tokens_seen": 47917500, "step": 2233, "time_per_iteration": 3.0780458450317383 }, { "auxiliary_loss_clip": 0.01137847, "auxiliary_loss_mlp": 0.0103523, "balance_loss_clip": 1.04939842, "balance_loss_mlp": 1.02524018, "epoch": 0.2686226176877292, "flos": 20120533764480.0, "grad_norm": 3.531227007619784, "language_loss": 0.74778259, "learning_rate": 3.4320013088462067e-06, "loss": 0.76951337, "num_input_tokens_seen": 47934860, "step": 2234, "time_per_iteration": 2.7203075885772705 }, { "auxiliary_loss_clip": 0.0116323, "auxiliary_loss_mlp": 0.01035187, "balance_loss_clip": 1.05042088, "balance_loss_mlp": 1.02623391, "epoch": 0.2687428605783683, "flos": 21872040750720.0, "grad_norm": 1.6287222567088606, "language_loss": 0.81778407, "learning_rate": 3.431457400441499e-06, "loss": 0.83976823, "num_input_tokens_seen": 47955255, "step": 2235, "time_per_iteration": 2.765872001647949 }, { "auxiliary_loss_clip": 0.01035556, "auxiliary_loss_mlp": 0.01006201, "balance_loss_clip": 1.01941466, "balance_loss_mlp": 1.00393569, "epoch": 0.2688631034690074, "flos": 69943320766080.0, "grad_norm": 0.9123079505956964, "language_loss": 0.60835618, "learning_rate": 3.4309132748858424e-06, "loss": 0.62877369, "num_input_tokens_seen": 48016245, "step": 2236, "time_per_iteration": 3.429778575897217 }, { "auxiliary_loss_clip": 0.01188564, "auxiliary_loss_mlp": 0.01030612, "balance_loss_clip": 1.05692816, "balance_loss_mlp": 1.02183867, "epoch": 0.2689833463596465, "flos": 22856639431680.0, "grad_norm": 1.66521320211076, "language_loss": 0.83395159, "learning_rate": 3.430368932261779e-06, "loss": 0.85614336, "num_input_tokens_seen": 48036600, "step": 2237, "time_per_iteration": 2.695408821105957 }, { "auxiliary_loss_clip": 0.01175367, "auxiliary_loss_mlp": 0.01032113, "balance_loss_clip": 1.05442691, "balance_loss_mlp": 1.02231395, "epoch": 0.2691035892502856, "flos": 17200242132480.0, "grad_norm": 3.503876587831936, "language_loss": 0.75005698, "learning_rate": 3.429824372651886e-06, "loss": 0.7721318, "num_input_tokens_seen": 48054750, "step": 2238, "time_per_iteration": 2.7807703018188477 }, { "auxiliary_loss_clip": 0.01156177, "auxiliary_loss_mlp": 0.01036308, "balance_loss_clip": 1.0519973, "balance_loss_mlp": 1.02585912, "epoch": 0.26922383214092466, "flos": 17747484814080.0, "grad_norm": 2.15517679234154, "language_loss": 0.8361038, "learning_rate": 3.4292795961387732e-06, "loss": 0.85802859, "num_input_tokens_seen": 48072650, "step": 2239, "time_per_iteration": 2.711087226867676 }, { "auxiliary_loss_clip": 0.01205176, "auxiliary_loss_mlp": 0.01033422, "balance_loss_clip": 1.05669045, "balance_loss_mlp": 1.02397442, "epoch": 0.26934407503156377, "flos": 16173376122240.0, "grad_norm": 2.0940325808336517, "language_loss": 0.8762455, "learning_rate": 3.4287346028050818e-06, "loss": 0.89863151, "num_input_tokens_seen": 48088720, "step": 2240, "time_per_iteration": 2.6220645904541016 }, { "auxiliary_loss_clip": 0.01172563, "auxiliary_loss_mlp": 0.01028457, "balance_loss_clip": 1.0520227, "balance_loss_mlp": 1.01971865, "epoch": 0.2694643179222028, "flos": 23732895715200.0, "grad_norm": 1.5767410987140882, "language_loss": 0.79628795, "learning_rate": 3.4281893927334866e-06, "loss": 0.8182981, "num_input_tokens_seen": 48108630, "step": 2241, "time_per_iteration": 2.731638193130493 }, { "auxiliary_loss_clip": 0.0119096, "auxiliary_loss_mlp": 0.01029256, "balance_loss_clip": 1.05524337, "balance_loss_mlp": 1.02008331, "epoch": 0.26958456081284193, "flos": 24718140840960.0, "grad_norm": 1.984485092082006, "language_loss": 0.75532472, "learning_rate": 3.4276439660066963e-06, "loss": 0.77752686, "num_input_tokens_seen": 48128330, "step": 2242, "time_per_iteration": 2.674990177154541 }, { "auxiliary_loss_clip": 0.01204347, "auxiliary_loss_mlp": 0.0103475, "balance_loss_clip": 1.05775428, "balance_loss_mlp": 1.02568984, "epoch": 0.26970480370348104, "flos": 18112588606080.0, "grad_norm": 2.7755946024011613, "language_loss": 0.84265792, "learning_rate": 3.427098322707452e-06, "loss": 0.86504889, "num_input_tokens_seen": 48144295, "step": 2243, "time_per_iteration": 2.563952684402466 }, { "auxiliary_loss_clip": 0.01194117, "auxiliary_loss_mlp": 0.01037633, "balance_loss_clip": 1.06196141, "balance_loss_mlp": 1.02812016, "epoch": 0.2698250465941201, "flos": 10816546250880.0, "grad_norm": 2.492226179288465, "language_loss": 0.89648795, "learning_rate": 3.426552462918526e-06, "loss": 0.91880548, "num_input_tokens_seen": 48162230, "step": 2244, "time_per_iteration": 2.6364941596984863 }, { "auxiliary_loss_clip": 0.01204324, "auxiliary_loss_mlp": 0.01031565, "balance_loss_clip": 1.05976498, "balance_loss_mlp": 1.02199209, "epoch": 0.2699452894847592, "flos": 17308117653120.0, "grad_norm": 2.253748281086285, "language_loss": 0.73136568, "learning_rate": 3.426006386722726e-06, "loss": 0.75372458, "num_input_tokens_seen": 48180290, "step": 2245, "time_per_iteration": 2.5815622806549072 }, { "auxiliary_loss_clip": 0.01169408, "auxiliary_loss_mlp": 0.0104032, "balance_loss_clip": 1.059587, "balance_loss_mlp": 1.03028297, "epoch": 0.2700655323753983, "flos": 18078150441600.0, "grad_norm": 1.8702319731934123, "language_loss": 0.92366099, "learning_rate": 3.4254600942028914e-06, "loss": 0.94575828, "num_input_tokens_seen": 48198165, "step": 2246, "time_per_iteration": 2.705512523651123 }, { "auxiliary_loss_clip": 0.01171411, "auxiliary_loss_mlp": 0.0103686, "balance_loss_clip": 1.05578923, "balance_loss_mlp": 1.02835476, "epoch": 0.2701857752660374, "flos": 18186636493440.0, "grad_norm": 2.4974395671781133, "language_loss": 0.82603467, "learning_rate": 3.424913585441893e-06, "loss": 0.84811741, "num_input_tokens_seen": 48216000, "step": 2247, "time_per_iteration": 2.63370680809021 }, { "auxiliary_loss_clip": 0.0118764, "auxiliary_loss_mlp": 0.01032383, "balance_loss_clip": 1.05633914, "balance_loss_mlp": 1.02319753, "epoch": 0.2703060181566765, "flos": 16319496648960.0, "grad_norm": 3.618570510720264, "language_loss": 0.8777504, "learning_rate": 3.4243668605226374e-06, "loss": 0.89995056, "num_input_tokens_seen": 48233025, "step": 2248, "time_per_iteration": 2.62382435798645 }, { "auxiliary_loss_clip": 0.01163402, "auxiliary_loss_mlp": 0.01037847, "balance_loss_clip": 1.05673623, "balance_loss_mlp": 1.02748132, "epoch": 0.2704262610473156, "flos": 19572357329280.0, "grad_norm": 2.5118718628518013, "language_loss": 0.82806599, "learning_rate": 3.423819919528061e-06, "loss": 0.85007852, "num_input_tokens_seen": 48251110, "step": 2249, "time_per_iteration": 3.8477587699890137 }, { "auxiliary_loss_clip": 0.01147069, "auxiliary_loss_mlp": 0.01027972, "balance_loss_clip": 1.04655266, "balance_loss_mlp": 1.01904321, "epoch": 0.27054650393795465, "flos": 20740746925440.0, "grad_norm": 1.7716723723822048, "language_loss": 0.78478092, "learning_rate": 3.4232727625411355e-06, "loss": 0.80653137, "num_input_tokens_seen": 48270215, "step": 2250, "time_per_iteration": 2.767078399658203 }, { "auxiliary_loss_clip": 0.01123326, "auxiliary_loss_mlp": 0.01031635, "balance_loss_clip": 1.0472697, "balance_loss_mlp": 1.02268267, "epoch": 0.27066674682859376, "flos": 18658322916480.0, "grad_norm": 1.8022671458723503, "language_loss": 0.868294, "learning_rate": 3.4227253896448626e-06, "loss": 0.88984358, "num_input_tokens_seen": 48288075, "step": 2251, "time_per_iteration": 2.787029504776001 }, { "auxiliary_loss_clip": 0.01204036, "auxiliary_loss_mlp": 0.01030839, "balance_loss_clip": 1.05746925, "balance_loss_mlp": 1.0222857, "epoch": 0.2707869897192329, "flos": 23002759958400.0, "grad_norm": 2.3350177002462567, "language_loss": 0.82068485, "learning_rate": 3.42217780092228e-06, "loss": 0.84303367, "num_input_tokens_seen": 48306415, "step": 2252, "time_per_iteration": 2.6354494094848633 }, { "auxiliary_loss_clip": 0.01075377, "auxiliary_loss_mlp": 0.01004319, "balance_loss_clip": 1.02228248, "balance_loss_mlp": 1.00208938, "epoch": 0.27090723260987193, "flos": 58323240293760.0, "grad_norm": 0.8024672355690275, "language_loss": 0.60266376, "learning_rate": 3.421629996456456e-06, "loss": 0.62346077, "num_input_tokens_seen": 48365035, "step": 2253, "time_per_iteration": 4.105928659439087 }, { "auxiliary_loss_clip": 0.01185328, "auxiliary_loss_mlp": 0.01035695, "balance_loss_clip": 1.05395365, "balance_loss_mlp": 1.02565181, "epoch": 0.27102747550051104, "flos": 11984540797440.0, "grad_norm": 2.072238863216963, "language_loss": 0.82984418, "learning_rate": 3.421081976330491e-06, "loss": 0.85205448, "num_input_tokens_seen": 48383550, "step": 2254, "time_per_iteration": 2.667833089828491 }, { "auxiliary_loss_clip": 0.01170353, "auxiliary_loss_mlp": 0.01044033, "balance_loss_clip": 1.05175066, "balance_loss_mlp": 1.03511012, "epoch": 0.27114771839115015, "flos": 19900401264000.0, "grad_norm": 2.9279864294154123, "language_loss": 0.87830806, "learning_rate": 3.4205337406275207e-06, "loss": 0.9004519, "num_input_tokens_seen": 48403670, "step": 2255, "time_per_iteration": 2.7073915004730225 }, { "auxiliary_loss_clip": 0.01200841, "auxiliary_loss_mlp": 0.01034671, "balance_loss_clip": 1.05403852, "balance_loss_mlp": 1.02533078, "epoch": 0.2712679612817892, "flos": 18331966920960.0, "grad_norm": 3.6973722134784395, "language_loss": 0.75817657, "learning_rate": 3.4199852894307114e-06, "loss": 0.78053164, "num_input_tokens_seen": 48420420, "step": 2256, "time_per_iteration": 4.436914920806885 }, { "auxiliary_loss_clip": 0.01130749, "auxiliary_loss_mlp": 0.01035851, "balance_loss_clip": 1.05104673, "balance_loss_mlp": 1.02642798, "epoch": 0.2713882041724283, "flos": 24460302038400.0, "grad_norm": 2.020935595226801, "language_loss": 0.78907758, "learning_rate": 3.419436622823262e-06, "loss": 0.81074357, "num_input_tokens_seen": 48441140, "step": 2257, "time_per_iteration": 2.8501999378204346 }, { "auxiliary_loss_clip": 0.01171159, "auxiliary_loss_mlp": 0.01029118, "balance_loss_clip": 1.0516876, "balance_loss_mlp": 1.02015352, "epoch": 0.27150844706306737, "flos": 23039317025280.0, "grad_norm": 1.5244851836775115, "language_loss": 0.74349898, "learning_rate": 3.4188877408884063e-06, "loss": 0.76550168, "num_input_tokens_seen": 48461845, "step": 2258, "time_per_iteration": 2.9187352657318115 }, { "auxiliary_loss_clip": 0.01165911, "auxiliary_loss_mlp": 0.01025104, "balance_loss_clip": 1.05272889, "balance_loss_mlp": 1.01618743, "epoch": 0.2716286899537065, "flos": 22563644192640.0, "grad_norm": 3.133842791265892, "language_loss": 0.6529209, "learning_rate": 3.4183386437094088e-06, "loss": 0.67483103, "num_input_tokens_seen": 48478510, "step": 2259, "time_per_iteration": 2.6737172603607178 }, { "auxiliary_loss_clip": 0.01175572, "auxiliary_loss_mlp": 0.01032411, "balance_loss_clip": 1.05327082, "balance_loss_mlp": 1.02337503, "epoch": 0.2717489328443456, "flos": 13115044523520.0, "grad_norm": 2.3353183897836, "language_loss": 0.82491642, "learning_rate": 3.417789331369565e-06, "loss": 0.84699625, "num_input_tokens_seen": 48494300, "step": 2260, "time_per_iteration": 2.7134249210357666 }, { "auxiliary_loss_clip": 0.01207014, "auxiliary_loss_mlp": 0.01034179, "balance_loss_clip": 1.0597049, "balance_loss_mlp": 1.02481556, "epoch": 0.27186917573498465, "flos": 29278688060160.0, "grad_norm": 2.0414070262122004, "language_loss": 0.91550297, "learning_rate": 3.4172398039522088e-06, "loss": 0.93791497, "num_input_tokens_seen": 48515585, "step": 2261, "time_per_iteration": 2.7078797817230225 }, { "auxiliary_loss_clip": 0.01189596, "auxiliary_loss_mlp": 0.01031936, "balance_loss_clip": 1.05665159, "balance_loss_mlp": 1.02305484, "epoch": 0.27198941862562376, "flos": 26032220000640.0, "grad_norm": 1.8310170756824937, "language_loss": 0.80002964, "learning_rate": 3.4166900615407e-06, "loss": 0.82224488, "num_input_tokens_seen": 48533500, "step": 2262, "time_per_iteration": 2.6920950412750244 }, { "auxiliary_loss_clip": 0.01185497, "auxiliary_loss_mlp": 0.01032027, "balance_loss_clip": 1.05559468, "balance_loss_mlp": 1.02317572, "epoch": 0.27210966151626287, "flos": 32780983760640.0, "grad_norm": 2.225004896837347, "language_loss": 0.74870867, "learning_rate": 3.416140104218436e-06, "loss": 0.77088392, "num_input_tokens_seen": 48552865, "step": 2263, "time_per_iteration": 2.7059764862060547 }, { "auxiliary_loss_clip": 0.0107623, "auxiliary_loss_mlp": 0.00755436, "balance_loss_clip": 1.02033305, "balance_loss_mlp": 1.00021708, "epoch": 0.2722299044069019, "flos": 65471043219840.0, "grad_norm": 0.8439753316452376, "language_loss": 0.69589019, "learning_rate": 3.4155899320688437e-06, "loss": 0.71420681, "num_input_tokens_seen": 48618940, "step": 2264, "time_per_iteration": 3.2804648876190186 }, { "auxiliary_loss_clip": 0.01131037, "auxiliary_loss_mlp": 0.01031735, "balance_loss_clip": 1.05193293, "balance_loss_mlp": 1.02236474, "epoch": 0.27235014729754103, "flos": 15334143782400.0, "grad_norm": 2.355352376633204, "language_loss": 0.74075484, "learning_rate": 3.415039545175384e-06, "loss": 0.76238251, "num_input_tokens_seen": 48634665, "step": 2265, "time_per_iteration": 2.8144209384918213 }, { "auxiliary_loss_clip": 0.01189467, "auxiliary_loss_mlp": 0.01036904, "balance_loss_clip": 1.05561638, "balance_loss_mlp": 1.0285002, "epoch": 0.27247039018818014, "flos": 21872363973120.0, "grad_norm": 2.3401154833609805, "language_loss": 0.653283, "learning_rate": 3.414488943621551e-06, "loss": 0.67554677, "num_input_tokens_seen": 48653330, "step": 2266, "time_per_iteration": 2.6721744537353516 }, { "auxiliary_loss_clip": 0.0118707, "auxiliary_loss_mlp": 0.01037487, "balance_loss_clip": 1.05684018, "balance_loss_mlp": 1.02828372, "epoch": 0.2725906330788192, "flos": 18695490514560.0, "grad_norm": 1.8646913072600009, "language_loss": 0.7355082, "learning_rate": 3.41393812749087e-06, "loss": 0.75775373, "num_input_tokens_seen": 48671375, "step": 2267, "time_per_iteration": 2.605172872543335 }, { "auxiliary_loss_clip": 0.01173285, "auxiliary_loss_mlp": 0.01032521, "balance_loss_clip": 1.05768967, "balance_loss_mlp": 1.02340794, "epoch": 0.2727108759694583, "flos": 17886099398400.0, "grad_norm": 2.7747946154326737, "language_loss": 0.71699882, "learning_rate": 3.4133870968668984e-06, "loss": 0.73905689, "num_input_tokens_seen": 48686175, "step": 2268, "time_per_iteration": 2.684117078781128 }, { "auxiliary_loss_clip": 0.01176263, "auxiliary_loss_mlp": 0.01033818, "balance_loss_clip": 1.05631661, "balance_loss_mlp": 1.02388823, "epoch": 0.2728311188600974, "flos": 24461666755200.0, "grad_norm": 2.042080036318221, "language_loss": 0.78988075, "learning_rate": 3.412835851833229e-06, "loss": 0.81198156, "num_input_tokens_seen": 48708370, "step": 2269, "time_per_iteration": 2.705493450164795 }, { "auxiliary_loss_clip": 0.01183614, "auxiliary_loss_mlp": 0.01038441, "balance_loss_clip": 1.05600286, "balance_loss_mlp": 1.03035855, "epoch": 0.2729513617507365, "flos": 30993314757120.0, "grad_norm": 2.129643050326694, "language_loss": 0.78208184, "learning_rate": 3.4122843924734834e-06, "loss": 0.80430245, "num_input_tokens_seen": 48730670, "step": 2270, "time_per_iteration": 2.7089879512786865 }, { "auxiliary_loss_clip": 0.01164888, "auxiliary_loss_mlp": 0.01029413, "balance_loss_clip": 1.05095243, "balance_loss_mlp": 1.02033544, "epoch": 0.2730716046413756, "flos": 19094637421440.0, "grad_norm": 2.0168288392842606, "language_loss": 0.8827309, "learning_rate": 3.411732718871319e-06, "loss": 0.90467387, "num_input_tokens_seen": 48746510, "step": 2271, "time_per_iteration": 2.6455936431884766 }, { "auxiliary_loss_clip": 0.0120326, "auxiliary_loss_mlp": 0.01032228, "balance_loss_clip": 1.06216168, "balance_loss_mlp": 1.02356124, "epoch": 0.27319184753201464, "flos": 26944566474240.0, "grad_norm": 2.022940801363115, "language_loss": 0.78935623, "learning_rate": 3.4111808311104227e-06, "loss": 0.81171107, "num_input_tokens_seen": 48768825, "step": 2272, "time_per_iteration": 2.6933422088623047 }, { "auxiliary_loss_clip": 0.01180154, "auxiliary_loss_mlp": 0.01030921, "balance_loss_clip": 1.05394912, "balance_loss_mlp": 1.02149749, "epoch": 0.27331209042265375, "flos": 31759828012800.0, "grad_norm": 1.9813013230545355, "language_loss": 0.69558036, "learning_rate": 3.410628729274517e-06, "loss": 0.71769106, "num_input_tokens_seen": 48790345, "step": 2273, "time_per_iteration": 2.743873119354248 }, { "auxiliary_loss_clip": 0.0116834, "auxiliary_loss_mlp": 0.00763334, "balance_loss_clip": 1.05335879, "balance_loss_mlp": 1.00037646, "epoch": 0.27343233331329286, "flos": 25739081107200.0, "grad_norm": 1.995230637962598, "language_loss": 0.828354, "learning_rate": 3.4100764134473546e-06, "loss": 0.84767079, "num_input_tokens_seen": 48809630, "step": 2274, "time_per_iteration": 2.7149124145507812 }, { "auxiliary_loss_clip": 0.01202647, "auxiliary_loss_mlp": 0.01037756, "balance_loss_clip": 1.05793571, "balance_loss_mlp": 1.02884531, "epoch": 0.2735525762039319, "flos": 24389414547840.0, "grad_norm": 2.3712941154444547, "language_loss": 0.84656417, "learning_rate": 3.4095238837127215e-06, "loss": 0.86896819, "num_input_tokens_seen": 48828770, "step": 2275, "time_per_iteration": 3.568774938583374 }, { "auxiliary_loss_clip": 0.01153155, "auxiliary_loss_mlp": 0.01032735, "balance_loss_clip": 1.0503633, "balance_loss_mlp": 1.02365732, "epoch": 0.27367281909457103, "flos": 14465357527680.0, "grad_norm": 2.069719949048533, "language_loss": 0.79552901, "learning_rate": 3.4089711401544355e-06, "loss": 0.81738794, "num_input_tokens_seen": 48846365, "step": 2276, "time_per_iteration": 2.6595826148986816 }, { "auxiliary_loss_clip": 0.01186803, "auxiliary_loss_mlp": 0.01032305, "balance_loss_clip": 1.05444407, "balance_loss_mlp": 1.02267241, "epoch": 0.27379306198521014, "flos": 23476996247040.0, "grad_norm": 2.571770339442371, "language_loss": 0.67240494, "learning_rate": 3.4084181828563486e-06, "loss": 0.69459605, "num_input_tokens_seen": 48863085, "step": 2277, "time_per_iteration": 2.74448823928833 }, { "auxiliary_loss_clip": 0.01140748, "auxiliary_loss_mlp": 0.01031877, "balance_loss_clip": 1.05022073, "balance_loss_mlp": 1.02293611, "epoch": 0.2739133048758492, "flos": 17458152762240.0, "grad_norm": 1.8460070607065087, "language_loss": 0.70763659, "learning_rate": 3.4078650119023428e-06, "loss": 0.72936285, "num_input_tokens_seen": 48881400, "step": 2278, "time_per_iteration": 2.705878496170044 }, { "auxiliary_loss_clip": 0.01128034, "auxiliary_loss_mlp": 0.01041348, "balance_loss_clip": 1.04487729, "balance_loss_mlp": 1.03194284, "epoch": 0.2740335477664883, "flos": 19273113123840.0, "grad_norm": 2.2697561567033633, "language_loss": 0.74011034, "learning_rate": 3.4073116273763337e-06, "loss": 0.76180416, "num_input_tokens_seen": 48895845, "step": 2279, "time_per_iteration": 3.639044761657715 }, { "auxiliary_loss_clip": 0.01176851, "auxiliary_loss_mlp": 0.01028069, "balance_loss_clip": 1.0533421, "balance_loss_mlp": 1.01866364, "epoch": 0.2741537906571274, "flos": 26104723603200.0, "grad_norm": 1.7712504389927055, "language_loss": 0.81206346, "learning_rate": 3.40675802936227e-06, "loss": 0.83411264, "num_input_tokens_seen": 48916630, "step": 2280, "time_per_iteration": 2.771557092666626 }, { "auxiliary_loss_clip": 0.01164632, "auxiliary_loss_mlp": 0.01030056, "balance_loss_clip": 1.05222321, "balance_loss_mlp": 1.02044153, "epoch": 0.27427403354776647, "flos": 34164190644480.0, "grad_norm": 2.1171290157856855, "language_loss": 0.72074854, "learning_rate": 3.4062042179441318e-06, "loss": 0.74269545, "num_input_tokens_seen": 48937100, "step": 2281, "time_per_iteration": 3.7796597480773926 }, { "auxiliary_loss_clip": 0.01186476, "auxiliary_loss_mlp": 0.01026508, "balance_loss_clip": 1.05874896, "balance_loss_mlp": 1.01798475, "epoch": 0.2743942764384056, "flos": 18766988536320.0, "grad_norm": 1.9256935997915743, "language_loss": 0.80767041, "learning_rate": 3.4056501932059314e-06, "loss": 0.82980025, "num_input_tokens_seen": 48955175, "step": 2282, "time_per_iteration": 3.6783294677734375 }, { "auxiliary_loss_clip": 0.01102571, "auxiliary_loss_mlp": 0.0100744, "balance_loss_clip": 1.02294993, "balance_loss_mlp": 1.0054251, "epoch": 0.2745145193290447, "flos": 64904048058240.0, "grad_norm": 0.7778266523865007, "language_loss": 0.58028936, "learning_rate": 3.405095955231715e-06, "loss": 0.60138941, "num_input_tokens_seen": 49006830, "step": 2283, "time_per_iteration": 3.1335926055908203 }, { "auxiliary_loss_clip": 0.0119102, "auxiliary_loss_mlp": 0.0103371, "balance_loss_clip": 1.05309367, "balance_loss_mlp": 1.02508521, "epoch": 0.27463476221968375, "flos": 16136926796160.0, "grad_norm": 3.021598428508459, "language_loss": 0.94548357, "learning_rate": 3.4045415041055585e-06, "loss": 0.96773088, "num_input_tokens_seen": 49022470, "step": 2284, "time_per_iteration": 2.6129205226898193 }, { "auxiliary_loss_clip": 0.01176211, "auxiliary_loss_mlp": 0.01032848, "balance_loss_clip": 1.05369663, "balance_loss_mlp": 1.02412748, "epoch": 0.27475500511032286, "flos": 10376712213120.0, "grad_norm": 6.768879137803772, "language_loss": 0.78925574, "learning_rate": 3.4039868399115728e-06, "loss": 0.81134629, "num_input_tokens_seen": 49037110, "step": 2285, "time_per_iteration": 2.6596314907073975 }, { "auxiliary_loss_clip": 0.01132711, "auxiliary_loss_mlp": 0.01032818, "balance_loss_clip": 1.05136299, "balance_loss_mlp": 1.02344251, "epoch": 0.27487524800096197, "flos": 17311062568320.0, "grad_norm": 2.0460277080554867, "language_loss": 0.80511183, "learning_rate": 3.4034319627339003e-06, "loss": 0.82676709, "num_input_tokens_seen": 49053975, "step": 2286, "time_per_iteration": 2.745696783065796 }, { "auxiliary_loss_clip": 0.01177862, "auxiliary_loss_mlp": 0.01034363, "balance_loss_clip": 1.05641699, "balance_loss_mlp": 1.02574372, "epoch": 0.274995490891601, "flos": 27120205002240.0, "grad_norm": 4.2806835233643294, "language_loss": 0.70085186, "learning_rate": 3.402876872656715e-06, "loss": 0.72297406, "num_input_tokens_seen": 49072295, "step": 2287, "time_per_iteration": 2.748152017593384 }, { "auxiliary_loss_clip": 0.01171177, "auxiliary_loss_mlp": 0.01033235, "balance_loss_clip": 1.05601978, "balance_loss_mlp": 1.02383542, "epoch": 0.27511573378224013, "flos": 23436093634560.0, "grad_norm": 2.046667845909836, "language_loss": 0.89463043, "learning_rate": 3.402321569764223e-06, "loss": 0.91667455, "num_input_tokens_seen": 49091600, "step": 2288, "time_per_iteration": 2.6951262950897217 }, { "auxiliary_loss_clip": 0.01144561, "auxiliary_loss_mlp": 0.00764761, "balance_loss_clip": 1.05102849, "balance_loss_mlp": 1.00059903, "epoch": 0.2752359766728792, "flos": 16722019434240.0, "grad_norm": 1.966592425854624, "language_loss": 0.83729303, "learning_rate": 3.4017660541406635e-06, "loss": 0.85638624, "num_input_tokens_seen": 49107665, "step": 2289, "time_per_iteration": 2.7898805141448975 }, { "auxiliary_loss_clip": 0.01178135, "auxiliary_loss_mlp": 0.01029645, "balance_loss_clip": 1.05188024, "balance_loss_mlp": 1.01997662, "epoch": 0.2753562195635183, "flos": 25297738698240.0, "grad_norm": 1.7808938890664154, "language_loss": 0.74668616, "learning_rate": 3.4012103258703092e-06, "loss": 0.76876396, "num_input_tokens_seen": 49126420, "step": 2290, "time_per_iteration": 2.7106032371520996 }, { "auxiliary_loss_clip": 0.01157952, "auxiliary_loss_mlp": 0.01033364, "balance_loss_clip": 1.05237091, "balance_loss_mlp": 1.02354109, "epoch": 0.2754764624541574, "flos": 27338972785920.0, "grad_norm": 1.9154796267311263, "language_loss": 0.83065927, "learning_rate": 3.4006543850374616e-06, "loss": 0.85257244, "num_input_tokens_seen": 49141470, "step": 2291, "time_per_iteration": 2.7091152667999268 }, { "auxiliary_loss_clip": 0.01188049, "auxiliary_loss_mlp": 0.01034514, "balance_loss_clip": 1.05309689, "balance_loss_mlp": 1.02522206, "epoch": 0.27559670534479647, "flos": 17238379397760.0, "grad_norm": 2.0387133373870276, "language_loss": 0.75360715, "learning_rate": 3.400098231726458e-06, "loss": 0.77583277, "num_input_tokens_seen": 49158570, "step": 2292, "time_per_iteration": 2.632749557495117 }, { "auxiliary_loss_clip": 0.0116225, "auxiliary_loss_mlp": 0.01025496, "balance_loss_clip": 1.05011165, "balance_loss_mlp": 1.01607227, "epoch": 0.2757169482354356, "flos": 21939085486080.0, "grad_norm": 2.2279482624046527, "language_loss": 0.86742681, "learning_rate": 3.3995418660216657e-06, "loss": 0.88930422, "num_input_tokens_seen": 49176025, "step": 2293, "time_per_iteration": 2.8080203533172607 }, { "auxiliary_loss_clip": 0.01209449, "auxiliary_loss_mlp": 0.01034496, "balance_loss_clip": 1.06007719, "balance_loss_mlp": 1.02443516, "epoch": 0.2758371911260747, "flos": 20850669521280.0, "grad_norm": 2.262884773003336, "language_loss": 0.80434501, "learning_rate": 3.3989852880074848e-06, "loss": 0.82678449, "num_input_tokens_seen": 49197455, "step": 2294, "time_per_iteration": 2.7196407318115234 }, { "auxiliary_loss_clip": 0.01083828, "auxiliary_loss_mlp": 0.01004149, "balance_loss_clip": 1.02812636, "balance_loss_mlp": 1.0021466, "epoch": 0.27595743401671374, "flos": 69269063592960.0, "grad_norm": 0.7476867123627904, "language_loss": 0.60615277, "learning_rate": 3.398428497768348e-06, "loss": 0.62703252, "num_input_tokens_seen": 49262625, "step": 2295, "time_per_iteration": 3.3849334716796875 }, { "auxiliary_loss_clip": 0.01166101, "auxiliary_loss_mlp": 0.01029497, "balance_loss_clip": 1.05083323, "balance_loss_mlp": 1.02059245, "epoch": 0.27607767690735285, "flos": 21215019127680.0, "grad_norm": 1.8463181454058246, "language_loss": 0.7201854, "learning_rate": 3.3978714953887205e-06, "loss": 0.74214137, "num_input_tokens_seen": 49282380, "step": 2296, "time_per_iteration": 2.7992069721221924 }, { "auxiliary_loss_clip": 0.01128037, "auxiliary_loss_mlp": 0.01033578, "balance_loss_clip": 1.04570699, "balance_loss_mlp": 1.02444053, "epoch": 0.27619791979799196, "flos": 24825334003200.0, "grad_norm": 1.801660517983336, "language_loss": 0.86351573, "learning_rate": 3.397314280953098e-06, "loss": 0.88513184, "num_input_tokens_seen": 49303205, "step": 2297, "time_per_iteration": 2.803788423538208 }, { "auxiliary_loss_clip": 0.01164808, "auxiliary_loss_mlp": 0.01036623, "balance_loss_clip": 1.05086195, "balance_loss_mlp": 1.02835631, "epoch": 0.276318162688631, "flos": 24753548672640.0, "grad_norm": 3.550408142048715, "language_loss": 0.80528313, "learning_rate": 3.3967568545460108e-06, "loss": 0.82729745, "num_input_tokens_seen": 49322745, "step": 2298, "time_per_iteration": 2.7134318351745605 }, { "auxiliary_loss_clip": 0.01182831, "auxiliary_loss_mlp": 0.01032407, "balance_loss_clip": 1.05352747, "balance_loss_mlp": 1.02340674, "epoch": 0.27643840557927013, "flos": 18150007599360.0, "grad_norm": 1.9498801075970262, "language_loss": 0.80673146, "learning_rate": 3.3961992162520185e-06, "loss": 0.82888377, "num_input_tokens_seen": 49341370, "step": 2299, "time_per_iteration": 2.6921231746673584 }, { "auxiliary_loss_clip": 0.0118633, "auxiliary_loss_mlp": 0.01032919, "balance_loss_clip": 1.05384958, "balance_loss_mlp": 1.02372825, "epoch": 0.27655864846990924, "flos": 24823933372800.0, "grad_norm": 2.876944248995244, "language_loss": 0.72201419, "learning_rate": 3.3956413661557156e-06, "loss": 0.74420667, "num_input_tokens_seen": 49361545, "step": 2300, "time_per_iteration": 2.660743474960327 }, { "auxiliary_loss_clip": 0.01165615, "auxiliary_loss_mlp": 0.01034657, "balance_loss_clip": 1.05230045, "balance_loss_mlp": 1.02528095, "epoch": 0.2766788913605483, "flos": 20266582464000.0, "grad_norm": 2.6109510457264475, "language_loss": 0.66770899, "learning_rate": 3.3950833043417273e-06, "loss": 0.68971169, "num_input_tokens_seen": 49379690, "step": 2301, "time_per_iteration": 3.641766309738159 }, { "auxiliary_loss_clip": 0.01192406, "auxiliary_loss_mlp": 0.010304, "balance_loss_clip": 1.059659, "balance_loss_mlp": 1.02082181, "epoch": 0.2767991342511874, "flos": 21470272151040.0, "grad_norm": 3.3700659403415996, "language_loss": 0.74108267, "learning_rate": 3.3945250308947105e-06, "loss": 0.76331079, "num_input_tokens_seen": 49395995, "step": 2302, "time_per_iteration": 2.651050090789795 }, { "auxiliary_loss_clip": 0.01090723, "auxiliary_loss_mlp": 0.01002214, "balance_loss_clip": 1.02027011, "balance_loss_mlp": 1.00011599, "epoch": 0.2769193771418265, "flos": 66002627571840.0, "grad_norm": 1.2237202648789334, "language_loss": 0.68300402, "learning_rate": 3.3939665458993556e-06, "loss": 0.70393342, "num_input_tokens_seen": 49450415, "step": 2303, "time_per_iteration": 3.1721460819244385 }, { "auxiliary_loss_clip": 0.0116337, "auxiliary_loss_mlp": 0.01030969, "balance_loss_clip": 1.05034792, "balance_loss_mlp": 1.02140808, "epoch": 0.27703962003246557, "flos": 20704441253760.0, "grad_norm": 2.0023148307354246, "language_loss": 0.76907074, "learning_rate": 3.3934078494403843e-06, "loss": 0.79101419, "num_input_tokens_seen": 49469990, "step": 2304, "time_per_iteration": 2.7728493213653564 }, { "auxiliary_loss_clip": 0.01111336, "auxiliary_loss_mlp": 0.00764713, "balance_loss_clip": 1.04737234, "balance_loss_mlp": 1.00050557, "epoch": 0.2771598629231047, "flos": 22929897219840.0, "grad_norm": 1.9987916024747108, "language_loss": 0.81838477, "learning_rate": 3.3928489416025495e-06, "loss": 0.83714527, "num_input_tokens_seen": 49490835, "step": 2305, "time_per_iteration": 3.7889559268951416 }, { "auxiliary_loss_clip": 0.0117374, "auxiliary_loss_mlp": 0.01034953, "balance_loss_clip": 1.05447483, "balance_loss_mlp": 1.02508235, "epoch": 0.27728010581374374, "flos": 18369457741440.0, "grad_norm": 2.308304761136325, "language_loss": 0.79066783, "learning_rate": 3.392289822470638e-06, "loss": 0.81275481, "num_input_tokens_seen": 49508815, "step": 2306, "time_per_iteration": 2.9027576446533203 }, { "auxiliary_loss_clip": 0.01171264, "auxiliary_loss_mlp": 0.01027936, "balance_loss_clip": 1.05332255, "balance_loss_mlp": 1.01851285, "epoch": 0.27740034870438285, "flos": 19427637432960.0, "grad_norm": 2.0856969119466915, "language_loss": 0.75945306, "learning_rate": 3.3917304921294674e-06, "loss": 0.78144509, "num_input_tokens_seen": 49526980, "step": 2307, "time_per_iteration": 3.660510778427124 }, { "auxiliary_loss_clip": 0.01188626, "auxiliary_loss_mlp": 0.01033141, "balance_loss_clip": 1.05529594, "balance_loss_mlp": 1.02359819, "epoch": 0.27752059159502196, "flos": 21614776565760.0, "grad_norm": 5.274908796967043, "language_loss": 0.80978274, "learning_rate": 3.3911709506638876e-06, "loss": 0.83200037, "num_input_tokens_seen": 49546290, "step": 2308, "time_per_iteration": 2.675731897354126 }, { "auxiliary_loss_clip": 0.01146595, "auxiliary_loss_mlp": 0.00763574, "balance_loss_clip": 1.04698062, "balance_loss_mlp": 1.00046074, "epoch": 0.277640834485661, "flos": 26608011016320.0, "grad_norm": 2.074614842925102, "language_loss": 0.81319058, "learning_rate": 3.390611198158781e-06, "loss": 0.83229226, "num_input_tokens_seen": 49564165, "step": 2309, "time_per_iteration": 2.836418628692627 }, { "auxiliary_loss_clip": 0.01208232, "auxiliary_loss_mlp": 0.01029721, "balance_loss_clip": 1.05940163, "balance_loss_mlp": 1.02025533, "epoch": 0.2777610773763001, "flos": 19492814661120.0, "grad_norm": 2.1799536867259186, "language_loss": 0.90261602, "learning_rate": 3.3900512346990612e-06, "loss": 0.9249956, "num_input_tokens_seen": 49580155, "step": 2310, "time_per_iteration": 2.6711266040802 }, { "auxiliary_loss_clip": 0.0114078, "auxiliary_loss_mlp": 0.01030434, "balance_loss_clip": 1.04602373, "balance_loss_mlp": 1.0208379, "epoch": 0.27788132026693924, "flos": 38290650001920.0, "grad_norm": 1.7731119353029887, "language_loss": 0.66141504, "learning_rate": 3.389491060369674e-06, "loss": 0.68312716, "num_input_tokens_seen": 49605830, "step": 2311, "time_per_iteration": 2.898549795150757 }, { "auxiliary_loss_clip": 0.0113666, "auxiliary_loss_mlp": 0.01043643, "balance_loss_clip": 1.04922903, "balance_loss_mlp": 1.03508377, "epoch": 0.2780015631575783, "flos": 22382546797440.0, "grad_norm": 1.890797936035661, "language_loss": 0.89483529, "learning_rate": 3.388930675255598e-06, "loss": 0.91663837, "num_input_tokens_seen": 49625680, "step": 2312, "time_per_iteration": 2.8286221027374268 }, { "auxiliary_loss_clip": 0.01177237, "auxiliary_loss_mlp": 0.01037616, "balance_loss_clip": 1.05383706, "balance_loss_mlp": 1.02828836, "epoch": 0.2781218060482174, "flos": 12203200840320.0, "grad_norm": 2.362091959738511, "language_loss": 0.78872275, "learning_rate": 3.388370079441843e-06, "loss": 0.8108713, "num_input_tokens_seen": 49641195, "step": 2313, "time_per_iteration": 2.7313311100006104 }, { "auxiliary_loss_clip": 0.01160948, "auxiliary_loss_mlp": 0.01038207, "balance_loss_clip": 1.05470157, "balance_loss_mlp": 1.02882516, "epoch": 0.2782420489388565, "flos": 18107632529280.0, "grad_norm": 2.039637090754787, "language_loss": 0.92850465, "learning_rate": 3.3878092730134505e-06, "loss": 0.9504962, "num_input_tokens_seen": 49659180, "step": 2314, "time_per_iteration": 2.6790573596954346 }, { "auxiliary_loss_clip": 0.01180886, "auxiliary_loss_mlp": 0.01035467, "balance_loss_clip": 1.0542587, "balance_loss_mlp": 1.02597177, "epoch": 0.27836229182949557, "flos": 18514752255360.0, "grad_norm": 2.203977772985784, "language_loss": 0.80386686, "learning_rate": 3.3872482560554947e-06, "loss": 0.82603043, "num_input_tokens_seen": 49677955, "step": 2315, "time_per_iteration": 2.7949914932250977 }, { "auxiliary_loss_clip": 0.01090378, "auxiliary_loss_mlp": 0.01007097, "balance_loss_clip": 1.0202837, "balance_loss_mlp": 1.00515354, "epoch": 0.2784825347201347, "flos": 67079230940160.0, "grad_norm": 0.8073154683488314, "language_loss": 0.56907481, "learning_rate": 3.386687028653082e-06, "loss": 0.59004956, "num_input_tokens_seen": 49740800, "step": 2316, "time_per_iteration": 3.2383580207824707 }, { "auxiliary_loss_clip": 0.01146828, "auxiliary_loss_mlp": 0.01034519, "balance_loss_clip": 1.05074799, "balance_loss_mlp": 1.02506614, "epoch": 0.2786027776107738, "flos": 22631119891200.0, "grad_norm": 1.8556607749606644, "language_loss": 0.85256493, "learning_rate": 3.386125590891349e-06, "loss": 0.87437838, "num_input_tokens_seen": 49757675, "step": 2317, "time_per_iteration": 2.7349190711975098 }, { "auxiliary_loss_clip": 0.01162487, "auxiliary_loss_mlp": 0.01035832, "balance_loss_clip": 1.05173254, "balance_loss_mlp": 1.02696252, "epoch": 0.27872302050141284, "flos": 15778826156160.0, "grad_norm": 2.468117294163745, "language_loss": 0.83242744, "learning_rate": 3.3855639428554657e-06, "loss": 0.85441065, "num_input_tokens_seen": 49775205, "step": 2318, "time_per_iteration": 2.6711957454681396 }, { "auxiliary_loss_clip": 0.01146873, "auxiliary_loss_mlp": 0.01029771, "balance_loss_clip": 1.0492065, "balance_loss_mlp": 1.02134848, "epoch": 0.27884326339205195, "flos": 22126970551680.0, "grad_norm": 2.156973575202536, "language_loss": 0.80043381, "learning_rate": 3.385002084630635e-06, "loss": 0.82220024, "num_input_tokens_seen": 49794175, "step": 2319, "time_per_iteration": 2.672119140625 }, { "auxiliary_loss_clip": 0.01191616, "auxiliary_loss_mlp": 0.01029939, "balance_loss_clip": 1.05629826, "balance_loss_mlp": 1.02042055, "epoch": 0.278963506282691, "flos": 20558715776640.0, "grad_norm": 3.986280193640279, "language_loss": 0.84840155, "learning_rate": 3.384440016302088e-06, "loss": 0.87061715, "num_input_tokens_seen": 49812850, "step": 2320, "time_per_iteration": 2.6638221740722656 }, { "auxiliary_loss_clip": 0.01185444, "auxiliary_loss_mlp": 0.01030785, "balance_loss_clip": 1.05580819, "balance_loss_mlp": 1.02159441, "epoch": 0.2790837491733301, "flos": 21942928241280.0, "grad_norm": 2.3954258695713113, "language_loss": 0.62434393, "learning_rate": 3.3838777379550923e-06, "loss": 0.64650619, "num_input_tokens_seen": 49832295, "step": 2321, "time_per_iteration": 2.6696975231170654 }, { "auxiliary_loss_clip": 0.01177054, "auxiliary_loss_mlp": 0.0102903, "balance_loss_clip": 1.05662096, "balance_loss_mlp": 1.02008331, "epoch": 0.27920399206396923, "flos": 26286790665600.0, "grad_norm": 16.88766770285102, "language_loss": 0.78691518, "learning_rate": 3.383315249674944e-06, "loss": 0.80897605, "num_input_tokens_seen": 49850860, "step": 2322, "time_per_iteration": 2.791940212249756 }, { "auxiliary_loss_clip": 0.01159606, "auxiliary_loss_mlp": 0.01035514, "balance_loss_clip": 1.05357122, "balance_loss_mlp": 1.02638221, "epoch": 0.2793242349546083, "flos": 25400981364480.0, "grad_norm": 4.325402427028171, "language_loss": 0.86075574, "learning_rate": 3.3827525515469715e-06, "loss": 0.882707, "num_input_tokens_seen": 49865765, "step": 2323, "time_per_iteration": 2.7910640239715576 }, { "auxiliary_loss_clip": 0.0114816, "auxiliary_loss_mlp": 0.01029771, "balance_loss_clip": 1.04846537, "balance_loss_mlp": 1.02025843, "epoch": 0.2794444778452474, "flos": 20850346298880.0, "grad_norm": 2.1144659409293363, "language_loss": 0.71305335, "learning_rate": 3.3821896436565367e-06, "loss": 0.7348327, "num_input_tokens_seen": 49885425, "step": 2324, "time_per_iteration": 2.6691856384277344 }, { "auxiliary_loss_clip": 0.0119335, "auxiliary_loss_mlp": 0.01032615, "balance_loss_clip": 1.06107306, "balance_loss_mlp": 1.02369833, "epoch": 0.2795647207358865, "flos": 21576244250880.0, "grad_norm": 1.8169946084430786, "language_loss": 0.69966763, "learning_rate": 3.381626526089032e-06, "loss": 0.72192729, "num_input_tokens_seen": 49904990, "step": 2325, "time_per_iteration": 2.621131420135498 }, { "auxiliary_loss_clip": 0.01168692, "auxiliary_loss_mlp": 0.01033927, "balance_loss_clip": 1.05027902, "balance_loss_mlp": 1.02502847, "epoch": 0.27968496362652556, "flos": 21471744608640.0, "grad_norm": 2.254284981771605, "language_loss": 0.79317021, "learning_rate": 3.3810631989298815e-06, "loss": 0.81519639, "num_input_tokens_seen": 49924600, "step": 2326, "time_per_iteration": 2.670252799987793 }, { "auxiliary_loss_clip": 0.01151497, "auxiliary_loss_mlp": 0.01039679, "balance_loss_clip": 1.05253959, "balance_loss_mlp": 1.02853858, "epoch": 0.2798052065171647, "flos": 23258695340160.0, "grad_norm": 3.3127688821182493, "language_loss": 0.84899724, "learning_rate": 3.3804996622645423e-06, "loss": 0.87090898, "num_input_tokens_seen": 49942600, "step": 2327, "time_per_iteration": 3.829312324523926 }, { "auxiliary_loss_clip": 0.01206231, "auxiliary_loss_mlp": 0.01030882, "balance_loss_clip": 1.05935216, "balance_loss_mlp": 1.02157831, "epoch": 0.2799254494078038, "flos": 21539328048000.0, "grad_norm": 1.771302012413825, "language_loss": 0.89434636, "learning_rate": 3.3799359161785015e-06, "loss": 0.91671753, "num_input_tokens_seen": 49962250, "step": 2328, "time_per_iteration": 2.6131808757781982 }, { "auxiliary_loss_clip": 0.01186839, "auxiliary_loss_mlp": 0.0103392, "balance_loss_clip": 1.05604041, "balance_loss_mlp": 1.02477694, "epoch": 0.28004569229844284, "flos": 26393912000640.0, "grad_norm": 1.895704559922919, "language_loss": 0.85690743, "learning_rate": 3.3793719607572798e-06, "loss": 0.87911505, "num_input_tokens_seen": 49983215, "step": 2329, "time_per_iteration": 2.67622709274292 }, { "auxiliary_loss_clip": 0.01159324, "auxiliary_loss_mlp": 0.01032256, "balance_loss_clip": 1.05124772, "balance_loss_mlp": 1.02271926, "epoch": 0.28016593518908195, "flos": 33547676584320.0, "grad_norm": 5.580542672259064, "language_loss": 0.77181137, "learning_rate": 3.378807796086428e-06, "loss": 0.79372722, "num_input_tokens_seen": 50006075, "step": 2330, "time_per_iteration": 3.6839306354522705 }, { "auxiliary_loss_clip": 0.01205082, "auxiliary_loss_mlp": 0.01030683, "balance_loss_clip": 1.0590477, "balance_loss_mlp": 1.02184379, "epoch": 0.28028617807972106, "flos": 15340823712000.0, "grad_norm": 2.317322704040704, "language_loss": 0.77230871, "learning_rate": 3.37824342225153e-06, "loss": 0.79466641, "num_input_tokens_seen": 50022495, "step": 2331, "time_per_iteration": 2.5543482303619385 }, { "auxiliary_loss_clip": 0.01147018, "auxiliary_loss_mlp": 0.01033185, "balance_loss_clip": 1.05393839, "balance_loss_mlp": 1.02450705, "epoch": 0.2804064209703601, "flos": 25520277409920.0, "grad_norm": 1.863933779593855, "language_loss": 0.77598453, "learning_rate": 3.3776788393382006e-06, "loss": 0.79778659, "num_input_tokens_seen": 50041975, "step": 2332, "time_per_iteration": 2.743546962738037 }, { "auxiliary_loss_clip": 0.01206149, "auxiliary_loss_mlp": 0.0103186, "balance_loss_clip": 1.05940235, "balance_loss_mlp": 1.02194798, "epoch": 0.2805266638609992, "flos": 29351766280320.0, "grad_norm": 7.7931706299215495, "language_loss": 0.77034098, "learning_rate": 3.3771140474320872e-06, "loss": 0.79272103, "num_input_tokens_seen": 50061925, "step": 2333, "time_per_iteration": 3.5931408405303955 }, { "auxiliary_loss_clip": 0.01167686, "auxiliary_loss_mlp": 0.01032477, "balance_loss_clip": 1.05489826, "balance_loss_mlp": 1.02263606, "epoch": 0.28064690675163834, "flos": 21463735875840.0, "grad_norm": 2.4326784765509024, "language_loss": 0.79771459, "learning_rate": 3.3765490466188664e-06, "loss": 0.81971622, "num_input_tokens_seen": 50079325, "step": 2334, "time_per_iteration": 3.6336872577667236 }, { "auxiliary_loss_clip": 0.01159598, "auxiliary_loss_mlp": 0.0103384, "balance_loss_clip": 1.05272365, "balance_loss_mlp": 1.02500677, "epoch": 0.2807671496422774, "flos": 20995640812800.0, "grad_norm": 3.1796229367392663, "language_loss": 0.74202758, "learning_rate": 3.3759838369842508e-06, "loss": 0.76396197, "num_input_tokens_seen": 50097400, "step": 2335, "time_per_iteration": 2.779179334640503 }, { "auxiliary_loss_clip": 0.01162616, "auxiliary_loss_mlp": 0.01025825, "balance_loss_clip": 1.05428648, "balance_loss_mlp": 1.01688981, "epoch": 0.2808873925329165, "flos": 21506577822720.0, "grad_norm": 1.9715535697772997, "language_loss": 0.73194915, "learning_rate": 3.375418418613981e-06, "loss": 0.75383353, "num_input_tokens_seen": 50116425, "step": 2336, "time_per_iteration": 2.7656161785125732 }, { "auxiliary_loss_clip": 0.01175548, "auxiliary_loss_mlp": 0.01030641, "balance_loss_clip": 1.05503392, "balance_loss_mlp": 1.02051425, "epoch": 0.28100763542355556, "flos": 16070815814400.0, "grad_norm": 5.340638372012982, "language_loss": 0.83243579, "learning_rate": 3.374852791593831e-06, "loss": 0.85449767, "num_input_tokens_seen": 50132625, "step": 2337, "time_per_iteration": 2.701455593109131 }, { "auxiliary_loss_clip": 0.01156317, "auxiliary_loss_mlp": 0.01029805, "balance_loss_clip": 1.05219746, "balance_loss_mlp": 1.01972568, "epoch": 0.28112787831419467, "flos": 19062605468160.0, "grad_norm": 3.6624035501222534, "language_loss": 0.54361433, "learning_rate": 3.374286956009605e-06, "loss": 0.56547558, "num_input_tokens_seen": 50151190, "step": 2338, "time_per_iteration": 2.6794064044952393 }, { "auxiliary_loss_clip": 0.01190467, "auxiliary_loss_mlp": 0.01033103, "balance_loss_clip": 1.06100678, "balance_loss_mlp": 1.02422786, "epoch": 0.2812481212048338, "flos": 12823629482880.0, "grad_norm": 2.430558146683905, "language_loss": 0.75278306, "learning_rate": 3.3737209119471405e-06, "loss": 0.77501881, "num_input_tokens_seen": 50167700, "step": 2339, "time_per_iteration": 2.6339893341064453 }, { "auxiliary_loss_clip": 0.01195815, "auxiliary_loss_mlp": 0.01034879, "balance_loss_clip": 1.05815053, "balance_loss_mlp": 1.02458572, "epoch": 0.28136836409547283, "flos": 15633064765440.0, "grad_norm": 3.08269738292447, "language_loss": 0.63587224, "learning_rate": 3.373154659492306e-06, "loss": 0.65817922, "num_input_tokens_seen": 50185840, "step": 2340, "time_per_iteration": 2.5992653369903564 }, { "auxiliary_loss_clip": 0.01180818, "auxiliary_loss_mlp": 0.01031229, "balance_loss_clip": 1.05659413, "balance_loss_mlp": 1.02147818, "epoch": 0.28148860698611194, "flos": 19933726106880.0, "grad_norm": 1.8227994109788508, "language_loss": 0.85256785, "learning_rate": 3.3725881987310016e-06, "loss": 0.87468833, "num_input_tokens_seen": 50203375, "step": 2341, "time_per_iteration": 2.6649835109710693 }, { "auxiliary_loss_clip": 0.01174456, "auxiliary_loss_mlp": 0.01031597, "balance_loss_clip": 1.05467331, "balance_loss_mlp": 1.0221082, "epoch": 0.28160884987675106, "flos": 17457219008640.0, "grad_norm": 1.8676261489450174, "language_loss": 0.87863839, "learning_rate": 3.372021529749159e-06, "loss": 0.9006989, "num_input_tokens_seen": 50222435, "step": 2342, "time_per_iteration": 2.6611344814300537 }, { "auxiliary_loss_clip": 0.01135256, "auxiliary_loss_mlp": 0.01038057, "balance_loss_clip": 1.05363441, "balance_loss_mlp": 1.02821612, "epoch": 0.2817290927673901, "flos": 16834743290880.0, "grad_norm": 4.108949708462741, "language_loss": 0.92194033, "learning_rate": 3.3714546526327405e-06, "loss": 0.94367349, "num_input_tokens_seen": 50240435, "step": 2343, "time_per_iteration": 2.71604585647583 }, { "auxiliary_loss_clip": 0.01165799, "auxiliary_loss_mlp": 0.01035659, "balance_loss_clip": 1.05146134, "balance_loss_mlp": 1.02593756, "epoch": 0.2818493356580292, "flos": 15414081500160.0, "grad_norm": 2.1266252811715276, "language_loss": 0.88349807, "learning_rate": 3.3708875674677423e-06, "loss": 0.90551269, "num_input_tokens_seen": 50258410, "step": 2344, "time_per_iteration": 2.7140326499938965 }, { "auxiliary_loss_clip": 0.01185868, "auxiliary_loss_mlp": 0.01030455, "balance_loss_clip": 1.05810797, "balance_loss_mlp": 1.02103162, "epoch": 0.28196957854866833, "flos": 20412451595520.0, "grad_norm": 2.8117928211943357, "language_loss": 0.8367548, "learning_rate": 3.37032027434019e-06, "loss": 0.85891795, "num_input_tokens_seen": 50277930, "step": 2345, "time_per_iteration": 2.6686089038848877 }, { "auxiliary_loss_clip": 0.01198196, "auxiliary_loss_mlp": 0.01036161, "balance_loss_clip": 1.0566932, "balance_loss_mlp": 1.02616501, "epoch": 0.2820898214393074, "flos": 19973120348160.0, "grad_norm": 4.824029872486958, "language_loss": 0.82920337, "learning_rate": 3.369752773336141e-06, "loss": 0.851547, "num_input_tokens_seen": 50297410, "step": 2346, "time_per_iteration": 2.6435608863830566 }, { "auxiliary_loss_clip": 0.01173213, "auxiliary_loss_mlp": 0.01033175, "balance_loss_clip": 1.05304217, "balance_loss_mlp": 1.02302432, "epoch": 0.2822100643299465, "flos": 22528308188160.0, "grad_norm": 1.9930865718489856, "language_loss": 0.78527427, "learning_rate": 3.3691850645416864e-06, "loss": 0.80733824, "num_input_tokens_seen": 50317120, "step": 2347, "time_per_iteration": 2.635512113571167 }, { "auxiliary_loss_clip": 0.01194288, "auxiliary_loss_mlp": 0.01038756, "balance_loss_clip": 1.05678844, "balance_loss_mlp": 1.02923727, "epoch": 0.2823303072205856, "flos": 11546682007680.0, "grad_norm": 2.740958457610266, "language_loss": 0.83556855, "learning_rate": 3.368617148042945e-06, "loss": 0.85789901, "num_input_tokens_seen": 50334790, "step": 2348, "time_per_iteration": 2.7242624759674072 }, { "auxiliary_loss_clip": 0.01170788, "auxiliary_loss_mlp": 0.01030641, "balance_loss_clip": 1.05206227, "balance_loss_mlp": 1.02066922, "epoch": 0.28245055011122466, "flos": 18259894281600.0, "grad_norm": 2.221702501327401, "language_loss": 0.8447262, "learning_rate": 3.368049023926071e-06, "loss": 0.86674047, "num_input_tokens_seen": 50353785, "step": 2349, "time_per_iteration": 2.6637260913848877 }, { "auxiliary_loss_clip": 0.01190609, "auxiliary_loss_mlp": 0.01030628, "balance_loss_clip": 1.0572927, "balance_loss_mlp": 1.02193165, "epoch": 0.2825707930018638, "flos": 24608110504320.0, "grad_norm": 1.8972147367739607, "language_loss": 0.83817601, "learning_rate": 3.3674806922772476e-06, "loss": 0.8603884, "num_input_tokens_seen": 50374670, "step": 2350, "time_per_iteration": 2.7485814094543457 }, { "auxiliary_loss_clip": 0.01166605, "auxiliary_loss_mlp": 0.01032933, "balance_loss_clip": 1.05191469, "balance_loss_mlp": 1.02313983, "epoch": 0.28269103589250283, "flos": 25226994862080.0, "grad_norm": 2.1930162062835366, "language_loss": 0.75180459, "learning_rate": 3.3669121531826904e-06, "loss": 0.77379996, "num_input_tokens_seen": 50395650, "step": 2351, "time_per_iteration": 2.7867517471313477 }, { "auxiliary_loss_clip": 0.0115689, "auxiliary_loss_mlp": 0.01032733, "balance_loss_clip": 1.05674386, "balance_loss_mlp": 1.02292776, "epoch": 0.28281127878314194, "flos": 19281552819840.0, "grad_norm": 2.309193021789076, "language_loss": 0.83663595, "learning_rate": 3.366343406728647e-06, "loss": 0.85853219, "num_input_tokens_seen": 50415100, "step": 2352, "time_per_iteration": 2.6951420307159424 }, { "auxiliary_loss_clip": 0.01180288, "auxiliary_loss_mlp": 0.01030778, "balance_loss_clip": 1.05128658, "balance_loss_mlp": 1.02186692, "epoch": 0.28293152167378105, "flos": 23878405710720.0, "grad_norm": 1.8597286148053394, "language_loss": 0.68668878, "learning_rate": 3.3657744530013946e-06, "loss": 0.70879948, "num_input_tokens_seen": 50434335, "step": 2353, "time_per_iteration": 3.5917623043060303 }, { "auxiliary_loss_clip": 0.01198688, "auxiliary_loss_mlp": 0.01029618, "balance_loss_clip": 1.05972528, "balance_loss_mlp": 1.01961613, "epoch": 0.2830517645644201, "flos": 43866965928960.0, "grad_norm": 2.642301119548798, "language_loss": 0.71953571, "learning_rate": 3.3652052920872437e-06, "loss": 0.74181879, "num_input_tokens_seen": 50457200, "step": 2354, "time_per_iteration": 2.8305740356445312 }, { "auxiliary_loss_clip": 0.0117896, "auxiliary_loss_mlp": 0.01035057, "balance_loss_clip": 1.05553424, "balance_loss_mlp": 1.02558625, "epoch": 0.2831720074550592, "flos": 26651750803200.0, "grad_norm": 1.8902573193071233, "language_loss": 0.85439122, "learning_rate": 3.3646359240725355e-06, "loss": 0.87653142, "num_input_tokens_seen": 50476390, "step": 2355, "time_per_iteration": 3.6528236865997314 }, { "auxiliary_loss_clip": 0.0118502, "auxiliary_loss_mlp": 0.00764729, "balance_loss_clip": 1.05549729, "balance_loss_mlp": 1.00055289, "epoch": 0.2832922503456983, "flos": 31029979564800.0, "grad_norm": 2.3434940527708483, "language_loss": 0.67822748, "learning_rate": 3.364066349043643e-06, "loss": 0.69772494, "num_input_tokens_seen": 50497595, "step": 2356, "time_per_iteration": 2.7397255897521973 }, { "auxiliary_loss_clip": 0.01170474, "auxiliary_loss_mlp": 0.01030258, "balance_loss_clip": 1.05227256, "balance_loss_mlp": 1.0217346, "epoch": 0.2834124932363374, "flos": 20405699838720.0, "grad_norm": 1.7449124731344097, "language_loss": 0.82115197, "learning_rate": 3.363496567086969e-06, "loss": 0.84315932, "num_input_tokens_seen": 50514690, "step": 2357, "time_per_iteration": 2.610638380050659 }, { "auxiliary_loss_clip": 0.01206112, "auxiliary_loss_mlp": 0.01037043, "balance_loss_clip": 1.05990195, "balance_loss_mlp": 1.02743506, "epoch": 0.2835327361269765, "flos": 39384848056320.0, "grad_norm": 1.9838583069343096, "language_loss": 0.75989985, "learning_rate": 3.3629265782889506e-06, "loss": 0.78233135, "num_input_tokens_seen": 50536515, "step": 2358, "time_per_iteration": 2.753045082092285 }, { "auxiliary_loss_clip": 0.01152291, "auxiliary_loss_mlp": 0.01032508, "balance_loss_clip": 1.04792845, "balance_loss_mlp": 1.02260804, "epoch": 0.2836529790176156, "flos": 30261598801920.0, "grad_norm": 2.089831767182233, "language_loss": 0.72149885, "learning_rate": 3.362356382736054e-06, "loss": 0.74334681, "num_input_tokens_seen": 50557120, "step": 2359, "time_per_iteration": 3.6589319705963135 }, { "auxiliary_loss_clip": 0.01158839, "auxiliary_loss_mlp": 0.01028384, "balance_loss_clip": 1.0491538, "balance_loss_mlp": 1.01865053, "epoch": 0.28377322190825466, "flos": 12677796264960.0, "grad_norm": 1.865901441084261, "language_loss": 0.91042376, "learning_rate": 3.361785980514777e-06, "loss": 0.93229598, "num_input_tokens_seen": 50573320, "step": 2360, "time_per_iteration": 3.597364664077759 }, { "auxiliary_loss_clip": 0.01125839, "auxiliary_loss_mlp": 0.01031932, "balance_loss_clip": 1.04859495, "balance_loss_mlp": 1.02259779, "epoch": 0.28389346479889377, "flos": 18296666830080.0, "grad_norm": 6.276199404584636, "language_loss": 0.7637834, "learning_rate": 3.361215371711649e-06, "loss": 0.78536111, "num_input_tokens_seen": 50592415, "step": 2361, "time_per_iteration": 2.7613251209259033 }, { "auxiliary_loss_clip": 0.01152738, "auxiliary_loss_mlp": 0.01027495, "balance_loss_clip": 1.05158329, "balance_loss_mlp": 1.0181725, "epoch": 0.2840137076895329, "flos": 20406992728320.0, "grad_norm": 1.832699020278592, "language_loss": 0.83609688, "learning_rate": 3.3606445564132326e-06, "loss": 0.85789925, "num_input_tokens_seen": 50609710, "step": 2362, "time_per_iteration": 2.6977882385253906 }, { "auxiliary_loss_clip": 0.01208895, "auxiliary_loss_mlp": 0.0076411, "balance_loss_clip": 1.06083751, "balance_loss_mlp": 1.00033307, "epoch": 0.28413395058017193, "flos": 20048030161920.0, "grad_norm": 2.0901393834707465, "language_loss": 0.82397544, "learning_rate": 3.360073534706118e-06, "loss": 0.84370553, "num_input_tokens_seen": 50626865, "step": 2363, "time_per_iteration": 2.642947196960449 }, { "auxiliary_loss_clip": 0.01179062, "auxiliary_loss_mlp": 0.01039641, "balance_loss_clip": 1.05773568, "balance_loss_mlp": 1.02938318, "epoch": 0.28425419347081105, "flos": 37663613256960.0, "grad_norm": 2.2255208551177668, "language_loss": 0.76164031, "learning_rate": 3.35950230667693e-06, "loss": 0.7838273, "num_input_tokens_seen": 50648560, "step": 2364, "time_per_iteration": 2.828190565109253 }, { "auxiliary_loss_clip": 0.01191679, "auxiliary_loss_mlp": 0.01034338, "balance_loss_clip": 1.05609524, "balance_loss_mlp": 1.02422333, "epoch": 0.28437443636145016, "flos": 13845072539520.0, "grad_norm": 2.90331377882048, "language_loss": 0.86436778, "learning_rate": 3.358930872412323e-06, "loss": 0.88662791, "num_input_tokens_seen": 50665725, "step": 2365, "time_per_iteration": 2.589320421218872 }, { "auxiliary_loss_clip": 0.01189504, "auxiliary_loss_mlp": 0.01034233, "balance_loss_clip": 1.05647612, "balance_loss_mlp": 1.02497661, "epoch": 0.2844946792520892, "flos": 22747794243840.0, "grad_norm": 1.9133376359335028, "language_loss": 0.81353688, "learning_rate": 3.3583592319989825e-06, "loss": 0.8357743, "num_input_tokens_seen": 50685095, "step": 2366, "time_per_iteration": 2.6664140224456787 }, { "auxiliary_loss_clip": 0.01203317, "auxiliary_loss_mlp": 0.0103846, "balance_loss_clip": 1.06035662, "balance_loss_mlp": 1.02759457, "epoch": 0.2846149221427283, "flos": 32415987709440.0, "grad_norm": 1.8777296327517488, "language_loss": 0.68705016, "learning_rate": 3.357787385523627e-06, "loss": 0.70946789, "num_input_tokens_seen": 50706500, "step": 2367, "time_per_iteration": 2.7343101501464844 }, { "auxiliary_loss_clip": 0.01141287, "auxiliary_loss_mlp": 0.01033645, "balance_loss_clip": 1.05096889, "balance_loss_mlp": 1.02422726, "epoch": 0.2847351650333674, "flos": 28475976873600.0, "grad_norm": 2.136752091524356, "language_loss": 0.82838303, "learning_rate": 3.3572153330730048e-06, "loss": 0.85013235, "num_input_tokens_seen": 50727595, "step": 2368, "time_per_iteration": 2.824789524078369 }, { "auxiliary_loss_clip": 0.01070108, "auxiliary_loss_mlp": 0.01009358, "balance_loss_clip": 1.02103043, "balance_loss_mlp": 1.00752199, "epoch": 0.2848554079240065, "flos": 55753399704960.0, "grad_norm": 0.8275862869796837, "language_loss": 0.64750534, "learning_rate": 3.3566430747338956e-06, "loss": 0.66829997, "num_input_tokens_seen": 50782800, "step": 2369, "time_per_iteration": 3.1232106685638428 }, { "auxiliary_loss_clip": 0.01190909, "auxiliary_loss_mlp": 0.01032935, "balance_loss_clip": 1.05415356, "balance_loss_mlp": 1.02364206, "epoch": 0.2849756508146456, "flos": 11836875985920.0, "grad_norm": 2.3433930803403826, "language_loss": 0.86371946, "learning_rate": 3.35607061059311e-06, "loss": 0.88595784, "num_input_tokens_seen": 50797730, "step": 2370, "time_per_iteration": 2.651404619216919 }, { "auxiliary_loss_clip": 0.01202915, "auxiliary_loss_mlp": 0.01032987, "balance_loss_clip": 1.05829644, "balance_loss_mlp": 1.02365875, "epoch": 0.28509589370528465, "flos": 25155209531520.0, "grad_norm": 2.200817479193329, "language_loss": 0.75336301, "learning_rate": 3.3554979407374917e-06, "loss": 0.77572203, "num_input_tokens_seen": 50819840, "step": 2371, "time_per_iteration": 2.699162483215332 }, { "auxiliary_loss_clip": 0.01191985, "auxiliary_loss_mlp": 0.01038596, "balance_loss_clip": 1.05625653, "balance_loss_mlp": 1.02965569, "epoch": 0.28521613659592376, "flos": 19974808287360.0, "grad_norm": 1.8804384194775674, "language_loss": 0.73713142, "learning_rate": 3.3549250652539134e-06, "loss": 0.7594372, "num_input_tokens_seen": 50838935, "step": 2372, "time_per_iteration": 2.6486406326293945 }, { "auxiliary_loss_clip": 0.01172165, "auxiliary_loss_mlp": 0.01034215, "balance_loss_clip": 1.05029178, "balance_loss_mlp": 1.02389109, "epoch": 0.2853363794865629, "flos": 23367971491200.0, "grad_norm": 2.4775226470520253, "language_loss": 0.81620634, "learning_rate": 3.3543519842292794e-06, "loss": 0.83827013, "num_input_tokens_seen": 50858590, "step": 2373, "time_per_iteration": 2.666287660598755 }, { "auxiliary_loss_clip": 0.01202244, "auxiliary_loss_mlp": 0.00764412, "balance_loss_clip": 1.05640221, "balance_loss_mlp": 1.00039399, "epoch": 0.28545662237720193, "flos": 19861940776320.0, "grad_norm": 1.9474675456926365, "language_loss": 0.83626366, "learning_rate": 3.353778697750527e-06, "loss": 0.85593027, "num_input_tokens_seen": 50876995, "step": 2374, "time_per_iteration": 2.611072301864624 }, { "auxiliary_loss_clip": 0.01167066, "auxiliary_loss_mlp": 0.0103257, "balance_loss_clip": 1.05200422, "balance_loss_mlp": 1.02310479, "epoch": 0.28557686526784104, "flos": 23879016241920.0, "grad_norm": 1.642586791783303, "language_loss": 0.89424503, "learning_rate": 3.353205205904622e-06, "loss": 0.91624141, "num_input_tokens_seen": 50896105, "step": 2375, "time_per_iteration": 2.678462266921997 }, { "auxiliary_loss_clip": 0.01174128, "auxiliary_loss_mlp": 0.01034095, "balance_loss_clip": 1.05490589, "balance_loss_mlp": 1.02508235, "epoch": 0.28569710815848015, "flos": 44890384233600.0, "grad_norm": 2.424116785006551, "language_loss": 0.719051, "learning_rate": 3.3526315087785637e-06, "loss": 0.74113321, "num_input_tokens_seen": 50917220, "step": 2376, "time_per_iteration": 2.8992092609405518 }, { "auxiliary_loss_clip": 0.01127671, "auxiliary_loss_mlp": 0.01036265, "balance_loss_clip": 1.04735553, "balance_loss_mlp": 1.0265851, "epoch": 0.2858173510491192, "flos": 26829759628800.0, "grad_norm": 1.8885922861898812, "language_loss": 0.80657971, "learning_rate": 3.3520576064593805e-06, "loss": 0.82821906, "num_input_tokens_seen": 50937175, "step": 2377, "time_per_iteration": 2.7777557373046875 }, { "auxiliary_loss_clip": 0.01196312, "auxiliary_loss_mlp": 0.01040952, "balance_loss_clip": 1.05845571, "balance_loss_mlp": 1.03117132, "epoch": 0.2859375939397583, "flos": 23148916398720.0, "grad_norm": 1.6856780030056797, "language_loss": 0.82096159, "learning_rate": 3.3514834990341337e-06, "loss": 0.8433342, "num_input_tokens_seen": 50957500, "step": 2378, "time_per_iteration": 3.6613659858703613 }, { "auxiliary_loss_clip": 0.011802, "auxiliary_loss_mlp": 0.01027797, "balance_loss_clip": 1.05416417, "balance_loss_mlp": 1.01849818, "epoch": 0.2860578368303974, "flos": 12129799397760.0, "grad_norm": 4.0444912001607225, "language_loss": 0.92894399, "learning_rate": 3.3509091865899144e-06, "loss": 0.95102394, "num_input_tokens_seen": 50972690, "step": 2379, "time_per_iteration": 2.6827635765075684 }, { "auxiliary_loss_clip": 0.01204371, "auxiliary_loss_mlp": 0.01034577, "balance_loss_clip": 1.05646777, "balance_loss_mlp": 1.02449203, "epoch": 0.2861780797210365, "flos": 19938035738880.0, "grad_norm": 4.067698797997826, "language_loss": 0.70658076, "learning_rate": 3.350334669213846e-06, "loss": 0.72897029, "num_input_tokens_seen": 50990095, "step": 2380, "time_per_iteration": 2.5419352054595947 }, { "auxiliary_loss_clip": 0.01191781, "auxiliary_loss_mlp": 0.0102952, "balance_loss_clip": 1.05905056, "balance_loss_mlp": 1.01994705, "epoch": 0.2862983226116756, "flos": 27563127609600.0, "grad_norm": 3.428783396565654, "language_loss": 0.75500453, "learning_rate": 3.3497599469930816e-06, "loss": 0.77721751, "num_input_tokens_seen": 51008305, "step": 2381, "time_per_iteration": 3.5748863220214844 }, { "auxiliary_loss_clip": 0.01205557, "auxiliary_loss_mlp": 0.01032438, "balance_loss_clip": 1.05640197, "balance_loss_mlp": 1.02322936, "epoch": 0.28641856550231465, "flos": 22053964158720.0, "grad_norm": 3.823761988096933, "language_loss": 0.83538461, "learning_rate": 3.349185020014807e-06, "loss": 0.85776448, "num_input_tokens_seen": 51025570, "step": 2382, "time_per_iteration": 2.544820547103882 }, { "auxiliary_loss_clip": 0.01189352, "auxiliary_loss_mlp": 0.01036556, "balance_loss_clip": 1.05187106, "balance_loss_mlp": 1.02620244, "epoch": 0.28653880839295376, "flos": 22378775869440.0, "grad_norm": 8.321271629868242, "language_loss": 0.74774051, "learning_rate": 3.348609888366237e-06, "loss": 0.76999956, "num_input_tokens_seen": 51044585, "step": 2383, "time_per_iteration": 2.624713182449341 }, { "auxiliary_loss_clip": 0.01126788, "auxiliary_loss_mlp": 0.01034081, "balance_loss_clip": 1.04797971, "balance_loss_mlp": 1.02477121, "epoch": 0.28665905128359287, "flos": 23367971491200.0, "grad_norm": 2.3556798549128173, "language_loss": 0.63184077, "learning_rate": 3.348034552134619e-06, "loss": 0.65344948, "num_input_tokens_seen": 51063990, "step": 2384, "time_per_iteration": 2.7318496704101562 }, { "auxiliary_loss_clip": 0.01137432, "auxiliary_loss_mlp": 0.01032716, "balance_loss_clip": 1.04958856, "balance_loss_mlp": 1.02360892, "epoch": 0.2867792941742319, "flos": 20881695893760.0, "grad_norm": 2.3551537662458877, "language_loss": 0.84326243, "learning_rate": 3.3474590114072316e-06, "loss": 0.86496389, "num_input_tokens_seen": 51081990, "step": 2385, "time_per_iteration": 4.521415948867798 }, { "auxiliary_loss_clip": 0.01156555, "auxiliary_loss_mlp": 0.01030791, "balance_loss_clip": 1.05448937, "balance_loss_mlp": 1.02105129, "epoch": 0.28689953706487104, "flos": 20664005518080.0, "grad_norm": 2.1050925993417704, "language_loss": 0.82653689, "learning_rate": 3.3468832662713836e-06, "loss": 0.84841037, "num_input_tokens_seen": 51100235, "step": 2386, "time_per_iteration": 2.728040933609009 }, { "auxiliary_loss_clip": 0.01160352, "auxiliary_loss_mlp": 0.01032063, "balance_loss_clip": 1.05347872, "balance_loss_mlp": 1.02263975, "epoch": 0.28701977995551015, "flos": 12675533708160.0, "grad_norm": 3.1015702629130644, "language_loss": 0.8372196, "learning_rate": 3.346307316814415e-06, "loss": 0.85914373, "num_input_tokens_seen": 51115405, "step": 2387, "time_per_iteration": 2.616672992706299 }, { "auxiliary_loss_clip": 0.01191479, "auxiliary_loss_mlp": 0.01034593, "balance_loss_clip": 1.05790448, "balance_loss_mlp": 1.02413273, "epoch": 0.2871400228461492, "flos": 21252366293760.0, "grad_norm": 2.1915325339577243, "language_loss": 0.75968874, "learning_rate": 3.3457311631236965e-06, "loss": 0.78194952, "num_input_tokens_seen": 51136390, "step": 2388, "time_per_iteration": 2.659308671951294 }, { "auxiliary_loss_clip": 0.01160862, "auxiliary_loss_mlp": 0.01029355, "balance_loss_clip": 1.04882979, "balance_loss_mlp": 1.01990736, "epoch": 0.2872602657367883, "flos": 25119262995840.0, "grad_norm": 2.152616623258441, "language_loss": 0.84515446, "learning_rate": 3.345154805286631e-06, "loss": 0.86705667, "num_input_tokens_seen": 51156650, "step": 2389, "time_per_iteration": 2.67872953414917 }, { "auxiliary_loss_clip": 0.01182006, "auxiliary_loss_mlp": 0.01034028, "balance_loss_clip": 1.05254364, "balance_loss_mlp": 1.02475905, "epoch": 0.2873805086274274, "flos": 16646606830080.0, "grad_norm": 2.527725482025029, "language_loss": 0.76417685, "learning_rate": 3.344578243390651e-06, "loss": 0.78633714, "num_input_tokens_seen": 51172210, "step": 2390, "time_per_iteration": 2.587218761444092 }, { "auxiliary_loss_clip": 0.01173762, "auxiliary_loss_mlp": 0.01031829, "balance_loss_clip": 1.05412269, "balance_loss_mlp": 1.02263761, "epoch": 0.2875007515180665, "flos": 17420123237760.0, "grad_norm": 3.0106549331136128, "language_loss": 0.78881091, "learning_rate": 3.3440014775232206e-06, "loss": 0.81086683, "num_input_tokens_seen": 51190265, "step": 2391, "time_per_iteration": 2.6404542922973633 }, { "auxiliary_loss_clip": 0.01161145, "auxiliary_loss_mlp": 0.01033726, "balance_loss_clip": 1.05148983, "balance_loss_mlp": 1.02486849, "epoch": 0.2876209944087056, "flos": 23434190213760.0, "grad_norm": 2.5821261794465475, "language_loss": 0.71506053, "learning_rate": 3.343424507771834e-06, "loss": 0.73700923, "num_input_tokens_seen": 51208475, "step": 2392, "time_per_iteration": 2.777653455734253 }, { "auxiliary_loss_clip": 0.01162866, "auxiliary_loss_mlp": 0.01028026, "balance_loss_clip": 1.05521345, "balance_loss_mlp": 1.01915669, "epoch": 0.2877412372993447, "flos": 13735509079680.0, "grad_norm": 2.4204264317444593, "language_loss": 0.86592531, "learning_rate": 3.342847334224018e-06, "loss": 0.88783419, "num_input_tokens_seen": 51225875, "step": 2393, "time_per_iteration": 2.7588393688201904 }, { "auxiliary_loss_clip": 0.01088028, "auxiliary_loss_mlp": 0.01003552, "balance_loss_clip": 1.01688385, "balance_loss_mlp": 1.00176382, "epoch": 0.28786148018998375, "flos": 58079695104000.0, "grad_norm": 0.9770628880035306, "language_loss": 0.62375307, "learning_rate": 3.342269956967329e-06, "loss": 0.64466888, "num_input_tokens_seen": 51287780, "step": 2394, "time_per_iteration": 3.241800546646118 }, { "auxiliary_loss_clip": 0.01191198, "auxiliary_loss_mlp": 0.01032443, "balance_loss_clip": 1.05471504, "balance_loss_mlp": 1.02258432, "epoch": 0.28798172308062286, "flos": 23435052140160.0, "grad_norm": 2.977586888667627, "language_loss": 0.71994209, "learning_rate": 3.341692376089355e-06, "loss": 0.74217856, "num_input_tokens_seen": 51303335, "step": 2395, "time_per_iteration": 2.6765310764312744 }, { "auxiliary_loss_clip": 0.01187749, "auxiliary_loss_mlp": 0.01029072, "balance_loss_clip": 1.05584168, "balance_loss_mlp": 1.01996422, "epoch": 0.288101965971262, "flos": 25110033200640.0, "grad_norm": 4.398265863393965, "language_loss": 0.84220457, "learning_rate": 3.3411145916777146e-06, "loss": 0.86437279, "num_input_tokens_seen": 51317495, "step": 2396, "time_per_iteration": 2.623741626739502 }, { "auxiliary_loss_clip": 0.01168146, "auxiliary_loss_mlp": 0.01031953, "balance_loss_clip": 1.05085969, "balance_loss_mlp": 1.02235651, "epoch": 0.28822220886190103, "flos": 16252559654400.0, "grad_norm": 2.7655052855531177, "language_loss": 0.91208339, "learning_rate": 3.3405366038200566e-06, "loss": 0.93408442, "num_input_tokens_seen": 51336430, "step": 2397, "time_per_iteration": 2.686751365661621 }, { "auxiliary_loss_clip": 0.01177373, "auxiliary_loss_mlp": 0.01033793, "balance_loss_clip": 1.05693805, "balance_loss_mlp": 1.02362466, "epoch": 0.28834245175254014, "flos": 24535642815360.0, "grad_norm": 2.2529085106580595, "language_loss": 0.84806389, "learning_rate": 3.3399584126040617e-06, "loss": 0.87017554, "num_input_tokens_seen": 51355930, "step": 2398, "time_per_iteration": 2.668748378753662 }, { "auxiliary_loss_clip": 0.01202342, "auxiliary_loss_mlp": 0.0076418, "balance_loss_clip": 1.0555377, "balance_loss_mlp": 1.00034642, "epoch": 0.2884626946431792, "flos": 24571445696640.0, "grad_norm": 2.9083317675200426, "language_loss": 0.91380656, "learning_rate": 3.339380018117441e-06, "loss": 0.93347174, "num_input_tokens_seen": 51376765, "step": 2399, "time_per_iteration": 2.700516700744629 }, { "auxiliary_loss_clip": 0.01184681, "auxiliary_loss_mlp": 0.01027963, "balance_loss_clip": 1.05560267, "balance_loss_mlp": 1.01876569, "epoch": 0.2885829375338183, "flos": 16544657053440.0, "grad_norm": 5.228414917706568, "language_loss": 0.78581941, "learning_rate": 3.3388014204479366e-06, "loss": 0.80794585, "num_input_tokens_seen": 51394570, "step": 2400, "time_per_iteration": 2.622215986251831 }, { "auxiliary_loss_clip": 0.01202886, "auxiliary_loss_mlp": 0.01033788, "balance_loss_clip": 1.05508375, "balance_loss_mlp": 1.02403045, "epoch": 0.2887031804244574, "flos": 24061226958720.0, "grad_norm": 4.516009038903702, "language_loss": 0.91647804, "learning_rate": 3.338222619683321e-06, "loss": 0.93884474, "num_input_tokens_seen": 51414535, "step": 2401, "time_per_iteration": 2.622659683227539 }, { "auxiliary_loss_clip": 0.01174317, "auxiliary_loss_mlp": 0.01035095, "balance_loss_clip": 1.05212212, "balance_loss_mlp": 1.02480745, "epoch": 0.2888234233150965, "flos": 23330696152320.0, "grad_norm": 2.3729915691173216, "language_loss": 0.74016118, "learning_rate": 3.337643615911398e-06, "loss": 0.76225531, "num_input_tokens_seen": 51434160, "step": 2402, "time_per_iteration": 2.7715706825256348 }, { "auxiliary_loss_clip": 0.0118831, "auxiliary_loss_mlp": 0.01032367, "balance_loss_clip": 1.05354106, "balance_loss_mlp": 1.0229975, "epoch": 0.2889436662057356, "flos": 22272767856000.0, "grad_norm": 3.683729262652092, "language_loss": 0.78875661, "learning_rate": 3.3370644092200026e-06, "loss": 0.81096339, "num_input_tokens_seen": 51451435, "step": 2403, "time_per_iteration": 2.652864933013916 }, { "auxiliary_loss_clip": 0.01143688, "auxiliary_loss_mlp": 0.01036459, "balance_loss_clip": 1.04569805, "balance_loss_mlp": 1.02731538, "epoch": 0.2890639090963747, "flos": 21616931381760.0, "grad_norm": 1.8529550867152302, "language_loss": 0.78243172, "learning_rate": 3.3364849996969985e-06, "loss": 0.80423319, "num_input_tokens_seen": 51471455, "step": 2404, "time_per_iteration": 2.7073814868927 }, { "auxiliary_loss_clip": 0.01189567, "auxiliary_loss_mlp": 0.01033572, "balance_loss_clip": 1.0562923, "balance_loss_mlp": 1.02442265, "epoch": 0.28918415198701375, "flos": 28585540333440.0, "grad_norm": 2.245727133306413, "language_loss": 0.85530466, "learning_rate": 3.335905387430283e-06, "loss": 0.87753606, "num_input_tokens_seen": 51492890, "step": 2405, "time_per_iteration": 3.5333008766174316 }, { "auxiliary_loss_clip": 0.01174395, "auxiliary_loss_mlp": 0.01033304, "balance_loss_clip": 1.04991603, "balance_loss_mlp": 1.02405906, "epoch": 0.28930439487765286, "flos": 21944688007680.0, "grad_norm": 1.9046157277898264, "language_loss": 0.83213198, "learning_rate": 3.335325572507782e-06, "loss": 0.85420901, "num_input_tokens_seen": 51513390, "step": 2406, "time_per_iteration": 2.6870779991149902 }, { "auxiliary_loss_clip": 0.01206539, "auxiliary_loss_mlp": 0.00764851, "balance_loss_clip": 1.06024718, "balance_loss_mlp": 1.00054097, "epoch": 0.28942463776829197, "flos": 19281911955840.0, "grad_norm": 2.8538677621758586, "language_loss": 0.742935, "learning_rate": 3.3347455550174537e-06, "loss": 0.76264894, "num_input_tokens_seen": 51532730, "step": 2407, "time_per_iteration": 2.5980687141418457 }, { "auxiliary_loss_clip": 0.01153283, "auxiliary_loss_mlp": 0.01027564, "balance_loss_clip": 1.04847312, "balance_loss_mlp": 1.01762211, "epoch": 0.289544880658931, "flos": 14645700737280.0, "grad_norm": 2.093581379731839, "language_loss": 0.68122798, "learning_rate": 3.3341653350472864e-06, "loss": 0.70303649, "num_input_tokens_seen": 51549560, "step": 2408, "time_per_iteration": 3.610114574432373 }, { "auxiliary_loss_clip": 0.01212234, "auxiliary_loss_mlp": 0.01040243, "balance_loss_clip": 1.05917954, "balance_loss_mlp": 1.02940094, "epoch": 0.28966512354957014, "flos": 28621881918720.0, "grad_norm": 2.7458599108900383, "language_loss": 0.6931839, "learning_rate": 3.333584912685298e-06, "loss": 0.71570867, "num_input_tokens_seen": 51568180, "step": 2409, "time_per_iteration": 2.6895904541015625 }, { "auxiliary_loss_clip": 0.01061372, "auxiliary_loss_mlp": 0.01001843, "balance_loss_clip": 1.01579273, "balance_loss_mlp": 0.99981624, "epoch": 0.28978536644020925, "flos": 64711784511360.0, "grad_norm": 0.8789924063458642, "language_loss": 0.55554307, "learning_rate": 3.3330042880195385e-06, "loss": 0.57617521, "num_input_tokens_seen": 51622530, "step": 2410, "time_per_iteration": 3.1711766719818115 }, { "auxiliary_loss_clip": 0.01171671, "auxiliary_loss_mlp": 0.01033285, "balance_loss_clip": 1.05197453, "balance_loss_mlp": 1.02424884, "epoch": 0.2899056093308483, "flos": 18624638937600.0, "grad_norm": 2.078522745397886, "language_loss": 0.78414595, "learning_rate": 3.3324234611380888e-06, "loss": 0.8061955, "num_input_tokens_seen": 51641260, "step": 2411, "time_per_iteration": 3.6079792976379395 }, { "auxiliary_loss_clip": 0.01153658, "auxiliary_loss_mlp": 0.01027833, "balance_loss_clip": 1.05251074, "balance_loss_mlp": 1.01834369, "epoch": 0.2900258522214874, "flos": 22893735202560.0, "grad_norm": 1.925096488460499, "language_loss": 0.81712049, "learning_rate": 3.3318424321290596e-06, "loss": 0.83893538, "num_input_tokens_seen": 51660975, "step": 2412, "time_per_iteration": 2.7638180255889893 }, { "auxiliary_loss_clip": 0.01062571, "auxiliary_loss_mlp": 0.01004533, "balance_loss_clip": 1.01530004, "balance_loss_mlp": 1.00267375, "epoch": 0.2901460951121265, "flos": 71106036013440.0, "grad_norm": 0.827019108291743, "language_loss": 0.59916425, "learning_rate": 3.3312612010805917e-06, "loss": 0.61983526, "num_input_tokens_seen": 51720550, "step": 2413, "time_per_iteration": 3.3167805671691895 }, { "auxiliary_loss_clip": 0.01163299, "auxiliary_loss_mlp": 0.01034659, "balance_loss_clip": 1.05216837, "balance_loss_mlp": 1.02545607, "epoch": 0.2902663380027656, "flos": 32160986081280.0, "grad_norm": 1.9139314560332206, "language_loss": 0.70266998, "learning_rate": 3.330679768080858e-06, "loss": 0.72464955, "num_input_tokens_seen": 51744435, "step": 2414, "time_per_iteration": 2.832214832305908 }, { "auxiliary_loss_clip": 0.01188876, "auxiliary_loss_mlp": 0.01035049, "balance_loss_clip": 1.0580399, "balance_loss_mlp": 1.0255425, "epoch": 0.2903865808934047, "flos": 29351658539520.0, "grad_norm": 2.315234144692434, "language_loss": 0.83911312, "learning_rate": 3.3300981332180627e-06, "loss": 0.86135238, "num_input_tokens_seen": 51763640, "step": 2415, "time_per_iteration": 2.6691818237304688 }, { "auxiliary_loss_clip": 0.01167299, "auxiliary_loss_mlp": 0.01039547, "balance_loss_clip": 1.05370235, "balance_loss_mlp": 1.03017116, "epoch": 0.29050682378404374, "flos": 17089026647040.0, "grad_norm": 2.0643627793197763, "language_loss": 0.79983377, "learning_rate": 3.3295162965804373e-06, "loss": 0.82190228, "num_input_tokens_seen": 51782135, "step": 2416, "time_per_iteration": 2.67122483253479 }, { "auxiliary_loss_clip": 0.01158996, "auxiliary_loss_mlp": 0.01028976, "balance_loss_clip": 1.05566764, "balance_loss_mlp": 1.02044654, "epoch": 0.29062706667468285, "flos": 17858233422720.0, "grad_norm": 2.813231956991232, "language_loss": 0.78821075, "learning_rate": 3.328934258256247e-06, "loss": 0.81009042, "num_input_tokens_seen": 51800200, "step": 2417, "time_per_iteration": 2.6374270915985107 }, { "auxiliary_loss_clip": 0.0118599, "auxiliary_loss_mlp": 0.0103367, "balance_loss_clip": 1.05178189, "balance_loss_mlp": 1.02424026, "epoch": 0.29074730956532197, "flos": 24279815174400.0, "grad_norm": 6.008853151699176, "language_loss": 0.67290795, "learning_rate": 3.3283520183337856e-06, "loss": 0.69510454, "num_input_tokens_seen": 51819905, "step": 2418, "time_per_iteration": 2.6699626445770264 }, { "auxiliary_loss_clip": 0.0117069, "auxiliary_loss_mlp": 0.01034913, "balance_loss_clip": 1.05231977, "balance_loss_mlp": 1.02625275, "epoch": 0.290867552455961, "flos": 22340961826560.0, "grad_norm": 1.9157815318305342, "language_loss": 0.6960302, "learning_rate": 3.3277695769013797e-06, "loss": 0.71808624, "num_input_tokens_seen": 51839350, "step": 2419, "time_per_iteration": 2.689283609390259 }, { "auxiliary_loss_clip": 0.01186664, "auxiliary_loss_mlp": 0.01035933, "balance_loss_clip": 1.05580473, "balance_loss_mlp": 1.02575898, "epoch": 0.29098779534660013, "flos": 23186155824000.0, "grad_norm": 2.953332792148371, "language_loss": 0.77559125, "learning_rate": 3.327186934047385e-06, "loss": 0.79781723, "num_input_tokens_seen": 51858045, "step": 2420, "time_per_iteration": 2.6506004333496094 }, { "auxiliary_loss_clip": 0.01160566, "auxiliary_loss_mlp": 0.01027843, "balance_loss_clip": 1.04702818, "balance_loss_mlp": 1.0190568, "epoch": 0.29110803823723924, "flos": 15304194817920.0, "grad_norm": 2.3753526577024453, "language_loss": 0.65266418, "learning_rate": 3.3266040898601877e-06, "loss": 0.67454827, "num_input_tokens_seen": 51875880, "step": 2421, "time_per_iteration": 2.6396050453186035 }, { "auxiliary_loss_clip": 0.01134845, "auxiliary_loss_mlp": 0.01029732, "balance_loss_clip": 1.04598784, "balance_loss_mlp": 1.02049351, "epoch": 0.2912282811278783, "flos": 22595352923520.0, "grad_norm": 1.9131875916078578, "language_loss": 0.78210151, "learning_rate": 3.3260210444282045e-06, "loss": 0.80374724, "num_input_tokens_seen": 51893835, "step": 2422, "time_per_iteration": 2.789106607437134 }, { "auxiliary_loss_clip": 0.01185207, "auxiliary_loss_mlp": 0.01032847, "balance_loss_clip": 1.05853915, "balance_loss_mlp": 1.02415085, "epoch": 0.2913485240185174, "flos": 24497900599680.0, "grad_norm": 2.930877205862439, "language_loss": 0.73549193, "learning_rate": 3.325437797839883e-06, "loss": 0.75767249, "num_input_tokens_seen": 51912205, "step": 2423, "time_per_iteration": 2.670621871948242 }, { "auxiliary_loss_clip": 0.01202732, "auxiliary_loss_mlp": 0.01040298, "balance_loss_clip": 1.05671811, "balance_loss_mlp": 1.03067207, "epoch": 0.2914687669091565, "flos": 17931024334080.0, "grad_norm": 2.5990482625669706, "language_loss": 0.75094366, "learning_rate": 3.3248543501837015e-06, "loss": 0.77337396, "num_input_tokens_seen": 51929410, "step": 2424, "time_per_iteration": 2.551671266555786 }, { "auxiliary_loss_clip": 0.01147915, "auxiliary_loss_mlp": 0.01034647, "balance_loss_clip": 1.05177855, "balance_loss_mlp": 1.02539682, "epoch": 0.2915890097997956, "flos": 22529313768960.0, "grad_norm": 2.2205908031764645, "language_loss": 0.77473003, "learning_rate": 3.3242707015481684e-06, "loss": 0.79655564, "num_input_tokens_seen": 51949345, "step": 2425, "time_per_iteration": 2.8151302337646484 }, { "auxiliary_loss_clip": 0.01168402, "auxiliary_loss_mlp": 0.01022389, "balance_loss_clip": 1.04795551, "balance_loss_mlp": 1.01378155, "epoch": 0.2917092526904347, "flos": 13845216193920.0, "grad_norm": 2.11826436669137, "language_loss": 0.80743587, "learning_rate": 3.323686852021823e-06, "loss": 0.8293438, "num_input_tokens_seen": 51966855, "step": 2426, "time_per_iteration": 2.6223642826080322 }, { "auxiliary_loss_clip": 0.01163239, "auxiliary_loss_mlp": 0.0103193, "balance_loss_clip": 1.04848599, "balance_loss_mlp": 1.02261376, "epoch": 0.2918294955810738, "flos": 22674859678080.0, "grad_norm": 6.340243945991299, "language_loss": 0.79522443, "learning_rate": 3.323102801693235e-06, "loss": 0.8171761, "num_input_tokens_seen": 51985620, "step": 2427, "time_per_iteration": 2.7346606254577637 }, { "auxiliary_loss_clip": 0.01181787, "auxiliary_loss_mlp": 0.01028046, "balance_loss_clip": 1.05289674, "balance_loss_mlp": 1.01906347, "epoch": 0.29194973847171285, "flos": 23438284364160.0, "grad_norm": 2.3304855886962836, "language_loss": 0.80914336, "learning_rate": 3.322518550651003e-06, "loss": 0.83124173, "num_input_tokens_seen": 52004930, "step": 2428, "time_per_iteration": 2.64074969291687 }, { "auxiliary_loss_clip": 0.01179844, "auxiliary_loss_mlp": 0.01029776, "balance_loss_clip": 1.05395555, "balance_loss_mlp": 1.02076399, "epoch": 0.29206998136235196, "flos": 21909064694400.0, "grad_norm": 1.741034378808215, "language_loss": 0.81406438, "learning_rate": 3.3219340989837586e-06, "loss": 0.83616066, "num_input_tokens_seen": 52024920, "step": 2429, "time_per_iteration": 2.6724853515625 }, { "auxiliary_loss_clip": 0.01170953, "auxiliary_loss_mlp": 0.01025551, "balance_loss_clip": 1.0533812, "balance_loss_mlp": 1.01739693, "epoch": 0.292190224252991, "flos": 23215925220480.0, "grad_norm": 1.8928266013504385, "language_loss": 0.80639541, "learning_rate": 3.3213494467801625e-06, "loss": 0.82836044, "num_input_tokens_seen": 52044095, "step": 2430, "time_per_iteration": 3.559154510498047 }, { "auxiliary_loss_clip": 0.01113348, "auxiliary_loss_mlp": 0.01026899, "balance_loss_clip": 1.04432905, "balance_loss_mlp": 1.01770246, "epoch": 0.2923104671436301, "flos": 20740818752640.0, "grad_norm": 2.1942537758196834, "language_loss": 0.71587175, "learning_rate": 3.3207645941289063e-06, "loss": 0.73727417, "num_input_tokens_seen": 52062440, "step": 2431, "time_per_iteration": 2.7955310344696045 }, { "auxiliary_loss_clip": 0.01188298, "auxiliary_loss_mlp": 0.00763447, "balance_loss_clip": 1.05808103, "balance_loss_mlp": 1.00031328, "epoch": 0.29243071003426924, "flos": 35809114999680.0, "grad_norm": 1.7560568733729187, "language_loss": 0.79991996, "learning_rate": 3.320179541118711e-06, "loss": 0.81943738, "num_input_tokens_seen": 52084940, "step": 2432, "time_per_iteration": 2.7660868167877197 }, { "auxiliary_loss_clip": 0.01091084, "auxiliary_loss_mlp": 0.01005642, "balance_loss_clip": 1.02035546, "balance_loss_mlp": 1.00383055, "epoch": 0.2925509529249083, "flos": 58081598524800.0, "grad_norm": 1.0015028548756435, "language_loss": 0.6032207, "learning_rate": 3.3195942878383293e-06, "loss": 0.62418807, "num_input_tokens_seen": 52141040, "step": 2433, "time_per_iteration": 4.134268045425415 }, { "auxiliary_loss_clip": 0.01185882, "auxiliary_loss_mlp": 0.01028517, "balance_loss_clip": 1.05534208, "balance_loss_mlp": 1.0182656, "epoch": 0.2926711958155474, "flos": 21397122103680.0, "grad_norm": 1.9107247588445682, "language_loss": 0.78021657, "learning_rate": 3.319008834376543e-06, "loss": 0.80236053, "num_input_tokens_seen": 52160730, "step": 2434, "time_per_iteration": 2.7216079235076904 }, { "auxiliary_loss_clip": 0.01162321, "auxiliary_loss_mlp": 0.01031888, "balance_loss_clip": 1.04842567, "balance_loss_mlp": 1.02218401, "epoch": 0.2927914387061865, "flos": 23185796688000.0, "grad_norm": 2.459901177462285, "language_loss": 0.88869387, "learning_rate": 3.3184231808221654e-06, "loss": 0.91063595, "num_input_tokens_seen": 52175055, "step": 2435, "time_per_iteration": 2.760983943939209 }, { "auxiliary_loss_clip": 0.01164509, "auxiliary_loss_mlp": 0.01030067, "balance_loss_clip": 1.05495834, "balance_loss_mlp": 1.02156782, "epoch": 0.29291168159682557, "flos": 22455553190400.0, "grad_norm": 2.74438637460682, "language_loss": 0.63145071, "learning_rate": 3.3178373272640394e-06, "loss": 0.65339649, "num_input_tokens_seen": 52194150, "step": 2436, "time_per_iteration": 2.812869071960449 }, { "auxiliary_loss_clip": 0.0119863, "auxiliary_loss_mlp": 0.01032737, "balance_loss_clip": 1.05552232, "balance_loss_mlp": 1.02412438, "epoch": 0.2930319244874647, "flos": 21170632896000.0, "grad_norm": 2.8328547184374244, "language_loss": 0.85114354, "learning_rate": 3.3172512737910387e-06, "loss": 0.87345725, "num_input_tokens_seen": 52211660, "step": 2437, "time_per_iteration": 3.573444128036499 }, { "auxiliary_loss_clip": 0.01186002, "auxiliary_loss_mlp": 0.01032698, "balance_loss_clip": 1.0532136, "balance_loss_mlp": 1.0240314, "epoch": 0.2931521673781038, "flos": 31357843931520.0, "grad_norm": 2.6416590212450197, "language_loss": 0.89017123, "learning_rate": 3.3166650204920674e-06, "loss": 0.91235828, "num_input_tokens_seen": 52232830, "step": 2438, "time_per_iteration": 3.6097118854522705 }, { "auxiliary_loss_clip": 0.01188376, "auxiliary_loss_mlp": 0.0103266, "balance_loss_clip": 1.05830407, "balance_loss_mlp": 1.02388072, "epoch": 0.29327241026874284, "flos": 24200990778240.0, "grad_norm": 1.8432197583996215, "language_loss": 0.81750166, "learning_rate": 3.316078567456059e-06, "loss": 0.83971202, "num_input_tokens_seen": 52250670, "step": 2439, "time_per_iteration": 2.6202855110168457 }, { "auxiliary_loss_clip": 0.01130926, "auxiliary_loss_mlp": 0.01025513, "balance_loss_clip": 1.04725409, "balance_loss_mlp": 1.01648855, "epoch": 0.29339265315938196, "flos": 24242611662720.0, "grad_norm": 3.1314712731901775, "language_loss": 0.75919116, "learning_rate": 3.3154919147719786e-06, "loss": 0.78075552, "num_input_tokens_seen": 52271685, "step": 2440, "time_per_iteration": 2.7564847469329834 }, { "auxiliary_loss_clip": 0.01186802, "auxiliary_loss_mlp": 0.01034739, "balance_loss_clip": 1.05441952, "balance_loss_mlp": 1.02563119, "epoch": 0.29351289605002107, "flos": 16946641134720.0, "grad_norm": 3.0156303243899405, "language_loss": 0.86625469, "learning_rate": 3.31490506252882e-06, "loss": 0.88847005, "num_input_tokens_seen": 52291065, "step": 2441, "time_per_iteration": 2.590186595916748 }, { "auxiliary_loss_clip": 0.01148656, "auxiliary_loss_mlp": 0.01032887, "balance_loss_clip": 1.04763675, "balance_loss_mlp": 1.02416658, "epoch": 0.2936331389406601, "flos": 19829082810240.0, "grad_norm": 1.8469770517470634, "language_loss": 0.8423996, "learning_rate": 3.31431801081561e-06, "loss": 0.86421502, "num_input_tokens_seen": 52310000, "step": 2442, "time_per_iteration": 2.7188212871551514 }, { "auxiliary_loss_clip": 0.01070704, "auxiliary_loss_mlp": 0.01003858, "balance_loss_clip": 1.01710725, "balance_loss_mlp": 1.00189137, "epoch": 0.29375338183129923, "flos": 71416844398080.0, "grad_norm": 0.8884605046579317, "language_loss": 0.67882746, "learning_rate": 3.313730759721402e-06, "loss": 0.6995731, "num_input_tokens_seen": 52372930, "step": 2443, "time_per_iteration": 3.3173913955688477 }, { "auxiliary_loss_clip": 0.01170234, "auxiliary_loss_mlp": 0.01030714, "balance_loss_clip": 1.05556679, "balance_loss_mlp": 1.02188039, "epoch": 0.29387362472193834, "flos": 22054502862720.0, "grad_norm": 2.0510558615251107, "language_loss": 0.87016106, "learning_rate": 3.313143309335282e-06, "loss": 0.89217055, "num_input_tokens_seen": 52391420, "step": 2444, "time_per_iteration": 2.685771942138672 }, { "auxiliary_loss_clip": 0.01157304, "auxiliary_loss_mlp": 0.01030519, "balance_loss_clip": 1.0530349, "balance_loss_mlp": 1.02213895, "epoch": 0.2939938676125774, "flos": 22966418373120.0, "grad_norm": 2.02889833744704, "language_loss": 0.85080636, "learning_rate": 3.3125556597463665e-06, "loss": 0.8726846, "num_input_tokens_seen": 52410725, "step": 2445, "time_per_iteration": 2.7091870307922363 }, { "auxiliary_loss_clip": 0.01186189, "auxiliary_loss_mlp": 0.01028195, "balance_loss_clip": 1.05737948, "balance_loss_mlp": 1.02026749, "epoch": 0.2941141105032165, "flos": 31358705857920.0, "grad_norm": 1.5668387550079224, "language_loss": 0.66323811, "learning_rate": 3.311967811043801e-06, "loss": 0.68538195, "num_input_tokens_seen": 52432645, "step": 2446, "time_per_iteration": 2.723324775695801 }, { "auxiliary_loss_clip": 0.01184052, "auxiliary_loss_mlp": 0.01028851, "balance_loss_clip": 1.05617642, "balance_loss_mlp": 1.02022004, "epoch": 0.29423435339385556, "flos": 23222138273280.0, "grad_norm": 2.8935287764486453, "language_loss": 0.82250184, "learning_rate": 3.3113797633167617e-06, "loss": 0.84463084, "num_input_tokens_seen": 52450940, "step": 2447, "time_per_iteration": 2.6904137134552 }, { "auxiliary_loss_clip": 0.01201157, "auxiliary_loss_mlp": 0.01027667, "balance_loss_clip": 1.05780005, "balance_loss_mlp": 1.01875067, "epoch": 0.2943545962844947, "flos": 26864054138880.0, "grad_norm": 2.418451203267424, "language_loss": 0.69242311, "learning_rate": 3.310791516654455e-06, "loss": 0.71471143, "num_input_tokens_seen": 52468000, "step": 2448, "time_per_iteration": 2.6190404891967773 }, { "auxiliary_loss_clip": 0.01163752, "auxiliary_loss_mlp": 0.0102917, "balance_loss_clip": 1.05236208, "balance_loss_mlp": 1.01939464, "epoch": 0.2944748391751338, "flos": 20231677422720.0, "grad_norm": 2.625249568010857, "language_loss": 0.79737365, "learning_rate": 3.3102030711461177e-06, "loss": 0.81930292, "num_input_tokens_seen": 52487575, "step": 2449, "time_per_iteration": 2.7124245166778564 }, { "auxiliary_loss_clip": 0.01158231, "auxiliary_loss_mlp": 0.01032777, "balance_loss_clip": 1.05126524, "balance_loss_mlp": 1.02368128, "epoch": 0.29459508206577284, "flos": 15960965045760.0, "grad_norm": 2.883461741914008, "language_loss": 0.68151021, "learning_rate": 3.3096144268810156e-06, "loss": 0.70342028, "num_input_tokens_seen": 52506335, "step": 2450, "time_per_iteration": 2.638737678527832 }, { "auxiliary_loss_clip": 0.01177764, "auxiliary_loss_mlp": 0.01029256, "balance_loss_clip": 1.05318737, "balance_loss_mlp": 1.01958179, "epoch": 0.29471532495641195, "flos": 20412882558720.0, "grad_norm": 1.9634107849911926, "language_loss": 0.73060155, "learning_rate": 3.3090255839484462e-06, "loss": 0.75267172, "num_input_tokens_seen": 52524330, "step": 2451, "time_per_iteration": 2.6517651081085205 }, { "auxiliary_loss_clip": 0.01172124, "auxiliary_loss_mlp": 0.01031472, "balance_loss_clip": 1.05168271, "balance_loss_mlp": 1.02216804, "epoch": 0.29483556784705106, "flos": 20376576887040.0, "grad_norm": 4.193378356667574, "language_loss": 0.85514486, "learning_rate": 3.3084365424377366e-06, "loss": 0.87718081, "num_input_tokens_seen": 52543095, "step": 2452, "time_per_iteration": 2.635852336883545 }, { "auxiliary_loss_clip": 0.01066202, "auxiliary_loss_mlp": 0.01009692, "balance_loss_clip": 1.02826595, "balance_loss_mlp": 1.00741529, "epoch": 0.2949558107376901, "flos": 68555660595840.0, "grad_norm": 0.7267308794142744, "language_loss": 0.55947506, "learning_rate": 3.307847302438245e-06, "loss": 0.58023405, "num_input_tokens_seen": 52597075, "step": 2453, "time_per_iteration": 3.2400941848754883 }, { "auxiliary_loss_clip": 0.01126889, "auxiliary_loss_mlp": 0.01033754, "balance_loss_clip": 1.0438627, "balance_loss_mlp": 1.02469444, "epoch": 0.2950760536283292, "flos": 16107085572480.0, "grad_norm": 2.574712245679792, "language_loss": 0.77621293, "learning_rate": 3.3072578640393562e-06, "loss": 0.79781938, "num_input_tokens_seen": 52614410, "step": 2454, "time_per_iteration": 2.7028069496154785 }, { "auxiliary_loss_clip": 0.01170393, "auxiliary_loss_mlp": 0.01032872, "balance_loss_clip": 1.05305529, "balance_loss_mlp": 1.02402687, "epoch": 0.29519629651896834, "flos": 20483626394880.0, "grad_norm": 2.0731907783948995, "language_loss": 0.79884654, "learning_rate": 3.3066682273304886e-06, "loss": 0.82087922, "num_input_tokens_seen": 52632055, "step": 2455, "time_per_iteration": 2.719906806945801 }, { "auxiliary_loss_clip": 0.01191615, "auxiliary_loss_mlp": 0.00763997, "balance_loss_clip": 1.05600107, "balance_loss_mlp": 1.00040317, "epoch": 0.2953165394096074, "flos": 18916484941440.0, "grad_norm": 2.0477286566281006, "language_loss": 0.78717607, "learning_rate": 3.3060783924010904e-06, "loss": 0.80673218, "num_input_tokens_seen": 52649980, "step": 2456, "time_per_iteration": 3.5529534816741943 }, { "auxiliary_loss_clip": 0.01157635, "auxiliary_loss_mlp": 0.01030897, "balance_loss_clip": 1.05199957, "balance_loss_mlp": 1.02137256, "epoch": 0.2954367823002465, "flos": 20624467622400.0, "grad_norm": 2.3640834019290162, "language_loss": 0.8504703, "learning_rate": 3.3054883593406387e-06, "loss": 0.87235558, "num_input_tokens_seen": 52664730, "step": 2457, "time_per_iteration": 2.7160181999206543 }, { "auxiliary_loss_clip": 0.01172088, "auxiliary_loss_mlp": 0.01034907, "balance_loss_clip": 1.05190444, "balance_loss_mlp": 1.02669358, "epoch": 0.2955570251908856, "flos": 31175525473920.0, "grad_norm": 2.132650035963358, "language_loss": 0.65337348, "learning_rate": 3.3048981282386404e-06, "loss": 0.67544341, "num_input_tokens_seen": 52686040, "step": 2458, "time_per_iteration": 2.7735068798065186 }, { "auxiliary_loss_clip": 0.01146172, "auxiliary_loss_mlp": 0.01031449, "balance_loss_clip": 1.05156922, "balance_loss_mlp": 1.02157879, "epoch": 0.29567726808152467, "flos": 21650328051840.0, "grad_norm": 1.9458781015591813, "language_loss": 0.82235259, "learning_rate": 3.304307699184634e-06, "loss": 0.84412879, "num_input_tokens_seen": 52704630, "step": 2459, "time_per_iteration": 3.623737096786499 }, { "auxiliary_loss_clip": 0.01175797, "auxiliary_loss_mlp": 0.01033505, "balance_loss_clip": 1.05615973, "balance_loss_mlp": 1.02485681, "epoch": 0.2957975109721638, "flos": 24243868638720.0, "grad_norm": 1.674108417774582, "language_loss": 0.78746378, "learning_rate": 3.3037170722681866e-06, "loss": 0.80955684, "num_input_tokens_seen": 52725465, "step": 2460, "time_per_iteration": 2.7167670726776123 }, { "auxiliary_loss_clip": 0.01150115, "auxiliary_loss_mlp": 0.01029564, "balance_loss_clip": 1.05109453, "balance_loss_mlp": 1.02042103, "epoch": 0.29591775386280283, "flos": 13479717352320.0, "grad_norm": 1.8576289432757902, "language_loss": 0.68161768, "learning_rate": 3.3031262475788956e-06, "loss": 0.70341444, "num_input_tokens_seen": 52742405, "step": 2461, "time_per_iteration": 2.7279629707336426 }, { "auxiliary_loss_clip": 0.01169856, "auxiliary_loss_mlp": 0.01027582, "balance_loss_clip": 1.05427384, "balance_loss_mlp": 1.0187906, "epoch": 0.29603799675344195, "flos": 17749783284480.0, "grad_norm": 2.1702633124896544, "language_loss": 0.73218375, "learning_rate": 3.3025352252063897e-06, "loss": 0.75415814, "num_input_tokens_seen": 52761100, "step": 2462, "time_per_iteration": 2.664778232574463 }, { "auxiliary_loss_clip": 0.01187036, "auxiliary_loss_mlp": 0.01031723, "balance_loss_clip": 1.05966806, "balance_loss_mlp": 1.02231109, "epoch": 0.29615823964408106, "flos": 22783920347520.0, "grad_norm": 1.7724278875271713, "language_loss": 0.75051969, "learning_rate": 3.3019440052403252e-06, "loss": 0.77270728, "num_input_tokens_seen": 52780965, "step": 2463, "time_per_iteration": 4.514355897903442 }, { "auxiliary_loss_clip": 0.01174261, "auxiliary_loss_mlp": 0.01031111, "balance_loss_clip": 1.05478764, "balance_loss_mlp": 1.02190197, "epoch": 0.2962784825347201, "flos": 23514199758720.0, "grad_norm": 2.2187383389293567, "language_loss": 0.70877004, "learning_rate": 3.30135258777039e-06, "loss": 0.73082376, "num_input_tokens_seen": 52800335, "step": 2464, "time_per_iteration": 2.7360997200012207 }, { "auxiliary_loss_clip": 0.01191077, "auxiliary_loss_mlp": 0.00764373, "balance_loss_clip": 1.05483794, "balance_loss_mlp": 1.00043404, "epoch": 0.2963987254253592, "flos": 16362769559040.0, "grad_norm": 2.304217941397056, "language_loss": 0.70460379, "learning_rate": 3.3007609728863024e-06, "loss": 0.72415829, "num_input_tokens_seen": 52818425, "step": 2465, "time_per_iteration": 2.5990383625030518 }, { "auxiliary_loss_clip": 0.01123274, "auxiliary_loss_mlp": 0.01027825, "balance_loss_clip": 1.05217564, "balance_loss_mlp": 1.0196588, "epoch": 0.29651896831599833, "flos": 33472263980160.0, "grad_norm": 2.126704081789709, "language_loss": 0.72858155, "learning_rate": 3.300169160677809e-06, "loss": 0.75009251, "num_input_tokens_seen": 52842340, "step": 2466, "time_per_iteration": 2.904372453689575 }, { "auxiliary_loss_clip": 0.0116655, "auxiliary_loss_mlp": 0.01032748, "balance_loss_clip": 1.05389416, "balance_loss_mlp": 1.02320552, "epoch": 0.2966392112066374, "flos": 23805363404160.0, "grad_norm": 2.5885375924364507, "language_loss": 0.77659857, "learning_rate": 3.2995771512346878e-06, "loss": 0.79859161, "num_input_tokens_seen": 52860690, "step": 2467, "time_per_iteration": 2.687779426574707 }, { "auxiliary_loss_clip": 0.01208041, "auxiliary_loss_mlp": 0.00764918, "balance_loss_clip": 1.06073987, "balance_loss_mlp": 1.0004915, "epoch": 0.2967594540972765, "flos": 19938466702080.0, "grad_norm": 1.9764046160098536, "language_loss": 0.73144019, "learning_rate": 3.298984944646746e-06, "loss": 0.7511698, "num_input_tokens_seen": 52879370, "step": 2468, "time_per_iteration": 2.6537508964538574 }, { "auxiliary_loss_clip": 0.01192347, "auxiliary_loss_mlp": 0.00764528, "balance_loss_clip": 1.05869532, "balance_loss_mlp": 1.0005095, "epoch": 0.2968796969879156, "flos": 23732823888000.0, "grad_norm": 2.6906080870447906, "language_loss": 0.82018733, "learning_rate": 3.298392541003822e-06, "loss": 0.83975607, "num_input_tokens_seen": 52898775, "step": 2469, "time_per_iteration": 2.6473257541656494 }, { "auxiliary_loss_clip": 0.0117555, "auxiliary_loss_mlp": 0.01020398, "balance_loss_clip": 1.05707383, "balance_loss_mlp": 1.01229215, "epoch": 0.29699993987855466, "flos": 22893699288960.0, "grad_norm": 1.740324877208916, "language_loss": 0.89414406, "learning_rate": 3.2977999403957806e-06, "loss": 0.9161036, "num_input_tokens_seen": 52917535, "step": 2470, "time_per_iteration": 2.7564756870269775 }, { "auxiliary_loss_clip": 0.01207034, "auxiliary_loss_mlp": 0.01033166, "balance_loss_clip": 1.06130755, "balance_loss_mlp": 1.02386749, "epoch": 0.2971201827691938, "flos": 33832555349760.0, "grad_norm": 4.263796001732715, "language_loss": 0.67156446, "learning_rate": 3.2972071429125207e-06, "loss": 0.69396651, "num_input_tokens_seen": 52938755, "step": 2471, "time_per_iteration": 2.7117512226104736 }, { "auxiliary_loss_clip": 0.01154827, "auxiliary_loss_mlp": 0.01035167, "balance_loss_clip": 1.05210972, "balance_loss_mlp": 1.02593446, "epoch": 0.2972404256598329, "flos": 22054359208320.0, "grad_norm": 2.748850826992151, "language_loss": 0.88424659, "learning_rate": 3.2966141486439682e-06, "loss": 0.90614653, "num_input_tokens_seen": 52957945, "step": 2472, "time_per_iteration": 2.7050881385803223 }, { "auxiliary_loss_clip": 0.01131424, "auxiliary_loss_mlp": 0.01033015, "balance_loss_clip": 1.04792929, "balance_loss_mlp": 1.02300107, "epoch": 0.29736066855047194, "flos": 31978595796480.0, "grad_norm": 2.1290290323555947, "language_loss": 0.65068579, "learning_rate": 3.29602095768008e-06, "loss": 0.67233014, "num_input_tokens_seen": 52978460, "step": 2473, "time_per_iteration": 2.854241132736206 }, { "auxiliary_loss_clip": 0.01167947, "auxiliary_loss_mlp": 0.01037977, "balance_loss_clip": 1.05530834, "balance_loss_mlp": 1.02937031, "epoch": 0.29748091144111105, "flos": 33510401245440.0, "grad_norm": 2.078410767951869, "language_loss": 0.64167035, "learning_rate": 3.2954275701108437e-06, "loss": 0.66372961, "num_input_tokens_seen": 52999640, "step": 2474, "time_per_iteration": 2.7921977043151855 }, { "auxiliary_loss_clip": 0.01140443, "auxiliary_loss_mlp": 0.01036199, "balance_loss_clip": 1.05080807, "balance_loss_mlp": 1.02644181, "epoch": 0.29760115433175016, "flos": 41283373409280.0, "grad_norm": 2.3633618237283227, "language_loss": 0.68829334, "learning_rate": 3.294833986026275e-06, "loss": 0.71005976, "num_input_tokens_seen": 53022880, "step": 2475, "time_per_iteration": 2.965322732925415 }, { "auxiliary_loss_clip": 0.01154853, "auxiliary_loss_mlp": 0.01031186, "balance_loss_clip": 1.0541501, "balance_loss_mlp": 1.02172065, "epoch": 0.2977213972223892, "flos": 24493339572480.0, "grad_norm": 2.1006562233543282, "language_loss": 0.85395849, "learning_rate": 3.29424020551642e-06, "loss": 0.87581885, "num_input_tokens_seen": 53041515, "step": 2476, "time_per_iteration": 2.7666096687316895 }, { "auxiliary_loss_clip": 0.0120782, "auxiliary_loss_mlp": 0.01036559, "balance_loss_clip": 1.05911422, "balance_loss_mlp": 1.02594924, "epoch": 0.2978416401130283, "flos": 21285116519040.0, "grad_norm": 2.5788125152917916, "language_loss": 0.72052622, "learning_rate": 3.2936462286713546e-06, "loss": 0.74296999, "num_input_tokens_seen": 53059865, "step": 2477, "time_per_iteration": 2.707519769668579 }, { "auxiliary_loss_clip": 0.01194282, "auxiliary_loss_mlp": 0.01032991, "balance_loss_clip": 1.05930793, "balance_loss_mlp": 1.02353144, "epoch": 0.2979618830036674, "flos": 25772154554880.0, "grad_norm": 1.8856476438125973, "language_loss": 0.7715075, "learning_rate": 3.2930520555811846e-06, "loss": 0.79378021, "num_input_tokens_seen": 53079490, "step": 2478, "time_per_iteration": 2.653155565261841 }, { "auxiliary_loss_clip": 0.01094414, "auxiliary_loss_mlp": 0.00765358, "balance_loss_clip": 1.04681492, "balance_loss_mlp": 1.00039649, "epoch": 0.2980821258943065, "flos": 23476996247040.0, "grad_norm": 1.8163278360308488, "language_loss": 0.79809916, "learning_rate": 3.292457686336046e-06, "loss": 0.81669688, "num_input_tokens_seen": 53098810, "step": 2479, "time_per_iteration": 3.0359456539154053 }, { "auxiliary_loss_clip": 0.01084514, "auxiliary_loss_mlp": 0.01001606, "balance_loss_clip": 1.02242446, "balance_loss_mlp": 0.99990124, "epoch": 0.2982023687849456, "flos": 69752314195200.0, "grad_norm": 0.8571747934078684, "language_loss": 0.61238003, "learning_rate": 3.291863121026105e-06, "loss": 0.6332413, "num_input_tokens_seen": 53162590, "step": 2480, "time_per_iteration": 3.634397029876709 }, { "auxiliary_loss_clip": 0.01191348, "auxiliary_loss_mlp": 0.01031083, "balance_loss_clip": 1.05806947, "balance_loss_mlp": 1.02173078, "epoch": 0.29832261167558466, "flos": 29825930741760.0, "grad_norm": 2.2938195479561903, "language_loss": 0.76813704, "learning_rate": 3.2912683597415547e-06, "loss": 0.79036134, "num_input_tokens_seen": 53186675, "step": 2481, "time_per_iteration": 2.7483150959014893 }, { "auxiliary_loss_clip": 0.0116298, "auxiliary_loss_mlp": 0.01030249, "balance_loss_clip": 1.05538726, "balance_loss_mlp": 1.02091515, "epoch": 0.29844285456622377, "flos": 33910158683520.0, "grad_norm": 3.692901810665582, "language_loss": 0.78226292, "learning_rate": 3.2906734025726213e-06, "loss": 0.80419517, "num_input_tokens_seen": 53205940, "step": 2482, "time_per_iteration": 3.8432607650756836 }, { "auxiliary_loss_clip": 0.01200753, "auxiliary_loss_mlp": 0.01039803, "balance_loss_clip": 1.0614028, "balance_loss_mlp": 1.03062391, "epoch": 0.2985630974568629, "flos": 23876933253120.0, "grad_norm": 2.5906711410169554, "language_loss": 0.88180351, "learning_rate": 3.290078249609559e-06, "loss": 0.90420902, "num_input_tokens_seen": 53225360, "step": 2483, "time_per_iteration": 2.753770351409912 }, { "auxiliary_loss_clip": 0.01191353, "auxiliary_loss_mlp": 0.01034447, "balance_loss_clip": 1.061728, "balance_loss_mlp": 1.02514887, "epoch": 0.29868334034750194, "flos": 21799106184960.0, "grad_norm": 2.0282227161839375, "language_loss": 0.87848169, "learning_rate": 3.2894829009426514e-06, "loss": 0.90073967, "num_input_tokens_seen": 53243195, "step": 2484, "time_per_iteration": 2.683485984802246 }, { "auxiliary_loss_clip": 0.01190725, "auxiliary_loss_mlp": 0.01036006, "balance_loss_clip": 1.05981696, "balance_loss_mlp": 1.02706575, "epoch": 0.29880358323814105, "flos": 25666649331840.0, "grad_norm": 1.9836738464607877, "language_loss": 0.77715284, "learning_rate": 3.288887356662213e-06, "loss": 0.79942018, "num_input_tokens_seen": 53264530, "step": 2485, "time_per_iteration": 3.6818442344665527 }, { "auxiliary_loss_clip": 0.01086721, "auxiliary_loss_mlp": 0.0100174, "balance_loss_clip": 1.02255845, "balance_loss_mlp": 1.00002337, "epoch": 0.29892382612878016, "flos": 71005846003200.0, "grad_norm": 0.7890467341990293, "language_loss": 0.59736454, "learning_rate": 3.288291616858588e-06, "loss": 0.61824918, "num_input_tokens_seen": 53319920, "step": 2486, "time_per_iteration": 3.0973880290985107 }, { "auxiliary_loss_clip": 0.01140716, "auxiliary_loss_mlp": 0.01036075, "balance_loss_clip": 1.05508351, "balance_loss_mlp": 1.02626419, "epoch": 0.2990440690194192, "flos": 25481134563840.0, "grad_norm": 2.5765667930303087, "language_loss": 0.77001148, "learning_rate": 3.287695681622149e-06, "loss": 0.7917794, "num_input_tokens_seen": 53339270, "step": 2487, "time_per_iteration": 2.8812294006347656 }, { "auxiliary_loss_clip": 0.01180501, "auxiliary_loss_mlp": 0.0102833, "balance_loss_clip": 1.05634618, "balance_loss_mlp": 1.01946688, "epoch": 0.2991643119100583, "flos": 23732357011200.0, "grad_norm": 2.1717729596790516, "language_loss": 0.81096554, "learning_rate": 3.2870995510432982e-06, "loss": 0.83305383, "num_input_tokens_seen": 53357750, "step": 2488, "time_per_iteration": 2.7497847080230713 }, { "auxiliary_loss_clip": 0.0118655, "auxiliary_loss_mlp": 0.01026703, "balance_loss_clip": 1.05871451, "balance_loss_mlp": 1.01830459, "epoch": 0.29928455480069743, "flos": 27417545786880.0, "grad_norm": 2.255519732523873, "language_loss": 0.7736901, "learning_rate": 3.2865032252124697e-06, "loss": 0.79582256, "num_input_tokens_seen": 53378265, "step": 2489, "time_per_iteration": 3.827218532562256 }, { "auxiliary_loss_clip": 0.01172374, "auxiliary_loss_mlp": 0.01027463, "balance_loss_clip": 1.05274212, "balance_loss_mlp": 1.01907063, "epoch": 0.2994047976913365, "flos": 33692935184640.0, "grad_norm": 1.5463425157771602, "language_loss": 0.77925354, "learning_rate": 3.2859067042201243e-06, "loss": 0.80125189, "num_input_tokens_seen": 53400305, "step": 2490, "time_per_iteration": 2.8704824447631836 }, { "auxiliary_loss_clip": 0.01107774, "auxiliary_loss_mlp": 0.01027948, "balance_loss_clip": 1.04541278, "balance_loss_mlp": 1.01922798, "epoch": 0.2995250405819756, "flos": 16763963541120.0, "grad_norm": 1.8098043887933326, "language_loss": 0.7814616, "learning_rate": 3.2853099881567544e-06, "loss": 0.80281883, "num_input_tokens_seen": 53418705, "step": 2491, "time_per_iteration": 2.7769112586975098 }, { "auxiliary_loss_clip": 0.01201444, "auxiliary_loss_mlp": 0.01026891, "balance_loss_clip": 1.05985343, "balance_loss_mlp": 1.01867795, "epoch": 0.29964528347261465, "flos": 22963976248320.0, "grad_norm": 1.8063621969857508, "language_loss": 0.79249883, "learning_rate": 3.284713077112881e-06, "loss": 0.8147822, "num_input_tokens_seen": 53438135, "step": 2492, "time_per_iteration": 2.737680673599243 }, { "auxiliary_loss_clip": 0.01167681, "auxiliary_loss_mlp": 0.01036417, "balance_loss_clip": 1.05673313, "balance_loss_mlp": 1.02584863, "epoch": 0.29976552636325376, "flos": 16938021870720.0, "grad_norm": 3.7142739908279965, "language_loss": 0.86570978, "learning_rate": 3.284115971179056e-06, "loss": 0.8877508, "num_input_tokens_seen": 53452165, "step": 2493, "time_per_iteration": 2.6638522148132324 }, { "auxiliary_loss_clip": 0.0113572, "auxiliary_loss_mlp": 0.01030007, "balance_loss_clip": 1.05198574, "balance_loss_mlp": 1.0214417, "epoch": 0.2998857692538929, "flos": 17056455989760.0, "grad_norm": 1.703724364981277, "language_loss": 0.78776354, "learning_rate": 3.283518670445859e-06, "loss": 0.80942082, "num_input_tokens_seen": 53470075, "step": 2494, "time_per_iteration": 2.7404215335845947 }, { "auxiliary_loss_clip": 0.01074703, "auxiliary_loss_mlp": 0.00754919, "balance_loss_clip": 1.02317786, "balance_loss_mlp": 1.00025368, "epoch": 0.30000601214453193, "flos": 68831528025600.0, "grad_norm": 0.6988227591603532, "language_loss": 0.54335272, "learning_rate": 3.2829211750038995e-06, "loss": 0.56164896, "num_input_tokens_seen": 53538705, "step": 2495, "time_per_iteration": 3.3492865562438965 }, { "auxiliary_loss_clip": 0.01156434, "auxiliary_loss_mlp": 0.01028526, "balance_loss_clip": 1.05181515, "balance_loss_mlp": 1.01996708, "epoch": 0.30012625503517104, "flos": 17603267708160.0, "grad_norm": 1.8786488999244617, "language_loss": 0.89301306, "learning_rate": 3.2823234849438183e-06, "loss": 0.91486263, "num_input_tokens_seen": 53556740, "step": 2496, "time_per_iteration": 2.687960386276245 }, { "auxiliary_loss_clip": 0.01180836, "auxiliary_loss_mlp": 0.01035388, "balance_loss_clip": 1.0561837, "balance_loss_mlp": 1.02617931, "epoch": 0.30024649792581015, "flos": 21252581775360.0, "grad_norm": 2.4030353844697596, "language_loss": 0.75814128, "learning_rate": 3.2817256003562836e-06, "loss": 0.78030348, "num_input_tokens_seen": 53577115, "step": 2497, "time_per_iteration": 2.750623941421509 }, { "auxiliary_loss_clip": 0.01135251, "auxiliary_loss_mlp": 0.01028595, "balance_loss_clip": 1.05164969, "balance_loss_mlp": 1.01893258, "epoch": 0.3003667408164492, "flos": 23003262748800.0, "grad_norm": 2.142463032523493, "language_loss": 0.66236413, "learning_rate": 3.281127521331995e-06, "loss": 0.68400264, "num_input_tokens_seen": 53598295, "step": 2498, "time_per_iteration": 2.8403232097625732 }, { "auxiliary_loss_clip": 0.0110212, "auxiliary_loss_mlp": 0.0100579, "balance_loss_clip": 1.02249146, "balance_loss_mlp": 1.00404918, "epoch": 0.3004869837070883, "flos": 64232340750720.0, "grad_norm": 0.8753208160624256, "language_loss": 0.60642016, "learning_rate": 3.2805292479616798e-06, "loss": 0.62749922, "num_input_tokens_seen": 53657160, "step": 2499, "time_per_iteration": 3.1126906871795654 }, { "auxiliary_loss_clip": 0.01176343, "auxiliary_loss_mlp": 0.01030848, "balance_loss_clip": 1.05574179, "balance_loss_mlp": 1.02199662, "epoch": 0.30060722659772743, "flos": 26248653400320.0, "grad_norm": 2.213076221387785, "language_loss": 0.92408389, "learning_rate": 3.2799307803360955e-06, "loss": 0.94615579, "num_input_tokens_seen": 53673090, "step": 2500, "time_per_iteration": 2.758390426635742 }, { "auxiliary_loss_clip": 0.0120174, "auxiliary_loss_mlp": 0.01028484, "balance_loss_clip": 1.05699325, "balance_loss_mlp": 1.02024031, "epoch": 0.3007274694883665, "flos": 24970879912320.0, "grad_norm": 1.500481568395846, "language_loss": 0.81473511, "learning_rate": 3.27933211854603e-06, "loss": 0.83703744, "num_input_tokens_seen": 53692145, "step": 2501, "time_per_iteration": 2.6924455165863037 }, { "auxiliary_loss_clip": 0.01177146, "auxiliary_loss_mlp": 0.01032422, "balance_loss_clip": 1.05751657, "balance_loss_mlp": 1.02323675, "epoch": 0.3008477123790056, "flos": 17055845458560.0, "grad_norm": 1.7515453063742352, "language_loss": 0.87011474, "learning_rate": 3.278733262682299e-06, "loss": 0.89221042, "num_input_tokens_seen": 53710000, "step": 2502, "time_per_iteration": 2.694704532623291 }, { "auxiliary_loss_clip": 0.01205529, "auxiliary_loss_mlp": 0.01036929, "balance_loss_clip": 1.05855966, "balance_loss_mlp": 1.02702248, "epoch": 0.3009679552696447, "flos": 21506398254720.0, "grad_norm": 2.0728986678160717, "language_loss": 0.82664561, "learning_rate": 3.2781342128357484e-06, "loss": 0.84907019, "num_input_tokens_seen": 53729355, "step": 2503, "time_per_iteration": 2.624593496322632 }, { "auxiliary_loss_clip": 0.0115846, "auxiliary_loss_mlp": 0.01028952, "balance_loss_clip": 1.05176854, "balance_loss_mlp": 1.02004671, "epoch": 0.30108819816028376, "flos": 21134004001920.0, "grad_norm": 2.5907989320226097, "language_loss": 0.807302, "learning_rate": 3.2775349690972547e-06, "loss": 0.82917613, "num_input_tokens_seen": 53743505, "step": 2504, "time_per_iteration": 2.7438576221466064 }, { "auxiliary_loss_clip": 0.01082332, "auxiliary_loss_mlp": 0.01002214, "balance_loss_clip": 1.01881611, "balance_loss_mlp": 1.00053334, "epoch": 0.30120844105092287, "flos": 71126434938240.0, "grad_norm": 0.7678316732222886, "language_loss": 0.51830083, "learning_rate": 3.276935531557722e-06, "loss": 0.5391463, "num_input_tokens_seen": 53808725, "step": 2505, "time_per_iteration": 3.3351778984069824 }, { "auxiliary_loss_clip": 0.01149407, "auxiliary_loss_mlp": 0.01032905, "balance_loss_clip": 1.05079353, "balance_loss_mlp": 1.02293301, "epoch": 0.301328683941562, "flos": 20264571302400.0, "grad_norm": 5.413042466296938, "language_loss": 0.79869533, "learning_rate": 3.2763359003080837e-06, "loss": 0.82051843, "num_input_tokens_seen": 53825680, "step": 2506, "time_per_iteration": 2.7837071418762207 }, { "auxiliary_loss_clip": 0.01079266, "auxiliary_loss_mlp": 0.0100237, "balance_loss_clip": 1.02116084, "balance_loss_mlp": 1.00046241, "epoch": 0.30144892683220104, "flos": 70648212240000.0, "grad_norm": 0.8543175300556911, "language_loss": 0.62473786, "learning_rate": 3.2757360754393047e-06, "loss": 0.64555424, "num_input_tokens_seen": 53889750, "step": 2507, "time_per_iteration": 4.191912889480591 }, { "auxiliary_loss_clip": 0.01186469, "auxiliary_loss_mlp": 0.01034054, "balance_loss_clip": 1.05433905, "balance_loss_mlp": 1.02445805, "epoch": 0.30156916972284015, "flos": 22820549241600.0, "grad_norm": 3.356270739661911, "language_loss": 0.64561367, "learning_rate": 3.2751360570423767e-06, "loss": 0.6678189, "num_input_tokens_seen": 53908135, "step": 2508, "time_per_iteration": 2.7321279048919678 }, { "auxiliary_loss_clip": 0.01173203, "auxiliary_loss_mlp": 0.01025885, "balance_loss_clip": 1.0546006, "balance_loss_mlp": 1.01722491, "epoch": 0.3016894126134792, "flos": 29899188529920.0, "grad_norm": 2.3887724132560177, "language_loss": 0.75779271, "learning_rate": 3.2745358452083236e-06, "loss": 0.77978355, "num_input_tokens_seen": 53931035, "step": 2509, "time_per_iteration": 2.7862813472747803 }, { "auxiliary_loss_clip": 0.01185595, "auxiliary_loss_mlp": 0.01028649, "balance_loss_clip": 1.05607796, "balance_loss_mlp": 1.02075732, "epoch": 0.3018096555041183, "flos": 21546331200000.0, "grad_norm": 1.4974425420085826, "language_loss": 0.82163697, "learning_rate": 3.2739354400281955e-06, "loss": 0.84377944, "num_input_tokens_seen": 53952255, "step": 2510, "time_per_iteration": 2.6842591762542725 }, { "auxiliary_loss_clip": 0.01068214, "auxiliary_loss_mlp": 0.00755169, "balance_loss_clip": 1.02035618, "balance_loss_mlp": 1.00016773, "epoch": 0.3019298983947574, "flos": 59136294597120.0, "grad_norm": 0.8607184144590732, "language_loss": 0.63678193, "learning_rate": 3.2733348415930744e-06, "loss": 0.65501571, "num_input_tokens_seen": 54014125, "step": 2511, "time_per_iteration": 4.225912809371948 }, { "auxiliary_loss_clip": 0.01155835, "auxiliary_loss_mlp": 0.01028997, "balance_loss_clip": 1.0549984, "balance_loss_mlp": 1.02040803, "epoch": 0.3020501412853965, "flos": 34423070941440.0, "grad_norm": 3.5904280977935503, "language_loss": 0.80539465, "learning_rate": 3.27273404999407e-06, "loss": 0.82724297, "num_input_tokens_seen": 54036345, "step": 2512, "time_per_iteration": 2.8455705642700195 }, { "auxiliary_loss_clip": 0.01079077, "auxiliary_loss_mlp": 0.01003105, "balance_loss_clip": 1.01898885, "balance_loss_mlp": 1.00128078, "epoch": 0.3021703841760356, "flos": 71008288128000.0, "grad_norm": 0.7968676786434831, "language_loss": 0.60457325, "learning_rate": 3.272133065322322e-06, "loss": 0.62539506, "num_input_tokens_seen": 54094615, "step": 2513, "time_per_iteration": 3.2093701362609863 }, { "auxiliary_loss_clip": 0.01199369, "auxiliary_loss_mlp": 0.01033631, "balance_loss_clip": 1.05607796, "balance_loss_mlp": 1.02530408, "epoch": 0.3022906270666747, "flos": 21510528318720.0, "grad_norm": 1.900517210764832, "language_loss": 0.79629225, "learning_rate": 3.271531887669e-06, "loss": 0.81862223, "num_input_tokens_seen": 54114675, "step": 2514, "time_per_iteration": 2.6656150817871094 }, { "auxiliary_loss_clip": 0.01149241, "auxiliary_loss_mlp": 0.01033164, "balance_loss_clip": 1.05070782, "balance_loss_mlp": 1.02387774, "epoch": 0.30241086995731375, "flos": 31132001168640.0, "grad_norm": 2.2723411582577677, "language_loss": 0.63235587, "learning_rate": 3.2709305171253015e-06, "loss": 0.65417993, "num_input_tokens_seen": 54134795, "step": 2515, "time_per_iteration": 3.8105626106262207 }, { "auxiliary_loss_clip": 0.01188737, "auxiliary_loss_mlp": 0.01037594, "balance_loss_clip": 1.05778158, "balance_loss_mlp": 1.02888548, "epoch": 0.30253111284795287, "flos": 23511542152320.0, "grad_norm": 2.174041968346941, "language_loss": 0.78224921, "learning_rate": 3.2703289537824536e-06, "loss": 0.80451262, "num_input_tokens_seen": 54154595, "step": 2516, "time_per_iteration": 3.5870609283447266 }, { "auxiliary_loss_clip": 0.01147314, "auxiliary_loss_mlp": 0.0103644, "balance_loss_clip": 1.05164325, "balance_loss_mlp": 1.02794683, "epoch": 0.302651355738592, "flos": 18725367651840.0, "grad_norm": 3.060411624782866, "language_loss": 0.79172039, "learning_rate": 3.269727197731714e-06, "loss": 0.81355792, "num_input_tokens_seen": 54167360, "step": 2517, "time_per_iteration": 2.765566110610962 }, { "auxiliary_loss_clip": 0.01136225, "auxiliary_loss_mlp": 0.01025685, "balance_loss_clip": 1.05282211, "balance_loss_mlp": 1.0171262, "epoch": 0.30277159862923103, "flos": 22418888382720.0, "grad_norm": 2.4456448979672216, "language_loss": 0.77986515, "learning_rate": 3.269125249064367e-06, "loss": 0.80148423, "num_input_tokens_seen": 54187055, "step": 2518, "time_per_iteration": 2.7988667488098145 }, { "auxiliary_loss_clip": 0.01202, "auxiliary_loss_mlp": 0.0102895, "balance_loss_clip": 1.05745769, "balance_loss_mlp": 1.01981282, "epoch": 0.30289184151987014, "flos": 22273126992000.0, "grad_norm": 1.6987891023528463, "language_loss": 0.83344972, "learning_rate": 3.2685231078717297e-06, "loss": 0.85575926, "num_input_tokens_seen": 54207245, "step": 2519, "time_per_iteration": 2.7061755657196045 }, { "auxiliary_loss_clip": 0.01149664, "auxiliary_loss_mlp": 0.00764356, "balance_loss_clip": 1.05249119, "balance_loss_mlp": 1.00040722, "epoch": 0.30301208441050925, "flos": 25225594231680.0, "grad_norm": 2.105991573089684, "language_loss": 0.75633693, "learning_rate": 3.267920774245145e-06, "loss": 0.77547717, "num_input_tokens_seen": 54226650, "step": 2520, "time_per_iteration": 2.7748045921325684 }, { "auxiliary_loss_clip": 0.01187463, "auxiliary_loss_mlp": 0.01035517, "balance_loss_clip": 1.05669475, "balance_loss_mlp": 1.02579522, "epoch": 0.3031323273011483, "flos": 23039245198080.0, "grad_norm": 2.1368505671351214, "language_loss": 0.84306633, "learning_rate": 3.2673182482759876e-06, "loss": 0.86529607, "num_input_tokens_seen": 54245765, "step": 2521, "time_per_iteration": 2.689213514328003 }, { "auxiliary_loss_clip": 0.01186175, "auxiliary_loss_mlp": 0.01025656, "balance_loss_clip": 1.05663514, "balance_loss_mlp": 1.01777601, "epoch": 0.3032525701917874, "flos": 18876695650560.0, "grad_norm": 1.985118739777771, "language_loss": 0.66315633, "learning_rate": 3.266715530055659e-06, "loss": 0.68527466, "num_input_tokens_seen": 54263915, "step": 2522, "time_per_iteration": 2.6304843425750732 }, { "auxiliary_loss_clip": 0.01179387, "auxiliary_loss_mlp": 0.01032416, "balance_loss_clip": 1.05462635, "balance_loss_mlp": 1.02378535, "epoch": 0.30337281308242653, "flos": 17782641250560.0, "grad_norm": 2.238378980812776, "language_loss": 0.80554807, "learning_rate": 3.2661126196755927e-06, "loss": 0.82766616, "num_input_tokens_seen": 54283025, "step": 2523, "time_per_iteration": 2.6478872299194336 }, { "auxiliary_loss_clip": 0.01099328, "auxiliary_loss_mlp": 0.01003083, "balance_loss_clip": 1.02046442, "balance_loss_mlp": 1.00135469, "epoch": 0.3034930559730656, "flos": 57824298426240.0, "grad_norm": 0.7816601314695385, "language_loss": 0.55879402, "learning_rate": 3.265509517227248e-06, "loss": 0.57981813, "num_input_tokens_seen": 54339840, "step": 2524, "time_per_iteration": 3.203942060470581 }, { "auxiliary_loss_clip": 0.01168959, "auxiliary_loss_mlp": 0.01029411, "balance_loss_clip": 1.05125678, "balance_loss_mlp": 1.02125156, "epoch": 0.3036132988637047, "flos": 14755587419520.0, "grad_norm": 1.9100876056966332, "language_loss": 0.8088308, "learning_rate": 3.264906222802115e-06, "loss": 0.83081448, "num_input_tokens_seen": 54357690, "step": 2525, "time_per_iteration": 2.680345296859741 }, { "auxiliary_loss_clip": 0.01202144, "auxiliary_loss_mlp": 0.01030922, "balance_loss_clip": 1.05718446, "balance_loss_mlp": 1.02211833, "epoch": 0.30373354175434375, "flos": 21033203460480.0, "grad_norm": 2.870908680385623, "language_loss": 0.78035665, "learning_rate": 3.264302736491715e-06, "loss": 0.80268729, "num_input_tokens_seen": 54377810, "step": 2526, "time_per_iteration": 2.612384557723999 }, { "auxiliary_loss_clip": 0.01188616, "auxiliary_loss_mlp": 0.01030834, "balance_loss_clip": 1.06028306, "balance_loss_mlp": 1.02248907, "epoch": 0.30385378464498286, "flos": 21143233797120.0, "grad_norm": 2.5467490701723325, "language_loss": 0.87875843, "learning_rate": 3.263699058387594e-06, "loss": 0.90095288, "num_input_tokens_seen": 54395245, "step": 2527, "time_per_iteration": 2.6477293968200684 }, { "auxiliary_loss_clip": 0.01153217, "auxiliary_loss_mlp": 0.01034518, "balance_loss_clip": 1.05004728, "balance_loss_mlp": 1.02539277, "epoch": 0.30397402753562197, "flos": 20629244131200.0, "grad_norm": 2.2197968156811747, "language_loss": 0.89905256, "learning_rate": 3.2630951885813315e-06, "loss": 0.92092991, "num_input_tokens_seen": 54412640, "step": 2528, "time_per_iteration": 2.732787609100342 }, { "auxiliary_loss_clip": 0.01173083, "auxiliary_loss_mlp": 0.01028987, "balance_loss_clip": 1.05082035, "balance_loss_mlp": 1.02044559, "epoch": 0.304094270426261, "flos": 15085678429440.0, "grad_norm": 2.0364507872405646, "language_loss": 0.78031754, "learning_rate": 3.262491127164533e-06, "loss": 0.80233824, "num_input_tokens_seen": 54431455, "step": 2529, "time_per_iteration": 2.6846084594726562 }, { "auxiliary_loss_clip": 0.01180586, "auxiliary_loss_mlp": 0.00764064, "balance_loss_clip": 1.05590427, "balance_loss_mlp": 1.00057173, "epoch": 0.30421451331690014, "flos": 13845216193920.0, "grad_norm": 2.7802427377644543, "language_loss": 0.80142665, "learning_rate": 3.2618868742288337e-06, "loss": 0.82087314, "num_input_tokens_seen": 54448380, "step": 2530, "time_per_iteration": 2.7011606693267822 }, { "auxiliary_loss_clip": 0.01188221, "auxiliary_loss_mlp": 0.01029013, "balance_loss_clip": 1.05889666, "balance_loss_mlp": 1.01973224, "epoch": 0.30433475620753925, "flos": 17384212615680.0, "grad_norm": 2.623697958218013, "language_loss": 0.72592634, "learning_rate": 3.261282429865899e-06, "loss": 0.74809873, "num_input_tokens_seen": 54466385, "step": 2531, "time_per_iteration": 2.6986594200134277 }, { "auxiliary_loss_clip": 0.01177399, "auxiliary_loss_mlp": 0.00763666, "balance_loss_clip": 1.0570972, "balance_loss_mlp": 1.00045156, "epoch": 0.3044549990981783, "flos": 18916951818240.0, "grad_norm": 2.0196712735308093, "language_loss": 0.72308493, "learning_rate": 3.2606777941674225e-06, "loss": 0.7424956, "num_input_tokens_seen": 54485040, "step": 2532, "time_per_iteration": 2.747828483581543 }, { "auxiliary_loss_clip": 0.01134296, "auxiliary_loss_mlp": 0.01030882, "balance_loss_clip": 1.05026889, "balance_loss_mlp": 1.02225077, "epoch": 0.3045752419888174, "flos": 21068431724160.0, "grad_norm": 2.148986370739615, "language_loss": 0.8448025, "learning_rate": 3.2600729672251276e-06, "loss": 0.86645424, "num_input_tokens_seen": 54502755, "step": 2533, "time_per_iteration": 3.7471518516540527 }, { "auxiliary_loss_clip": 0.01203273, "auxiliary_loss_mlp": 0.0076386, "balance_loss_clip": 1.05990338, "balance_loss_mlp": 1.00040722, "epoch": 0.3046954848794565, "flos": 29096405516160.0, "grad_norm": 2.2088525381443502, "language_loss": 0.66078496, "learning_rate": 3.259467949130765e-06, "loss": 0.68045628, "num_input_tokens_seen": 54524165, "step": 2534, "time_per_iteration": 2.7948508262634277 }, { "auxiliary_loss_clip": 0.01176037, "auxiliary_loss_mlp": 0.01029919, "balance_loss_clip": 1.05819368, "balance_loss_mlp": 1.02074611, "epoch": 0.3048157277700956, "flos": 20295346279680.0, "grad_norm": 2.756156218830885, "language_loss": 0.82855809, "learning_rate": 3.2588627399761164e-06, "loss": 0.85061771, "num_input_tokens_seen": 54540160, "step": 2535, "time_per_iteration": 2.713184118270874 }, { "auxiliary_loss_clip": 0.01174256, "auxiliary_loss_mlp": 0.0103117, "balance_loss_clip": 1.05795288, "balance_loss_mlp": 1.02237868, "epoch": 0.3049359706607347, "flos": 22739929165440.0, "grad_norm": 1.9973814974975044, "language_loss": 0.70652384, "learning_rate": 3.2582573398529903e-06, "loss": 0.72857809, "num_input_tokens_seen": 54557515, "step": 2536, "time_per_iteration": 2.7494189739227295 }, { "auxiliary_loss_clip": 0.01158783, "auxiliary_loss_mlp": 0.01033747, "balance_loss_clip": 1.05181146, "balance_loss_mlp": 1.0247705, "epoch": 0.3050562135513738, "flos": 18434634969600.0, "grad_norm": 2.2338474890683324, "language_loss": 0.74292922, "learning_rate": 3.2576517488532265e-06, "loss": 0.76485455, "num_input_tokens_seen": 54573865, "step": 2537, "time_per_iteration": 3.6021199226379395 }, { "auxiliary_loss_clip": 0.0118657, "auxiliary_loss_mlp": 0.01026596, "balance_loss_clip": 1.05521274, "balance_loss_mlp": 1.01798892, "epoch": 0.30517645644201286, "flos": 20370327920640.0, "grad_norm": 1.870424540584384, "language_loss": 0.87736702, "learning_rate": 3.257045967068692e-06, "loss": 0.89949864, "num_input_tokens_seen": 54593120, "step": 2538, "time_per_iteration": 2.7130579948425293 }, { "auxiliary_loss_clip": 0.01204406, "auxiliary_loss_mlp": 0.01033336, "balance_loss_clip": 1.05968511, "balance_loss_mlp": 1.0242517, "epoch": 0.30529669933265197, "flos": 21945118970880.0, "grad_norm": 1.9850244089208842, "language_loss": 0.82122552, "learning_rate": 3.2564399945912848e-06, "loss": 0.8436029, "num_input_tokens_seen": 54612910, "step": 2539, "time_per_iteration": 2.6539738178253174 }, { "auxiliary_loss_clip": 0.0114717, "auxiliary_loss_mlp": 0.01034352, "balance_loss_clip": 1.05190563, "balance_loss_mlp": 1.02570391, "epoch": 0.305416942223291, "flos": 21835411856640.0, "grad_norm": 2.9930199525763252, "language_loss": 0.82151151, "learning_rate": 3.2558338315129287e-06, "loss": 0.84332675, "num_input_tokens_seen": 54631055, "step": 2540, "time_per_iteration": 3.7890100479125977 }, { "auxiliary_loss_clip": 0.01181407, "auxiliary_loss_mlp": 0.01028476, "balance_loss_clip": 1.05699944, "balance_loss_mlp": 1.02019048, "epoch": 0.30553718511393013, "flos": 33911810709120.0, "grad_norm": 3.6114232261528993, "language_loss": 0.7593928, "learning_rate": 3.2552274779255785e-06, "loss": 0.78149164, "num_input_tokens_seen": 54651985, "step": 2541, "time_per_iteration": 3.6540184020996094 }, { "auxiliary_loss_clip": 0.01186341, "auxiliary_loss_mlp": 0.01030843, "balance_loss_clip": 1.05877066, "balance_loss_mlp": 1.02228975, "epoch": 0.30565742800456924, "flos": 22268530051200.0, "grad_norm": 2.270128575365527, "language_loss": 0.77071536, "learning_rate": 3.2546209339212184e-06, "loss": 0.79288715, "num_input_tokens_seen": 54671005, "step": 2542, "time_per_iteration": 2.69398832321167 }, { "auxiliary_loss_clip": 0.01173758, "auxiliary_loss_mlp": 0.01027191, "balance_loss_clip": 1.0535332, "balance_loss_mlp": 1.01909649, "epoch": 0.3057776708952083, "flos": 22565044823040.0, "grad_norm": 1.7821031026213785, "language_loss": 0.77578527, "learning_rate": 3.25401419959186e-06, "loss": 0.79779476, "num_input_tokens_seen": 54691615, "step": 2543, "time_per_iteration": 2.783055067062378 }, { "auxiliary_loss_clip": 0.01186232, "auxiliary_loss_mlp": 0.01032921, "balance_loss_clip": 1.06113386, "balance_loss_mlp": 1.0245347, "epoch": 0.3058979137858474, "flos": 21799213925760.0, "grad_norm": 2.0929887151446263, "language_loss": 0.76067483, "learning_rate": 3.253407275029545e-06, "loss": 0.78286636, "num_input_tokens_seen": 54710520, "step": 2544, "time_per_iteration": 2.7258830070495605 }, { "auxiliary_loss_clip": 0.01162606, "auxiliary_loss_mlp": 0.0103511, "balance_loss_clip": 1.05553555, "balance_loss_mlp": 1.02574563, "epoch": 0.3060181566764865, "flos": 26979435601920.0, "grad_norm": 1.8991332383649282, "language_loss": 0.80180836, "learning_rate": 3.2528001603263425e-06, "loss": 0.82378554, "num_input_tokens_seen": 54732590, "step": 2545, "time_per_iteration": 2.785421371459961 }, { "auxiliary_loss_clip": 0.01190103, "auxiliary_loss_mlp": 0.01032186, "balance_loss_clip": 1.06193876, "balance_loss_mlp": 1.02335262, "epoch": 0.3061383995671256, "flos": 19865101173120.0, "grad_norm": 2.3072770908602207, "language_loss": 0.81228584, "learning_rate": 3.2521928555743514e-06, "loss": 0.83450872, "num_input_tokens_seen": 54749935, "step": 2546, "time_per_iteration": 2.6757330894470215 }, { "auxiliary_loss_clip": 0.01167876, "auxiliary_loss_mlp": 0.00763363, "balance_loss_clip": 1.05493462, "balance_loss_mlp": 1.00052094, "epoch": 0.3062586424577647, "flos": 22127509255680.0, "grad_norm": 2.2559636464682122, "language_loss": 0.66995209, "learning_rate": 3.2515853608657e-06, "loss": 0.68926454, "num_input_tokens_seen": 54767935, "step": 2547, "time_per_iteration": 2.732675790786743 }, { "auxiliary_loss_clip": 0.01184236, "auxiliary_loss_mlp": 0.01034656, "balance_loss_clip": 1.05810332, "balance_loss_mlp": 1.0254178, "epoch": 0.3063788853484038, "flos": 20845497962880.0, "grad_norm": 3.181600729558309, "language_loss": 0.75058985, "learning_rate": 3.250977676292545e-06, "loss": 0.77277875, "num_input_tokens_seen": 54786175, "step": 2548, "time_per_iteration": 2.7223854064941406 }, { "auxiliary_loss_clip": 0.01176378, "auxiliary_loss_mlp": 0.01035136, "balance_loss_clip": 1.05557203, "balance_loss_mlp": 1.02632082, "epoch": 0.30649912823904285, "flos": 16209717707520.0, "grad_norm": 2.1858008360544545, "language_loss": 0.79537177, "learning_rate": 3.2503698019470712e-06, "loss": 0.81748694, "num_input_tokens_seen": 54801945, "step": 2549, "time_per_iteration": 2.711002826690674 }, { "auxiliary_loss_clip": 0.01185126, "auxiliary_loss_mlp": 0.0103239, "balance_loss_clip": 1.05447507, "balance_loss_mlp": 1.02398002, "epoch": 0.30661937112968196, "flos": 18617815353600.0, "grad_norm": 2.0406707051112374, "language_loss": 0.78466588, "learning_rate": 3.249761737921492e-06, "loss": 0.80684102, "num_input_tokens_seen": 54818475, "step": 2550, "time_per_iteration": 2.643123149871826 }, { "auxiliary_loss_clip": 0.01171799, "auxiliary_loss_mlp": 0.01035411, "balance_loss_clip": 1.05829358, "balance_loss_mlp": 1.02638125, "epoch": 0.30673961402032107, "flos": 31390809638400.0, "grad_norm": 2.1476164101227813, "language_loss": 0.74512768, "learning_rate": 3.249153484308051e-06, "loss": 0.76719975, "num_input_tokens_seen": 54837090, "step": 2551, "time_per_iteration": 2.7386698722839355 }, { "auxiliary_loss_clip": 0.01130321, "auxiliary_loss_mlp": 0.01030342, "balance_loss_clip": 1.04781103, "balance_loss_mlp": 1.02137709, "epoch": 0.3068598569109601, "flos": 20229809915520.0, "grad_norm": 2.5325233156785147, "language_loss": 0.77970266, "learning_rate": 3.2485450411990194e-06, "loss": 0.80130935, "num_input_tokens_seen": 54856445, "step": 2552, "time_per_iteration": 2.7836077213287354 }, { "auxiliary_loss_clip": 0.01202424, "auxiliary_loss_mlp": 0.01028879, "balance_loss_clip": 1.05735028, "balance_loss_mlp": 1.02009964, "epoch": 0.30698009980159924, "flos": 29601991399680.0, "grad_norm": 3.1293389870743047, "language_loss": 0.82261252, "learning_rate": 3.2479364086866983e-06, "loss": 0.84492558, "num_input_tokens_seen": 54876700, "step": 2553, "time_per_iteration": 2.6642520427703857 }, { "auxiliary_loss_clip": 0.01177213, "auxiliary_loss_mlp": 0.00764112, "balance_loss_clip": 1.05974722, "balance_loss_mlp": 1.00072467, "epoch": 0.30710034269223835, "flos": 23842423261440.0, "grad_norm": 1.900181989570923, "language_loss": 0.8152833, "learning_rate": 3.247327586863416e-06, "loss": 0.83469653, "num_input_tokens_seen": 54897580, "step": 2554, "time_per_iteration": 2.75164532661438 }, { "auxiliary_loss_clip": 0.01163208, "auxiliary_loss_mlp": 0.010293, "balance_loss_clip": 1.05371976, "balance_loss_mlp": 1.01984692, "epoch": 0.3072205855828774, "flos": 25884986152320.0, "grad_norm": 2.2947512867564877, "language_loss": 0.76993906, "learning_rate": 3.2467185758215304e-06, "loss": 0.7918641, "num_input_tokens_seen": 54917320, "step": 2555, "time_per_iteration": 2.7952990531921387 }, { "auxiliary_loss_clip": 0.01164664, "auxiliary_loss_mlp": 0.00763116, "balance_loss_clip": 1.05754578, "balance_loss_mlp": 1.00062752, "epoch": 0.3073408284735165, "flos": 22236390357120.0, "grad_norm": 2.3535564430192712, "language_loss": 0.85708416, "learning_rate": 3.246109375653428e-06, "loss": 0.87636197, "num_input_tokens_seen": 54934085, "step": 2556, "time_per_iteration": 2.7071268558502197 }, { "auxiliary_loss_clip": 0.01201734, "auxiliary_loss_mlp": 0.01035999, "balance_loss_clip": 1.0585382, "balance_loss_mlp": 1.0272193, "epoch": 0.30746107136415557, "flos": 19500284689920.0, "grad_norm": 2.9537839152332617, "language_loss": 0.78637147, "learning_rate": 3.2454999864515243e-06, "loss": 0.80874878, "num_input_tokens_seen": 54953460, "step": 2557, "time_per_iteration": 2.588740825653076 }, { "auxiliary_loss_clip": 0.0116739, "auxiliary_loss_mlp": 0.00764454, "balance_loss_clip": 1.05395174, "balance_loss_mlp": 1.00051045, "epoch": 0.3075813142547947, "flos": 21724806902400.0, "grad_norm": 1.9159072587725827, "language_loss": 0.69212139, "learning_rate": 3.244890408308263e-06, "loss": 0.71143985, "num_input_tokens_seen": 54974165, "step": 2558, "time_per_iteration": 2.6535444259643555 }, { "auxiliary_loss_clip": 0.01143364, "auxiliary_loss_mlp": 0.01025364, "balance_loss_clip": 1.04961431, "balance_loss_mlp": 1.01696527, "epoch": 0.3077015571454338, "flos": 24097963593600.0, "grad_norm": 2.0314973443564632, "language_loss": 0.61608171, "learning_rate": 3.2442806413161165e-06, "loss": 0.63776898, "num_input_tokens_seen": 54993810, "step": 2559, "time_per_iteration": 3.679738759994507 }, { "auxiliary_loss_clip": 0.0115003, "auxiliary_loss_mlp": 0.01024618, "balance_loss_clip": 1.05430663, "balance_loss_mlp": 1.01621985, "epoch": 0.30782180003607285, "flos": 18405476104320.0, "grad_norm": 2.4819398896672857, "language_loss": 0.75627804, "learning_rate": 3.243670685567586e-06, "loss": 0.77802455, "num_input_tokens_seen": 55011210, "step": 2560, "time_per_iteration": 2.707723617553711 }, { "auxiliary_loss_clip": 0.01173032, "auxiliary_loss_mlp": 0.00764056, "balance_loss_clip": 1.05682278, "balance_loss_mlp": 1.00051355, "epoch": 0.30794204292671196, "flos": 23878549365120.0, "grad_norm": 2.746445225789566, "language_loss": 0.7982195, "learning_rate": 3.2430605411552012e-06, "loss": 0.81759042, "num_input_tokens_seen": 55031325, "step": 2561, "time_per_iteration": 2.740462303161621 }, { "auxiliary_loss_clip": 0.01076164, "auxiliary_loss_mlp": 0.01008684, "balance_loss_clip": 1.02472615, "balance_loss_mlp": 1.00696778, "epoch": 0.30806228581735107, "flos": 67927800816000.0, "grad_norm": 0.8941780687140051, "language_loss": 0.70603007, "learning_rate": 3.2424502081715205e-06, "loss": 0.72687852, "num_input_tokens_seen": 55094440, "step": 2562, "time_per_iteration": 3.3082668781280518 }, { "auxiliary_loss_clip": 0.01176855, "auxiliary_loss_mlp": 0.01032376, "balance_loss_clip": 1.05657792, "balance_loss_mlp": 1.02404356, "epoch": 0.3081825287079901, "flos": 23843213360640.0, "grad_norm": 2.396378388871925, "language_loss": 0.78102612, "learning_rate": 3.241839686709132e-06, "loss": 0.80311847, "num_input_tokens_seen": 55115375, "step": 2563, "time_per_iteration": 3.642937421798706 }, { "auxiliary_loss_clip": 0.01184281, "auxiliary_loss_mlp": 0.0102845, "balance_loss_clip": 1.05328131, "balance_loss_mlp": 1.01915121, "epoch": 0.30830277159862923, "flos": 16209969102720.0, "grad_norm": 2.426865793171944, "language_loss": 0.82155353, "learning_rate": 3.2412289768606495e-06, "loss": 0.84368086, "num_input_tokens_seen": 55131945, "step": 2564, "time_per_iteration": 2.639086961746216 }, { "auxiliary_loss_clip": 0.01190844, "auxiliary_loss_mlp": 0.01032871, "balance_loss_clip": 1.05612886, "balance_loss_mlp": 1.02413273, "epoch": 0.30842301448926834, "flos": 29349503723520.0, "grad_norm": 2.163959129570945, "language_loss": 0.82894349, "learning_rate": 3.240618078718718e-06, "loss": 0.85118067, "num_input_tokens_seen": 55153405, "step": 2565, "time_per_iteration": 2.6950745582580566 }, { "auxiliary_loss_clip": 0.01157836, "auxiliary_loss_mlp": 0.0103485, "balance_loss_clip": 1.05172563, "balance_loss_mlp": 1.02545643, "epoch": 0.3085432573799074, "flos": 21945190798080.0, "grad_norm": 2.1595072409700844, "language_loss": 0.74630737, "learning_rate": 3.240006992376011e-06, "loss": 0.76823425, "num_input_tokens_seen": 55173030, "step": 2566, "time_per_iteration": 3.673495292663574 }, { "auxiliary_loss_clip": 0.01179735, "auxiliary_loss_mlp": 0.0103074, "balance_loss_clip": 1.05790925, "balance_loss_mlp": 1.02188241, "epoch": 0.3086635002705465, "flos": 22054718344320.0, "grad_norm": 2.817728080030157, "language_loss": 0.75982308, "learning_rate": 3.2393957179252284e-06, "loss": 0.78192782, "num_input_tokens_seen": 55189565, "step": 2567, "time_per_iteration": 3.53012752532959 }, { "auxiliary_loss_clip": 0.01203306, "auxiliary_loss_mlp": 0.01033142, "balance_loss_clip": 1.05854177, "balance_loss_mlp": 1.02387977, "epoch": 0.3087837431611856, "flos": 32665925520000.0, "grad_norm": 2.043182069824605, "language_loss": 0.80890149, "learning_rate": 3.2387842554591016e-06, "loss": 0.83126593, "num_input_tokens_seen": 55210380, "step": 2568, "time_per_iteration": 2.682248115539551 }, { "auxiliary_loss_clip": 0.01205315, "auxiliary_loss_mlp": 0.01034431, "balance_loss_clip": 1.05966926, "balance_loss_mlp": 1.02535868, "epoch": 0.3089039860518247, "flos": 17599245384960.0, "grad_norm": 2.261043135377255, "language_loss": 0.88242084, "learning_rate": 3.238172605070388e-06, "loss": 0.9048183, "num_input_tokens_seen": 55225795, "step": 2569, "time_per_iteration": 2.5525317192077637 }, { "auxiliary_loss_clip": 0.01189687, "auxiliary_loss_mlp": 0.00764496, "balance_loss_clip": 1.0595243, "balance_loss_mlp": 1.00047648, "epoch": 0.3090242289424638, "flos": 14383839611520.0, "grad_norm": 8.008105498194258, "language_loss": 0.78411329, "learning_rate": 3.2375607668518745e-06, "loss": 0.80365515, "num_input_tokens_seen": 55238830, "step": 2570, "time_per_iteration": 2.591534376144409 }, { "auxiliary_loss_clip": 0.01164644, "auxiliary_loss_mlp": 0.01031063, "balance_loss_clip": 1.0544374, "balance_loss_mlp": 1.0217768, "epoch": 0.30914447183310284, "flos": 16068625084800.0, "grad_norm": 2.1591151238592325, "language_loss": 0.90010691, "learning_rate": 3.236948740896377e-06, "loss": 0.92206395, "num_input_tokens_seen": 55253630, "step": 2571, "time_per_iteration": 2.6720001697540283 }, { "auxiliary_loss_clip": 0.01189599, "auxiliary_loss_mlp": 0.01038132, "balance_loss_clip": 1.05844569, "balance_loss_mlp": 1.02811861, "epoch": 0.30926471472374195, "flos": 32230221546240.0, "grad_norm": 33.418514357449105, "language_loss": 0.84684944, "learning_rate": 3.2363365272967384e-06, "loss": 0.8691268, "num_input_tokens_seen": 55276200, "step": 2572, "time_per_iteration": 2.722968101501465 }, { "auxiliary_loss_clip": 0.0119176, "auxiliary_loss_mlp": 0.01039983, "balance_loss_clip": 1.06210589, "balance_loss_mlp": 1.02982664, "epoch": 0.30938495761438106, "flos": 20370722970240.0, "grad_norm": 2.2945231579371947, "language_loss": 0.81879592, "learning_rate": 3.235724126145832e-06, "loss": 0.84111333, "num_input_tokens_seen": 55292235, "step": 2573, "time_per_iteration": 2.678511381149292 }, { "auxiliary_loss_clip": 0.01177926, "auxiliary_loss_mlp": 0.01031551, "balance_loss_clip": 1.05394983, "balance_loss_mlp": 1.02251506, "epoch": 0.3095052005050201, "flos": 24061155131520.0, "grad_norm": 1.7425188555628297, "language_loss": 0.78034097, "learning_rate": 3.235111537536558e-06, "loss": 0.80243576, "num_input_tokens_seen": 55313050, "step": 2574, "time_per_iteration": 2.7052993774414062 }, { "auxiliary_loss_clip": 0.0118958, "auxiliary_loss_mlp": 0.010319, "balance_loss_clip": 1.05735552, "balance_loss_mlp": 1.02317953, "epoch": 0.30962544339565923, "flos": 23401547729280.0, "grad_norm": 2.4435278207919637, "language_loss": 0.83063269, "learning_rate": 3.2344987615618456e-06, "loss": 0.85284752, "num_input_tokens_seen": 55332885, "step": 2575, "time_per_iteration": 2.6447343826293945 }, { "auxiliary_loss_clip": 0.01159932, "auxiliary_loss_mlp": 0.01035848, "balance_loss_clip": 1.05717945, "balance_loss_mlp": 1.02760434, "epoch": 0.30974568628629834, "flos": 33799984692480.0, "grad_norm": 1.8565731782668493, "language_loss": 0.78786826, "learning_rate": 3.2338857983146533e-06, "loss": 0.80982608, "num_input_tokens_seen": 55354385, "step": 2576, "time_per_iteration": 2.9394004344940186 }, { "auxiliary_loss_clip": 0.01164945, "auxiliary_loss_mlp": 0.01033012, "balance_loss_clip": 1.05483437, "balance_loss_mlp": 1.02386248, "epoch": 0.3098659291769374, "flos": 20229594433920.0, "grad_norm": 1.8312249764822368, "language_loss": 0.76357073, "learning_rate": 3.233272647887966e-06, "loss": 0.7855503, "num_input_tokens_seen": 55373275, "step": 2577, "time_per_iteration": 2.6968865394592285 }, { "auxiliary_loss_clip": 0.01205478, "auxiliary_loss_mlp": 0.01030554, "balance_loss_clip": 1.06135154, "balance_loss_mlp": 1.02136874, "epoch": 0.3099861720675765, "flos": 24748556682240.0, "grad_norm": 1.6927839550756358, "language_loss": 0.90434593, "learning_rate": 3.2326593103747985e-06, "loss": 0.92670631, "num_input_tokens_seen": 55392290, "step": 2578, "time_per_iteration": 2.6784839630126953 }, { "auxiliary_loss_clip": 0.01188863, "auxiliary_loss_mlp": 0.01030454, "balance_loss_clip": 1.06004572, "balance_loss_mlp": 1.02151322, "epoch": 0.3101064149582156, "flos": 11765485704960.0, "grad_norm": 2.6965111785922296, "language_loss": 0.8487767, "learning_rate": 3.2320457858681936e-06, "loss": 0.87096989, "num_input_tokens_seen": 55410680, "step": 2579, "time_per_iteration": 2.641016960144043 }, { "auxiliary_loss_clip": 0.01171592, "auxiliary_loss_mlp": 0.01034938, "balance_loss_clip": 1.05468464, "balance_loss_mlp": 1.02694511, "epoch": 0.31022665784885467, "flos": 23033247626880.0, "grad_norm": 3.71921468736521, "language_loss": 0.85400718, "learning_rate": 3.2314320744612228e-06, "loss": 0.87607253, "num_input_tokens_seen": 55425980, "step": 2580, "time_per_iteration": 2.734492540359497 }, { "auxiliary_loss_clip": 0.01184975, "auxiliary_loss_mlp": 0.01032553, "balance_loss_clip": 1.05757487, "balance_loss_mlp": 1.02384496, "epoch": 0.3103469007394938, "flos": 16289188548480.0, "grad_norm": 1.650737736578647, "language_loss": 0.76373357, "learning_rate": 3.2308181762469854e-06, "loss": 0.78590882, "num_input_tokens_seen": 55443925, "step": 2581, "time_per_iteration": 2.591212272644043 }, { "auxiliary_loss_clip": 0.01206415, "auxiliary_loss_mlp": 0.01031096, "balance_loss_clip": 1.05939579, "balance_loss_mlp": 1.02225721, "epoch": 0.3104671436301329, "flos": 30515271626880.0, "grad_norm": 2.09458243121925, "language_loss": 0.78497112, "learning_rate": 3.230204091318609e-06, "loss": 0.80734622, "num_input_tokens_seen": 55464465, "step": 2582, "time_per_iteration": 2.651573657989502 }, { "auxiliary_loss_clip": 0.01204779, "auxiliary_loss_mlp": 0.00764298, "balance_loss_clip": 1.06062913, "balance_loss_mlp": 1.00055909, "epoch": 0.31058738652077195, "flos": 20047240062720.0, "grad_norm": 2.0390617803814455, "language_loss": 0.84664947, "learning_rate": 3.2295898197692503e-06, "loss": 0.86634022, "num_input_tokens_seen": 55483425, "step": 2583, "time_per_iteration": 2.5570414066314697 }, { "auxiliary_loss_clip": 0.01201967, "auxiliary_loss_mlp": 0.01034719, "balance_loss_clip": 1.05916786, "balance_loss_mlp": 1.02615333, "epoch": 0.31070762941141106, "flos": 28074639237120.0, "grad_norm": 1.769656889703578, "language_loss": 0.79186171, "learning_rate": 3.228975361692094e-06, "loss": 0.81422859, "num_input_tokens_seen": 55504445, "step": 2584, "time_per_iteration": 2.6346144676208496 }, { "auxiliary_loss_clip": 0.01192792, "auxiliary_loss_mlp": 0.00764988, "balance_loss_clip": 1.05662787, "balance_loss_mlp": 1.00067759, "epoch": 0.31082787230205017, "flos": 20521907314560.0, "grad_norm": 2.2628548258909347, "language_loss": 0.80352134, "learning_rate": 3.228360717180352e-06, "loss": 0.82309914, "num_input_tokens_seen": 55521970, "step": 2585, "time_per_iteration": 3.8267152309417725 }, { "auxiliary_loss_clip": 0.01110977, "auxiliary_loss_mlp": 0.00755242, "balance_loss_clip": 1.03085256, "balance_loss_mlp": 1.00013697, "epoch": 0.3109481151926892, "flos": 62445928723200.0, "grad_norm": 0.8115967776018719, "language_loss": 0.59379828, "learning_rate": 3.227745886327266e-06, "loss": 0.61246049, "num_input_tokens_seen": 55580665, "step": 2586, "time_per_iteration": 3.1432158946990967 }, { "auxiliary_loss_clip": 0.01111219, "auxiliary_loss_mlp": 0.01007313, "balance_loss_clip": 1.03095448, "balance_loss_mlp": 1.00562024, "epoch": 0.31106835808332833, "flos": 44746744723200.0, "grad_norm": 0.8258452167089132, "language_loss": 0.55849344, "learning_rate": 3.227130869226105e-06, "loss": 0.57967877, "num_input_tokens_seen": 55637825, "step": 2587, "time_per_iteration": 3.260242462158203 }, { "auxiliary_loss_clip": 0.01187964, "auxiliary_loss_mlp": 0.01033791, "balance_loss_clip": 1.05566764, "balance_loss_mlp": 1.02496934, "epoch": 0.3111886009739674, "flos": 23403056100480.0, "grad_norm": 6.370900320680225, "language_loss": 0.82553178, "learning_rate": 3.226515665970167e-06, "loss": 0.84774935, "num_input_tokens_seen": 55655365, "step": 2588, "time_per_iteration": 3.6194047927856445 }, { "auxiliary_loss_clip": 0.01185723, "auxiliary_loss_mlp": 0.01029111, "balance_loss_clip": 1.05610681, "balance_loss_mlp": 1.01943672, "epoch": 0.3113088438646065, "flos": 17530728192000.0, "grad_norm": 2.4252852462791457, "language_loss": 0.86650389, "learning_rate": 3.225900276652777e-06, "loss": 0.88865227, "num_input_tokens_seen": 55672140, "step": 2589, "time_per_iteration": 2.686906337738037 }, { "auxiliary_loss_clip": 0.0117771, "auxiliary_loss_mlp": 0.0102862, "balance_loss_clip": 1.05460358, "balance_loss_mlp": 1.01999474, "epoch": 0.3114290867552456, "flos": 28365802882560.0, "grad_norm": 1.736884287791215, "language_loss": 0.76096851, "learning_rate": 3.2252847013672906e-06, "loss": 0.78303182, "num_input_tokens_seen": 55694800, "step": 2590, "time_per_iteration": 2.7721645832061768 }, { "auxiliary_loss_clip": 0.01150733, "auxiliary_loss_mlp": 0.01029742, "balance_loss_clip": 1.0504967, "balance_loss_mlp": 1.02073538, "epoch": 0.31154932964588467, "flos": 27379157126400.0, "grad_norm": 2.086670221297666, "language_loss": 0.76929253, "learning_rate": 3.224668940207089e-06, "loss": 0.79109728, "num_input_tokens_seen": 55713785, "step": 2591, "time_per_iteration": 2.733731269836426 }, { "auxiliary_loss_clip": 0.0113611, "auxiliary_loss_mlp": 0.01034048, "balance_loss_clip": 1.04721594, "balance_loss_mlp": 1.02505994, "epoch": 0.3116695725365238, "flos": 26541864120960.0, "grad_norm": 2.4930027815179763, "language_loss": 0.87789726, "learning_rate": 3.2240529932655828e-06, "loss": 0.89959884, "num_input_tokens_seen": 55733050, "step": 2592, "time_per_iteration": 3.8972930908203125 }, { "auxiliary_loss_clip": 0.01171551, "auxiliary_loss_mlp": 0.01035058, "balance_loss_clip": 1.05526781, "balance_loss_mlp": 1.02626622, "epoch": 0.3117898154271629, "flos": 21177600134400.0, "grad_norm": 3.39127244373679, "language_loss": 0.88870031, "learning_rate": 3.223436860636211e-06, "loss": 0.91076636, "num_input_tokens_seen": 55748685, "step": 2593, "time_per_iteration": 2.6810288429260254 }, { "auxiliary_loss_clip": 0.01204228, "auxiliary_loss_mlp": 0.01034554, "balance_loss_clip": 1.0593338, "balance_loss_mlp": 1.02570868, "epoch": 0.31191005831780194, "flos": 27272430840960.0, "grad_norm": 1.734949811409212, "language_loss": 0.74262643, "learning_rate": 3.2228205424124403e-06, "loss": 0.76501429, "num_input_tokens_seen": 55771840, "step": 2594, "time_per_iteration": 3.597313165664673 }, { "auxiliary_loss_clip": 0.01157482, "auxiliary_loss_mlp": 0.01029059, "balance_loss_clip": 1.04964185, "balance_loss_mlp": 1.02071476, "epoch": 0.31203030120844105, "flos": 12963501043200.0, "grad_norm": 2.747775808366881, "language_loss": 0.74428678, "learning_rate": 3.222204038687765e-06, "loss": 0.7661522, "num_input_tokens_seen": 55784975, "step": 2595, "time_per_iteration": 2.689415454864502 }, { "auxiliary_loss_clip": 0.01182848, "auxiliary_loss_mlp": 0.01030733, "balance_loss_clip": 1.05498409, "balance_loss_mlp": 1.02143502, "epoch": 0.31215054409908016, "flos": 27562014288000.0, "grad_norm": 1.612437533210896, "language_loss": 0.88068271, "learning_rate": 3.221587349555709e-06, "loss": 0.9028185, "num_input_tokens_seen": 55805235, "step": 2596, "time_per_iteration": 2.6912341117858887 }, { "auxiliary_loss_clip": 0.01177662, "auxiliary_loss_mlp": 0.01029995, "balance_loss_clip": 1.05553377, "balance_loss_mlp": 1.02142978, "epoch": 0.3122707869897192, "flos": 21506326427520.0, "grad_norm": 1.8405239547389505, "language_loss": 0.69638354, "learning_rate": 3.2209704751098236e-06, "loss": 0.7184602, "num_input_tokens_seen": 55824265, "step": 2597, "time_per_iteration": 2.8742594718933105 }, { "auxiliary_loss_clip": 0.01175576, "auxiliary_loss_mlp": 0.01033424, "balance_loss_clip": 1.05699563, "balance_loss_mlp": 1.02375627, "epoch": 0.31239102988035833, "flos": 15187017674880.0, "grad_norm": 2.4360313229463686, "language_loss": 0.82853889, "learning_rate": 3.2203534154436875e-06, "loss": 0.85062897, "num_input_tokens_seen": 55838620, "step": 2598, "time_per_iteration": 2.731823682785034 }, { "auxiliary_loss_clip": 0.01127833, "auxiliary_loss_mlp": 0.01034909, "balance_loss_clip": 1.04914618, "balance_loss_mlp": 1.02638578, "epoch": 0.31251127277099744, "flos": 22053712763520.0, "grad_norm": 2.3233275236733264, "language_loss": 0.75887567, "learning_rate": 3.2197361706509084e-06, "loss": 0.78050315, "num_input_tokens_seen": 55859375, "step": 2599, "time_per_iteration": 2.8066349029541016 }, { "auxiliary_loss_clip": 0.01206307, "auxiliary_loss_mlp": 0.01029394, "balance_loss_clip": 1.05828881, "balance_loss_mlp": 1.01966023, "epoch": 0.3126315156616365, "flos": 15193984913280.0, "grad_norm": 2.7058289841667293, "language_loss": 0.83328021, "learning_rate": 3.2191187408251228e-06, "loss": 0.85563719, "num_input_tokens_seen": 55876535, "step": 2600, "time_per_iteration": 2.6307055950164795 }, { "auxiliary_loss_clip": 0.01191587, "auxiliary_loss_mlp": 0.01034022, "balance_loss_clip": 1.05425096, "balance_loss_mlp": 1.0241034, "epoch": 0.3127517585522756, "flos": 18145338831360.0, "grad_norm": 2.0859993487783512, "language_loss": 0.79221201, "learning_rate": 3.218501126059993e-06, "loss": 0.81446809, "num_input_tokens_seen": 55891930, "step": 2601, "time_per_iteration": 2.6731486320495605 }, { "auxiliary_loss_clip": 0.01189033, "auxiliary_loss_mlp": 0.01035055, "balance_loss_clip": 1.05473423, "balance_loss_mlp": 1.02631116, "epoch": 0.31287200144291466, "flos": 21908633731200.0, "grad_norm": 2.794102552641243, "language_loss": 0.81344163, "learning_rate": 3.2178833264492116e-06, "loss": 0.83568257, "num_input_tokens_seen": 55910635, "step": 2602, "time_per_iteration": 2.6586289405822754 }, { "auxiliary_loss_clip": 0.01194569, "auxiliary_loss_mlp": 0.01031131, "balance_loss_clip": 1.05801857, "balance_loss_mlp": 1.02139711, "epoch": 0.31299224433355377, "flos": 29896997800320.0, "grad_norm": 1.8485177835533062, "language_loss": 0.76011968, "learning_rate": 3.217265342086498e-06, "loss": 0.78237665, "num_input_tokens_seen": 55931125, "step": 2603, "time_per_iteration": 2.7473864555358887 }, { "auxiliary_loss_clip": 0.0116726, "auxiliary_loss_mlp": 0.00765199, "balance_loss_clip": 1.05670714, "balance_loss_mlp": 1.00078487, "epoch": 0.3131124872241929, "flos": 11655886331520.0, "grad_norm": 3.1814177350985555, "language_loss": 0.73139071, "learning_rate": 3.216647173065599e-06, "loss": 0.75071526, "num_input_tokens_seen": 55946590, "step": 2604, "time_per_iteration": 2.744267225265503 }, { "auxiliary_loss_clip": 0.01173404, "auxiliary_loss_mlp": 0.01028487, "balance_loss_clip": 1.05746365, "balance_loss_mlp": 1.01982093, "epoch": 0.31323273011483194, "flos": 49848785470080.0, "grad_norm": 2.421024362308577, "language_loss": 0.73993158, "learning_rate": 3.216028819480292e-06, "loss": 0.76195049, "num_input_tokens_seen": 55967930, "step": 2605, "time_per_iteration": 2.9302990436553955 }, { "auxiliary_loss_clip": 0.01162256, "auxiliary_loss_mlp": 0.01031832, "balance_loss_clip": 1.05264711, "balance_loss_mlp": 1.02152085, "epoch": 0.31335297300547105, "flos": 22601278667520.0, "grad_norm": 4.519481272155611, "language_loss": 0.75618207, "learning_rate": 3.2154102814243793e-06, "loss": 0.7781229, "num_input_tokens_seen": 55987070, "step": 2606, "time_per_iteration": 2.7568044662475586 }, { "auxiliary_loss_clip": 0.01167848, "auxiliary_loss_mlp": 0.0103244, "balance_loss_clip": 1.05643177, "balance_loss_mlp": 1.02314734, "epoch": 0.31347321589611016, "flos": 34710858708480.0, "grad_norm": 2.102517068844022, "language_loss": 0.67177868, "learning_rate": 3.2147915589916937e-06, "loss": 0.6937815, "num_input_tokens_seen": 56008630, "step": 2607, "time_per_iteration": 2.8514387607574463 }, { "auxiliary_loss_clip": 0.01165002, "auxiliary_loss_mlp": 0.01038883, "balance_loss_clip": 1.05089855, "balance_loss_mlp": 1.02938247, "epoch": 0.3135934587867492, "flos": 19755789108480.0, "grad_norm": 2.778112753052168, "language_loss": 0.8272658, "learning_rate": 3.2141726522760938e-06, "loss": 0.84930468, "num_input_tokens_seen": 56026690, "step": 2608, "time_per_iteration": 2.725917339324951 }, { "auxiliary_loss_clip": 0.01100471, "auxiliary_loss_mlp": 0.01005125, "balance_loss_clip": 1.03345561, "balance_loss_mlp": 1.00333643, "epoch": 0.3137137016773883, "flos": 65815535583360.0, "grad_norm": 0.7023265582996232, "language_loss": 0.52581531, "learning_rate": 3.213553561371469e-06, "loss": 0.54687124, "num_input_tokens_seen": 56090425, "step": 2609, "time_per_iteration": 3.306851863861084 }, { "auxiliary_loss_clip": 0.01148221, "auxiliary_loss_mlp": 0.0102752, "balance_loss_clip": 1.05588531, "balance_loss_mlp": 1.01841831, "epoch": 0.31383394456802743, "flos": 16252739222400.0, "grad_norm": 2.2322890902633183, "language_loss": 0.96001619, "learning_rate": 3.212934286371733e-06, "loss": 0.98177356, "num_input_tokens_seen": 56107135, "step": 2610, "time_per_iteration": 2.734653949737549 }, { "auxiliary_loss_clip": 0.01192152, "auxiliary_loss_mlp": 0.01039808, "balance_loss_clip": 1.06242371, "balance_loss_mlp": 1.03041482, "epoch": 0.3139541874586665, "flos": 38795517613440.0, "grad_norm": 13.816818044994886, "language_loss": 0.83839154, "learning_rate": 3.2123148273708304e-06, "loss": 0.8607111, "num_input_tokens_seen": 56127325, "step": 2611, "time_per_iteration": 3.7003185749053955 }, { "auxiliary_loss_clip": 0.01202129, "auxiliary_loss_mlp": 0.01031255, "balance_loss_clip": 1.05816519, "balance_loss_mlp": 1.02286267, "epoch": 0.3140744303493056, "flos": 25046328430080.0, "grad_norm": 2.025157068147185, "language_loss": 0.7687043, "learning_rate": 3.211695184462733e-06, "loss": 0.79103816, "num_input_tokens_seen": 56148500, "step": 2612, "time_per_iteration": 2.6558837890625 }, { "auxiliary_loss_clip": 0.01074829, "auxiliary_loss_mlp": 0.01001964, "balance_loss_clip": 1.02780521, "balance_loss_mlp": 1.00015187, "epoch": 0.3141946732399447, "flos": 72504254782080.0, "grad_norm": 0.8932223328667137, "language_loss": 0.60517657, "learning_rate": 3.2110753577414383e-06, "loss": 0.62594444, "num_input_tokens_seen": 56210080, "step": 2613, "time_per_iteration": 3.2563836574554443 }, { "auxiliary_loss_clip": 0.01177826, "auxiliary_loss_mlp": 0.01035769, "balance_loss_clip": 1.05482149, "balance_loss_mlp": 1.02657771, "epoch": 0.31431491613058377, "flos": 19239788280960.0, "grad_norm": 3.3333816410693298, "language_loss": 0.78780884, "learning_rate": 3.2104553473009757e-06, "loss": 0.80994475, "num_input_tokens_seen": 56228200, "step": 2614, "time_per_iteration": 3.575906991958618 }, { "auxiliary_loss_clip": 0.01139851, "auxiliary_loss_mlp": 0.01028918, "balance_loss_clip": 1.05139959, "balance_loss_mlp": 1.01966715, "epoch": 0.3144351590212229, "flos": 36210596290560.0, "grad_norm": 1.8917408290515596, "language_loss": 0.67458177, "learning_rate": 3.209835153235399e-06, "loss": 0.69626939, "num_input_tokens_seen": 56249755, "step": 2615, "time_per_iteration": 2.828031539916992 }, { "auxiliary_loss_clip": 0.01149615, "auxiliary_loss_mlp": 0.01034519, "balance_loss_clip": 1.0514437, "balance_loss_mlp": 1.02536988, "epoch": 0.314555401911862, "flos": 18551740285440.0, "grad_norm": 2.8531178427800357, "language_loss": 0.677369, "learning_rate": 3.2092147756387916e-06, "loss": 0.69921035, "num_input_tokens_seen": 56270080, "step": 2616, "time_per_iteration": 2.7846598625183105 }, { "auxiliary_loss_clip": 0.01164865, "auxiliary_loss_mlp": 0.0103692, "balance_loss_clip": 1.05082631, "balance_loss_mlp": 1.02774715, "epoch": 0.31467564480250104, "flos": 16362877299840.0, "grad_norm": 2.410927905975255, "language_loss": 0.83783311, "learning_rate": 3.208594214605264e-06, "loss": 0.85985094, "num_input_tokens_seen": 56288625, "step": 2617, "time_per_iteration": 2.683873176574707 }, { "auxiliary_loss_clip": 0.01164584, "auxiliary_loss_mlp": 0.01037123, "balance_loss_clip": 1.05425215, "balance_loss_mlp": 1.02831352, "epoch": 0.31479588769314015, "flos": 21652375127040.0, "grad_norm": 3.0099375339724115, "language_loss": 0.77368975, "learning_rate": 3.2079734702289553e-06, "loss": 0.79570675, "num_input_tokens_seen": 56307520, "step": 2618, "time_per_iteration": 2.646979570388794 }, { "auxiliary_loss_clip": 0.01091526, "auxiliary_loss_mlp": 0.00755501, "balance_loss_clip": 1.02663028, "balance_loss_mlp": 1.0001235, "epoch": 0.3149161305837792, "flos": 66051072040320.0, "grad_norm": 0.8264986375275456, "language_loss": 0.60435736, "learning_rate": 3.207352542604031e-06, "loss": 0.62282765, "num_input_tokens_seen": 56369855, "step": 2619, "time_per_iteration": 5.120925426483154 }, { "auxiliary_loss_clip": 0.01148998, "auxiliary_loss_mlp": 0.01030642, "balance_loss_clip": 1.05236936, "balance_loss_mlp": 1.02213001, "epoch": 0.3150363734744183, "flos": 28987201192320.0, "grad_norm": 1.6767756855932965, "language_loss": 0.78367007, "learning_rate": 3.2067314318246864e-06, "loss": 0.80546647, "num_input_tokens_seen": 56390570, "step": 2620, "time_per_iteration": 2.766129493713379 }, { "auxiliary_loss_clip": 0.01161504, "auxiliary_loss_mlp": 0.01034291, "balance_loss_clip": 1.05668855, "balance_loss_mlp": 1.02538013, "epoch": 0.31515661636505743, "flos": 27636600879360.0, "grad_norm": 2.3608924098108264, "language_loss": 0.78109753, "learning_rate": 3.206110137985143e-06, "loss": 0.80305552, "num_input_tokens_seen": 56410775, "step": 2621, "time_per_iteration": 2.7752673625946045 }, { "auxiliary_loss_clip": 0.01147499, "auxiliary_loss_mlp": 0.0103533, "balance_loss_clip": 1.05190694, "balance_loss_mlp": 1.0265677, "epoch": 0.3152768592556965, "flos": 24605632465920.0, "grad_norm": 1.8410629335559188, "language_loss": 0.92106497, "learning_rate": 3.2054886611796505e-06, "loss": 0.94289333, "num_input_tokens_seen": 56429770, "step": 2622, "time_per_iteration": 2.77343487739563 }, { "auxiliary_loss_clip": 0.0111333, "auxiliary_loss_mlp": 0.01002795, "balance_loss_clip": 1.03328633, "balance_loss_mlp": 1.00105429, "epoch": 0.3153971021463356, "flos": 68476908026880.0, "grad_norm": 0.9415554576475984, "language_loss": 0.63508356, "learning_rate": 3.204867001502487e-06, "loss": 0.65624481, "num_input_tokens_seen": 56488425, "step": 2623, "time_per_iteration": 3.2479238510131836 }, { "auxiliary_loss_clip": 0.01205936, "auxiliary_loss_mlp": 0.01030265, "balance_loss_clip": 1.0611248, "balance_loss_mlp": 1.02133584, "epoch": 0.3155173450369747, "flos": 25593714766080.0, "grad_norm": 2.7221207354673918, "language_loss": 0.81095088, "learning_rate": 3.2042451590479567e-06, "loss": 0.83331287, "num_input_tokens_seen": 56508940, "step": 2624, "time_per_iteration": 2.768083333969116 }, { "auxiliary_loss_clip": 0.01201034, "auxiliary_loss_mlp": 0.0102741, "balance_loss_clip": 1.06065083, "balance_loss_mlp": 1.01891613, "epoch": 0.31563758792761376, "flos": 24309333175680.0, "grad_norm": 1.8028327724919309, "language_loss": 0.86670387, "learning_rate": 3.203623133910394e-06, "loss": 0.88898838, "num_input_tokens_seen": 56527245, "step": 2625, "time_per_iteration": 2.678718328475952 }, { "auxiliary_loss_clip": 0.01131388, "auxiliary_loss_mlp": 0.01031093, "balance_loss_clip": 1.05080783, "balance_loss_mlp": 1.02159786, "epoch": 0.31575783081825287, "flos": 31903865550720.0, "grad_norm": 2.58081583704734, "language_loss": 0.77533019, "learning_rate": 3.203000926184158e-06, "loss": 0.79695499, "num_input_tokens_seen": 56546170, "step": 2626, "time_per_iteration": 2.862945079803467 }, { "auxiliary_loss_clip": 0.01202313, "auxiliary_loss_mlp": 0.01028562, "balance_loss_clip": 1.05982423, "balance_loss_mlp": 1.0199728, "epoch": 0.315878073708892, "flos": 30810960385920.0, "grad_norm": 1.8393919812365884, "language_loss": 0.77714896, "learning_rate": 3.202378535963639e-06, "loss": 0.79945773, "num_input_tokens_seen": 56567085, "step": 2627, "time_per_iteration": 2.776393175125122 }, { "auxiliary_loss_clip": 0.01165065, "auxiliary_loss_mlp": 0.00764602, "balance_loss_clip": 1.05245519, "balance_loss_mlp": 1.00065517, "epoch": 0.31599831659953104, "flos": 22200264253440.0, "grad_norm": 1.6259465237682558, "language_loss": 0.84043336, "learning_rate": 3.2017559633432516e-06, "loss": 0.85973001, "num_input_tokens_seen": 56586715, "step": 2628, "time_per_iteration": 2.685262680053711 }, { "auxiliary_loss_clip": 0.01181081, "auxiliary_loss_mlp": 0.01033742, "balance_loss_clip": 1.05447388, "balance_loss_mlp": 1.02475965, "epoch": 0.31611855949017015, "flos": 25593463370880.0, "grad_norm": 2.0238212245823064, "language_loss": 0.66577661, "learning_rate": 3.2011332084174398e-06, "loss": 0.68792486, "num_input_tokens_seen": 56607585, "step": 2629, "time_per_iteration": 2.7650938034057617 }, { "auxiliary_loss_clip": 0.01186739, "auxiliary_loss_mlp": 0.010297, "balance_loss_clip": 1.05790079, "balance_loss_mlp": 1.02034199, "epoch": 0.31623880238080926, "flos": 20594087694720.0, "grad_norm": 1.613461353861769, "language_loss": 0.89079487, "learning_rate": 3.2005102712806756e-06, "loss": 0.91295922, "num_input_tokens_seen": 56626415, "step": 2630, "time_per_iteration": 2.7306313514709473 }, { "auxiliary_loss_clip": 0.01193524, "auxiliary_loss_mlp": 0.0103111, "balance_loss_clip": 1.05725121, "balance_loss_mlp": 1.02213991, "epoch": 0.3163590452714483, "flos": 12784917600000.0, "grad_norm": 2.2683170909575585, "language_loss": 0.72567487, "learning_rate": 3.1998871520274575e-06, "loss": 0.74792123, "num_input_tokens_seen": 56641750, "step": 2631, "time_per_iteration": 2.715928792953491 }, { "auxiliary_loss_clip": 0.01172694, "auxiliary_loss_mlp": 0.01029167, "balance_loss_clip": 1.0519619, "balance_loss_mlp": 1.0199821, "epoch": 0.3164792881620874, "flos": 23041292273280.0, "grad_norm": 2.001691742140243, "language_loss": 0.84929496, "learning_rate": 3.199263850752312e-06, "loss": 0.87131357, "num_input_tokens_seen": 56662585, "step": 2632, "time_per_iteration": 2.6578855514526367 }, { "auxiliary_loss_clip": 0.01193622, "auxiliary_loss_mlp": 0.0102657, "balance_loss_clip": 1.05903339, "balance_loss_mlp": 1.01808214, "epoch": 0.31659953105272653, "flos": 18296271780480.0, "grad_norm": 2.433490123139996, "language_loss": 0.85471421, "learning_rate": 3.198640367549795e-06, "loss": 0.87691617, "num_input_tokens_seen": 56681480, "step": 2633, "time_per_iteration": 2.713207244873047 }, { "auxiliary_loss_clip": 0.01190114, "auxiliary_loss_mlp": 0.00764106, "balance_loss_clip": 1.05664539, "balance_loss_mlp": 1.00042272, "epoch": 0.3167197739433656, "flos": 25703421880320.0, "grad_norm": 1.8657111697015272, "language_loss": 0.86002499, "learning_rate": 3.198016702514487e-06, "loss": 0.87956715, "num_input_tokens_seen": 56701760, "step": 2634, "time_per_iteration": 2.7980594635009766 }, { "auxiliary_loss_clip": 0.01200557, "auxiliary_loss_mlp": 0.010322, "balance_loss_clip": 1.05760634, "balance_loss_mlp": 1.02331328, "epoch": 0.3168400168340047, "flos": 23546016230400.0, "grad_norm": 1.67890672035078, "language_loss": 0.84526592, "learning_rate": 3.1973928557409972e-06, "loss": 0.86759347, "num_input_tokens_seen": 56719800, "step": 2635, "time_per_iteration": 2.7512800693511963 }, { "auxiliary_loss_clip": 0.01199868, "auxiliary_loss_mlp": 0.01037612, "balance_loss_clip": 1.05910039, "balance_loss_mlp": 1.02904677, "epoch": 0.31696025972464376, "flos": 28366449327360.0, "grad_norm": 2.1437159437840925, "language_loss": 0.71276867, "learning_rate": 3.1967688273239636e-06, "loss": 0.73514354, "num_input_tokens_seen": 56739605, "step": 2636, "time_per_iteration": 2.6691768169403076 }, { "auxiliary_loss_clip": 0.01159727, "auxiliary_loss_mlp": 0.01034068, "balance_loss_clip": 1.05498743, "balance_loss_mlp": 1.02459121, "epoch": 0.31708050261528287, "flos": 16399111144320.0, "grad_norm": 1.8378356169694257, "language_loss": 0.82050645, "learning_rate": 3.1961446173580503e-06, "loss": 0.84244442, "num_input_tokens_seen": 56756545, "step": 2637, "time_per_iteration": 3.593311309814453 }, { "auxiliary_loss_clip": 0.0117176, "auxiliary_loss_mlp": 0.01036931, "balance_loss_clip": 1.05584383, "balance_loss_mlp": 1.0280385, "epoch": 0.317200745505922, "flos": 26212347728640.0, "grad_norm": 2.062776611139064, "language_loss": 0.77575248, "learning_rate": 3.1955202259379502e-06, "loss": 0.79783946, "num_input_tokens_seen": 56778275, "step": 2638, "time_per_iteration": 2.723254919052124 }, { "auxiliary_loss_clip": 0.01182808, "auxiliary_loss_mlp": 0.01034042, "balance_loss_clip": 1.05293953, "balance_loss_mlp": 1.02448726, "epoch": 0.31732098839656103, "flos": 31350876693120.0, "grad_norm": 1.891802392218053, "language_loss": 0.83003306, "learning_rate": 3.194895653158381e-06, "loss": 0.85220158, "num_input_tokens_seen": 56797215, "step": 2639, "time_per_iteration": 2.6900076866149902 }, { "auxiliary_loss_clip": 0.0111398, "auxiliary_loss_mlp": 0.01002136, "balance_loss_clip": 1.03354549, "balance_loss_mlp": 1.00027585, "epoch": 0.31744123128720014, "flos": 58989024835200.0, "grad_norm": 0.8622434202157263, "language_loss": 0.55475438, "learning_rate": 3.194270899114093e-06, "loss": 0.57591552, "num_input_tokens_seen": 56863010, "step": 2640, "time_per_iteration": 3.2613706588745117 }, { "auxiliary_loss_clip": 0.011959, "auxiliary_loss_mlp": 0.01031353, "balance_loss_clip": 1.0584954, "balance_loss_mlp": 1.02252555, "epoch": 0.31756147417783925, "flos": 17417573372160.0, "grad_norm": 1.797841984987475, "language_loss": 0.82110029, "learning_rate": 3.193645963899858e-06, "loss": 0.84337282, "num_input_tokens_seen": 56880625, "step": 2641, "time_per_iteration": 3.5762710571289062 }, { "auxiliary_loss_clip": 0.01168057, "auxiliary_loss_mlp": 0.01034129, "balance_loss_clip": 1.05321193, "balance_loss_mlp": 1.02446091, "epoch": 0.3176817170684783, "flos": 25481673267840.0, "grad_norm": 1.827788289540545, "language_loss": 0.84107763, "learning_rate": 3.193020847610479e-06, "loss": 0.86309946, "num_input_tokens_seen": 56900945, "step": 2642, "time_per_iteration": 2.7009689807891846 }, { "auxiliary_loss_clip": 0.01169682, "auxiliary_loss_mlp": 0.01035692, "balance_loss_clip": 1.05655122, "balance_loss_mlp": 1.02607799, "epoch": 0.3178019599591174, "flos": 24972603765120.0, "grad_norm": 2.245734990135557, "language_loss": 0.71723342, "learning_rate": 3.192395550340787e-06, "loss": 0.73928714, "num_input_tokens_seen": 56918895, "step": 2643, "time_per_iteration": 2.790253162384033 }, { "auxiliary_loss_clip": 0.01187082, "auxiliary_loss_mlp": 0.01036156, "balance_loss_clip": 1.05635858, "balance_loss_mlp": 1.02673841, "epoch": 0.31792220284975653, "flos": 12422220019200.0, "grad_norm": 3.155109229749805, "language_loss": 0.76569188, "learning_rate": 3.191770072185638e-06, "loss": 0.78792423, "num_input_tokens_seen": 56935890, "step": 2644, "time_per_iteration": 3.598496675491333 }, { "auxiliary_loss_clip": 0.01188082, "auxiliary_loss_mlp": 0.01036123, "balance_loss_clip": 1.05918217, "balance_loss_mlp": 1.02711725, "epoch": 0.3180424457403956, "flos": 15485759089920.0, "grad_norm": 2.560303057579813, "language_loss": 0.72711205, "learning_rate": 3.191144413239916e-06, "loss": 0.74935412, "num_input_tokens_seen": 56952460, "step": 2645, "time_per_iteration": 3.5060436725616455 }, { "auxiliary_loss_clip": 0.0117469, "auxiliary_loss_mlp": 0.01024358, "balance_loss_clip": 1.05627441, "balance_loss_mlp": 1.01501167, "epoch": 0.3181626886310347, "flos": 26174964648960.0, "grad_norm": 2.062135016092122, "language_loss": 0.88290763, "learning_rate": 3.190518573598534e-06, "loss": 0.90489805, "num_input_tokens_seen": 56969065, "step": 2646, "time_per_iteration": 2.693160057067871 }, { "auxiliary_loss_clip": 0.01164797, "auxiliary_loss_mlp": 0.01034565, "balance_loss_clip": 1.05076027, "balance_loss_mlp": 1.02532065, "epoch": 0.3182829315216738, "flos": 25483109811840.0, "grad_norm": 1.5098846619109556, "language_loss": 0.77240378, "learning_rate": 3.1898925533564308e-06, "loss": 0.79439741, "num_input_tokens_seen": 56990535, "step": 2647, "time_per_iteration": 2.6794095039367676 }, { "auxiliary_loss_clip": 0.01150265, "auxiliary_loss_mlp": 0.01038679, "balance_loss_clip": 1.05268478, "balance_loss_mlp": 1.02982235, "epoch": 0.31840317441231286, "flos": 18113701927680.0, "grad_norm": 2.040530845000522, "language_loss": 0.64331335, "learning_rate": 3.1892663526085733e-06, "loss": 0.6652028, "num_input_tokens_seen": 57008910, "step": 2648, "time_per_iteration": 2.716061592102051 }, { "auxiliary_loss_clip": 0.01111013, "auxiliary_loss_mlp": 0.01000461, "balance_loss_clip": 1.03096509, "balance_loss_mlp": 0.99873257, "epoch": 0.31852341730295197, "flos": 64741948957440.0, "grad_norm": 0.7554043591838755, "language_loss": 0.56937969, "learning_rate": 3.188639971449956e-06, "loss": 0.59049439, "num_input_tokens_seen": 57074960, "step": 2649, "time_per_iteration": 3.1449673175811768 }, { "auxiliary_loss_clip": 0.01206892, "auxiliary_loss_mlp": 0.01027615, "balance_loss_clip": 1.06129241, "balance_loss_mlp": 1.01834083, "epoch": 0.318643660193591, "flos": 20668135582080.0, "grad_norm": 2.160410633016965, "language_loss": 0.7220965, "learning_rate": 3.1880134099756e-06, "loss": 0.74444163, "num_input_tokens_seen": 57094595, "step": 2650, "time_per_iteration": 2.586362838745117 }, { "auxiliary_loss_clip": 0.01186174, "auxiliary_loss_mlp": 0.0103224, "balance_loss_clip": 1.05524802, "balance_loss_mlp": 1.0226382, "epoch": 0.31876390308423014, "flos": 26943345411840.0, "grad_norm": 2.059900866819686, "language_loss": 0.69800842, "learning_rate": 3.1873866682805535e-06, "loss": 0.72019261, "num_input_tokens_seen": 57115290, "step": 2651, "time_per_iteration": 2.6323041915893555 }, { "auxiliary_loss_clip": 0.01179122, "auxiliary_loss_mlp": 0.01027151, "balance_loss_clip": 1.05728364, "balance_loss_mlp": 1.01874721, "epoch": 0.31888414597486925, "flos": 18041916597120.0, "grad_norm": 1.8292060605653502, "language_loss": 0.88797295, "learning_rate": 3.186759746459894e-06, "loss": 0.91003573, "num_input_tokens_seen": 57134400, "step": 2652, "time_per_iteration": 2.58427095413208 }, { "auxiliary_loss_clip": 0.01174249, "auxiliary_loss_mlp": 0.01029201, "balance_loss_clip": 1.05728281, "balance_loss_mlp": 1.02024209, "epoch": 0.3190043888655083, "flos": 25149319701120.0, "grad_norm": 1.9541746741175319, "language_loss": 0.79754037, "learning_rate": 3.1861326446087246e-06, "loss": 0.81957483, "num_input_tokens_seen": 57153140, "step": 2653, "time_per_iteration": 2.6651721000671387 }, { "auxiliary_loss_clip": 0.01189141, "auxiliary_loss_mlp": 0.01030091, "balance_loss_clip": 1.05745327, "balance_loss_mlp": 1.02125728, "epoch": 0.3191246317561474, "flos": 22053892331520.0, "grad_norm": 2.147429252579355, "language_loss": 0.71481299, "learning_rate": 3.1855053628221763e-06, "loss": 0.73700529, "num_input_tokens_seen": 57172395, "step": 2654, "time_per_iteration": 2.5365352630615234 }, { "auxiliary_loss_clip": 0.01148972, "auxiliary_loss_mlp": 0.01035172, "balance_loss_clip": 1.04869509, "balance_loss_mlp": 1.02509904, "epoch": 0.3192448746467865, "flos": 14901815687040.0, "grad_norm": 2.717073045207291, "language_loss": 0.89930892, "learning_rate": 3.184877901195407e-06, "loss": 0.92115033, "num_input_tokens_seen": 57189090, "step": 2655, "time_per_iteration": 2.6384241580963135 }, { "auxiliary_loss_clip": 0.0108329, "auxiliary_loss_mlp": 0.01002807, "balance_loss_clip": 1.0296762, "balance_loss_mlp": 1.00073266, "epoch": 0.3193651175374256, "flos": 67234832657280.0, "grad_norm": 0.7932010337600385, "language_loss": 0.62812161, "learning_rate": 3.184250259823602e-06, "loss": 0.64898258, "num_input_tokens_seen": 57251620, "step": 2656, "time_per_iteration": 3.2411487102508545 }, { "auxiliary_loss_clip": 0.01157957, "auxiliary_loss_mlp": 0.01039284, "balance_loss_clip": 1.05211139, "balance_loss_mlp": 1.02968168, "epoch": 0.3194853604280647, "flos": 12233077977600.0, "grad_norm": 2.2777250357320233, "language_loss": 0.8170957, "learning_rate": 3.183622438801974e-06, "loss": 0.83906806, "num_input_tokens_seen": 57266910, "step": 2657, "time_per_iteration": 2.7138192653656006 }, { "auxiliary_loss_clip": 0.01203314, "auxiliary_loss_mlp": 0.01028863, "balance_loss_clip": 1.05961597, "balance_loss_mlp": 1.02042294, "epoch": 0.3196056033187038, "flos": 14939917038720.0, "grad_norm": 2.0808455768533207, "language_loss": 0.75539839, "learning_rate": 3.1829944382257637e-06, "loss": 0.77772015, "num_input_tokens_seen": 57285040, "step": 2658, "time_per_iteration": 2.5761444568634033 }, { "auxiliary_loss_clip": 0.01184822, "auxiliary_loss_mlp": 0.01029782, "balance_loss_clip": 1.05807185, "balance_loss_mlp": 1.02127647, "epoch": 0.31972584620934286, "flos": 23768878164480.0, "grad_norm": 2.5102832990760833, "language_loss": 0.8172366, "learning_rate": 3.1823662581902373e-06, "loss": 0.83938265, "num_input_tokens_seen": 57302725, "step": 2659, "time_per_iteration": 2.646191120147705 }, { "auxiliary_loss_clip": 0.01142415, "auxiliary_loss_mlp": 0.01037386, "balance_loss_clip": 1.04534554, "balance_loss_mlp": 1.02790844, "epoch": 0.31984608909998197, "flos": 21251540280960.0, "grad_norm": 2.315722707100912, "language_loss": 0.7472682, "learning_rate": 3.1817378987906896e-06, "loss": 0.76906621, "num_input_tokens_seen": 57322230, "step": 2660, "time_per_iteration": 2.7173209190368652 }, { "auxiliary_loss_clip": 0.01139208, "auxiliary_loss_mlp": 0.01041791, "balance_loss_clip": 1.05236459, "balance_loss_mlp": 1.03256464, "epoch": 0.3199663319906211, "flos": 18296235866880.0, "grad_norm": 1.856731989896956, "language_loss": 0.80013651, "learning_rate": 3.181109360122442e-06, "loss": 0.8219465, "num_input_tokens_seen": 57339820, "step": 2661, "time_per_iteration": 2.7141568660736084 }, { "auxiliary_loss_clip": 0.0115575, "auxiliary_loss_mlp": 0.01028795, "balance_loss_clip": 1.05226803, "balance_loss_mlp": 1.01925194, "epoch": 0.32008657488126013, "flos": 18733627779840.0, "grad_norm": 12.717548356599197, "language_loss": 0.7830075, "learning_rate": 3.1804806422808445e-06, "loss": 0.80485296, "num_input_tokens_seen": 57356955, "step": 2662, "time_per_iteration": 2.6475565433502197 }, { "auxiliary_loss_clip": 0.01160835, "auxiliary_loss_mlp": 0.0102929, "balance_loss_clip": 1.05161011, "balance_loss_mlp": 1.0200932, "epoch": 0.32020681777189924, "flos": 20595344670720.0, "grad_norm": 2.019564904205496, "language_loss": 0.72773623, "learning_rate": 3.1798517453612714e-06, "loss": 0.74963742, "num_input_tokens_seen": 57376760, "step": 2663, "time_per_iteration": 3.678034782409668 }, { "auxiliary_loss_clip": 0.01188805, "auxiliary_loss_mlp": 0.01032099, "balance_loss_clip": 1.06200576, "balance_loss_mlp": 1.02325964, "epoch": 0.32032706066253835, "flos": 35261692750080.0, "grad_norm": 1.8536858884831489, "language_loss": 0.75676847, "learning_rate": 3.1792226694591265e-06, "loss": 0.77897745, "num_input_tokens_seen": 57398145, "step": 2664, "time_per_iteration": 2.797323703765869 }, { "auxiliary_loss_clip": 0.01159655, "auxiliary_loss_mlp": 0.01035582, "balance_loss_clip": 1.05540061, "balance_loss_mlp": 1.02728486, "epoch": 0.3204473035531774, "flos": 15304230731520.0, "grad_norm": 2.1999846292850664, "language_loss": 0.80655587, "learning_rate": 3.178593414669841e-06, "loss": 0.82850826, "num_input_tokens_seen": 57416730, "step": 2665, "time_per_iteration": 2.6691126823425293 }, { "auxiliary_loss_clip": 0.01192261, "auxiliary_loss_mlp": 0.01033548, "balance_loss_clip": 1.05801356, "balance_loss_mlp": 1.02371943, "epoch": 0.3205675464438165, "flos": 24462564595200.0, "grad_norm": 3.6194245220349774, "language_loss": 0.70929843, "learning_rate": 3.1779639810888707e-06, "loss": 0.73155659, "num_input_tokens_seen": 57436325, "step": 2666, "time_per_iteration": 2.6896119117736816 }, { "auxiliary_loss_clip": 0.0118585, "auxiliary_loss_mlp": 0.01035621, "balance_loss_clip": 1.05905342, "balance_loss_mlp": 1.0267514, "epoch": 0.3206877893344556, "flos": 22456235548800.0, "grad_norm": 4.444255100585475, "language_loss": 0.75947434, "learning_rate": 3.1773343688117013e-06, "loss": 0.78168905, "num_input_tokens_seen": 57457235, "step": 2667, "time_per_iteration": 3.671142816543579 }, { "auxiliary_loss_clip": 0.01175563, "auxiliary_loss_mlp": 0.00764016, "balance_loss_clip": 1.05527055, "balance_loss_mlp": 1.00067687, "epoch": 0.3208080322250947, "flos": 20412236113920.0, "grad_norm": 2.4619134730490573, "language_loss": 0.84092939, "learning_rate": 3.1767045779338445e-06, "loss": 0.86032522, "num_input_tokens_seen": 57474895, "step": 2668, "time_per_iteration": 2.654910087585449 }, { "auxiliary_loss_clip": 0.01185924, "auxiliary_loss_mlp": 0.01032518, "balance_loss_clip": 1.05610991, "balance_loss_mlp": 1.02367282, "epoch": 0.3209282751157338, "flos": 21762118154880.0, "grad_norm": 2.6101778224097516, "language_loss": 0.92002618, "learning_rate": 3.176074608550839e-06, "loss": 0.94221056, "num_input_tokens_seen": 57490715, "step": 2669, "time_per_iteration": 2.6926400661468506 }, { "auxiliary_loss_clip": 0.01127689, "auxiliary_loss_mlp": 0.01028566, "balance_loss_clip": 1.04982281, "balance_loss_mlp": 1.01944637, "epoch": 0.32104851800637285, "flos": 22055041566720.0, "grad_norm": 4.526927705808214, "language_loss": 0.82451153, "learning_rate": 3.17544446075825e-06, "loss": 0.8460741, "num_input_tokens_seen": 57509880, "step": 2670, "time_per_iteration": 3.7198309898376465 }, { "auxiliary_loss_clip": 0.01178198, "auxiliary_loss_mlp": 0.01040229, "balance_loss_clip": 1.05589473, "balance_loss_mlp": 1.03189659, "epoch": 0.32116876089701196, "flos": 37012301896320.0, "grad_norm": 1.6072743879347087, "language_loss": 0.71155024, "learning_rate": 3.174814134651671e-06, "loss": 0.73373449, "num_input_tokens_seen": 57532430, "step": 2671, "time_per_iteration": 3.7673261165618896 }, { "auxiliary_loss_clip": 0.01200183, "auxiliary_loss_mlp": 0.01032068, "balance_loss_clip": 1.05831993, "balance_loss_mlp": 1.02352631, "epoch": 0.3212890037876511, "flos": 21979233912960.0, "grad_norm": 2.00341738107503, "language_loss": 0.80526698, "learning_rate": 3.1741836303267215e-06, "loss": 0.82758951, "num_input_tokens_seen": 57551965, "step": 2672, "time_per_iteration": 2.668379068374634 }, { "auxiliary_loss_clip": 0.01200784, "auxiliary_loss_mlp": 0.01026278, "balance_loss_clip": 1.05927181, "balance_loss_mlp": 1.01821971, "epoch": 0.32140924667829013, "flos": 10342345875840.0, "grad_norm": 2.2027818663589813, "language_loss": 0.75143903, "learning_rate": 3.1735529478790496e-06, "loss": 0.77370965, "num_input_tokens_seen": 57569955, "step": 2673, "time_per_iteration": 2.5593314170837402 }, { "auxiliary_loss_clip": 0.01186477, "auxiliary_loss_mlp": 0.01034853, "balance_loss_clip": 1.05760396, "balance_loss_mlp": 1.02564371, "epoch": 0.32152948956892924, "flos": 50798910072960.0, "grad_norm": 2.6038315450773735, "language_loss": 0.79391623, "learning_rate": 3.172922087404328e-06, "loss": 0.81612945, "num_input_tokens_seen": 57592215, "step": 2674, "time_per_iteration": 3.006711959838867 }, { "auxiliary_loss_clip": 0.01113378, "auxiliary_loss_mlp": 0.01001224, "balance_loss_clip": 1.03344941, "balance_loss_mlp": 0.9994117, "epoch": 0.32164973245956835, "flos": 63863250549120.0, "grad_norm": 0.7787441855430769, "language_loss": 0.55247223, "learning_rate": 3.1722910489982586e-06, "loss": 0.57361817, "num_input_tokens_seen": 57652575, "step": 2675, "time_per_iteration": 3.2773170471191406 }, { "auxiliary_loss_clip": 0.01168376, "auxiliary_loss_mlp": 0.01027415, "balance_loss_clip": 1.0542587, "balance_loss_mlp": 1.01905847, "epoch": 0.3217699753502074, "flos": 23513948363520.0, "grad_norm": 1.4558518399401064, "language_loss": 0.80182612, "learning_rate": 3.1716598327565694e-06, "loss": 0.82378399, "num_input_tokens_seen": 57672215, "step": 2676, "time_per_iteration": 2.6673173904418945 }, { "auxiliary_loss_clip": 0.01200121, "auxiliary_loss_mlp": 0.01028462, "balance_loss_clip": 1.05835462, "balance_loss_mlp": 1.01995599, "epoch": 0.3218902182408465, "flos": 19062533640960.0, "grad_norm": 1.818222936595592, "language_loss": 0.84403127, "learning_rate": 3.171028438775015e-06, "loss": 0.86631709, "num_input_tokens_seen": 57691410, "step": 2677, "time_per_iteration": 2.6804986000061035 }, { "auxiliary_loss_clip": 0.0119887, "auxiliary_loss_mlp": 0.01027673, "balance_loss_clip": 1.05770195, "balance_loss_mlp": 1.01882827, "epoch": 0.3220104611314856, "flos": 20375571306240.0, "grad_norm": 2.239854653174669, "language_loss": 0.84183598, "learning_rate": 3.170396867149377e-06, "loss": 0.86410141, "num_input_tokens_seen": 57709415, "step": 2678, "time_per_iteration": 2.5963778495788574 }, { "auxiliary_loss_clip": 0.01137047, "auxiliary_loss_mlp": 0.010264, "balance_loss_clip": 1.05215406, "balance_loss_mlp": 1.01785827, "epoch": 0.3221307040221247, "flos": 20117014231680.0, "grad_norm": 1.8077321639356727, "language_loss": 0.86147922, "learning_rate": 3.1697651179754653e-06, "loss": 0.88311368, "num_input_tokens_seen": 57728075, "step": 2679, "time_per_iteration": 2.7663826942443848 }, { "auxiliary_loss_clip": 0.01162432, "auxiliary_loss_mlp": 0.01034202, "balance_loss_clip": 1.05871558, "balance_loss_mlp": 1.02604866, "epoch": 0.3222509469127638, "flos": 23987789602560.0, "grad_norm": 2.0939866428061498, "language_loss": 0.73339653, "learning_rate": 3.1691331913491153e-06, "loss": 0.75536287, "num_input_tokens_seen": 57750645, "step": 2680, "time_per_iteration": 2.8074047565460205 }, { "auxiliary_loss_clip": 0.01199465, "auxiliary_loss_mlp": 0.01031313, "balance_loss_clip": 1.05671239, "balance_loss_mlp": 1.02313566, "epoch": 0.32237118980340285, "flos": 17675735397120.0, "grad_norm": 2.303828002156036, "language_loss": 0.85336363, "learning_rate": 3.1685010873661898e-06, "loss": 0.87567151, "num_input_tokens_seen": 57769820, "step": 2681, "time_per_iteration": 2.620922803878784 }, { "auxiliary_loss_clip": 0.01181703, "auxiliary_loss_mlp": 0.01029544, "balance_loss_clip": 1.05534959, "balance_loss_mlp": 1.02030575, "epoch": 0.32249143269404196, "flos": 23147982645120.0, "grad_norm": 2.841733961832477, "language_loss": 0.7957238, "learning_rate": 3.167868806122578e-06, "loss": 0.81783628, "num_input_tokens_seen": 57788870, "step": 2682, "time_per_iteration": 2.6248533725738525 }, { "auxiliary_loss_clip": 0.01175875, "auxiliary_loss_mlp": 0.0103261, "balance_loss_clip": 1.05551839, "balance_loss_mlp": 1.02386642, "epoch": 0.32261167558468107, "flos": 24422308427520.0, "grad_norm": 1.8500413183996878, "language_loss": 0.6588155, "learning_rate": 3.1672363477141968e-06, "loss": 0.68090034, "num_input_tokens_seen": 57808165, "step": 2683, "time_per_iteration": 2.6878409385681152 }, { "auxiliary_loss_clip": 0.01174659, "auxiliary_loss_mlp": 0.01033253, "balance_loss_clip": 1.05270362, "balance_loss_mlp": 1.02413988, "epoch": 0.3227319184753201, "flos": 30367175852160.0, "grad_norm": 2.385675418284763, "language_loss": 0.85025442, "learning_rate": 3.1666037122369903e-06, "loss": 0.87233359, "num_input_tokens_seen": 57828825, "step": 2684, "time_per_iteration": 2.816071033477783 }, { "auxiliary_loss_clip": 0.01180412, "auxiliary_loss_mlp": 0.01029844, "balance_loss_clip": 1.05033648, "balance_loss_mlp": 1.02032471, "epoch": 0.32285216136595923, "flos": 16946174257920.0, "grad_norm": 7.804299814259131, "language_loss": 0.86498702, "learning_rate": 3.165970899786928e-06, "loss": 0.88708961, "num_input_tokens_seen": 57846740, "step": 2685, "time_per_iteration": 2.7324421405792236 }, { "auxiliary_loss_clip": 0.01161657, "auxiliary_loss_mlp": 0.01032866, "balance_loss_clip": 1.05338311, "balance_loss_mlp": 1.02371097, "epoch": 0.32297240425659834, "flos": 21981532383360.0, "grad_norm": 1.9519212969107351, "language_loss": 0.75558466, "learning_rate": 3.1653379104600067e-06, "loss": 0.77752984, "num_input_tokens_seen": 57866885, "step": 2686, "time_per_iteration": 2.6811389923095703 }, { "auxiliary_loss_clip": 0.01184418, "auxiliary_loss_mlp": 0.01035187, "balance_loss_clip": 1.05381632, "balance_loss_mlp": 1.02635944, "epoch": 0.3230926471472374, "flos": 22748045639040.0, "grad_norm": 1.4794399716933917, "language_loss": 0.69333667, "learning_rate": 3.164704744352251e-06, "loss": 0.71553266, "num_input_tokens_seen": 57887690, "step": 2687, "time_per_iteration": 2.641662836074829 }, { "auxiliary_loss_clip": 0.01180484, "auxiliary_loss_mlp": 0.01028419, "balance_loss_clip": 1.05222178, "balance_loss_mlp": 1.01950192, "epoch": 0.3232128900378765, "flos": 16942977947520.0, "grad_norm": 2.126783457420904, "language_loss": 0.80950356, "learning_rate": 3.164071401559713e-06, "loss": 0.83159256, "num_input_tokens_seen": 57905090, "step": 2688, "time_per_iteration": 2.581743001937866 }, { "auxiliary_loss_clip": 0.01172693, "auxiliary_loss_mlp": 0.01035582, "balance_loss_clip": 1.0538435, "balance_loss_mlp": 1.02692151, "epoch": 0.3233331329285156, "flos": 24023736138240.0, "grad_norm": 1.9411633349304152, "language_loss": 0.71291256, "learning_rate": 3.1634378821784674e-06, "loss": 0.73499537, "num_input_tokens_seen": 57925305, "step": 2689, "time_per_iteration": 3.6882474422454834 }, { "auxiliary_loss_clip": 0.01161486, "auxiliary_loss_mlp": 0.01029921, "balance_loss_clip": 1.05653632, "balance_loss_mlp": 1.02119446, "epoch": 0.3234533758191547, "flos": 18113845582080.0, "grad_norm": 2.181804237186721, "language_loss": 0.74275684, "learning_rate": 3.1628041863046208e-06, "loss": 0.76467097, "num_input_tokens_seen": 57942720, "step": 2690, "time_per_iteration": 2.717665910720825 }, { "auxiliary_loss_clip": 0.01202664, "auxiliary_loss_mlp": 0.01031514, "balance_loss_clip": 1.05548096, "balance_loss_mlp": 1.02238226, "epoch": 0.3235736187097938, "flos": 16946138344320.0, "grad_norm": 2.1688446784817, "language_loss": 0.91246855, "learning_rate": 3.162170314034304e-06, "loss": 0.93481028, "num_input_tokens_seen": 57960135, "step": 2691, "time_per_iteration": 2.5122272968292236 }, { "auxiliary_loss_clip": 0.01204652, "auxiliary_loss_mlp": 0.01034011, "balance_loss_clip": 1.05794799, "balance_loss_mlp": 1.02479577, "epoch": 0.3236938616004329, "flos": 22127150119680.0, "grad_norm": 1.7283905481137893, "language_loss": 0.809744, "learning_rate": 3.1615362654636738e-06, "loss": 0.83213067, "num_input_tokens_seen": 57980875, "step": 2692, "time_per_iteration": 2.5773417949676514 }, { "auxiliary_loss_clip": 0.01152783, "auxiliary_loss_mlp": 0.01033291, "balance_loss_clip": 1.0547297, "balance_loss_mlp": 1.02517891, "epoch": 0.32381410449107195, "flos": 17164618819200.0, "grad_norm": 1.697457137586407, "language_loss": 0.86964941, "learning_rate": 3.1609020406889163e-06, "loss": 0.89151013, "num_input_tokens_seen": 57998310, "step": 2693, "time_per_iteration": 3.5669150352478027 }, { "auxiliary_loss_clip": 0.01170754, "auxiliary_loss_mlp": 0.01034687, "balance_loss_clip": 1.0510149, "balance_loss_mlp": 1.02587748, "epoch": 0.32393434738171106, "flos": 16578125550720.0, "grad_norm": 1.6582915558559088, "language_loss": 0.85000008, "learning_rate": 3.1602676398062416e-06, "loss": 0.87205446, "num_input_tokens_seen": 58017220, "step": 2694, "time_per_iteration": 2.688220977783203 }, { "auxiliary_loss_clip": 0.01181408, "auxiliary_loss_mlp": 0.01030072, "balance_loss_clip": 1.05519795, "balance_loss_mlp": 1.02068436, "epoch": 0.3240545902723502, "flos": 25483612602240.0, "grad_norm": 3.607763711615817, "language_loss": 0.61276221, "learning_rate": 3.1596330629118886e-06, "loss": 0.63487703, "num_input_tokens_seen": 58037190, "step": 2695, "time_per_iteration": 2.675760269165039 }, { "auxiliary_loss_clip": 0.01132187, "auxiliary_loss_mlp": 0.01030921, "balance_loss_clip": 1.04867327, "balance_loss_mlp": 1.0221833, "epoch": 0.32417483316298923, "flos": 35845851634560.0, "grad_norm": 2.080011649110052, "language_loss": 0.73202497, "learning_rate": 3.1589983101021223e-06, "loss": 0.75365609, "num_input_tokens_seen": 58055820, "step": 2696, "time_per_iteration": 3.8391034603118896 }, { "auxiliary_loss_clip": 0.0117194, "auxiliary_loss_mlp": 0.01030873, "balance_loss_clip": 1.05369258, "balance_loss_mlp": 1.02228403, "epoch": 0.32429507605362834, "flos": 30080501406720.0, "grad_norm": 2.095173365583587, "language_loss": 0.84784007, "learning_rate": 3.1583633814732337e-06, "loss": 0.86986822, "num_input_tokens_seen": 58075340, "step": 2697, "time_per_iteration": 3.6581292152404785 }, { "auxiliary_loss_clip": 0.01195993, "auxiliary_loss_mlp": 0.0103467, "balance_loss_clip": 1.0535357, "balance_loss_mlp": 1.02596784, "epoch": 0.3244153189442674, "flos": 18223265387520.0, "grad_norm": 2.4280442470545025, "language_loss": 0.71742296, "learning_rate": 3.157728277121541e-06, "loss": 0.73972964, "num_input_tokens_seen": 58093515, "step": 2698, "time_per_iteration": 2.5995421409606934 }, { "auxiliary_loss_clip": 0.01198776, "auxiliary_loss_mlp": 0.0103385, "balance_loss_clip": 1.05444634, "balance_loss_mlp": 1.02536237, "epoch": 0.3245355618349065, "flos": 17710317216000.0, "grad_norm": 3.4922552122903214, "language_loss": 0.78478909, "learning_rate": 3.1570929971433897e-06, "loss": 0.80711532, "num_input_tokens_seen": 58109300, "step": 2699, "time_per_iteration": 2.5398757457733154 }, { "auxiliary_loss_clip": 0.0118703, "auxiliary_loss_mlp": 0.01035243, "balance_loss_clip": 1.05788898, "balance_loss_mlp": 1.02569437, "epoch": 0.3246558047255456, "flos": 23440798316160.0, "grad_norm": 2.013188420593872, "language_loss": 0.83747292, "learning_rate": 3.1564575416351504e-06, "loss": 0.85969567, "num_input_tokens_seen": 58128000, "step": 2700, "time_per_iteration": 2.707736015319824 }, { "auxiliary_loss_clip": 0.01201643, "auxiliary_loss_mlp": 0.01028354, "balance_loss_clip": 1.05733097, "balance_loss_mlp": 1.01959777, "epoch": 0.32477604761618467, "flos": 21760861178880.0, "grad_norm": 2.2926862266450945, "language_loss": 0.74221408, "learning_rate": 3.155821910693221e-06, "loss": 0.76451403, "num_input_tokens_seen": 58147415, "step": 2701, "time_per_iteration": 2.598891258239746 }, { "auxiliary_loss_clip": 0.01167121, "auxiliary_loss_mlp": 0.01031058, "balance_loss_clip": 1.05099344, "balance_loss_mlp": 1.02143741, "epoch": 0.3248962905068238, "flos": 19828328624640.0, "grad_norm": 1.6585726538962215, "language_loss": 0.86138117, "learning_rate": 3.1551861044140275e-06, "loss": 0.88336301, "num_input_tokens_seen": 58167050, "step": 2702, "time_per_iteration": 2.681370258331299 }, { "auxiliary_loss_clip": 0.01134887, "auxiliary_loss_mlp": 0.01030648, "balance_loss_clip": 1.05053306, "balance_loss_mlp": 1.02134943, "epoch": 0.3250165333974629, "flos": 23948215793280.0, "grad_norm": 1.7630983921575254, "language_loss": 0.77691948, "learning_rate": 3.15455012289402e-06, "loss": 0.79857481, "num_input_tokens_seen": 58186695, "step": 2703, "time_per_iteration": 2.756605625152588 }, { "auxiliary_loss_clip": 0.01186021, "auxiliary_loss_mlp": 0.01030883, "balance_loss_clip": 1.05699563, "balance_loss_mlp": 1.02161479, "epoch": 0.32513677628810195, "flos": 23989333887360.0, "grad_norm": 2.384380470252949, "language_loss": 0.84684229, "learning_rate": 3.153913966229677e-06, "loss": 0.86901134, "num_input_tokens_seen": 58205815, "step": 2704, "time_per_iteration": 2.6943352222442627 }, { "auxiliary_loss_clip": 0.01098558, "auxiliary_loss_mlp": 0.01008128, "balance_loss_clip": 1.02944577, "balance_loss_mlp": 1.00648332, "epoch": 0.32525701917874106, "flos": 70655790009600.0, "grad_norm": 0.644508628970415, "language_loss": 0.50211573, "learning_rate": 3.1532776345175027e-06, "loss": 0.52318257, "num_input_tokens_seen": 58270960, "step": 2705, "time_per_iteration": 3.2086570262908936 }, { "auxiliary_loss_clip": 0.01196053, "auxiliary_loss_mlp": 0.01028983, "balance_loss_clip": 1.05486655, "balance_loss_mlp": 1.01998878, "epoch": 0.32537726206938017, "flos": 19682639061120.0, "grad_norm": 1.7446376379257678, "language_loss": 0.78467262, "learning_rate": 3.1526411278540285e-06, "loss": 0.80692291, "num_input_tokens_seen": 58289390, "step": 2706, "time_per_iteration": 2.5861656665802 }, { "auxiliary_loss_clip": 0.01172528, "auxiliary_loss_mlp": 0.01030973, "balance_loss_clip": 1.04970145, "balance_loss_mlp": 1.02164507, "epoch": 0.3254975049600192, "flos": 28760999293440.0, "grad_norm": 2.4325227784662395, "language_loss": 0.81630635, "learning_rate": 3.1520044463358116e-06, "loss": 0.83834136, "num_input_tokens_seen": 58306120, "step": 2707, "time_per_iteration": 2.69480562210083 }, { "auxiliary_loss_clip": 0.01181407, "auxiliary_loss_mlp": 0.01037312, "balance_loss_clip": 1.05369127, "balance_loss_mlp": 1.0284493, "epoch": 0.32561774785065833, "flos": 18877378008960.0, "grad_norm": 2.6494665956555314, "language_loss": 0.80337524, "learning_rate": 3.151367590059436e-06, "loss": 0.82556248, "num_input_tokens_seen": 58324545, "step": 2708, "time_per_iteration": 2.6525285243988037 }, { "auxiliary_loss_clip": 0.01201753, "auxiliary_loss_mlp": 0.00764278, "balance_loss_clip": 1.05856109, "balance_loss_mlp": 1.00063896, "epoch": 0.32573799074129745, "flos": 23112107936640.0, "grad_norm": 2.003324126507848, "language_loss": 0.86920458, "learning_rate": 3.1507305591215117e-06, "loss": 0.88886493, "num_input_tokens_seen": 58342455, "step": 2709, "time_per_iteration": 2.647879123687744 }, { "auxiliary_loss_clip": 0.01095487, "auxiliary_loss_mlp": 0.01003741, "balance_loss_clip": 1.02736974, "balance_loss_mlp": 1.00204849, "epoch": 0.3258582336319365, "flos": 71237650423680.0, "grad_norm": 0.6793714373587663, "language_loss": 0.55718565, "learning_rate": 3.150093353618677e-06, "loss": 0.57817793, "num_input_tokens_seen": 58407185, "step": 2710, "time_per_iteration": 3.2688558101654053 }, { "auxiliary_loss_clip": 0.01190397, "auxiliary_loss_mlp": 0.01032038, "balance_loss_clip": 1.05490875, "balance_loss_mlp": 1.02254903, "epoch": 0.3259784765225756, "flos": 22456020067200.0, "grad_norm": 2.203156727383354, "language_loss": 0.88424277, "learning_rate": 3.149455973647596e-06, "loss": 0.90646708, "num_input_tokens_seen": 58425245, "step": 2711, "time_per_iteration": 2.727320671081543 }, { "auxiliary_loss_clip": 0.01145603, "auxiliary_loss_mlp": 0.01037936, "balance_loss_clip": 1.04603148, "balance_loss_mlp": 1.02862036, "epoch": 0.32609871941321467, "flos": 20484811543680.0, "grad_norm": 2.0436688276475294, "language_loss": 0.77381182, "learning_rate": 3.1488184193049563e-06, "loss": 0.79564726, "num_input_tokens_seen": 58444780, "step": 2712, "time_per_iteration": 2.6932926177978516 }, { "auxiliary_loss_clip": 0.01195645, "auxiliary_loss_mlp": 0.01032021, "balance_loss_clip": 1.05589318, "balance_loss_mlp": 1.02297878, "epoch": 0.3262189623038538, "flos": 22416805393920.0, "grad_norm": 1.927821861631247, "language_loss": 0.72152805, "learning_rate": 3.1481806906874767e-06, "loss": 0.74380469, "num_input_tokens_seen": 58466090, "step": 2713, "time_per_iteration": 2.6816134452819824 }, { "auxiliary_loss_clip": 0.01199366, "auxiliary_loss_mlp": 0.01030634, "balance_loss_clip": 1.05724692, "balance_loss_mlp": 1.02201486, "epoch": 0.3263392051944929, "flos": 20923496346240.0, "grad_norm": 1.688457753955921, "language_loss": 0.87984264, "learning_rate": 3.147542787891899e-06, "loss": 0.90214264, "num_input_tokens_seen": 58485435, "step": 2714, "time_per_iteration": 2.5901939868927 }, { "auxiliary_loss_clip": 0.01175617, "auxiliary_loss_mlp": 0.01040731, "balance_loss_clip": 1.05757809, "balance_loss_mlp": 1.03105128, "epoch": 0.32645944808513194, "flos": 24025172682240.0, "grad_norm": 1.7642589594246745, "language_loss": 0.75608838, "learning_rate": 3.1469047110149926e-06, "loss": 0.77825189, "num_input_tokens_seen": 58504175, "step": 2715, "time_per_iteration": 2.7627196311950684 }, { "auxiliary_loss_clip": 0.01131864, "auxiliary_loss_mlp": 0.01040227, "balance_loss_clip": 1.04791284, "balance_loss_mlp": 1.03082681, "epoch": 0.32657969097577105, "flos": 21032413361280.0, "grad_norm": 2.1669625716632903, "language_loss": 0.85252929, "learning_rate": 3.146266460153554e-06, "loss": 0.87425017, "num_input_tokens_seen": 58523885, "step": 2716, "time_per_iteration": 3.6698122024536133 }, { "auxiliary_loss_clip": 0.01166124, "auxiliary_loss_mlp": 0.00764439, "balance_loss_clip": 1.0531404, "balance_loss_mlp": 1.00055552, "epoch": 0.32669993386641016, "flos": 22710267509760.0, "grad_norm": 1.6507118938602057, "language_loss": 0.80202168, "learning_rate": 3.145628035404404e-06, "loss": 0.82132733, "num_input_tokens_seen": 58543085, "step": 2717, "time_per_iteration": 2.696194887161255 }, { "auxiliary_loss_clip": 0.01089147, "auxiliary_loss_mlp": 0.0100364, "balance_loss_clip": 1.02462018, "balance_loss_mlp": 1.00189996, "epoch": 0.3268201767570492, "flos": 72105718406400.0, "grad_norm": 0.8844296341462872, "language_loss": 0.57475275, "learning_rate": 3.1449894368643922e-06, "loss": 0.59568065, "num_input_tokens_seen": 58605400, "step": 2718, "time_per_iteration": 3.3167662620544434 }, { "auxiliary_loss_clip": 0.01156548, "auxiliary_loss_mlp": 0.0103612, "balance_loss_clip": 1.05682421, "balance_loss_mlp": 1.02787721, "epoch": 0.32694041964768833, "flos": 24535175938560.0, "grad_norm": 1.495595141410126, "language_loss": 0.71276188, "learning_rate": 3.1443506646303934e-06, "loss": 0.73468852, "num_input_tokens_seen": 58626700, "step": 2719, "time_per_iteration": 3.6778078079223633 }, { "auxiliary_loss_clip": 0.01187519, "auxiliary_loss_mlp": 0.01033361, "balance_loss_clip": 1.05527353, "balance_loss_mlp": 1.02390218, "epoch": 0.32706066253832744, "flos": 33183003755520.0, "grad_norm": 1.8922990074290353, "language_loss": 0.66900104, "learning_rate": 3.1437117187993086e-06, "loss": 0.69120979, "num_input_tokens_seen": 58649020, "step": 2720, "time_per_iteration": 2.716674327850342 }, { "auxiliary_loss_clip": 0.01146954, "auxiliary_loss_mlp": 0.01036626, "balance_loss_clip": 1.04799747, "balance_loss_mlp": 1.02787006, "epoch": 0.3271809054289665, "flos": 24061622008320.0, "grad_norm": 1.52626288594651, "language_loss": 0.80026329, "learning_rate": 3.143072599468065e-06, "loss": 0.82209909, "num_input_tokens_seen": 58668845, "step": 2721, "time_per_iteration": 2.7395308017730713 }, { "auxiliary_loss_clip": 0.01169925, "auxiliary_loss_mlp": 0.01033294, "balance_loss_clip": 1.05724251, "balance_loss_mlp": 1.02516353, "epoch": 0.3273011483196056, "flos": 38253769712640.0, "grad_norm": 2.0901700707802555, "language_loss": 0.755234, "learning_rate": 3.1424333067336174e-06, "loss": 0.7772662, "num_input_tokens_seen": 58691610, "step": 2722, "time_per_iteration": 2.767987012863159 }, { "auxiliary_loss_clip": 0.01188489, "auxiliary_loss_mlp": 0.0102626, "balance_loss_clip": 1.05452013, "balance_loss_mlp": 1.01652098, "epoch": 0.3274213912102447, "flos": 29054389582080.0, "grad_norm": 1.9110965813729102, "language_loss": 0.78544033, "learning_rate": 3.141793840692945e-06, "loss": 0.8075878, "num_input_tokens_seen": 58712360, "step": 2723, "time_per_iteration": 4.443854093551636 }, { "auxiliary_loss_clip": 0.01159917, "auxiliary_loss_mlp": 0.0103846, "balance_loss_clip": 1.05030847, "balance_loss_mlp": 1.02907276, "epoch": 0.32754163410088377, "flos": 29133249891840.0, "grad_norm": 4.375530108118556, "language_loss": 0.61836398, "learning_rate": 3.1411542014430553e-06, "loss": 0.64034784, "num_input_tokens_seen": 58733440, "step": 2724, "time_per_iteration": 2.7210636138916016 }, { "auxiliary_loss_clip": 0.01154698, "auxiliary_loss_mlp": 0.01030267, "balance_loss_clip": 1.04926348, "balance_loss_mlp": 1.02236366, "epoch": 0.3276618769915229, "flos": 20631075724800.0, "grad_norm": 1.675444218359621, "language_loss": 0.82086873, "learning_rate": 3.1405143890809804e-06, "loss": 0.84271836, "num_input_tokens_seen": 58752735, "step": 2725, "time_per_iteration": 2.692605972290039 }, { "auxiliary_loss_clip": 0.01165002, "auxiliary_loss_mlp": 0.0102873, "balance_loss_clip": 1.05159974, "balance_loss_mlp": 1.02024841, "epoch": 0.327782119882162, "flos": 18657425076480.0, "grad_norm": 1.7586110953937648, "language_loss": 0.7004565, "learning_rate": 3.1398744037037796e-06, "loss": 0.72239381, "num_input_tokens_seen": 58772070, "step": 2726, "time_per_iteration": 2.655078411102295 }, { "auxiliary_loss_clip": 0.01168012, "auxiliary_loss_mlp": 0.0103649, "balance_loss_clip": 1.05345798, "balance_loss_mlp": 1.02818704, "epoch": 0.32790236277280105, "flos": 21795802133760.0, "grad_norm": 2.432855516289774, "language_loss": 0.84516215, "learning_rate": 3.139234245408538e-06, "loss": 0.86720717, "num_input_tokens_seen": 58790950, "step": 2727, "time_per_iteration": 2.662137269973755 }, { "auxiliary_loss_clip": 0.01153588, "auxiliary_loss_mlp": 0.00763653, "balance_loss_clip": 1.05129695, "balance_loss_mlp": 1.00061703, "epoch": 0.32802260566344016, "flos": 23331414424320.0, "grad_norm": 1.8418717717494817, "language_loss": 0.7591095, "learning_rate": 3.1385939142923666e-06, "loss": 0.77828193, "num_input_tokens_seen": 58813340, "step": 2728, "time_per_iteration": 2.773582696914673 }, { "auxiliary_loss_clip": 0.01169, "auxiliary_loss_mlp": 0.01034719, "balance_loss_clip": 1.05198228, "balance_loss_mlp": 1.02545059, "epoch": 0.3281428485540792, "flos": 24206988349440.0, "grad_norm": 2.3015369566123205, "language_loss": 0.7830205, "learning_rate": 3.137953410452405e-06, "loss": 0.80505764, "num_input_tokens_seen": 58833610, "step": 2729, "time_per_iteration": 2.651838779449463 }, { "auxiliary_loss_clip": 0.01166122, "auxiliary_loss_mlp": 0.01026874, "balance_loss_clip": 1.05188966, "balance_loss_mlp": 1.01823723, "epoch": 0.3282630914447183, "flos": 34128962380800.0, "grad_norm": 1.6071313154171916, "language_loss": 0.7462821, "learning_rate": 3.1373127339858146e-06, "loss": 0.76821208, "num_input_tokens_seen": 58856210, "step": 2730, "time_per_iteration": 2.838601589202881 }, { "auxiliary_loss_clip": 0.01148767, "auxiliary_loss_mlp": 0.01024677, "balance_loss_clip": 1.05100334, "balance_loss_mlp": 1.01673782, "epoch": 0.32838333433535744, "flos": 27600726170880.0, "grad_norm": 1.8124446777107657, "language_loss": 0.74855691, "learning_rate": 3.136671884989787e-06, "loss": 0.77029139, "num_input_tokens_seen": 58876120, "step": 2731, "time_per_iteration": 2.731778621673584 }, { "auxiliary_loss_clip": 0.01127252, "auxiliary_loss_mlp": 0.01034107, "balance_loss_clip": 1.04796124, "balance_loss_mlp": 1.02526712, "epoch": 0.3285035772259965, "flos": 12349500935040.0, "grad_norm": 3.6125201833346297, "language_loss": 0.87894964, "learning_rate": 3.1360308635615383e-06, "loss": 0.90056324, "num_input_tokens_seen": 58894660, "step": 2732, "time_per_iteration": 2.7847747802734375 }, { "auxiliary_loss_clip": 0.01177303, "auxiliary_loss_mlp": 0.01027616, "balance_loss_clip": 1.05559576, "balance_loss_mlp": 1.01837111, "epoch": 0.3286238201166356, "flos": 24316084932480.0, "grad_norm": 2.5663987903993926, "language_loss": 0.78839916, "learning_rate": 3.135389669798311e-06, "loss": 0.81044841, "num_input_tokens_seen": 58912720, "step": 2733, "time_per_iteration": 2.718369245529175 }, { "auxiliary_loss_clip": 0.01180951, "auxiliary_loss_mlp": 0.00763938, "balance_loss_clip": 1.05279458, "balance_loss_mlp": 1.00051045, "epoch": 0.3287440630072747, "flos": 21392812471680.0, "grad_norm": 2.1203648417051997, "language_loss": 0.7999928, "learning_rate": 3.134748303797373e-06, "loss": 0.81944174, "num_input_tokens_seen": 58930090, "step": 2734, "time_per_iteration": 2.622119665145874 }, { "auxiliary_loss_clip": 0.01137341, "auxiliary_loss_mlp": 0.01033878, "balance_loss_clip": 1.04753184, "balance_loss_mlp": 1.02419257, "epoch": 0.32886430589791377, "flos": 23732536579200.0, "grad_norm": 2.3062577776158757, "language_loss": 0.8126601, "learning_rate": 3.1341067656560203e-06, "loss": 0.83437228, "num_input_tokens_seen": 58947935, "step": 2735, "time_per_iteration": 2.83725643157959 }, { "auxiliary_loss_clip": 0.011772, "auxiliary_loss_mlp": 0.01025924, "balance_loss_clip": 1.05431223, "balance_loss_mlp": 1.01684642, "epoch": 0.3289845487885529, "flos": 22418708814720.0, "grad_norm": 1.9174676875169832, "language_loss": 0.8644948, "learning_rate": 3.133465055471572e-06, "loss": 0.88652605, "num_input_tokens_seen": 58967720, "step": 2736, "time_per_iteration": 2.7002885341644287 }, { "auxiliary_loss_clip": 0.01145983, "auxiliary_loss_mlp": 0.01034743, "balance_loss_clip": 1.05050159, "balance_loss_mlp": 1.02574217, "epoch": 0.329104791679192, "flos": 19682603147520.0, "grad_norm": 2.4803616734978022, "language_loss": 0.66790497, "learning_rate": 3.1328231733413767e-06, "loss": 0.68971217, "num_input_tokens_seen": 58984360, "step": 2737, "time_per_iteration": 2.7366583347320557 }, { "auxiliary_loss_clip": 0.01177782, "auxiliary_loss_mlp": 0.01029465, "balance_loss_clip": 1.05436945, "balance_loss_mlp": 1.02074456, "epoch": 0.32922503456983104, "flos": 15997234803840.0, "grad_norm": 2.0592519836628282, "language_loss": 0.91126168, "learning_rate": 3.1321811193628067e-06, "loss": 0.93333417, "num_input_tokens_seen": 59002505, "step": 2738, "time_per_iteration": 2.6712002754211426 }, { "auxiliary_loss_clip": 0.0118352, "auxiliary_loss_mlp": 0.00764426, "balance_loss_clip": 1.05516744, "balance_loss_mlp": 1.00059998, "epoch": 0.32934527746047015, "flos": 26834069260800.0, "grad_norm": 1.8715024411965862, "language_loss": 0.69649929, "learning_rate": 3.131538893633261e-06, "loss": 0.71597874, "num_input_tokens_seen": 59022065, "step": 2739, "time_per_iteration": 2.8487672805786133 }, { "auxiliary_loss_clip": 0.01199617, "auxiliary_loss_mlp": 0.01027976, "balance_loss_clip": 1.05857217, "balance_loss_mlp": 1.01952386, "epoch": 0.32946552035110926, "flos": 23403774372480.0, "grad_norm": 2.310796069720394, "language_loss": 0.77843308, "learning_rate": 3.130896496250165e-06, "loss": 0.80070901, "num_input_tokens_seen": 59041890, "step": 2740, "time_per_iteration": 2.709935426712036 }, { "auxiliary_loss_clip": 0.01196597, "auxiliary_loss_mlp": 0.0102801, "balance_loss_clip": 1.05300164, "balance_loss_mlp": 1.019135, "epoch": 0.3295857632417483, "flos": 14172470029440.0, "grad_norm": 1.9443878705393807, "language_loss": 0.87073892, "learning_rate": 3.1302539273109693e-06, "loss": 0.89298493, "num_input_tokens_seen": 59058715, "step": 2741, "time_per_iteration": 3.503798007965088 }, { "auxiliary_loss_clip": 0.01163464, "auxiliary_loss_mlp": 0.01032671, "balance_loss_clip": 1.05428672, "balance_loss_mlp": 1.02360559, "epoch": 0.32970600613238743, "flos": 22196708807040.0, "grad_norm": 1.6440839861621774, "language_loss": 0.80638808, "learning_rate": 3.1296111869131513e-06, "loss": 0.82834941, "num_input_tokens_seen": 59076140, "step": 2742, "time_per_iteration": 2.716278553009033 }, { "auxiliary_loss_clip": 0.01195244, "auxiliary_loss_mlp": 0.01034529, "balance_loss_clip": 1.05435705, "balance_loss_mlp": 1.02601159, "epoch": 0.32982624902302654, "flos": 22053784590720.0, "grad_norm": 1.924245613651228, "language_loss": 0.86040592, "learning_rate": 3.1289682751542153e-06, "loss": 0.88270366, "num_input_tokens_seen": 59095700, "step": 2743, "time_per_iteration": 2.594625234603882 }, { "auxiliary_loss_clip": 0.01177995, "auxiliary_loss_mlp": 0.01028528, "balance_loss_clip": 1.05233264, "balance_loss_mlp": 1.019665, "epoch": 0.3299464919136656, "flos": 18661626967680.0, "grad_norm": 1.9313328465259096, "language_loss": 0.71468514, "learning_rate": 3.1283251921316883e-06, "loss": 0.73675036, "num_input_tokens_seen": 59113445, "step": 2744, "time_per_iteration": 2.6850879192352295 }, { "auxiliary_loss_clip": 0.01138701, "auxiliary_loss_mlp": 0.01030974, "balance_loss_clip": 1.05246329, "balance_loss_mlp": 1.02320099, "epoch": 0.3300667348043047, "flos": 13407357404160.0, "grad_norm": 3.517469270049778, "language_loss": 0.80781817, "learning_rate": 3.1276819379431277e-06, "loss": 0.82951498, "num_input_tokens_seen": 59131535, "step": 2745, "time_per_iteration": 3.630502223968506 }, { "auxiliary_loss_clip": 0.01175063, "auxiliary_loss_mlp": 0.00764098, "balance_loss_clip": 1.0537591, "balance_loss_mlp": 1.00050354, "epoch": 0.33018697769494376, "flos": 15742556398080.0, "grad_norm": 2.0751916934723162, "language_loss": 0.75446367, "learning_rate": 3.1270385126861134e-06, "loss": 0.77385527, "num_input_tokens_seen": 59149520, "step": 2746, "time_per_iteration": 2.673215389251709 }, { "auxiliary_loss_clip": 0.01195992, "auxiliary_loss_mlp": 0.01029203, "balance_loss_clip": 1.0543673, "balance_loss_mlp": 1.02067971, "epoch": 0.3303072205855829, "flos": 18258601392000.0, "grad_norm": 1.8595928738847969, "language_loss": 0.81959462, "learning_rate": 3.1263949164582533e-06, "loss": 0.84184659, "num_input_tokens_seen": 59169170, "step": 2747, "time_per_iteration": 2.5732316970825195 }, { "auxiliary_loss_clip": 0.01197703, "auxiliary_loss_mlp": 0.01028638, "balance_loss_clip": 1.05394101, "balance_loss_mlp": 1.02072811, "epoch": 0.330427463476222, "flos": 17749424148480.0, "grad_norm": 3.980768436195556, "language_loss": 0.78436404, "learning_rate": 3.1257511493571797e-06, "loss": 0.80662745, "num_input_tokens_seen": 59187675, "step": 2748, "time_per_iteration": 3.5830931663513184 }, { "auxiliary_loss_clip": 0.01156475, "auxiliary_loss_mlp": 0.01031397, "balance_loss_clip": 1.05486333, "balance_loss_mlp": 1.02307653, "epoch": 0.33054770636686104, "flos": 27162580072320.0, "grad_norm": 2.4615579324648094, "language_loss": 0.78758657, "learning_rate": 3.125107211480552e-06, "loss": 0.80946529, "num_input_tokens_seen": 59207610, "step": 2749, "time_per_iteration": 2.7574968338012695 }, { "auxiliary_loss_clip": 0.01122088, "auxiliary_loss_mlp": 0.01030103, "balance_loss_clip": 1.04756391, "balance_loss_mlp": 1.02170157, "epoch": 0.33066794925750015, "flos": 20117193799680.0, "grad_norm": 1.660941808985637, "language_loss": 0.7993902, "learning_rate": 3.124463102926054e-06, "loss": 0.82091212, "num_input_tokens_seen": 59226945, "step": 2750, "time_per_iteration": 3.693782329559326 }, { "auxiliary_loss_clip": 0.0108432, "auxiliary_loss_mlp": 0.01003989, "balance_loss_clip": 1.02398694, "balance_loss_mlp": 1.0021174, "epoch": 0.33078819214813926, "flos": 70642609718400.0, "grad_norm": 0.8006265573960069, "language_loss": 0.61651969, "learning_rate": 3.1238188237913984e-06, "loss": 0.63740277, "num_input_tokens_seen": 59291485, "step": 2751, "time_per_iteration": 3.2915022373199463 }, { "auxiliary_loss_clip": 0.01205073, "auxiliary_loss_mlp": 0.01035457, "balance_loss_clip": 1.059147, "balance_loss_mlp": 1.0262605, "epoch": 0.3309084350387783, "flos": 21141940907520.0, "grad_norm": 3.467555231418141, "language_loss": 0.76673847, "learning_rate": 3.1231743741743202e-06, "loss": 0.7891438, "num_input_tokens_seen": 59310990, "step": 2752, "time_per_iteration": 2.6316752433776855 }, { "auxiliary_loss_clip": 0.01177779, "auxiliary_loss_mlp": 0.01027107, "balance_loss_clip": 1.05091429, "balance_loss_mlp": 1.01798749, "epoch": 0.3310286779294174, "flos": 14209350318720.0, "grad_norm": 3.5509819221546177, "language_loss": 0.84183729, "learning_rate": 3.122529754172582e-06, "loss": 0.86388612, "num_input_tokens_seen": 59327875, "step": 2753, "time_per_iteration": 2.5843453407287598 }, { "auxiliary_loss_clip": 0.01183911, "auxiliary_loss_mlp": 0.01027278, "balance_loss_clip": 1.05586922, "balance_loss_mlp": 1.01858187, "epoch": 0.33114892082005654, "flos": 20778130005120.0, "grad_norm": 1.8537745980482834, "language_loss": 0.72648108, "learning_rate": 3.1218849638839736e-06, "loss": 0.74859297, "num_input_tokens_seen": 59347135, "step": 2754, "time_per_iteration": 2.6683666706085205 }, { "auxiliary_loss_clip": 0.01140969, "auxiliary_loss_mlp": 0.01035068, "balance_loss_clip": 1.04611528, "balance_loss_mlp": 1.02560902, "epoch": 0.3312691637106956, "flos": 17090750499840.0, "grad_norm": 2.4195691490461813, "language_loss": 0.78467965, "learning_rate": 3.121240003406307e-06, "loss": 0.80644006, "num_input_tokens_seen": 59365985, "step": 2755, "time_per_iteration": 2.6800169944763184 }, { "auxiliary_loss_clip": 0.01160748, "auxiliary_loss_mlp": 0.01040033, "balance_loss_clip": 1.05449092, "balance_loss_mlp": 1.03066301, "epoch": 0.3313894066013347, "flos": 29456230008960.0, "grad_norm": 2.187078467345408, "language_loss": 0.72534245, "learning_rate": 3.120594872837425e-06, "loss": 0.74735034, "num_input_tokens_seen": 59384655, "step": 2756, "time_per_iteration": 2.8027520179748535 }, { "auxiliary_loss_clip": 0.01088092, "auxiliary_loss_mlp": 0.00755158, "balance_loss_clip": 1.02406764, "balance_loss_mlp": 1.00024736, "epoch": 0.3315096494919738, "flos": 61419242280960.0, "grad_norm": 0.8292190739309188, "language_loss": 0.62370116, "learning_rate": 3.1199495722751906e-06, "loss": 0.64213377, "num_input_tokens_seen": 59444185, "step": 2757, "time_per_iteration": 3.2961905002593994 }, { "auxiliary_loss_clip": 0.01145187, "auxiliary_loss_mlp": 0.01032607, "balance_loss_clip": 1.05067515, "balance_loss_mlp": 1.02375603, "epoch": 0.33162989238261287, "flos": 21653057485440.0, "grad_norm": 2.175579658264054, "language_loss": 0.84069932, "learning_rate": 3.1193041018174972e-06, "loss": 0.86247718, "num_input_tokens_seen": 59464900, "step": 2758, "time_per_iteration": 2.7551767826080322 }, { "auxiliary_loss_clip": 0.01189524, "auxiliary_loss_mlp": 0.01029348, "balance_loss_clip": 1.05880404, "balance_loss_mlp": 1.0201565, "epoch": 0.331750135273252, "flos": 22674787850880.0, "grad_norm": 3.8892618632284224, "language_loss": 0.94621348, "learning_rate": 3.118658461562261e-06, "loss": 0.96840221, "num_input_tokens_seen": 59481000, "step": 2759, "time_per_iteration": 2.6369640827178955 }, { "auxiliary_loss_clip": 0.0116993, "auxiliary_loss_mlp": 0.01033705, "balance_loss_clip": 1.05734086, "balance_loss_mlp": 1.0245738, "epoch": 0.33187037816389103, "flos": 22746896403840.0, "grad_norm": 1.4501376393381444, "language_loss": 0.84905493, "learning_rate": 3.118012651607426e-06, "loss": 0.87109131, "num_input_tokens_seen": 59502605, "step": 2760, "time_per_iteration": 2.7259433269500732 }, { "auxiliary_loss_clip": 0.01201352, "auxiliary_loss_mlp": 0.01040284, "balance_loss_clip": 1.05892813, "balance_loss_mlp": 1.03139138, "epoch": 0.33199062105453014, "flos": 19203769918080.0, "grad_norm": 2.1611612267010822, "language_loss": 0.83954895, "learning_rate": 3.1173666720509603e-06, "loss": 0.86196536, "num_input_tokens_seen": 59519540, "step": 2761, "time_per_iteration": 2.7091403007507324 }, { "auxiliary_loss_clip": 0.01174641, "auxiliary_loss_mlp": 0.01028537, "balance_loss_clip": 1.05453396, "balance_loss_mlp": 1.02000809, "epoch": 0.33211086394516925, "flos": 31577006764800.0, "grad_norm": 1.6927443908533388, "language_loss": 0.68455118, "learning_rate": 3.116720522990859e-06, "loss": 0.7065829, "num_input_tokens_seen": 59540415, "step": 2762, "time_per_iteration": 2.7502403259277344 }, { "auxiliary_loss_clip": 0.01127735, "auxiliary_loss_mlp": 0.01039812, "balance_loss_clip": 1.04798675, "balance_loss_mlp": 1.03077543, "epoch": 0.3322311068358083, "flos": 17932496791680.0, "grad_norm": 2.871445007361222, "language_loss": 0.61708784, "learning_rate": 3.116074204525142e-06, "loss": 0.63876331, "num_input_tokens_seen": 59558590, "step": 2763, "time_per_iteration": 2.807497501373291 }, { "auxiliary_loss_clip": 0.01175602, "auxiliary_loss_mlp": 0.01032169, "balance_loss_clip": 1.05485713, "balance_loss_mlp": 1.02225637, "epoch": 0.3323513497264474, "flos": 32269831269120.0, "grad_norm": 1.5450952812937975, "language_loss": 0.83677655, "learning_rate": 3.1154277167518553e-06, "loss": 0.85885429, "num_input_tokens_seen": 59580205, "step": 2764, "time_per_iteration": 2.7140774726867676 }, { "auxiliary_loss_clip": 0.01069754, "auxiliary_loss_mlp": 0.0100307, "balance_loss_clip": 1.02085829, "balance_loss_mlp": 1.00135314, "epoch": 0.33247159261708653, "flos": 52668674588160.0, "grad_norm": 0.7875501090503614, "language_loss": 0.59536761, "learning_rate": 3.114781059769072e-06, "loss": 0.61609584, "num_input_tokens_seen": 59631530, "step": 2765, "time_per_iteration": 3.17437481880188 }, { "auxiliary_loss_clip": 0.01168423, "auxiliary_loss_mlp": 0.01031339, "balance_loss_clip": 1.05361676, "balance_loss_mlp": 1.02169538, "epoch": 0.3325918355077256, "flos": 27125232906240.0, "grad_norm": 2.705560606974291, "language_loss": 0.67095315, "learning_rate": 3.1141342336748874e-06, "loss": 0.69295079, "num_input_tokens_seen": 59651090, "step": 2766, "time_per_iteration": 2.717733144760132 }, { "auxiliary_loss_clip": 0.01180643, "auxiliary_loss_mlp": 0.01029118, "balance_loss_clip": 1.05492651, "balance_loss_mlp": 1.01990902, "epoch": 0.3327120783983647, "flos": 23664414435840.0, "grad_norm": 1.7772123307067857, "language_loss": 0.82220244, "learning_rate": 3.1134872385674253e-06, "loss": 0.84430003, "num_input_tokens_seen": 59675245, "step": 2767, "time_per_iteration": 3.589491605758667 }, { "auxiliary_loss_clip": 0.01169875, "auxiliary_loss_mlp": 0.01033348, "balance_loss_clip": 1.05099547, "balance_loss_mlp": 1.02409136, "epoch": 0.3328323212890038, "flos": 19171378828800.0, "grad_norm": 1.7526796924187293, "language_loss": 0.85739338, "learning_rate": 3.1128400745448353e-06, "loss": 0.87942559, "num_input_tokens_seen": 59694625, "step": 2768, "time_per_iteration": 2.6876275539398193 }, { "auxiliary_loss_clip": 0.01188025, "auxiliary_loss_mlp": 0.0103445, "balance_loss_clip": 1.05657089, "balance_loss_mlp": 1.02541947, "epoch": 0.33295256417964286, "flos": 37706347463040.0, "grad_norm": 2.052329576057974, "language_loss": 0.62879753, "learning_rate": 3.11219274170529e-06, "loss": 0.65102232, "num_input_tokens_seen": 59716435, "step": 2769, "time_per_iteration": 2.8205909729003906 }, { "auxiliary_loss_clip": 0.0116519, "auxiliary_loss_mlp": 0.01036768, "balance_loss_clip": 1.05313504, "balance_loss_mlp": 1.02814937, "epoch": 0.333072807070282, "flos": 26505989412480.0, "grad_norm": 2.02557401494615, "language_loss": 0.81650162, "learning_rate": 3.1115452401469903e-06, "loss": 0.83852124, "num_input_tokens_seen": 59736835, "step": 2770, "time_per_iteration": 2.7227792739868164 }, { "auxiliary_loss_clip": 0.01131542, "auxiliary_loss_mlp": 0.0103758, "balance_loss_clip": 1.0474422, "balance_loss_mlp": 1.02844262, "epoch": 0.3331930499609211, "flos": 21430913823360.0, "grad_norm": 1.9302578542474742, "language_loss": 0.86741459, "learning_rate": 3.1108975699681613e-06, "loss": 0.8891058, "num_input_tokens_seen": 59754230, "step": 2771, "time_per_iteration": 3.7085483074188232 }, { "auxiliary_loss_clip": 0.01153837, "auxiliary_loss_mlp": 0.01028927, "balance_loss_clip": 1.05171514, "balance_loss_mlp": 1.02066028, "epoch": 0.33331329285156014, "flos": 20659947281280.0, "grad_norm": 2.1672825587528064, "language_loss": 0.71801674, "learning_rate": 3.1102497312670542e-06, "loss": 0.73984438, "num_input_tokens_seen": 59772235, "step": 2772, "time_per_iteration": 2.729571580886841 }, { "auxiliary_loss_clip": 0.0115754, "auxiliary_loss_mlp": 0.01029777, "balance_loss_clip": 1.05178428, "balance_loss_mlp": 1.02131295, "epoch": 0.33343353574219925, "flos": 28001596930560.0, "grad_norm": 1.9510537949633502, "language_loss": 0.80806005, "learning_rate": 3.109601724141946e-06, "loss": 0.82993329, "num_input_tokens_seen": 59791230, "step": 2773, "time_per_iteration": 2.782825469970703 }, { "auxiliary_loss_clip": 0.01167232, "auxiliary_loss_mlp": 0.0104062, "balance_loss_clip": 1.0544939, "balance_loss_mlp": 1.0312264, "epoch": 0.33355377863283836, "flos": 23764963582080.0, "grad_norm": 1.6834253650595827, "language_loss": 0.68229687, "learning_rate": 3.108953548691138e-06, "loss": 0.70437539, "num_input_tokens_seen": 59811315, "step": 2774, "time_per_iteration": 3.6489365100860596 }, { "auxiliary_loss_clip": 0.01199203, "auxiliary_loss_mlp": 0.01034402, "balance_loss_clip": 1.05658519, "balance_loss_mlp": 1.02480602, "epoch": 0.3336740215234774, "flos": 37779677078400.0, "grad_norm": 2.4904774893621227, "language_loss": 0.72381419, "learning_rate": 3.108305205012959e-06, "loss": 0.74615026, "num_input_tokens_seen": 59832010, "step": 2775, "time_per_iteration": 3.71881103515625 }, { "auxiliary_loss_clip": 0.01172031, "auxiliary_loss_mlp": 0.01028271, "balance_loss_clip": 1.05593038, "balance_loss_mlp": 1.0196166, "epoch": 0.3337942644141165, "flos": 25519056347520.0, "grad_norm": 2.065119224142056, "language_loss": 0.87441444, "learning_rate": 3.107656693205761e-06, "loss": 0.89641738, "num_input_tokens_seen": 59851450, "step": 2776, "time_per_iteration": 2.7264277935028076 }, { "auxiliary_loss_clip": 0.01204706, "auxiliary_loss_mlp": 0.01035209, "balance_loss_clip": 1.0582993, "balance_loss_mlp": 1.0246892, "epoch": 0.3339145073047556, "flos": 25989844930560.0, "grad_norm": 2.7657690928032226, "language_loss": 0.70830971, "learning_rate": 3.107008013367924e-06, "loss": 0.7307089, "num_input_tokens_seen": 59870245, "step": 2777, "time_per_iteration": 2.5891430377960205 }, { "auxiliary_loss_clip": 0.01154088, "auxiliary_loss_mlp": 0.01038114, "balance_loss_clip": 1.05188537, "balance_loss_mlp": 1.02866042, "epoch": 0.3340347501953947, "flos": 19062569554560.0, "grad_norm": 4.63623091206627, "language_loss": 0.86902034, "learning_rate": 3.1063591655978507e-06, "loss": 0.89094239, "num_input_tokens_seen": 59886195, "step": 2778, "time_per_iteration": 2.652740716934204 }, { "auxiliary_loss_clip": 0.01129116, "auxiliary_loss_mlp": 0.01035561, "balance_loss_clip": 1.0468179, "balance_loss_mlp": 1.02694845, "epoch": 0.3341549930860338, "flos": 18109715518080.0, "grad_norm": 1.7381894403158913, "language_loss": 0.79720557, "learning_rate": 3.105710149993972e-06, "loss": 0.81885231, "num_input_tokens_seen": 59905525, "step": 2779, "time_per_iteration": 2.808835506439209 }, { "auxiliary_loss_clip": 0.01203642, "auxiliary_loss_mlp": 0.01033911, "balance_loss_clip": 1.06002271, "balance_loss_mlp": 1.02495885, "epoch": 0.33427523597667286, "flos": 22674967418880.0, "grad_norm": 1.941720006706963, "language_loss": 0.8541255, "learning_rate": 3.1050609666547427e-06, "loss": 0.87650096, "num_input_tokens_seen": 59925085, "step": 2780, "time_per_iteration": 2.6581199169158936 }, { "auxiliary_loss_clip": 0.01161051, "auxiliary_loss_mlp": 0.01034235, "balance_loss_clip": 1.0537504, "balance_loss_mlp": 1.02484727, "epoch": 0.33439547886731197, "flos": 22638338524800.0, "grad_norm": 5.283062925689047, "language_loss": 0.77756256, "learning_rate": 3.104411615678644e-06, "loss": 0.79951537, "num_input_tokens_seen": 59943935, "step": 2781, "time_per_iteration": 2.7014265060424805 }, { "auxiliary_loss_clip": 0.01169294, "auxiliary_loss_mlp": 0.01034572, "balance_loss_clip": 1.05559826, "balance_loss_mlp": 1.02478504, "epoch": 0.3345157217579511, "flos": 24096383395200.0, "grad_norm": 2.3102890478465645, "language_loss": 0.73055589, "learning_rate": 3.1037620971641803e-06, "loss": 0.75259459, "num_input_tokens_seen": 59963725, "step": 2782, "time_per_iteration": 2.751354932785034 }, { "auxiliary_loss_clip": 0.01200128, "auxiliary_loss_mlp": 0.0103606, "balance_loss_clip": 1.05684745, "balance_loss_mlp": 1.02649939, "epoch": 0.33463596464859013, "flos": 18989491334400.0, "grad_norm": 3.1767676337035335, "language_loss": 0.64895451, "learning_rate": 3.1031124112098844e-06, "loss": 0.67131639, "num_input_tokens_seen": 59981935, "step": 2783, "time_per_iteration": 2.656076431274414 }, { "auxiliary_loss_clip": 0.01175575, "auxiliary_loss_mlp": 0.01032925, "balance_loss_clip": 1.05662513, "balance_loss_mlp": 1.02416897, "epoch": 0.33475620753922924, "flos": 20375607219840.0, "grad_norm": 2.0334236063998334, "language_loss": 0.72513419, "learning_rate": 3.1024625579143127e-06, "loss": 0.7472192, "num_input_tokens_seen": 59999455, "step": 2784, "time_per_iteration": 2.6542627811431885 }, { "auxiliary_loss_clip": 0.01198904, "auxiliary_loss_mlp": 0.01029854, "balance_loss_clip": 1.05575025, "balance_loss_mlp": 1.02143741, "epoch": 0.33487645042986836, "flos": 18182578256640.0, "grad_norm": 1.8852435844452802, "language_loss": 0.73233676, "learning_rate": 3.101812537376048e-06, "loss": 0.75462437, "num_input_tokens_seen": 60018475, "step": 2785, "time_per_iteration": 2.5882985591888428 }, { "auxiliary_loss_clip": 0.01161797, "auxiliary_loss_mlp": 0.00764082, "balance_loss_clip": 1.05156612, "balance_loss_mlp": 1.00050426, "epoch": 0.3349966933205074, "flos": 25848824135040.0, "grad_norm": 2.8201991452249797, "language_loss": 0.84739023, "learning_rate": 3.1011623496936973e-06, "loss": 0.86664903, "num_input_tokens_seen": 60036770, "step": 2786, "time_per_iteration": 2.7940471172332764 }, { "auxiliary_loss_clip": 0.01200298, "auxiliary_loss_mlp": 0.01028357, "balance_loss_clip": 1.05871451, "balance_loss_mlp": 1.02017367, "epoch": 0.3351169362111465, "flos": 28111447699200.0, "grad_norm": 2.6818820332907642, "language_loss": 0.69569206, "learning_rate": 3.100511994965893e-06, "loss": 0.7179786, "num_input_tokens_seen": 60056725, "step": 2787, "time_per_iteration": 2.6679646968841553 }, { "auxiliary_loss_clip": 0.01179438, "auxiliary_loss_mlp": 0.0103053, "balance_loss_clip": 1.05507398, "balance_loss_mlp": 1.02158928, "epoch": 0.33523717910178563, "flos": 22673315393280.0, "grad_norm": 1.7229060595479473, "language_loss": 0.84943473, "learning_rate": 3.0998614732912947e-06, "loss": 0.87153435, "num_input_tokens_seen": 60076100, "step": 2788, "time_per_iteration": 2.615370035171509 }, { "auxiliary_loss_clip": 0.01183583, "auxiliary_loss_mlp": 0.01032914, "balance_loss_clip": 1.05715358, "balance_loss_mlp": 1.02472472, "epoch": 0.3353574219924247, "flos": 15669801400320.0, "grad_norm": 10.149686133009592, "language_loss": 0.68092316, "learning_rate": 3.0992107847685855e-06, "loss": 0.70308816, "num_input_tokens_seen": 60093815, "step": 2789, "time_per_iteration": 2.6418635845184326 }, { "auxiliary_loss_clip": 0.01168481, "auxiliary_loss_mlp": 0.01032548, "balance_loss_clip": 1.05576241, "balance_loss_mlp": 1.02333903, "epoch": 0.3354776648830638, "flos": 24790644443520.0, "grad_norm": 1.6665968633384247, "language_loss": 0.79348218, "learning_rate": 3.0985599294964736e-06, "loss": 0.81549251, "num_input_tokens_seen": 60113370, "step": 2790, "time_per_iteration": 2.6592540740966797 }, { "auxiliary_loss_clip": 0.01167466, "auxiliary_loss_mlp": 0.01033749, "balance_loss_clip": 1.05383074, "balance_loss_mlp": 1.02484417, "epoch": 0.33559790777370285, "flos": 28694852398080.0, "grad_norm": 1.8818440899557454, "language_loss": 0.69890761, "learning_rate": 3.097908907573695e-06, "loss": 0.72091979, "num_input_tokens_seen": 60131350, "step": 2791, "time_per_iteration": 2.7778708934783936 }, { "auxiliary_loss_clip": 0.01125732, "auxiliary_loss_mlp": 0.01034308, "balance_loss_clip": 1.05034316, "balance_loss_mlp": 1.0258621, "epoch": 0.33571815066434196, "flos": 22235779825920.0, "grad_norm": 2.018703092970654, "language_loss": 0.89691931, "learning_rate": 3.0972577190990067e-06, "loss": 0.91851968, "num_input_tokens_seen": 60149830, "step": 2792, "time_per_iteration": 2.7718048095703125 }, { "auxiliary_loss_clip": 0.01160297, "auxiliary_loss_mlp": 0.0103237, "balance_loss_clip": 1.05341375, "balance_loss_mlp": 1.02325034, "epoch": 0.3358383935549811, "flos": 23842279607040.0, "grad_norm": 1.8826570461259735, "language_loss": 0.79732108, "learning_rate": 3.096606364171196e-06, "loss": 0.81924772, "num_input_tokens_seen": 60169620, "step": 2793, "time_per_iteration": 3.680549144744873 }, { "auxiliary_loss_clip": 0.01141044, "auxiliary_loss_mlp": 0.01033499, "balance_loss_clip": 1.05004573, "balance_loss_mlp": 1.02421284, "epoch": 0.33595863644562013, "flos": 22267308988800.0, "grad_norm": 3.3402076041702533, "language_loss": 0.85195374, "learning_rate": 3.0959548428890703e-06, "loss": 0.87369919, "num_input_tokens_seen": 60188490, "step": 2794, "time_per_iteration": 2.7962357997894287 }, { "auxiliary_loss_clip": 0.01180993, "auxiliary_loss_mlp": 0.01033383, "balance_loss_clip": 1.05635762, "balance_loss_mlp": 1.02410913, "epoch": 0.33607887933625924, "flos": 20119779578880.0, "grad_norm": 1.7203195887057026, "language_loss": 0.84222066, "learning_rate": 3.095303155351468e-06, "loss": 0.86436445, "num_input_tokens_seen": 60208695, "step": 2795, "time_per_iteration": 2.669407844543457 }, { "auxiliary_loss_clip": 0.01126729, "auxiliary_loss_mlp": 0.01029857, "balance_loss_clip": 1.04736042, "balance_loss_mlp": 1.0203141, "epoch": 0.33619912222689835, "flos": 19318109886720.0, "grad_norm": 2.1545803532969856, "language_loss": 0.79919744, "learning_rate": 3.0946513016572464e-06, "loss": 0.82076329, "num_input_tokens_seen": 60227600, "step": 2796, "time_per_iteration": 2.760286808013916 }, { "auxiliary_loss_clip": 0.01186132, "auxiliary_loss_mlp": 0.01030419, "balance_loss_clip": 1.05476964, "balance_loss_mlp": 1.02086473, "epoch": 0.3363193651175374, "flos": 16800664262400.0, "grad_norm": 2.131567966234348, "language_loss": 0.76982653, "learning_rate": 3.0939992819052938e-06, "loss": 0.79199207, "num_input_tokens_seen": 60245110, "step": 2797, "time_per_iteration": 3.5698530673980713 }, { "auxiliary_loss_clip": 0.01168158, "auxiliary_loss_mlp": 0.01029347, "balance_loss_clip": 1.05310917, "balance_loss_mlp": 1.02114892, "epoch": 0.3364396080081765, "flos": 23550289948800.0, "grad_norm": 2.2927965669825863, "language_loss": 0.81525993, "learning_rate": 3.0933470961945193e-06, "loss": 0.83723497, "num_input_tokens_seen": 60263405, "step": 2798, "time_per_iteration": 2.7090811729431152 }, { "auxiliary_loss_clip": 0.01163864, "auxiliary_loss_mlp": 0.01035787, "balance_loss_clip": 1.05369282, "balance_loss_mlp": 1.02712035, "epoch": 0.3365598508988156, "flos": 28037902602240.0, "grad_norm": 1.7733210904358991, "language_loss": 0.68200386, "learning_rate": 3.0926947446238597e-06, "loss": 0.70400029, "num_input_tokens_seen": 60282975, "step": 2799, "time_per_iteration": 2.7108585834503174 }, { "auxiliary_loss_clip": 0.01185605, "auxiliary_loss_mlp": 0.01031278, "balance_loss_clip": 1.05149758, "balance_loss_mlp": 1.02121675, "epoch": 0.3366800937894547, "flos": 16982767238400.0, "grad_norm": 2.9296072510530333, "language_loss": 0.83034962, "learning_rate": 3.092042227292276e-06, "loss": 0.85251844, "num_input_tokens_seen": 60299810, "step": 2800, "time_per_iteration": 2.6267004013061523 }, { "auxiliary_loss_clip": 0.01196862, "auxiliary_loss_mlp": 0.01036726, "balance_loss_clip": 1.05706692, "balance_loss_mlp": 1.02839363, "epoch": 0.3368003366800938, "flos": 23915321913600.0, "grad_norm": 1.632235398624199, "language_loss": 0.87999731, "learning_rate": 3.0913895442987557e-06, "loss": 0.90233314, "num_input_tokens_seen": 60320775, "step": 2801, "time_per_iteration": 3.571031332015991 }, { "auxiliary_loss_clip": 0.01152625, "auxiliary_loss_mlp": 0.00764079, "balance_loss_clip": 1.05284381, "balance_loss_mlp": 1.00045705, "epoch": 0.3369205795707329, "flos": 24791219061120.0, "grad_norm": 1.5711717708205655, "language_loss": 0.86081803, "learning_rate": 3.090736695742308e-06, "loss": 0.87998509, "num_input_tokens_seen": 60341905, "step": 2802, "time_per_iteration": 2.742711305618286 }, { "auxiliary_loss_clip": 0.01132032, "auxiliary_loss_mlp": 0.01032355, "balance_loss_clip": 1.04797518, "balance_loss_mlp": 1.02370656, "epoch": 0.33704082246137196, "flos": 17931096161280.0, "grad_norm": 2.801381425337414, "language_loss": 0.52326983, "learning_rate": 3.0900836817219713e-06, "loss": 0.54491365, "num_input_tokens_seen": 60358335, "step": 2803, "time_per_iteration": 2.674513578414917 }, { "auxiliary_loss_clip": 0.01196115, "auxiliary_loss_mlp": 0.01033845, "balance_loss_clip": 1.05483282, "balance_loss_mlp": 1.02525616, "epoch": 0.33716106535201107, "flos": 21286517149440.0, "grad_norm": 1.7423530096992423, "language_loss": 0.84192306, "learning_rate": 3.089430502336807e-06, "loss": 0.86422265, "num_input_tokens_seen": 60378305, "step": 2804, "time_per_iteration": 2.6149492263793945 }, { "auxiliary_loss_clip": 0.01186031, "auxiliary_loss_mlp": 0.01034604, "balance_loss_clip": 1.05531621, "balance_loss_mlp": 1.02553844, "epoch": 0.3372813082426502, "flos": 18402962152320.0, "grad_norm": 2.5983167148425697, "language_loss": 0.90893143, "learning_rate": 3.088777157685902e-06, "loss": 0.9311378, "num_input_tokens_seen": 60393895, "step": 2805, "time_per_iteration": 2.672654390335083 }, { "auxiliary_loss_clip": 0.01166044, "auxiliary_loss_mlp": 0.01032113, "balance_loss_clip": 1.05452132, "balance_loss_mlp": 1.02346396, "epoch": 0.33740155113328923, "flos": 17201391367680.0, "grad_norm": 2.335731490446569, "language_loss": 0.85586655, "learning_rate": 3.088123647868367e-06, "loss": 0.87784815, "num_input_tokens_seen": 60410445, "step": 2806, "time_per_iteration": 2.664973020553589 }, { "auxiliary_loss_clip": 0.01184481, "auxiliary_loss_mlp": 0.01036169, "balance_loss_clip": 1.05087185, "balance_loss_mlp": 1.0270853, "epoch": 0.33752179402392835, "flos": 29058950609280.0, "grad_norm": 2.258607861398835, "language_loss": 0.81345415, "learning_rate": 3.0874699729833405e-06, "loss": 0.8356607, "num_input_tokens_seen": 60431815, "step": 2807, "time_per_iteration": 2.749363660812378 }, { "auxiliary_loss_clip": 0.01168162, "auxiliary_loss_mlp": 0.01032276, "balance_loss_clip": 1.05549347, "balance_loss_mlp": 1.02338922, "epoch": 0.3376420369145674, "flos": 25080730680960.0, "grad_norm": 2.125469663323752, "language_loss": 0.80090201, "learning_rate": 3.086816133129983e-06, "loss": 0.82290637, "num_input_tokens_seen": 60452075, "step": 2808, "time_per_iteration": 2.6814825534820557 }, { "auxiliary_loss_clip": 0.01200755, "auxiliary_loss_mlp": 0.01030892, "balance_loss_clip": 1.0599879, "balance_loss_mlp": 1.02217793, "epoch": 0.3377622798052065, "flos": 27490624007040.0, "grad_norm": 2.401757926194325, "language_loss": 0.76078475, "learning_rate": 3.0861621284074826e-06, "loss": 0.7831012, "num_input_tokens_seen": 60472600, "step": 2809, "time_per_iteration": 2.7200918197631836 }, { "auxiliary_loss_clip": 0.01177321, "auxiliary_loss_mlp": 0.01035206, "balance_loss_clip": 1.05557168, "balance_loss_mlp": 1.0262537, "epoch": 0.3378825226958456, "flos": 21975211589760.0, "grad_norm": 1.6809913030309178, "language_loss": 0.73073161, "learning_rate": 3.085507958915051e-06, "loss": 0.75285697, "num_input_tokens_seen": 60491030, "step": 2810, "time_per_iteration": 2.654655694961548 }, { "auxiliary_loss_clip": 0.01167784, "auxiliary_loss_mlp": 0.01036366, "balance_loss_clip": 1.05541229, "balance_loss_mlp": 1.0270555, "epoch": 0.3380027655864847, "flos": 42523189200000.0, "grad_norm": 2.8119749370231264, "language_loss": 0.71059239, "learning_rate": 3.084853624751925e-06, "loss": 0.73263383, "num_input_tokens_seen": 60512615, "step": 2811, "time_per_iteration": 2.8271470069885254 }, { "auxiliary_loss_clip": 0.01158712, "auxiliary_loss_mlp": 0.01030993, "balance_loss_clip": 1.05478907, "balance_loss_mlp": 1.02218342, "epoch": 0.3381230084771238, "flos": 26725080418560.0, "grad_norm": 1.7703771495871892, "language_loss": 0.85612559, "learning_rate": 3.0841991260173668e-06, "loss": 0.87802255, "num_input_tokens_seen": 60532520, "step": 2812, "time_per_iteration": 2.786619186401367 }, { "auxiliary_loss_clip": 0.01202808, "auxiliary_loss_mlp": 0.01030176, "balance_loss_clip": 1.05889559, "balance_loss_mlp": 1.0209254, "epoch": 0.3382432513677629, "flos": 22710375250560.0, "grad_norm": 1.9078769962642366, "language_loss": 0.80593419, "learning_rate": 3.0835444628106634e-06, "loss": 0.82826406, "num_input_tokens_seen": 60551500, "step": 2813, "time_per_iteration": 2.5884058475494385 }, { "auxiliary_loss_clip": 0.01201542, "auxiliary_loss_mlp": 0.00765014, "balance_loss_clip": 1.05907917, "balance_loss_mlp": 1.00044179, "epoch": 0.33836349425840195, "flos": 22122409524480.0, "grad_norm": 1.9742119693899398, "language_loss": 0.83493108, "learning_rate": 3.082889635231126e-06, "loss": 0.85459661, "num_input_tokens_seen": 60570160, "step": 2814, "time_per_iteration": 2.631774425506592 }, { "auxiliary_loss_clip": 0.01171936, "auxiliary_loss_mlp": 0.01025619, "balance_loss_clip": 1.0526545, "balance_loss_mlp": 1.01632071, "epoch": 0.33848373714904106, "flos": 27308090067840.0, "grad_norm": 2.3043014587678794, "language_loss": 0.76489413, "learning_rate": 3.0822346433780925e-06, "loss": 0.78686965, "num_input_tokens_seen": 60590885, "step": 2815, "time_per_iteration": 2.713453531265259 }, { "auxiliary_loss_clip": 0.01184235, "auxiliary_loss_mlp": 0.01035135, "balance_loss_clip": 1.0524199, "balance_loss_mlp": 1.0252763, "epoch": 0.3386039800396802, "flos": 25848716394240.0, "grad_norm": 1.9372993705655075, "language_loss": 0.87278295, "learning_rate": 3.0815794873509237e-06, "loss": 0.89497662, "num_input_tokens_seen": 60609170, "step": 2816, "time_per_iteration": 2.6761701107025146 }, { "auxiliary_loss_clip": 0.01195595, "auxiliary_loss_mlp": 0.01026055, "balance_loss_clip": 1.05471861, "balance_loss_mlp": 1.01703095, "epoch": 0.33872422293031923, "flos": 18880646146560.0, "grad_norm": 1.7315712451747685, "language_loss": 0.72755194, "learning_rate": 3.0809241672490066e-06, "loss": 0.7497685, "num_input_tokens_seen": 60627340, "step": 2817, "time_per_iteration": 2.617640495300293 }, { "auxiliary_loss_clip": 0.01170895, "auxiliary_loss_mlp": 0.01033799, "balance_loss_clip": 1.05466354, "balance_loss_mlp": 1.02583599, "epoch": 0.33884446582095834, "flos": 23146977064320.0, "grad_norm": 4.629190821051471, "language_loss": 0.85146689, "learning_rate": 3.080268683171753e-06, "loss": 0.87351382, "num_input_tokens_seen": 60647630, "step": 2818, "time_per_iteration": 2.7083539962768555 }, { "auxiliary_loss_clip": 0.01182077, "auxiliary_loss_mlp": 0.01028056, "balance_loss_clip": 1.05195808, "balance_loss_mlp": 1.01958668, "epoch": 0.33896470871159745, "flos": 15997342544640.0, "grad_norm": 2.115155520875265, "language_loss": 0.89091986, "learning_rate": 3.0796130352185985e-06, "loss": 0.91302121, "num_input_tokens_seen": 60664485, "step": 2819, "time_per_iteration": 3.6213529109954834 }, { "auxiliary_loss_clip": 0.0115844, "auxiliary_loss_mlp": 0.00764947, "balance_loss_clip": 1.05053878, "balance_loss_mlp": 1.00054336, "epoch": 0.3390849516022365, "flos": 34495754112000.0, "grad_norm": 1.9484982247758387, "language_loss": 0.66536212, "learning_rate": 3.0789572234890057e-06, "loss": 0.68459594, "num_input_tokens_seen": 60686125, "step": 2820, "time_per_iteration": 2.7846906185150146 }, { "auxiliary_loss_clip": 0.01170457, "auxiliary_loss_mlp": 0.0103507, "balance_loss_clip": 1.05545449, "balance_loss_mlp": 1.02509248, "epoch": 0.3392051944928756, "flos": 16180307447040.0, "grad_norm": 1.7263497554181821, "language_loss": 0.77486765, "learning_rate": 3.0783012480824596e-06, "loss": 0.79692292, "num_input_tokens_seen": 60705270, "step": 2821, "time_per_iteration": 2.6532399654388428 }, { "auxiliary_loss_clip": 0.01198138, "auxiliary_loss_mlp": 0.01032282, "balance_loss_clip": 1.05590832, "balance_loss_mlp": 1.0224297, "epoch": 0.33932543738351467, "flos": 17086656349440.0, "grad_norm": 3.2473647271259716, "language_loss": 0.74694902, "learning_rate": 3.077645109098471e-06, "loss": 0.76925325, "num_input_tokens_seen": 60721540, "step": 2822, "time_per_iteration": 2.54506254196167 }, { "auxiliary_loss_clip": 0.01142773, "auxiliary_loss_mlp": 0.01036044, "balance_loss_clip": 1.05041027, "balance_loss_mlp": 1.02708602, "epoch": 0.3394456802741538, "flos": 22126970551680.0, "grad_norm": 1.629071663640302, "language_loss": 0.72238272, "learning_rate": 3.076988806636577e-06, "loss": 0.7441709, "num_input_tokens_seen": 60739300, "step": 2823, "time_per_iteration": 3.6988980770111084 }, { "auxiliary_loss_clip": 0.01173932, "auxiliary_loss_mlp": 0.00764985, "balance_loss_clip": 1.05492973, "balance_loss_mlp": 1.00060225, "epoch": 0.3395659231647929, "flos": 25226887121280.0, "grad_norm": 2.040491346928349, "language_loss": 0.8885498, "learning_rate": 3.0763323407963377e-06, "loss": 0.90793896, "num_input_tokens_seen": 60758910, "step": 2824, "time_per_iteration": 2.7320497035980225 }, { "auxiliary_loss_clip": 0.01183586, "auxiliary_loss_mlp": 0.01033474, "balance_loss_clip": 1.05301356, "balance_loss_mlp": 1.02467072, "epoch": 0.33968616605543195, "flos": 29096477343360.0, "grad_norm": 1.7187708445914205, "language_loss": 0.80011594, "learning_rate": 3.075675711677337e-06, "loss": 0.82228655, "num_input_tokens_seen": 60779005, "step": 2825, "time_per_iteration": 2.6903128623962402 }, { "auxiliary_loss_clip": 0.01164287, "auxiliary_loss_mlp": 0.01035507, "balance_loss_clip": 1.05413461, "balance_loss_mlp": 1.02660775, "epoch": 0.33980640894607106, "flos": 21433966479360.0, "grad_norm": 2.0786850703833397, "language_loss": 0.78006089, "learning_rate": 3.0750189193791865e-06, "loss": 0.80205882, "num_input_tokens_seen": 60798590, "step": 2826, "time_per_iteration": 2.6671502590179443 }, { "auxiliary_loss_clip": 0.01178917, "auxiliary_loss_mlp": 0.01037518, "balance_loss_clip": 1.05187857, "balance_loss_mlp": 1.02681327, "epoch": 0.33992665183671017, "flos": 32490035596800.0, "grad_norm": 2.0608531301958184, "language_loss": 0.69875908, "learning_rate": 3.0743619640015203e-06, "loss": 0.72092342, "num_input_tokens_seen": 60818840, "step": 2827, "time_per_iteration": 3.6292309761047363 }, { "auxiliary_loss_clip": 0.01174791, "auxiliary_loss_mlp": 0.01030764, "balance_loss_clip": 1.05249858, "balance_loss_mlp": 1.02119148, "epoch": 0.3400468947273492, "flos": 17055414495360.0, "grad_norm": 1.998968339583174, "language_loss": 0.92469299, "learning_rate": 3.073704845643999e-06, "loss": 0.94674861, "num_input_tokens_seen": 60835965, "step": 2828, "time_per_iteration": 2.684772253036499 }, { "auxiliary_loss_clip": 0.01186107, "auxiliary_loss_mlp": 0.01036345, "balance_loss_clip": 1.05260658, "balance_loss_mlp": 1.02655816, "epoch": 0.34016713761798834, "flos": 16872988296960.0, "grad_norm": 3.4057709300365318, "language_loss": 0.7771582, "learning_rate": 3.0730475644063063e-06, "loss": 0.79938275, "num_input_tokens_seen": 60851065, "step": 2829, "time_per_iteration": 2.627323627471924 }, { "auxiliary_loss_clip": 0.01161378, "auxiliary_loss_mlp": 0.00764218, "balance_loss_clip": 1.04953694, "balance_loss_mlp": 1.00042844, "epoch": 0.34028738050862745, "flos": 21907161273600.0, "grad_norm": 1.6844471749524403, "language_loss": 0.65132397, "learning_rate": 3.072390120388151e-06, "loss": 0.67058003, "num_input_tokens_seen": 60869390, "step": 2830, "time_per_iteration": 2.691819906234741 }, { "auxiliary_loss_clip": 0.01182068, "auxiliary_loss_mlp": 0.01033641, "balance_loss_clip": 1.05340576, "balance_loss_mlp": 1.02391386, "epoch": 0.3404076233992665, "flos": 22746034477440.0, "grad_norm": 4.72046042540528, "language_loss": 0.71333981, "learning_rate": 3.071732513689267e-06, "loss": 0.73549688, "num_input_tokens_seen": 60887925, "step": 2831, "time_per_iteration": 2.643235921859741 }, { "auxiliary_loss_clip": 0.01190595, "auxiliary_loss_mlp": 0.01036378, "balance_loss_clip": 1.05995131, "balance_loss_mlp": 1.02662122, "epoch": 0.3405278662899056, "flos": 17052361839360.0, "grad_norm": 2.2830905733589963, "language_loss": 0.67349213, "learning_rate": 3.0710747444094134e-06, "loss": 0.6957618, "num_input_tokens_seen": 60905955, "step": 2832, "time_per_iteration": 2.6071627140045166 }, { "auxiliary_loss_clip": 0.01171576, "auxiliary_loss_mlp": 0.01032916, "balance_loss_clip": 1.0532763, "balance_loss_mlp": 1.02376676, "epoch": 0.3406481091805447, "flos": 42813131783040.0, "grad_norm": 2.0053007801790903, "language_loss": 0.65211052, "learning_rate": 3.070416812648372e-06, "loss": 0.67415541, "num_input_tokens_seen": 60929405, "step": 2833, "time_per_iteration": 2.84083890914917 }, { "auxiliary_loss_clip": 0.01152286, "auxiliary_loss_mlp": 0.01026554, "balance_loss_clip": 1.0495863, "balance_loss_mlp": 1.01722622, "epoch": 0.3407683520711838, "flos": 26761457917440.0, "grad_norm": 2.4662652333914807, "language_loss": 0.65136611, "learning_rate": 3.069758718505951e-06, "loss": 0.67315447, "num_input_tokens_seen": 60951145, "step": 2834, "time_per_iteration": 2.74892258644104 }, { "auxiliary_loss_clip": 0.01197094, "auxiliary_loss_mlp": 0.01032694, "balance_loss_clip": 1.05724537, "balance_loss_mlp": 1.02356231, "epoch": 0.3408885949618229, "flos": 28767643309440.0, "grad_norm": 1.6564426154688001, "language_loss": 0.80295801, "learning_rate": 3.0691004620819836e-06, "loss": 0.82525593, "num_input_tokens_seen": 60971275, "step": 2835, "time_per_iteration": 2.6762404441833496 }, { "auxiliary_loss_clip": 0.0104673, "auxiliary_loss_mlp": 0.0100435, "balance_loss_clip": 1.01976395, "balance_loss_mlp": 1.00253797, "epoch": 0.341008837852462, "flos": 63576252881280.0, "grad_norm": 0.7995754838705618, "language_loss": 0.60143733, "learning_rate": 3.0684420434763254e-06, "loss": 0.62194812, "num_input_tokens_seen": 61037460, "step": 2836, "time_per_iteration": 3.322528600692749 }, { "auxiliary_loss_clip": 0.01148743, "auxiliary_loss_mlp": 0.01034412, "balance_loss_clip": 1.05397606, "balance_loss_mlp": 1.02551937, "epoch": 0.34112908074310105, "flos": 20812173120000.0, "grad_norm": 2.0513875816765386, "language_loss": 0.76779783, "learning_rate": 3.06778346278886e-06, "loss": 0.78962934, "num_input_tokens_seen": 61056295, "step": 2837, "time_per_iteration": 2.724052906036377 }, { "auxiliary_loss_clip": 0.01200124, "auxiliary_loss_mlp": 0.01033256, "balance_loss_clip": 1.05820274, "balance_loss_mlp": 1.02401757, "epoch": 0.34124932363374016, "flos": 24976446520320.0, "grad_norm": 2.8515673255095413, "language_loss": 0.79226649, "learning_rate": 3.0671247201194906e-06, "loss": 0.81460029, "num_input_tokens_seen": 61078430, "step": 2838, "time_per_iteration": 2.6360459327697754 }, { "auxiliary_loss_clip": 0.01155118, "auxiliary_loss_mlp": 0.01036935, "balance_loss_clip": 1.05024981, "balance_loss_mlp": 1.02754164, "epoch": 0.3413695665243792, "flos": 28402970480640.0, "grad_norm": 1.636593991037116, "language_loss": 0.75525796, "learning_rate": 3.066465815568151e-06, "loss": 0.77717853, "num_input_tokens_seen": 61099260, "step": 2839, "time_per_iteration": 2.809102773666382 }, { "auxiliary_loss_clip": 0.01183212, "auxiliary_loss_mlp": 0.01028983, "balance_loss_clip": 1.05224907, "balance_loss_mlp": 1.02005398, "epoch": 0.34148980941501833, "flos": 25302012416640.0, "grad_norm": 1.7736875327987929, "language_loss": 0.69107842, "learning_rate": 3.0658067492347947e-06, "loss": 0.71320033, "num_input_tokens_seen": 61121900, "step": 2840, "time_per_iteration": 2.6929428577423096 }, { "auxiliary_loss_clip": 0.01099438, "auxiliary_loss_mlp": 0.01039776, "balance_loss_clip": 1.04298997, "balance_loss_mlp": 1.03062701, "epoch": 0.34161005230565744, "flos": 17530081747200.0, "grad_norm": 1.9914166940659639, "language_loss": 0.66682637, "learning_rate": 3.065147521219402e-06, "loss": 0.68821847, "num_input_tokens_seen": 61141155, "step": 2841, "time_per_iteration": 2.821664571762085 }, { "auxiliary_loss_clip": 0.01161306, "auxiliary_loss_mlp": 0.01040333, "balance_loss_clip": 1.05410123, "balance_loss_mlp": 1.03210723, "epoch": 0.3417302951962965, "flos": 43650101566080.0, "grad_norm": 1.5251257817472503, "language_loss": 0.74676859, "learning_rate": 3.064488131621977e-06, "loss": 0.768785, "num_input_tokens_seen": 61164480, "step": 2842, "time_per_iteration": 2.894273281097412 }, { "auxiliary_loss_clip": 0.01172457, "auxiliary_loss_mlp": 0.0102866, "balance_loss_clip": 1.04879522, "balance_loss_mlp": 1.01925492, "epoch": 0.3418505380869356, "flos": 30882207012480.0, "grad_norm": 1.649158485868022, "language_loss": 0.73771644, "learning_rate": 3.063828580542549e-06, "loss": 0.7597276, "num_input_tokens_seen": 61185675, "step": 2843, "time_per_iteration": 2.721012830734253 }, { "auxiliary_loss_clip": 0.01168515, "auxiliary_loss_mlp": 0.01034242, "balance_loss_clip": 1.05252802, "balance_loss_mlp": 1.0261718, "epoch": 0.3419707809775747, "flos": 19463871277440.0, "grad_norm": 2.257092417795917, "language_loss": 0.73516053, "learning_rate": 3.0631688680811706e-06, "loss": 0.75718808, "num_input_tokens_seen": 61205300, "step": 2844, "time_per_iteration": 2.7157676219940186 }, { "auxiliary_loss_clip": 0.01196222, "auxiliary_loss_mlp": 0.01027087, "balance_loss_clip": 1.05376196, "balance_loss_mlp": 1.01873016, "epoch": 0.3420910238682138, "flos": 28727818104960.0, "grad_norm": 2.3137204545747396, "language_loss": 0.75612062, "learning_rate": 3.062508994337921e-06, "loss": 0.77835369, "num_input_tokens_seen": 61224905, "step": 2845, "time_per_iteration": 3.5770349502563477 }, { "auxiliary_loss_clip": 0.01178336, "auxiliary_loss_mlp": 0.01032252, "balance_loss_clip": 1.04969049, "balance_loss_mlp": 1.02259064, "epoch": 0.3422112667588529, "flos": 21397265758080.0, "grad_norm": 2.19985438351455, "language_loss": 0.79291874, "learning_rate": 3.0618489594129013e-06, "loss": 0.81502461, "num_input_tokens_seen": 61243045, "step": 2846, "time_per_iteration": 2.6371586322784424 }, { "auxiliary_loss_clip": 0.01162016, "auxiliary_loss_mlp": 0.0103485, "balance_loss_clip": 1.05326307, "balance_loss_mlp": 1.02555752, "epoch": 0.342331509649492, "flos": 13881450038400.0, "grad_norm": 2.926954843890118, "language_loss": 0.71478474, "learning_rate": 3.061188763406239e-06, "loss": 0.73675334, "num_input_tokens_seen": 61259190, "step": 2847, "time_per_iteration": 2.6804728507995605 }, { "auxiliary_loss_clip": 0.01162096, "auxiliary_loss_mlp": 0.01033662, "balance_loss_clip": 1.04872596, "balance_loss_mlp": 1.02479291, "epoch": 0.34245175254013105, "flos": 28621450955520.0, "grad_norm": 2.9979014380559716, "language_loss": 0.82621658, "learning_rate": 3.060528406418085e-06, "loss": 0.8481741, "num_input_tokens_seen": 61279040, "step": 2848, "time_per_iteration": 2.7577226161956787 }, { "auxiliary_loss_clip": 0.0115818, "auxiliary_loss_mlp": 0.01034581, "balance_loss_clip": 1.0486362, "balance_loss_mlp": 1.02596223, "epoch": 0.34257199543077016, "flos": 34127058960000.0, "grad_norm": 1.783017449588709, "language_loss": 0.62196153, "learning_rate": 3.0598678885486145e-06, "loss": 0.64388913, "num_input_tokens_seen": 61301580, "step": 2849, "time_per_iteration": 3.7052552700042725 }, { "auxiliary_loss_clip": 0.0115384, "auxiliary_loss_mlp": 0.00764538, "balance_loss_clip": 1.04710543, "balance_loss_mlp": 1.00041521, "epoch": 0.34269223832140927, "flos": 19974018188160.0, "grad_norm": 1.6228463838476326, "language_loss": 0.74397957, "learning_rate": 3.0592072098980282e-06, "loss": 0.76316339, "num_input_tokens_seen": 61321240, "step": 2850, "time_per_iteration": 2.747894048690796 }, { "auxiliary_loss_clip": 0.01167304, "auxiliary_loss_mlp": 0.01032218, "balance_loss_clip": 1.05446982, "balance_loss_mlp": 1.02279449, "epoch": 0.3428124812120483, "flos": 27235658292480.0, "grad_norm": 9.860825127107347, "language_loss": 0.73225224, "learning_rate": 3.0585463705665514e-06, "loss": 0.75424743, "num_input_tokens_seen": 61341615, "step": 2851, "time_per_iteration": 2.7802491188049316 }, { "auxiliary_loss_clip": 0.01150671, "auxiliary_loss_mlp": 0.01032792, "balance_loss_clip": 1.04866767, "balance_loss_mlp": 1.0237205, "epoch": 0.34293272410268744, "flos": 24570871079040.0, "grad_norm": 2.182687778186288, "language_loss": 0.70883656, "learning_rate": 3.0578853706544304e-06, "loss": 0.73067117, "num_input_tokens_seen": 61359005, "step": 2852, "time_per_iteration": 2.716604232788086 }, { "auxiliary_loss_clip": 0.01158729, "auxiliary_loss_mlp": 0.00765694, "balance_loss_clip": 1.05195796, "balance_loss_mlp": 1.00050068, "epoch": 0.34305296699332655, "flos": 21506865131520.0, "grad_norm": 2.0664333003523763, "language_loss": 0.65424538, "learning_rate": 3.0572242102619404e-06, "loss": 0.67348963, "num_input_tokens_seen": 61376160, "step": 2853, "time_per_iteration": 4.613559246063232 }, { "auxiliary_loss_clip": 0.01167819, "auxiliary_loss_mlp": 0.01034619, "balance_loss_clip": 1.05464435, "balance_loss_mlp": 1.02547526, "epoch": 0.3431732098839656, "flos": 24056665931520.0, "grad_norm": 2.266202526678044, "language_loss": 0.80314934, "learning_rate": 3.0565628894893784e-06, "loss": 0.82517374, "num_input_tokens_seen": 61396795, "step": 2854, "time_per_iteration": 2.7695865631103516 }, { "auxiliary_loss_clip": 0.01175277, "auxiliary_loss_mlp": 0.01032528, "balance_loss_clip": 1.0538274, "balance_loss_mlp": 1.02318788, "epoch": 0.3432934527746047, "flos": 16800879744000.0, "grad_norm": 1.7322494643114317, "language_loss": 0.74874997, "learning_rate": 3.0559014084370655e-06, "loss": 0.77082801, "num_input_tokens_seen": 61415320, "step": 2855, "time_per_iteration": 2.5943195819854736 }, { "auxiliary_loss_clip": 0.01174848, "auxiliary_loss_mlp": 0.01029988, "balance_loss_clip": 1.05183434, "balance_loss_mlp": 1.02130342, "epoch": 0.34341369566524377, "flos": 23439720908160.0, "grad_norm": 1.6783372036722697, "language_loss": 0.79107606, "learning_rate": 3.055239767205349e-06, "loss": 0.81312448, "num_input_tokens_seen": 61437070, "step": 2856, "time_per_iteration": 2.6969807147979736 }, { "auxiliary_loss_clip": 0.01184433, "auxiliary_loss_mlp": 0.01033702, "balance_loss_clip": 1.05910695, "balance_loss_mlp": 1.02448666, "epoch": 0.3435339385558829, "flos": 17267466435840.0, "grad_norm": 1.8244372709003325, "language_loss": 0.78408372, "learning_rate": 3.054577965894599e-06, "loss": 0.80626512, "num_input_tokens_seen": 61453215, "step": 2857, "time_per_iteration": 2.6055562496185303 }, { "auxiliary_loss_clip": 0.01178166, "auxiliary_loss_mlp": 0.01034437, "balance_loss_clip": 1.05764127, "balance_loss_mlp": 1.0246619, "epoch": 0.343654181446522, "flos": 22199366413440.0, "grad_norm": 1.6263068050894454, "language_loss": 0.70554394, "learning_rate": 3.0539160046052094e-06, "loss": 0.72766995, "num_input_tokens_seen": 61472915, "step": 2858, "time_per_iteration": 2.704587697982788 }, { "auxiliary_loss_clip": 0.0116071, "auxiliary_loss_mlp": 0.01029741, "balance_loss_clip": 1.04961073, "balance_loss_mlp": 1.01964378, "epoch": 0.34377442433716104, "flos": 19901801894400.0, "grad_norm": 2.4987884022229125, "language_loss": 0.70993578, "learning_rate": 3.0532538834376003e-06, "loss": 0.73184031, "num_input_tokens_seen": 61492475, "step": 2859, "time_per_iteration": 2.6559133529663086 }, { "auxiliary_loss_clip": 0.01188795, "auxiliary_loss_mlp": 0.01028133, "balance_loss_clip": 1.05535388, "balance_loss_mlp": 1.01860857, "epoch": 0.34389466722780015, "flos": 22197678474240.0, "grad_norm": 1.74606880100356, "language_loss": 0.78414196, "learning_rate": 3.0525916024922143e-06, "loss": 0.80631125, "num_input_tokens_seen": 61511660, "step": 2860, "time_per_iteration": 2.6374945640563965 }, { "auxiliary_loss_clip": 0.01165339, "auxiliary_loss_mlp": 0.01035252, "balance_loss_clip": 1.05136895, "balance_loss_mlp": 1.02608466, "epoch": 0.34401491011843927, "flos": 18624567110400.0, "grad_norm": 3.3647883122334425, "language_loss": 0.84345669, "learning_rate": 3.0519291618695193e-06, "loss": 0.8654626, "num_input_tokens_seen": 61529060, "step": 2861, "time_per_iteration": 2.7542717456817627 }, { "auxiliary_loss_clip": 0.01144258, "auxiliary_loss_mlp": 0.01033508, "balance_loss_clip": 1.04726577, "balance_loss_mlp": 1.02454352, "epoch": 0.3441351530090783, "flos": 17858197509120.0, "grad_norm": 1.7856859761648556, "language_loss": 0.75898218, "learning_rate": 3.0512665616700065e-06, "loss": 0.78075981, "num_input_tokens_seen": 61548125, "step": 2862, "time_per_iteration": 2.647963523864746 }, { "auxiliary_loss_clip": 0.01131008, "auxiliary_loss_mlp": 0.01031774, "balance_loss_clip": 1.04606116, "balance_loss_mlp": 1.02263081, "epoch": 0.34425539589971743, "flos": 23112754381440.0, "grad_norm": 2.2003947585094323, "language_loss": 0.8901161, "learning_rate": 3.0506038019941933e-06, "loss": 0.91174394, "num_input_tokens_seen": 61568135, "step": 2863, "time_per_iteration": 2.7640604972839355 }, { "auxiliary_loss_clip": 0.01155877, "auxiliary_loss_mlp": 0.01034949, "balance_loss_clip": 1.05197382, "balance_loss_mlp": 1.02573395, "epoch": 0.34437563879035654, "flos": 21907699977600.0, "grad_norm": 2.2976310968306812, "language_loss": 0.67866457, "learning_rate": 3.049940882942617e-06, "loss": 0.70057285, "num_input_tokens_seen": 61586920, "step": 2864, "time_per_iteration": 2.7059905529022217 }, { "auxiliary_loss_clip": 0.0119872, "auxiliary_loss_mlp": 0.01036276, "balance_loss_clip": 1.05596566, "balance_loss_mlp": 1.02729392, "epoch": 0.3444958816809956, "flos": 23076915586560.0, "grad_norm": 1.8978674968867242, "language_loss": 0.80128455, "learning_rate": 3.0492778046158448e-06, "loss": 0.82363451, "num_input_tokens_seen": 61608340, "step": 2865, "time_per_iteration": 2.6251463890075684 }, { "auxiliary_loss_clip": 0.01182291, "auxiliary_loss_mlp": 0.01043336, "balance_loss_clip": 1.05659151, "balance_loss_mlp": 1.03391838, "epoch": 0.3446161245716347, "flos": 21908633731200.0, "grad_norm": 2.467866111083122, "language_loss": 0.76898736, "learning_rate": 3.0486145671144633e-06, "loss": 0.79124361, "num_input_tokens_seen": 61628130, "step": 2866, "time_per_iteration": 2.649470806121826 }, { "auxiliary_loss_clip": 0.01109651, "auxiliary_loss_mlp": 0.01037497, "balance_loss_clip": 1.04620302, "balance_loss_mlp": 1.02816844, "epoch": 0.3447363674622738, "flos": 25112834461440.0, "grad_norm": 2.9902340593064585, "language_loss": 0.77028465, "learning_rate": 3.047951170539086e-06, "loss": 0.79175615, "num_input_tokens_seen": 61647755, "step": 2867, "time_per_iteration": 2.806272029876709 }, { "auxiliary_loss_clip": 0.01153494, "auxiliary_loss_mlp": 0.01029313, "balance_loss_clip": 1.05496538, "balance_loss_mlp": 1.02159405, "epoch": 0.3448566103529129, "flos": 11984684451840.0, "grad_norm": 2.389450639577141, "language_loss": 0.84310877, "learning_rate": 3.047287614990349e-06, "loss": 0.86493683, "num_input_tokens_seen": 61665675, "step": 2868, "time_per_iteration": 2.6753413677215576 }, { "auxiliary_loss_clip": 0.01160611, "auxiliary_loss_mlp": 0.01034555, "balance_loss_clip": 1.05167043, "balance_loss_mlp": 1.02512538, "epoch": 0.344976853243552, "flos": 40187882465280.0, "grad_norm": 4.5580492841690186, "language_loss": 0.62136453, "learning_rate": 3.046623900568914e-06, "loss": 0.64331615, "num_input_tokens_seen": 61688240, "step": 2869, "time_per_iteration": 2.8364362716674805 }, { "auxiliary_loss_clip": 0.0116668, "auxiliary_loss_mlp": 0.01031469, "balance_loss_clip": 1.05259931, "balance_loss_mlp": 1.02230215, "epoch": 0.34509709613419104, "flos": 28723652127360.0, "grad_norm": 2.669725601366213, "language_loss": 0.70567656, "learning_rate": 3.045960027375465e-06, "loss": 0.72765803, "num_input_tokens_seen": 61706075, "step": 2870, "time_per_iteration": 3.6382625102996826 }, { "auxiliary_loss_clip": 0.01183621, "auxiliary_loss_mlp": 0.01033302, "balance_loss_clip": 1.05145407, "balance_loss_mlp": 1.0233357, "epoch": 0.34521733902483015, "flos": 29967597982080.0, "grad_norm": 3.7615810647688424, "language_loss": 0.82995391, "learning_rate": 3.045295995510711e-06, "loss": 0.85212314, "num_input_tokens_seen": 61723045, "step": 2871, "time_per_iteration": 2.6747677326202393 }, { "auxiliary_loss_clip": 0.0116694, "auxiliary_loss_mlp": 0.01027287, "balance_loss_clip": 1.0542115, "balance_loss_mlp": 1.01899576, "epoch": 0.34533758191546926, "flos": 27923059843200.0, "grad_norm": 1.7900756850485666, "language_loss": 0.73687518, "learning_rate": 3.0446318050753865e-06, "loss": 0.75881743, "num_input_tokens_seen": 61743525, "step": 2872, "time_per_iteration": 2.749258041381836 }, { "auxiliary_loss_clip": 0.01173147, "auxiliary_loss_mlp": 0.01034379, "balance_loss_clip": 1.05095756, "balance_loss_mlp": 1.02606463, "epoch": 0.3454578248061083, "flos": 27125879351040.0, "grad_norm": 1.9999534637097485, "language_loss": 0.77585185, "learning_rate": 3.0439674561702474e-06, "loss": 0.79792708, "num_input_tokens_seen": 61763025, "step": 2873, "time_per_iteration": 2.619555711746216 }, { "auxiliary_loss_clip": 0.0117788, "auxiliary_loss_mlp": 0.01028336, "balance_loss_clip": 1.05203247, "balance_loss_mlp": 1.01926982, "epoch": 0.3455780676967474, "flos": 19024899166080.0, "grad_norm": 2.6805770412502206, "language_loss": 0.88129765, "learning_rate": 3.043302948896076e-06, "loss": 0.90335977, "num_input_tokens_seen": 61781630, "step": 2874, "time_per_iteration": 2.638460397720337 }, { "auxiliary_loss_clip": 0.01126874, "auxiliary_loss_mlp": 0.01037629, "balance_loss_clip": 1.04737663, "balance_loss_mlp": 1.02813435, "epoch": 0.34569831058738654, "flos": 34496005507200.0, "grad_norm": 1.7865243925462448, "language_loss": 0.60478449, "learning_rate": 3.0426382833536756e-06, "loss": 0.62642956, "num_input_tokens_seen": 61804985, "step": 2875, "time_per_iteration": 3.727134943008423 }, { "auxiliary_loss_clip": 0.01145448, "auxiliary_loss_mlp": 0.01030679, "balance_loss_clip": 1.04743695, "balance_loss_mlp": 1.02073693, "epoch": 0.3458185534780256, "flos": 31138681098240.0, "grad_norm": 2.8996408047444957, "language_loss": 0.78090417, "learning_rate": 3.041973459643877e-06, "loss": 0.80266547, "num_input_tokens_seen": 61824440, "step": 2876, "time_per_iteration": 2.7294657230377197 }, { "auxiliary_loss_clip": 0.01130143, "auxiliary_loss_mlp": 0.01029149, "balance_loss_clip": 1.04448164, "balance_loss_mlp": 1.01992214, "epoch": 0.3459387963686647, "flos": 32452508862720.0, "grad_norm": 3.164096910885201, "language_loss": 0.67439353, "learning_rate": 3.0413084778675334e-06, "loss": 0.69598639, "num_input_tokens_seen": 61845690, "step": 2877, "time_per_iteration": 2.8787052631378174 }, { "auxiliary_loss_clip": 0.01159682, "auxiliary_loss_mlp": 0.00763886, "balance_loss_clip": 1.04812133, "balance_loss_mlp": 1.00036764, "epoch": 0.3460590392593038, "flos": 24675658030080.0, "grad_norm": 2.108638406057002, "language_loss": 0.83878267, "learning_rate": 3.0406433381255214e-06, "loss": 0.85801828, "num_input_tokens_seen": 61863725, "step": 2878, "time_per_iteration": 2.692603349685669 }, { "auxiliary_loss_clip": 0.01182279, "auxiliary_loss_mlp": 0.01032062, "balance_loss_clip": 1.05661821, "balance_loss_mlp": 1.02353263, "epoch": 0.34617928214994287, "flos": 18807316531200.0, "grad_norm": 3.5299347199522035, "language_loss": 0.82588059, "learning_rate": 3.0399780405187425e-06, "loss": 0.84802401, "num_input_tokens_seen": 61882720, "step": 2879, "time_per_iteration": 4.46898078918457 }, { "auxiliary_loss_clip": 0.01175361, "auxiliary_loss_mlp": 0.01026183, "balance_loss_clip": 1.05087245, "balance_loss_mlp": 1.01736176, "epoch": 0.346299525040582, "flos": 24857653265280.0, "grad_norm": 1.7925401548908713, "language_loss": 0.78722072, "learning_rate": 3.0393125851481216e-06, "loss": 0.80923611, "num_input_tokens_seen": 61902595, "step": 2880, "time_per_iteration": 2.70742130279541 }, { "auxiliary_loss_clip": 0.01146425, "auxiliary_loss_mlp": 0.01025734, "balance_loss_clip": 1.04992115, "balance_loss_mlp": 1.01698995, "epoch": 0.3464197679312211, "flos": 16434914025600.0, "grad_norm": 2.1586192424280477, "language_loss": 0.86443168, "learning_rate": 3.038646972114608e-06, "loss": 0.88615328, "num_input_tokens_seen": 61918920, "step": 2881, "time_per_iteration": 2.6598329544067383 }, { "auxiliary_loss_clip": 0.01149637, "auxiliary_loss_mlp": 0.01031893, "balance_loss_clip": 1.05235338, "balance_loss_mlp": 1.02307153, "epoch": 0.34654001082186014, "flos": 22382474970240.0, "grad_norm": 1.7664041781111552, "language_loss": 0.67468321, "learning_rate": 3.037981201519174e-06, "loss": 0.69649851, "num_input_tokens_seen": 61939520, "step": 2882, "time_per_iteration": 2.791757822036743 }, { "auxiliary_loss_clip": 0.01181564, "auxiliary_loss_mlp": 0.01029375, "balance_loss_clip": 1.05540395, "balance_loss_mlp": 1.02082145, "epoch": 0.34666025371249926, "flos": 19573901614080.0, "grad_norm": 2.018592588478196, "language_loss": 0.71181327, "learning_rate": 3.0373152734628175e-06, "loss": 0.7339226, "num_input_tokens_seen": 61957800, "step": 2883, "time_per_iteration": 2.5992274284362793 }, { "auxiliary_loss_clip": 0.011751, "auxiliary_loss_mlp": 0.01032658, "balance_loss_clip": 1.05171108, "balance_loss_mlp": 1.0242002, "epoch": 0.34678049660313837, "flos": 15267637751040.0, "grad_norm": 2.357607523761916, "language_loss": 0.75991654, "learning_rate": 3.0366491880465584e-06, "loss": 0.7819941, "num_input_tokens_seen": 61975820, "step": 2884, "time_per_iteration": 2.6236507892608643 }, { "auxiliary_loss_clip": 0.01201355, "auxiliary_loss_mlp": 0.01034887, "balance_loss_clip": 1.05933952, "balance_loss_mlp": 1.02557063, "epoch": 0.3469007394937774, "flos": 21181550630400.0, "grad_norm": 1.642785465326633, "language_loss": 0.81999558, "learning_rate": 3.035982945371443e-06, "loss": 0.84235799, "num_input_tokens_seen": 61997515, "step": 2885, "time_per_iteration": 2.6460988521575928 }, { "auxiliary_loss_clip": 0.01175191, "auxiliary_loss_mlp": 0.01029775, "balance_loss_clip": 1.05365038, "balance_loss_mlp": 1.02114975, "epoch": 0.34702098238441653, "flos": 22375471818240.0, "grad_norm": 2.0345710573140416, "language_loss": 0.8548528, "learning_rate": 3.035316545538537e-06, "loss": 0.87690252, "num_input_tokens_seen": 62016310, "step": 2886, "time_per_iteration": 2.7000975608825684 }, { "auxiliary_loss_clip": 0.01168764, "auxiliary_loss_mlp": 0.01030408, "balance_loss_clip": 1.05856061, "balance_loss_mlp": 1.02202177, "epoch": 0.3471412252750556, "flos": 22929430343040.0, "grad_norm": 3.107626737647839, "language_loss": 0.79695755, "learning_rate": 3.034649988648935e-06, "loss": 0.81894928, "num_input_tokens_seen": 62036075, "step": 2887, "time_per_iteration": 2.664337158203125 }, { "auxiliary_loss_clip": 0.01167208, "auxiliary_loss_mlp": 0.01027824, "balance_loss_clip": 1.05183291, "balance_loss_mlp": 1.0189662, "epoch": 0.3472614681656947, "flos": 21324259365120.0, "grad_norm": 1.829341436275867, "language_loss": 0.80652487, "learning_rate": 3.033983274803752e-06, "loss": 0.82847512, "num_input_tokens_seen": 62055865, "step": 2888, "time_per_iteration": 2.7027506828308105 }, { "auxiliary_loss_clip": 0.01161904, "auxiliary_loss_mlp": 0.01033257, "balance_loss_clip": 1.05066836, "balance_loss_mlp": 1.02475178, "epoch": 0.3473817110563338, "flos": 23475739271040.0, "grad_norm": 2.0733122407669224, "language_loss": 0.72475326, "learning_rate": 3.0333164041041283e-06, "loss": 0.74670494, "num_input_tokens_seen": 62072180, "step": 2889, "time_per_iteration": 2.6812119483947754 }, { "auxiliary_loss_clip": 0.01126991, "auxiliary_loss_mlp": 0.01027991, "balance_loss_clip": 1.04945183, "balance_loss_mlp": 1.01970553, "epoch": 0.34750195394697286, "flos": 22346025644160.0, "grad_norm": 2.5021119396543354, "language_loss": 0.71676064, "learning_rate": 3.032649376651228e-06, "loss": 0.73831046, "num_input_tokens_seen": 62091600, "step": 2890, "time_per_iteration": 2.792116165161133 }, { "auxiliary_loss_clip": 0.01152676, "auxiliary_loss_mlp": 0.01035971, "balance_loss_clip": 1.05083287, "balance_loss_mlp": 1.02605319, "epoch": 0.347622196837612, "flos": 29095004885760.0, "grad_norm": 2.243178749572877, "language_loss": 0.7594465, "learning_rate": 3.031982192546238e-06, "loss": 0.78133303, "num_input_tokens_seen": 62114695, "step": 2891, "time_per_iteration": 2.7642650604248047 }, { "auxiliary_loss_clip": 0.01183718, "auxiliary_loss_mlp": 0.0102844, "balance_loss_clip": 1.05355501, "balance_loss_mlp": 1.02051854, "epoch": 0.3477424397282511, "flos": 22455732758400.0, "grad_norm": 8.420097990764068, "language_loss": 0.95007885, "learning_rate": 3.0313148518903696e-06, "loss": 0.97220039, "num_input_tokens_seen": 62134520, "step": 2892, "time_per_iteration": 2.6494662761688232 }, { "auxiliary_loss_clip": 0.01174852, "auxiliary_loss_mlp": 0.01031806, "balance_loss_clip": 1.05749059, "balance_loss_mlp": 1.02345514, "epoch": 0.34786268261889014, "flos": 15778790242560.0, "grad_norm": 2.479159479969363, "language_loss": 0.81102443, "learning_rate": 3.030647354784859e-06, "loss": 0.83309102, "num_input_tokens_seen": 62151560, "step": 2893, "time_per_iteration": 2.7526423931121826 }, { "auxiliary_loss_clip": 0.01148995, "auxiliary_loss_mlp": 0.0102987, "balance_loss_clip": 1.04957628, "balance_loss_mlp": 1.02206171, "epoch": 0.34798292550952925, "flos": 20777627214720.0, "grad_norm": 1.8509895514881833, "language_loss": 0.77278435, "learning_rate": 3.029979701330964e-06, "loss": 0.79457301, "num_input_tokens_seen": 62170985, "step": 2894, "time_per_iteration": 2.723860263824463 }, { "auxiliary_loss_clip": 0.01172855, "auxiliary_loss_mlp": 0.01032695, "balance_loss_clip": 1.05386794, "balance_loss_mlp": 1.02377212, "epoch": 0.34810316840016836, "flos": 19937820257280.0, "grad_norm": 2.5211748898191777, "language_loss": 0.8044644, "learning_rate": 3.029311891629966e-06, "loss": 0.82651985, "num_input_tokens_seen": 62189440, "step": 2895, "time_per_iteration": 2.6895737648010254 }, { "auxiliary_loss_clip": 0.01164186, "auxiliary_loss_mlp": 0.01029764, "balance_loss_clip": 1.05314589, "balance_loss_mlp": 1.02088308, "epoch": 0.3482234112908074, "flos": 23623296341760.0, "grad_norm": 1.8313039277942187, "language_loss": 0.74818659, "learning_rate": 3.0286439257831744e-06, "loss": 0.7701261, "num_input_tokens_seen": 62208910, "step": 2896, "time_per_iteration": 3.612828016281128 }, { "auxiliary_loss_clip": 0.01199023, "auxiliary_loss_mlp": 0.01032185, "balance_loss_clip": 1.05663133, "balance_loss_mlp": 1.02208161, "epoch": 0.3483436541814465, "flos": 23986712194560.0, "grad_norm": 2.15714261934903, "language_loss": 0.71846879, "learning_rate": 3.0279758038919156e-06, "loss": 0.74078083, "num_input_tokens_seen": 62227135, "step": 2897, "time_per_iteration": 2.6443543434143066 }, { "auxiliary_loss_clip": 0.01185435, "auxiliary_loss_mlp": 0.01029107, "balance_loss_clip": 1.05736327, "balance_loss_mlp": 1.01977921, "epoch": 0.34846389707208564, "flos": 22638338524800.0, "grad_norm": 2.0154863267991954, "language_loss": 0.7854352, "learning_rate": 3.0273075260575455e-06, "loss": 0.80758065, "num_input_tokens_seen": 62246035, "step": 2898, "time_per_iteration": 2.6476950645446777 }, { "auxiliary_loss_clip": 0.01169868, "auxiliary_loss_mlp": 0.010316, "balance_loss_clip": 1.05182791, "balance_loss_mlp": 1.02248633, "epoch": 0.3485841399627247, "flos": 21792857218560.0, "grad_norm": 1.8491760669656492, "language_loss": 0.80910677, "learning_rate": 3.0266390923814396e-06, "loss": 0.8311215, "num_input_tokens_seen": 62264095, "step": 2899, "time_per_iteration": 2.67299485206604 }, { "auxiliary_loss_clip": 0.0117492, "auxiliary_loss_mlp": 0.01030865, "balance_loss_clip": 1.05857134, "balance_loss_mlp": 1.02204347, "epoch": 0.3487043828533638, "flos": 17019036996480.0, "grad_norm": 1.820856832493506, "language_loss": 0.8239553, "learning_rate": 3.0259705029650008e-06, "loss": 0.84601313, "num_input_tokens_seen": 62282025, "step": 2900, "time_per_iteration": 2.7655794620513916 }, { "auxiliary_loss_clip": 0.0118347, "auxiliary_loss_mlp": 0.01026336, "balance_loss_clip": 1.05371737, "balance_loss_mlp": 1.01784849, "epoch": 0.34882462574400286, "flos": 22601135013120.0, "grad_norm": 1.7937924106543541, "language_loss": 0.729379, "learning_rate": 3.025301757909652e-06, "loss": 0.75147712, "num_input_tokens_seen": 62302220, "step": 2901, "time_per_iteration": 3.5888545513153076 }, { "auxiliary_loss_clip": 0.01151367, "auxiliary_loss_mlp": 0.00764075, "balance_loss_clip": 1.05110073, "balance_loss_mlp": 1.00029755, "epoch": 0.34894486863464197, "flos": 29861518141440.0, "grad_norm": 1.4592093581033876, "language_loss": 0.80749345, "learning_rate": 3.024632857316842e-06, "loss": 0.82664788, "num_input_tokens_seen": 62323535, "step": 2902, "time_per_iteration": 2.8104183673858643 }, { "auxiliary_loss_clip": 0.01184938, "auxiliary_loss_mlp": 0.01038986, "balance_loss_clip": 1.05744886, "balance_loss_mlp": 1.03061104, "epoch": 0.3490651115252811, "flos": 22122265870080.0, "grad_norm": 2.5239466025280417, "language_loss": 0.77306384, "learning_rate": 3.0239638012880412e-06, "loss": 0.79530305, "num_input_tokens_seen": 62343430, "step": 2903, "time_per_iteration": 2.6399543285369873 }, { "auxiliary_loss_clip": 0.01131492, "auxiliary_loss_mlp": 0.01030589, "balance_loss_clip": 1.0480957, "balance_loss_mlp": 1.02149963, "epoch": 0.34918535441592014, "flos": 12676682943360.0, "grad_norm": 2.4889272468711616, "language_loss": 0.81751055, "learning_rate": 3.0232945899247466e-06, "loss": 0.83913136, "num_input_tokens_seen": 62360365, "step": 2904, "time_per_iteration": 2.7688820362091064 }, { "auxiliary_loss_clip": 0.01183776, "auxiliary_loss_mlp": 0.01032673, "balance_loss_clip": 1.05317509, "balance_loss_mlp": 1.02325523, "epoch": 0.34930559730655925, "flos": 23185617120000.0, "grad_norm": 2.0882445401698613, "language_loss": 0.77496684, "learning_rate": 3.022625223328476e-06, "loss": 0.7971313, "num_input_tokens_seen": 62382105, "step": 2905, "time_per_iteration": 3.6787290573120117 }, { "auxiliary_loss_clip": 0.01190399, "auxiliary_loss_mlp": 0.01032856, "balance_loss_clip": 1.05583966, "balance_loss_mlp": 1.02366543, "epoch": 0.34942584019719836, "flos": 22855023319680.0, "grad_norm": 2.657250919549311, "language_loss": 0.69455576, "learning_rate": 3.0219557016007723e-06, "loss": 0.71678835, "num_input_tokens_seen": 62402235, "step": 2906, "time_per_iteration": 2.6393113136291504 }, { "auxiliary_loss_clip": 0.01179451, "auxiliary_loss_mlp": 0.01035452, "balance_loss_clip": 1.05653214, "balance_loss_mlp": 1.02650571, "epoch": 0.3495460830878374, "flos": 24426043441920.0, "grad_norm": 1.8787776544060832, "language_loss": 0.69687843, "learning_rate": 3.021286024843202e-06, "loss": 0.71902746, "num_input_tokens_seen": 62420430, "step": 2907, "time_per_iteration": 2.7353363037109375 }, { "auxiliary_loss_clip": 0.01099926, "auxiliary_loss_mlp": 0.01001339, "balance_loss_clip": 1.02659392, "balance_loss_mlp": 0.99967051, "epoch": 0.3496663259784765, "flos": 70008749389440.0, "grad_norm": 1.072254078965234, "language_loss": 0.64745384, "learning_rate": 3.0206161931573526e-06, "loss": 0.66846651, "num_input_tokens_seen": 62472980, "step": 2908, "time_per_iteration": 3.146068811416626 }, { "auxiliary_loss_clip": 0.01163665, "auxiliary_loss_mlp": 0.01031917, "balance_loss_clip": 1.04975796, "balance_loss_mlp": 1.02343512, "epoch": 0.34978656886911563, "flos": 28692805322880.0, "grad_norm": 1.6110440033108082, "language_loss": 0.93010294, "learning_rate": 3.0199462066448388e-06, "loss": 0.95205879, "num_input_tokens_seen": 62495175, "step": 2909, "time_per_iteration": 2.770305633544922 }, { "auxiliary_loss_clip": 0.01186676, "auxiliary_loss_mlp": 0.01030429, "balance_loss_clip": 1.05657959, "balance_loss_mlp": 1.02156019, "epoch": 0.3499068117597547, "flos": 21142156389120.0, "grad_norm": 1.8147112413376805, "language_loss": 0.69623852, "learning_rate": 3.019276065407296e-06, "loss": 0.7184096, "num_input_tokens_seen": 62514295, "step": 2910, "time_per_iteration": 2.6421573162078857 }, { "auxiliary_loss_clip": 0.01141609, "auxiliary_loss_mlp": 0.01028789, "balance_loss_clip": 1.05036974, "balance_loss_mlp": 1.01972306, "epoch": 0.3500270546503938, "flos": 22782699285120.0, "grad_norm": 1.7737035787021966, "language_loss": 0.80629694, "learning_rate": 3.018605769546385e-06, "loss": 0.8280009, "num_input_tokens_seen": 62534850, "step": 2911, "time_per_iteration": 2.8473026752471924 }, { "auxiliary_loss_clip": 0.01179569, "auxiliary_loss_mlp": 0.01035185, "balance_loss_clip": 1.05129838, "balance_loss_mlp": 1.02572536, "epoch": 0.3501472975410329, "flos": 22894058424960.0, "grad_norm": 2.1656836444925, "language_loss": 0.79656351, "learning_rate": 3.017935319163788e-06, "loss": 0.81871098, "num_input_tokens_seen": 62553810, "step": 2912, "time_per_iteration": 2.706317901611328 }, { "auxiliary_loss_clip": 0.01183612, "auxiliary_loss_mlp": 0.01030637, "balance_loss_clip": 1.05419207, "balance_loss_mlp": 1.01988411, "epoch": 0.35026754043167196, "flos": 25446588658560.0, "grad_norm": 1.8699807260351646, "language_loss": 0.70505321, "learning_rate": 3.017264714361213e-06, "loss": 0.72719568, "num_input_tokens_seen": 62573460, "step": 2913, "time_per_iteration": 2.6591193675994873 }, { "auxiliary_loss_clip": 0.01165466, "auxiliary_loss_mlp": 0.0076402, "balance_loss_clip": 1.05277324, "balance_loss_mlp": 1.00029826, "epoch": 0.3503877833223111, "flos": 19573757959680.0, "grad_norm": 2.080590051632714, "language_loss": 0.82362032, "learning_rate": 3.016593955240389e-06, "loss": 0.84291518, "num_input_tokens_seen": 62592150, "step": 2914, "time_per_iteration": 2.699784994125366 }, { "auxiliary_loss_clip": 0.01081561, "auxiliary_loss_mlp": 0.01001596, "balance_loss_clip": 1.02177024, "balance_loss_mlp": 0.99977189, "epoch": 0.3505080262129502, "flos": 65072075880960.0, "grad_norm": 0.8233668207484838, "language_loss": 0.63670266, "learning_rate": 3.015923041903071e-06, "loss": 0.65753424, "num_input_tokens_seen": 62658275, "step": 2915, "time_per_iteration": 3.2850747108459473 }, { "auxiliary_loss_clip": 0.0118212, "auxiliary_loss_mlp": 0.01035759, "balance_loss_clip": 1.05563402, "balance_loss_mlp": 1.02715778, "epoch": 0.35062826910358924, "flos": 29314562768640.0, "grad_norm": 2.2978005890296314, "language_loss": 0.83528304, "learning_rate": 3.0152519744510347e-06, "loss": 0.85746181, "num_input_tokens_seen": 62678075, "step": 2916, "time_per_iteration": 2.82252836227417 }, { "auxiliary_loss_clip": 0.01154689, "auxiliary_loss_mlp": 0.01028916, "balance_loss_clip": 1.05044389, "balance_loss_mlp": 1.02029085, "epoch": 0.35074851199422835, "flos": 23987717775360.0, "grad_norm": 2.448849498275923, "language_loss": 0.83083051, "learning_rate": 3.014580752986081e-06, "loss": 0.85266656, "num_input_tokens_seen": 62696950, "step": 2917, "time_per_iteration": 2.690643787384033 }, { "auxiliary_loss_clip": 0.01141484, "auxiliary_loss_mlp": 0.01035136, "balance_loss_clip": 1.04971433, "balance_loss_mlp": 1.02630806, "epoch": 0.3508687548848674, "flos": 15224436668160.0, "grad_norm": 1.9955559047212241, "language_loss": 0.78702056, "learning_rate": 3.0139093776100345e-06, "loss": 0.80878675, "num_input_tokens_seen": 62713540, "step": 2918, "time_per_iteration": 2.6704654693603516 }, { "auxiliary_loss_clip": 0.01194166, "auxiliary_loss_mlp": 0.01031172, "balance_loss_clip": 1.05414379, "balance_loss_mlp": 1.02225506, "epoch": 0.3509889977755065, "flos": 21361750185600.0, "grad_norm": 1.9795531191064755, "language_loss": 0.75727868, "learning_rate": 3.013237848424741e-06, "loss": 0.77953213, "num_input_tokens_seen": 62732925, "step": 2919, "time_per_iteration": 2.612204074859619 }, { "auxiliary_loss_clip": 0.01173285, "auxiliary_loss_mlp": 0.01028415, "balance_loss_clip": 1.05589366, "balance_loss_mlp": 1.01943886, "epoch": 0.35110924066614563, "flos": 19135360465920.0, "grad_norm": 3.354863537911646, "language_loss": 0.74948627, "learning_rate": 3.012566165532072e-06, "loss": 0.77150327, "num_input_tokens_seen": 62751715, "step": 2920, "time_per_iteration": 2.6619694232940674 }, { "auxiliary_loss_clip": 0.01132317, "auxiliary_loss_mlp": 0.01037597, "balance_loss_clip": 1.04908228, "balance_loss_mlp": 1.02813208, "epoch": 0.3512294835567847, "flos": 21980885938560.0, "grad_norm": 2.1243630269617655, "language_loss": 0.76251388, "learning_rate": 3.0118943290339207e-06, "loss": 0.78421307, "num_input_tokens_seen": 62771925, "step": 2921, "time_per_iteration": 2.761431932449341 }, { "auxiliary_loss_clip": 0.01139551, "auxiliary_loss_mlp": 0.01036488, "balance_loss_clip": 1.04478097, "balance_loss_mlp": 1.02693975, "epoch": 0.3513497264474238, "flos": 17817294896640.0, "grad_norm": 1.8750901537890243, "language_loss": 0.68380392, "learning_rate": 3.011222339032204e-06, "loss": 0.70556432, "num_input_tokens_seen": 62790075, "step": 2922, "time_per_iteration": 3.6116373538970947 }, { "auxiliary_loss_clip": 0.01197587, "auxiliary_loss_mlp": 0.01037314, "balance_loss_clip": 1.05790448, "balance_loss_mlp": 1.0283556, "epoch": 0.3514699693380629, "flos": 26943417239040.0, "grad_norm": 1.9635529349910479, "language_loss": 0.69381917, "learning_rate": 3.0105501956288626e-06, "loss": 0.71616822, "num_input_tokens_seen": 62810545, "step": 2923, "time_per_iteration": 2.6699538230895996 }, { "auxiliary_loss_clip": 0.01185471, "auxiliary_loss_mlp": 0.01030773, "balance_loss_clip": 1.05320644, "balance_loss_mlp": 1.02208877, "epoch": 0.35159021222870196, "flos": 15267565923840.0, "grad_norm": 2.0745590666074114, "language_loss": 0.72743207, "learning_rate": 3.0098778989258602e-06, "loss": 0.74959445, "num_input_tokens_seen": 62829155, "step": 2924, "time_per_iteration": 2.6024160385131836 }, { "auxiliary_loss_clip": 0.01145722, "auxiliary_loss_mlp": 0.01028771, "balance_loss_clip": 1.04951608, "balance_loss_mlp": 1.01985979, "epoch": 0.35171045511934107, "flos": 13984154000640.0, "grad_norm": 3.1909506649341934, "language_loss": 0.88458627, "learning_rate": 3.009205449025183e-06, "loss": 0.90633118, "num_input_tokens_seen": 62845350, "step": 2925, "time_per_iteration": 2.6759209632873535 }, { "auxiliary_loss_clip": 0.01146602, "auxiliary_loss_mlp": 0.01030287, "balance_loss_clip": 1.04666054, "balance_loss_mlp": 1.02137041, "epoch": 0.3518306980099802, "flos": 14283434119680.0, "grad_norm": 1.9159730601500395, "language_loss": 0.63191628, "learning_rate": 3.008532846028842e-06, "loss": 0.65368521, "num_input_tokens_seen": 62862110, "step": 2926, "time_per_iteration": 2.6243698596954346 }, { "auxiliary_loss_clip": 0.01198938, "auxiliary_loss_mlp": 0.01038394, "balance_loss_clip": 1.05803025, "balance_loss_mlp": 1.02858901, "epoch": 0.35195094090061924, "flos": 27052872958080.0, "grad_norm": 3.1806536793345406, "language_loss": 0.71862161, "learning_rate": 3.0078600900388694e-06, "loss": 0.74099493, "num_input_tokens_seen": 62882415, "step": 2927, "time_per_iteration": 3.6043879985809326 }, { "auxiliary_loss_clip": 0.01140422, "auxiliary_loss_mlp": 0.01034156, "balance_loss_clip": 1.04548383, "balance_loss_mlp": 1.02478635, "epoch": 0.35207118379125835, "flos": 25629266252160.0, "grad_norm": 3.3311178340406404, "language_loss": 0.74353015, "learning_rate": 3.007187181157323e-06, "loss": 0.76527596, "num_input_tokens_seen": 62902425, "step": 2928, "time_per_iteration": 2.8107335567474365 }, { "auxiliary_loss_clip": 0.01110793, "auxiliary_loss_mlp": 0.01042651, "balance_loss_clip": 1.04371452, "balance_loss_mlp": 1.03344214, "epoch": 0.35219142668189746, "flos": 18004713085440.0, "grad_norm": 2.867851687076375, "language_loss": 0.6805799, "learning_rate": 3.006514119486282e-06, "loss": 0.70211434, "num_input_tokens_seen": 62919255, "step": 2929, "time_per_iteration": 2.7337088584899902 }, { "auxiliary_loss_clip": 0.01141407, "auxiliary_loss_mlp": 0.01028597, "balance_loss_clip": 1.04712343, "balance_loss_mlp": 1.01941192, "epoch": 0.3523116695725365, "flos": 14028109269120.0, "grad_norm": 1.914410051371642, "language_loss": 0.70152932, "learning_rate": 3.005840905127849e-06, "loss": 0.72322941, "num_input_tokens_seen": 62936160, "step": 2930, "time_per_iteration": 3.568060874938965 }, { "auxiliary_loss_clip": 0.01196596, "auxiliary_loss_mlp": 0.01035767, "balance_loss_clip": 1.05724788, "balance_loss_mlp": 1.02614713, "epoch": 0.3524319124631756, "flos": 21433966479360.0, "grad_norm": 3.00388824873739, "language_loss": 0.867975, "learning_rate": 3.0051675381841516e-06, "loss": 0.8902986, "num_input_tokens_seen": 62953470, "step": 2931, "time_per_iteration": 3.5451879501342773 }, { "auxiliary_loss_clip": 0.01108099, "auxiliary_loss_mlp": 0.00764991, "balance_loss_clip": 1.04323912, "balance_loss_mlp": 1.0002172, "epoch": 0.3525521553538147, "flos": 26322773114880.0, "grad_norm": 1.5167175861539668, "language_loss": 0.76852381, "learning_rate": 3.0044940187573363e-06, "loss": 0.78725475, "num_input_tokens_seen": 62974480, "step": 2932, "time_per_iteration": 2.834671974182129 }, { "auxiliary_loss_clip": 0.01183304, "auxiliary_loss_mlp": 0.01028607, "balance_loss_clip": 1.05249751, "balance_loss_mlp": 1.01926136, "epoch": 0.3526723982444538, "flos": 21543314457600.0, "grad_norm": 2.0190965462306574, "language_loss": 0.65252453, "learning_rate": 3.003820346949578e-06, "loss": 0.67464364, "num_input_tokens_seen": 62992560, "step": 2933, "time_per_iteration": 2.703700065612793 }, { "auxiliary_loss_clip": 0.01196393, "auxiliary_loss_mlp": 0.01032651, "balance_loss_clip": 1.05455184, "balance_loss_mlp": 1.02429485, "epoch": 0.3527926411350929, "flos": 23733649900800.0, "grad_norm": 2.3141155908921736, "language_loss": 0.79646766, "learning_rate": 3.003146522863071e-06, "loss": 0.81875813, "num_input_tokens_seen": 63013445, "step": 2934, "time_per_iteration": 2.6311252117156982 }, { "auxiliary_loss_clip": 0.01165958, "auxiliary_loss_mlp": 0.01031845, "balance_loss_clip": 1.05341005, "balance_loss_mlp": 1.02276671, "epoch": 0.35291288402573195, "flos": 30445461544320.0, "grad_norm": 2.5313947431030357, "language_loss": 0.85920423, "learning_rate": 3.0024725466000345e-06, "loss": 0.88118225, "num_input_tokens_seen": 63033400, "step": 2935, "time_per_iteration": 2.7431368827819824 }, { "auxiliary_loss_clip": 0.01180677, "auxiliary_loss_mlp": 0.01039198, "balance_loss_clip": 1.05456233, "balance_loss_mlp": 1.03026891, "epoch": 0.35303312691637107, "flos": 23112179763840.0, "grad_norm": 1.757103577465668, "language_loss": 0.78776145, "learning_rate": 3.0017984182627087e-06, "loss": 0.80996013, "num_input_tokens_seen": 63052725, "step": 2936, "time_per_iteration": 2.6730575561523438 }, { "auxiliary_loss_clip": 0.01149331, "auxiliary_loss_mlp": 0.00764209, "balance_loss_clip": 1.04834092, "balance_loss_mlp": 1.00029624, "epoch": 0.3531533698070102, "flos": 21835699165440.0, "grad_norm": 2.0160478149532395, "language_loss": 0.82409286, "learning_rate": 3.00112413795336e-06, "loss": 0.84322822, "num_input_tokens_seen": 63072560, "step": 2937, "time_per_iteration": 2.794373035430908 }, { "auxiliary_loss_clip": 0.01163917, "auxiliary_loss_mlp": 0.01029182, "balance_loss_clip": 1.04769254, "balance_loss_mlp": 1.01972914, "epoch": 0.35327361269764923, "flos": 15778969810560.0, "grad_norm": 2.2810593737799554, "language_loss": 0.80245125, "learning_rate": 3.000449705774275e-06, "loss": 0.82438231, "num_input_tokens_seen": 63090800, "step": 2938, "time_per_iteration": 2.676865339279175 }, { "auxiliary_loss_clip": 0.01185927, "auxiliary_loss_mlp": 0.01032865, "balance_loss_clip": 1.05614805, "balance_loss_mlp": 1.02408528, "epoch": 0.35339385558828834, "flos": 22090413484800.0, "grad_norm": 1.8500346653595896, "language_loss": 0.71661198, "learning_rate": 2.9997751218277654e-06, "loss": 0.73879993, "num_input_tokens_seen": 63108955, "step": 2939, "time_per_iteration": 2.6270527839660645 }, { "auxiliary_loss_clip": 0.01197623, "auxiliary_loss_mlp": 0.01029208, "balance_loss_clip": 1.05613792, "balance_loss_mlp": 1.02003503, "epoch": 0.35351409847892745, "flos": 24165008328960.0, "grad_norm": 3.5706132456323463, "language_loss": 0.78341234, "learning_rate": 2.999100386216166e-06, "loss": 0.80568075, "num_input_tokens_seen": 63127895, "step": 2940, "time_per_iteration": 2.629265546798706 }, { "auxiliary_loss_clip": 0.01167449, "auxiliary_loss_mlp": 0.01030479, "balance_loss_clip": 1.05194783, "balance_loss_mlp": 1.02193809, "epoch": 0.3536343413695665, "flos": 27052298340480.0, "grad_norm": 2.098110009771241, "language_loss": 0.74429965, "learning_rate": 2.998425499041831e-06, "loss": 0.76627886, "num_input_tokens_seen": 63148410, "step": 2941, "time_per_iteration": 2.72666335105896 }, { "auxiliary_loss_clip": 0.01081388, "auxiliary_loss_mlp": 0.01003662, "balance_loss_clip": 1.02041698, "balance_loss_mlp": 1.00193357, "epoch": 0.3537545842602056, "flos": 65991066370560.0, "grad_norm": 1.2687750889226708, "language_loss": 0.64489585, "learning_rate": 2.997750460407142e-06, "loss": 0.66574639, "num_input_tokens_seen": 63209765, "step": 2942, "time_per_iteration": 3.2771735191345215 }, { "auxiliary_loss_clip": 0.01156593, "auxiliary_loss_mlp": 0.01027231, "balance_loss_clip": 1.04961658, "balance_loss_mlp": 1.01791453, "epoch": 0.35387482715084473, "flos": 18436897526400.0, "grad_norm": 2.1614805445511625, "language_loss": 0.69833148, "learning_rate": 2.997075270414501e-06, "loss": 0.72016966, "num_input_tokens_seen": 63226980, "step": 2943, "time_per_iteration": 2.7233757972717285 }, { "auxiliary_loss_clip": 0.01070333, "auxiliary_loss_mlp": 0.01004111, "balance_loss_clip": 1.02097201, "balance_loss_mlp": 1.0024178, "epoch": 0.3539950700414838, "flos": 65588579498880.0, "grad_norm": 0.7090042950016123, "language_loss": 0.5773083, "learning_rate": 2.9963999291663347e-06, "loss": 0.59805274, "num_input_tokens_seen": 63292760, "step": 2944, "time_per_iteration": 3.246868848800659 }, { "auxiliary_loss_clip": 0.01138095, "auxiliary_loss_mlp": 0.0103187, "balance_loss_clip": 1.05094242, "balance_loss_mlp": 1.0227983, "epoch": 0.3541153129321229, "flos": 20521655919360.0, "grad_norm": 3.679678595111307, "language_loss": 0.74211258, "learning_rate": 2.9957244367650915e-06, "loss": 0.76381218, "num_input_tokens_seen": 63309005, "step": 2945, "time_per_iteration": 2.727323055267334 }, { "auxiliary_loss_clip": 0.01135532, "auxiliary_loss_mlp": 0.01034962, "balance_loss_clip": 1.0508635, "balance_loss_mlp": 1.02619362, "epoch": 0.354235555822762, "flos": 19573578391680.0, "grad_norm": 1.9885413575104811, "language_loss": 0.83715379, "learning_rate": 2.9950487933132425e-06, "loss": 0.8588587, "num_input_tokens_seen": 63326420, "step": 2946, "time_per_iteration": 2.7159605026245117 }, { "auxiliary_loss_clip": 0.01185076, "auxiliary_loss_mlp": 0.01035365, "balance_loss_clip": 1.05432892, "balance_loss_mlp": 1.02658498, "epoch": 0.35435579871340106, "flos": 20777268078720.0, "grad_norm": 2.677571011943611, "language_loss": 0.71448016, "learning_rate": 2.994372998913283e-06, "loss": 0.73668456, "num_input_tokens_seen": 63344925, "step": 2947, "time_per_iteration": 2.6551353931427 }, { "auxiliary_loss_clip": 0.0116804, "auxiliary_loss_mlp": 0.01033449, "balance_loss_clip": 1.05508173, "balance_loss_mlp": 1.02469862, "epoch": 0.35447604160404017, "flos": 23951807153280.0, "grad_norm": 2.807209481944425, "language_loss": 0.62374508, "learning_rate": 2.99369705366773e-06, "loss": 0.64575994, "num_input_tokens_seen": 63365170, "step": 2948, "time_per_iteration": 3.621102809906006 }, { "auxiliary_loss_clip": 0.01163733, "auxiliary_loss_mlp": 0.0103461, "balance_loss_clip": 1.05232048, "balance_loss_mlp": 1.02550268, "epoch": 0.3545962844946792, "flos": 23435662671360.0, "grad_norm": 1.8980530493076788, "language_loss": 0.82112199, "learning_rate": 2.9930209576791244e-06, "loss": 0.84310544, "num_input_tokens_seen": 63383645, "step": 2949, "time_per_iteration": 2.7469656467437744 }, { "auxiliary_loss_clip": 0.01177694, "auxiliary_loss_mlp": 0.01030058, "balance_loss_clip": 1.05332756, "balance_loss_mlp": 1.0206759, "epoch": 0.35471652738531834, "flos": 22085134185600.0, "grad_norm": 2.4523922925816857, "language_loss": 0.63599706, "learning_rate": 2.9923447110500285e-06, "loss": 0.65807462, "num_input_tokens_seen": 63402390, "step": 2950, "time_per_iteration": 2.61568021774292 }, { "auxiliary_loss_clip": 0.01170972, "auxiliary_loss_mlp": 0.01032445, "balance_loss_clip": 1.05293775, "balance_loss_mlp": 1.02368879, "epoch": 0.35483677027595745, "flos": 27341881787520.0, "grad_norm": 1.6251560381479642, "language_loss": 0.75455844, "learning_rate": 2.9916683138830295e-06, "loss": 0.77659267, "num_input_tokens_seen": 63423055, "step": 2951, "time_per_iteration": 2.7109241485595703 }, { "auxiliary_loss_clip": 0.01164807, "auxiliary_loss_mlp": 0.01030045, "balance_loss_clip": 1.05272043, "balance_loss_mlp": 1.02131343, "epoch": 0.3549570131665965, "flos": 13516166678400.0, "grad_norm": 1.9315953052432235, "language_loss": 0.81155837, "learning_rate": 2.9909917662807353e-06, "loss": 0.83350688, "num_input_tokens_seen": 63440855, "step": 2952, "time_per_iteration": 3.6108484268188477 }, { "auxiliary_loss_clip": 0.0117927, "auxiliary_loss_mlp": 0.01034896, "balance_loss_clip": 1.05400109, "balance_loss_mlp": 1.02633107, "epoch": 0.3550772560572356, "flos": 20887549810560.0, "grad_norm": 2.111690326077062, "language_loss": 0.69758821, "learning_rate": 2.9903150683457783e-06, "loss": 0.7197299, "num_input_tokens_seen": 63459400, "step": 2953, "time_per_iteration": 2.5993800163269043 }, { "auxiliary_loss_clip": 0.01165926, "auxiliary_loss_mlp": 0.01033871, "balance_loss_clip": 1.04934025, "balance_loss_mlp": 1.02436352, "epoch": 0.3551974989478747, "flos": 20194042947840.0, "grad_norm": 2.3535776781244695, "language_loss": 0.6548934, "learning_rate": 2.9896382201808126e-06, "loss": 0.67689139, "num_input_tokens_seen": 63476800, "step": 2954, "time_per_iteration": 2.654672145843506 }, { "auxiliary_loss_clip": 0.01196657, "auxiliary_loss_mlp": 0.01027703, "balance_loss_clip": 1.05602098, "balance_loss_mlp": 1.01869082, "epoch": 0.3553177418385138, "flos": 19828831415040.0, "grad_norm": 2.512205705408409, "language_loss": 0.81470621, "learning_rate": 2.988961221888516e-06, "loss": 0.83694983, "num_input_tokens_seen": 63493475, "step": 2955, "time_per_iteration": 2.565715789794922 }, { "auxiliary_loss_clip": 0.01143441, "auxiliary_loss_mlp": 0.01031439, "balance_loss_clip": 1.04883492, "balance_loss_mlp": 1.02312374, "epoch": 0.3554379847291529, "flos": 14829132516480.0, "grad_norm": 3.256802443448596, "language_loss": 0.7898249, "learning_rate": 2.988284073571589e-06, "loss": 0.81157374, "num_input_tokens_seen": 63509560, "step": 2956, "time_per_iteration": 3.601400852203369 }, { "auxiliary_loss_clip": 0.01182662, "auxiliary_loss_mlp": 0.00764378, "balance_loss_clip": 1.05432057, "balance_loss_mlp": 1.00025332, "epoch": 0.355558227619792, "flos": 20485350247680.0, "grad_norm": 2.333207943624781, "language_loss": 0.73181719, "learning_rate": 2.9876067753327528e-06, "loss": 0.75128758, "num_input_tokens_seen": 63527290, "step": 2957, "time_per_iteration": 3.508537530899048 }, { "auxiliary_loss_clip": 0.01185728, "auxiliary_loss_mlp": 0.01031117, "balance_loss_clip": 1.05342591, "balance_loss_mlp": 1.02219427, "epoch": 0.35567847051043106, "flos": 37663613256960.0, "grad_norm": 2.108432696218772, "language_loss": 0.80613005, "learning_rate": 2.986929327274754e-06, "loss": 0.82829845, "num_input_tokens_seen": 63547870, "step": 2958, "time_per_iteration": 2.7309229373931885 }, { "auxiliary_loss_clip": 0.0117729, "auxiliary_loss_mlp": 0.01031246, "balance_loss_clip": 1.05306399, "balance_loss_mlp": 1.02237689, "epoch": 0.35579871340107017, "flos": 26943058103040.0, "grad_norm": 1.6750687991197504, "language_loss": 0.78900027, "learning_rate": 2.9862517295003617e-06, "loss": 0.81108564, "num_input_tokens_seen": 63568285, "step": 2959, "time_per_iteration": 2.709367513656616 }, { "auxiliary_loss_clip": 0.01149279, "auxiliary_loss_mlp": 0.01033383, "balance_loss_clip": 1.04696083, "balance_loss_mlp": 1.02463949, "epoch": 0.3559189562917093, "flos": 28293335193600.0, "grad_norm": 3.433574473161194, "language_loss": 0.72887468, "learning_rate": 2.9855739821123654e-06, "loss": 0.75070137, "num_input_tokens_seen": 63589865, "step": 2960, "time_per_iteration": 2.8028271198272705 }, { "auxiliary_loss_clip": 0.01179244, "auxiliary_loss_mlp": 0.01035552, "balance_loss_clip": 1.05487168, "balance_loss_mlp": 1.02702272, "epoch": 0.35603919918234833, "flos": 25664063552640.0, "grad_norm": 2.2144469309302535, "language_loss": 0.82033348, "learning_rate": 2.98489608521358e-06, "loss": 0.84248149, "num_input_tokens_seen": 63609805, "step": 2961, "time_per_iteration": 2.713576078414917 }, { "auxiliary_loss_clip": 0.01187003, "auxiliary_loss_mlp": 0.00763761, "balance_loss_clip": 1.05611074, "balance_loss_mlp": 1.00027084, "epoch": 0.35615944207298744, "flos": 23000856537600.0, "grad_norm": 2.4947540724011756, "language_loss": 0.79494023, "learning_rate": 2.9842180389068425e-06, "loss": 0.81444788, "num_input_tokens_seen": 63627115, "step": 2962, "time_per_iteration": 2.6376824378967285 }, { "auxiliary_loss_clip": 0.01065162, "auxiliary_loss_mlp": 0.01012221, "balance_loss_clip": 1.02843785, "balance_loss_mlp": 1.01039755, "epoch": 0.35627968496362655, "flos": 68251283723520.0, "grad_norm": 0.766997578516349, "language_loss": 0.59225297, "learning_rate": 2.98353984329501e-06, "loss": 0.61302686, "num_input_tokens_seen": 63691460, "step": 2963, "time_per_iteration": 3.315363645553589 }, { "auxiliary_loss_clip": 0.01166103, "auxiliary_loss_mlp": 0.01032488, "balance_loss_clip": 1.0536716, "balance_loss_mlp": 1.02261722, "epoch": 0.3563999278542656, "flos": 22641714403200.0, "grad_norm": 1.663735700407347, "language_loss": 0.7056365, "learning_rate": 2.982861498480965e-06, "loss": 0.72762239, "num_input_tokens_seen": 63713840, "step": 2964, "time_per_iteration": 2.7815921306610107 }, { "auxiliary_loss_clip": 0.01146039, "auxiliary_loss_mlp": 0.01025731, "balance_loss_clip": 1.0475843, "balance_loss_mlp": 1.01711786, "epoch": 0.3565201707449047, "flos": 25952533678080.0, "grad_norm": 1.7013612272510852, "language_loss": 0.82699519, "learning_rate": 2.9821830045676122e-06, "loss": 0.84871286, "num_input_tokens_seen": 63733540, "step": 2965, "time_per_iteration": 2.7753610610961914 }, { "auxiliary_loss_clip": 0.01199598, "auxiliary_loss_mlp": 0.01033445, "balance_loss_clip": 1.05775881, "balance_loss_mlp": 1.02360415, "epoch": 0.3566404136355438, "flos": 28475725478400.0, "grad_norm": 1.805597824194273, "language_loss": 0.72968823, "learning_rate": 2.9815043616578793e-06, "loss": 0.75201863, "num_input_tokens_seen": 63754335, "step": 2966, "time_per_iteration": 2.6762287616729736 }, { "auxiliary_loss_clip": 0.01148386, "auxiliary_loss_mlp": 0.01031952, "balance_loss_clip": 1.04961085, "balance_loss_mlp": 1.02366066, "epoch": 0.3567606565261829, "flos": 38363117690880.0, "grad_norm": 2.0328047610945768, "language_loss": 0.77292073, "learning_rate": 2.9808255698547145e-06, "loss": 0.79472411, "num_input_tokens_seen": 63777135, "step": 2967, "time_per_iteration": 2.8567299842834473 }, { "auxiliary_loss_clip": 0.01182731, "auxiliary_loss_mlp": 0.01030138, "balance_loss_clip": 1.05764782, "balance_loss_mlp": 1.02159083, "epoch": 0.356880899416822, "flos": 21981029592960.0, "grad_norm": 3.096418581797977, "language_loss": 0.7966423, "learning_rate": 2.9801466292610913e-06, "loss": 0.818771, "num_input_tokens_seen": 63797020, "step": 2968, "time_per_iteration": 2.7444419860839844 }, { "auxiliary_loss_clip": 0.01180556, "auxiliary_loss_mlp": 0.01026527, "balance_loss_clip": 1.05362666, "balance_loss_mlp": 1.0179919, "epoch": 0.35700114230746105, "flos": 18989132198400.0, "grad_norm": 2.1545782062715655, "language_loss": 0.81112742, "learning_rate": 2.979467539980003e-06, "loss": 0.83319825, "num_input_tokens_seen": 63813810, "step": 2969, "time_per_iteration": 2.695913791656494 }, { "auxiliary_loss_clip": 0.01183702, "auxiliary_loss_mlp": 0.01033977, "balance_loss_clip": 1.05573773, "balance_loss_mlp": 1.02495241, "epoch": 0.35712138519810016, "flos": 19756112330880.0, "grad_norm": 1.9876642146353385, "language_loss": 0.76902425, "learning_rate": 2.978788302114468e-06, "loss": 0.79120111, "num_input_tokens_seen": 63830925, "step": 2970, "time_per_iteration": 2.82220458984375 }, { "auxiliary_loss_clip": 0.01177019, "auxiliary_loss_mlp": 0.01041491, "balance_loss_clip": 1.05295455, "balance_loss_mlp": 1.0319066, "epoch": 0.35724162808873927, "flos": 35183012008320.0, "grad_norm": 2.194411800081643, "language_loss": 0.83444369, "learning_rate": 2.9781089157675255e-06, "loss": 0.85662878, "num_input_tokens_seen": 63849385, "step": 2971, "time_per_iteration": 2.781601667404175 }, { "auxiliary_loss_clip": 0.01178638, "auxiliary_loss_mlp": 0.0103105, "balance_loss_clip": 1.05693793, "balance_loss_mlp": 1.0227468, "epoch": 0.3573618709793783, "flos": 25556726736000.0, "grad_norm": 1.5909647843914085, "language_loss": 0.88462132, "learning_rate": 2.977429381042238e-06, "loss": 0.90671813, "num_input_tokens_seen": 63870060, "step": 2972, "time_per_iteration": 2.805368423461914 }, { "auxiliary_loss_clip": 0.01168981, "auxiliary_loss_mlp": 0.01029465, "balance_loss_clip": 1.05505311, "balance_loss_mlp": 1.02157331, "epoch": 0.35748211387001744, "flos": 29132352051840.0, "grad_norm": 2.1520768308544045, "language_loss": 0.89103043, "learning_rate": 2.9767496980416913e-06, "loss": 0.91301489, "num_input_tokens_seen": 63889355, "step": 2973, "time_per_iteration": 2.72102427482605 }, { "auxiliary_loss_clip": 0.01158509, "auxiliary_loss_mlp": 0.01026882, "balance_loss_clip": 1.04836512, "balance_loss_mlp": 1.01720786, "epoch": 0.35760235676065655, "flos": 13954169122560.0, "grad_norm": 2.1066987669919244, "language_loss": 0.8104344, "learning_rate": 2.9760698668689914e-06, "loss": 0.83228832, "num_input_tokens_seen": 63905580, "step": 2974, "time_per_iteration": 2.713947296142578 }, { "auxiliary_loss_clip": 0.0117757, "auxiliary_loss_mlp": 0.01029941, "balance_loss_clip": 1.05274701, "balance_loss_mlp": 1.02176905, "epoch": 0.3577225996512956, "flos": 44018688977280.0, "grad_norm": 1.9084311372201959, "language_loss": 0.71604347, "learning_rate": 2.975389887627269e-06, "loss": 0.73811865, "num_input_tokens_seen": 63928180, "step": 2975, "time_per_iteration": 3.7302658557891846 }, { "auxiliary_loss_clip": 0.01154512, "auxiliary_loss_mlp": 0.01031504, "balance_loss_clip": 1.04933119, "balance_loss_mlp": 1.02321935, "epoch": 0.3578428425419347, "flos": 17055199013760.0, "grad_norm": 2.2951536408469964, "language_loss": 0.89737028, "learning_rate": 2.9747097604196764e-06, "loss": 0.91923046, "num_input_tokens_seen": 63944825, "step": 2976, "time_per_iteration": 2.669689893722534 }, { "auxiliary_loss_clip": 0.01050488, "auxiliary_loss_mlp": 0.01015512, "balance_loss_clip": 1.02361274, "balance_loss_mlp": 1.0138911, "epoch": 0.3579630854325738, "flos": 71676550707840.0, "grad_norm": 0.7033145679024358, "language_loss": 0.56614566, "learning_rate": 2.9740294853493875e-06, "loss": 0.5868057, "num_input_tokens_seen": 64016385, "step": 2977, "time_per_iteration": 3.570793867111206 }, { "auxiliary_loss_clip": 0.01142414, "auxiliary_loss_mlp": 0.01031721, "balance_loss_clip": 1.048558, "balance_loss_mlp": 1.02279806, "epoch": 0.3580833283232129, "flos": 25046651652480.0, "grad_norm": 2.7261054759323877, "language_loss": 0.6734587, "learning_rate": 2.9733490625196008e-06, "loss": 0.69520009, "num_input_tokens_seen": 64036245, "step": 2978, "time_per_iteration": 3.6809422969818115 }, { "auxiliary_loss_clip": 0.0114368, "auxiliary_loss_mlp": 0.01035235, "balance_loss_clip": 1.04972458, "balance_loss_mlp": 1.02704477, "epoch": 0.358203571213852, "flos": 13953127628160.0, "grad_norm": 3.003121916110987, "language_loss": 0.76546204, "learning_rate": 2.9726684920335353e-06, "loss": 0.78725111, "num_input_tokens_seen": 64054110, "step": 2979, "time_per_iteration": 2.688033103942871 }, { "auxiliary_loss_clip": 0.01194467, "auxiliary_loss_mlp": 0.00764242, "balance_loss_clip": 1.05461645, "balance_loss_mlp": 1.00023985, "epoch": 0.35832381410449105, "flos": 20302457172480.0, "grad_norm": 2.168891584958668, "language_loss": 0.82312334, "learning_rate": 2.971987773994432e-06, "loss": 0.84271044, "num_input_tokens_seen": 64070295, "step": 2980, "time_per_iteration": 2.6672730445861816 }, { "auxiliary_loss_clip": 0.01171287, "auxiliary_loss_mlp": 0.01030714, "balance_loss_clip": 1.05096018, "balance_loss_mlp": 1.02221394, "epoch": 0.35844405699513016, "flos": 16983234115200.0, "grad_norm": 1.9685711916746054, "language_loss": 0.83232069, "learning_rate": 2.9713069085055566e-06, "loss": 0.85434067, "num_input_tokens_seen": 64088605, "step": 2981, "time_per_iteration": 2.731266975402832 }, { "auxiliary_loss_clip": 0.01151413, "auxiliary_loss_mlp": 0.01024755, "balance_loss_clip": 1.05167234, "balance_loss_mlp": 1.01678002, "epoch": 0.35856429988576927, "flos": 23216858974080.0, "grad_norm": 1.9175138221577082, "language_loss": 0.79310322, "learning_rate": 2.9706258956701958e-06, "loss": 0.81486487, "num_input_tokens_seen": 64108595, "step": 2982, "time_per_iteration": 3.6448233127593994 }, { "auxiliary_loss_clip": 0.01179126, "auxiliary_loss_mlp": 0.01029428, "balance_loss_clip": 1.05255318, "balance_loss_mlp": 1.02003431, "epoch": 0.3586845427764083, "flos": 23034576430080.0, "grad_norm": 55.016705483691055, "language_loss": 0.77734053, "learning_rate": 2.9699447355916575e-06, "loss": 0.79942608, "num_input_tokens_seen": 64127405, "step": 2983, "time_per_iteration": 3.591041326522827 }, { "auxiliary_loss_clip": 0.01194474, "auxiliary_loss_mlp": 0.00763996, "balance_loss_clip": 1.056885, "balance_loss_mlp": 1.00018907, "epoch": 0.35880478566704743, "flos": 20010682995840.0, "grad_norm": 1.9935260306153408, "language_loss": 0.7410599, "learning_rate": 2.969263428373275e-06, "loss": 0.76064467, "num_input_tokens_seen": 64145755, "step": 2984, "time_per_iteration": 2.593135356903076 }, { "auxiliary_loss_clip": 0.01167368, "auxiliary_loss_mlp": 0.01032851, "balance_loss_clip": 1.05174839, "balance_loss_mlp": 1.02420235, "epoch": 0.35892502855768654, "flos": 13699095667200.0, "grad_norm": 2.173234855886229, "language_loss": 0.79472613, "learning_rate": 2.9685819741184007e-06, "loss": 0.81672835, "num_input_tokens_seen": 64164195, "step": 2985, "time_per_iteration": 2.657317638397217 }, { "auxiliary_loss_clip": 0.01145295, "auxiliary_loss_mlp": 0.01030979, "balance_loss_clip": 1.05099905, "balance_loss_mlp": 1.02275395, "epoch": 0.3590452714483256, "flos": 18114096977280.0, "grad_norm": 2.689639831757914, "language_loss": 0.68873572, "learning_rate": 2.967900372930411e-06, "loss": 0.71049845, "num_input_tokens_seen": 64182705, "step": 2986, "time_per_iteration": 2.7035906314849854 }, { "auxiliary_loss_clip": 0.01160683, "auxiliary_loss_mlp": 0.01033049, "balance_loss_clip": 1.05188048, "balance_loss_mlp": 1.0245676, "epoch": 0.3591655143389647, "flos": 17749352321280.0, "grad_norm": 2.321788593443789, "language_loss": 0.79679108, "learning_rate": 2.9672186249127046e-06, "loss": 0.81872839, "num_input_tokens_seen": 64202170, "step": 2987, "time_per_iteration": 2.6422650814056396 }, { "auxiliary_loss_clip": 0.01167314, "auxiliary_loss_mlp": 0.01032796, "balance_loss_clip": 1.05391419, "balance_loss_mlp": 1.02463579, "epoch": 0.3592857572296038, "flos": 25224409082880.0, "grad_norm": 1.9545723729178024, "language_loss": 0.79115868, "learning_rate": 2.9665367301687014e-06, "loss": 0.81315982, "num_input_tokens_seen": 64220415, "step": 2988, "time_per_iteration": 2.6516716480255127 }, { "auxiliary_loss_clip": 0.01158918, "auxiliary_loss_mlp": 0.0102366, "balance_loss_clip": 1.05003059, "balance_loss_mlp": 1.01517797, "epoch": 0.3594060001202429, "flos": 29384408764800.0, "grad_norm": 2.1339009421681054, "language_loss": 0.76974463, "learning_rate": 2.965854688801845e-06, "loss": 0.79157043, "num_input_tokens_seen": 64242475, "step": 2989, "time_per_iteration": 2.7399585247039795 }, { "auxiliary_loss_clip": 0.01176372, "auxiliary_loss_mlp": 0.01030739, "balance_loss_clip": 1.04959679, "balance_loss_mlp": 1.02210867, "epoch": 0.359526243010882, "flos": 17052900543360.0, "grad_norm": 1.7239232497674615, "language_loss": 0.76499951, "learning_rate": 2.9651725009156005e-06, "loss": 0.78707063, "num_input_tokens_seen": 64260220, "step": 2990, "time_per_iteration": 2.6978909969329834 }, { "auxiliary_loss_clip": 0.01155974, "auxiliary_loss_mlp": 0.01034352, "balance_loss_clip": 1.04845119, "balance_loss_mlp": 1.02578664, "epoch": 0.3596464859015211, "flos": 22965089569920.0, "grad_norm": 1.7576081413897597, "language_loss": 0.74503267, "learning_rate": 2.964490166613454e-06, "loss": 0.76693588, "num_input_tokens_seen": 64280145, "step": 2991, "time_per_iteration": 2.695620536804199 }, { "auxiliary_loss_clip": 0.01101402, "auxiliary_loss_mlp": 0.01002363, "balance_loss_clip": 1.02935791, "balance_loss_mlp": 1.00076604, "epoch": 0.35976672879216015, "flos": 54739462590720.0, "grad_norm": 0.7555452525745984, "language_loss": 0.57741505, "learning_rate": 2.963807685998917e-06, "loss": 0.59845275, "num_input_tokens_seen": 64336010, "step": 2992, "time_per_iteration": 3.0939486026763916 }, { "auxiliary_loss_clip": 0.01137982, "auxiliary_loss_mlp": 0.01030561, "balance_loss_clip": 1.04829085, "balance_loss_mlp": 1.02193642, "epoch": 0.35988697168279926, "flos": 43139020901760.0, "grad_norm": 1.6591960963864554, "language_loss": 0.78054154, "learning_rate": 2.9631250591755196e-06, "loss": 0.8022269, "num_input_tokens_seen": 64358725, "step": 2993, "time_per_iteration": 2.9499051570892334 }, { "auxiliary_loss_clip": 0.01162957, "auxiliary_loss_mlp": 0.01032594, "balance_loss_clip": 1.05399168, "balance_loss_mlp": 1.02428508, "epoch": 0.36000721457343837, "flos": 35845600239360.0, "grad_norm": 1.9813116716206676, "language_loss": 0.57557082, "learning_rate": 2.962442286246817e-06, "loss": 0.59752631, "num_input_tokens_seen": 64381555, "step": 2994, "time_per_iteration": 2.817528247833252 }, { "auxiliary_loss_clip": 0.01169461, "auxiliary_loss_mlp": 0.01032506, "balance_loss_clip": 1.05247378, "balance_loss_mlp": 1.02353525, "epoch": 0.3601274574640774, "flos": 18291100222080.0, "grad_norm": 1.6488867344931484, "language_loss": 0.69759023, "learning_rate": 2.9617593673163853e-06, "loss": 0.71960992, "num_input_tokens_seen": 64400375, "step": 2995, "time_per_iteration": 2.832857131958008 }, { "auxiliary_loss_clip": 0.01168339, "auxiliary_loss_mlp": 0.01024825, "balance_loss_clip": 1.05024576, "balance_loss_mlp": 1.01638508, "epoch": 0.36024770035471654, "flos": 13333955961600.0, "grad_norm": 2.20907951348623, "language_loss": 0.77256602, "learning_rate": 2.9610763024878216e-06, "loss": 0.79449767, "num_input_tokens_seen": 64415880, "step": 2996, "time_per_iteration": 2.816661834716797 }, { "auxiliary_loss_clip": 0.01159807, "auxiliary_loss_mlp": 0.01028538, "balance_loss_clip": 1.05028975, "balance_loss_mlp": 1.02045608, "epoch": 0.3603679432453556, "flos": 20267013427200.0, "grad_norm": 1.9168041613923466, "language_loss": 0.91684777, "learning_rate": 2.960393091864747e-06, "loss": 0.93873125, "num_input_tokens_seen": 64434260, "step": 2997, "time_per_iteration": 2.783811330795288 }, { "auxiliary_loss_clip": 0.01166248, "auxiliary_loss_mlp": 0.01030466, "balance_loss_clip": 1.05232882, "balance_loss_mlp": 1.02227664, "epoch": 0.3604881861359947, "flos": 22451135817600.0, "grad_norm": 1.7615305240322987, "language_loss": 0.74957097, "learning_rate": 2.959709735550804e-06, "loss": 0.77153814, "num_input_tokens_seen": 64453855, "step": 2998, "time_per_iteration": 2.726381778717041 }, { "auxiliary_loss_clip": 0.01139389, "auxiliary_loss_mlp": 0.01029253, "balance_loss_clip": 1.04740024, "balance_loss_mlp": 1.02052653, "epoch": 0.3606084290266338, "flos": 22054251467520.0, "grad_norm": 1.9018128265148717, "language_loss": 0.75972396, "learning_rate": 2.9590262336496575e-06, "loss": 0.7814104, "num_input_tokens_seen": 64473585, "step": 2999, "time_per_iteration": 2.885101795196533 }, { "auxiliary_loss_clip": 0.01148507, "auxiliary_loss_mlp": 0.01025989, "balance_loss_clip": 1.05280638, "balance_loss_mlp": 1.01771045, "epoch": 0.36072867191727287, "flos": 15632921111040.0, "grad_norm": 2.597915275242869, "language_loss": 0.85597467, "learning_rate": 2.9583425862649936e-06, "loss": 0.87771964, "num_input_tokens_seen": 64491720, "step": 3000, "time_per_iteration": 4.237358808517456 }, { "auxiliary_loss_clip": 0.01198027, "auxiliary_loss_mlp": 0.01037805, "balance_loss_clip": 1.05814552, "balance_loss_mlp": 1.02883434, "epoch": 0.360848914807912, "flos": 19677000625920.0, "grad_norm": 2.529947337355544, "language_loss": 0.74085289, "learning_rate": 2.9576587935005215e-06, "loss": 0.76321113, "num_input_tokens_seen": 64509800, "step": 3001, "time_per_iteration": 2.6573708057403564 }, { "auxiliary_loss_clip": 0.01181382, "auxiliary_loss_mlp": 0.01036996, "balance_loss_clip": 1.05431259, "balance_loss_mlp": 1.02744722, "epoch": 0.3609691576985511, "flos": 18877808972160.0, "grad_norm": 2.3283847168288596, "language_loss": 0.72103858, "learning_rate": 2.9569748554599713e-06, "loss": 0.74322236, "num_input_tokens_seen": 64525410, "step": 3002, "time_per_iteration": 2.660202980041504 }, { "auxiliary_loss_clip": 0.01164384, "auxiliary_loss_mlp": 0.01030584, "balance_loss_clip": 1.05124807, "balance_loss_mlp": 1.02226877, "epoch": 0.36108940058919015, "flos": 42224088648960.0, "grad_norm": 2.207089238317271, "language_loss": 0.73260808, "learning_rate": 2.956290772247097e-06, "loss": 0.75455773, "num_input_tokens_seen": 64544085, "step": 3003, "time_per_iteration": 2.917388439178467 }, { "auxiliary_loss_clip": 0.01130587, "auxiliary_loss_mlp": 0.01028938, "balance_loss_clip": 1.04983497, "balance_loss_mlp": 1.02092743, "epoch": 0.36120964347982926, "flos": 23185150243200.0, "grad_norm": 2.5290326974899964, "language_loss": 0.73434436, "learning_rate": 2.9556065439656724e-06, "loss": 0.7559396, "num_input_tokens_seen": 64563135, "step": 3004, "time_per_iteration": 3.7215256690979004 }, { "auxiliary_loss_clip": 0.01110826, "auxiliary_loss_mlp": 0.01034675, "balance_loss_clip": 1.04129195, "balance_loss_mlp": 1.02657509, "epoch": 0.36132988637046837, "flos": 18113055482880.0, "grad_norm": 1.7127789026907507, "language_loss": 0.81939715, "learning_rate": 2.9549221707194952e-06, "loss": 0.84085214, "num_input_tokens_seen": 64581985, "step": 3005, "time_per_iteration": 2.858633279800415 }, { "auxiliary_loss_clip": 0.01182771, "auxiliary_loss_mlp": 0.01031475, "balance_loss_clip": 1.0549686, "balance_loss_mlp": 1.02244496, "epoch": 0.3614501292611074, "flos": 27813101333760.0, "grad_norm": 1.9966521993581747, "language_loss": 0.72795415, "learning_rate": 2.954237652612384e-06, "loss": 0.75009656, "num_input_tokens_seen": 64601035, "step": 3006, "time_per_iteration": 2.714794874191284 }, { "auxiliary_loss_clip": 0.01163728, "auxiliary_loss_mlp": 0.01029571, "balance_loss_clip": 1.05238163, "balance_loss_mlp": 1.02184319, "epoch": 0.36157037215174653, "flos": 22634926732800.0, "grad_norm": 1.9477136769913024, "language_loss": 0.846645, "learning_rate": 2.9535529897481796e-06, "loss": 0.86857802, "num_input_tokens_seen": 64618580, "step": 3007, "time_per_iteration": 2.754840612411499 }, { "auxiliary_loss_clip": 0.0119362, "auxiliary_loss_mlp": 0.01031055, "balance_loss_clip": 1.05366576, "balance_loss_mlp": 1.02211988, "epoch": 0.36169061504238564, "flos": 12600839376000.0, "grad_norm": 2.367705580839698, "language_loss": 0.77432978, "learning_rate": 2.9528681822307446e-06, "loss": 0.7965765, "num_input_tokens_seen": 64635430, "step": 3008, "time_per_iteration": 3.592416524887085 }, { "auxiliary_loss_clip": 0.01177008, "auxiliary_loss_mlp": 0.00763314, "balance_loss_clip": 1.05602264, "balance_loss_mlp": 1.00020218, "epoch": 0.3618108579330247, "flos": 26684644682880.0, "grad_norm": 2.1046109992905153, "language_loss": 0.8238008, "learning_rate": 2.952183230163964e-06, "loss": 0.84320402, "num_input_tokens_seen": 64655005, "step": 3009, "time_per_iteration": 3.5639495849609375 }, { "auxiliary_loss_clip": 0.01140361, "auxiliary_loss_mlp": 0.01028193, "balance_loss_clip": 1.04554725, "balance_loss_mlp": 1.01915669, "epoch": 0.3619311008236638, "flos": 22817029708800.0, "grad_norm": 1.936470142533772, "language_loss": 0.7368924, "learning_rate": 2.9514981336517448e-06, "loss": 0.75857794, "num_input_tokens_seen": 64674775, "step": 3010, "time_per_iteration": 2.799267530441284 }, { "auxiliary_loss_clip": 0.01180217, "auxiliary_loss_mlp": 0.01030578, "balance_loss_clip": 1.05647635, "balance_loss_mlp": 1.02193546, "epoch": 0.36205134371430286, "flos": 25919603884800.0, "grad_norm": 2.663304808391954, "language_loss": 0.81681979, "learning_rate": 2.950812892798015e-06, "loss": 0.83892775, "num_input_tokens_seen": 64695670, "step": 3011, "time_per_iteration": 2.695112705230713 }, { "auxiliary_loss_clip": 0.01126266, "auxiliary_loss_mlp": 0.00763669, "balance_loss_clip": 1.04763138, "balance_loss_mlp": 1.00022089, "epoch": 0.362171586604942, "flos": 26139592730880.0, "grad_norm": 3.75012947896066, "language_loss": 0.87312937, "learning_rate": 2.9501275077067256e-06, "loss": 0.89202881, "num_input_tokens_seen": 64716290, "step": 3012, "time_per_iteration": 2.8496928215026855 }, { "auxiliary_loss_clip": 0.01102656, "auxiliary_loss_mlp": 0.01026957, "balance_loss_clip": 1.04200137, "balance_loss_mlp": 1.01886547, "epoch": 0.3622918294955811, "flos": 28074208273920.0, "grad_norm": 1.7594165928459538, "language_loss": 0.88533318, "learning_rate": 2.949441978481848e-06, "loss": 0.9066292, "num_input_tokens_seen": 64737190, "step": 3013, "time_per_iteration": 2.8808069229125977 }, { "auxiliary_loss_clip": 0.01154577, "auxiliary_loss_mlp": 0.01032856, "balance_loss_clip": 1.04930639, "balance_loss_mlp": 1.02359962, "epoch": 0.36241207238622014, "flos": 19828005402240.0, "grad_norm": 2.0412071320149003, "language_loss": 0.80003285, "learning_rate": 2.9487563052273778e-06, "loss": 0.82190716, "num_input_tokens_seen": 64753950, "step": 3014, "time_per_iteration": 2.8149890899658203 }, { "auxiliary_loss_clip": 0.01178817, "auxiliary_loss_mlp": 0.01025504, "balance_loss_clip": 1.05724621, "balance_loss_mlp": 1.01701069, "epoch": 0.36253231527685925, "flos": 21397158017280.0, "grad_norm": 1.6861201308886742, "language_loss": 0.85621542, "learning_rate": 2.94807048804733e-06, "loss": 0.87825859, "num_input_tokens_seen": 64773570, "step": 3015, "time_per_iteration": 2.7524633407592773 }, { "auxiliary_loss_clip": 0.01155257, "auxiliary_loss_mlp": 0.01034526, "balance_loss_clip": 1.04898763, "balance_loss_mlp": 1.02510881, "epoch": 0.36265255816749836, "flos": 18362885552640.0, "grad_norm": 2.429001613076155, "language_loss": 0.90285921, "learning_rate": 2.9473845270457434e-06, "loss": 0.92475712, "num_input_tokens_seen": 64790385, "step": 3016, "time_per_iteration": 2.914992332458496 }, { "auxiliary_loss_clip": 0.01158287, "auxiliary_loss_mlp": 0.01030987, "balance_loss_clip": 1.05055952, "balance_loss_mlp": 1.02260637, "epoch": 0.3627728010581374, "flos": 18660046769280.0, "grad_norm": 3.041654709976678, "language_loss": 0.70047027, "learning_rate": 2.946698422326677e-06, "loss": 0.722363, "num_input_tokens_seen": 64807845, "step": 3017, "time_per_iteration": 3.052154064178467 }, { "auxiliary_loss_clip": 0.01133808, "auxiliary_loss_mlp": 0.01029214, "balance_loss_clip": 1.046731, "balance_loss_mlp": 1.01995778, "epoch": 0.36289304394877653, "flos": 27524272072320.0, "grad_norm": 2.0677791717600322, "language_loss": 0.80265737, "learning_rate": 2.946012173994213e-06, "loss": 0.82428765, "num_input_tokens_seen": 64827630, "step": 3018, "time_per_iteration": 2.96711802482605 }, { "auxiliary_loss_clip": 0.01175718, "auxiliary_loss_mlp": 0.01026906, "balance_loss_clip": 1.0564785, "balance_loss_mlp": 1.01832271, "epoch": 0.36301328683941564, "flos": 34533244932480.0, "grad_norm": 2.032708331925737, "language_loss": 0.67871171, "learning_rate": 2.945325782152454e-06, "loss": 0.70073795, "num_input_tokens_seen": 64850665, "step": 3019, "time_per_iteration": 2.9132590293884277 }, { "auxiliary_loss_clip": 0.01167853, "auxiliary_loss_mlp": 0.01025417, "balance_loss_clip": 1.05207038, "balance_loss_mlp": 1.01748395, "epoch": 0.3631335297300547, "flos": 19025976574080.0, "grad_norm": 2.4441300716797074, "language_loss": 0.79039603, "learning_rate": 2.9446392469055257e-06, "loss": 0.8123287, "num_input_tokens_seen": 64868700, "step": 3020, "time_per_iteration": 2.835613965988159 }, { "auxiliary_loss_clip": 0.01151761, "auxiliary_loss_mlp": 0.01028764, "balance_loss_clip": 1.05658543, "balance_loss_mlp": 1.02031815, "epoch": 0.3632537726206938, "flos": 19536769929600.0, "grad_norm": 1.803843155378383, "language_loss": 0.79762447, "learning_rate": 2.9439525683575745e-06, "loss": 0.81942976, "num_input_tokens_seen": 64887620, "step": 3021, "time_per_iteration": 2.8791427612304688 }, { "auxiliary_loss_clip": 0.01199858, "auxiliary_loss_mlp": 0.01025202, "balance_loss_clip": 1.0591253, "balance_loss_mlp": 1.01642799, "epoch": 0.3633740155113329, "flos": 21068611292160.0, "grad_norm": 2.6326434021022433, "language_loss": 0.7487964, "learning_rate": 2.9432657466127694e-06, "loss": 0.771047, "num_input_tokens_seen": 64907190, "step": 3022, "time_per_iteration": 2.721956491470337 }, { "auxiliary_loss_clip": 0.01140573, "auxiliary_loss_mlp": 0.01031037, "balance_loss_clip": 1.05553961, "balance_loss_mlp": 1.02302873, "epoch": 0.36349425840197197, "flos": 20298722158080.0, "grad_norm": 1.9308089420836378, "language_loss": 0.76693016, "learning_rate": 2.9425787817753007e-06, "loss": 0.78864622, "num_input_tokens_seen": 64925850, "step": 3023, "time_per_iteration": 2.8514485359191895 }, { "auxiliary_loss_clip": 0.01152094, "auxiliary_loss_mlp": 0.01033744, "balance_loss_clip": 1.05125368, "balance_loss_mlp": 1.02499402, "epoch": 0.3636145012926111, "flos": 29716762331520.0, "grad_norm": 1.4737271804959662, "language_loss": 0.71712446, "learning_rate": 2.94189167394938e-06, "loss": 0.7389828, "num_input_tokens_seen": 64948285, "step": 3024, "time_per_iteration": 2.9803271293640137 }, { "auxiliary_loss_clip": 0.01196271, "auxiliary_loss_mlp": 0.01026125, "balance_loss_clip": 1.05892563, "balance_loss_mlp": 1.01726151, "epoch": 0.3637347441832502, "flos": 21431847576960.0, "grad_norm": 1.9553254160462754, "language_loss": 0.81424248, "learning_rate": 2.941204423239241e-06, "loss": 0.83646637, "num_input_tokens_seen": 64967160, "step": 3025, "time_per_iteration": 2.76127552986145 }, { "auxiliary_loss_clip": 0.01178395, "auxiliary_loss_mlp": 0.01029962, "balance_loss_clip": 1.05551052, "balance_loss_mlp": 1.02109265, "epoch": 0.36385498707388925, "flos": 29533941083520.0, "grad_norm": 2.2313277704435457, "language_loss": 0.76082414, "learning_rate": 2.9405170297491395e-06, "loss": 0.78290772, "num_input_tokens_seen": 64987155, "step": 3026, "time_per_iteration": 3.9228198528289795 }, { "auxiliary_loss_clip": 0.0111501, "auxiliary_loss_mlp": 0.00764052, "balance_loss_clip": 1.05293345, "balance_loss_mlp": 1.00023627, "epoch": 0.36397522996452836, "flos": 22236569925120.0, "grad_norm": 2.4687679240571896, "language_loss": 0.80481339, "learning_rate": 2.939829493583353e-06, "loss": 0.82360399, "num_input_tokens_seen": 65003800, "step": 3027, "time_per_iteration": 3.0461294651031494 }, { "auxiliary_loss_clip": 0.01143561, "auxiliary_loss_mlp": 0.0102487, "balance_loss_clip": 1.04680467, "balance_loss_mlp": 1.01678181, "epoch": 0.3640954728551674, "flos": 21506505995520.0, "grad_norm": 2.452790317211603, "language_loss": 0.83476084, "learning_rate": 2.939141814846179e-06, "loss": 0.85644519, "num_input_tokens_seen": 65021215, "step": 3028, "time_per_iteration": 2.7621371746063232 }, { "auxiliary_loss_clip": 0.01166684, "auxiliary_loss_mlp": 0.01029214, "balance_loss_clip": 1.05299115, "balance_loss_mlp": 1.02060127, "epoch": 0.3642157157458065, "flos": 17712867081600.0, "grad_norm": 1.5733046126238512, "language_loss": 0.82668245, "learning_rate": 2.938453993641938e-06, "loss": 0.84864146, "num_input_tokens_seen": 65039590, "step": 3029, "time_per_iteration": 2.7919559478759766 }, { "auxiliary_loss_clip": 0.01164358, "auxiliary_loss_mlp": 0.01030736, "balance_loss_clip": 1.05610812, "balance_loss_mlp": 1.02174211, "epoch": 0.36433595863644563, "flos": 17639537466240.0, "grad_norm": 2.1227228105526064, "language_loss": 0.70844746, "learning_rate": 2.937766030074973e-06, "loss": 0.73039836, "num_input_tokens_seen": 65056845, "step": 3030, "time_per_iteration": 3.725620746612549 }, { "auxiliary_loss_clip": 0.01157961, "auxiliary_loss_mlp": 0.01030488, "balance_loss_clip": 1.05329752, "balance_loss_mlp": 1.02142215, "epoch": 0.3644562015270847, "flos": 26833279161600.0, "grad_norm": 2.0512648300478764, "language_loss": 0.82850456, "learning_rate": 2.937077924249646e-06, "loss": 0.85038906, "num_input_tokens_seen": 65079435, "step": 3031, "time_per_iteration": 2.8781497478485107 }, { "auxiliary_loss_clip": 0.01171258, "auxiliary_loss_mlp": 0.01032745, "balance_loss_clip": 1.05261672, "balance_loss_mlp": 1.0237329, "epoch": 0.3645764444177238, "flos": 14282715847680.0, "grad_norm": 2.192152377107307, "language_loss": 0.76043856, "learning_rate": 2.9363896762703443e-06, "loss": 0.78247857, "num_input_tokens_seen": 65096500, "step": 3032, "time_per_iteration": 2.866676092147827 }, { "auxiliary_loss_clip": 0.01195249, "auxiliary_loss_mlp": 0.0102649, "balance_loss_clip": 1.05673051, "balance_loss_mlp": 1.01774597, "epoch": 0.3646966873083629, "flos": 20667489137280.0, "grad_norm": 1.6930484979076361, "language_loss": 0.84800828, "learning_rate": 2.9357012862414725e-06, "loss": 0.87022567, "num_input_tokens_seen": 65115860, "step": 3033, "time_per_iteration": 2.7183830738067627 }, { "auxiliary_loss_clip": 0.01182945, "auxiliary_loss_mlp": 0.01031621, "balance_loss_clip": 1.05687213, "balance_loss_mlp": 1.02325857, "epoch": 0.36481693019900197, "flos": 27782613665280.0, "grad_norm": 2.058529981792853, "language_loss": 0.72044933, "learning_rate": 2.9350127542674593e-06, "loss": 0.74259496, "num_input_tokens_seen": 65138070, "step": 3034, "time_per_iteration": 4.58564019203186 }, { "auxiliary_loss_clip": 0.01172563, "auxiliary_loss_mlp": 0.01029838, "balance_loss_clip": 1.05740809, "balance_loss_mlp": 1.02100468, "epoch": 0.3649371730896411, "flos": 19712588025600.0, "grad_norm": 2.5815760642663537, "language_loss": 0.76589555, "learning_rate": 2.934324080452755e-06, "loss": 0.78791958, "num_input_tokens_seen": 65155860, "step": 3035, "time_per_iteration": 2.9124064445495605 }, { "auxiliary_loss_clip": 0.01141011, "auxiliary_loss_mlp": 0.00765008, "balance_loss_clip": 1.04920566, "balance_loss_mlp": 1.00017214, "epoch": 0.3650574159802802, "flos": 24750496016640.0, "grad_norm": 2.1381602425065545, "language_loss": 0.78404009, "learning_rate": 2.9336352649018307e-06, "loss": 0.80310035, "num_input_tokens_seen": 65175930, "step": 3036, "time_per_iteration": 2.8277554512023926 }, { "auxiliary_loss_clip": 0.01165841, "auxiliary_loss_mlp": 0.01030323, "balance_loss_clip": 1.05298531, "balance_loss_mlp": 1.02163255, "epoch": 0.36517765887091924, "flos": 32853487363200.0, "grad_norm": 1.7769811768693267, "language_loss": 0.70255965, "learning_rate": 2.9329463077191783e-06, "loss": 0.72452128, "num_input_tokens_seen": 65199305, "step": 3037, "time_per_iteration": 2.9407994747161865 }, { "auxiliary_loss_clip": 0.01134185, "auxiliary_loss_mlp": 0.01028907, "balance_loss_clip": 1.04877472, "balance_loss_mlp": 1.02030611, "epoch": 0.36529790176155835, "flos": 20120318282880.0, "grad_norm": 2.211960444867724, "language_loss": 0.64097971, "learning_rate": 2.9322572090093135e-06, "loss": 0.66261065, "num_input_tokens_seen": 65218010, "step": 3038, "time_per_iteration": 2.8294153213500977 }, { "auxiliary_loss_clip": 0.01133923, "auxiliary_loss_mlp": 0.01030244, "balance_loss_clip": 1.04891205, "balance_loss_mlp": 1.02175033, "epoch": 0.36541814465219746, "flos": 17639573379840.0, "grad_norm": 3.647554331010622, "language_loss": 0.76461101, "learning_rate": 2.9315679688767713e-06, "loss": 0.78625268, "num_input_tokens_seen": 65236020, "step": 3039, "time_per_iteration": 2.8355793952941895 }, { "auxiliary_loss_clip": 0.01161784, "auxiliary_loss_mlp": 0.01027185, "balance_loss_clip": 1.05158126, "balance_loss_mlp": 1.01924527, "epoch": 0.3655383875428365, "flos": 22674356887680.0, "grad_norm": 1.543477004064135, "language_loss": 0.66670322, "learning_rate": 2.9308785874261085e-06, "loss": 0.68859291, "num_input_tokens_seen": 65256210, "step": 3040, "time_per_iteration": 2.8440096378326416 }, { "auxiliary_loss_clip": 0.0119812, "auxiliary_loss_mlp": 0.01028893, "balance_loss_clip": 1.05913401, "balance_loss_mlp": 1.02025044, "epoch": 0.36565863043347563, "flos": 21981173247360.0, "grad_norm": 1.6425336510538318, "language_loss": 0.8169533, "learning_rate": 2.9301890647619045e-06, "loss": 0.8392235, "num_input_tokens_seen": 65275505, "step": 3041, "time_per_iteration": 2.7902135848999023 }, { "auxiliary_loss_clip": 0.01172245, "auxiliary_loss_mlp": 0.01027829, "balance_loss_clip": 1.05529749, "balance_loss_mlp": 1.01913881, "epoch": 0.36577887332411474, "flos": 24827632473600.0, "grad_norm": 2.1218694753116787, "language_loss": 0.80589843, "learning_rate": 2.929499400988759e-06, "loss": 0.82789922, "num_input_tokens_seen": 65296665, "step": 3042, "time_per_iteration": 2.818767786026001 }, { "auxiliary_loss_clip": 0.01179454, "auxiliary_loss_mlp": 0.0103167, "balance_loss_clip": 1.05665982, "balance_loss_mlp": 1.02252698, "epoch": 0.3658991162147538, "flos": 28293191539200.0, "grad_norm": 1.6459572618933778, "language_loss": 0.65432727, "learning_rate": 2.9288095962112927e-06, "loss": 0.67643851, "num_input_tokens_seen": 65317370, "step": 3043, "time_per_iteration": 2.8477816581726074 }, { "auxiliary_loss_clip": 0.01197074, "auxiliary_loss_mlp": 0.01029479, "balance_loss_clip": 1.05853009, "balance_loss_mlp": 1.02055573, "epoch": 0.3660193591053929, "flos": 17785550252160.0, "grad_norm": 1.94478186464327, "language_loss": 0.85252208, "learning_rate": 2.9281196505341503e-06, "loss": 0.87478757, "num_input_tokens_seen": 65334540, "step": 3044, "time_per_iteration": 2.767505407333374 }, { "auxiliary_loss_clip": 0.01136151, "auxiliary_loss_mlp": 0.00763724, "balance_loss_clip": 1.05300426, "balance_loss_mlp": 1.00021577, "epoch": 0.36613960199603196, "flos": 10342776839040.0, "grad_norm": 2.041767060352574, "language_loss": 0.78719819, "learning_rate": 2.9274295640619946e-06, "loss": 0.80619699, "num_input_tokens_seen": 65351670, "step": 3045, "time_per_iteration": 2.8605852127075195 }, { "auxiliary_loss_clip": 0.01150582, "auxiliary_loss_mlp": 0.0102851, "balance_loss_clip": 1.04909348, "balance_loss_mlp": 1.02076149, "epoch": 0.36625984488667107, "flos": 19755609540480.0, "grad_norm": 1.752359137090996, "language_loss": 0.7872802, "learning_rate": 2.9267393368995103e-06, "loss": 0.80907112, "num_input_tokens_seen": 65370900, "step": 3046, "time_per_iteration": 2.8611161708831787 }, { "auxiliary_loss_clip": 0.01196942, "auxiliary_loss_mlp": 0.01033517, "balance_loss_clip": 1.05752325, "balance_loss_mlp": 1.0250051, "epoch": 0.3663800877773102, "flos": 17674262939520.0, "grad_norm": 2.5661232337030944, "language_loss": 0.7488867, "learning_rate": 2.926048969151407e-06, "loss": 0.7711913, "num_input_tokens_seen": 65388185, "step": 3047, "time_per_iteration": 2.678215742111206 }, { "auxiliary_loss_clip": 0.01133008, "auxiliary_loss_mlp": 0.01026255, "balance_loss_clip": 1.0536648, "balance_loss_mlp": 1.0174334, "epoch": 0.36650033066794924, "flos": 20303606407680.0, "grad_norm": 1.7864753397349535, "language_loss": 0.68705332, "learning_rate": 2.92535846092241e-06, "loss": 0.70864594, "num_input_tokens_seen": 65407200, "step": 3048, "time_per_iteration": 2.851482391357422 }, { "auxiliary_loss_clip": 0.01168551, "auxiliary_loss_mlp": 0.01029984, "balance_loss_clip": 1.05483961, "balance_loss_mlp": 1.02090645, "epoch": 0.36662057355858835, "flos": 24716237420160.0, "grad_norm": 1.6883106797194478, "language_loss": 0.82673126, "learning_rate": 2.9246678123172704e-06, "loss": 0.84871656, "num_input_tokens_seen": 65427290, "step": 3049, "time_per_iteration": 2.8687281608581543 }, { "auxiliary_loss_clip": 0.01199931, "auxiliary_loss_mlp": 0.01029932, "balance_loss_clip": 1.05981731, "balance_loss_mlp": 1.02073526, "epoch": 0.36674081644922746, "flos": 12385267902720.0, "grad_norm": 2.203208828200931, "language_loss": 0.74581409, "learning_rate": 2.9239770234407596e-06, "loss": 0.76811272, "num_input_tokens_seen": 65445595, "step": 3050, "time_per_iteration": 2.662234306335449 }, { "auxiliary_loss_clip": 0.01182408, "auxiliary_loss_mlp": 0.01028291, "balance_loss_clip": 1.05566895, "balance_loss_mlp": 1.01949906, "epoch": 0.3668610593398665, "flos": 21105922544640.0, "grad_norm": 1.806790746651207, "language_loss": 0.68115407, "learning_rate": 2.9232860943976686e-06, "loss": 0.70326108, "num_input_tokens_seen": 65466330, "step": 3051, "time_per_iteration": 2.759047269821167 }, { "auxiliary_loss_clip": 0.0116463, "auxiliary_loss_mlp": 0.01028207, "balance_loss_clip": 1.05499923, "balance_loss_mlp": 1.01990354, "epoch": 0.3669813022305056, "flos": 26758082039040.0, "grad_norm": 1.6857267510734768, "language_loss": 0.84490538, "learning_rate": 2.9225950252928115e-06, "loss": 0.86683381, "num_input_tokens_seen": 65487180, "step": 3052, "time_per_iteration": 3.854109287261963 }, { "auxiliary_loss_clip": 0.01180262, "auxiliary_loss_mlp": 0.01036073, "balance_loss_clip": 1.05632687, "balance_loss_mlp": 1.02737653, "epoch": 0.36710154512114473, "flos": 19099521671040.0, "grad_norm": 2.9130872327647954, "language_loss": 0.81811649, "learning_rate": 2.9219038162310217e-06, "loss": 0.84027982, "num_input_tokens_seen": 65505380, "step": 3053, "time_per_iteration": 2.680817127227783 }, { "auxiliary_loss_clip": 0.01111145, "auxiliary_loss_mlp": 0.00764017, "balance_loss_clip": 1.04791605, "balance_loss_mlp": 1.00021982, "epoch": 0.3672217880117838, "flos": 20812029465600.0, "grad_norm": 1.9475270594694967, "language_loss": 0.82426548, "learning_rate": 2.921212467317157e-06, "loss": 0.8430171, "num_input_tokens_seen": 65524825, "step": 3054, "time_per_iteration": 2.9275104999542236 }, { "auxiliary_loss_clip": 0.01150354, "auxiliary_loss_mlp": 0.01036795, "balance_loss_clip": 1.04862833, "balance_loss_mlp": 1.02756238, "epoch": 0.3673420309024229, "flos": 13590394133760.0, "grad_norm": 1.8045060207874672, "language_loss": 0.8026917, "learning_rate": 2.920520978656093e-06, "loss": 0.82456326, "num_input_tokens_seen": 65541790, "step": 3055, "time_per_iteration": 3.610708236694336 }, { "auxiliary_loss_clip": 0.01193588, "auxiliary_loss_mlp": 0.00763872, "balance_loss_clip": 1.0563587, "balance_loss_mlp": 1.00020671, "epoch": 0.367462273793062, "flos": 28986877969920.0, "grad_norm": 1.9426802500743918, "language_loss": 0.76685506, "learning_rate": 2.919829350352729e-06, "loss": 0.78642964, "num_input_tokens_seen": 65563395, "step": 3056, "time_per_iteration": 2.7628262042999268 }, { "auxiliary_loss_clip": 0.01100171, "auxiliary_loss_mlp": 0.01003911, "balance_loss_clip": 1.02711916, "balance_loss_mlp": 1.00236118, "epoch": 0.36758251668370107, "flos": 62643148346880.0, "grad_norm": 0.7555108572020115, "language_loss": 0.5992595, "learning_rate": 2.919137582511983e-06, "loss": 0.62030029, "num_input_tokens_seen": 65619835, "step": 3057, "time_per_iteration": 3.1424074172973633 }, { "auxiliary_loss_clip": 0.01161529, "auxiliary_loss_mlp": 0.01028168, "balance_loss_clip": 1.05715001, "balance_loss_mlp": 1.01949573, "epoch": 0.3677027595743402, "flos": 12713886455040.0, "grad_norm": 1.7508229426224924, "language_loss": 0.64130658, "learning_rate": 2.918445675238797e-06, "loss": 0.66320354, "num_input_tokens_seen": 65636760, "step": 3058, "time_per_iteration": 2.6767497062683105 }, { "auxiliary_loss_clip": 0.0119555, "auxiliary_loss_mlp": 0.01031318, "balance_loss_clip": 1.05581057, "balance_loss_mlp": 1.02350426, "epoch": 0.36782300246497923, "flos": 25046579825280.0, "grad_norm": 2.424419482803805, "language_loss": 0.70084631, "learning_rate": 2.917753628638132e-06, "loss": 0.72311497, "num_input_tokens_seen": 65657065, "step": 3059, "time_per_iteration": 2.6764109134674072 }, { "auxiliary_loss_clip": 0.01168892, "auxiliary_loss_mlp": 0.01035045, "balance_loss_clip": 1.05538952, "balance_loss_mlp": 1.02663481, "epoch": 0.36794324535561834, "flos": 17419512706560.0, "grad_norm": 2.18570161282597, "language_loss": 0.70237249, "learning_rate": 2.9170614428149716e-06, "loss": 0.72441185, "num_input_tokens_seen": 65675400, "step": 3060, "time_per_iteration": 4.51049017906189 }, { "auxiliary_loss_clip": 0.01152551, "auxiliary_loss_mlp": 0.01030423, "balance_loss_clip": 1.05562949, "balance_loss_mlp": 1.02139866, "epoch": 0.36806348824625745, "flos": 24089128848000.0, "grad_norm": 2.192932537871339, "language_loss": 0.8638804, "learning_rate": 2.9163691178743195e-06, "loss": 0.88571012, "num_input_tokens_seen": 65694050, "step": 3061, "time_per_iteration": 2.750082015991211 }, { "auxiliary_loss_clip": 0.01179095, "auxiliary_loss_mlp": 0.01027644, "balance_loss_clip": 1.05640459, "balance_loss_mlp": 1.01963329, "epoch": 0.3681837311368965, "flos": 20521871400960.0, "grad_norm": 2.0633169664747215, "language_loss": 0.77733457, "learning_rate": 2.9156766539212006e-06, "loss": 0.799402, "num_input_tokens_seen": 65711695, "step": 3062, "time_per_iteration": 2.6384732723236084 }, { "auxiliary_loss_clip": 0.01182585, "auxiliary_loss_mlp": 0.01033204, "balance_loss_clip": 1.05304742, "balance_loss_mlp": 1.02489471, "epoch": 0.3683039740275356, "flos": 21466644877440.0, "grad_norm": 2.172812970818998, "language_loss": 0.72023094, "learning_rate": 2.9149840510606614e-06, "loss": 0.74238884, "num_input_tokens_seen": 65730350, "step": 3063, "time_per_iteration": 2.707871675491333 }, { "auxiliary_loss_clip": 0.01082311, "auxiliary_loss_mlp": 0.00755715, "balance_loss_clip": 1.02279305, "balance_loss_mlp": 1.00041103, "epoch": 0.36842421691817473, "flos": 70380999987840.0, "grad_norm": 2.2307007250479054, "language_loss": 0.64146864, "learning_rate": 2.914291309397769e-06, "loss": 0.65984887, "num_input_tokens_seen": 65787820, "step": 3064, "time_per_iteration": 3.4315669536590576 }, { "auxiliary_loss_clip": 0.01109501, "auxiliary_loss_mlp": 0.01028949, "balance_loss_clip": 1.04499769, "balance_loss_mlp": 1.02057493, "epoch": 0.3685444598088138, "flos": 23331378510720.0, "grad_norm": 2.148740462374405, "language_loss": 0.78882915, "learning_rate": 2.9135984290376117e-06, "loss": 0.81021363, "num_input_tokens_seen": 65806685, "step": 3065, "time_per_iteration": 2.779705762863159 }, { "auxiliary_loss_clip": 0.0111795, "auxiliary_loss_mlp": 0.01030711, "balance_loss_clip": 1.04649103, "balance_loss_mlp": 1.02173996, "epoch": 0.3686647026994529, "flos": 23070271570560.0, "grad_norm": 1.692092650318129, "language_loss": 0.82916558, "learning_rate": 2.9129054100853e-06, "loss": 0.85065216, "num_input_tokens_seen": 65825525, "step": 3066, "time_per_iteration": 2.8168540000915527 }, { "auxiliary_loss_clip": 0.01167685, "auxiliary_loss_mlp": 0.01032408, "balance_loss_clip": 1.05275428, "balance_loss_mlp": 1.02381933, "epoch": 0.368784945590092, "flos": 25119909440640.0, "grad_norm": 1.640213611373079, "language_loss": 0.76296139, "learning_rate": 2.912212252645963e-06, "loss": 0.78496236, "num_input_tokens_seen": 65848110, "step": 3067, "time_per_iteration": 2.685175895690918 }, { "auxiliary_loss_clip": 0.01184961, "auxiliary_loss_mlp": 0.01030642, "balance_loss_clip": 1.05390596, "balance_loss_mlp": 1.02162969, "epoch": 0.36890518848073106, "flos": 18442284566400.0, "grad_norm": 1.994615964227408, "language_loss": 0.76792657, "learning_rate": 2.9115189568247523e-06, "loss": 0.79008257, "num_input_tokens_seen": 65865670, "step": 3068, "time_per_iteration": 2.6370818614959717 }, { "auxiliary_loss_clip": 0.01133791, "auxiliary_loss_mlp": 0.01033506, "balance_loss_clip": 1.05685234, "balance_loss_mlp": 1.02522671, "epoch": 0.36902543137137017, "flos": 16362446336640.0, "grad_norm": 1.8809974313331506, "language_loss": 0.92508709, "learning_rate": 2.910825522726841e-06, "loss": 0.94676006, "num_input_tokens_seen": 65883195, "step": 3069, "time_per_iteration": 2.793888807296753 }, { "auxiliary_loss_clip": 0.0112873, "auxiliary_loss_mlp": 0.0102808, "balance_loss_clip": 1.04677069, "balance_loss_mlp": 1.01959836, "epoch": 0.3691456742620093, "flos": 12275596702080.0, "grad_norm": 2.195110598555077, "language_loss": 0.77404684, "learning_rate": 2.9101319504574215e-06, "loss": 0.79561496, "num_input_tokens_seen": 65899635, "step": 3070, "time_per_iteration": 2.658608913421631 }, { "auxiliary_loss_clip": 0.01169611, "auxiliary_loss_mlp": 0.01035667, "balance_loss_clip": 1.04989696, "balance_loss_mlp": 1.02670884, "epoch": 0.36926591715264834, "flos": 17786412178560.0, "grad_norm": 1.6130309965195073, "language_loss": 0.76268196, "learning_rate": 2.909438240121709e-06, "loss": 0.78473473, "num_input_tokens_seen": 65919910, "step": 3071, "time_per_iteration": 2.746612787246704 }, { "auxiliary_loss_clip": 0.01161052, "auxiliary_loss_mlp": 0.01028342, "balance_loss_clip": 1.05272245, "balance_loss_mlp": 1.01959229, "epoch": 0.36938616004328745, "flos": 28948309741440.0, "grad_norm": 1.7529545056745583, "language_loss": 0.70313042, "learning_rate": 2.908744391824939e-06, "loss": 0.72502434, "num_input_tokens_seen": 65940930, "step": 3072, "time_per_iteration": 2.713883638381958 }, { "auxiliary_loss_clip": 0.01123527, "auxiliary_loss_mlp": 0.01029216, "balance_loss_clip": 1.04567122, "balance_loss_mlp": 1.02093744, "epoch": 0.36950640293392656, "flos": 29205394358400.0, "grad_norm": 1.7241076735374798, "language_loss": 0.7927959, "learning_rate": 2.908050405672367e-06, "loss": 0.81432331, "num_input_tokens_seen": 65960475, "step": 3073, "time_per_iteration": 2.910400867462158 }, { "auxiliary_loss_clip": 0.01171893, "auxiliary_loss_mlp": 0.01030202, "balance_loss_clip": 1.0501771, "balance_loss_mlp": 1.02226233, "epoch": 0.3696266458245656, "flos": 24827776128000.0, "grad_norm": 1.8169211174192257, "language_loss": 0.79017371, "learning_rate": 2.9073562817692703e-06, "loss": 0.81219465, "num_input_tokens_seen": 65979160, "step": 3074, "time_per_iteration": 2.7333781719207764 }, { "auxiliary_loss_clip": 0.01056736, "auxiliary_loss_mlp": 0.01009013, "balance_loss_clip": 1.02024841, "balance_loss_mlp": 1.00737977, "epoch": 0.3697468887152047, "flos": 59887257264000.0, "grad_norm": 0.7213550041027801, "language_loss": 0.56526554, "learning_rate": 2.9066620202209468e-06, "loss": 0.58592308, "num_input_tokens_seen": 66041650, "step": 3075, "time_per_iteration": 3.25860595703125 }, { "auxiliary_loss_clip": 0.01141447, "auxiliary_loss_mlp": 0.01028125, "balance_loss_clip": 1.04960465, "balance_loss_mlp": 1.01913035, "epoch": 0.3698671316058438, "flos": 26137581569280.0, "grad_norm": 2.1337151708874322, "language_loss": 0.77557176, "learning_rate": 2.905967621132716e-06, "loss": 0.7972675, "num_input_tokens_seen": 66059260, "step": 3076, "time_per_iteration": 2.6882569789886475 }, { "auxiliary_loss_clip": 0.0117004, "auxiliary_loss_mlp": 0.01031588, "balance_loss_clip": 1.05203331, "balance_loss_mlp": 1.02190828, "epoch": 0.3699873744964829, "flos": 24607464059520.0, "grad_norm": 2.0340322670214355, "language_loss": 0.75474054, "learning_rate": 2.9052730846099172e-06, "loss": 0.77675676, "num_input_tokens_seen": 66080605, "step": 3077, "time_per_iteration": 2.8291192054748535 }, { "auxiliary_loss_clip": 0.01074273, "auxiliary_loss_mlp": 0.01000686, "balance_loss_clip": 1.02217853, "balance_loss_mlp": 0.99902898, "epoch": 0.370107617387122, "flos": 64885340050560.0, "grad_norm": 0.8619617994667297, "language_loss": 0.60957265, "learning_rate": 2.9045784107579123e-06, "loss": 0.63032222, "num_input_tokens_seen": 66140710, "step": 3078, "time_per_iteration": 4.373821258544922 }, { "auxiliary_loss_clip": 0.01195788, "auxiliary_loss_mlp": 0.01025729, "balance_loss_clip": 1.05781329, "balance_loss_mlp": 1.01665759, "epoch": 0.37022786027776106, "flos": 15961683317760.0, "grad_norm": 1.994230855456401, "language_loss": 0.67068684, "learning_rate": 2.9038835996820807e-06, "loss": 0.69290197, "num_input_tokens_seen": 66158320, "step": 3079, "time_per_iteration": 2.7236931324005127 }, { "auxiliary_loss_clip": 0.01155942, "auxiliary_loss_mlp": 0.01039903, "balance_loss_clip": 1.04866683, "balance_loss_mlp": 1.03131974, "epoch": 0.37034810316840017, "flos": 18546927863040.0, "grad_norm": 1.9941432821420537, "language_loss": 0.79834127, "learning_rate": 2.903188651487826e-06, "loss": 0.82029974, "num_input_tokens_seen": 66176875, "step": 3080, "time_per_iteration": 2.680091142654419 }, { "auxiliary_loss_clip": 0.01184626, "auxiliary_loss_mlp": 0.01030138, "balance_loss_clip": 1.05577981, "balance_loss_mlp": 1.02216315, "epoch": 0.3704683460590393, "flos": 17821927751040.0, "grad_norm": 2.2960773612329253, "language_loss": 0.86952579, "learning_rate": 2.902493566280571e-06, "loss": 0.89167345, "num_input_tokens_seen": 66194980, "step": 3081, "time_per_iteration": 3.516897439956665 }, { "auxiliary_loss_clip": 0.01164699, "auxiliary_loss_mlp": 0.0104016, "balance_loss_clip": 1.05263412, "balance_loss_mlp": 1.03124285, "epoch": 0.37058858894967833, "flos": 14134081368960.0, "grad_norm": 2.0452844680893874, "language_loss": 0.81307817, "learning_rate": 2.9017983441657595e-06, "loss": 0.8351267, "num_input_tokens_seen": 66212310, "step": 3082, "time_per_iteration": 2.6495707035064697 }, { "auxiliary_loss_clip": 0.01138117, "auxiliary_loss_mlp": 0.01032239, "balance_loss_clip": 1.04749334, "balance_loss_mlp": 1.02363181, "epoch": 0.37070883184031744, "flos": 13954492344960.0, "grad_norm": 3.039449288428381, "language_loss": 0.74758214, "learning_rate": 2.9011029852488564e-06, "loss": 0.76928568, "num_input_tokens_seen": 66229545, "step": 3083, "time_per_iteration": 2.7994070053100586 }, { "auxiliary_loss_clip": 0.0109503, "auxiliary_loss_mlp": 0.01000863, "balance_loss_clip": 1.02243996, "balance_loss_mlp": 0.99924183, "epoch": 0.37082907473095655, "flos": 52315419306240.0, "grad_norm": 0.9919923221261641, "language_loss": 0.62466234, "learning_rate": 2.9004074896353465e-06, "loss": 0.64562124, "num_input_tokens_seen": 66283545, "step": 3084, "time_per_iteration": 3.100482225418091 }, { "auxiliary_loss_clip": 0.01196637, "auxiliary_loss_mlp": 0.0102684, "balance_loss_clip": 1.0603919, "balance_loss_mlp": 1.01791084, "epoch": 0.3709493176215956, "flos": 15998096730240.0, "grad_norm": 1.7790861009011831, "language_loss": 0.81869829, "learning_rate": 2.8997118574307362e-06, "loss": 0.84093308, "num_input_tokens_seen": 66300500, "step": 3085, "time_per_iteration": 3.525451898574829 }, { "auxiliary_loss_clip": 0.01159554, "auxiliary_loss_mlp": 0.01028474, "balance_loss_clip": 1.05384707, "balance_loss_mlp": 1.0198015, "epoch": 0.3710695605122347, "flos": 20959837931520.0, "grad_norm": 2.4595475372905358, "language_loss": 0.74485022, "learning_rate": 2.899016088740553e-06, "loss": 0.76673049, "num_input_tokens_seen": 66318610, "step": 3086, "time_per_iteration": 3.524543523788452 }, { "auxiliary_loss_clip": 0.01136116, "auxiliary_loss_mlp": 0.01032024, "balance_loss_clip": 1.05152333, "balance_loss_mlp": 1.02358413, "epoch": 0.37118980340287383, "flos": 14355578586240.0, "grad_norm": 2.1117755790104393, "language_loss": 0.79866171, "learning_rate": 2.898320183670344e-06, "loss": 0.82034314, "num_input_tokens_seen": 66336025, "step": 3087, "time_per_iteration": 2.74395489692688 }, { "auxiliary_loss_clip": 0.01136176, "auxiliary_loss_mlp": 0.01029502, "balance_loss_clip": 1.05065429, "balance_loss_mlp": 1.02097845, "epoch": 0.3713100462935129, "flos": 25885381201920.0, "grad_norm": 1.7130075465696646, "language_loss": 0.89045918, "learning_rate": 2.8976241423256767e-06, "loss": 0.91211605, "num_input_tokens_seen": 66356120, "step": 3088, "time_per_iteration": 2.8897385597229004 }, { "auxiliary_loss_clip": 0.01161198, "auxiliary_loss_mlp": 0.01031227, "balance_loss_clip": 1.052858, "balance_loss_mlp": 1.02335274, "epoch": 0.371430289184152, "flos": 30518934814080.0, "grad_norm": 2.8403154910916992, "language_loss": 0.68163151, "learning_rate": 2.896927964812142e-06, "loss": 0.7035557, "num_input_tokens_seen": 66376685, "step": 3089, "time_per_iteration": 2.729454278945923 }, { "auxiliary_loss_clip": 0.0116921, "auxiliary_loss_mlp": 0.0102852, "balance_loss_clip": 1.05689132, "balance_loss_mlp": 1.01962709, "epoch": 0.37155053207479105, "flos": 15742233175680.0, "grad_norm": 2.6993800336594327, "language_loss": 0.75011301, "learning_rate": 2.8962316512353465e-06, "loss": 0.77209032, "num_input_tokens_seen": 66394230, "step": 3090, "time_per_iteration": 2.6533782482147217 }, { "auxiliary_loss_clip": 0.01118095, "auxiliary_loss_mlp": 0.01027062, "balance_loss_clip": 1.04640317, "balance_loss_mlp": 1.01889658, "epoch": 0.37167077496543016, "flos": 23404061681280.0, "grad_norm": 3.432138814668347, "language_loss": 0.74989343, "learning_rate": 2.8955352017009233e-06, "loss": 0.77134502, "num_input_tokens_seen": 66413475, "step": 3091, "time_per_iteration": 2.836380958557129 }, { "auxiliary_loss_clip": 0.01167266, "auxiliary_loss_mlp": 0.01028031, "balance_loss_clip": 1.056898, "balance_loss_mlp": 1.01953745, "epoch": 0.3717910178560693, "flos": 22088653718400.0, "grad_norm": 2.4708831387069514, "language_loss": 0.77352983, "learning_rate": 2.8948386163145212e-06, "loss": 0.79548281, "num_input_tokens_seen": 66432685, "step": 3092, "time_per_iteration": 2.6632354259490967 }, { "auxiliary_loss_clip": 0.011867, "auxiliary_loss_mlp": 0.01031044, "balance_loss_clip": 1.05734193, "balance_loss_mlp": 1.02306867, "epoch": 0.3719112607467083, "flos": 26939969533440.0, "grad_norm": 1.8941405710429402, "language_loss": 0.79500437, "learning_rate": 2.8941418951818135e-06, "loss": 0.81718183, "num_input_tokens_seen": 66452245, "step": 3093, "time_per_iteration": 2.7025389671325684 }, { "auxiliary_loss_clip": 0.01153771, "auxiliary_loss_mlp": 0.01032169, "balance_loss_clip": 1.05309701, "balance_loss_mlp": 1.02386045, "epoch": 0.37203150363734744, "flos": 12166500119040.0, "grad_norm": 2.59169488633503, "language_loss": 0.71184647, "learning_rate": 2.8934450384084903e-06, "loss": 0.73370594, "num_input_tokens_seen": 66469760, "step": 3094, "time_per_iteration": 2.811070680618286 }, { "auxiliary_loss_clip": 0.01157357, "auxiliary_loss_mlp": 0.01034165, "balance_loss_clip": 1.04926109, "balance_loss_mlp": 1.02490807, "epoch": 0.37215174652798655, "flos": 23697595624320.0, "grad_norm": 2.0298270751156875, "language_loss": 0.69744933, "learning_rate": 2.8927480461002653e-06, "loss": 0.71936452, "num_input_tokens_seen": 66489730, "step": 3095, "time_per_iteration": 2.7436277866363525 }, { "auxiliary_loss_clip": 0.01163562, "auxiliary_loss_mlp": 0.01033734, "balance_loss_clip": 1.05045223, "balance_loss_mlp": 1.02437592, "epoch": 0.3722719894186256, "flos": 17887751424000.0, "grad_norm": 2.345533385277675, "language_loss": 0.8597436, "learning_rate": 2.892050918362872e-06, "loss": 0.88171661, "num_input_tokens_seen": 66504785, "step": 3096, "time_per_iteration": 2.630793809890747 }, { "auxiliary_loss_clip": 0.01021836, "auxiliary_loss_mlp": 0.01008044, "balance_loss_clip": 1.01770747, "balance_loss_mlp": 1.00642288, "epoch": 0.3723922323092647, "flos": 62419891363200.0, "grad_norm": 0.8455447584831847, "language_loss": 0.55807292, "learning_rate": 2.8913536553020626e-06, "loss": 0.57837176, "num_input_tokens_seen": 66558840, "step": 3097, "time_per_iteration": 3.3812026977539062 }, { "auxiliary_loss_clip": 0.01126562, "auxiliary_loss_mlp": 0.01029088, "balance_loss_clip": 1.04595876, "balance_loss_mlp": 1.02080858, "epoch": 0.3725124751999038, "flos": 23039747988480.0, "grad_norm": 1.9735876152899559, "language_loss": 0.85033166, "learning_rate": 2.8906562570236137e-06, "loss": 0.87188816, "num_input_tokens_seen": 66576750, "step": 3098, "time_per_iteration": 2.9260549545288086 }, { "auxiliary_loss_clip": 0.01121096, "auxiliary_loss_mlp": 0.01030152, "balance_loss_clip": 1.04550052, "balance_loss_mlp": 1.02180123, "epoch": 0.3726327180905429, "flos": 20920551431040.0, "grad_norm": 1.4857896792319343, "language_loss": 0.76524508, "learning_rate": 2.889958723633318e-06, "loss": 0.78675759, "num_input_tokens_seen": 66595690, "step": 3099, "time_per_iteration": 2.817406415939331 }, { "auxiliary_loss_clip": 0.01151893, "auxiliary_loss_mlp": 0.01035688, "balance_loss_clip": 1.05190349, "balance_loss_mlp": 1.02804637, "epoch": 0.372752960981182, "flos": 30592156688640.0, "grad_norm": 1.64983343824259, "language_loss": 0.73755586, "learning_rate": 2.889261055236992e-06, "loss": 0.75943166, "num_input_tokens_seen": 66617905, "step": 3100, "time_per_iteration": 2.8219411373138428 }, { "auxiliary_loss_clip": 0.01162799, "auxiliary_loss_mlp": 0.01028478, "balance_loss_clip": 1.05445969, "balance_loss_mlp": 1.0194602, "epoch": 0.3728732038718211, "flos": 25116749043840.0, "grad_norm": 1.7902986495082749, "language_loss": 0.82910311, "learning_rate": 2.8885632519404704e-06, "loss": 0.85101593, "num_input_tokens_seen": 66638175, "step": 3101, "time_per_iteration": 2.7250545024871826 }, { "auxiliary_loss_clip": 0.01162883, "auxiliary_loss_mlp": 0.01029365, "balance_loss_clip": 1.05293632, "balance_loss_mlp": 1.02100217, "epoch": 0.37299344676246016, "flos": 25302048330240.0, "grad_norm": 1.9237264746332927, "language_loss": 0.75658518, "learning_rate": 2.8878653138496107e-06, "loss": 0.77850771, "num_input_tokens_seen": 66658670, "step": 3102, "time_per_iteration": 2.7599291801452637 }, { "auxiliary_loss_clip": 0.01115292, "auxiliary_loss_mlp": 0.01023896, "balance_loss_clip": 1.04254127, "balance_loss_mlp": 1.01519382, "epoch": 0.37311368965309927, "flos": 23842531002240.0, "grad_norm": 2.6006699952504824, "language_loss": 0.76203072, "learning_rate": 2.8871672410702878e-06, "loss": 0.78342259, "num_input_tokens_seen": 66676030, "step": 3103, "time_per_iteration": 2.786940336227417 }, { "auxiliary_loss_clip": 0.01157996, "auxiliary_loss_mlp": 0.0103372, "balance_loss_clip": 1.05008709, "balance_loss_mlp": 1.02460623, "epoch": 0.3732339325437384, "flos": 25811943845760.0, "grad_norm": 1.918152970527542, "language_loss": 0.82559925, "learning_rate": 2.8864690337084008e-06, "loss": 0.84751642, "num_input_tokens_seen": 66695305, "step": 3104, "time_per_iteration": 3.782810926437378 }, { "auxiliary_loss_clip": 0.01173969, "auxiliary_loss_mlp": 0.01030762, "balance_loss_clip": 1.05192888, "balance_loss_mlp": 1.02281082, "epoch": 0.37335417543437743, "flos": 26208433146240.0, "grad_norm": 2.4380417375740806, "language_loss": 0.78052223, "learning_rate": 2.885770691869866e-06, "loss": 0.80256957, "num_input_tokens_seen": 66716185, "step": 3105, "time_per_iteration": 2.8215389251708984 }, { "auxiliary_loss_clip": 0.01177016, "auxiliary_loss_mlp": 0.0102864, "balance_loss_clip": 1.05439639, "balance_loss_mlp": 1.02098131, "epoch": 0.37347441832501654, "flos": 24023879792640.0, "grad_norm": 2.0087004852746544, "language_loss": 0.7424919, "learning_rate": 2.8850722156606207e-06, "loss": 0.76454842, "num_input_tokens_seen": 66734575, "step": 3106, "time_per_iteration": 2.674243688583374 }, { "auxiliary_loss_clip": 0.01171183, "auxiliary_loss_mlp": 0.01030642, "balance_loss_clip": 1.05207372, "balance_loss_mlp": 1.02232075, "epoch": 0.3735946612156556, "flos": 19714922409600.0, "grad_norm": 1.5450383144588031, "language_loss": 0.67305291, "learning_rate": 2.8843736051866252e-06, "loss": 0.6950711, "num_input_tokens_seen": 66753500, "step": 3107, "time_per_iteration": 3.5739047527313232 }, { "auxiliary_loss_clip": 0.01131353, "auxiliary_loss_mlp": 0.00763624, "balance_loss_clip": 1.04784453, "balance_loss_mlp": 1.00021279, "epoch": 0.3737149041062947, "flos": 23039604334080.0, "grad_norm": 1.5283885625990974, "language_loss": 0.6932227, "learning_rate": 2.8836748605538557e-06, "loss": 0.71217245, "num_input_tokens_seen": 66775140, "step": 3108, "time_per_iteration": 2.7971391677856445 }, { "auxiliary_loss_clip": 0.01167554, "auxiliary_loss_mlp": 0.01030753, "balance_loss_clip": 1.05192971, "balance_loss_mlp": 1.02243829, "epoch": 0.3738351469969338, "flos": 34678108483200.0, "grad_norm": 4.863172078664337, "language_loss": 0.63635671, "learning_rate": 2.882975981868313e-06, "loss": 0.6583398, "num_input_tokens_seen": 66795525, "step": 3109, "time_per_iteration": 2.8542168140411377 }, { "auxiliary_loss_clip": 0.01179868, "auxiliary_loss_mlp": 0.0102696, "balance_loss_clip": 1.05568862, "balance_loss_mlp": 1.01879418, "epoch": 0.3739553898875729, "flos": 43507967448960.0, "grad_norm": 2.3676359308169825, "language_loss": 0.68668866, "learning_rate": 2.882276969236016e-06, "loss": 0.70875692, "num_input_tokens_seen": 66816885, "step": 3110, "time_per_iteration": 2.868337392807007 }, { "auxiliary_loss_clip": 0.01158373, "auxiliary_loss_mlp": 0.01035216, "balance_loss_clip": 1.05120826, "balance_loss_mlp": 1.02680612, "epoch": 0.374075632778212, "flos": 12856487448960.0, "grad_norm": 1.881200579700236, "language_loss": 0.762061, "learning_rate": 2.881577822763005e-06, "loss": 0.78399694, "num_input_tokens_seen": 66834835, "step": 3111, "time_per_iteration": 3.5591001510620117 }, { "auxiliary_loss_clip": 0.01176791, "auxiliary_loss_mlp": 0.01023758, "balance_loss_clip": 1.05259013, "balance_loss_mlp": 1.0156157, "epoch": 0.3741958756688511, "flos": 26024031699840.0, "grad_norm": 1.8963390414213954, "language_loss": 0.87422931, "learning_rate": 2.880878542555338e-06, "loss": 0.89623475, "num_input_tokens_seen": 66852600, "step": 3112, "time_per_iteration": 3.618725061416626 }, { "auxiliary_loss_clip": 0.01193453, "auxiliary_loss_mlp": 0.01028611, "balance_loss_clip": 1.05467439, "balance_loss_mlp": 1.01958048, "epoch": 0.37431611855949015, "flos": 21433894652160.0, "grad_norm": 1.9774108060355124, "language_loss": 0.80533004, "learning_rate": 2.8801791287190976e-06, "loss": 0.82755065, "num_input_tokens_seen": 66870595, "step": 3113, "time_per_iteration": 2.633082866668701 }, { "auxiliary_loss_clip": 0.01181986, "auxiliary_loss_mlp": 0.01030075, "balance_loss_clip": 1.05386019, "balance_loss_mlp": 1.0211339, "epoch": 0.37443636145012926, "flos": 24207096090240.0, "grad_norm": 4.262007071234999, "language_loss": 0.86295319, "learning_rate": 2.8794795813603817e-06, "loss": 0.88507378, "num_input_tokens_seen": 66886060, "step": 3114, "time_per_iteration": 2.662886381149292 }, { "auxiliary_loss_clip": 0.01181132, "auxiliary_loss_mlp": 0.01032354, "balance_loss_clip": 1.05161321, "balance_loss_mlp": 1.02403355, "epoch": 0.3745566043407684, "flos": 15378601841280.0, "grad_norm": 1.8196089630324046, "language_loss": 0.8162294, "learning_rate": 2.878779900585314e-06, "loss": 0.8383643, "num_input_tokens_seen": 66903900, "step": 3115, "time_per_iteration": 2.6496787071228027 }, { "auxiliary_loss_clip": 0.01172023, "auxiliary_loss_mlp": 0.01031443, "balance_loss_clip": 1.05440211, "balance_loss_mlp": 1.02310419, "epoch": 0.37467684723140743, "flos": 24608218245120.0, "grad_norm": 1.4904887391934618, "language_loss": 0.75238293, "learning_rate": 2.8780800865000336e-06, "loss": 0.77441758, "num_input_tokens_seen": 66925210, "step": 3116, "time_per_iteration": 2.7056055068969727 }, { "auxiliary_loss_clip": 0.0108247, "auxiliary_loss_mlp": 0.01000789, "balance_loss_clip": 1.02050471, "balance_loss_mlp": 0.9992038, "epoch": 0.37479709012204654, "flos": 64377491610240.0, "grad_norm": 0.9760962482515769, "language_loss": 0.5918715, "learning_rate": 2.877380139210702e-06, "loss": 0.61270404, "num_input_tokens_seen": 66983880, "step": 3117, "time_per_iteration": 3.275131940841675 }, { "auxiliary_loss_clip": 0.01157959, "auxiliary_loss_mlp": 0.01035616, "balance_loss_clip": 1.0536319, "balance_loss_mlp": 1.02681828, "epoch": 0.37491733301268565, "flos": 23803962773760.0, "grad_norm": 1.807271101212916, "language_loss": 0.76701802, "learning_rate": 2.876680058823501e-06, "loss": 0.78895378, "num_input_tokens_seen": 67004280, "step": 3118, "time_per_iteration": 2.727661609649658 }, { "auxiliary_loss_clip": 0.01156902, "auxiliary_loss_mlp": 0.01031574, "balance_loss_clip": 1.05291247, "balance_loss_mlp": 1.02253234, "epoch": 0.3750375759033247, "flos": 32160950167680.0, "grad_norm": 2.1904064825227945, "language_loss": 0.66449189, "learning_rate": 2.8759798454446314e-06, "loss": 0.68637669, "num_input_tokens_seen": 67027445, "step": 3119, "time_per_iteration": 2.7748589515686035 }, { "auxiliary_loss_clip": 0.01182183, "auxiliary_loss_mlp": 0.01034159, "balance_loss_clip": 1.05466759, "balance_loss_mlp": 1.02561164, "epoch": 0.3751578187939638, "flos": 23367791923200.0, "grad_norm": 1.7697248166125905, "language_loss": 0.81505036, "learning_rate": 2.8752794991803173e-06, "loss": 0.83721375, "num_input_tokens_seen": 67045130, "step": 3120, "time_per_iteration": 2.7114031314849854 }, { "auxiliary_loss_clip": 0.01163649, "auxiliary_loss_mlp": 0.01027532, "balance_loss_clip": 1.05263603, "balance_loss_mlp": 1.01887143, "epoch": 0.37527806168460287, "flos": 14605731878400.0, "grad_norm": 2.026411523433748, "language_loss": 0.75176764, "learning_rate": 2.8745790201367976e-06, "loss": 0.77367949, "num_input_tokens_seen": 67060885, "step": 3121, "time_per_iteration": 2.587972640991211 }, { "auxiliary_loss_clip": 0.01196371, "auxiliary_loss_mlp": 0.01032436, "balance_loss_clip": 1.05602455, "balance_loss_mlp": 1.0232389, "epoch": 0.375398304575242, "flos": 26390823431040.0, "grad_norm": 8.577034695704883, "language_loss": 0.84443271, "learning_rate": 2.8738784084203373e-06, "loss": 0.86672074, "num_input_tokens_seen": 67080960, "step": 3122, "time_per_iteration": 2.672083854675293 }, { "auxiliary_loss_clip": 0.01155512, "auxiliary_loss_mlp": 0.01030859, "balance_loss_clip": 1.04764867, "balance_loss_mlp": 1.02259135, "epoch": 0.3755185474658811, "flos": 22236605838720.0, "grad_norm": 1.7553019653810469, "language_loss": 0.79029465, "learning_rate": 2.873177664137216e-06, "loss": 0.81215835, "num_input_tokens_seen": 67101890, "step": 3123, "time_per_iteration": 2.6508126258850098 }, { "auxiliary_loss_clip": 0.0114958, "auxiliary_loss_mlp": 0.01036668, "balance_loss_clip": 1.05331016, "balance_loss_mlp": 1.02800751, "epoch": 0.37563879035652015, "flos": 30812935633920.0, "grad_norm": 1.7501432794802991, "language_loss": 0.69329947, "learning_rate": 2.8724767873937384e-06, "loss": 0.71516192, "num_input_tokens_seen": 67126010, "step": 3124, "time_per_iteration": 2.8310606479644775 }, { "auxiliary_loss_clip": 0.01164438, "auxiliary_loss_mlp": 0.01030889, "balance_loss_clip": 1.05159676, "balance_loss_mlp": 1.02275908, "epoch": 0.37575903324715926, "flos": 20773533064320.0, "grad_norm": 2.581565948390894, "language_loss": 0.87481266, "learning_rate": 2.871775778296225e-06, "loss": 0.89676595, "num_input_tokens_seen": 67143100, "step": 3125, "time_per_iteration": 2.6660566329956055 }, { "auxiliary_loss_clip": 0.01180991, "auxiliary_loss_mlp": 0.01027032, "balance_loss_clip": 1.05578864, "balance_loss_mlp": 1.01729882, "epoch": 0.37587927613779837, "flos": 18697681244160.0, "grad_norm": 2.0610033912095265, "language_loss": 0.78552485, "learning_rate": 2.8710746369510196e-06, "loss": 0.80760509, "num_input_tokens_seen": 67161085, "step": 3126, "time_per_iteration": 2.667849540710449 }, { "auxiliary_loss_clip": 0.01162476, "auxiliary_loss_mlp": 0.01029073, "balance_loss_clip": 1.05355203, "balance_loss_mlp": 1.01976824, "epoch": 0.3759995190284374, "flos": 13624796384640.0, "grad_norm": 2.6049600964994912, "language_loss": 0.83501852, "learning_rate": 2.8703733634644846e-06, "loss": 0.85693395, "num_input_tokens_seen": 67175840, "step": 3127, "time_per_iteration": 2.6155519485473633 }, { "auxiliary_loss_clip": 0.01198274, "auxiliary_loss_mlp": 0.01027662, "balance_loss_clip": 1.05948746, "balance_loss_mlp": 1.0197407, "epoch": 0.37611976191907653, "flos": 20484847457280.0, "grad_norm": 1.830247754910172, "language_loss": 0.79140902, "learning_rate": 2.869671957943002e-06, "loss": 0.81366837, "num_input_tokens_seen": 67194995, "step": 3128, "time_per_iteration": 2.6157338619232178 }, { "auxiliary_loss_clip": 0.01160336, "auxiliary_loss_mlp": 0.01028825, "balance_loss_clip": 1.05487418, "balance_loss_mlp": 1.02045083, "epoch": 0.37624000480971564, "flos": 21141797253120.0, "grad_norm": 1.7846701141967873, "language_loss": 0.74643123, "learning_rate": 2.8689704204929747e-06, "loss": 0.76832289, "num_input_tokens_seen": 67214175, "step": 3129, "time_per_iteration": 2.710440158843994 }, { "auxiliary_loss_clip": 0.01195293, "auxiliary_loss_mlp": 0.01034164, "balance_loss_clip": 1.05632782, "balance_loss_mlp": 1.02569413, "epoch": 0.3763602477003547, "flos": 22564470205440.0, "grad_norm": 1.958982572280581, "language_loss": 0.81077927, "learning_rate": 2.8682687512208253e-06, "loss": 0.83307385, "num_input_tokens_seen": 67233185, "step": 3130, "time_per_iteration": 3.753727674484253 }, { "auxiliary_loss_clip": 0.01185883, "auxiliary_loss_mlp": 0.0103215, "balance_loss_clip": 1.05420303, "balance_loss_mlp": 1.02386546, "epoch": 0.3764804905909938, "flos": 27526857851520.0, "grad_norm": 1.912458500436718, "language_loss": 0.80656719, "learning_rate": 2.8675669502329972e-06, "loss": 0.82874745, "num_input_tokens_seen": 67254715, "step": 3131, "time_per_iteration": 2.761674165725708 }, { "auxiliary_loss_clip": 0.01178234, "auxiliary_loss_mlp": 0.00763508, "balance_loss_clip": 1.05333257, "balance_loss_mlp": 1.00017619, "epoch": 0.3766007334816329, "flos": 22528092706560.0, "grad_norm": 2.3234397376545615, "language_loss": 0.86246705, "learning_rate": 2.866865017635952e-06, "loss": 0.88188446, "num_input_tokens_seen": 67272535, "step": 3132, "time_per_iteration": 2.8670501708984375 }, { "auxiliary_loss_clip": 0.01153393, "auxiliary_loss_mlp": 0.01031577, "balance_loss_clip": 1.05634403, "balance_loss_mlp": 1.02311897, "epoch": 0.376720976372272, "flos": 25957166532480.0, "grad_norm": 1.5806862004513775, "language_loss": 0.79378366, "learning_rate": 2.866162953536174e-06, "loss": 0.8156333, "num_input_tokens_seen": 67293505, "step": 3133, "time_per_iteration": 3.633573532104492 }, { "auxiliary_loss_clip": 0.01164897, "auxiliary_loss_mlp": 0.00763422, "balance_loss_clip": 1.05125606, "balance_loss_mlp": 1.00014925, "epoch": 0.3768412192629111, "flos": 18041162411520.0, "grad_norm": 1.8028845128396391, "language_loss": 0.74989998, "learning_rate": 2.8654607580401634e-06, "loss": 0.76918316, "num_input_tokens_seen": 67313240, "step": 3134, "time_per_iteration": 2.6736795902252197 }, { "auxiliary_loss_clip": 0.01080224, "auxiliary_loss_mlp": 0.01000205, "balance_loss_clip": 1.02035439, "balance_loss_mlp": 0.99858361, "epoch": 0.3769614621535502, "flos": 62989472304000.0, "grad_norm": 0.9603080408296224, "language_loss": 0.65157372, "learning_rate": 2.8647584312544446e-06, "loss": 0.67237806, "num_input_tokens_seen": 67378445, "step": 3135, "time_per_iteration": 3.2186474800109863 }, { "auxiliary_loss_clip": 0.01143374, "auxiliary_loss_mlp": 0.00762758, "balance_loss_clip": 1.04701495, "balance_loss_mlp": 1.00015044, "epoch": 0.37708170504418925, "flos": 23661685002240.0, "grad_norm": 1.5116638911097344, "language_loss": 0.85566509, "learning_rate": 2.864055973285559e-06, "loss": 0.87472641, "num_input_tokens_seen": 67400445, "step": 3136, "time_per_iteration": 2.781022787094116 }, { "auxiliary_loss_clip": 0.01151374, "auxiliary_loss_mlp": 0.01033672, "balance_loss_clip": 1.04845476, "balance_loss_mlp": 1.02551782, "epoch": 0.37720194793482836, "flos": 24423170353920.0, "grad_norm": 1.8348489084765285, "language_loss": 0.86465299, "learning_rate": 2.8633533842400698e-06, "loss": 0.88650346, "num_input_tokens_seen": 67420645, "step": 3137, "time_per_iteration": 3.629549980163574 }, { "auxiliary_loss_clip": 0.01181924, "auxiliary_loss_mlp": 0.00764261, "balance_loss_clip": 1.05563366, "balance_loss_mlp": 1.00024498, "epoch": 0.3773221908254674, "flos": 20996502739200.0, "grad_norm": 1.9327771069270452, "language_loss": 0.77805209, "learning_rate": 2.862650664224558e-06, "loss": 0.79751396, "num_input_tokens_seen": 67439495, "step": 3138, "time_per_iteration": 3.5395736694335938 }, { "auxiliary_loss_clip": 0.01178925, "auxiliary_loss_mlp": 0.01030092, "balance_loss_clip": 1.0574317, "balance_loss_mlp": 1.02205157, "epoch": 0.37744243371610653, "flos": 37631724958080.0, "grad_norm": 1.6208341410494207, "language_loss": 0.6969803, "learning_rate": 2.861947813345627e-06, "loss": 0.71907043, "num_input_tokens_seen": 67462195, "step": 3139, "time_per_iteration": 2.8611836433410645 }, { "auxiliary_loss_clip": 0.01193407, "auxiliary_loss_mlp": 0.00764332, "balance_loss_clip": 1.05504501, "balance_loss_mlp": 1.00018239, "epoch": 0.37756267660674564, "flos": 26140526484480.0, "grad_norm": 1.579126424608873, "language_loss": 0.72380638, "learning_rate": 2.8612448317098974e-06, "loss": 0.74338377, "num_input_tokens_seen": 67482530, "step": 3140, "time_per_iteration": 2.6237947940826416 }, { "auxiliary_loss_clip": 0.01149032, "auxiliary_loss_mlp": 0.00764152, "balance_loss_clip": 1.0475564, "balance_loss_mlp": 1.00011706, "epoch": 0.3776829194973847, "flos": 19427888828160.0, "grad_norm": 1.939347168214533, "language_loss": 0.83495647, "learning_rate": 2.8605417194240114e-06, "loss": 0.85408831, "num_input_tokens_seen": 67500890, "step": 3141, "time_per_iteration": 2.757746934890747 }, { "auxiliary_loss_clip": 0.01171696, "auxiliary_loss_mlp": 0.01031846, "balance_loss_clip": 1.05080664, "balance_loss_mlp": 1.0235368, "epoch": 0.3778031623880238, "flos": 17382309194880.0, "grad_norm": 1.8482666696067072, "language_loss": 0.79258883, "learning_rate": 2.8598384765946315e-06, "loss": 0.81462419, "num_input_tokens_seen": 67519545, "step": 3142, "time_per_iteration": 2.6092066764831543 }, { "auxiliary_loss_clip": 0.01188558, "auxiliary_loss_mlp": 0.01031425, "balance_loss_clip": 1.05177474, "balance_loss_mlp": 1.02362323, "epoch": 0.3779234052786629, "flos": 27125843437440.0, "grad_norm": 2.3725289941613914, "language_loss": 0.71881294, "learning_rate": 2.8591351033284377e-06, "loss": 0.74101281, "num_input_tokens_seen": 67539275, "step": 3143, "time_per_iteration": 2.7344861030578613 }, { "auxiliary_loss_clip": 0.01180097, "auxiliary_loss_mlp": 0.01030203, "balance_loss_clip": 1.05203199, "balance_loss_mlp": 1.02157807, "epoch": 0.37804364816930197, "flos": 19682639061120.0, "grad_norm": 2.341116283893558, "language_loss": 0.83755016, "learning_rate": 2.8584315997321325e-06, "loss": 0.85965323, "num_input_tokens_seen": 67558280, "step": 3144, "time_per_iteration": 2.6385817527770996 }, { "auxiliary_loss_clip": 0.01191815, "auxiliary_loss_mlp": 0.01026672, "balance_loss_clip": 1.05338979, "balance_loss_mlp": 1.01800001, "epoch": 0.3781638910599411, "flos": 22702905221760.0, "grad_norm": 2.332977081904027, "language_loss": 0.777179, "learning_rate": 2.8577279659124356e-06, "loss": 0.79936385, "num_input_tokens_seen": 67575955, "step": 3145, "time_per_iteration": 2.627830743789673 }, { "auxiliary_loss_clip": 0.011725, "auxiliary_loss_mlp": 0.01032591, "balance_loss_clip": 1.0506736, "balance_loss_mlp": 1.02455044, "epoch": 0.3782841339505802, "flos": 14647604158080.0, "grad_norm": 1.8916992724775787, "language_loss": 0.83538592, "learning_rate": 2.857024201976089e-06, "loss": 0.8574369, "num_input_tokens_seen": 67593515, "step": 3146, "time_per_iteration": 2.5955352783203125 }, { "auxiliary_loss_clip": 0.01160654, "auxiliary_loss_mlp": 0.0103154, "balance_loss_clip": 1.05041838, "balance_loss_mlp": 1.0220983, "epoch": 0.37840437684121925, "flos": 32818223185920.0, "grad_norm": 1.8290199561080984, "language_loss": 0.73117507, "learning_rate": 2.8563203080298516e-06, "loss": 0.75309694, "num_input_tokens_seen": 67614290, "step": 3147, "time_per_iteration": 2.755469799041748 }, { "auxiliary_loss_clip": 0.01161636, "auxiliary_loss_mlp": 0.00762857, "balance_loss_clip": 1.05088592, "balance_loss_mlp": 1.00016737, "epoch": 0.37852461973185836, "flos": 18369206346240.0, "grad_norm": 4.5001346973546195, "language_loss": 0.89568889, "learning_rate": 2.855616284180505e-06, "loss": 0.9149338, "num_input_tokens_seen": 67631340, "step": 3148, "time_per_iteration": 2.6243646144866943 }, { "auxiliary_loss_clip": 0.01085318, "auxiliary_loss_mlp": 0.01001235, "balance_loss_clip": 1.02225399, "balance_loss_mlp": 0.99950641, "epoch": 0.37864486262249747, "flos": 59500680117120.0, "grad_norm": 0.8739714477496746, "language_loss": 0.6610744, "learning_rate": 2.8549121305348477e-06, "loss": 0.68193984, "num_input_tokens_seen": 67691125, "step": 3149, "time_per_iteration": 3.237043857574463 }, { "auxiliary_loss_clip": 0.01175118, "auxiliary_loss_mlp": 0.01028172, "balance_loss_clip": 1.05047703, "balance_loss_mlp": 1.02029216, "epoch": 0.3787651055131365, "flos": 23363015414400.0, "grad_norm": 2.505306319387676, "language_loss": 0.8317492, "learning_rate": 2.8542078471997006e-06, "loss": 0.85378206, "num_input_tokens_seen": 67708740, "step": 3150, "time_per_iteration": 2.6768908500671387 }, { "auxiliary_loss_clip": 0.01174134, "auxiliary_loss_mlp": 0.01033842, "balance_loss_clip": 1.0501411, "balance_loss_mlp": 1.02586699, "epoch": 0.37888534840377563, "flos": 24601394661120.0, "grad_norm": 1.8076104941695537, "language_loss": 0.76039851, "learning_rate": 2.8535034342819013e-06, "loss": 0.78247827, "num_input_tokens_seen": 67726150, "step": 3151, "time_per_iteration": 2.6127681732177734 }, { "auxiliary_loss_clip": 0.01190971, "auxiliary_loss_mlp": 0.0103206, "balance_loss_clip": 1.05401134, "balance_loss_mlp": 1.02323842, "epoch": 0.37900559129441475, "flos": 23986891762560.0, "grad_norm": 1.5942082090751741, "language_loss": 0.72626925, "learning_rate": 2.85279889188831e-06, "loss": 0.74849951, "num_input_tokens_seen": 67746525, "step": 3152, "time_per_iteration": 2.697158098220825 }, { "auxiliary_loss_clip": 0.01145896, "auxiliary_loss_mlp": 0.0103357, "balance_loss_clip": 1.04606307, "balance_loss_mlp": 1.02499318, "epoch": 0.3791258341850538, "flos": 24644667571200.0, "grad_norm": 1.693193432155523, "language_loss": 0.81015068, "learning_rate": 2.852094220125805e-06, "loss": 0.83194542, "num_input_tokens_seen": 67766035, "step": 3153, "time_per_iteration": 2.8143136501312256 }, { "auxiliary_loss_clip": 0.01178232, "auxiliary_loss_mlp": 0.01036513, "balance_loss_clip": 1.05481744, "balance_loss_mlp": 1.02698863, "epoch": 0.3792460770756929, "flos": 17420841509760.0, "grad_norm": 2.025667632126982, "language_loss": 0.70957363, "learning_rate": 2.8513894191012846e-06, "loss": 0.73172098, "num_input_tokens_seen": 67785015, "step": 3154, "time_per_iteration": 2.675022840499878 }, { "auxiliary_loss_clip": 0.01191131, "auxiliary_loss_mlp": 0.01037467, "balance_loss_clip": 1.05325866, "balance_loss_mlp": 1.02912807, "epoch": 0.37936631996633197, "flos": 24206557386240.0, "grad_norm": 1.4821687563550758, "language_loss": 0.78887057, "learning_rate": 2.8506844889216664e-06, "loss": 0.81115657, "num_input_tokens_seen": 67804400, "step": 3155, "time_per_iteration": 2.6712646484375 }, { "auxiliary_loss_clip": 0.01075467, "auxiliary_loss_mlp": 0.01008341, "balance_loss_clip": 1.01934993, "balance_loss_mlp": 1.00669563, "epoch": 0.3794865628569711, "flos": 70297114752000.0, "grad_norm": 0.8776545489840557, "language_loss": 0.62876135, "learning_rate": 2.849979429693887e-06, "loss": 0.64959943, "num_input_tokens_seen": 67865385, "step": 3156, "time_per_iteration": 3.279393434524536 }, { "auxiliary_loss_clip": 0.01190505, "auxiliary_loss_mlp": 0.01027062, "balance_loss_clip": 1.05276251, "balance_loss_mlp": 1.01773357, "epoch": 0.3796068057476102, "flos": 15779364860160.0, "grad_norm": 2.0104060466078453, "language_loss": 0.7457391, "learning_rate": 2.8492742415249042e-06, "loss": 0.76791471, "num_input_tokens_seen": 67883030, "step": 3157, "time_per_iteration": 3.729616165161133 }, { "auxiliary_loss_clip": 0.01189925, "auxiliary_loss_mlp": 0.01029088, "balance_loss_clip": 1.0529213, "balance_loss_mlp": 1.0209136, "epoch": 0.37972704863824924, "flos": 25191694771200.0, "grad_norm": 1.8394137088950613, "language_loss": 0.76417804, "learning_rate": 2.848568924521694e-06, "loss": 0.78636813, "num_input_tokens_seen": 67903810, "step": 3158, "time_per_iteration": 2.5990488529205322 }, { "auxiliary_loss_clip": 0.01168898, "auxiliary_loss_mlp": 0.01025913, "balance_loss_clip": 1.04888713, "balance_loss_mlp": 1.01736629, "epoch": 0.37984729152888835, "flos": 26210372480640.0, "grad_norm": 1.7320291602760682, "language_loss": 0.73592514, "learning_rate": 2.8478634787912526e-06, "loss": 0.7578733, "num_input_tokens_seen": 67921865, "step": 3159, "time_per_iteration": 3.6381661891937256 }, { "auxiliary_loss_clip": 0.01177326, "auxiliary_loss_mlp": 0.0103539, "balance_loss_clip": 1.05198169, "balance_loss_mlp": 1.02655649, "epoch": 0.37996753441952746, "flos": 25629302165760.0, "grad_norm": 2.5254537544887823, "language_loss": 0.7624349, "learning_rate": 2.847157904440596e-06, "loss": 0.78456199, "num_input_tokens_seen": 67941595, "step": 3160, "time_per_iteration": 2.6349852085113525 }, { "auxiliary_loss_clip": 0.01176252, "auxiliary_loss_mlp": 0.01031977, "balance_loss_clip": 1.05323398, "balance_loss_mlp": 1.02354884, "epoch": 0.3800877773101665, "flos": 20118414862080.0, "grad_norm": 1.4963191135842497, "language_loss": 0.74132907, "learning_rate": 2.846452201576759e-06, "loss": 0.76341128, "num_input_tokens_seen": 67960970, "step": 3161, "time_per_iteration": 2.590013027191162 }, { "auxiliary_loss_clip": 0.01074431, "auxiliary_loss_mlp": 0.01007792, "balance_loss_clip": 1.02203155, "balance_loss_mlp": 1.00623083, "epoch": 0.38020802020080563, "flos": 63053608037760.0, "grad_norm": 0.859587141454022, "language_loss": 0.62839091, "learning_rate": 2.845746370306795e-06, "loss": 0.64921319, "num_input_tokens_seen": 68026160, "step": 3162, "time_per_iteration": 3.3215761184692383 }, { "auxiliary_loss_clip": 0.01179961, "auxiliary_loss_mlp": 0.01028032, "balance_loss_clip": 1.05316126, "balance_loss_mlp": 1.01940739, "epoch": 0.38032826309144474, "flos": 21288420570240.0, "grad_norm": 1.8612755093614402, "language_loss": 0.78916025, "learning_rate": 2.84504041073778e-06, "loss": 0.8112402, "num_input_tokens_seen": 68044575, "step": 3163, "time_per_iteration": 3.5391719341278076 }, { "auxiliary_loss_clip": 0.01152676, "auxiliary_loss_mlp": 0.0103185, "balance_loss_clip": 1.04890847, "balance_loss_mlp": 1.02296352, "epoch": 0.3804485059820838, "flos": 18954119416320.0, "grad_norm": 1.7315060574761743, "language_loss": 0.79387081, "learning_rate": 2.844334322976806e-06, "loss": 0.81571603, "num_input_tokens_seen": 68064790, "step": 3164, "time_per_iteration": 3.597341537475586 }, { "auxiliary_loss_clip": 0.01140137, "auxiliary_loss_mlp": 0.01036157, "balance_loss_clip": 1.04583859, "balance_loss_mlp": 1.02670979, "epoch": 0.3805687488727229, "flos": 21833759831040.0, "grad_norm": 1.8899014346498153, "language_loss": 0.83460498, "learning_rate": 2.8436281071309866e-06, "loss": 0.85636783, "num_input_tokens_seen": 68083330, "step": 3165, "time_per_iteration": 2.736013412475586 }, { "auxiliary_loss_clip": 0.01057311, "auxiliary_loss_mlp": 0.01006465, "balance_loss_clip": 1.02118635, "balance_loss_mlp": 1.00487995, "epoch": 0.380688991763362, "flos": 58546209968640.0, "grad_norm": 0.7261134890584806, "language_loss": 0.52948314, "learning_rate": 2.842921763307455e-06, "loss": 0.55012089, "num_input_tokens_seen": 68146140, "step": 3166, "time_per_iteration": 3.25123929977417 }, { "auxiliary_loss_clip": 0.01156618, "auxiliary_loss_mlp": 0.0103385, "balance_loss_clip": 1.04935181, "balance_loss_mlp": 1.02524304, "epoch": 0.38080923465400107, "flos": 23799509487360.0, "grad_norm": 1.8580132402063838, "language_loss": 0.82377303, "learning_rate": 2.842215291613361e-06, "loss": 0.84567773, "num_input_tokens_seen": 68164520, "step": 3167, "time_per_iteration": 2.731501579284668 }, { "auxiliary_loss_clip": 0.01021834, "auxiliary_loss_mlp": 0.01004727, "balance_loss_clip": 1.02134824, "balance_loss_mlp": 1.00309372, "epoch": 0.3809294775446402, "flos": 54969866380800.0, "grad_norm": 0.8507008396179668, "language_loss": 0.5918625, "learning_rate": 2.8415086921558774e-06, "loss": 0.61212808, "num_input_tokens_seen": 68227945, "step": 3168, "time_per_iteration": 3.506775379180908 }, { "auxiliary_loss_clip": 0.01145487, "auxiliary_loss_mlp": 0.01028627, "balance_loss_clip": 1.04309106, "balance_loss_mlp": 1.01947737, "epoch": 0.38104972043527924, "flos": 24643697904000.0, "grad_norm": 1.531919155144609, "language_loss": 0.78677249, "learning_rate": 2.840801965042194e-06, "loss": 0.80851358, "num_input_tokens_seen": 68247405, "step": 3169, "time_per_iteration": 2.9969325065612793 }, { "auxiliary_loss_clip": 0.01151554, "auxiliary_loss_mlp": 0.01034526, "balance_loss_clip": 1.04648185, "balance_loss_mlp": 1.0257051, "epoch": 0.38116996332591835, "flos": 22856783086080.0, "grad_norm": 1.7433658373835499, "language_loss": 0.84023464, "learning_rate": 2.840095110379521e-06, "loss": 0.86209548, "num_input_tokens_seen": 68266925, "step": 3170, "time_per_iteration": 2.6760458946228027 }, { "auxiliary_loss_clip": 0.0104268, "auxiliary_loss_mlp": 0.01003421, "balance_loss_clip": 1.01907849, "balance_loss_mlp": 1.00187135, "epoch": 0.38129020621655746, "flos": 60836160804480.0, "grad_norm": 0.7519423721049622, "language_loss": 0.53887534, "learning_rate": 2.8393881282750884e-06, "loss": 0.5593363, "num_input_tokens_seen": 68329755, "step": 3171, "time_per_iteration": 3.236588716506958 }, { "auxiliary_loss_clip": 0.01163858, "auxiliary_loss_mlp": 0.01033599, "balance_loss_clip": 1.05182099, "balance_loss_mlp": 1.02472401, "epoch": 0.3814104491071965, "flos": 21648101408640.0, "grad_norm": 2.093053277990453, "language_loss": 0.7879312, "learning_rate": 2.838681018836144e-06, "loss": 0.80990571, "num_input_tokens_seen": 68347075, "step": 3172, "time_per_iteration": 2.6886661052703857 }, { "auxiliary_loss_clip": 0.01148791, "auxiliary_loss_mlp": 0.00763664, "balance_loss_clip": 1.04710937, "balance_loss_mlp": 1.00012398, "epoch": 0.3815306919978356, "flos": 19099090707840.0, "grad_norm": 2.01521044834459, "language_loss": 0.78790075, "learning_rate": 2.837973782169955e-06, "loss": 0.80702525, "num_input_tokens_seen": 68365450, "step": 3173, "time_per_iteration": 2.672671318054199 }, { "auxiliary_loss_clip": 0.01091197, "auxiliary_loss_mlp": 0.0100074, "balance_loss_clip": 1.02059817, "balance_loss_mlp": 0.99911898, "epoch": 0.38165093488847474, "flos": 67067918156160.0, "grad_norm": 0.8084814963839174, "language_loss": 0.59145617, "learning_rate": 2.8372664183838096e-06, "loss": 0.6123755, "num_input_tokens_seen": 68428470, "step": 3174, "time_per_iteration": 3.250335931777954 }, { "auxiliary_loss_clip": 0.01189428, "auxiliary_loss_mlp": 0.01034873, "balance_loss_clip": 1.05325031, "balance_loss_mlp": 1.02581882, "epoch": 0.3817711777791138, "flos": 22341105480960.0, "grad_norm": 2.186396916592698, "language_loss": 0.68656558, "learning_rate": 2.836558927585015e-06, "loss": 0.7088086, "num_input_tokens_seen": 68445440, "step": 3175, "time_per_iteration": 2.6622257232666016 }, { "auxiliary_loss_clip": 0.01178556, "auxiliary_loss_mlp": 0.0103522, "balance_loss_clip": 1.05267477, "balance_loss_mlp": 1.02621984, "epoch": 0.3818914206697529, "flos": 22820621068800.0, "grad_norm": 2.1268316864135444, "language_loss": 0.82605207, "learning_rate": 2.8358513098808957e-06, "loss": 0.84818983, "num_input_tokens_seen": 68465755, "step": 3176, "time_per_iteration": 2.616715908050537 }, { "auxiliary_loss_clip": 0.0112301, "auxiliary_loss_mlp": 0.01032404, "balance_loss_clip": 1.04748642, "balance_loss_mlp": 1.02329612, "epoch": 0.382011663560392, "flos": 24386074583040.0, "grad_norm": 1.7610499942848132, "language_loss": 0.77138519, "learning_rate": 2.835143565378798e-06, "loss": 0.79293931, "num_input_tokens_seen": 68486220, "step": 3177, "time_per_iteration": 2.777756452560425 }, { "auxiliary_loss_clip": 0.01112294, "auxiliary_loss_mlp": 0.01027409, "balance_loss_clip": 1.0433892, "balance_loss_mlp": 1.01750863, "epoch": 0.38213190645103107, "flos": 21981568296960.0, "grad_norm": 2.5177384145573107, "language_loss": 0.78591949, "learning_rate": 2.8344356941860847e-06, "loss": 0.80731654, "num_input_tokens_seen": 68505850, "step": 3178, "time_per_iteration": 2.7951464653015137 }, { "auxiliary_loss_clip": 0.01141974, "auxiliary_loss_mlp": 0.01030953, "balance_loss_clip": 1.04812407, "balance_loss_mlp": 1.02163696, "epoch": 0.3822521493416702, "flos": 35516945773440.0, "grad_norm": 2.6082147227891723, "language_loss": 0.66063166, "learning_rate": 2.8337276964101403e-06, "loss": 0.68236089, "num_input_tokens_seen": 68526290, "step": 3179, "time_per_iteration": 2.833160400390625 }, { "auxiliary_loss_clip": 0.01177466, "auxiliary_loss_mlp": 0.01034308, "balance_loss_clip": 1.0521301, "balance_loss_mlp": 1.02604663, "epoch": 0.3823723922323093, "flos": 21069904181760.0, "grad_norm": 1.9074507318807794, "language_loss": 0.76716292, "learning_rate": 2.833019572158367e-06, "loss": 0.78928065, "num_input_tokens_seen": 68544725, "step": 3180, "time_per_iteration": 2.648097038269043 }, { "auxiliary_loss_clip": 0.01161036, "auxiliary_loss_mlp": 0.01029222, "balance_loss_clip": 1.04947424, "balance_loss_mlp": 1.02096105, "epoch": 0.38249263512294834, "flos": 19789149864960.0, "grad_norm": 1.7703468887687286, "language_loss": 0.80145693, "learning_rate": 2.8323113215381872e-06, "loss": 0.82335949, "num_input_tokens_seen": 68563070, "step": 3181, "time_per_iteration": 2.6935696601867676 }, { "auxiliary_loss_clip": 0.01146372, "auxiliary_loss_mlp": 0.01038847, "balance_loss_clip": 1.05015421, "balance_loss_mlp": 1.02958477, "epoch": 0.38261287801358745, "flos": 21433930565760.0, "grad_norm": 1.827708278738902, "language_loss": 0.76338065, "learning_rate": 2.831602944657042e-06, "loss": 0.78523284, "num_input_tokens_seen": 68581150, "step": 3182, "time_per_iteration": 2.758544683456421 }, { "auxiliary_loss_clip": 0.01169469, "auxiliary_loss_mlp": 0.01032898, "balance_loss_clip": 1.05042887, "balance_loss_mlp": 1.02417779, "epoch": 0.38273312090422656, "flos": 21981568296960.0, "grad_norm": 2.2093715133986764, "language_loss": 0.74496114, "learning_rate": 2.830894441622391e-06, "loss": 0.76698482, "num_input_tokens_seen": 68597800, "step": 3183, "time_per_iteration": 3.6180737018585205 }, { "auxiliary_loss_clip": 0.01142653, "auxiliary_loss_mlp": 0.00764054, "balance_loss_clip": 1.04547572, "balance_loss_mlp": 1.00016129, "epoch": 0.3828533637948656, "flos": 24790895838720.0, "grad_norm": 1.8479732487857226, "language_loss": 0.79576862, "learning_rate": 2.8301858125417134e-06, "loss": 0.81483567, "num_input_tokens_seen": 68617640, "step": 3184, "time_per_iteration": 2.7778563499450684 }, { "auxiliary_loss_clip": 0.01165789, "auxiliary_loss_mlp": 0.01030199, "balance_loss_clip": 1.05384326, "balance_loss_mlp": 1.02206266, "epoch": 0.38297360668550473, "flos": 22455445449600.0, "grad_norm": 1.9828484774405977, "language_loss": 0.73886502, "learning_rate": 2.8294770575225082e-06, "loss": 0.76082492, "num_input_tokens_seen": 68637770, "step": 3185, "time_per_iteration": 3.6044039726257324 }, { "auxiliary_loss_clip": 0.01178637, "auxiliary_loss_mlp": 0.01038046, "balance_loss_clip": 1.05493069, "balance_loss_mlp": 1.0291115, "epoch": 0.3830938495761438, "flos": 24896903852160.0, "grad_norm": 1.696876237179694, "language_loss": 0.83492744, "learning_rate": 2.828768176672293e-06, "loss": 0.85709429, "num_input_tokens_seen": 68656885, "step": 3186, "time_per_iteration": 2.6498985290527344 }, { "auxiliary_loss_clip": 0.01146476, "auxiliary_loss_mlp": 0.01030106, "balance_loss_clip": 1.04659224, "balance_loss_mlp": 1.02045619, "epoch": 0.3832140924667829, "flos": 33036236784000.0, "grad_norm": 1.910138676478638, "language_loss": 0.71559364, "learning_rate": 2.8280591700986044e-06, "loss": 0.73735946, "num_input_tokens_seen": 68678750, "step": 3187, "time_per_iteration": 2.7926204204559326 }, { "auxiliary_loss_clip": 0.01163029, "auxiliary_loss_mlp": 0.0102931, "balance_loss_clip": 1.04670823, "balance_loss_mlp": 1.0209415, "epoch": 0.383334335357422, "flos": 31903721896320.0, "grad_norm": 1.7038827706281723, "language_loss": 0.74809444, "learning_rate": 2.827350037908999e-06, "loss": 0.77001786, "num_input_tokens_seen": 68698190, "step": 3188, "time_per_iteration": 2.7096970081329346 }, { "auxiliary_loss_clip": 0.01152773, "auxiliary_loss_mlp": 0.01035571, "balance_loss_clip": 1.0484848, "balance_loss_mlp": 1.02601624, "epoch": 0.38345457824806106, "flos": 19791915212160.0, "grad_norm": 3.8511360269869352, "language_loss": 0.79260814, "learning_rate": 2.8266407802110496e-06, "loss": 0.81449157, "num_input_tokens_seen": 68716445, "step": 3189, "time_per_iteration": 3.6310768127441406 }, { "auxiliary_loss_clip": 0.01109897, "auxiliary_loss_mlp": 0.01033811, "balance_loss_clip": 1.042243, "balance_loss_mlp": 1.02415538, "epoch": 0.3835748211387002, "flos": 22419391173120.0, "grad_norm": 1.982733387132145, "language_loss": 0.76318955, "learning_rate": 2.8259313971123515e-06, "loss": 0.7846266, "num_input_tokens_seen": 68737565, "step": 3190, "time_per_iteration": 3.7357072830200195 }, { "auxiliary_loss_clip": 0.01174049, "auxiliary_loss_mlp": 0.01036263, "balance_loss_clip": 1.05364466, "balance_loss_mlp": 1.02781081, "epoch": 0.3836950640293393, "flos": 25118436983040.0, "grad_norm": 1.5134947921406596, "language_loss": 0.78702652, "learning_rate": 2.8252218887205166e-06, "loss": 0.8091296, "num_input_tokens_seen": 68758255, "step": 3191, "time_per_iteration": 2.6306204795837402 }, { "auxiliary_loss_clip": 0.01121831, "auxiliary_loss_mlp": 0.01028886, "balance_loss_clip": 1.04651868, "balance_loss_mlp": 1.02034497, "epoch": 0.38381530691997834, "flos": 21799213925760.0, "grad_norm": 3.1332192380506236, "language_loss": 0.80661613, "learning_rate": 2.824512255143178e-06, "loss": 0.82812333, "num_input_tokens_seen": 68777490, "step": 3192, "time_per_iteration": 2.7528295516967773 }, { "auxiliary_loss_clip": 0.01153801, "auxiliary_loss_mlp": 0.01029139, "balance_loss_clip": 1.05056739, "balance_loss_mlp": 1.02078819, "epoch": 0.38393554981061745, "flos": 21252689516160.0, "grad_norm": 1.7110880237153903, "language_loss": 0.79546571, "learning_rate": 2.8238024964879855e-06, "loss": 0.81729513, "num_input_tokens_seen": 68798385, "step": 3193, "time_per_iteration": 2.7326602935791016 }, { "auxiliary_loss_clip": 0.01194927, "auxiliary_loss_mlp": 0.01029379, "balance_loss_clip": 1.05544078, "balance_loss_mlp": 1.02015245, "epoch": 0.38405579270125656, "flos": 17019360218880.0, "grad_norm": 2.3874984353977733, "language_loss": 0.77294612, "learning_rate": 2.8230926128626095e-06, "loss": 0.7951892, "num_input_tokens_seen": 68816880, "step": 3194, "time_per_iteration": 2.5978074073791504 }, { "auxiliary_loss_clip": 0.01152494, "auxiliary_loss_mlp": 0.01033259, "balance_loss_clip": 1.04543698, "balance_loss_mlp": 1.02449083, "epoch": 0.3841760355918956, "flos": 21835375943040.0, "grad_norm": 2.1738544362932095, "language_loss": 0.79701626, "learning_rate": 2.822382604374738e-06, "loss": 0.81887376, "num_input_tokens_seen": 68835805, "step": 3195, "time_per_iteration": 2.667903184890747 }, { "auxiliary_loss_clip": 0.01162084, "auxiliary_loss_mlp": 0.01032105, "balance_loss_clip": 1.05464983, "balance_loss_mlp": 1.02300942, "epoch": 0.3842962784825347, "flos": 25915114684800.0, "grad_norm": 1.9536734641406872, "language_loss": 0.65894604, "learning_rate": 2.8216724711320793e-06, "loss": 0.68088794, "num_input_tokens_seen": 68854930, "step": 3196, "time_per_iteration": 2.837489128112793 }, { "auxiliary_loss_clip": 0.01186299, "auxiliary_loss_mlp": 0.00763554, "balance_loss_clip": 1.05026329, "balance_loss_mlp": 1.00014305, "epoch": 0.38441652137317384, "flos": 25337492075520.0, "grad_norm": 1.5415302938835473, "language_loss": 0.79730481, "learning_rate": 2.820962213242361e-06, "loss": 0.81680334, "num_input_tokens_seen": 68874260, "step": 3197, "time_per_iteration": 2.627007007598877 }, { "auxiliary_loss_clip": 0.01174032, "auxiliary_loss_mlp": 0.010354, "balance_loss_clip": 1.05384576, "balance_loss_mlp": 1.02722788, "epoch": 0.3845367642638129, "flos": 18113486446080.0, "grad_norm": 2.036814230633612, "language_loss": 0.84131694, "learning_rate": 2.8202518308133264e-06, "loss": 0.86341119, "num_input_tokens_seen": 68891535, "step": 3198, "time_per_iteration": 2.625339984893799 }, { "auxiliary_loss_clip": 0.01190621, "auxiliary_loss_mlp": 0.01025973, "balance_loss_clip": 1.05199397, "balance_loss_mlp": 1.01719868, "epoch": 0.384657007154452, "flos": 25228395492480.0, "grad_norm": 1.8181414666782902, "language_loss": 0.7377221, "learning_rate": 2.8195413239527426e-06, "loss": 0.75988805, "num_input_tokens_seen": 68911275, "step": 3199, "time_per_iteration": 2.6491947174072266 }, { "auxiliary_loss_clip": 0.01171735, "auxiliary_loss_mlp": 0.010409, "balance_loss_clip": 1.05112731, "balance_loss_mlp": 1.03231096, "epoch": 0.38477725004509106, "flos": 19865855358720.0, "grad_norm": 3.3439566460348606, "language_loss": 0.8111093, "learning_rate": 2.8188306927683906e-06, "loss": 0.83323562, "num_input_tokens_seen": 68930745, "step": 3200, "time_per_iteration": 2.608884572982788 }, { "auxiliary_loss_clip": 0.01160007, "auxiliary_loss_mlp": 0.01025849, "balance_loss_clip": 1.04905725, "balance_loss_mlp": 1.01721263, "epoch": 0.38489749293573017, "flos": 18259391491200.0, "grad_norm": 2.4558906170138535, "language_loss": 0.75131214, "learning_rate": 2.818119937368074e-06, "loss": 0.77317071, "num_input_tokens_seen": 68949380, "step": 3201, "time_per_iteration": 2.56602144241333 }, { "auxiliary_loss_clip": 0.01180389, "auxiliary_loss_mlp": 0.01031255, "balance_loss_clip": 1.05008698, "balance_loss_mlp": 1.0221771, "epoch": 0.3850177358263693, "flos": 24389163152640.0, "grad_norm": 1.9190787408377363, "language_loss": 0.6554625, "learning_rate": 2.817409057859613e-06, "loss": 0.67757899, "num_input_tokens_seen": 68968370, "step": 3202, "time_per_iteration": 2.601261854171753 }, { "auxiliary_loss_clip": 0.01124841, "auxiliary_loss_mlp": 0.01033084, "balance_loss_clip": 1.04562283, "balance_loss_mlp": 1.02473378, "epoch": 0.38513797871700833, "flos": 17671533505920.0, "grad_norm": 1.8361040505730597, "language_loss": 0.79127181, "learning_rate": 2.8166980543508482e-06, "loss": 0.81285107, "num_input_tokens_seen": 68984260, "step": 3203, "time_per_iteration": 2.673112154006958 }, { "auxiliary_loss_clip": 0.01191134, "auxiliary_loss_mlp": 0.01029706, "balance_loss_clip": 1.05366135, "balance_loss_mlp": 1.02081895, "epoch": 0.38525822160764744, "flos": 25739583897600.0, "grad_norm": 1.8184233365351232, "language_loss": 0.79956901, "learning_rate": 2.815986926949638e-06, "loss": 0.8217774, "num_input_tokens_seen": 69002760, "step": 3204, "time_per_iteration": 2.653207778930664 }, { "auxiliary_loss_clip": 0.01174287, "auxiliary_loss_mlp": 0.01037926, "balance_loss_clip": 1.05187476, "balance_loss_mlp": 1.02950978, "epoch": 0.38537846449828655, "flos": 20193647898240.0, "grad_norm": 2.837519028569492, "language_loss": 0.8036052, "learning_rate": 2.8152756757638597e-06, "loss": 0.82572734, "num_input_tokens_seen": 69021260, "step": 3205, "time_per_iteration": 2.607947587966919 }, { "auxiliary_loss_clip": 0.01175018, "auxiliary_loss_mlp": 0.01031697, "balance_loss_clip": 1.05268836, "balance_loss_mlp": 1.02291107, "epoch": 0.3854987073889256, "flos": 23039352938880.0, "grad_norm": 1.9011011931927924, "language_loss": 0.84710741, "learning_rate": 2.8145643009014093e-06, "loss": 0.8691746, "num_input_tokens_seen": 69039755, "step": 3206, "time_per_iteration": 2.627964735031128 }, { "auxiliary_loss_clip": 0.01176804, "auxiliary_loss_mlp": 0.0102975, "balance_loss_clip": 1.05230308, "balance_loss_mlp": 1.02123833, "epoch": 0.3856189502795647, "flos": 20190631155840.0, "grad_norm": 1.8552769548485435, "language_loss": 0.7928136, "learning_rate": 2.813852802470202e-06, "loss": 0.81487912, "num_input_tokens_seen": 69057650, "step": 3207, "time_per_iteration": 2.666914224624634 }, { "auxiliary_loss_clip": 0.01154252, "auxiliary_loss_mlp": 0.01030572, "balance_loss_clip": 1.04814088, "balance_loss_mlp": 1.02173281, "epoch": 0.38573919317020383, "flos": 25702631781120.0, "grad_norm": 1.7905408898997752, "language_loss": 0.72663307, "learning_rate": 2.8131411805781717e-06, "loss": 0.74848127, "num_input_tokens_seen": 69077775, "step": 3208, "time_per_iteration": 2.6816163063049316 }, { "auxiliary_loss_clip": 0.01163344, "auxiliary_loss_mlp": 0.01032765, "balance_loss_clip": 1.05296612, "balance_loss_mlp": 1.02317464, "epoch": 0.3858594360608429, "flos": 29821405628160.0, "grad_norm": 2.6698321347285914, "language_loss": 0.64229977, "learning_rate": 2.8124294353332707e-06, "loss": 0.66426086, "num_input_tokens_seen": 69096450, "step": 3209, "time_per_iteration": 3.703415632247925 }, { "auxiliary_loss_clip": 0.01151211, "auxiliary_loss_mlp": 0.01037306, "balance_loss_clip": 1.04912984, "balance_loss_mlp": 1.02851987, "epoch": 0.385979678951482, "flos": 24790428961920.0, "grad_norm": 1.5932105167524586, "language_loss": 0.77227539, "learning_rate": 2.8117175668434713e-06, "loss": 0.79416054, "num_input_tokens_seen": 69116110, "step": 3210, "time_per_iteration": 2.8631091117858887 }, { "auxiliary_loss_clip": 0.01190213, "auxiliary_loss_mlp": 0.01030048, "balance_loss_clip": 1.05278826, "balance_loss_mlp": 1.02098167, "epoch": 0.3860999218421211, "flos": 21287881866240.0, "grad_norm": 2.3430755547299005, "language_loss": 0.7086159, "learning_rate": 2.811005575216762e-06, "loss": 0.73081851, "num_input_tokens_seen": 69134825, "step": 3211, "time_per_iteration": 3.532532215118408 }, { "auxiliary_loss_clip": 0.01140513, "auxiliary_loss_mlp": 0.01034532, "balance_loss_clip": 1.04660046, "balance_loss_mlp": 1.02572227, "epoch": 0.38622016473276016, "flos": 24536720223360.0, "grad_norm": 1.6611492537084014, "language_loss": 0.79139346, "learning_rate": 2.8102934605611513e-06, "loss": 0.81314385, "num_input_tokens_seen": 69156460, "step": 3212, "time_per_iteration": 2.711057186126709 }, { "auxiliary_loss_clip": 0.01165775, "auxiliary_loss_mlp": 0.01034671, "balance_loss_clip": 1.05136526, "balance_loss_mlp": 1.02616, "epoch": 0.3863404076233993, "flos": 20558212986240.0, "grad_norm": 2.1881246343245766, "language_loss": 0.67711812, "learning_rate": 2.8095812229846665e-06, "loss": 0.69912255, "num_input_tokens_seen": 69176420, "step": 3213, "time_per_iteration": 2.6505558490753174 }, { "auxiliary_loss_clip": 0.01160164, "auxiliary_loss_mlp": 0.01036176, "balance_loss_clip": 1.04847789, "balance_loss_mlp": 1.02708614, "epoch": 0.3864606505140384, "flos": 22346277039360.0, "grad_norm": 2.204362415290507, "language_loss": 0.69429123, "learning_rate": 2.808868862595355e-06, "loss": 0.71625471, "num_input_tokens_seen": 69196665, "step": 3214, "time_per_iteration": 2.7331881523132324 }, { "auxiliary_loss_clip": 0.0117706, "auxiliary_loss_mlp": 0.01025158, "balance_loss_clip": 1.05110514, "balance_loss_mlp": 1.01643205, "epoch": 0.38658089340467744, "flos": 25703601448320.0, "grad_norm": 1.851272183506496, "language_loss": 0.79854685, "learning_rate": 2.8081563795012795e-06, "loss": 0.82056904, "num_input_tokens_seen": 69216290, "step": 3215, "time_per_iteration": 3.599665641784668 }, { "auxiliary_loss_clip": 0.01168432, "auxiliary_loss_mlp": 0.01033372, "balance_loss_clip": 1.04951787, "balance_loss_mlp": 1.02468705, "epoch": 0.38670113629531655, "flos": 33802534558080.0, "grad_norm": 1.6875968926911713, "language_loss": 0.73623049, "learning_rate": 2.807443773810524e-06, "loss": 0.75824857, "num_input_tokens_seen": 69237550, "step": 3216, "time_per_iteration": 3.6830861568450928 }, { "auxiliary_loss_clip": 0.0114833, "auxiliary_loss_mlp": 0.0103048, "balance_loss_clip": 1.05132365, "balance_loss_mlp": 1.02206373, "epoch": 0.3868213791859556, "flos": 23331522165120.0, "grad_norm": 1.8153258837562531, "language_loss": 0.89890087, "learning_rate": 2.80673104563119e-06, "loss": 0.92068905, "num_input_tokens_seen": 69258175, "step": 3217, "time_per_iteration": 2.787142276763916 }, { "auxiliary_loss_clip": 0.01171124, "auxiliary_loss_mlp": 0.010274, "balance_loss_clip": 1.05223298, "balance_loss_mlp": 1.01882315, "epoch": 0.3869416220765947, "flos": 18441530380800.0, "grad_norm": 2.3887277283156365, "language_loss": 0.78964192, "learning_rate": 2.8060181950713976e-06, "loss": 0.81162721, "num_input_tokens_seen": 69274965, "step": 3218, "time_per_iteration": 2.641911268234253 }, { "auxiliary_loss_clip": 0.01143022, "auxiliary_loss_mlp": 0.01038599, "balance_loss_clip": 1.04640806, "balance_loss_mlp": 1.02921712, "epoch": 0.3870618649672338, "flos": 15632992938240.0, "grad_norm": 2.6183990983269094, "language_loss": 0.80720776, "learning_rate": 2.805305222239286e-06, "loss": 0.82902396, "num_input_tokens_seen": 69292220, "step": 3219, "time_per_iteration": 2.6697747707366943 }, { "auxiliary_loss_clip": 0.01158299, "auxiliary_loss_mlp": 0.01030012, "balance_loss_clip": 1.04865909, "balance_loss_mlp": 1.02156568, "epoch": 0.3871821078578729, "flos": 23513804709120.0, "grad_norm": 1.8591818636861068, "language_loss": 0.73706442, "learning_rate": 2.8045921272430118e-06, "loss": 0.75894749, "num_input_tokens_seen": 69311900, "step": 3220, "time_per_iteration": 2.6817467212677 }, { "auxiliary_loss_clip": 0.01183914, "auxiliary_loss_mlp": 0.01031679, "balance_loss_clip": 1.0530026, "balance_loss_mlp": 1.02289963, "epoch": 0.387302350748512, "flos": 17778259791360.0, "grad_norm": 2.847483525038182, "language_loss": 0.76721942, "learning_rate": 2.803878910190753e-06, "loss": 0.78937536, "num_input_tokens_seen": 69328820, "step": 3221, "time_per_iteration": 2.6325597763061523 }, { "auxiliary_loss_clip": 0.0118026, "auxiliary_loss_mlp": 0.01028916, "balance_loss_clip": 1.05212653, "balance_loss_mlp": 1.02029085, "epoch": 0.3874225936391511, "flos": 11503409097600.0, "grad_norm": 5.1580432727542656, "language_loss": 0.82065547, "learning_rate": 2.8031655711907017e-06, "loss": 0.84274727, "num_input_tokens_seen": 69342525, "step": 3222, "time_per_iteration": 2.6161258220672607 }, { "auxiliary_loss_clip": 0.01178966, "auxiliary_loss_mlp": 0.01025474, "balance_loss_clip": 1.05418193, "balance_loss_mlp": 1.01620507, "epoch": 0.38754283652979016, "flos": 21945154884480.0, "grad_norm": 2.9750624090875526, "language_loss": 0.81059313, "learning_rate": 2.8024521103510723e-06, "loss": 0.83263755, "num_input_tokens_seen": 69359295, "step": 3223, "time_per_iteration": 2.6720969676971436 }, { "auxiliary_loss_clip": 0.01174236, "auxiliary_loss_mlp": 0.01031433, "balance_loss_clip": 1.04950285, "balance_loss_mlp": 1.02296329, "epoch": 0.38766307942042927, "flos": 21175984022400.0, "grad_norm": 1.8452798858271167, "language_loss": 0.75365776, "learning_rate": 2.8017385277800952e-06, "loss": 0.77571446, "num_input_tokens_seen": 69377650, "step": 3224, "time_per_iteration": 2.7115986347198486 }, { "auxiliary_loss_clip": 0.01149491, "auxiliary_loss_mlp": 0.01028611, "balance_loss_clip": 1.04794848, "balance_loss_mlp": 1.01974225, "epoch": 0.3877833223110684, "flos": 27417294391680.0, "grad_norm": 2.1161339192761917, "language_loss": 0.75310135, "learning_rate": 2.8010248235860213e-06, "loss": 0.77488238, "num_input_tokens_seen": 69397765, "step": 3225, "time_per_iteration": 2.7463183403015137 }, { "auxiliary_loss_clip": 0.01073668, "auxiliary_loss_mlp": 0.00755739, "balance_loss_clip": 1.01980603, "balance_loss_mlp": 1.00042903, "epoch": 0.38790356520170743, "flos": 64500019879680.0, "grad_norm": 0.829034908506229, "language_loss": 0.62792313, "learning_rate": 2.8003109978771192e-06, "loss": 0.64621723, "num_input_tokens_seen": 69458930, "step": 3226, "time_per_iteration": 3.360851287841797 }, { "auxiliary_loss_clip": 0.01134923, "auxiliary_loss_mlp": 0.01029623, "balance_loss_clip": 1.04260302, "balance_loss_mlp": 1.02097464, "epoch": 0.38802380809234654, "flos": 22345415112960.0, "grad_norm": 1.9455688411867902, "language_loss": 0.78880906, "learning_rate": 2.799597050761674e-06, "loss": 0.81045449, "num_input_tokens_seen": 69475135, "step": 3227, "time_per_iteration": 2.6835217475891113 }, { "auxiliary_loss_clip": 0.0119262, "auxiliary_loss_mlp": 0.01025488, "balance_loss_clip": 1.05252886, "balance_loss_mlp": 1.01692283, "epoch": 0.38814405098298566, "flos": 25261361199360.0, "grad_norm": 1.858894396097029, "language_loss": 0.7932533, "learning_rate": 2.7988829823479924e-06, "loss": 0.8154344, "num_input_tokens_seen": 69493525, "step": 3228, "time_per_iteration": 2.646496534347534 }, { "auxiliary_loss_clip": 0.0115483, "auxiliary_loss_mlp": 0.01031148, "balance_loss_clip": 1.04914474, "balance_loss_mlp": 1.02236819, "epoch": 0.3882642938736247, "flos": 18841180078080.0, "grad_norm": 1.7831991777319018, "language_loss": 0.64278126, "learning_rate": 2.7981687927443976e-06, "loss": 0.66464102, "num_input_tokens_seen": 69510325, "step": 3229, "time_per_iteration": 2.7169153690338135 }, { "auxiliary_loss_clip": 0.01173908, "auxiliary_loss_mlp": 0.01028938, "balance_loss_clip": 1.04854071, "balance_loss_mlp": 1.01992607, "epoch": 0.3883845367642638, "flos": 21652806090240.0, "grad_norm": 1.8823700329160886, "language_loss": 0.85529864, "learning_rate": 2.797454482059231e-06, "loss": 0.87732708, "num_input_tokens_seen": 69530480, "step": 3230, "time_per_iteration": 2.6146080493927 }, { "auxiliary_loss_clip": 0.01192728, "auxiliary_loss_mlp": 0.01026236, "balance_loss_clip": 1.05366254, "balance_loss_mlp": 1.01776028, "epoch": 0.3885047796549029, "flos": 20557530627840.0, "grad_norm": 1.5856909793639167, "language_loss": 0.84554613, "learning_rate": 2.7967400504008537e-06, "loss": 0.86773574, "num_input_tokens_seen": 69549780, "step": 3231, "time_per_iteration": 2.6036620140075684 }, { "auxiliary_loss_clip": 0.01041706, "auxiliary_loss_mlp": 0.01002217, "balance_loss_clip": 1.01618648, "balance_loss_mlp": 1.0004288, "epoch": 0.388625022545542, "flos": 64325491695360.0, "grad_norm": 0.7905515318842432, "language_loss": 0.57397246, "learning_rate": 2.7960254978776456e-06, "loss": 0.59441161, "num_input_tokens_seen": 69611870, "step": 3232, "time_per_iteration": 3.3132598400115967 }, { "auxiliary_loss_clip": 0.01190453, "auxiliary_loss_mlp": 0.01031469, "balance_loss_clip": 1.05273461, "balance_loss_mlp": 1.02288628, "epoch": 0.3887452654361811, "flos": 18113881495680.0, "grad_norm": 2.049306693897059, "language_loss": 0.81853598, "learning_rate": 2.7953108245980006e-06, "loss": 0.84075522, "num_input_tokens_seen": 69630385, "step": 3233, "time_per_iteration": 2.5742053985595703 }, { "auxiliary_loss_clip": 0.0115917, "auxiliary_loss_mlp": 0.01029623, "balance_loss_clip": 1.05227613, "balance_loss_mlp": 1.02166522, "epoch": 0.38886550832682015, "flos": 24975261371520.0, "grad_norm": 1.6923860301561402, "language_loss": 0.736404, "learning_rate": 2.7945960306703365e-06, "loss": 0.75829196, "num_input_tokens_seen": 69653370, "step": 3234, "time_per_iteration": 2.716172933578491 }, { "auxiliary_loss_clip": 0.01180625, "auxiliary_loss_mlp": 0.01028309, "balance_loss_clip": 1.05233169, "balance_loss_mlp": 1.01925457, "epoch": 0.38898575121745926, "flos": 27199496275200.0, "grad_norm": 1.6830132174096664, "language_loss": 0.6600008, "learning_rate": 2.7938811162030865e-06, "loss": 0.68209016, "num_input_tokens_seen": 69673635, "step": 3235, "time_per_iteration": 3.6284356117248535 }, { "auxiliary_loss_clip": 0.01174214, "auxiliary_loss_mlp": 0.01032502, "balance_loss_clip": 1.051651, "balance_loss_mlp": 1.02466428, "epoch": 0.3891059941080984, "flos": 28763728727040.0, "grad_norm": 1.6225851618359772, "language_loss": 0.82521915, "learning_rate": 2.793166081304702e-06, "loss": 0.84728634, "num_input_tokens_seen": 69694130, "step": 3236, "time_per_iteration": 2.6599864959716797 }, { "auxiliary_loss_clip": 0.01152578, "auxiliary_loss_mlp": 0.01028028, "balance_loss_clip": 1.04751873, "balance_loss_mlp": 1.01933181, "epoch": 0.38922623699873743, "flos": 22893447893760.0, "grad_norm": 1.8615951226464944, "language_loss": 0.82299507, "learning_rate": 2.7924509260836543e-06, "loss": 0.84480107, "num_input_tokens_seen": 69713255, "step": 3237, "time_per_iteration": 3.675790309906006 }, { "auxiliary_loss_clip": 0.01144001, "auxiliary_loss_mlp": 0.01028556, "balance_loss_clip": 1.04660082, "balance_loss_mlp": 1.01960897, "epoch": 0.38934647988937654, "flos": 19792418002560.0, "grad_norm": 1.5376539140713679, "language_loss": 0.68406224, "learning_rate": 2.791735650648431e-06, "loss": 0.7057879, "num_input_tokens_seen": 69732375, "step": 3238, "time_per_iteration": 2.6948530673980713 }, { "auxiliary_loss_clip": 0.01158371, "auxiliary_loss_mlp": 0.01026559, "balance_loss_clip": 1.04916346, "balance_loss_mlp": 1.0175705, "epoch": 0.38946672278001565, "flos": 19202081978880.0, "grad_norm": 2.549064714336416, "language_loss": 0.74365664, "learning_rate": 2.791020255107538e-06, "loss": 0.76550591, "num_input_tokens_seen": 69749745, "step": 3239, "time_per_iteration": 2.6820216178894043 }, { "auxiliary_loss_clip": 0.01140669, "auxiliary_loss_mlp": 0.01031666, "balance_loss_clip": 1.04441357, "balance_loss_mlp": 1.02315402, "epoch": 0.3895869656706547, "flos": 24936477661440.0, "grad_norm": 1.6124931371829117, "language_loss": 0.80830002, "learning_rate": 2.7903047395695023e-06, "loss": 0.83002335, "num_input_tokens_seen": 69769645, "step": 3240, "time_per_iteration": 2.7840681076049805 }, { "auxiliary_loss_clip": 0.01175275, "auxiliary_loss_mlp": 0.00764291, "balance_loss_clip": 1.05337143, "balance_loss_mlp": 1.00019097, "epoch": 0.3897072085612938, "flos": 24133622820480.0, "grad_norm": 4.053779956028131, "language_loss": 0.90030605, "learning_rate": 2.789589104142865e-06, "loss": 0.9197017, "num_input_tokens_seen": 69787270, "step": 3241, "time_per_iteration": 3.6048662662506104 }, { "auxiliary_loss_clip": 0.01149821, "auxiliary_loss_mlp": 0.01027593, "balance_loss_clip": 1.0494653, "balance_loss_mlp": 1.01872361, "epoch": 0.3898274514519329, "flos": 17166342672000.0, "grad_norm": 2.0240883621220638, "language_loss": 0.76373833, "learning_rate": 2.7888733489361895e-06, "loss": 0.78551245, "num_input_tokens_seen": 69805685, "step": 3242, "time_per_iteration": 3.6397271156311035 }, { "auxiliary_loss_clip": 0.01090919, "auxiliary_loss_mlp": 0.01002595, "balance_loss_clip": 1.02002406, "balance_loss_mlp": 1.00073576, "epoch": 0.389947694342572, "flos": 66074807952000.0, "grad_norm": 0.7310796590533779, "language_loss": 0.58682263, "learning_rate": 2.788157474058054e-06, "loss": 0.60775781, "num_input_tokens_seen": 69867960, "step": 3243, "time_per_iteration": 3.3321402072906494 }, { "auxiliary_loss_clip": 0.01186734, "auxiliary_loss_mlp": 0.01029253, "balance_loss_clip": 1.05219829, "balance_loss_mlp": 1.02083051, "epoch": 0.3900679372332111, "flos": 25740912700800.0, "grad_norm": 1.972669062094332, "language_loss": 0.70204961, "learning_rate": 2.7874414796170555e-06, "loss": 0.72420943, "num_input_tokens_seen": 69889450, "step": 3244, "time_per_iteration": 2.633253812789917 }, { "auxiliary_loss_clip": 0.01169299, "auxiliary_loss_mlp": 0.0103546, "balance_loss_clip": 1.04990244, "balance_loss_mlp": 1.02608371, "epoch": 0.3901881801238502, "flos": 11801611808640.0, "grad_norm": 2.238129937106705, "language_loss": 0.84014487, "learning_rate": 2.7867253657218113e-06, "loss": 0.86219251, "num_input_tokens_seen": 69903340, "step": 3245, "time_per_iteration": 2.641946792602539 }, { "auxiliary_loss_clip": 0.01158958, "auxiliary_loss_mlp": 0.00764047, "balance_loss_clip": 1.04768801, "balance_loss_mlp": 1.00022542, "epoch": 0.39030842301448926, "flos": 27308951994240.0, "grad_norm": 1.6484617553314735, "language_loss": 0.73365331, "learning_rate": 2.7860091324809544e-06, "loss": 0.75288332, "num_input_tokens_seen": 69924400, "step": 3246, "time_per_iteration": 2.7086246013641357 }, { "auxiliary_loss_clip": 0.01173083, "auxiliary_loss_mlp": 0.01028303, "balance_loss_clip": 1.05279982, "balance_loss_mlp": 1.02012479, "epoch": 0.39042866590512837, "flos": 27163334257920.0, "grad_norm": 1.654601617606051, "language_loss": 0.81443775, "learning_rate": 2.7852927800031377e-06, "loss": 0.83645165, "num_input_tokens_seen": 69944565, "step": 3247, "time_per_iteration": 2.7154603004455566 }, { "auxiliary_loss_clip": 0.01162844, "auxiliary_loss_mlp": 0.01032218, "balance_loss_clip": 1.04947972, "balance_loss_mlp": 1.02382588, "epoch": 0.3905489087957674, "flos": 29716115886720.0, "grad_norm": 1.7816544813741406, "language_loss": 0.82958043, "learning_rate": 2.7845763083970298e-06, "loss": 0.85153103, "num_input_tokens_seen": 69964965, "step": 3248, "time_per_iteration": 2.7210569381713867 }, { "auxiliary_loss_clip": 0.01166301, "auxiliary_loss_mlp": 0.01029715, "balance_loss_clip": 1.04832757, "balance_loss_mlp": 1.02089381, "epoch": 0.39066915168640653, "flos": 24498618871680.0, "grad_norm": 2.291479479280437, "language_loss": 0.82164752, "learning_rate": 2.7838597177713205e-06, "loss": 0.84360766, "num_input_tokens_seen": 69986055, "step": 3249, "time_per_iteration": 2.697658061981201 }, { "auxiliary_loss_clip": 0.01113451, "auxiliary_loss_mlp": 0.01038111, "balance_loss_clip": 1.04958999, "balance_loss_mlp": 1.02936745, "epoch": 0.39078939457704565, "flos": 20558572122240.0, "grad_norm": 2.597649224064127, "language_loss": 0.73806036, "learning_rate": 2.7831430082347143e-06, "loss": 0.75957596, "num_input_tokens_seen": 70005260, "step": 3250, "time_per_iteration": 2.8245553970336914 }, { "auxiliary_loss_clip": 0.01176939, "auxiliary_loss_mlp": 0.00762897, "balance_loss_clip": 1.05310321, "balance_loss_mlp": 1.0002079, "epoch": 0.3909096374676847, "flos": 22783417557120.0, "grad_norm": 1.9147297985926481, "language_loss": 0.82648611, "learning_rate": 2.7824261798959373e-06, "loss": 0.8458845, "num_input_tokens_seen": 70023440, "step": 3251, "time_per_iteration": 2.6061642169952393 }, { "auxiliary_loss_clip": 0.0115989, "auxiliary_loss_mlp": 0.01030695, "balance_loss_clip": 1.04606843, "balance_loss_mlp": 1.02232623, "epoch": 0.3910298803583238, "flos": 23003119094400.0, "grad_norm": 1.8895253641803071, "language_loss": 0.7956841, "learning_rate": 2.78170923286373e-06, "loss": 0.81758988, "num_input_tokens_seen": 70043040, "step": 3252, "time_per_iteration": 2.7194063663482666 }, { "auxiliary_loss_clip": 0.01101592, "auxiliary_loss_mlp": 0.0102609, "balance_loss_clip": 1.04741931, "balance_loss_mlp": 1.01703572, "epoch": 0.3911501232489629, "flos": 24316264500480.0, "grad_norm": 2.314289940810285, "language_loss": 0.84237635, "learning_rate": 2.780992167246854e-06, "loss": 0.86365318, "num_input_tokens_seen": 70060565, "step": 3253, "time_per_iteration": 2.872896671295166 }, { "auxiliary_loss_clip": 0.01070876, "auxiliary_loss_mlp": 0.01003425, "balance_loss_clip": 1.01755285, "balance_loss_mlp": 1.00161254, "epoch": 0.391270366139602, "flos": 60869054684160.0, "grad_norm": 0.9736589010466935, "language_loss": 0.72114909, "learning_rate": 2.7802749831540883e-06, "loss": 0.7418921, "num_input_tokens_seen": 70119465, "step": 3254, "time_per_iteration": 3.383840799331665 }, { "auxiliary_loss_clip": 0.01136488, "auxiliary_loss_mlp": 0.01027103, "balance_loss_clip": 1.04746914, "balance_loss_mlp": 1.01957512, "epoch": 0.3913906090302411, "flos": 21543494025600.0, "grad_norm": 1.8953867975198238, "language_loss": 0.81710649, "learning_rate": 2.7795576806942268e-06, "loss": 0.83874243, "num_input_tokens_seen": 70138270, "step": 3255, "time_per_iteration": 2.74540114402771 }, { "auxiliary_loss_clip": 0.01079663, "auxiliary_loss_mlp": 0.0100431, "balance_loss_clip": 1.0355792, "balance_loss_mlp": 1.00255775, "epoch": 0.3915108519208802, "flos": 49839953702400.0, "grad_norm": 0.7567055908940665, "language_loss": 0.5484755, "learning_rate": 2.778840259976085e-06, "loss": 0.5693152, "num_input_tokens_seen": 70193500, "step": 3256, "time_per_iteration": 3.2198386192321777 }, { "auxiliary_loss_clip": 0.0117687, "auxiliary_loss_mlp": 0.01032945, "balance_loss_clip": 1.05174327, "balance_loss_mlp": 1.02410555, "epoch": 0.39163109481151925, "flos": 16506447960960.0, "grad_norm": 2.060167716233353, "language_loss": 0.76723361, "learning_rate": 2.778122721108495e-06, "loss": 0.78933173, "num_input_tokens_seen": 70211730, "step": 3257, "time_per_iteration": 2.6169803142547607 }, { "auxiliary_loss_clip": 0.01174736, "auxiliary_loss_mlp": 0.01036034, "balance_loss_clip": 1.05420423, "balance_loss_mlp": 1.02755845, "epoch": 0.39175133770215836, "flos": 26067484177920.0, "grad_norm": 1.9165143063507595, "language_loss": 0.88172525, "learning_rate": 2.7774050642003076e-06, "loss": 0.90383297, "num_input_tokens_seen": 70232540, "step": 3258, "time_per_iteration": 2.6419460773468018 }, { "auxiliary_loss_clip": 0.01195806, "auxiliary_loss_mlp": 0.01032292, "balance_loss_clip": 1.05708456, "balance_loss_mlp": 1.02342868, "epoch": 0.3918715805927975, "flos": 21872076664320.0, "grad_norm": 2.1078726511186603, "language_loss": 0.93146777, "learning_rate": 2.7766872893603896e-06, "loss": 0.95374876, "num_input_tokens_seen": 70252515, "step": 3259, "time_per_iteration": 2.6197965145111084 }, { "auxiliary_loss_clip": 0.01179842, "auxiliary_loss_mlp": 0.01031802, "balance_loss_clip": 1.05412698, "balance_loss_mlp": 1.02388048, "epoch": 0.39199182348343653, "flos": 20376181837440.0, "grad_norm": 1.722732431557317, "language_loss": 0.73325956, "learning_rate": 2.7759693966976275e-06, "loss": 0.75537598, "num_input_tokens_seen": 70271020, "step": 3260, "time_per_iteration": 2.6392407417297363 }, { "auxiliary_loss_clip": 0.01145359, "auxiliary_loss_mlp": 0.01034518, "balance_loss_clip": 1.04826474, "balance_loss_mlp": 1.02498722, "epoch": 0.39211206637407564, "flos": 21683545153920.0, "grad_norm": 2.357233885421734, "language_loss": 0.85097909, "learning_rate": 2.7752513863209242e-06, "loss": 0.87277782, "num_input_tokens_seen": 70289600, "step": 3261, "time_per_iteration": 2.718134880065918 }, { "auxiliary_loss_clip": 0.01157029, "auxiliary_loss_mlp": 0.00762821, "balance_loss_clip": 1.05265713, "balance_loss_mlp": 1.00026906, "epoch": 0.39223230926471475, "flos": 21066276908160.0, "grad_norm": 1.7100269778413049, "language_loss": 0.84515667, "learning_rate": 2.774533258339203e-06, "loss": 0.86435521, "num_input_tokens_seen": 70307060, "step": 3262, "time_per_iteration": 3.676752805709839 }, { "auxiliary_loss_clip": 0.01131442, "auxiliary_loss_mlp": 0.01028285, "balance_loss_clip": 1.04235506, "balance_loss_mlp": 1.02003574, "epoch": 0.3923525521553538, "flos": 17603016312960.0, "grad_norm": 2.048500285926174, "language_loss": 0.80160481, "learning_rate": 2.7738150128614014e-06, "loss": 0.82320207, "num_input_tokens_seen": 70324465, "step": 3263, "time_per_iteration": 3.6646718978881836 }, { "auxiliary_loss_clip": 0.01134961, "auxiliary_loss_mlp": 0.01028997, "balance_loss_clip": 1.04738784, "balance_loss_mlp": 1.02065217, "epoch": 0.3924727950459929, "flos": 20558284813440.0, "grad_norm": 1.881881923438459, "language_loss": 0.89740968, "learning_rate": 2.7730966499964777e-06, "loss": 0.91904926, "num_input_tokens_seen": 70341415, "step": 3264, "time_per_iteration": 2.6850924491882324 }, { "auxiliary_loss_clip": 0.01189869, "auxiliary_loss_mlp": 0.01025672, "balance_loss_clip": 1.05162907, "balance_loss_mlp": 1.01688612, "epoch": 0.39259303793663197, "flos": 16216110328320.0, "grad_norm": 2.58498421946209, "language_loss": 0.81064636, "learning_rate": 2.772378169853408e-06, "loss": 0.83280176, "num_input_tokens_seen": 70358985, "step": 3265, "time_per_iteration": 2.534372568130493 }, { "auxiliary_loss_clip": 0.01147779, "auxiliary_loss_mlp": 0.01028639, "balance_loss_clip": 1.05364573, "balance_loss_mlp": 1.02011514, "epoch": 0.3927132808272711, "flos": 16797001075200.0, "grad_norm": 1.7078610096899944, "language_loss": 0.74596488, "learning_rate": 2.771659572541183e-06, "loss": 0.76772904, "num_input_tokens_seen": 70376915, "step": 3266, "time_per_iteration": 2.678567409515381 }, { "auxiliary_loss_clip": 0.01177894, "auxiliary_loss_mlp": 0.01028644, "balance_loss_clip": 1.05329347, "balance_loss_mlp": 1.02065158, "epoch": 0.3928335237179102, "flos": 20267228908800.0, "grad_norm": 2.135971499697529, "language_loss": 0.86896163, "learning_rate": 2.7709408581688143e-06, "loss": 0.89102697, "num_input_tokens_seen": 70396900, "step": 3267, "time_per_iteration": 3.5650525093078613 }, { "auxiliary_loss_clip": 0.01153876, "auxiliary_loss_mlp": 0.01031102, "balance_loss_clip": 1.05228758, "balance_loss_mlp": 1.02300739, "epoch": 0.39295376660854925, "flos": 24973250209920.0, "grad_norm": 1.665627915089905, "language_loss": 0.880494, "learning_rate": 2.7702220268453307e-06, "loss": 0.90234375, "num_input_tokens_seen": 70417260, "step": 3268, "time_per_iteration": 3.6638028621673584 }, { "auxiliary_loss_clip": 0.01162943, "auxiliary_loss_mlp": 0.01027821, "balance_loss_clip": 1.05123186, "balance_loss_mlp": 1.01940477, "epoch": 0.39307400949918836, "flos": 18697788984960.0, "grad_norm": 1.8579330019904117, "language_loss": 0.84816486, "learning_rate": 2.7695030786797785e-06, "loss": 0.87007248, "num_input_tokens_seen": 70433155, "step": 3269, "time_per_iteration": 2.659712314605713 }, { "auxiliary_loss_clip": 0.01127149, "auxiliary_loss_mlp": 0.01032498, "balance_loss_clip": 1.04657769, "balance_loss_mlp": 1.02387953, "epoch": 0.39319425238982747, "flos": 22415476590720.0, "grad_norm": 2.1651085775961056, "language_loss": 0.7483176, "learning_rate": 2.7687840137812206e-06, "loss": 0.76991409, "num_input_tokens_seen": 70451240, "step": 3270, "time_per_iteration": 2.7541627883911133 }, { "auxiliary_loss_clip": 0.01071234, "auxiliary_loss_mlp": 0.01003551, "balance_loss_clip": 1.01649165, "balance_loss_mlp": 1.00188184, "epoch": 0.3933144952804665, "flos": 66192954762240.0, "grad_norm": 0.7997843179156587, "language_loss": 0.62061441, "learning_rate": 2.7680648322587395e-06, "loss": 0.64136225, "num_input_tokens_seen": 70516115, "step": 3271, "time_per_iteration": 3.2671632766723633 }, { "auxiliary_loss_clip": 0.01189042, "auxiliary_loss_mlp": 0.010227, "balance_loss_clip": 1.0529778, "balance_loss_mlp": 1.01400375, "epoch": 0.39343473817110564, "flos": 15487159720320.0, "grad_norm": 1.84275640504053, "language_loss": 0.8092705, "learning_rate": 2.7673455342214334e-06, "loss": 0.83138788, "num_input_tokens_seen": 70533105, "step": 3272, "time_per_iteration": 2.6199252605438232 }, { "auxiliary_loss_clip": 0.01175373, "auxiliary_loss_mlp": 0.01026792, "balance_loss_clip": 1.05246079, "balance_loss_mlp": 1.01823878, "epoch": 0.39355498106174475, "flos": 21324905809920.0, "grad_norm": 1.8421667791011052, "language_loss": 0.7594797, "learning_rate": 2.7666261197784198e-06, "loss": 0.78150141, "num_input_tokens_seen": 70551920, "step": 3273, "time_per_iteration": 2.624459981918335 }, { "auxiliary_loss_clip": 0.01157154, "auxiliary_loss_mlp": 0.01027736, "balance_loss_clip": 1.0512495, "balance_loss_mlp": 1.0192064, "epoch": 0.3936752239523838, "flos": 13296357400320.0, "grad_norm": 2.0744170495955148, "language_loss": 0.76261717, "learning_rate": 2.7659065890388336e-06, "loss": 0.78446609, "num_input_tokens_seen": 70567920, "step": 3274, "time_per_iteration": 2.6840105056762695 }, { "auxiliary_loss_clip": 0.01161917, "auxiliary_loss_mlp": 0.01022416, "balance_loss_clip": 1.04861045, "balance_loss_mlp": 1.01446438, "epoch": 0.3937954668430229, "flos": 16800161472000.0, "grad_norm": 1.9589124095976442, "language_loss": 0.85063368, "learning_rate": 2.7651869421118266e-06, "loss": 0.87247694, "num_input_tokens_seen": 70584530, "step": 3275, "time_per_iteration": 2.641868829727173 }, { "auxiliary_loss_clip": 0.0117918, "auxiliary_loss_mlp": 0.01031072, "balance_loss_clip": 1.0542475, "balance_loss_mlp": 1.02261996, "epoch": 0.393915709733662, "flos": 21064229832960.0, "grad_norm": 1.6363944918259072, "language_loss": 0.82835627, "learning_rate": 2.76446717910657e-06, "loss": 0.85045886, "num_input_tokens_seen": 70605235, "step": 3276, "time_per_iteration": 2.6670329570770264 }, { "auxiliary_loss_clip": 0.01171683, "auxiliary_loss_mlp": 0.0102985, "balance_loss_clip": 1.05105853, "balance_loss_mlp": 1.02183914, "epoch": 0.3940359526243011, "flos": 17165265264000.0, "grad_norm": 2.16535933780981, "language_loss": 0.77144289, "learning_rate": 2.763747300132249e-06, "loss": 0.79345822, "num_input_tokens_seen": 70622675, "step": 3277, "time_per_iteration": 2.575369358062744 }, { "auxiliary_loss_clip": 0.01189091, "auxiliary_loss_mlp": 0.0103102, "balance_loss_clip": 1.05437779, "balance_loss_mlp": 1.0230335, "epoch": 0.3941561955149402, "flos": 20995856294400.0, "grad_norm": 1.6533014300902698, "language_loss": 0.87055767, "learning_rate": 2.7630273052980704e-06, "loss": 0.89275885, "num_input_tokens_seen": 70643265, "step": 3278, "time_per_iteration": 2.660200357437134 }, { "auxiliary_loss_clip": 0.01149011, "auxiliary_loss_mlp": 0.01030767, "balance_loss_clip": 1.04919636, "balance_loss_mlp": 1.02280951, "epoch": 0.39427643840557924, "flos": 18843406721280.0, "grad_norm": 2.1760271054542, "language_loss": 0.66952157, "learning_rate": 2.7623071947132554e-06, "loss": 0.69131935, "num_input_tokens_seen": 70660295, "step": 3279, "time_per_iteration": 2.6336703300476074 }, { "auxiliary_loss_clip": 0.01166851, "auxiliary_loss_mlp": 0.0103343, "balance_loss_clip": 1.05001855, "balance_loss_mlp": 1.02486467, "epoch": 0.39439668129621835, "flos": 23258659426560.0, "grad_norm": 2.0493208448528404, "language_loss": 0.7907353, "learning_rate": 2.7615869684870458e-06, "loss": 0.81273806, "num_input_tokens_seen": 70679605, "step": 3280, "time_per_iteration": 2.7293214797973633 }, { "auxiliary_loss_clip": 0.0117302, "auxiliary_loss_mlp": 0.01031923, "balance_loss_clip": 1.05284441, "balance_loss_mlp": 1.02352476, "epoch": 0.39451692418685746, "flos": 26652289507200.0, "grad_norm": 1.8314226317763982, "language_loss": 0.8490175, "learning_rate": 2.7608666267286986e-06, "loss": 0.87106693, "num_input_tokens_seen": 70699835, "step": 3281, "time_per_iteration": 2.721987009048462 }, { "auxiliary_loss_clip": 0.01108433, "auxiliary_loss_mlp": 0.01029514, "balance_loss_clip": 1.04144669, "balance_loss_mlp": 1.02019751, "epoch": 0.3946371670774965, "flos": 18258709132800.0, "grad_norm": 2.226008972073127, "language_loss": 0.8671031, "learning_rate": 2.760146169547489e-06, "loss": 0.88848257, "num_input_tokens_seen": 70716600, "step": 3282, "time_per_iteration": 2.871910572052002 }, { "auxiliary_loss_clip": 0.01162818, "auxiliary_loss_mlp": 0.01032559, "balance_loss_clip": 1.05334008, "balance_loss_mlp": 1.02439284, "epoch": 0.39475740996813563, "flos": 24206126423040.0, "grad_norm": 1.7900601078095661, "language_loss": 0.7656064, "learning_rate": 2.75942559705271e-06, "loss": 0.78756022, "num_input_tokens_seen": 70736335, "step": 3283, "time_per_iteration": 2.7776174545288086 }, { "auxiliary_loss_clip": 0.01173996, "auxiliary_loss_mlp": 0.01034297, "balance_loss_clip": 1.05251741, "balance_loss_mlp": 1.02536559, "epoch": 0.39487765285877474, "flos": 19317858491520.0, "grad_norm": 1.83777635418339, "language_loss": 0.89197129, "learning_rate": 2.7587049093536713e-06, "loss": 0.91405421, "num_input_tokens_seen": 70752665, "step": 3284, "time_per_iteration": 2.670316457748413 }, { "auxiliary_loss_clip": 0.01178453, "auxiliary_loss_mlp": 0.01032626, "balance_loss_clip": 1.05219042, "balance_loss_mlp": 1.02502656, "epoch": 0.3949978957494138, "flos": 17311744926720.0, "grad_norm": 1.8707022125905621, "language_loss": 0.80775368, "learning_rate": 2.757984106559701e-06, "loss": 0.8298645, "num_input_tokens_seen": 70771650, "step": 3285, "time_per_iteration": 2.6315786838531494 }, { "auxiliary_loss_clip": 0.01153391, "auxiliary_loss_mlp": 0.01031104, "balance_loss_clip": 1.05038548, "balance_loss_mlp": 1.02317619, "epoch": 0.3951181386400529, "flos": 36317861280000.0, "grad_norm": 2.188978728937141, "language_loss": 0.71100348, "learning_rate": 2.7572631887801446e-06, "loss": 0.73284841, "num_input_tokens_seen": 70793275, "step": 3286, "time_per_iteration": 2.7904465198516846 }, { "auxiliary_loss_clip": 0.01172769, "auxiliary_loss_mlp": 0.01029586, "balance_loss_clip": 1.05044866, "balance_loss_mlp": 1.02129507, "epoch": 0.395238381530692, "flos": 23110348170240.0, "grad_norm": 2.9087878593243284, "language_loss": 0.76917583, "learning_rate": 2.7565421561243654e-06, "loss": 0.79119939, "num_input_tokens_seen": 70811440, "step": 3287, "time_per_iteration": 2.6572914123535156 }, { "auxiliary_loss_clip": 0.01137617, "auxiliary_loss_mlp": 0.01033207, "balance_loss_clip": 1.04558539, "balance_loss_mlp": 1.02516675, "epoch": 0.3953586244213311, "flos": 24347614095360.0, "grad_norm": 1.9668230777160247, "language_loss": 0.82250762, "learning_rate": 2.7558210087017413e-06, "loss": 0.84421587, "num_input_tokens_seen": 70831375, "step": 3288, "time_per_iteration": 3.7082912921905518 }, { "auxiliary_loss_clip": 0.01139674, "auxiliary_loss_mlp": 0.01033536, "balance_loss_clip": 1.04968596, "balance_loss_mlp": 1.02549517, "epoch": 0.3954788673119702, "flos": 23440080044160.0, "grad_norm": 1.7895638363943807, "language_loss": 0.73397362, "learning_rate": 2.7550997466216724e-06, "loss": 0.75570571, "num_input_tokens_seen": 70849170, "step": 3289, "time_per_iteration": 3.6803553104400635 }, { "auxiliary_loss_clip": 0.01159861, "auxiliary_loss_mlp": 0.01028373, "balance_loss_clip": 1.05411792, "balance_loss_mlp": 1.01976037, "epoch": 0.3955991102026093, "flos": 17494063384320.0, "grad_norm": 1.8328788200485375, "language_loss": 0.81208616, "learning_rate": 2.7543783699935714e-06, "loss": 0.83396852, "num_input_tokens_seen": 70867200, "step": 3290, "time_per_iteration": 2.6653316020965576 }, { "auxiliary_loss_clip": 0.01173881, "auxiliary_loss_mlp": 0.01027185, "balance_loss_clip": 1.05380416, "balance_loss_mlp": 1.01898956, "epoch": 0.39571935309324835, "flos": 18221326053120.0, "grad_norm": 2.6464533646277224, "language_loss": 0.86718297, "learning_rate": 2.753656878926872e-06, "loss": 0.88919365, "num_input_tokens_seen": 70883080, "step": 3291, "time_per_iteration": 2.608905553817749 }, { "auxiliary_loss_clip": 0.01147676, "auxiliary_loss_mlp": 0.01031308, "balance_loss_clip": 1.0471493, "balance_loss_mlp": 1.02260602, "epoch": 0.39583959598388746, "flos": 17748813617280.0, "grad_norm": 1.7503547695595836, "language_loss": 0.74183547, "learning_rate": 2.752935273531023e-06, "loss": 0.76362532, "num_input_tokens_seen": 70901230, "step": 3292, "time_per_iteration": 2.7029242515563965 }, { "auxiliary_loss_clip": 0.01178577, "auxiliary_loss_mlp": 0.01034588, "balance_loss_clip": 1.05354905, "balance_loss_mlp": 1.02599263, "epoch": 0.39595983887452657, "flos": 19352368483200.0, "grad_norm": 1.7293157528522765, "language_loss": 0.78497589, "learning_rate": 2.752213553915492e-06, "loss": 0.80710757, "num_input_tokens_seen": 70919585, "step": 3293, "time_per_iteration": 3.510363817214966 }, { "auxiliary_loss_clip": 0.01061534, "auxiliary_loss_mlp": 0.01002257, "balance_loss_clip": 1.0151794, "balance_loss_mlp": 1.00062394, "epoch": 0.3960800817651656, "flos": 60682282940160.0, "grad_norm": 0.8134377214714756, "language_loss": 0.66127127, "learning_rate": 2.751491720189762e-06, "loss": 0.6819092, "num_input_tokens_seen": 70977695, "step": 3294, "time_per_iteration": 4.217949151992798 }, { "auxiliary_loss_clip": 0.01158755, "auxiliary_loss_mlp": 0.00763134, "balance_loss_clip": 1.05002201, "balance_loss_mlp": 1.00027895, "epoch": 0.39620032465580474, "flos": 16836718538880.0, "grad_norm": 2.2694241855612702, "language_loss": 0.91804922, "learning_rate": 2.7507697724633364e-06, "loss": 0.93726814, "num_input_tokens_seen": 70994455, "step": 3295, "time_per_iteration": 2.690751075744629 }, { "auxiliary_loss_clip": 0.01055583, "auxiliary_loss_mlp": 0.01003122, "balance_loss_clip": 1.02379584, "balance_loss_mlp": 1.00156045, "epoch": 0.3963205675464438, "flos": 69071445941760.0, "grad_norm": 0.7763478260805954, "language_loss": 0.54676288, "learning_rate": 2.7500477108457327e-06, "loss": 0.56734997, "num_input_tokens_seen": 71046465, "step": 3296, "time_per_iteration": 3.097503423690796 }, { "auxiliary_loss_clip": 0.01173127, "auxiliary_loss_mlp": 0.01032655, "balance_loss_clip": 1.05075109, "balance_loss_mlp": 1.02374411, "epoch": 0.3964408104370829, "flos": 25667439431040.0, "grad_norm": 1.8696170358202702, "language_loss": 0.81289721, "learning_rate": 2.7493255354464877e-06, "loss": 0.83495498, "num_input_tokens_seen": 71064275, "step": 3297, "time_per_iteration": 2.711761951446533 }, { "auxiliary_loss_clip": 0.01057849, "auxiliary_loss_mlp": 0.01032407, "balance_loss_clip": 1.03625727, "balance_loss_mlp": 1.02412796, "epoch": 0.396561053327722, "flos": 24277480790400.0, "grad_norm": 1.9823858527518616, "language_loss": 0.76462704, "learning_rate": 2.748603246375156e-06, "loss": 0.78552955, "num_input_tokens_seen": 71082290, "step": 3298, "time_per_iteration": 3.0387825965881348 }, { "auxiliary_loss_clip": 0.01189598, "auxiliary_loss_mlp": 0.01031896, "balance_loss_clip": 1.05392694, "balance_loss_mlp": 1.02360511, "epoch": 0.39668129621836107, "flos": 20522302364160.0, "grad_norm": 2.349871843356602, "language_loss": 0.69587123, "learning_rate": 2.7478808437413055e-06, "loss": 0.71808618, "num_input_tokens_seen": 71101700, "step": 3299, "time_per_iteration": 2.94031023979187 }, { "auxiliary_loss_clip": 0.01129248, "auxiliary_loss_mlp": 0.01029332, "balance_loss_clip": 1.04940367, "balance_loss_mlp": 1.02089787, "epoch": 0.3968015391090002, "flos": 27052585649280.0, "grad_norm": 2.008476693530805, "language_loss": 0.65646708, "learning_rate": 2.7471583276545263e-06, "loss": 0.67805278, "num_input_tokens_seen": 71122360, "step": 3300, "time_per_iteration": 2.7558059692382812 }, { "auxiliary_loss_clip": 0.01162537, "auxiliary_loss_mlp": 0.01026737, "balance_loss_clip": 1.04982829, "balance_loss_mlp": 1.0183568, "epoch": 0.3969217819996393, "flos": 12531819392640.0, "grad_norm": 1.7732128166686139, "language_loss": 0.7046622, "learning_rate": 2.7464356982244224e-06, "loss": 0.72655493, "num_input_tokens_seen": 71140360, "step": 3301, "time_per_iteration": 2.621948480606079 }, { "auxiliary_loss_clip": 0.01084109, "auxiliary_loss_mlp": 0.01005014, "balance_loss_clip": 1.02908111, "balance_loss_mlp": 1.00338066, "epoch": 0.39704202489027834, "flos": 66241399230720.0, "grad_norm": 0.7747660906312722, "language_loss": 0.61686563, "learning_rate": 2.745712955560617e-06, "loss": 0.63775682, "num_input_tokens_seen": 71196565, "step": 3302, "time_per_iteration": 3.197483777999878 }, { "auxiliary_loss_clip": 0.01114237, "auxiliary_loss_mlp": 0.01034798, "balance_loss_clip": 1.04474783, "balance_loss_mlp": 1.02623868, "epoch": 0.39716226778091746, "flos": 16982982720000.0, "grad_norm": 2.4999283362716835, "language_loss": 0.77523601, "learning_rate": 2.7449900997727496e-06, "loss": 0.79672635, "num_input_tokens_seen": 71214675, "step": 3303, "time_per_iteration": 2.7255303859710693 }, { "auxiliary_loss_clip": 0.01161312, "auxiliary_loss_mlp": 0.01030326, "balance_loss_clip": 1.05265915, "balance_loss_mlp": 1.02238107, "epoch": 0.39728251067155657, "flos": 23477139901440.0, "grad_norm": 2.5514167667598158, "language_loss": 0.84041446, "learning_rate": 2.744267130970476e-06, "loss": 0.86233079, "num_input_tokens_seen": 71234400, "step": 3304, "time_per_iteration": 2.7456612586975098 }, { "auxiliary_loss_clip": 0.01155175, "auxiliary_loss_mlp": 0.01026763, "balance_loss_clip": 1.05139291, "balance_loss_mlp": 1.01778615, "epoch": 0.3974027535621956, "flos": 20704441253760.0, "grad_norm": 2.5148021967809098, "language_loss": 0.76962405, "learning_rate": 2.7435440492634697e-06, "loss": 0.79144335, "num_input_tokens_seen": 71253725, "step": 3305, "time_per_iteration": 2.7264208793640137 }, { "auxiliary_loss_clip": 0.0116314, "auxiliary_loss_mlp": 0.01032642, "balance_loss_clip": 1.05099583, "balance_loss_mlp": 1.02259862, "epoch": 0.39752299645283473, "flos": 21543278544000.0, "grad_norm": 12.165097585589187, "language_loss": 0.67216229, "learning_rate": 2.7428208547614228e-06, "loss": 0.69412005, "num_input_tokens_seen": 71273220, "step": 3306, "time_per_iteration": 2.659529685974121 }, { "auxiliary_loss_clip": 0.01175735, "auxiliary_loss_mlp": 0.01031367, "balance_loss_clip": 1.05262423, "balance_loss_mlp": 1.02259362, "epoch": 0.39764323934347384, "flos": 19208295031680.0, "grad_norm": 1.8369676237465387, "language_loss": 0.76961166, "learning_rate": 2.742097547574043e-06, "loss": 0.79168272, "num_input_tokens_seen": 71291445, "step": 3307, "time_per_iteration": 2.8765435218811035 }, { "auxiliary_loss_clip": 0.01168439, "auxiliary_loss_mlp": 0.00763848, "balance_loss_clip": 1.05097914, "balance_loss_mlp": 1.00029302, "epoch": 0.3977634822341129, "flos": 20850202644480.0, "grad_norm": 1.8755128724099919, "language_loss": 0.77606028, "learning_rate": 2.7413741278110544e-06, "loss": 0.79538316, "num_input_tokens_seen": 71310135, "step": 3308, "time_per_iteration": 2.6444544792175293 }, { "auxiliary_loss_clip": 0.01166398, "auxiliary_loss_mlp": 0.01034785, "balance_loss_clip": 1.05355799, "balance_loss_mlp": 1.02555859, "epoch": 0.397883725124752, "flos": 39786042038400.0, "grad_norm": 2.040696608930875, "language_loss": 0.68837333, "learning_rate": 2.7406505955822016e-06, "loss": 0.7103852, "num_input_tokens_seen": 71331160, "step": 3309, "time_per_iteration": 2.8109352588653564 }, { "auxiliary_loss_clip": 0.0115898, "auxiliary_loss_mlp": 0.01033317, "balance_loss_clip": 1.04855728, "balance_loss_mlp": 1.02406669, "epoch": 0.39800396801539106, "flos": 17379507934080.0, "grad_norm": 5.533467068506392, "language_loss": 0.66017938, "learning_rate": 2.7399269509972415e-06, "loss": 0.68210244, "num_input_tokens_seen": 71345315, "step": 3310, "time_per_iteration": 2.645211696624756 }, { "auxiliary_loss_clip": 0.01154239, "auxiliary_loss_mlp": 0.01034664, "balance_loss_clip": 1.04563653, "balance_loss_mlp": 1.025455, "epoch": 0.3981242109060302, "flos": 19202764337280.0, "grad_norm": 2.423686333899025, "language_loss": 0.85175246, "learning_rate": 2.7392031941659514e-06, "loss": 0.87364155, "num_input_tokens_seen": 71363160, "step": 3311, "time_per_iteration": 2.664438009262085 }, { "auxiliary_loss_clip": 0.01163789, "auxiliary_loss_mlp": 0.01030214, "balance_loss_clip": 1.05502772, "balance_loss_mlp": 1.02180362, "epoch": 0.3982444537966693, "flos": 24565124903040.0, "grad_norm": 1.7576411331503077, "language_loss": 0.85898459, "learning_rate": 2.7384793251981244e-06, "loss": 0.8809247, "num_input_tokens_seen": 71382145, "step": 3312, "time_per_iteration": 2.7371463775634766 }, { "auxiliary_loss_clip": 0.01181101, "auxiliary_loss_mlp": 0.01027609, "balance_loss_clip": 1.05138075, "balance_loss_mlp": 1.01884079, "epoch": 0.39836469668730834, "flos": 26213856099840.0, "grad_norm": 2.2134529035006585, "language_loss": 0.80922896, "learning_rate": 2.737755344203571e-06, "loss": 0.83131599, "num_input_tokens_seen": 71402095, "step": 3313, "time_per_iteration": 2.6667544841766357 }, { "auxiliary_loss_clip": 0.01180943, "auxiliary_loss_mlp": 0.01038909, "balance_loss_clip": 1.05660915, "balance_loss_mlp": 1.02968788, "epoch": 0.39848493957794745, "flos": 27636134002560.0, "grad_norm": 1.70234152637495, "language_loss": 0.79997671, "learning_rate": 2.7370312512921186e-06, "loss": 0.82217526, "num_input_tokens_seen": 71423875, "step": 3314, "time_per_iteration": 4.052127122879028 }, { "auxiliary_loss_clip": 0.01165812, "auxiliary_loss_mlp": 0.01034841, "balance_loss_clip": 1.04987502, "balance_loss_mlp": 1.02529788, "epoch": 0.39860518246858656, "flos": 12239326944000.0, "grad_norm": 3.578503387245907, "language_loss": 0.77089137, "learning_rate": 2.736307046573611e-06, "loss": 0.79289794, "num_input_tokens_seen": 71439745, "step": 3315, "time_per_iteration": 3.555344343185425 }, { "auxiliary_loss_clip": 0.01189885, "auxiliary_loss_mlp": 0.01032491, "balance_loss_clip": 1.05355489, "balance_loss_mlp": 1.02408075, "epoch": 0.3987254253592256, "flos": 22379135005440.0, "grad_norm": 1.7943814777909188, "language_loss": 0.82075483, "learning_rate": 2.73558273015791e-06, "loss": 0.8429786, "num_input_tokens_seen": 71459575, "step": 3316, "time_per_iteration": 2.607576608657837 }, { "auxiliary_loss_clip": 0.01195831, "auxiliary_loss_mlp": 0.01031418, "balance_loss_clip": 1.05743909, "balance_loss_mlp": 1.0221858, "epoch": 0.3988456682498647, "flos": 23514020190720.0, "grad_norm": 2.441538984034888, "language_loss": 0.70620215, "learning_rate": 2.734858302154894e-06, "loss": 0.72847468, "num_input_tokens_seen": 71481075, "step": 3317, "time_per_iteration": 2.6018290519714355 }, { "auxiliary_loss_clip": 0.01160172, "auxiliary_loss_mlp": 0.01028004, "balance_loss_clip": 1.05379367, "balance_loss_mlp": 1.01918244, "epoch": 0.39896591114050384, "flos": 19208761908480.0, "grad_norm": 1.984612921237406, "language_loss": 0.7665956, "learning_rate": 2.734133762674457e-06, "loss": 0.78847736, "num_input_tokens_seen": 71500665, "step": 3318, "time_per_iteration": 2.6281869411468506 }, { "auxiliary_loss_clip": 0.01164232, "auxiliary_loss_mlp": 0.01025032, "balance_loss_clip": 1.05305111, "balance_loss_mlp": 1.0164547, "epoch": 0.3990861540311429, "flos": 28401031146240.0, "grad_norm": 2.4561909681602865, "language_loss": 0.70768166, "learning_rate": 2.7334091118265124e-06, "loss": 0.72957432, "num_input_tokens_seen": 71522560, "step": 3319, "time_per_iteration": 3.636202812194824 }, { "auxiliary_loss_clip": 0.01080012, "auxiliary_loss_mlp": 0.01001407, "balance_loss_clip": 1.01838899, "balance_loss_mlp": 0.99945182, "epoch": 0.399206396921782, "flos": 61758563086080.0, "grad_norm": 0.6805481396843899, "language_loss": 0.57817316, "learning_rate": 2.732684349720989e-06, "loss": 0.59898734, "num_input_tokens_seen": 71590520, "step": 3320, "time_per_iteration": 4.1919169425964355 }, { "auxiliary_loss_clip": 0.01153523, "auxiliary_loss_mlp": 0.01033542, "balance_loss_clip": 1.04948258, "balance_loss_mlp": 1.02439845, "epoch": 0.3993266398124211, "flos": 28074567409920.0, "grad_norm": 3.829545141629055, "language_loss": 0.75401735, "learning_rate": 2.7319594764678318e-06, "loss": 0.77588809, "num_input_tokens_seen": 71612620, "step": 3321, "time_per_iteration": 2.7326695919036865 }, { "auxiliary_loss_clip": 0.01134226, "auxiliary_loss_mlp": 0.010286, "balance_loss_clip": 1.0496093, "balance_loss_mlp": 1.01936126, "epoch": 0.39944688270306017, "flos": 23225083188480.0, "grad_norm": 1.8305349570287492, "language_loss": 0.82958949, "learning_rate": 2.7312344921770044e-06, "loss": 0.85121775, "num_input_tokens_seen": 71634320, "step": 3322, "time_per_iteration": 2.802638530731201 }, { "auxiliary_loss_clip": 0.01161665, "auxiliary_loss_mlp": 0.01029842, "balance_loss_clip": 1.04775381, "balance_loss_mlp": 1.02120531, "epoch": 0.3995671255936993, "flos": 19390433921280.0, "grad_norm": 2.115997871326739, "language_loss": 0.78557765, "learning_rate": 2.7305093969584857e-06, "loss": 0.80749273, "num_input_tokens_seen": 71653145, "step": 3323, "time_per_iteration": 2.6603786945343018 }, { "auxiliary_loss_clip": 0.01169971, "auxiliary_loss_mlp": 0.0103663, "balance_loss_clip": 1.05057502, "balance_loss_mlp": 1.02776074, "epoch": 0.3996873684843384, "flos": 23842638743040.0, "grad_norm": 1.9888715428305184, "language_loss": 0.80015278, "learning_rate": 2.729784190922272e-06, "loss": 0.82221878, "num_input_tokens_seen": 71674580, "step": 3324, "time_per_iteration": 2.6845834255218506 }, { "auxiliary_loss_clip": 0.01065909, "auxiliary_loss_mlp": 0.01000343, "balance_loss_clip": 1.01783121, "balance_loss_mlp": 0.99845934, "epoch": 0.39980761137497745, "flos": 66576877280640.0, "grad_norm": 0.9776976738978921, "language_loss": 0.57219929, "learning_rate": 2.729058874178378e-06, "loss": 0.59286183, "num_input_tokens_seen": 71745260, "step": 3325, "time_per_iteration": 3.371251106262207 }, { "auxiliary_loss_clip": 0.01168735, "auxiliary_loss_mlp": 0.01029463, "balance_loss_clip": 1.05478597, "balance_loss_mlp": 1.02034318, "epoch": 0.39992785426561656, "flos": 28549162834560.0, "grad_norm": 1.716591218333367, "language_loss": 0.69266146, "learning_rate": 2.7283334468368315e-06, "loss": 0.71464348, "num_input_tokens_seen": 71766540, "step": 3326, "time_per_iteration": 2.822084903717041 }, { "auxiliary_loss_clip": 0.01086083, "auxiliary_loss_mlp": 0.01039381, "balance_loss_clip": 1.03837895, "balance_loss_mlp": 1.03011239, "epoch": 0.4000480971562556, "flos": 15049408671360.0, "grad_norm": 2.098367929150586, "language_loss": 0.72689539, "learning_rate": 2.72760790900768e-06, "loss": 0.74815005, "num_input_tokens_seen": 71783125, "step": 3327, "time_per_iteration": 2.915095806121826 }, { "auxiliary_loss_clip": 0.01196851, "auxiliary_loss_mlp": 0.01028429, "balance_loss_clip": 1.05870819, "balance_loss_mlp": 1.01929784, "epoch": 0.4001683400468947, "flos": 23915609222400.0, "grad_norm": 1.9378903106590362, "language_loss": 0.78853273, "learning_rate": 2.7268822608009875e-06, "loss": 0.81078553, "num_input_tokens_seen": 71802500, "step": 3328, "time_per_iteration": 3.021928548812866 }, { "auxiliary_loss_clip": 0.01154705, "auxiliary_loss_mlp": 0.01030667, "balance_loss_clip": 1.05227137, "balance_loss_mlp": 1.02184582, "epoch": 0.40028858293753383, "flos": 24352677912960.0, "grad_norm": 1.769176814042345, "language_loss": 0.78424346, "learning_rate": 2.726156502326834e-06, "loss": 0.80609715, "num_input_tokens_seen": 71823800, "step": 3329, "time_per_iteration": 2.926999092102051 }, { "auxiliary_loss_clip": 0.01047138, "auxiliary_loss_mlp": 0.01013151, "balance_loss_clip": 1.02750325, "balance_loss_mlp": 1.01145864, "epoch": 0.4004088258281729, "flos": 66787025800320.0, "grad_norm": 0.7189636731023401, "language_loss": 0.60252976, "learning_rate": 2.725430633695316e-06, "loss": 0.62313271, "num_input_tokens_seen": 71886880, "step": 3330, "time_per_iteration": 3.39568829536438 }, { "auxiliary_loss_clip": 0.0108778, "auxiliary_loss_mlp": 0.0100236, "balance_loss_clip": 1.01690853, "balance_loss_mlp": 1.00045288, "epoch": 0.400529068718812, "flos": 58598386473600.0, "grad_norm": 0.8851323340201467, "language_loss": 0.57932782, "learning_rate": 2.7247046550165485e-06, "loss": 0.6002292, "num_input_tokens_seen": 71939005, "step": 3331, "time_per_iteration": 3.0423264503479004 }, { "auxiliary_loss_clip": 0.01195216, "auxiliary_loss_mlp": 0.01036351, "balance_loss_clip": 1.05819702, "balance_loss_mlp": 1.02706468, "epoch": 0.4006493116094511, "flos": 25377460934400.0, "grad_norm": 1.5004161641865668, "language_loss": 0.75982219, "learning_rate": 2.7239785664006606e-06, "loss": 0.78213787, "num_input_tokens_seen": 71962545, "step": 3332, "time_per_iteration": 2.649212121963501 }, { "auxiliary_loss_clip": 0.01078314, "auxiliary_loss_mlp": 0.01002087, "balance_loss_clip": 1.01594186, "balance_loss_mlp": 1.00019193, "epoch": 0.40076955450009016, "flos": 60280729822080.0, "grad_norm": 0.7750669154745256, "language_loss": 0.61813343, "learning_rate": 2.7232523679578002e-06, "loss": 0.63893747, "num_input_tokens_seen": 72025625, "step": 3333, "time_per_iteration": 3.2458696365356445 }, { "auxiliary_loss_clip": 0.01180633, "auxiliary_loss_mlp": 0.01029494, "balance_loss_clip": 1.05719852, "balance_loss_mlp": 1.02073801, "epoch": 0.4008897973907293, "flos": 16617268396800.0, "grad_norm": 2.1596398148786204, "language_loss": 0.79151446, "learning_rate": 2.7225260597981295e-06, "loss": 0.81361568, "num_input_tokens_seen": 72043330, "step": 3334, "time_per_iteration": 2.6059205532073975 }, { "auxiliary_loss_clip": 0.01147344, "auxiliary_loss_mlp": 0.00764893, "balance_loss_clip": 1.05241692, "balance_loss_mlp": 1.00026309, "epoch": 0.4010100402813684, "flos": 15377344865280.0, "grad_norm": 2.278370680455405, "language_loss": 0.78582919, "learning_rate": 2.721799642031831e-06, "loss": 0.80495155, "num_input_tokens_seen": 72059500, "step": 3335, "time_per_iteration": 2.6479475498199463 }, { "auxiliary_loss_clip": 0.01167957, "auxiliary_loss_mlp": 0.01031517, "balance_loss_clip": 1.04987609, "balance_loss_mlp": 1.02223039, "epoch": 0.40113028317200744, "flos": 13298835438720.0, "grad_norm": 2.046134127025573, "language_loss": 0.77734417, "learning_rate": 2.721073114769101e-06, "loss": 0.79933894, "num_input_tokens_seen": 72077175, "step": 3336, "time_per_iteration": 2.6756248474121094 }, { "auxiliary_loss_clip": 0.01147524, "auxiliary_loss_mlp": 0.01035808, "balance_loss_clip": 1.05191016, "balance_loss_mlp": 1.0269388, "epoch": 0.40125052606264655, "flos": 20668027841280.0, "grad_norm": 1.9107146883881425, "language_loss": 0.75309014, "learning_rate": 2.7203464781201523e-06, "loss": 0.7749235, "num_input_tokens_seen": 72096490, "step": 3337, "time_per_iteration": 2.743034839630127 }, { "auxiliary_loss_clip": 0.01195975, "auxiliary_loss_mlp": 0.01030916, "balance_loss_clip": 1.05724621, "balance_loss_mlp": 1.02160621, "epoch": 0.40137076895328566, "flos": 24607679541120.0, "grad_norm": 2.8376406164492463, "language_loss": 0.78402638, "learning_rate": 2.719619732195215e-06, "loss": 0.80629534, "num_input_tokens_seen": 72118130, "step": 3338, "time_per_iteration": 2.8128926753997803 }, { "auxiliary_loss_clip": 0.01148213, "auxiliary_loss_mlp": 0.01029467, "balance_loss_clip": 1.04947972, "balance_loss_mlp": 1.02024007, "epoch": 0.4014910118439247, "flos": 24206593299840.0, "grad_norm": 1.4420356161670347, "language_loss": 0.72776377, "learning_rate": 2.7188928771045377e-06, "loss": 0.74954057, "num_input_tokens_seen": 72139450, "step": 3339, "time_per_iteration": 2.7620186805725098 }, { "auxiliary_loss_clip": 0.01142272, "auxiliary_loss_mlp": 0.01037941, "balance_loss_clip": 1.04638624, "balance_loss_mlp": 1.02852905, "epoch": 0.4016112547345638, "flos": 26725080418560.0, "grad_norm": 1.710719677298961, "language_loss": 0.80215156, "learning_rate": 2.7181659129583815e-06, "loss": 0.82395369, "num_input_tokens_seen": 72159040, "step": 3340, "time_per_iteration": 5.0075767040252686 }, { "auxiliary_loss_clip": 0.01152122, "auxiliary_loss_mlp": 0.01037673, "balance_loss_clip": 1.04534245, "balance_loss_mlp": 1.02926898, "epoch": 0.4017314976252029, "flos": 21288025520640.0, "grad_norm": 1.8342339970227368, "language_loss": 0.7542851, "learning_rate": 2.7174388398670276e-06, "loss": 0.77618307, "num_input_tokens_seen": 72178220, "step": 3341, "time_per_iteration": 2.7579185962677 }, { "auxiliary_loss_clip": 0.01191278, "auxiliary_loss_mlp": 0.01039034, "balance_loss_clip": 1.05147755, "balance_loss_mlp": 1.0296042, "epoch": 0.401851740515842, "flos": 25484690010240.0, "grad_norm": 2.586494650451684, "language_loss": 0.92139453, "learning_rate": 2.716711657940773e-06, "loss": 0.94369769, "num_input_tokens_seen": 72199230, "step": 3342, "time_per_iteration": 2.75146746635437 }, { "auxiliary_loss_clip": 0.01052179, "auxiliary_loss_mlp": 0.01004187, "balance_loss_clip": 1.01323485, "balance_loss_mlp": 1.00231504, "epoch": 0.4019719834064811, "flos": 55395334978560.0, "grad_norm": 0.8140787817874595, "language_loss": 0.56506443, "learning_rate": 2.7159843672899284e-06, "loss": 0.58562815, "num_input_tokens_seen": 72263430, "step": 3343, "time_per_iteration": 3.5167477130889893 }, { "auxiliary_loss_clip": 0.01180066, "auxiliary_loss_mlp": 0.01028252, "balance_loss_clip": 1.05608511, "balance_loss_mlp": 1.01930583, "epoch": 0.40209222629712016, "flos": 18180100218240.0, "grad_norm": 2.4691193874804185, "language_loss": 0.81230664, "learning_rate": 2.715256968024825e-06, "loss": 0.83438981, "num_input_tokens_seen": 72280505, "step": 3344, "time_per_iteration": 2.8045685291290283 }, { "auxiliary_loss_clip": 0.01170843, "auxiliary_loss_mlp": 0.0103755, "balance_loss_clip": 1.05308807, "balance_loss_mlp": 1.02850509, "epoch": 0.40221246918775927, "flos": 25961009287680.0, "grad_norm": 1.6619518474644552, "language_loss": 0.82529593, "learning_rate": 2.7145294602558083e-06, "loss": 0.8473798, "num_input_tokens_seen": 72301215, "step": 3345, "time_per_iteration": 3.5982537269592285 }, { "auxiliary_loss_clip": 0.0117681, "auxiliary_loss_mlp": 0.01027381, "balance_loss_clip": 1.05270886, "balance_loss_mlp": 1.01854789, "epoch": 0.4023327120783984, "flos": 33838912056960.0, "grad_norm": 7.518337118082625, "language_loss": 0.70958918, "learning_rate": 2.713801844093241e-06, "loss": 0.7316311, "num_input_tokens_seen": 72322365, "step": 3346, "time_per_iteration": 3.8035991191864014 }, { "auxiliary_loss_clip": 0.01175828, "auxiliary_loss_mlp": 0.010331, "balance_loss_clip": 1.0522089, "balance_loss_mlp": 1.02454615, "epoch": 0.40245295496903744, "flos": 26900252069760.0, "grad_norm": 32.60576870945136, "language_loss": 0.88112581, "learning_rate": 2.7130741196475014e-06, "loss": 0.90321511, "num_input_tokens_seen": 72340495, "step": 3347, "time_per_iteration": 2.677321434020996 }, { "auxiliary_loss_clip": 0.01164544, "auxiliary_loss_mlp": 0.0103443, "balance_loss_clip": 1.05300975, "balance_loss_mlp": 1.02460098, "epoch": 0.40257319785967655, "flos": 36902738436480.0, "grad_norm": 2.0108775388843068, "language_loss": 0.79677248, "learning_rate": 2.7123462870289848e-06, "loss": 0.8187623, "num_input_tokens_seen": 72360545, "step": 3348, "time_per_iteration": 2.771833658218384 }, { "auxiliary_loss_clip": 0.01163743, "auxiliary_loss_mlp": 0.01025773, "balance_loss_clip": 1.04942977, "balance_loss_mlp": 1.01699924, "epoch": 0.40269344075031566, "flos": 24353180703360.0, "grad_norm": 1.7004776901740828, "language_loss": 0.81289601, "learning_rate": 2.711618346348102e-06, "loss": 0.83479124, "num_input_tokens_seen": 72381070, "step": 3349, "time_per_iteration": 2.755070209503174 }, { "auxiliary_loss_clip": 0.01154699, "auxiliary_loss_mlp": 0.01030103, "balance_loss_clip": 1.04930127, "balance_loss_mlp": 1.02197933, "epoch": 0.4028136836409547, "flos": 14389657614720.0, "grad_norm": 1.757028205950734, "language_loss": 0.6378935, "learning_rate": 2.7108902977152825e-06, "loss": 0.65974152, "num_input_tokens_seen": 72398970, "step": 3350, "time_per_iteration": 2.6395018100738525 }, { "auxiliary_loss_clip": 0.01173524, "auxiliary_loss_mlp": 0.01036605, "balance_loss_clip": 1.05040336, "balance_loss_mlp": 1.02716386, "epoch": 0.4029339265315938, "flos": 26136037284480.0, "grad_norm": 3.0150944152273556, "language_loss": 0.75292552, "learning_rate": 2.7101621412409704e-06, "loss": 0.7750268, "num_input_tokens_seen": 72418455, "step": 3351, "time_per_iteration": 2.703063488006592 }, { "auxiliary_loss_clip": 0.01190062, "auxiliary_loss_mlp": 0.01033664, "balance_loss_clip": 1.05204475, "balance_loss_mlp": 1.02421069, "epoch": 0.40305416942223293, "flos": 23256325042560.0, "grad_norm": 1.9372006311851622, "language_loss": 0.85830271, "learning_rate": 2.7094338770356256e-06, "loss": 0.88053989, "num_input_tokens_seen": 72437540, "step": 3352, "time_per_iteration": 2.581650495529175 }, { "auxiliary_loss_clip": 0.0115904, "auxiliary_loss_mlp": 0.01028213, "balance_loss_clip": 1.05164421, "balance_loss_mlp": 1.01915264, "epoch": 0.403174412312872, "flos": 27089645506560.0, "grad_norm": 2.3727004174171764, "language_loss": 0.64259219, "learning_rate": 2.708705505209726e-06, "loss": 0.66446477, "num_input_tokens_seen": 72458315, "step": 3353, "time_per_iteration": 2.7378830909729004 }, { "auxiliary_loss_clip": 0.01122975, "auxiliary_loss_mlp": 0.01031302, "balance_loss_clip": 1.04393339, "balance_loss_mlp": 1.02265382, "epoch": 0.4032946552035111, "flos": 21756336065280.0, "grad_norm": 2.060602107262452, "language_loss": 0.91677594, "learning_rate": 2.7079770258737646e-06, "loss": 0.93831873, "num_input_tokens_seen": 72476225, "step": 3354, "time_per_iteration": 2.74843692779541 }, { "auxiliary_loss_clip": 0.01141606, "auxiliary_loss_mlp": 0.01024966, "balance_loss_clip": 1.04623973, "balance_loss_mlp": 1.01554227, "epoch": 0.4034148980941502, "flos": 17343956448000.0, "grad_norm": 2.1771557639736643, "language_loss": 0.75590217, "learning_rate": 2.707248439138251e-06, "loss": 0.77756786, "num_input_tokens_seen": 72492460, "step": 3355, "time_per_iteration": 2.766432285308838 }, { "auxiliary_loss_clip": 0.01162175, "auxiliary_loss_mlp": 0.01028162, "balance_loss_clip": 1.05625677, "balance_loss_mlp": 1.01897132, "epoch": 0.40353514098478926, "flos": 22017838055040.0, "grad_norm": 1.6372049670255207, "language_loss": 0.65129638, "learning_rate": 2.7065197451137114e-06, "loss": 0.67319977, "num_input_tokens_seen": 72513840, "step": 3356, "time_per_iteration": 2.740413188934326 }, { "auxiliary_loss_clip": 0.01161119, "auxiliary_loss_mlp": 0.01032338, "balance_loss_clip": 1.05016232, "balance_loss_mlp": 1.02416027, "epoch": 0.4036553838754284, "flos": 14246446089600.0, "grad_norm": 2.141516832158929, "language_loss": 0.67554617, "learning_rate": 2.7057909439106894e-06, "loss": 0.69748074, "num_input_tokens_seen": 72531695, "step": 3357, "time_per_iteration": 2.6938605308532715 }, { "auxiliary_loss_clip": 0.01168132, "auxiliary_loss_mlp": 0.00764163, "balance_loss_clip": 1.05079746, "balance_loss_mlp": 1.00029349, "epoch": 0.40377562676606743, "flos": 24790644443520.0, "grad_norm": 1.8912302859982433, "language_loss": 0.7859785, "learning_rate": 2.7050620356397417e-06, "loss": 0.80530143, "num_input_tokens_seen": 72550645, "step": 3358, "time_per_iteration": 2.7090680599212646 }, { "auxiliary_loss_clip": 0.01190659, "auxiliary_loss_mlp": 0.01031667, "balance_loss_clip": 1.05770409, "balance_loss_mlp": 1.02336454, "epoch": 0.40389586965670654, "flos": 24061226958720.0, "grad_norm": 1.7863046769943873, "language_loss": 0.72286141, "learning_rate": 2.7043330204114437e-06, "loss": 0.74508464, "num_input_tokens_seen": 72569355, "step": 3359, "time_per_iteration": 2.636376142501831 }, { "auxiliary_loss_clip": 0.01184721, "auxiliary_loss_mlp": 0.01032839, "balance_loss_clip": 1.05042005, "balance_loss_mlp": 1.02388668, "epoch": 0.40401611254734565, "flos": 16399613934720.0, "grad_norm": 1.8599530996429623, "language_loss": 0.8577776, "learning_rate": 2.7036038983363862e-06, "loss": 0.87995321, "num_input_tokens_seen": 72585960, "step": 3360, "time_per_iteration": 2.6341536045074463 }, { "auxiliary_loss_clip": 0.01170526, "auxiliary_loss_mlp": 0.01029514, "balance_loss_clip": 1.05127835, "balance_loss_mlp": 1.02185464, "epoch": 0.4041363554379847, "flos": 23988220565760.0, "grad_norm": 1.9301631413395646, "language_loss": 0.84443474, "learning_rate": 2.702874669525177e-06, "loss": 0.86643511, "num_input_tokens_seen": 72604440, "step": 3361, "time_per_iteration": 2.7052724361419678 }, { "auxiliary_loss_clip": 0.01149569, "auxiliary_loss_mlp": 0.01038703, "balance_loss_clip": 1.05292308, "balance_loss_mlp": 1.02945244, "epoch": 0.4042565983286238, "flos": 28401964899840.0, "grad_norm": 2.3701503356677978, "language_loss": 0.69578612, "learning_rate": 2.7021453340884394e-06, "loss": 0.71766883, "num_input_tokens_seen": 72622165, "step": 3362, "time_per_iteration": 2.7761149406433105 }, { "auxiliary_loss_clip": 0.01150716, "auxiliary_loss_mlp": 0.00764081, "balance_loss_clip": 1.04915333, "balance_loss_mlp": 1.00030756, "epoch": 0.40437684121926293, "flos": 17710963660800.0, "grad_norm": 2.6771337623613785, "language_loss": 0.73206699, "learning_rate": 2.7014158921368125e-06, "loss": 0.75121498, "num_input_tokens_seen": 72640490, "step": 3363, "time_per_iteration": 2.6142802238464355 }, { "auxiliary_loss_clip": 0.01190565, "auxiliary_loss_mlp": 0.0103315, "balance_loss_clip": 1.05473757, "balance_loss_mlp": 1.02410769, "epoch": 0.404497084109902, "flos": 24018959629440.0, "grad_norm": 2.1035454991349183, "language_loss": 0.85900688, "learning_rate": 2.700686343780953e-06, "loss": 0.88124406, "num_input_tokens_seen": 72660360, "step": 3364, "time_per_iteration": 2.667922019958496 }, { "auxiliary_loss_clip": 0.01161059, "auxiliary_loss_mlp": 0.01031087, "balance_loss_clip": 1.04934573, "balance_loss_mlp": 1.02266479, "epoch": 0.4046173270005411, "flos": 22929861306240.0, "grad_norm": 2.1521049039879734, "language_loss": 0.88914543, "learning_rate": 2.699956689131532e-06, "loss": 0.91106689, "num_input_tokens_seen": 72680345, "step": 3365, "time_per_iteration": 3.583500862121582 }, { "auxiliary_loss_clip": 0.01163903, "auxiliary_loss_mlp": 0.01031692, "balance_loss_clip": 1.05144024, "balance_loss_mlp": 1.0231154, "epoch": 0.4047375698911802, "flos": 20668135582080.0, "grad_norm": 2.1756835897602143, "language_loss": 0.84939182, "learning_rate": 2.699226928299238e-06, "loss": 0.87134778, "num_input_tokens_seen": 72698365, "step": 3366, "time_per_iteration": 3.6626839637756348 }, { "auxiliary_loss_clip": 0.01176438, "auxiliary_loss_mlp": 0.01029436, "balance_loss_clip": 1.05261278, "balance_loss_mlp": 1.02144301, "epoch": 0.40485781278181926, "flos": 28912865996160.0, "grad_norm": 2.5999735884616757, "language_loss": 0.79622382, "learning_rate": 2.698497061394774e-06, "loss": 0.81828254, "num_input_tokens_seen": 72716850, "step": 3367, "time_per_iteration": 2.698781728744507 }, { "auxiliary_loss_clip": 0.01154656, "auxiliary_loss_mlp": 0.00763424, "balance_loss_clip": 1.05391335, "balance_loss_mlp": 1.00035322, "epoch": 0.40497805567245837, "flos": 23148377694720.0, "grad_norm": 1.643268987352253, "language_loss": 0.80848855, "learning_rate": 2.6977670885288627e-06, "loss": 0.82766938, "num_input_tokens_seen": 72738250, "step": 3368, "time_per_iteration": 2.7300407886505127 }, { "auxiliary_loss_clip": 0.0114866, "auxiliary_loss_mlp": 0.01031353, "balance_loss_clip": 1.04760242, "balance_loss_mlp": 1.0229249, "epoch": 0.4050982985630975, "flos": 16289404030080.0, "grad_norm": 2.180802719252512, "language_loss": 0.75093657, "learning_rate": 2.6970370098122378e-06, "loss": 0.77273673, "num_input_tokens_seen": 72755235, "step": 3369, "time_per_iteration": 2.7568423748016357 }, { "auxiliary_loss_clip": 0.0118753, "auxiliary_loss_mlp": 0.01033202, "balance_loss_clip": 1.05186391, "balance_loss_mlp": 1.02507782, "epoch": 0.40521854145373654, "flos": 34459484353920.0, "grad_norm": 1.8653736204988296, "language_loss": 0.86856151, "learning_rate": 2.6963068253556535e-06, "loss": 0.89076877, "num_input_tokens_seen": 72776620, "step": 3370, "time_per_iteration": 3.896428346633911 }, { "auxiliary_loss_clip": 0.01180595, "auxiliary_loss_mlp": 0.01033158, "balance_loss_clip": 1.04958153, "balance_loss_mlp": 1.02418184, "epoch": 0.40533878434437565, "flos": 25331099454720.0, "grad_norm": 1.9435232898736479, "language_loss": 0.85872042, "learning_rate": 2.6955765352698763e-06, "loss": 0.88085788, "num_input_tokens_seen": 72796765, "step": 3371, "time_per_iteration": 3.650041103363037 }, { "auxiliary_loss_clip": 0.01191239, "auxiliary_loss_mlp": 0.01030349, "balance_loss_clip": 1.05096948, "balance_loss_mlp": 1.02173603, "epoch": 0.40545902723501476, "flos": 15012061505280.0, "grad_norm": 6.573987432495985, "language_loss": 0.73648334, "learning_rate": 2.6948461396656923e-06, "loss": 0.75869918, "num_input_tokens_seen": 72814175, "step": 3372, "time_per_iteration": 2.617652177810669 }, { "auxiliary_loss_clip": 0.01183671, "auxiliary_loss_mlp": 0.01033315, "balance_loss_clip": 1.0555706, "balance_loss_mlp": 1.02473736, "epoch": 0.4055792701256538, "flos": 25521103422720.0, "grad_norm": 2.157067615043618, "language_loss": 0.74362409, "learning_rate": 2.6941156386539013e-06, "loss": 0.76579392, "num_input_tokens_seen": 72834125, "step": 3373, "time_per_iteration": 2.6585164070129395 }, { "auxiliary_loss_clip": 0.01158761, "auxiliary_loss_mlp": 0.01029305, "balance_loss_clip": 1.0537219, "balance_loss_mlp": 1.02074027, "epoch": 0.4056995130162929, "flos": 19574583972480.0, "grad_norm": 2.8341682038193814, "language_loss": 0.80816978, "learning_rate": 2.6933850323453203e-06, "loss": 0.83005047, "num_input_tokens_seen": 72852570, "step": 3374, "time_per_iteration": 2.7157886028289795 }, { "auxiliary_loss_clip": 0.01192006, "auxiliary_loss_mlp": 0.01028852, "balance_loss_clip": 1.05635679, "balance_loss_mlp": 1.02115703, "epoch": 0.405819755906932, "flos": 15413794191360.0, "grad_norm": 1.9688582017360996, "language_loss": 0.74926436, "learning_rate": 2.6926543208507806e-06, "loss": 0.77147293, "num_input_tokens_seen": 72871250, "step": 3375, "time_per_iteration": 2.566619873046875 }, { "auxiliary_loss_clip": 0.01176093, "auxiliary_loss_mlp": 0.01028997, "balance_loss_clip": 1.05370927, "balance_loss_mlp": 1.02033067, "epoch": 0.4059399987975711, "flos": 21433930565760.0, "grad_norm": 2.1715544405746026, "language_loss": 0.80001694, "learning_rate": 2.6919235042811316e-06, "loss": 0.82206786, "num_input_tokens_seen": 72890035, "step": 3376, "time_per_iteration": 2.639587640762329 }, { "auxiliary_loss_clip": 0.01146142, "auxiliary_loss_mlp": 0.01035294, "balance_loss_clip": 1.04911065, "balance_loss_mlp": 1.02622223, "epoch": 0.4060602416882102, "flos": 25556942217600.0, "grad_norm": 2.1011870123449117, "language_loss": 0.76397562, "learning_rate": 2.691192582747237e-06, "loss": 0.78578997, "num_input_tokens_seen": 72909665, "step": 3377, "time_per_iteration": 2.7537338733673096 }, { "auxiliary_loss_clip": 0.01189236, "auxiliary_loss_mlp": 0.01029426, "balance_loss_clip": 1.05297899, "balance_loss_mlp": 1.02008581, "epoch": 0.40618048457884925, "flos": 23766759262080.0, "grad_norm": 1.8954163057110602, "language_loss": 0.74007678, "learning_rate": 2.6904615563599765e-06, "loss": 0.76226336, "num_input_tokens_seen": 72929465, "step": 3378, "time_per_iteration": 2.5806188583374023 }, { "auxiliary_loss_clip": 0.0114409, "auxiliary_loss_mlp": 0.01031018, "balance_loss_clip": 1.04802203, "balance_loss_mlp": 1.02266765, "epoch": 0.40630072746948837, "flos": 17639681120640.0, "grad_norm": 1.8291239729083915, "language_loss": 0.8340255, "learning_rate": 2.6897304252302477e-06, "loss": 0.85577655, "num_input_tokens_seen": 72946785, "step": 3379, "time_per_iteration": 2.713282585144043 }, { "auxiliary_loss_clip": 0.01060916, "auxiliary_loss_mlp": 0.01002812, "balance_loss_clip": 1.02419114, "balance_loss_mlp": 1.00098789, "epoch": 0.4064209703601275, "flos": 60836053063680.0, "grad_norm": 0.7901973507070418, "language_loss": 0.54813409, "learning_rate": 2.688999189468962e-06, "loss": 0.56877136, "num_input_tokens_seen": 73003215, "step": 3380, "time_per_iteration": 3.1721577644348145 }, { "auxiliary_loss_clip": 0.01174488, "auxiliary_loss_mlp": 0.01034014, "balance_loss_clip": 1.05347431, "balance_loss_mlp": 1.02543128, "epoch": 0.40654121325076653, "flos": 24024346669440.0, "grad_norm": 2.4687682464984353, "language_loss": 0.76413774, "learning_rate": 2.6882678491870464e-06, "loss": 0.78622282, "num_input_tokens_seen": 73023650, "step": 3381, "time_per_iteration": 2.7907159328460693 }, { "auxiliary_loss_clip": 0.01180341, "auxiliary_loss_mlp": 0.01029639, "balance_loss_clip": 1.05288553, "balance_loss_mlp": 1.02079356, "epoch": 0.40666145614140564, "flos": 27344252085120.0, "grad_norm": 1.717766393200777, "language_loss": 0.71349537, "learning_rate": 2.6875364044954453e-06, "loss": 0.73559523, "num_input_tokens_seen": 73043880, "step": 3382, "time_per_iteration": 2.712151288986206 }, { "auxiliary_loss_clip": 0.01156196, "auxiliary_loss_mlp": 0.01030155, "balance_loss_clip": 1.04575157, "balance_loss_mlp": 1.02151787, "epoch": 0.40678169903204475, "flos": 26176724415360.0, "grad_norm": 1.9748049474413685, "language_loss": 0.82353741, "learning_rate": 2.6868048555051185e-06, "loss": 0.84540093, "num_input_tokens_seen": 73065410, "step": 3383, "time_per_iteration": 2.7227067947387695 }, { "auxiliary_loss_clip": 0.0116666, "auxiliary_loss_mlp": 0.01031654, "balance_loss_clip": 1.04834521, "balance_loss_mlp": 1.02242756, "epoch": 0.4069019419226838, "flos": 28622420622720.0, "grad_norm": 2.3852997327647443, "language_loss": 0.85655761, "learning_rate": 2.686073202327041e-06, "loss": 0.87854081, "num_input_tokens_seen": 73084410, "step": 3384, "time_per_iteration": 2.7458460330963135 }, { "auxiliary_loss_clip": 0.01150117, "auxiliary_loss_mlp": 0.01025547, "balance_loss_clip": 1.04647613, "balance_loss_mlp": 1.01732755, "epoch": 0.4070221848133229, "flos": 25229006023680.0, "grad_norm": 1.6875833223374777, "language_loss": 0.73317641, "learning_rate": 2.6853414450722043e-06, "loss": 0.75493306, "num_input_tokens_seen": 73104075, "step": 3385, "time_per_iteration": 2.7055723667144775 }, { "auxiliary_loss_clip": 0.01171297, "auxiliary_loss_mlp": 0.01026374, "balance_loss_clip": 1.05050516, "balance_loss_mlp": 1.0186311, "epoch": 0.40714242770396203, "flos": 18405224709120.0, "grad_norm": 1.7306728182637594, "language_loss": 0.85444999, "learning_rate": 2.684609583851616e-06, "loss": 0.8764267, "num_input_tokens_seen": 73122250, "step": 3386, "time_per_iteration": 2.6221206188201904 }, { "auxiliary_loss_clip": 0.01127479, "auxiliary_loss_mlp": 0.01031461, "balance_loss_clip": 1.04606438, "balance_loss_mlp": 1.02276444, "epoch": 0.4072626705946011, "flos": 30228920403840.0, "grad_norm": 1.8117655803515722, "language_loss": 0.80765688, "learning_rate": 2.683877618776297e-06, "loss": 0.82924634, "num_input_tokens_seen": 73144505, "step": 3387, "time_per_iteration": 2.812366247177124 }, { "auxiliary_loss_clip": 0.01153505, "auxiliary_loss_mlp": 0.01034041, "balance_loss_clip": 1.04623628, "balance_loss_mlp": 1.02531481, "epoch": 0.4073829134852402, "flos": 21834549930240.0, "grad_norm": 2.604487577553351, "language_loss": 0.74287868, "learning_rate": 2.6831455499572876e-06, "loss": 0.76475412, "num_input_tokens_seen": 73162440, "step": 3388, "time_per_iteration": 2.657925605773926 }, { "auxiliary_loss_clip": 0.01188589, "auxiliary_loss_mlp": 0.01025997, "balance_loss_clip": 1.05174351, "balance_loss_mlp": 1.01787877, "epoch": 0.40750315637587925, "flos": 25260211964160.0, "grad_norm": 2.045994883049423, "language_loss": 0.77815062, "learning_rate": 2.682413377505641e-06, "loss": 0.80029655, "num_input_tokens_seen": 73181245, "step": 3389, "time_per_iteration": 2.6371636390686035 }, { "auxiliary_loss_clip": 0.01173587, "auxiliary_loss_mlp": 0.01027175, "balance_loss_clip": 1.04953229, "balance_loss_mlp": 1.01847839, "epoch": 0.40762339926651836, "flos": 19712767593600.0, "grad_norm": 1.8464700676363104, "language_loss": 0.76867771, "learning_rate": 2.6816811015324284e-06, "loss": 0.79068536, "num_input_tokens_seen": 73199295, "step": 3390, "time_per_iteration": 2.6720380783081055 }, { "auxiliary_loss_clip": 0.0109427, "auxiliary_loss_mlp": 0.01001607, "balance_loss_clip": 1.02310586, "balance_loss_mlp": 0.99989033, "epoch": 0.40774364215715747, "flos": 71449307314560.0, "grad_norm": 0.7237512255007107, "language_loss": 0.56643641, "learning_rate": 2.6809487221487343e-06, "loss": 0.58739519, "num_input_tokens_seen": 73258780, "step": 3391, "time_per_iteration": 4.084848165512085 }, { "auxiliary_loss_clip": 0.01164057, "auxiliary_loss_mlp": 0.0103385, "balance_loss_clip": 1.04781866, "balance_loss_mlp": 1.02515996, "epoch": 0.4078638850477965, "flos": 15084134144640.0, "grad_norm": 2.7868833274048543, "language_loss": 0.82000959, "learning_rate": 2.6802162394656605e-06, "loss": 0.84198868, "num_input_tokens_seen": 73275490, "step": 3392, "time_per_iteration": 2.577679395675659 }, { "auxiliary_loss_clip": 0.01154632, "auxiliary_loss_mlp": 0.01030057, "balance_loss_clip": 1.04547858, "balance_loss_mlp": 1.02170622, "epoch": 0.40798412793843564, "flos": 23842890138240.0, "grad_norm": 2.8351010719381717, "language_loss": 0.71510053, "learning_rate": 2.679483653594324e-06, "loss": 0.73694742, "num_input_tokens_seen": 73297260, "step": 3393, "time_per_iteration": 3.7427818775177 }, { "auxiliary_loss_clip": 0.01175253, "auxiliary_loss_mlp": 0.01031035, "balance_loss_clip": 1.05026102, "balance_loss_mlp": 1.02255917, "epoch": 0.40810437082907475, "flos": 21065774117760.0, "grad_norm": 2.565379910889756, "language_loss": 0.76393056, "learning_rate": 2.678750964645857e-06, "loss": 0.78599346, "num_input_tokens_seen": 73316340, "step": 3394, "time_per_iteration": 2.5611231327056885 }, { "auxiliary_loss_clip": 0.01178643, "auxiliary_loss_mlp": 0.01026788, "balance_loss_clip": 1.05636787, "balance_loss_mlp": 1.01800239, "epoch": 0.4082246137197138, "flos": 11321377948800.0, "grad_norm": 2.4046338615515728, "language_loss": 0.8404566, "learning_rate": 2.6780181727314094e-06, "loss": 0.86251092, "num_input_tokens_seen": 73331245, "step": 3395, "time_per_iteration": 2.5958919525146484 }, { "auxiliary_loss_clip": 0.0114637, "auxiliary_loss_mlp": 0.0076348, "balance_loss_clip": 1.04717457, "balance_loss_mlp": 1.00032616, "epoch": 0.4083448566103529, "flos": 19062569554560.0, "grad_norm": 1.9166557968971487, "language_loss": 0.78379416, "learning_rate": 2.6772852779621435e-06, "loss": 0.80289268, "num_input_tokens_seen": 73349105, "step": 3396, "time_per_iteration": 2.666388988494873 }, { "auxiliary_loss_clip": 0.01170029, "auxiliary_loss_mlp": 0.00762873, "balance_loss_clip": 1.05314028, "balance_loss_mlp": 1.000229, "epoch": 0.408465099500992, "flos": 23550254035200.0, "grad_norm": 1.876502527092022, "language_loss": 0.86291069, "learning_rate": 2.676552280449239e-06, "loss": 0.8822397, "num_input_tokens_seen": 73368990, "step": 3397, "time_per_iteration": 3.498594284057617 }, { "auxiliary_loss_clip": 0.01164568, "auxiliary_loss_mlp": 0.01028409, "balance_loss_clip": 1.04902732, "balance_loss_mlp": 1.02010596, "epoch": 0.4085853423916311, "flos": 12750012558720.0, "grad_norm": 2.3577291925068025, "language_loss": 0.75840855, "learning_rate": 2.6758191803038917e-06, "loss": 0.78033829, "num_input_tokens_seen": 73387485, "step": 3398, "time_per_iteration": 3.5802838802337646 }, { "auxiliary_loss_clip": 0.01110703, "auxiliary_loss_mlp": 0.01039361, "balance_loss_clip": 1.04559207, "balance_loss_mlp": 1.02989578, "epoch": 0.4087055852822702, "flos": 24353072962560.0, "grad_norm": 1.7800567481382366, "language_loss": 0.82922393, "learning_rate": 2.6750859776373125e-06, "loss": 0.85072458, "num_input_tokens_seen": 73406940, "step": 3399, "time_per_iteration": 2.7886383533477783 }, { "auxiliary_loss_clip": 0.01041, "auxiliary_loss_mlp": 0.01003054, "balance_loss_clip": 1.01945066, "balance_loss_mlp": 1.00109899, "epoch": 0.4088258281729093, "flos": 66387950720640.0, "grad_norm": 0.7667163126199344, "language_loss": 0.60375249, "learning_rate": 2.674352672560727e-06, "loss": 0.62419307, "num_input_tokens_seen": 73468385, "step": 3400, "time_per_iteration": 3.346703052520752 }, { "auxiliary_loss_clip": 0.01143639, "auxiliary_loss_mlp": 0.01026997, "balance_loss_clip": 1.04673028, "balance_loss_mlp": 1.01853943, "epoch": 0.40894607106354836, "flos": 20449260057600.0, "grad_norm": 1.724481869885827, "language_loss": 0.76783741, "learning_rate": 2.673619265185377e-06, "loss": 0.78954375, "num_input_tokens_seen": 73488225, "step": 3401, "time_per_iteration": 2.812732458114624 }, { "auxiliary_loss_clip": 0.01178329, "auxiliary_loss_mlp": 0.01034346, "balance_loss_clip": 1.05157769, "balance_loss_mlp": 1.02542281, "epoch": 0.40906631395418747, "flos": 27053627143680.0, "grad_norm": 1.7658095550985171, "language_loss": 0.77715123, "learning_rate": 2.672885755622521e-06, "loss": 0.79927796, "num_input_tokens_seen": 73510640, "step": 3402, "time_per_iteration": 2.6845688819885254 }, { "auxiliary_loss_clip": 0.01128467, "auxiliary_loss_mlp": 0.01025091, "balance_loss_clip": 1.04575968, "balance_loss_mlp": 1.01691866, "epoch": 0.4091865568448266, "flos": 25484151306240.0, "grad_norm": 9.118597132695399, "language_loss": 0.70786166, "learning_rate": 2.67215214398343e-06, "loss": 0.72939724, "num_input_tokens_seen": 73530655, "step": 3403, "time_per_iteration": 2.7989044189453125 }, { "auxiliary_loss_clip": 0.01135172, "auxiliary_loss_mlp": 0.01033915, "balance_loss_clip": 1.04608154, "balance_loss_mlp": 1.02506328, "epoch": 0.40930679973546563, "flos": 28657864368000.0, "grad_norm": 2.1375372352918096, "language_loss": 0.78425789, "learning_rate": 2.671418430379393e-06, "loss": 0.80594879, "num_input_tokens_seen": 73549340, "step": 3404, "time_per_iteration": 2.807591676712036 }, { "auxiliary_loss_clip": 0.01188453, "auxiliary_loss_mlp": 0.01026563, "balance_loss_clip": 1.05236602, "balance_loss_mlp": 1.01821184, "epoch": 0.40942704262610474, "flos": 20886292834560.0, "grad_norm": 1.7651770783118657, "language_loss": 0.83525592, "learning_rate": 2.670684614921715e-06, "loss": 0.85740602, "num_input_tokens_seen": 73568315, "step": 3405, "time_per_iteration": 2.5892527103424072 }, { "auxiliary_loss_clip": 0.01160564, "auxiliary_loss_mlp": 0.01031307, "balance_loss_clip": 1.04776287, "balance_loss_mlp": 1.02302814, "epoch": 0.4095472855167438, "flos": 21618080616960.0, "grad_norm": 2.1932933106323684, "language_loss": 0.69119108, "learning_rate": 2.6699506977217128e-06, "loss": 0.71310985, "num_input_tokens_seen": 73588490, "step": 3406, "time_per_iteration": 2.6777000427246094 }, { "auxiliary_loss_clip": 0.01173577, "auxiliary_loss_mlp": 0.01033464, "balance_loss_clip": 1.05414391, "balance_loss_mlp": 1.02510154, "epoch": 0.4096675284073829, "flos": 27926112499200.0, "grad_norm": 2.1453274378353124, "language_loss": 0.70135438, "learning_rate": 2.6692166788907233e-06, "loss": 0.72342479, "num_input_tokens_seen": 73608685, "step": 3407, "time_per_iteration": 2.732755184173584 }, { "auxiliary_loss_clip": 0.01160297, "auxiliary_loss_mlp": 0.01031703, "balance_loss_clip": 1.04830933, "balance_loss_mlp": 1.02328086, "epoch": 0.409787771298022, "flos": 19206607092480.0, "grad_norm": 1.9065886196249275, "language_loss": 0.77071106, "learning_rate": 2.6684825585400957e-06, "loss": 0.79263109, "num_input_tokens_seen": 73627630, "step": 3408, "time_per_iteration": 2.6374309062957764 }, { "auxiliary_loss_clip": 0.01060496, "auxiliary_loss_mlp": 0.01001674, "balance_loss_clip": 1.01616001, "balance_loss_mlp": 0.99992162, "epoch": 0.4099080141886611, "flos": 59269234832640.0, "grad_norm": 0.8119079647868245, "language_loss": 0.65156496, "learning_rate": 2.6677483367811947e-06, "loss": 0.67218673, "num_input_tokens_seen": 73687670, "step": 3409, "time_per_iteration": 3.340127944946289 }, { "auxiliary_loss_clip": 0.01174298, "auxiliary_loss_mlp": 0.01034407, "balance_loss_clip": 1.04949701, "balance_loss_mlp": 1.02584195, "epoch": 0.4100282570793002, "flos": 21906443001600.0, "grad_norm": 2.111189588988882, "language_loss": 0.75537926, "learning_rate": 2.6670140137254028e-06, "loss": 0.7774663, "num_input_tokens_seen": 73707145, "step": 3410, "time_per_iteration": 2.6366915702819824 }, { "auxiliary_loss_clip": 0.0112816, "auxiliary_loss_mlp": 0.01032881, "balance_loss_clip": 1.04612589, "balance_loss_mlp": 1.02464986, "epoch": 0.4101484999699393, "flos": 18551596631040.0, "grad_norm": 2.6902346197187903, "language_loss": 0.89712447, "learning_rate": 2.666279589484115e-06, "loss": 0.91873491, "num_input_tokens_seen": 73725045, "step": 3411, "time_per_iteration": 2.761784076690674 }, { "auxiliary_loss_clip": 0.01131548, "auxiliary_loss_mlp": 0.010273, "balance_loss_clip": 1.04573631, "balance_loss_mlp": 1.01892591, "epoch": 0.41026874286057835, "flos": 19094529680640.0, "grad_norm": 1.8478108182069197, "language_loss": 0.81199461, "learning_rate": 2.6655450641687435e-06, "loss": 0.83358312, "num_input_tokens_seen": 73742610, "step": 3412, "time_per_iteration": 2.832249164581299 }, { "auxiliary_loss_clip": 0.01189631, "auxiliary_loss_mlp": 0.01023616, "balance_loss_clip": 1.05518603, "balance_loss_mlp": 1.0157783, "epoch": 0.41038898575121746, "flos": 31209568588800.0, "grad_norm": 1.6503818140706048, "language_loss": 0.69280785, "learning_rate": 2.664810437890715e-06, "loss": 0.71494031, "num_input_tokens_seen": 73764280, "step": 3413, "time_per_iteration": 2.678687810897827 }, { "auxiliary_loss_clip": 0.01107998, "auxiliary_loss_mlp": 0.01032607, "balance_loss_clip": 1.04749167, "balance_loss_mlp": 1.02412486, "epoch": 0.41050922864185657, "flos": 14355865895040.0, "grad_norm": 2.142994583912524, "language_loss": 0.79584241, "learning_rate": 2.6640757107614714e-06, "loss": 0.81724846, "num_input_tokens_seen": 73782375, "step": 3414, "time_per_iteration": 2.7275562286376953 }, { "auxiliary_loss_clip": 0.01140217, "auxiliary_loss_mlp": 0.01031789, "balance_loss_clip": 1.04913783, "balance_loss_mlp": 1.02294993, "epoch": 0.4106294715324956, "flos": 30956290813440.0, "grad_norm": 2.3005841437079138, "language_loss": 0.69445038, "learning_rate": 2.6633408828924697e-06, "loss": 0.71617043, "num_input_tokens_seen": 73801240, "step": 3415, "time_per_iteration": 2.7826969623565674 }, { "auxiliary_loss_clip": 0.01156685, "auxiliary_loss_mlp": 0.01031555, "balance_loss_clip": 1.05076468, "balance_loss_mlp": 1.02272177, "epoch": 0.41074971442313474, "flos": 24457321209600.0, "grad_norm": 1.7275473696677672, "language_loss": 0.70420516, "learning_rate": 2.662605954395185e-06, "loss": 0.72608757, "num_input_tokens_seen": 73821200, "step": 3416, "time_per_iteration": 2.7817084789276123 }, { "auxiliary_loss_clip": 0.01176923, "auxiliary_loss_mlp": 0.01035977, "balance_loss_clip": 1.0513072, "balance_loss_mlp": 1.02798438, "epoch": 0.41086995731377385, "flos": 21542991235200.0, "grad_norm": 1.7247591756642005, "language_loss": 0.8394137, "learning_rate": 2.6618709253811027e-06, "loss": 0.8615427, "num_input_tokens_seen": 73840655, "step": 3417, "time_per_iteration": 3.6339950561523438 }, { "auxiliary_loss_clip": 0.01186513, "auxiliary_loss_mlp": 0.01026841, "balance_loss_clip": 1.05413246, "balance_loss_mlp": 1.01882458, "epoch": 0.4109902002044129, "flos": 20702753314560.0, "grad_norm": 9.627631549087427, "language_loss": 0.87844926, "learning_rate": 2.6611357959617277e-06, "loss": 0.90058279, "num_input_tokens_seen": 73860275, "step": 3418, "time_per_iteration": 3.645164728164673 }, { "auxiliary_loss_clip": 0.0113967, "auxiliary_loss_mlp": 0.01030765, "balance_loss_clip": 1.04807806, "balance_loss_mlp": 1.02261674, "epoch": 0.411110443095052, "flos": 18179992477440.0, "grad_norm": 1.8815169648959387, "language_loss": 0.90942311, "learning_rate": 2.660400566248578e-06, "loss": 0.93112743, "num_input_tokens_seen": 73878400, "step": 3419, "time_per_iteration": 2.7315635681152344 }, { "auxiliary_loss_clip": 0.01147863, "auxiliary_loss_mlp": 0.01034107, "balance_loss_clip": 1.04883707, "balance_loss_mlp": 1.02489841, "epoch": 0.41123068598569107, "flos": 14575244209920.0, "grad_norm": 2.535322785383011, "language_loss": 0.66755366, "learning_rate": 2.6596652363531876e-06, "loss": 0.68937343, "num_input_tokens_seen": 73894275, "step": 3420, "time_per_iteration": 2.653235673904419 }, { "auxiliary_loss_clip": 0.01188696, "auxiliary_loss_mlp": 0.01027683, "balance_loss_clip": 1.05423999, "balance_loss_mlp": 1.01937461, "epoch": 0.4113509288763302, "flos": 21177995184000.0, "grad_norm": 1.5481938353352083, "language_loss": 0.78199625, "learning_rate": 2.6589298063871055e-06, "loss": 0.80416006, "num_input_tokens_seen": 73914450, "step": 3421, "time_per_iteration": 2.6544947624206543 }, { "auxiliary_loss_clip": 0.01187973, "auxiliary_loss_mlp": 0.01030046, "balance_loss_clip": 1.05358195, "balance_loss_mlp": 1.02161813, "epoch": 0.4114711717669693, "flos": 18442212739200.0, "grad_norm": 1.8756455144250808, "language_loss": 0.69622803, "learning_rate": 2.658194276461895e-06, "loss": 0.71840823, "num_input_tokens_seen": 73932375, "step": 3422, "time_per_iteration": 3.4783692359924316 }, { "auxiliary_loss_clip": 0.01159108, "auxiliary_loss_mlp": 0.01036787, "balance_loss_clip": 1.04697871, "balance_loss_mlp": 1.02750611, "epoch": 0.41159141465760835, "flos": 27233395735680.0, "grad_norm": 1.8774703509976973, "language_loss": 0.67067403, "learning_rate": 2.6574586466891368e-06, "loss": 0.69263303, "num_input_tokens_seen": 73952850, "step": 3423, "time_per_iteration": 2.6711585521698 }, { "auxiliary_loss_clip": 0.01159157, "auxiliary_loss_mlp": 0.00763132, "balance_loss_clip": 1.04932582, "balance_loss_mlp": 1.00032043, "epoch": 0.41171165754824746, "flos": 20006876154240.0, "grad_norm": 1.9063783893889557, "language_loss": 0.64746243, "learning_rate": 2.6567229171804247e-06, "loss": 0.66668534, "num_input_tokens_seen": 73970735, "step": 3424, "time_per_iteration": 3.6242713928222656 }, { "auxiliary_loss_clip": 0.01153877, "auxiliary_loss_mlp": 0.01034942, "balance_loss_clip": 1.04698634, "balance_loss_mlp": 1.0261507, "epoch": 0.41183190043888657, "flos": 18004318035840.0, "grad_norm": 2.3429901222118192, "language_loss": 0.87766331, "learning_rate": 2.655987088047368e-06, "loss": 0.89955151, "num_input_tokens_seen": 73989080, "step": 3425, "time_per_iteration": 2.69061017036438 }, { "auxiliary_loss_clip": 0.01157239, "auxiliary_loss_mlp": 0.01038127, "balance_loss_clip": 1.04982352, "balance_loss_mlp": 1.02918005, "epoch": 0.4119521433295256, "flos": 27163370171520.0, "grad_norm": 2.1768971254464815, "language_loss": 0.78794837, "learning_rate": 2.6552511594015912e-06, "loss": 0.80990195, "num_input_tokens_seen": 74009470, "step": 3426, "time_per_iteration": 2.732583522796631 }, { "auxiliary_loss_clip": 0.01156598, "auxiliary_loss_mlp": 0.0103578, "balance_loss_clip": 1.04580045, "balance_loss_mlp": 1.02629709, "epoch": 0.41207238622016473, "flos": 15122020014720.0, "grad_norm": 2.010757128051907, "language_loss": 0.85543442, "learning_rate": 2.654515131354735e-06, "loss": 0.8773582, "num_input_tokens_seen": 74027735, "step": 3427, "time_per_iteration": 2.6264939308166504 }, { "auxiliary_loss_clip": 0.01149401, "auxiliary_loss_mlp": 0.01032513, "balance_loss_clip": 1.05148411, "balance_loss_mlp": 1.0243535, "epoch": 0.41219262911080384, "flos": 27052872958080.0, "grad_norm": 2.0251542465815975, "language_loss": 0.84853733, "learning_rate": 2.653779004018453e-06, "loss": 0.87035656, "num_input_tokens_seen": 74048300, "step": 3428, "time_per_iteration": 2.723256826400757 }, { "auxiliary_loss_clip": 0.01155386, "auxiliary_loss_mlp": 0.01030851, "balance_loss_clip": 1.04994297, "balance_loss_mlp": 1.02312016, "epoch": 0.4123128720014429, "flos": 24686360282880.0, "grad_norm": 1.7867335010547336, "language_loss": 0.82270563, "learning_rate": 2.653042777504417e-06, "loss": 0.84456789, "num_input_tokens_seen": 74070890, "step": 3429, "time_per_iteration": 2.7638304233551025 }, { "auxiliary_loss_clip": 0.01166645, "auxiliary_loss_mlp": 0.01037387, "balance_loss_clip": 1.04943812, "balance_loss_mlp": 1.02847624, "epoch": 0.412433114892082, "flos": 26244774731520.0, "grad_norm": 1.928449727108071, "language_loss": 0.80010736, "learning_rate": 2.6523064519243105e-06, "loss": 0.82214761, "num_input_tokens_seen": 74090460, "step": 3430, "time_per_iteration": 2.6562678813934326 }, { "auxiliary_loss_clip": 0.01175962, "auxiliary_loss_mlp": 0.01043032, "balance_loss_clip": 1.05391335, "balance_loss_mlp": 1.0333643, "epoch": 0.4125533577827211, "flos": 21361031913600.0, "grad_norm": 2.2784621248748294, "language_loss": 0.78951699, "learning_rate": 2.6515700273898333e-06, "loss": 0.8117069, "num_input_tokens_seen": 74108335, "step": 3431, "time_per_iteration": 2.652026414871216 }, { "auxiliary_loss_clip": 0.01152367, "auxiliary_loss_mlp": 0.01027504, "balance_loss_clip": 1.05094683, "balance_loss_mlp": 1.01835489, "epoch": 0.4126736006733602, "flos": 26067556005120.0, "grad_norm": 2.3361639286209557, "language_loss": 0.6936447, "learning_rate": 2.6508335040127018e-06, "loss": 0.71544337, "num_input_tokens_seen": 74128030, "step": 3432, "time_per_iteration": 2.657735586166382 }, { "auxiliary_loss_clip": 0.01179628, "auxiliary_loss_mlp": 0.01036122, "balance_loss_clip": 1.05417943, "balance_loss_mlp": 1.02709758, "epoch": 0.4127938435639993, "flos": 25666146541440.0, "grad_norm": 1.6860907290193277, "language_loss": 0.77411485, "learning_rate": 2.6500968819046446e-06, "loss": 0.7962724, "num_input_tokens_seen": 74148330, "step": 3433, "time_per_iteration": 2.6907362937927246 }, { "auxiliary_loss_clip": 0.01132461, "auxiliary_loss_mlp": 0.01031432, "balance_loss_clip": 1.04422295, "balance_loss_mlp": 1.02268207, "epoch": 0.4129140864546384, "flos": 17995914253440.0, "grad_norm": 2.7964847820670755, "language_loss": 0.58542073, "learning_rate": 2.649360161177408e-06, "loss": 0.60705966, "num_input_tokens_seen": 74163390, "step": 3434, "time_per_iteration": 2.6369788646698 }, { "auxiliary_loss_clip": 0.01182763, "auxiliary_loss_mlp": 0.01028509, "balance_loss_clip": 1.05294418, "balance_loss_mlp": 1.02006888, "epoch": 0.41303432934527745, "flos": 23732895715200.0, "grad_norm": 1.9960570044819903, "language_loss": 0.73900819, "learning_rate": 2.6486233419427504e-06, "loss": 0.76112092, "num_input_tokens_seen": 74183205, "step": 3435, "time_per_iteration": 2.679516077041626 }, { "auxiliary_loss_clip": 0.01139199, "auxiliary_loss_mlp": 0.01037654, "balance_loss_clip": 1.05006468, "balance_loss_mlp": 1.02847493, "epoch": 0.41315457223591656, "flos": 19755286318080.0, "grad_norm": 2.389838805316941, "language_loss": 0.75197947, "learning_rate": 2.6478864243124484e-06, "loss": 0.77374804, "num_input_tokens_seen": 74202870, "step": 3436, "time_per_iteration": 2.688663959503174 }, { "auxiliary_loss_clip": 0.01177733, "auxiliary_loss_mlp": 0.01026966, "balance_loss_clip": 1.05141926, "balance_loss_mlp": 1.01827526, "epoch": 0.4132748151265556, "flos": 20923316778240.0, "grad_norm": 2.138538675287534, "language_loss": 0.84963012, "learning_rate": 2.6471494083982903e-06, "loss": 0.87167716, "num_input_tokens_seen": 74222255, "step": 3437, "time_per_iteration": 2.70694637298584 }, { "auxiliary_loss_clip": 0.01148864, "auxiliary_loss_mlp": 0.01028724, "balance_loss_clip": 1.0478797, "balance_loss_mlp": 1.02014685, "epoch": 0.4133950580171947, "flos": 32232520016640.0, "grad_norm": 2.175602593059648, "language_loss": 0.74804032, "learning_rate": 2.6464122943120818e-06, "loss": 0.76981616, "num_input_tokens_seen": 74242480, "step": 3438, "time_per_iteration": 2.7653820514678955 }, { "auxiliary_loss_clip": 0.01144864, "auxiliary_loss_mlp": 0.01029601, "balance_loss_clip": 1.05083561, "balance_loss_mlp": 1.02124763, "epoch": 0.41351530090783384, "flos": 23292487059840.0, "grad_norm": 3.058443775271033, "language_loss": 0.82441491, "learning_rate": 2.645675082165642e-06, "loss": 0.84615958, "num_input_tokens_seen": 74258690, "step": 3439, "time_per_iteration": 2.7158288955688477 }, { "auxiliary_loss_clip": 0.01160485, "auxiliary_loss_mlp": 0.01028682, "balance_loss_clip": 1.0523243, "balance_loss_mlp": 1.02030766, "epoch": 0.4136355437984729, "flos": 25593571111680.0, "grad_norm": 2.104474401664527, "language_loss": 0.75467438, "learning_rate": 2.644937772070806e-06, "loss": 0.77656603, "num_input_tokens_seen": 74277135, "step": 3440, "time_per_iteration": 2.67889404296875 }, { "auxiliary_loss_clip": 0.01190622, "auxiliary_loss_mlp": 0.01031125, "balance_loss_clip": 1.05495977, "balance_loss_mlp": 1.02301908, "epoch": 0.413755786689112, "flos": 19828615933440.0, "grad_norm": 2.4726996750138355, "language_loss": 0.83096242, "learning_rate": 2.6442003641394225e-06, "loss": 0.85317987, "num_input_tokens_seen": 74294730, "step": 3441, "time_per_iteration": 2.6235013008117676 }, { "auxiliary_loss_clip": 0.0115458, "auxiliary_loss_mlp": 0.01026654, "balance_loss_clip": 1.04601216, "balance_loss_mlp": 1.01875067, "epoch": 0.4138760295797511, "flos": 26870446759680.0, "grad_norm": 1.6189487590758203, "language_loss": 0.84077823, "learning_rate": 2.643462858483356e-06, "loss": 0.86259055, "num_input_tokens_seen": 74315015, "step": 3442, "time_per_iteration": 2.704502820968628 }, { "auxiliary_loss_clip": 0.01126226, "auxiliary_loss_mlp": 0.01028058, "balance_loss_clip": 1.04583788, "balance_loss_mlp": 1.01907015, "epoch": 0.41399627247039017, "flos": 16399254798720.0, "grad_norm": 1.9322128268780854, "language_loss": 0.72560322, "learning_rate": 2.6427252552144856e-06, "loss": 0.74714601, "num_input_tokens_seen": 74333665, "step": 3443, "time_per_iteration": 2.726768732070923 }, { "auxiliary_loss_clip": 0.01189674, "auxiliary_loss_mlp": 0.01029521, "balance_loss_clip": 1.05364478, "balance_loss_mlp": 1.02059269, "epoch": 0.4141165153610293, "flos": 22930220442240.0, "grad_norm": 2.579008041045323, "language_loss": 0.75161958, "learning_rate": 2.6419875544447044e-06, "loss": 0.77381152, "num_input_tokens_seen": 74355065, "step": 3444, "time_per_iteration": 3.53420352935791 }, { "auxiliary_loss_clip": 0.01189621, "auxiliary_loss_mlp": 0.01025222, "balance_loss_clip": 1.05340052, "balance_loss_mlp": 1.01714563, "epoch": 0.4142367582516684, "flos": 25192556697600.0, "grad_norm": 2.9982187054761726, "language_loss": 0.71592963, "learning_rate": 2.6412497562859218e-06, "loss": 0.73807806, "num_input_tokens_seen": 74376345, "step": 3445, "time_per_iteration": 3.648343801498413 }, { "auxiliary_loss_clip": 0.0117731, "auxiliary_loss_mlp": 0.01035007, "balance_loss_clip": 1.05011392, "balance_loss_mlp": 1.02638245, "epoch": 0.41435700114230745, "flos": 21690476478720.0, "grad_norm": 2.5015572733997655, "language_loss": 0.76007223, "learning_rate": 2.6405118608500617e-06, "loss": 0.78219545, "num_input_tokens_seen": 74395170, "step": 3446, "time_per_iteration": 2.6316254138946533 }, { "auxiliary_loss_clip": 0.0114216, "auxiliary_loss_mlp": 0.01029205, "balance_loss_clip": 1.05453718, "balance_loss_mlp": 1.02112222, "epoch": 0.41447724403294656, "flos": 25995160143360.0, "grad_norm": 1.6768617700699564, "language_loss": 0.81488323, "learning_rate": 2.6397738682490613e-06, "loss": 0.83659685, "num_input_tokens_seen": 74416070, "step": 3447, "time_per_iteration": 2.8883109092712402 }, { "auxiliary_loss_clip": 0.01189902, "auxiliary_loss_mlp": 0.01029853, "balance_loss_clip": 1.0550462, "balance_loss_mlp": 1.02186573, "epoch": 0.41459748692358567, "flos": 18259678800000.0, "grad_norm": 1.8246867863719942, "language_loss": 0.75117499, "learning_rate": 2.6390357785948734e-06, "loss": 0.77337253, "num_input_tokens_seen": 74433185, "step": 3448, "time_per_iteration": 2.6066389083862305 }, { "auxiliary_loss_clip": 0.01173877, "auxiliary_loss_mlp": 0.0103092, "balance_loss_clip": 1.05240607, "balance_loss_mlp": 1.02224195, "epoch": 0.4147177298142247, "flos": 24168456034560.0, "grad_norm": 2.3559172640764974, "language_loss": 0.79929084, "learning_rate": 2.6382975919994667e-06, "loss": 0.82133883, "num_input_tokens_seen": 74453760, "step": 3449, "time_per_iteration": 4.54530143737793 }, { "auxiliary_loss_clip": 0.01160255, "auxiliary_loss_mlp": 0.01028137, "balance_loss_clip": 1.04877543, "balance_loss_mlp": 1.02031076, "epoch": 0.41483797270486383, "flos": 20084659056000.0, "grad_norm": 1.6146889275249017, "language_loss": 0.73408115, "learning_rate": 2.637559308574822e-06, "loss": 0.75596511, "num_input_tokens_seen": 74473505, "step": 3450, "time_per_iteration": 2.7554574012756348 }, { "auxiliary_loss_clip": 0.01188289, "auxiliary_loss_mlp": 0.01026786, "balance_loss_clip": 1.05348134, "balance_loss_mlp": 1.01839423, "epoch": 0.4149582155955029, "flos": 30081040110720.0, "grad_norm": 2.0144794436663287, "language_loss": 0.71425462, "learning_rate": 2.6368209284329376e-06, "loss": 0.73640537, "num_input_tokens_seen": 74494135, "step": 3451, "time_per_iteration": 2.678292751312256 }, { "auxiliary_loss_clip": 0.01171444, "auxiliary_loss_mlp": 0.01034709, "balance_loss_clip": 1.0485301, "balance_loss_mlp": 1.02603054, "epoch": 0.415078458486142, "flos": 16764394504320.0, "grad_norm": 1.9455408929824647, "language_loss": 0.75845838, "learning_rate": 2.636082451685825e-06, "loss": 0.78051996, "num_input_tokens_seen": 74512335, "step": 3452, "time_per_iteration": 2.6549808979034424 }, { "auxiliary_loss_clip": 0.01162168, "auxiliary_loss_mlp": 0.01034355, "balance_loss_clip": 1.05311465, "balance_loss_mlp": 1.02614737, "epoch": 0.4151987013767811, "flos": 26033692458240.0, "grad_norm": 1.627794785055567, "language_loss": 0.86395073, "learning_rate": 2.6353438784455094e-06, "loss": 0.88591594, "num_input_tokens_seen": 74535620, "step": 3453, "time_per_iteration": 2.7135393619537354 }, { "auxiliary_loss_clip": 0.01158393, "auxiliary_loss_mlp": 0.01034859, "balance_loss_clip": 1.05048299, "balance_loss_mlp": 1.02594197, "epoch": 0.41531894426742016, "flos": 24608002763520.0, "grad_norm": 2.180556666069311, "language_loss": 0.71795481, "learning_rate": 2.6346052088240326e-06, "loss": 0.7398873, "num_input_tokens_seen": 74555140, "step": 3454, "time_per_iteration": 2.734668254852295 }, { "auxiliary_loss_clip": 0.01158351, "auxiliary_loss_mlp": 0.01031171, "balance_loss_clip": 1.0481509, "balance_loss_mlp": 1.02276707, "epoch": 0.4154391871580593, "flos": 14975791747200.0, "grad_norm": 3.321985415273994, "language_loss": 0.77109122, "learning_rate": 2.63386644293345e-06, "loss": 0.79298639, "num_input_tokens_seen": 74571485, "step": 3455, "time_per_iteration": 2.6947431564331055 }, { "auxiliary_loss_clip": 0.01139859, "auxiliary_loss_mlp": 0.01028305, "balance_loss_clip": 1.04425478, "balance_loss_mlp": 1.02037716, "epoch": 0.4155594300486984, "flos": 14647173194880.0, "grad_norm": 2.9130662080399192, "language_loss": 0.82818758, "learning_rate": 2.633127580885833e-06, "loss": 0.84986913, "num_input_tokens_seen": 74585985, "step": 3456, "time_per_iteration": 2.6883790493011475 }, { "auxiliary_loss_clip": 0.01187186, "auxiliary_loss_mlp": 0.01029569, "balance_loss_clip": 1.0551945, "balance_loss_mlp": 1.02131939, "epoch": 0.41567967293933744, "flos": 29497276275840.0, "grad_norm": 1.947079291832593, "language_loss": 0.6511538, "learning_rate": 2.632388622793265e-06, "loss": 0.67332137, "num_input_tokens_seen": 74605140, "step": 3457, "time_per_iteration": 2.7083117961883545 }, { "auxiliary_loss_clip": 0.01175095, "auxiliary_loss_mlp": 0.01028957, "balance_loss_clip": 1.05440736, "balance_loss_mlp": 1.02164674, "epoch": 0.41579991582997655, "flos": 19238387650560.0, "grad_norm": 1.9189855756383594, "language_loss": 0.68408209, "learning_rate": 2.6316495687678457e-06, "loss": 0.70612258, "num_input_tokens_seen": 74623790, "step": 3458, "time_per_iteration": 2.7322962284088135 }, { "auxiliary_loss_clip": 0.01124004, "auxiliary_loss_mlp": 0.01030428, "balance_loss_clip": 1.04584885, "balance_loss_mlp": 1.02231002, "epoch": 0.41592015872061566, "flos": 24462061804800.0, "grad_norm": 2.490931793322291, "language_loss": 0.7655586, "learning_rate": 2.6309104189216887e-06, "loss": 0.78710288, "num_input_tokens_seen": 74641355, "step": 3459, "time_per_iteration": 2.8138561248779297 }, { "auxiliary_loss_clip": 0.01132751, "auxiliary_loss_mlp": 0.00763215, "balance_loss_clip": 1.046525, "balance_loss_mlp": 1.00033283, "epoch": 0.4160404016112547, "flos": 20775651966720.0, "grad_norm": 2.106232315719616, "language_loss": 0.75054574, "learning_rate": 2.630171173366923e-06, "loss": 0.76950538, "num_input_tokens_seen": 74657155, "step": 3460, "time_per_iteration": 2.7280468940734863 }, { "auxiliary_loss_clip": 0.01127874, "auxiliary_loss_mlp": 0.01027814, "balance_loss_clip": 1.0462085, "balance_loss_mlp": 1.01920068, "epoch": 0.41616064450189383, "flos": 13916462820480.0, "grad_norm": 2.2364271667394515, "language_loss": 0.74526012, "learning_rate": 2.629431832215691e-06, "loss": 0.76681703, "num_input_tokens_seen": 74671960, "step": 3461, "time_per_iteration": 2.7445452213287354 }, { "auxiliary_loss_clip": 0.01152852, "auxiliary_loss_mlp": 0.01025522, "balance_loss_clip": 1.05014396, "balance_loss_mlp": 1.01715934, "epoch": 0.41628088739253294, "flos": 20010826650240.0, "grad_norm": 1.675373474709286, "language_loss": 0.87255937, "learning_rate": 2.628692395580151e-06, "loss": 0.89434308, "num_input_tokens_seen": 74692050, "step": 3462, "time_per_iteration": 2.7479617595672607 }, { "auxiliary_loss_clip": 0.01095877, "auxiliary_loss_mlp": 0.01032229, "balance_loss_clip": 1.04014421, "balance_loss_mlp": 1.02424765, "epoch": 0.416401130283172, "flos": 29168801377920.0, "grad_norm": 2.1329157290533804, "language_loss": 0.79811943, "learning_rate": 2.6279528635724747e-06, "loss": 0.81940043, "num_input_tokens_seen": 74712205, "step": 3463, "time_per_iteration": 2.8461639881134033 }, { "auxiliary_loss_clip": 0.01170079, "auxiliary_loss_mlp": 0.01025059, "balance_loss_clip": 1.04846191, "balance_loss_mlp": 1.01681626, "epoch": 0.4165213731738111, "flos": 16246813478400.0, "grad_norm": 3.2682228932403024, "language_loss": 0.7883687, "learning_rate": 2.627213236304848e-06, "loss": 0.81032008, "num_input_tokens_seen": 74729005, "step": 3464, "time_per_iteration": 2.6360480785369873 }, { "auxiliary_loss_clip": 0.01174149, "auxiliary_loss_mlp": 0.0102961, "balance_loss_clip": 1.05086446, "balance_loss_mlp": 1.02193832, "epoch": 0.4166416160644502, "flos": 33765438787200.0, "grad_norm": 2.0498646495773487, "language_loss": 0.71076667, "learning_rate": 2.626473513889472e-06, "loss": 0.7328043, "num_input_tokens_seen": 74751385, "step": 3465, "time_per_iteration": 2.7906296253204346 }, { "auxiliary_loss_clip": 0.01164371, "auxiliary_loss_mlp": 0.01037318, "balance_loss_clip": 1.04854512, "balance_loss_mlp": 1.02938485, "epoch": 0.41676185895508927, "flos": 20917498775040.0, "grad_norm": 1.933758726291422, "language_loss": 0.8285774, "learning_rate": 2.625733696438562e-06, "loss": 0.85059428, "num_input_tokens_seen": 74768890, "step": 3466, "time_per_iteration": 2.6378533840179443 }, { "auxiliary_loss_clip": 0.01153293, "auxiliary_loss_mlp": 0.01030275, "balance_loss_clip": 1.04778707, "balance_loss_mlp": 1.02151942, "epoch": 0.4168821018457284, "flos": 18406122549120.0, "grad_norm": 1.6893305854417422, "language_loss": 0.75210321, "learning_rate": 2.6249937840643476e-06, "loss": 0.77393889, "num_input_tokens_seen": 74787195, "step": 3467, "time_per_iteration": 2.6766085624694824 }, { "auxiliary_loss_clip": 0.01188731, "auxiliary_loss_mlp": 0.00762918, "balance_loss_clip": 1.05548835, "balance_loss_mlp": 1.00046515, "epoch": 0.41700234473636744, "flos": 18698399516160.0, "grad_norm": 1.7083835791490265, "language_loss": 0.67110729, "learning_rate": 2.6242537768790733e-06, "loss": 0.69062376, "num_input_tokens_seen": 74806350, "step": 3468, "time_per_iteration": 2.5978121757507324 }, { "auxiliary_loss_clip": 0.01170644, "auxiliary_loss_mlp": 0.01028851, "balance_loss_clip": 1.05083823, "balance_loss_mlp": 1.02039337, "epoch": 0.41712258762700655, "flos": 31033283616000.0, "grad_norm": 1.783627595316444, "language_loss": 0.6866684, "learning_rate": 2.6235136749949975e-06, "loss": 0.70866334, "num_input_tokens_seen": 74829800, "step": 3469, "time_per_iteration": 3.630089282989502 }, { "auxiliary_loss_clip": 0.01184778, "auxiliary_loss_mlp": 0.01031123, "balance_loss_clip": 1.0513401, "balance_loss_mlp": 1.02324963, "epoch": 0.41724283051764566, "flos": 35914763877120.0, "grad_norm": 2.1910644875299967, "language_loss": 0.61547709, "learning_rate": 2.6227734785243924e-06, "loss": 0.63763613, "num_input_tokens_seen": 74849760, "step": 3470, "time_per_iteration": 2.8189282417297363 }, { "auxiliary_loss_clip": 0.01105629, "auxiliary_loss_mlp": 0.01027283, "balance_loss_clip": 1.04039872, "balance_loss_mlp": 1.01957631, "epoch": 0.4173630734082847, "flos": 25333649320320.0, "grad_norm": 1.7649569000693905, "language_loss": 0.79392701, "learning_rate": 2.6220331875795466e-06, "loss": 0.81525618, "num_input_tokens_seen": 74869110, "step": 3471, "time_per_iteration": 3.8060641288757324 }, { "auxiliary_loss_clip": 0.01169917, "auxiliary_loss_mlp": 0.01031863, "balance_loss_clip": 1.05213511, "balance_loss_mlp": 1.02310693, "epoch": 0.4174833162989238, "flos": 26685398868480.0, "grad_norm": 1.6794432390180742, "language_loss": 0.75377905, "learning_rate": 2.62129280227276e-06, "loss": 0.77579677, "num_input_tokens_seen": 74889110, "step": 3472, "time_per_iteration": 2.725999355316162 }, { "auxiliary_loss_clip": 0.01176128, "auxiliary_loss_mlp": 0.01038253, "balance_loss_clip": 1.05119348, "balance_loss_mlp": 1.03007507, "epoch": 0.41760355918956293, "flos": 74739584010240.0, "grad_norm": 2.1118188719547533, "language_loss": 0.68561637, "learning_rate": 2.62055232271635e-06, "loss": 0.7077601, "num_input_tokens_seen": 74916260, "step": 3473, "time_per_iteration": 3.062546968460083 }, { "auxiliary_loss_clip": 0.01135494, "auxiliary_loss_mlp": 0.01029302, "balance_loss_clip": 1.04556906, "balance_loss_mlp": 1.02083778, "epoch": 0.417723802080202, "flos": 14317513148160.0, "grad_norm": 2.0822162067017143, "language_loss": 0.87909168, "learning_rate": 2.619811749022646e-06, "loss": 0.90073967, "num_input_tokens_seen": 74931570, "step": 3474, "time_per_iteration": 2.65370512008667 }, { "auxiliary_loss_clip": 0.01175488, "auxiliary_loss_mlp": 0.01036272, "balance_loss_clip": 1.05412126, "balance_loss_mlp": 1.02676523, "epoch": 0.4178440449708411, "flos": 14643797316480.0, "grad_norm": 2.188612991979792, "language_loss": 0.71374488, "learning_rate": 2.6190710813039917e-06, "loss": 0.73586243, "num_input_tokens_seen": 74944695, "step": 3475, "time_per_iteration": 4.6343934535980225 }, { "auxiliary_loss_clip": 0.01122787, "auxiliary_loss_mlp": 0.00763388, "balance_loss_clip": 1.04175401, "balance_loss_mlp": 1.00026035, "epoch": 0.4179642878614802, "flos": 21507296094720.0, "grad_norm": 2.2090507424219212, "language_loss": 0.84153306, "learning_rate": 2.618330319672747e-06, "loss": 0.86039484, "num_input_tokens_seen": 74964115, "step": 3476, "time_per_iteration": 2.8584110736846924 }, { "auxiliary_loss_clip": 0.01188214, "auxiliary_loss_mlp": 0.01025883, "balance_loss_clip": 1.05357289, "balance_loss_mlp": 1.01797938, "epoch": 0.41808453075211927, "flos": 18441997257600.0, "grad_norm": 2.1960193070222442, "language_loss": 0.92070234, "learning_rate": 2.617589464241284e-06, "loss": 0.94284326, "num_input_tokens_seen": 74978515, "step": 3477, "time_per_iteration": 2.555351734161377 }, { "auxiliary_loss_clip": 0.01146963, "auxiliary_loss_mlp": 0.01031346, "balance_loss_clip": 1.04870176, "balance_loss_mlp": 1.02399659, "epoch": 0.4182047736427584, "flos": 20301020628480.0, "grad_norm": 2.615546260840198, "language_loss": 0.74643987, "learning_rate": 2.6168485151219914e-06, "loss": 0.76822299, "num_input_tokens_seen": 74998135, "step": 3478, "time_per_iteration": 2.7152771949768066 }, { "auxiliary_loss_clip": 0.01174292, "auxiliary_loss_mlp": 0.01028034, "balance_loss_clip": 1.05454421, "balance_loss_mlp": 1.02016056, "epoch": 0.4183250165333975, "flos": 18876623823360.0, "grad_norm": 2.9550968260440276, "language_loss": 0.71574187, "learning_rate": 2.616107472427269e-06, "loss": 0.73776507, "num_input_tokens_seen": 75012830, "step": 3479, "time_per_iteration": 2.5872716903686523 }, { "auxiliary_loss_clip": 0.01175523, "auxiliary_loss_mlp": 0.01035587, "balance_loss_clip": 1.04994488, "balance_loss_mlp": 1.02678943, "epoch": 0.41844525942403654, "flos": 17740050698880.0, "grad_norm": 2.5237533087946393, "language_loss": 0.76637131, "learning_rate": 2.615366336269533e-06, "loss": 0.78848237, "num_input_tokens_seen": 75026495, "step": 3480, "time_per_iteration": 2.6310482025146484 }, { "auxiliary_loss_clip": 0.01189773, "auxiliary_loss_mlp": 0.01028406, "balance_loss_clip": 1.05272388, "balance_loss_mlp": 1.0196383, "epoch": 0.41856550231467565, "flos": 18361377181440.0, "grad_norm": 2.274528555565257, "language_loss": 0.80402422, "learning_rate": 2.6146251067612126e-06, "loss": 0.82620597, "num_input_tokens_seen": 75041970, "step": 3481, "time_per_iteration": 2.553116798400879 }, { "auxiliary_loss_clip": 0.01171477, "auxiliary_loss_mlp": 0.01029699, "balance_loss_clip": 1.05303407, "balance_loss_mlp": 1.02143216, "epoch": 0.41868574520531476, "flos": 22781801445120.0, "grad_norm": 1.7801494720379165, "language_loss": 0.83029985, "learning_rate": 2.6138837840147525e-06, "loss": 0.85231155, "num_input_tokens_seen": 75061005, "step": 3482, "time_per_iteration": 2.6493980884552 }, { "auxiliary_loss_clip": 0.01140433, "auxiliary_loss_mlp": 0.01027716, "balance_loss_clip": 1.04650402, "balance_loss_mlp": 1.01972294, "epoch": 0.4188059880959538, "flos": 13699167494400.0, "grad_norm": 1.9407853367226497, "language_loss": 0.76713479, "learning_rate": 2.6131423681426103e-06, "loss": 0.78881633, "num_input_tokens_seen": 75076920, "step": 3483, "time_per_iteration": 2.6533124446868896 }, { "auxiliary_loss_clip": 0.01186302, "auxiliary_loss_mlp": 0.01030471, "balance_loss_clip": 1.05354261, "balance_loss_mlp": 1.02307749, "epoch": 0.41892623098659293, "flos": 37818281220480.0, "grad_norm": 2.1163683964241353, "language_loss": 0.73243648, "learning_rate": 2.6124008592572587e-06, "loss": 0.75460422, "num_input_tokens_seen": 75100905, "step": 3484, "time_per_iteration": 2.77797794342041 }, { "auxiliary_loss_clip": 0.01187856, "auxiliary_loss_mlp": 0.01026029, "balance_loss_clip": 1.05148411, "balance_loss_mlp": 1.01734471, "epoch": 0.419046473877232, "flos": 23258874908160.0, "grad_norm": 2.2359405730952493, "language_loss": 0.82259643, "learning_rate": 2.6116592574711835e-06, "loss": 0.84473526, "num_input_tokens_seen": 75119205, "step": 3485, "time_per_iteration": 2.592000961303711 }, { "auxiliary_loss_clip": 0.01189677, "auxiliary_loss_mlp": 0.01029901, "balance_loss_clip": 1.05362463, "balance_loss_mlp": 1.02197337, "epoch": 0.4191667167678711, "flos": 20741034234240.0, "grad_norm": 1.9063379827170264, "language_loss": 0.84463942, "learning_rate": 2.6109175628968853e-06, "loss": 0.86683524, "num_input_tokens_seen": 75138970, "step": 3486, "time_per_iteration": 2.6076834201812744 }, { "auxiliary_loss_clip": 0.0115966, "auxiliary_loss_mlp": 0.01027507, "balance_loss_clip": 1.04631937, "balance_loss_mlp": 1.01897204, "epoch": 0.4192869596585102, "flos": 23586416052480.0, "grad_norm": 2.0422084935262825, "language_loss": 0.82944775, "learning_rate": 2.610175775646878e-06, "loss": 0.85131943, "num_input_tokens_seen": 75157550, "step": 3487, "time_per_iteration": 2.6416242122650146 }, { "auxiliary_loss_clip": 0.01154696, "auxiliary_loss_mlp": 0.01028914, "balance_loss_clip": 1.04753566, "balance_loss_mlp": 1.01993155, "epoch": 0.41940720254914926, "flos": 25081269384960.0, "grad_norm": 2.1825446265148267, "language_loss": 0.73407805, "learning_rate": 2.6094338958336907e-06, "loss": 0.75591415, "num_input_tokens_seen": 75176220, "step": 3488, "time_per_iteration": 2.6953697204589844 }, { "auxiliary_loss_clip": 0.01161148, "auxiliary_loss_mlp": 0.01026544, "balance_loss_clip": 1.05236435, "balance_loss_mlp": 1.01881897, "epoch": 0.41952744543978837, "flos": 15554132628480.0, "grad_norm": 1.9888104546797019, "language_loss": 0.82260889, "learning_rate": 2.608691923569867e-06, "loss": 0.84448576, "num_input_tokens_seen": 75193095, "step": 3489, "time_per_iteration": 2.6333351135253906 }, { "auxiliary_loss_clip": 0.01177685, "auxiliary_loss_mlp": 0.01026675, "balance_loss_clip": 1.0548265, "balance_loss_mlp": 1.01877761, "epoch": 0.4196476883304275, "flos": 24644775312000.0, "grad_norm": 1.641227764346916, "language_loss": 0.75737971, "learning_rate": 2.6079498589679616e-06, "loss": 0.7794233, "num_input_tokens_seen": 75214185, "step": 3490, "time_per_iteration": 2.6646971702575684 }, { "auxiliary_loss_clip": 0.01112296, "auxiliary_loss_mlp": 0.01031169, "balance_loss_clip": 1.04285717, "balance_loss_mlp": 1.02150691, "epoch": 0.41976793122106654, "flos": 24531333183360.0, "grad_norm": 2.3863812840644694, "language_loss": 0.76185155, "learning_rate": 2.6072077021405465e-06, "loss": 0.78328621, "num_input_tokens_seen": 75233020, "step": 3491, "time_per_iteration": 2.826784133911133 }, { "auxiliary_loss_clip": 0.01150974, "auxiliary_loss_mlp": 0.01029215, "balance_loss_clip": 1.04692221, "balance_loss_mlp": 1.02142477, "epoch": 0.41988817411170565, "flos": 21175301664000.0, "grad_norm": 1.7252455326851732, "language_loss": 0.69625974, "learning_rate": 2.6064654532002054e-06, "loss": 0.71806169, "num_input_tokens_seen": 75252030, "step": 3492, "time_per_iteration": 2.7395827770233154 }, { "auxiliary_loss_clip": 0.0118685, "auxiliary_loss_mlp": 0.01026123, "balance_loss_clip": 1.05379832, "balance_loss_mlp": 1.01853502, "epoch": 0.42000841700234476, "flos": 31649402626560.0, "grad_norm": 1.5116171081009089, "language_loss": 0.75760198, "learning_rate": 2.6057231122595375e-06, "loss": 0.77973169, "num_input_tokens_seen": 75273340, "step": 3493, "time_per_iteration": 2.7768354415893555 }, { "auxiliary_loss_clip": 0.01160533, "auxiliary_loss_mlp": 0.01034287, "balance_loss_clip": 1.04978406, "balance_loss_mlp": 1.0265379, "epoch": 0.4201286598929838, "flos": 21281525159040.0, "grad_norm": 1.6038393927783618, "language_loss": 0.73202062, "learning_rate": 2.604980679431154e-06, "loss": 0.75396883, "num_input_tokens_seen": 75291580, "step": 3494, "time_per_iteration": 2.668672800064087 }, { "auxiliary_loss_clip": 0.01175076, "auxiliary_loss_mlp": 0.01024212, "balance_loss_clip": 1.05017197, "balance_loss_mlp": 1.01603436, "epoch": 0.4202489027836229, "flos": 18546532813440.0, "grad_norm": 2.0588178169971685, "language_loss": 0.74831045, "learning_rate": 2.604238154827684e-06, "loss": 0.77030331, "num_input_tokens_seen": 75308205, "step": 3495, "time_per_iteration": 3.58237361907959 }, { "auxiliary_loss_clip": 0.01177106, "auxiliary_loss_mlp": 0.01031122, "balance_loss_clip": 1.05388117, "balance_loss_mlp": 1.02288449, "epoch": 0.42036914567426203, "flos": 19317643009920.0, "grad_norm": 2.4956957114922527, "language_loss": 0.7276969, "learning_rate": 2.6034955385617656e-06, "loss": 0.74977911, "num_input_tokens_seen": 75326535, "step": 3496, "time_per_iteration": 2.630383014678955 }, { "auxiliary_loss_clip": 0.01052933, "auxiliary_loss_mlp": 0.01000887, "balance_loss_clip": 1.01504242, "balance_loss_mlp": 0.99913424, "epoch": 0.4204893885649011, "flos": 67842942935040.0, "grad_norm": 0.7177584716073458, "language_loss": 0.61709106, "learning_rate": 2.6027528307460544e-06, "loss": 0.63762927, "num_input_tokens_seen": 75390540, "step": 3497, "time_per_iteration": 4.434598445892334 }, { "auxiliary_loss_clip": 0.01189314, "auxiliary_loss_mlp": 0.01028516, "balance_loss_clip": 1.05415344, "balance_loss_mlp": 1.0206244, "epoch": 0.4206096314555402, "flos": 21908777385600.0, "grad_norm": 1.7670282637300714, "language_loss": 0.86396283, "learning_rate": 2.602010031493217e-06, "loss": 0.88614118, "num_input_tokens_seen": 75408770, "step": 3498, "time_per_iteration": 2.7410974502563477 }, { "auxiliary_loss_clip": 0.01142297, "auxiliary_loss_mlp": 0.01021143, "balance_loss_clip": 1.04945958, "balance_loss_mlp": 1.01298916, "epoch": 0.42072987434617926, "flos": 29278185269760.0, "grad_norm": 1.8401753803207481, "language_loss": 0.87083703, "learning_rate": 2.6012671409159367e-06, "loss": 0.89247143, "num_input_tokens_seen": 75430105, "step": 3499, "time_per_iteration": 2.738534450531006 }, { "auxiliary_loss_clip": 0.01156722, "auxiliary_loss_mlp": 0.01027277, "balance_loss_clip": 1.05145383, "balance_loss_mlp": 1.01914716, "epoch": 0.42085011723681837, "flos": 27600726170880.0, "grad_norm": 2.0494679149966957, "language_loss": 0.81925535, "learning_rate": 2.6005241591269097e-06, "loss": 0.84109539, "num_input_tokens_seen": 75449475, "step": 3500, "time_per_iteration": 3.636561155319214 }, { "auxiliary_loss_clip": 0.01142542, "auxiliary_loss_mlp": 0.01027207, "balance_loss_clip": 1.05289173, "balance_loss_mlp": 1.0197866, "epoch": 0.4209703601274575, "flos": 27818632028160.0, "grad_norm": 1.8098474620214282, "language_loss": 0.79843616, "learning_rate": 2.5997810862388454e-06, "loss": 0.82013363, "num_input_tokens_seen": 75469315, "step": 3501, "time_per_iteration": 3.6834137439727783 }, { "auxiliary_loss_clip": 0.01157735, "auxiliary_loss_mlp": 0.01026645, "balance_loss_clip": 1.04861534, "balance_loss_mlp": 1.01840723, "epoch": 0.42109060301809653, "flos": 27525529048320.0, "grad_norm": 2.7986332192261187, "language_loss": 0.75737619, "learning_rate": 2.599037922364467e-06, "loss": 0.77922004, "num_input_tokens_seen": 75488215, "step": 3502, "time_per_iteration": 2.781593084335327 }, { "auxiliary_loss_clip": 0.01139763, "auxiliary_loss_mlp": 0.01030728, "balance_loss_clip": 1.04922843, "balance_loss_mlp": 1.02229357, "epoch": 0.42121084590873564, "flos": 29314275459840.0, "grad_norm": 2.0934604262163, "language_loss": 0.75530541, "learning_rate": 2.5982946676165112e-06, "loss": 0.77701032, "num_input_tokens_seen": 75507985, "step": 3503, "time_per_iteration": 2.7563836574554443 }, { "auxiliary_loss_clip": 0.01077297, "auxiliary_loss_mlp": 0.01003586, "balance_loss_clip": 1.04001045, "balance_loss_mlp": 1.00147617, "epoch": 0.42133108879937475, "flos": 67398835178880.0, "grad_norm": 0.7268753768007374, "language_loss": 0.57545578, "learning_rate": 2.5975513221077313e-06, "loss": 0.5962646, "num_input_tokens_seen": 75571955, "step": 3504, "time_per_iteration": 3.3643743991851807 }, { "auxiliary_loss_clip": 0.01150325, "auxiliary_loss_mlp": 0.01034391, "balance_loss_clip": 1.04816842, "balance_loss_mlp": 1.02583838, "epoch": 0.4214513316900138, "flos": 23106038538240.0, "grad_norm": 2.168303480937892, "language_loss": 0.88535792, "learning_rate": 2.5968078859508897e-06, "loss": 0.90720505, "num_input_tokens_seen": 75589155, "step": 3505, "time_per_iteration": 2.6907010078430176 }, { "auxiliary_loss_clip": 0.01174405, "auxiliary_loss_mlp": 0.01023612, "balance_loss_clip": 1.0533042, "balance_loss_mlp": 1.01576757, "epoch": 0.4215715745806529, "flos": 15336190857600.0, "grad_norm": 2.0701894595746007, "language_loss": 0.79835999, "learning_rate": 2.5960643592587673e-06, "loss": 0.8203401, "num_input_tokens_seen": 75606565, "step": 3506, "time_per_iteration": 2.6331839561462402 }, { "auxiliary_loss_clip": 0.01146599, "auxiliary_loss_mlp": 0.01028045, "balance_loss_clip": 1.04839158, "balance_loss_mlp": 1.02005839, "epoch": 0.42169181747129203, "flos": 22127257860480.0, "grad_norm": 1.701310473139929, "language_loss": 0.81319064, "learning_rate": 2.5953207421441553e-06, "loss": 0.83493704, "num_input_tokens_seen": 75625165, "step": 3507, "time_per_iteration": 2.7252910137176514 }, { "auxiliary_loss_clip": 0.01147524, "auxiliary_loss_mlp": 0.01027042, "balance_loss_clip": 1.05007088, "balance_loss_mlp": 1.01851845, "epoch": 0.4218120603619311, "flos": 22630724841600.0, "grad_norm": 2.1119677359501736, "language_loss": 0.75638521, "learning_rate": 2.5945770347198603e-06, "loss": 0.77813083, "num_input_tokens_seen": 75643320, "step": 3508, "time_per_iteration": 2.747039794921875 }, { "auxiliary_loss_clip": 0.01153387, "auxiliary_loss_mlp": 0.01026239, "balance_loss_clip": 1.04603457, "balance_loss_mlp": 1.01843107, "epoch": 0.4219323032525702, "flos": 19682818629120.0, "grad_norm": 1.7252196732033775, "language_loss": 0.8218528, "learning_rate": 2.593833237098701e-06, "loss": 0.84364903, "num_input_tokens_seen": 75660920, "step": 3509, "time_per_iteration": 2.6713287830352783 }, { "auxiliary_loss_clip": 0.01169511, "auxiliary_loss_mlp": 0.01033037, "balance_loss_clip": 1.04710472, "balance_loss_mlp": 1.02414417, "epoch": 0.4220525461432093, "flos": 30190747224960.0, "grad_norm": 2.03285972268686, "language_loss": 0.63008821, "learning_rate": 2.593089349393512e-06, "loss": 0.65211368, "num_input_tokens_seen": 75681410, "step": 3510, "time_per_iteration": 2.6857268810272217 }, { "auxiliary_loss_clip": 0.01171709, "auxiliary_loss_mlp": 0.01029901, "balance_loss_clip": 1.05200231, "balance_loss_mlp": 1.02182508, "epoch": 0.42217278903384836, "flos": 24315941278080.0, "grad_norm": 2.4187369202665825, "language_loss": 0.83614099, "learning_rate": 2.592345371717141e-06, "loss": 0.85815716, "num_input_tokens_seen": 75700940, "step": 3511, "time_per_iteration": 2.650291919708252 }, { "auxiliary_loss_clip": 0.01174054, "auxiliary_loss_mlp": 0.01034091, "balance_loss_clip": 1.05531311, "balance_loss_mlp": 1.02570152, "epoch": 0.42229303192448747, "flos": 17092474352640.0, "grad_norm": 2.315202723197227, "language_loss": 0.71931577, "learning_rate": 2.591601304182448e-06, "loss": 0.7413972, "num_input_tokens_seen": 75718910, "step": 3512, "time_per_iteration": 2.604853630065918 }, { "auxiliary_loss_clip": 0.01159532, "auxiliary_loss_mlp": 0.01025344, "balance_loss_clip": 1.05367434, "balance_loss_mlp": 1.01747024, "epoch": 0.4224132748151266, "flos": 22784530878720.0, "grad_norm": 2.172189887692531, "language_loss": 0.79566717, "learning_rate": 2.5908571469023067e-06, "loss": 0.81751591, "num_input_tokens_seen": 75738395, "step": 3513, "time_per_iteration": 2.700212001800537 }, { "auxiliary_loss_clip": 0.0118637, "auxiliary_loss_mlp": 0.01029794, "balance_loss_clip": 1.05340123, "balance_loss_mlp": 1.0218612, "epoch": 0.42253351770576564, "flos": 17819090576640.0, "grad_norm": 2.3710727306675223, "language_loss": 0.75804257, "learning_rate": 2.5901128999896067e-06, "loss": 0.78020418, "num_input_tokens_seen": 75753825, "step": 3514, "time_per_iteration": 2.605175495147705 }, { "auxiliary_loss_clip": 0.01171554, "auxiliary_loss_mlp": 0.01030213, "balance_loss_clip": 1.0517422, "balance_loss_mlp": 1.02228522, "epoch": 0.42265376059640475, "flos": 28512390286080.0, "grad_norm": 1.6271151910872301, "language_loss": 0.6834954, "learning_rate": 2.5893685635572487e-06, "loss": 0.70551306, "num_input_tokens_seen": 75774675, "step": 3515, "time_per_iteration": 2.740384101867676 }, { "auxiliary_loss_clip": 0.01154322, "auxiliary_loss_mlp": 0.01030359, "balance_loss_clip": 1.0503813, "balance_loss_mlp": 1.02193642, "epoch": 0.4227740034870438, "flos": 16253349753600.0, "grad_norm": 1.990422708851614, "language_loss": 0.69204509, "learning_rate": 2.5886241377181483e-06, "loss": 0.71389186, "num_input_tokens_seen": 75793545, "step": 3516, "time_per_iteration": 2.632050037384033 }, { "auxiliary_loss_clip": 0.01173856, "auxiliary_loss_mlp": 0.01029002, "balance_loss_clip": 1.051126, "balance_loss_mlp": 1.01945949, "epoch": 0.4228942463776829, "flos": 25295691623040.0, "grad_norm": 1.8878280754387362, "language_loss": 0.8141073, "learning_rate": 2.587879622585234e-06, "loss": 0.83613586, "num_input_tokens_seen": 75812145, "step": 3517, "time_per_iteration": 2.663898229598999 }, { "auxiliary_loss_clip": 0.01172611, "auxiliary_loss_mlp": 0.01033496, "balance_loss_clip": 1.05435061, "balance_loss_mlp": 1.02534807, "epoch": 0.423014489268322, "flos": 26395779507840.0, "grad_norm": 2.016497279838565, "language_loss": 0.7591657, "learning_rate": 2.5871350182714486e-06, "loss": 0.78122675, "num_input_tokens_seen": 75833025, "step": 3518, "time_per_iteration": 2.6465697288513184 }, { "auxiliary_loss_clip": 0.01184665, "auxiliary_loss_mlp": 0.01030853, "balance_loss_clip": 1.05198216, "balance_loss_mlp": 1.02252007, "epoch": 0.4231347321589611, "flos": 17274002711040.0, "grad_norm": 2.233988058457752, "language_loss": 0.80219162, "learning_rate": 2.586390324889748e-06, "loss": 0.82434678, "num_input_tokens_seen": 75848925, "step": 3519, "time_per_iteration": 2.605771064758301 }, { "auxiliary_loss_clip": 0.01171054, "auxiliary_loss_mlp": 0.01028994, "balance_loss_clip": 1.05175471, "balance_loss_mlp": 1.02066112, "epoch": 0.4232549750496002, "flos": 22999635475200.0, "grad_norm": 1.7517251440479846, "language_loss": 0.67360592, "learning_rate": 2.5856455425531003e-06, "loss": 0.69560641, "num_input_tokens_seen": 75870400, "step": 3520, "time_per_iteration": 2.687532424926758 }, { "auxiliary_loss_clip": 0.01172392, "auxiliary_loss_mlp": 0.01032566, "balance_loss_clip": 1.05458355, "balance_loss_mlp": 1.02413177, "epoch": 0.4233752179402393, "flos": 21248343970560.0, "grad_norm": 1.7734261134982823, "language_loss": 0.81168109, "learning_rate": 2.5849006713744902e-06, "loss": 0.8337307, "num_input_tokens_seen": 75889195, "step": 3521, "time_per_iteration": 3.628653049468994 }, { "auxiliary_loss_clip": 0.01157862, "auxiliary_loss_mlp": 0.01029396, "balance_loss_clip": 1.0495131, "balance_loss_mlp": 1.02053261, "epoch": 0.42349546083087836, "flos": 20704297599360.0, "grad_norm": 70.52847032222567, "language_loss": 0.72594148, "learning_rate": 2.5841557114669135e-06, "loss": 0.74781406, "num_input_tokens_seen": 75906055, "step": 3522, "time_per_iteration": 2.6959068775177 }, { "auxiliary_loss_clip": 0.01189467, "auxiliary_loss_mlp": 0.01027906, "balance_loss_clip": 1.05168617, "balance_loss_mlp": 1.01874435, "epoch": 0.42361570372151747, "flos": 18585065128320.0, "grad_norm": 2.5780370557755052, "language_loss": 0.67121387, "learning_rate": 2.58341066294338e-06, "loss": 0.69338757, "num_input_tokens_seen": 75922720, "step": 3523, "time_per_iteration": 3.5693626403808594 }, { "auxiliary_loss_clip": 0.0113286, "auxiliary_loss_mlp": 0.00763236, "balance_loss_clip": 1.04697978, "balance_loss_mlp": 1.00042415, "epoch": 0.4237359466121566, "flos": 20959478795520.0, "grad_norm": 2.2172959979199196, "language_loss": 0.85188544, "learning_rate": 2.5826655259169124e-06, "loss": 0.87084639, "num_input_tokens_seen": 75941375, "step": 3524, "time_per_iteration": 2.7979917526245117 }, { "auxiliary_loss_clip": 0.01189337, "auxiliary_loss_mlp": 0.01029614, "balance_loss_clip": 1.05458462, "balance_loss_mlp": 1.02183008, "epoch": 0.42385618950279563, "flos": 18038181582720.0, "grad_norm": 1.891978018177093, "language_loss": 0.90713, "learning_rate": 2.5819203005005475e-06, "loss": 0.9293195, "num_input_tokens_seen": 75958710, "step": 3525, "time_per_iteration": 2.6336874961853027 }, { "auxiliary_loss_clip": 0.01153773, "auxiliary_loss_mlp": 0.01034421, "balance_loss_clip": 1.0497334, "balance_loss_mlp": 1.02537251, "epoch": 0.42397643239343474, "flos": 23769129559680.0, "grad_norm": 2.008509497961551, "language_loss": 0.78706998, "learning_rate": 2.581174986807336e-06, "loss": 0.80895191, "num_input_tokens_seen": 75978945, "step": 3526, "time_per_iteration": 3.6497247219085693 }, { "auxiliary_loss_clip": 0.01163513, "auxiliary_loss_mlp": 0.00763516, "balance_loss_clip": 1.04884744, "balance_loss_mlp": 1.00043666, "epoch": 0.42409667528407385, "flos": 16545088016640.0, "grad_norm": 2.6213761965992424, "language_loss": 0.91247493, "learning_rate": 2.580429584950341e-06, "loss": 0.93174517, "num_input_tokens_seen": 75994695, "step": 3527, "time_per_iteration": 3.5901148319244385 }, { "auxiliary_loss_clip": 0.01150493, "auxiliary_loss_mlp": 0.0103147, "balance_loss_clip": 1.04871988, "balance_loss_mlp": 1.02249932, "epoch": 0.4242169181747129, "flos": 16034186920320.0, "grad_norm": 2.1535403463354386, "language_loss": 0.66553438, "learning_rate": 2.5796840950426397e-06, "loss": 0.68735397, "num_input_tokens_seen": 76011780, "step": 3528, "time_per_iteration": 2.6876461505889893 }, { "auxiliary_loss_clip": 0.01166296, "auxiliary_loss_mlp": 0.01034893, "balance_loss_clip": 1.05135202, "balance_loss_mlp": 1.02561307, "epoch": 0.424337161065352, "flos": 20084012611200.0, "grad_norm": 4.935629946808246, "language_loss": 0.65883648, "learning_rate": 2.578938517197322e-06, "loss": 0.68084842, "num_input_tokens_seen": 76029875, "step": 3529, "time_per_iteration": 2.6843271255493164 }, { "auxiliary_loss_clip": 0.01152535, "auxiliary_loss_mlp": 0.01031701, "balance_loss_clip": 1.04934788, "balance_loss_mlp": 1.02263546, "epoch": 0.4244574039559911, "flos": 23878369797120.0, "grad_norm": 4.447605780148659, "language_loss": 0.62831652, "learning_rate": 2.5781928515274916e-06, "loss": 0.65015888, "num_input_tokens_seen": 76048595, "step": 3530, "time_per_iteration": 2.7046899795532227 }, { "auxiliary_loss_clip": 0.01176804, "auxiliary_loss_mlp": 0.01027614, "balance_loss_clip": 1.05309367, "balance_loss_mlp": 1.01909649, "epoch": 0.4245776468466302, "flos": 17565920542080.0, "grad_norm": 1.8152004737302525, "language_loss": 0.67537308, "learning_rate": 2.577447098146265e-06, "loss": 0.69741726, "num_input_tokens_seen": 76065770, "step": 3531, "time_per_iteration": 2.7420899868011475 }, { "auxiliary_loss_clip": 0.01146901, "auxiliary_loss_mlp": 0.0102884, "balance_loss_clip": 1.04956615, "balance_loss_mlp": 1.02025723, "epoch": 0.4246978897372693, "flos": 27776256958080.0, "grad_norm": 1.660756420769618, "language_loss": 0.78907013, "learning_rate": 2.5767012571667724e-06, "loss": 0.81082755, "num_input_tokens_seen": 76085250, "step": 3532, "time_per_iteration": 2.7533302307128906 }, { "auxiliary_loss_clip": 0.01174867, "auxiliary_loss_mlp": 0.01032389, "balance_loss_clip": 1.05071115, "balance_loss_mlp": 1.02321029, "epoch": 0.42481813262790835, "flos": 15596615439360.0, "grad_norm": 1.9610806697253336, "language_loss": 0.68545687, "learning_rate": 2.5759553287021587e-06, "loss": 0.70752943, "num_input_tokens_seen": 76103580, "step": 3533, "time_per_iteration": 2.601832389831543 }, { "auxiliary_loss_clip": 0.01157346, "auxiliary_loss_mlp": 0.01035406, "balance_loss_clip": 1.0500288, "balance_loss_mlp": 1.02696574, "epoch": 0.42493837551854746, "flos": 23951088881280.0, "grad_norm": 1.9468276654194652, "language_loss": 0.77546251, "learning_rate": 2.5752093128655786e-06, "loss": 0.79739004, "num_input_tokens_seen": 76121825, "step": 3534, "time_per_iteration": 2.7032108306884766 }, { "auxiliary_loss_clip": 0.01148012, "auxiliary_loss_mlp": 0.01028197, "balance_loss_clip": 1.04532337, "balance_loss_mlp": 1.01973939, "epoch": 0.4250586184091866, "flos": 20813466009600.0, "grad_norm": 1.9666882887240578, "language_loss": 0.73791575, "learning_rate": 2.574463209770204e-06, "loss": 0.75967789, "num_input_tokens_seen": 76141140, "step": 3535, "time_per_iteration": 2.657381296157837 }, { "auxiliary_loss_clip": 0.01140672, "auxiliary_loss_mlp": 0.01027391, "balance_loss_clip": 1.04491913, "balance_loss_mlp": 1.01868856, "epoch": 0.42517886129982563, "flos": 30371018607360.0, "grad_norm": 1.6636445369566226, "language_loss": 0.79535794, "learning_rate": 2.5737170195292165e-06, "loss": 0.81703854, "num_input_tokens_seen": 76164475, "step": 3536, "time_per_iteration": 2.8733224868774414 }, { "auxiliary_loss_clip": 0.01141179, "auxiliary_loss_mlp": 0.0103291, "balance_loss_clip": 1.04515755, "balance_loss_mlp": 1.02428484, "epoch": 0.42529910419046474, "flos": 20080636732800.0, "grad_norm": 1.6810804670594863, "language_loss": 0.77865154, "learning_rate": 2.572970742255814e-06, "loss": 0.80039239, "num_input_tokens_seen": 76182965, "step": 3537, "time_per_iteration": 2.678542137145996 }, { "auxiliary_loss_clip": 0.01171809, "auxiliary_loss_mlp": 0.01028587, "balance_loss_clip": 1.0527215, "balance_loss_mlp": 1.02006996, "epoch": 0.42541934708110385, "flos": 22632448694400.0, "grad_norm": 1.837817260773353, "language_loss": 0.81435907, "learning_rate": 2.5722243780632046e-06, "loss": 0.83636308, "num_input_tokens_seen": 76201230, "step": 3538, "time_per_iteration": 2.7220025062561035 }, { "auxiliary_loss_clip": 0.01047706, "auxiliary_loss_mlp": 0.01006834, "balance_loss_clip": 1.01787877, "balance_loss_mlp": 1.00490308, "epoch": 0.4255395899717429, "flos": 66200676186240.0, "grad_norm": 0.7773195658321704, "language_loss": 0.60387391, "learning_rate": 2.5714779270646125e-06, "loss": 0.62441933, "num_input_tokens_seen": 76262000, "step": 3539, "time_per_iteration": 3.227846145629883 }, { "auxiliary_loss_clip": 0.01159643, "auxiliary_loss_mlp": 0.00764367, "balance_loss_clip": 1.05209899, "balance_loss_mlp": 1.00037956, "epoch": 0.425659832862382, "flos": 17931814433280.0, "grad_norm": 2.466787777941296, "language_loss": 0.78031385, "learning_rate": 2.5707313893732735e-06, "loss": 0.79955399, "num_input_tokens_seen": 76280540, "step": 3540, "time_per_iteration": 2.7084479331970215 }, { "auxiliary_loss_clip": 0.0109356, "auxiliary_loss_mlp": 0.01027867, "balance_loss_clip": 1.03899407, "balance_loss_mlp": 1.01888418, "epoch": 0.4257800757530211, "flos": 24022550989440.0, "grad_norm": 1.7943008791281314, "language_loss": 0.76943958, "learning_rate": 2.5699847651024364e-06, "loss": 0.79065382, "num_input_tokens_seen": 76301180, "step": 3541, "time_per_iteration": 2.831658363342285 }, { "auxiliary_loss_clip": 0.0117091, "auxiliary_loss_mlp": 0.01033001, "balance_loss_clip": 1.05277729, "balance_loss_mlp": 1.02466202, "epoch": 0.4259003186436602, "flos": 23696015425920.0, "grad_norm": 2.6743016238093817, "language_loss": 0.76482153, "learning_rate": 2.5692380543653627e-06, "loss": 0.78686064, "num_input_tokens_seen": 76319335, "step": 3542, "time_per_iteration": 2.6723670959472656 }, { "auxiliary_loss_clip": 0.01178004, "auxiliary_loss_mlp": 0.00763787, "balance_loss_clip": 1.05272245, "balance_loss_mlp": 1.00039136, "epoch": 0.4260205615342993, "flos": 15259772672640.0, "grad_norm": 2.0710600411036673, "language_loss": 0.70150197, "learning_rate": 2.5684912572753293e-06, "loss": 0.72091991, "num_input_tokens_seen": 76335010, "step": 3543, "time_per_iteration": 2.6652090549468994 }, { "auxiliary_loss_clip": 0.01184597, "auxiliary_loss_mlp": 0.01030376, "balance_loss_clip": 1.0529393, "balance_loss_mlp": 1.02281761, "epoch": 0.4261408044249384, "flos": 30665306736000.0, "grad_norm": 1.9845208613039869, "language_loss": 0.8450948, "learning_rate": 2.5677443739456245e-06, "loss": 0.86724454, "num_input_tokens_seen": 76356670, "step": 3544, "time_per_iteration": 2.6831905841827393 }, { "auxiliary_loss_clip": 0.01162873, "auxiliary_loss_mlp": 0.01032205, "balance_loss_clip": 1.05262196, "balance_loss_mlp": 1.02387273, "epoch": 0.42626104731557746, "flos": 23257905240960.0, "grad_norm": 2.2970676957567013, "language_loss": 0.7964263, "learning_rate": 2.5669974044895495e-06, "loss": 0.81837702, "num_input_tokens_seen": 76373065, "step": 3545, "time_per_iteration": 2.720111608505249 }, { "auxiliary_loss_clip": 0.01151173, "auxiliary_loss_mlp": 0.0102622, "balance_loss_clip": 1.04832041, "balance_loss_mlp": 1.01762509, "epoch": 0.42638129020621657, "flos": 25884770670720.0, "grad_norm": 1.6333459186891055, "language_loss": 0.79005802, "learning_rate": 2.5662503490204187e-06, "loss": 0.81183195, "num_input_tokens_seen": 76393230, "step": 3546, "time_per_iteration": 2.720024585723877 }, { "auxiliary_loss_clip": 0.01155036, "auxiliary_loss_mlp": 0.01038337, "balance_loss_clip": 1.04686451, "balance_loss_mlp": 1.02929497, "epoch": 0.4265015330968556, "flos": 26502362138880.0, "grad_norm": 1.9890829846609375, "language_loss": 0.76438498, "learning_rate": 2.5655032076515603e-06, "loss": 0.78631872, "num_input_tokens_seen": 76412555, "step": 3547, "time_per_iteration": 3.6662683486938477 }, { "auxiliary_loss_clip": 0.01160781, "auxiliary_loss_mlp": 0.01028355, "balance_loss_clip": 1.0507586, "balance_loss_mlp": 1.02006412, "epoch": 0.42662177598749473, "flos": 24389522288640.0, "grad_norm": 5.143813064632022, "language_loss": 0.82132494, "learning_rate": 2.5647559804963155e-06, "loss": 0.8432163, "num_input_tokens_seen": 76432485, "step": 3548, "time_per_iteration": 2.7028098106384277 }, { "auxiliary_loss_clip": 0.01137883, "auxiliary_loss_mlp": 0.01027937, "balance_loss_clip": 1.0490104, "balance_loss_mlp": 1.01945519, "epoch": 0.42674201887813384, "flos": 23148629089920.0, "grad_norm": 1.9756868275064754, "language_loss": 0.78851688, "learning_rate": 2.5640086676680364e-06, "loss": 0.81017506, "num_input_tokens_seen": 76453980, "step": 3549, "time_per_iteration": 3.7275803089141846 }, { "auxiliary_loss_clip": 0.0117586, "auxiliary_loss_mlp": 0.01030967, "balance_loss_clip": 1.0529722, "balance_loss_mlp": 1.02266431, "epoch": 0.4268622617687729, "flos": 21689614552320.0, "grad_norm": 2.7465713268694962, "language_loss": 0.80935514, "learning_rate": 2.5632612692800923e-06, "loss": 0.8314234, "num_input_tokens_seen": 76473045, "step": 3550, "time_per_iteration": 2.6946353912353516 }, { "auxiliary_loss_clip": 0.01147176, "auxiliary_loss_mlp": 0.01032943, "balance_loss_clip": 1.04845333, "balance_loss_mlp": 1.02335227, "epoch": 0.426982504659412, "flos": 23440151871360.0, "grad_norm": 1.9843526577873574, "language_loss": 0.7560339, "learning_rate": 2.5625137854458603e-06, "loss": 0.77783507, "num_input_tokens_seen": 76492060, "step": 3551, "time_per_iteration": 3.680642604827881 }, { "auxiliary_loss_clip": 0.01162801, "auxiliary_loss_mlp": 0.01032481, "balance_loss_clip": 1.04974842, "balance_loss_mlp": 1.02463675, "epoch": 0.4271027475500511, "flos": 18916556768640.0, "grad_norm": 1.9859393310219438, "language_loss": 0.80291712, "learning_rate": 2.561766216278735e-06, "loss": 0.82486993, "num_input_tokens_seen": 76509655, "step": 3552, "time_per_iteration": 2.732646942138672 }, { "auxiliary_loss_clip": 0.01131413, "auxiliary_loss_mlp": 0.01031998, "balance_loss_clip": 1.048612, "balance_loss_mlp": 1.02365971, "epoch": 0.4272229904406902, "flos": 26870554500480.0, "grad_norm": 2.0006516303086372, "language_loss": 0.81056547, "learning_rate": 2.561018561892121e-06, "loss": 0.83219957, "num_input_tokens_seen": 76528795, "step": 3553, "time_per_iteration": 3.69384503364563 }, { "auxiliary_loss_clip": 0.01155289, "auxiliary_loss_mlp": 0.01027849, "balance_loss_clip": 1.04602098, "balance_loss_mlp": 1.01908755, "epoch": 0.4273432333313293, "flos": 23951376190080.0, "grad_norm": 2.435051292531486, "language_loss": 0.76852381, "learning_rate": 2.5602708223994363e-06, "loss": 0.79035521, "num_input_tokens_seen": 76550660, "step": 3554, "time_per_iteration": 2.715123414993286 }, { "auxiliary_loss_clip": 0.01143261, "auxiliary_loss_mlp": 0.01030623, "balance_loss_clip": 1.04293823, "balance_loss_mlp": 1.02245116, "epoch": 0.4274634762219684, "flos": 29570354496000.0, "grad_norm": 2.375513861930382, "language_loss": 0.67875224, "learning_rate": 2.559522997914115e-06, "loss": 0.70049107, "num_input_tokens_seen": 76570240, "step": 3555, "time_per_iteration": 2.778697967529297 }, { "auxiliary_loss_clip": 0.01186809, "auxiliary_loss_mlp": 0.0103172, "balance_loss_clip": 1.05515075, "balance_loss_mlp": 1.02392375, "epoch": 0.42758371911260745, "flos": 21434146047360.0, "grad_norm": 1.9781009858662122, "language_loss": 0.84300494, "learning_rate": 2.558775088549599e-06, "loss": 0.86519027, "num_input_tokens_seen": 76589820, "step": 3556, "time_per_iteration": 2.6392624378204346 }, { "auxiliary_loss_clip": 0.01181319, "auxiliary_loss_mlp": 0.01029057, "balance_loss_clip": 1.05251098, "balance_loss_mlp": 1.01981831, "epoch": 0.42770396200324656, "flos": 14752822072320.0, "grad_norm": 2.2405724634686264, "language_loss": 0.66221523, "learning_rate": 2.5580270944193467e-06, "loss": 0.68431896, "num_input_tokens_seen": 76606640, "step": 3557, "time_per_iteration": 2.6725265979766846 }, { "auxiliary_loss_clip": 0.01086473, "auxiliary_loss_mlp": 0.01004789, "balance_loss_clip": 1.0178144, "balance_loss_mlp": 1.00288188, "epoch": 0.4278242048938857, "flos": 70654712601600.0, "grad_norm": 0.7433794170768325, "language_loss": 0.55488724, "learning_rate": 2.557279015636827e-06, "loss": 0.57579988, "num_input_tokens_seen": 76667050, "step": 3558, "time_per_iteration": 3.233607530593872 }, { "auxiliary_loss_clip": 0.01071592, "auxiliary_loss_mlp": 0.01002804, "balance_loss_clip": 1.01730716, "balance_loss_mlp": 1.00106359, "epoch": 0.42794444778452473, "flos": 69366165033600.0, "grad_norm": 0.8209970267353723, "language_loss": 0.61218166, "learning_rate": 2.5565308523155245e-06, "loss": 0.63292563, "num_input_tokens_seen": 76726650, "step": 3559, "time_per_iteration": 3.1589066982269287 }, { "auxiliary_loss_clip": 0.01126032, "auxiliary_loss_mlp": 0.01037976, "balance_loss_clip": 1.04764533, "balance_loss_mlp": 1.02968502, "epoch": 0.42806469067516384, "flos": 18215328481920.0, "grad_norm": 2.378983481962055, "language_loss": 0.81835186, "learning_rate": 2.5557826045689336e-06, "loss": 0.83999193, "num_input_tokens_seen": 76742890, "step": 3560, "time_per_iteration": 2.764399290084839 }, { "auxiliary_loss_clip": 0.01065005, "auxiliary_loss_mlp": 0.01002882, "balance_loss_clip": 1.02848685, "balance_loss_mlp": 1.00091541, "epoch": 0.4281849335658029, "flos": 54535814432640.0, "grad_norm": 0.8267305275907741, "language_loss": 0.58783567, "learning_rate": 2.5550342725105643e-06, "loss": 0.60851455, "num_input_tokens_seen": 76801055, "step": 3561, "time_per_iteration": 3.387869358062744 }, { "auxiliary_loss_clip": 0.01175699, "auxiliary_loss_mlp": 0.01032895, "balance_loss_clip": 1.05599821, "balance_loss_mlp": 1.02392459, "epoch": 0.428305176456442, "flos": 17274828723840.0, "grad_norm": 1.9924327099530497, "language_loss": 0.81119561, "learning_rate": 2.554285856253937e-06, "loss": 0.83328152, "num_input_tokens_seen": 76819890, "step": 3562, "time_per_iteration": 2.8317251205444336 }, { "auxiliary_loss_clip": 0.01156939, "auxiliary_loss_mlp": 0.01038118, "balance_loss_clip": 1.05233812, "balance_loss_mlp": 1.02985692, "epoch": 0.4284254193470811, "flos": 26359509749760.0, "grad_norm": 1.6545504500514931, "language_loss": 0.77698159, "learning_rate": 2.5535373559125855e-06, "loss": 0.79893214, "num_input_tokens_seen": 76840255, "step": 3563, "time_per_iteration": 2.7970194816589355 }, { "auxiliary_loss_clip": 0.01098715, "auxiliary_loss_mlp": 0.01034425, "balance_loss_clip": 1.04153204, "balance_loss_mlp": 1.02570808, "epoch": 0.42854566223772017, "flos": 29714248379520.0, "grad_norm": 1.5468399878867374, "language_loss": 0.82216614, "learning_rate": 2.552788771600057e-06, "loss": 0.84349751, "num_input_tokens_seen": 76860565, "step": 3564, "time_per_iteration": 2.96913743019104 }, { "auxiliary_loss_clip": 0.01149718, "auxiliary_loss_mlp": 0.0103536, "balance_loss_clip": 1.05157089, "balance_loss_mlp": 1.02677655, "epoch": 0.4286659051283593, "flos": 22018161277440.0, "grad_norm": 1.7453123792323384, "language_loss": 0.82063746, "learning_rate": 2.5520401034299118e-06, "loss": 0.84248823, "num_input_tokens_seen": 76878325, "step": 3565, "time_per_iteration": 2.7962448596954346 }, { "auxiliary_loss_clip": 0.01175334, "auxiliary_loss_mlp": 0.01031061, "balance_loss_clip": 1.05113995, "balance_loss_mlp": 1.02116644, "epoch": 0.4287861480189984, "flos": 13334422838400.0, "grad_norm": 1.9092666056164034, "language_loss": 0.87961483, "learning_rate": 2.551291351515722e-06, "loss": 0.90167874, "num_input_tokens_seen": 76895340, "step": 3566, "time_per_iteration": 2.6054606437683105 }, { "auxiliary_loss_clip": 0.01140629, "auxiliary_loss_mlp": 0.00763838, "balance_loss_clip": 1.04466927, "balance_loss_mlp": 1.00044119, "epoch": 0.42890639090963745, "flos": 26651535321600.0, "grad_norm": 1.7577367914418303, "language_loss": 0.86000705, "learning_rate": 2.5505425159710726e-06, "loss": 0.87905174, "num_input_tokens_seen": 76915150, "step": 3567, "time_per_iteration": 2.7730746269226074 }, { "auxiliary_loss_clip": 0.01166721, "auxiliary_loss_mlp": 0.00764057, "balance_loss_clip": 1.05074382, "balance_loss_mlp": 1.00041628, "epoch": 0.42902663380027656, "flos": 24055768091520.0, "grad_norm": 1.7881097299079334, "language_loss": 0.83138978, "learning_rate": 2.549793596909561e-06, "loss": 0.85069752, "num_input_tokens_seen": 76933770, "step": 3568, "time_per_iteration": 2.7250311374664307 }, { "auxiliary_loss_clip": 0.01155435, "auxiliary_loss_mlp": 0.01026673, "balance_loss_clip": 1.04982626, "balance_loss_mlp": 1.01844203, "epoch": 0.42914687669091567, "flos": 15632561975040.0, "grad_norm": 2.1716195801292075, "language_loss": 0.65919113, "learning_rate": 2.5490445944447976e-06, "loss": 0.68101227, "num_input_tokens_seen": 76952265, "step": 3569, "time_per_iteration": 2.6952881813049316 }, { "auxiliary_loss_clip": 0.01175845, "auxiliary_loss_mlp": 0.01032663, "balance_loss_clip": 1.05286682, "balance_loss_mlp": 1.02402616, "epoch": 0.4292671195815547, "flos": 31467802440960.0, "grad_norm": 2.0790128373576175, "language_loss": 0.6547606, "learning_rate": 2.548295508690406e-06, "loss": 0.67684567, "num_input_tokens_seen": 76973560, "step": 3570, "time_per_iteration": 2.703143358230591 }, { "auxiliary_loss_clip": 0.01176208, "auxiliary_loss_mlp": 0.01033841, "balance_loss_clip": 1.05001926, "balance_loss_mlp": 1.02520967, "epoch": 0.42938736247219383, "flos": 30257756046720.0, "grad_norm": 2.015925765642274, "language_loss": 0.76393878, "learning_rate": 2.5475463397600217e-06, "loss": 0.78603923, "num_input_tokens_seen": 76993640, "step": 3571, "time_per_iteration": 2.7500104904174805 }, { "auxiliary_loss_clip": 0.01190487, "auxiliary_loss_mlp": 0.01031278, "balance_loss_clip": 1.05398476, "balance_loss_mlp": 1.02262902, "epoch": 0.42950760536283294, "flos": 29349683291520.0, "grad_norm": 2.154189343392535, "language_loss": 0.77808809, "learning_rate": 2.546797087767293e-06, "loss": 0.80030572, "num_input_tokens_seen": 77013765, "step": 3572, "time_per_iteration": 2.734323263168335 }, { "auxiliary_loss_clip": 0.01128853, "auxiliary_loss_mlp": 0.01030406, "balance_loss_clip": 1.04753816, "balance_loss_mlp": 1.0218643, "epoch": 0.429627848253472, "flos": 26869943969280.0, "grad_norm": 1.668514765294981, "language_loss": 0.87296593, "learning_rate": 2.546047752825881e-06, "loss": 0.89455849, "num_input_tokens_seen": 77034370, "step": 3573, "time_per_iteration": 3.6749446392059326 }, { "auxiliary_loss_clip": 0.01136434, "auxiliary_loss_mlp": 0.01023452, "balance_loss_clip": 1.04759145, "balance_loss_mlp": 1.01478004, "epoch": 0.4297480911441111, "flos": 13881270470400.0, "grad_norm": 2.070212120025745, "language_loss": 0.93612134, "learning_rate": 2.5452983350494595e-06, "loss": 0.95772016, "num_input_tokens_seen": 77049925, "step": 3574, "time_per_iteration": 2.789116144180298 }, { "auxiliary_loss_clip": 0.01178006, "auxiliary_loss_mlp": 0.00763498, "balance_loss_clip": 1.05365205, "balance_loss_mlp": 1.00045145, "epoch": 0.4298683340347502, "flos": 20741141975040.0, "grad_norm": 1.983582721311658, "language_loss": 0.65405214, "learning_rate": 2.544548834551713e-06, "loss": 0.67346716, "num_input_tokens_seen": 77068930, "step": 3575, "time_per_iteration": 3.74849009513855 }, { "auxiliary_loss_clip": 0.01142188, "auxiliary_loss_mlp": 0.00763972, "balance_loss_clip": 1.04752111, "balance_loss_mlp": 1.00041449, "epoch": 0.4299885769253893, "flos": 20882126856960.0, "grad_norm": 2.8950959525872295, "language_loss": 0.9490723, "learning_rate": 2.5437992514463424e-06, "loss": 0.96813393, "num_input_tokens_seen": 77082255, "step": 3576, "time_per_iteration": 2.699000120162964 }, { "auxiliary_loss_clip": 0.01173379, "auxiliary_loss_mlp": 0.01032095, "balance_loss_clip": 1.05099189, "balance_loss_mlp": 1.02296996, "epoch": 0.4301088198160284, "flos": 25484618183040.0, "grad_norm": 1.6838051996476395, "language_loss": 0.87912571, "learning_rate": 2.5430495858470565e-06, "loss": 0.90118039, "num_input_tokens_seen": 77101725, "step": 3577, "time_per_iteration": 2.715313673019409 }, { "auxiliary_loss_clip": 0.01171226, "auxiliary_loss_mlp": 0.0102945, "balance_loss_clip": 1.05102205, "balance_loss_mlp": 1.02052736, "epoch": 0.43022906270666744, "flos": 18259427404800.0, "grad_norm": 2.305336896513299, "language_loss": 0.77498603, "learning_rate": 2.54229983786758e-06, "loss": 0.79699278, "num_input_tokens_seen": 77119670, "step": 3578, "time_per_iteration": 4.491747856140137 }, { "auxiliary_loss_clip": 0.01162518, "auxiliary_loss_mlp": 0.01032136, "balance_loss_clip": 1.05062532, "balance_loss_mlp": 1.02267647, "epoch": 0.43034930559730655, "flos": 23399536567680.0, "grad_norm": 1.8193216065438784, "language_loss": 0.8542459, "learning_rate": 2.541550007621651e-06, "loss": 0.87619245, "num_input_tokens_seen": 77138160, "step": 3579, "time_per_iteration": 2.81290340423584 }, { "auxiliary_loss_clip": 0.01176115, "auxiliary_loss_mlp": 0.01027664, "balance_loss_clip": 1.05624771, "balance_loss_mlp": 1.01945662, "epoch": 0.43046954848794566, "flos": 28184382264960.0, "grad_norm": 2.287982713966207, "language_loss": 0.80618942, "learning_rate": 2.5408000952230156e-06, "loss": 0.82822722, "num_input_tokens_seen": 77156950, "step": 3580, "time_per_iteration": 2.6883723735809326 }, { "auxiliary_loss_clip": 0.01154805, "auxiliary_loss_mlp": 0.01030235, "balance_loss_clip": 1.04811537, "balance_loss_mlp": 1.02125227, "epoch": 0.4305897913785847, "flos": 28580476515840.0, "grad_norm": 1.9742496237279235, "language_loss": 0.90624464, "learning_rate": 2.5400501007854357e-06, "loss": 0.92809504, "num_input_tokens_seen": 77176395, "step": 3581, "time_per_iteration": 2.738523006439209 }, { "auxiliary_loss_clip": 0.01126854, "auxiliary_loss_mlp": 0.01025217, "balance_loss_clip": 1.04375935, "balance_loss_mlp": 1.01667547, "epoch": 0.43071003426922383, "flos": 20448721353600.0, "grad_norm": 2.2363266237033015, "language_loss": 0.75403535, "learning_rate": 2.539300024422685e-06, "loss": 0.77555609, "num_input_tokens_seen": 77194340, "step": 3582, "time_per_iteration": 2.789151668548584 }, { "auxiliary_loss_clip": 0.01052933, "auxiliary_loss_mlp": 0.0100332, "balance_loss_clip": 1.01751876, "balance_loss_mlp": 1.00161517, "epoch": 0.43083027715986294, "flos": 51997969883520.0, "grad_norm": 0.7924929334248949, "language_loss": 0.60932624, "learning_rate": 2.538549866248549e-06, "loss": 0.62988877, "num_input_tokens_seen": 77249320, "step": 3583, "time_per_iteration": 3.1530730724334717 }, { "auxiliary_loss_clip": 0.01177021, "auxiliary_loss_mlp": 0.01029392, "balance_loss_clip": 1.05246389, "balance_loss_mlp": 1.02071393, "epoch": 0.430950520050502, "flos": 16690885320960.0, "grad_norm": 2.5729074450994536, "language_loss": 0.81156611, "learning_rate": 2.5377996263768274e-06, "loss": 0.83363032, "num_input_tokens_seen": 77267400, "step": 3584, "time_per_iteration": 2.6495094299316406 }, { "auxiliary_loss_clip": 0.01171248, "auxiliary_loss_mlp": 0.01038755, "balance_loss_clip": 1.05107844, "balance_loss_mlp": 1.02997565, "epoch": 0.4310707629411411, "flos": 24608433726720.0, "grad_norm": 1.9486414217562429, "language_loss": 0.6862939, "learning_rate": 2.5370493049213293e-06, "loss": 0.70839393, "num_input_tokens_seen": 77287045, "step": 3585, "time_per_iteration": 2.7423524856567383 }, { "auxiliary_loss_clip": 0.01084145, "auxiliary_loss_mlp": 0.01029948, "balance_loss_clip": 1.04283309, "balance_loss_mlp": 1.02155566, "epoch": 0.4311910058317802, "flos": 26432983019520.0, "grad_norm": 1.9674277147102284, "language_loss": 0.79746211, "learning_rate": 2.536298901995878e-06, "loss": 0.8186031, "num_input_tokens_seen": 77306255, "step": 3586, "time_per_iteration": 3.07484769821167 }, { "auxiliary_loss_clip": 0.01163398, "auxiliary_loss_mlp": 0.01029193, "balance_loss_clip": 1.05138612, "balance_loss_mlp": 1.01993048, "epoch": 0.43131124872241927, "flos": 25155891889920.0, "grad_norm": 1.9803882135957005, "language_loss": 0.80058783, "learning_rate": 2.535548417714311e-06, "loss": 0.8225137, "num_input_tokens_seen": 77325555, "step": 3587, "time_per_iteration": 2.9281415939331055 }, { "auxiliary_loss_clip": 0.01178842, "auxiliary_loss_mlp": 0.01030134, "balance_loss_clip": 1.05081105, "balance_loss_mlp": 1.02150941, "epoch": 0.4314314916130584, "flos": 21614812479360.0, "grad_norm": 1.6373541943091054, "language_loss": 0.87224609, "learning_rate": 2.534797852190474e-06, "loss": 0.89433587, "num_input_tokens_seen": 77345735, "step": 3588, "time_per_iteration": 2.642416477203369 }, { "auxiliary_loss_clip": 0.01170043, "auxiliary_loss_mlp": 0.01034634, "balance_loss_clip": 1.04970396, "balance_loss_mlp": 1.02603889, "epoch": 0.4315517345036975, "flos": 19275016544640.0, "grad_norm": 2.049066740227965, "language_loss": 0.81519711, "learning_rate": 2.5340472055382283e-06, "loss": 0.83724391, "num_input_tokens_seen": 77361765, "step": 3589, "time_per_iteration": 2.6205978393554688 }, { "auxiliary_loss_clip": 0.0114527, "auxiliary_loss_mlp": 0.01025078, "balance_loss_clip": 1.04720354, "balance_loss_mlp": 1.01657856, "epoch": 0.43167197739433655, "flos": 24273853516800.0, "grad_norm": 2.1644296582411235, "language_loss": 0.81341338, "learning_rate": 2.5332964778714468e-06, "loss": 0.8351168, "num_input_tokens_seen": 77378950, "step": 3590, "time_per_iteration": 3.1484291553497314 }, { "auxiliary_loss_clip": 0.01143561, "auxiliary_loss_mlp": 0.01033588, "balance_loss_clip": 1.0506556, "balance_loss_mlp": 1.02498746, "epoch": 0.43179222028497566, "flos": 16867816738560.0, "grad_norm": 1.6278454847381083, "language_loss": 0.66170764, "learning_rate": 2.5325456693040123e-06, "loss": 0.68347907, "num_input_tokens_seen": 77396145, "step": 3591, "time_per_iteration": 2.7334723472595215 }, { "auxiliary_loss_clip": 0.01181687, "auxiliary_loss_mlp": 0.01027527, "balance_loss_clip": 1.05277085, "balance_loss_mlp": 1.01834846, "epoch": 0.43191246317561477, "flos": 17639214243840.0, "grad_norm": 2.478333963376418, "language_loss": 0.75274992, "learning_rate": 2.531794779949824e-06, "loss": 0.77484208, "num_input_tokens_seen": 77414045, "step": 3592, "time_per_iteration": 2.5936429500579834 }, { "auxiliary_loss_clip": 0.01134848, "auxiliary_loss_mlp": 0.01032632, "balance_loss_clip": 1.04709888, "balance_loss_mlp": 1.0245676, "epoch": 0.4320327060662538, "flos": 23878800760320.0, "grad_norm": 1.8413564400909865, "language_loss": 0.88030982, "learning_rate": 2.5310438099227903e-06, "loss": 0.90198463, "num_input_tokens_seen": 77431310, "step": 3593, "time_per_iteration": 2.750535249710083 }, { "auxiliary_loss_clip": 0.01075993, "auxiliary_loss_mlp": 0.01003302, "balance_loss_clip": 1.0178895, "balance_loss_mlp": 1.00158548, "epoch": 0.43215294895689293, "flos": 66394917959040.0, "grad_norm": 0.7983724054049584, "language_loss": 0.53287089, "learning_rate": 2.530292759336833e-06, "loss": 0.55366385, "num_input_tokens_seen": 77492045, "step": 3594, "time_per_iteration": 3.2955424785614014 }, { "auxiliary_loss_clip": 0.01156676, "auxiliary_loss_mlp": 0.01029196, "balance_loss_clip": 1.05027533, "balance_loss_mlp": 1.02018368, "epoch": 0.432273191847532, "flos": 20594267262720.0, "grad_norm": 2.112965963138508, "language_loss": 0.69893849, "learning_rate": 2.5295416283058855e-06, "loss": 0.7207973, "num_input_tokens_seen": 77510910, "step": 3595, "time_per_iteration": 2.6660494804382324 }, { "auxiliary_loss_clip": 0.01155464, "auxiliary_loss_mlp": 0.00762855, "balance_loss_clip": 1.04966319, "balance_loss_mlp": 1.00049376, "epoch": 0.4323934347381711, "flos": 19282127437440.0, "grad_norm": 1.5714699853993486, "language_loss": 0.66103268, "learning_rate": 2.5287904169438943e-06, "loss": 0.68021584, "num_input_tokens_seen": 77530115, "step": 3596, "time_per_iteration": 2.722686529159546 }, { "auxiliary_loss_clip": 0.01110932, "auxiliary_loss_mlp": 0.01029143, "balance_loss_clip": 1.04575348, "balance_loss_mlp": 1.01972604, "epoch": 0.4325136776288102, "flos": 21726315273600.0, "grad_norm": 2.793167884968842, "language_loss": 0.63825214, "learning_rate": 2.528039125364817e-06, "loss": 0.65965289, "num_input_tokens_seen": 77548920, "step": 3597, "time_per_iteration": 2.9336624145507812 }, { "auxiliary_loss_clip": 0.01146538, "auxiliary_loss_mlp": 0.01029747, "balance_loss_clip": 1.04821038, "balance_loss_mlp": 1.02062118, "epoch": 0.43263392051944927, "flos": 22340746344960.0, "grad_norm": 2.0117084046638443, "language_loss": 0.76061273, "learning_rate": 2.5272877536826246e-06, "loss": 0.78237557, "num_input_tokens_seen": 77567715, "step": 3598, "time_per_iteration": 3.0277509689331055 }, { "auxiliary_loss_clip": 0.01130976, "auxiliary_loss_mlp": 0.01032365, "balance_loss_clip": 1.0444454, "balance_loss_mlp": 1.02336419, "epoch": 0.4327541634100884, "flos": 29168406328320.0, "grad_norm": 1.9426299984911068, "language_loss": 0.70967764, "learning_rate": 2.5265363020112986e-06, "loss": 0.73131108, "num_input_tokens_seen": 77588035, "step": 3599, "time_per_iteration": 3.9300270080566406 }, { "auxiliary_loss_clip": 0.01175915, "auxiliary_loss_mlp": 0.01033987, "balance_loss_clip": 1.05535424, "balance_loss_mlp": 1.02394342, "epoch": 0.4328744063007275, "flos": 26067448264320.0, "grad_norm": 1.920568097569299, "language_loss": 0.8421188, "learning_rate": 2.5257847704648344e-06, "loss": 0.86421788, "num_input_tokens_seen": 77609265, "step": 3600, "time_per_iteration": 3.6557302474975586 }, { "auxiliary_loss_clip": 0.0118867, "auxiliary_loss_mlp": 0.01030085, "balance_loss_clip": 1.05420578, "balance_loss_mlp": 1.02135324, "epoch": 0.43299464919136654, "flos": 16581357774720.0, "grad_norm": 3.021355966762936, "language_loss": 0.761464, "learning_rate": 2.525033159157239e-06, "loss": 0.78365153, "num_input_tokens_seen": 77625580, "step": 3601, "time_per_iteration": 2.6542649269104004 }, { "auxiliary_loss_clip": 0.01172487, "auxiliary_loss_mlp": 0.01031782, "balance_loss_clip": 1.05185318, "balance_loss_mlp": 1.02287686, "epoch": 0.43311489208200565, "flos": 16107265140480.0, "grad_norm": 1.7839130845087798, "language_loss": 0.77092624, "learning_rate": 2.52428146820253e-06, "loss": 0.79296893, "num_input_tokens_seen": 77643835, "step": 3602, "time_per_iteration": 2.626575469970703 }, { "auxiliary_loss_clip": 0.01148991, "auxiliary_loss_mlp": 0.01031178, "balance_loss_clip": 1.05135489, "balance_loss_mlp": 1.02237463, "epoch": 0.43323513497264476, "flos": 22930220442240.0, "grad_norm": 1.7280420835321257, "language_loss": 0.81696415, "learning_rate": 2.52352969771474e-06, "loss": 0.83876586, "num_input_tokens_seen": 77663060, "step": 3603, "time_per_iteration": 2.7398667335510254 }, { "auxiliary_loss_clip": 0.01162522, "auxiliary_loss_mlp": 0.01035204, "balance_loss_clip": 1.05162907, "balance_loss_mlp": 1.02675223, "epoch": 0.4333553778632838, "flos": 25299031587840.0, "grad_norm": 2.2304841587324002, "language_loss": 0.88602364, "learning_rate": 2.5227778478079106e-06, "loss": 0.90800095, "num_input_tokens_seen": 77682470, "step": 3604, "time_per_iteration": 4.6022655963897705 }, { "auxiliary_loss_clip": 0.01169159, "auxiliary_loss_mlp": 0.01030859, "balance_loss_clip": 1.05015874, "balance_loss_mlp": 1.02269316, "epoch": 0.43347562075392293, "flos": 19387165783680.0, "grad_norm": 1.5385197407757567, "language_loss": 0.77083331, "learning_rate": 2.522025918596098e-06, "loss": 0.79283345, "num_input_tokens_seen": 77700770, "step": 3605, "time_per_iteration": 2.706249475479126 }, { "auxiliary_loss_clip": 0.01175877, "auxiliary_loss_mlp": 0.0103032, "balance_loss_clip": 1.05314636, "balance_loss_mlp": 1.02253604, "epoch": 0.43359586364456204, "flos": 26325969425280.0, "grad_norm": 1.4686010595213426, "language_loss": 0.65865022, "learning_rate": 2.521273910193368e-06, "loss": 0.68071216, "num_input_tokens_seen": 77723950, "step": 3606, "time_per_iteration": 2.8919973373413086 }, { "auxiliary_loss_clip": 0.01182039, "auxiliary_loss_mlp": 0.01029622, "balance_loss_clip": 1.05411959, "balance_loss_mlp": 1.02072906, "epoch": 0.4337161065352011, "flos": 15989261984640.0, "grad_norm": 2.698900062602605, "language_loss": 0.87418818, "learning_rate": 2.5205218227138006e-06, "loss": 0.89630485, "num_input_tokens_seen": 77736905, "step": 3607, "time_per_iteration": 2.6692521572113037 }, { "auxiliary_loss_clip": 0.01188521, "auxiliary_loss_mlp": 0.01030953, "balance_loss_clip": 1.05239415, "balance_loss_mlp": 1.02245986, "epoch": 0.4338363494258402, "flos": 20224710184320.0, "grad_norm": 1.9986375322641783, "language_loss": 0.7884537, "learning_rate": 2.519769656271486e-06, "loss": 0.81064844, "num_input_tokens_seen": 77754325, "step": 3608, "time_per_iteration": 2.598769426345825 }, { "auxiliary_loss_clip": 0.011236, "auxiliary_loss_mlp": 0.0102931, "balance_loss_clip": 1.04641795, "balance_loss_mlp": 1.02053082, "epoch": 0.43395659231647926, "flos": 20083904870400.0, "grad_norm": 2.311717771648048, "language_loss": 0.67433977, "learning_rate": 2.5190174109805285e-06, "loss": 0.69586891, "num_input_tokens_seen": 77774150, "step": 3609, "time_per_iteration": 2.7555224895477295 }, { "auxiliary_loss_clip": 0.01149969, "auxiliary_loss_mlp": 0.01037037, "balance_loss_clip": 1.04822731, "balance_loss_mlp": 1.02773833, "epoch": 0.43407683520711837, "flos": 19901801894400.0, "grad_norm": 2.192360444066044, "language_loss": 0.64147007, "learning_rate": 2.518265086955042e-06, "loss": 0.66334015, "num_input_tokens_seen": 77791870, "step": 3610, "time_per_iteration": 2.6512627601623535 }, { "auxiliary_loss_clip": 0.01187387, "auxiliary_loss_mlp": 0.0103186, "balance_loss_clip": 1.05180478, "balance_loss_mlp": 1.02303863, "epoch": 0.4341970780977575, "flos": 23108732058240.0, "grad_norm": 1.7632546807791794, "language_loss": 0.83763015, "learning_rate": 2.5175126843091534e-06, "loss": 0.85982263, "num_input_tokens_seen": 77811240, "step": 3611, "time_per_iteration": 2.7160422801971436 }, { "auxiliary_loss_clip": 0.0116531, "auxiliary_loss_mlp": 0.01033073, "balance_loss_clip": 1.05426753, "balance_loss_mlp": 1.02425194, "epoch": 0.43431732098839654, "flos": 37408288406400.0, "grad_norm": 2.3094431773180197, "language_loss": 0.75538152, "learning_rate": 2.5167602031570034e-06, "loss": 0.77736533, "num_input_tokens_seen": 77831425, "step": 3612, "time_per_iteration": 2.784390926361084 }, { "auxiliary_loss_clip": 0.01190302, "auxiliary_loss_mlp": 0.0102703, "balance_loss_clip": 1.05463409, "balance_loss_mlp": 1.01868546, "epoch": 0.43443756387903565, "flos": 31868206323840.0, "grad_norm": 1.6567404832385049, "language_loss": 0.73378932, "learning_rate": 2.51600764361274e-06, "loss": 0.75596267, "num_input_tokens_seen": 77852950, "step": 3613, "time_per_iteration": 2.717949867248535 }, { "auxiliary_loss_clip": 0.01191602, "auxiliary_loss_mlp": 0.01033689, "balance_loss_clip": 1.05484939, "balance_loss_mlp": 1.02468908, "epoch": 0.43455780676967476, "flos": 23477139901440.0, "grad_norm": 3.917447712906433, "language_loss": 0.78658634, "learning_rate": 2.5152550057905283e-06, "loss": 0.80883926, "num_input_tokens_seen": 77872840, "step": 3614, "time_per_iteration": 2.6141862869262695 }, { "auxiliary_loss_clip": 0.01177016, "auxiliary_loss_mlp": 0.00763787, "balance_loss_clip": 1.0548656, "balance_loss_mlp": 1.00044918, "epoch": 0.4346780496603138, "flos": 24207060176640.0, "grad_norm": 1.9604280709142665, "language_loss": 0.77136207, "learning_rate": 2.5145022898045415e-06, "loss": 0.79077011, "num_input_tokens_seen": 77892025, "step": 3615, "time_per_iteration": 2.6132619380950928 }, { "auxiliary_loss_clip": 0.01162322, "auxiliary_loss_mlp": 0.01027145, "balance_loss_clip": 1.04848802, "balance_loss_mlp": 1.01824605, "epoch": 0.4347982925509529, "flos": 17092366611840.0, "grad_norm": 1.9988865896241532, "language_loss": 0.90022373, "learning_rate": 2.5137494957689664e-06, "loss": 0.92211848, "num_input_tokens_seen": 77907635, "step": 3616, "time_per_iteration": 2.607708215713501 }, { "auxiliary_loss_clip": 0.01060772, "auxiliary_loss_mlp": 0.01004011, "balance_loss_clip": 1.01528203, "balance_loss_mlp": 1.00230587, "epoch": 0.43491853544159204, "flos": 60945544696320.0, "grad_norm": 0.7647456987844858, "language_loss": 0.57326162, "learning_rate": 2.5129966237980016e-06, "loss": 0.5939095, "num_input_tokens_seen": 77970630, "step": 3617, "time_per_iteration": 3.2574472427368164 }, { "auxiliary_loss_clip": 0.011468, "auxiliary_loss_mlp": 0.01029277, "balance_loss_clip": 1.04853559, "balance_loss_mlp": 1.02102137, "epoch": 0.4350387783322311, "flos": 21944652094080.0, "grad_norm": 1.8666160967155916, "language_loss": 0.78088498, "learning_rate": 2.512243674005857e-06, "loss": 0.8026458, "num_input_tokens_seen": 77989995, "step": 3618, "time_per_iteration": 2.8058979511260986 }, { "auxiliary_loss_clip": 0.0111534, "auxiliary_loss_mlp": 0.0103203, "balance_loss_clip": 1.0450995, "balance_loss_mlp": 1.02373326, "epoch": 0.4351590212228702, "flos": 25082705928960.0, "grad_norm": 2.062267515332014, "language_loss": 0.8613615, "learning_rate": 2.5114906465067537e-06, "loss": 0.88283521, "num_input_tokens_seen": 78010980, "step": 3619, "time_per_iteration": 2.809054136276245 }, { "auxiliary_loss_clip": 0.01174691, "auxiliary_loss_mlp": 0.01029832, "balance_loss_clip": 1.05060339, "balance_loss_mlp": 1.02142799, "epoch": 0.4352792641135093, "flos": 21506541909120.0, "grad_norm": 1.9836836797076929, "language_loss": 0.74792045, "learning_rate": 2.5107375414149264e-06, "loss": 0.76996571, "num_input_tokens_seen": 78030225, "step": 3620, "time_per_iteration": 2.745995044708252 }, { "auxiliary_loss_clip": 0.01119862, "auxiliary_loss_mlp": 0.01030194, "balance_loss_clip": 1.04159379, "balance_loss_mlp": 1.02105105, "epoch": 0.43539950700414837, "flos": 16253457494400.0, "grad_norm": 2.1920158282005495, "language_loss": 0.71927905, "learning_rate": 2.5099843588446197e-06, "loss": 0.74077964, "num_input_tokens_seen": 78048545, "step": 3621, "time_per_iteration": 2.7285523414611816 }, { "auxiliary_loss_clip": 0.01137535, "auxiliary_loss_mlp": 0.01036889, "balance_loss_clip": 1.04900301, "balance_loss_mlp": 1.02812064, "epoch": 0.4355197498947875, "flos": 16691819074560.0, "grad_norm": 1.6955827829356918, "language_loss": 0.617275, "learning_rate": 2.509231098910091e-06, "loss": 0.63901925, "num_input_tokens_seen": 78068415, "step": 3622, "time_per_iteration": 2.764375686645508 }, { "auxiliary_loss_clip": 0.01158795, "auxiliary_loss_mlp": 0.01029062, "balance_loss_clip": 1.05437517, "balance_loss_mlp": 1.02081823, "epoch": 0.4356399927854266, "flos": 16362733645440.0, "grad_norm": 3.6049638575166365, "language_loss": 0.7506777, "learning_rate": 2.508477761725611e-06, "loss": 0.7725563, "num_input_tokens_seen": 78086690, "step": 3623, "time_per_iteration": 2.721052885055542 }, { "auxiliary_loss_clip": 0.01175953, "auxiliary_loss_mlp": 0.01034877, "balance_loss_clip": 1.04986525, "balance_loss_mlp": 1.02630532, "epoch": 0.43576023567606564, "flos": 17202037812480.0, "grad_norm": 1.842106936216907, "language_loss": 0.81182772, "learning_rate": 2.507724347405458e-06, "loss": 0.83393598, "num_input_tokens_seen": 78104640, "step": 3624, "time_per_iteration": 2.5896763801574707 }, { "auxiliary_loss_clip": 0.0112434, "auxiliary_loss_mlp": 0.01027231, "balance_loss_clip": 1.04282761, "balance_loss_mlp": 1.01901793, "epoch": 0.43588047856670475, "flos": 15917656222080.0, "grad_norm": 2.0968851276627416, "language_loss": 0.82015496, "learning_rate": 2.5069708560639243e-06, "loss": 0.84167069, "num_input_tokens_seen": 78122550, "step": 3625, "time_per_iteration": 3.639252185821533 }, { "auxiliary_loss_clip": 0.01145596, "auxiliary_loss_mlp": 0.01027659, "balance_loss_clip": 1.04774213, "balance_loss_mlp": 1.01917672, "epoch": 0.4360007214573438, "flos": 23659566099840.0, "grad_norm": 3.5340223065839527, "language_loss": 0.61671197, "learning_rate": 2.5062172878153158e-06, "loss": 0.63844454, "num_input_tokens_seen": 78141825, "step": 3626, "time_per_iteration": 3.664763927459717 }, { "auxiliary_loss_clip": 0.01123327, "auxiliary_loss_mlp": 0.01031167, "balance_loss_clip": 1.04452574, "balance_loss_mlp": 1.02275634, "epoch": 0.4361209643479829, "flos": 21978767036160.0, "grad_norm": 1.7691054769969645, "language_loss": 0.86904716, "learning_rate": 2.505463642773947e-06, "loss": 0.8905921, "num_input_tokens_seen": 78161790, "step": 3627, "time_per_iteration": 2.8500239849090576 }, { "auxiliary_loss_clip": 0.01149151, "auxiliary_loss_mlp": 0.00763945, "balance_loss_clip": 1.050951, "balance_loss_mlp": 1.00036335, "epoch": 0.43624120723862203, "flos": 17420159151360.0, "grad_norm": 2.5462585670334215, "language_loss": 0.75086522, "learning_rate": 2.504709921054146e-06, "loss": 0.76999617, "num_input_tokens_seen": 78178605, "step": 3628, "time_per_iteration": 2.771569013595581 }, { "auxiliary_loss_clip": 0.01137486, "auxiliary_loss_mlp": 0.01035151, "balance_loss_clip": 1.04330301, "balance_loss_mlp": 1.02615643, "epoch": 0.4363614501292611, "flos": 17895293280000.0, "grad_norm": 2.5031655976005633, "language_loss": 0.83599633, "learning_rate": 2.50395612277025e-06, "loss": 0.8577227, "num_input_tokens_seen": 78194460, "step": 3629, "time_per_iteration": 2.750523805618286 }, { "auxiliary_loss_clip": 0.01163549, "auxiliary_loss_mlp": 0.01034063, "balance_loss_clip": 1.04959452, "balance_loss_mlp": 1.0253607, "epoch": 0.4364816930199002, "flos": 20302888135680.0, "grad_norm": 2.022455579760982, "language_loss": 0.73286033, "learning_rate": 2.503202248036612e-06, "loss": 0.75483644, "num_input_tokens_seen": 78213315, "step": 3630, "time_per_iteration": 4.556641578674316 }, { "auxiliary_loss_clip": 0.01190852, "auxiliary_loss_mlp": 0.01026487, "balance_loss_clip": 1.05545998, "balance_loss_mlp": 1.01835656, "epoch": 0.4366019359105393, "flos": 24061334699520.0, "grad_norm": 1.6475859406748012, "language_loss": 0.73651862, "learning_rate": 2.5024482969675927e-06, "loss": 0.75869203, "num_input_tokens_seen": 78233270, "step": 3631, "time_per_iteration": 2.6520493030548096 }, { "auxiliary_loss_clip": 0.01138276, "auxiliary_loss_mlp": 0.01026693, "balance_loss_clip": 1.04913259, "balance_loss_mlp": 1.01837778, "epoch": 0.43672217880117836, "flos": 21754109422080.0, "grad_norm": 2.3585639464057495, "language_loss": 0.84390378, "learning_rate": 2.501694269677566e-06, "loss": 0.8655535, "num_input_tokens_seen": 78251040, "step": 3632, "time_per_iteration": 2.8094675540924072 }, { "auxiliary_loss_clip": 0.01177098, "auxiliary_loss_mlp": 0.01028165, "balance_loss_clip": 1.04969203, "balance_loss_mlp": 1.01937342, "epoch": 0.4368424216918175, "flos": 18035200753920.0, "grad_norm": 3.202918260719624, "language_loss": 0.80487573, "learning_rate": 2.500940166280918e-06, "loss": 0.82692832, "num_input_tokens_seen": 78269470, "step": 3633, "time_per_iteration": 2.745352268218994 }, { "auxiliary_loss_clip": 0.01171609, "auxiliary_loss_mlp": 0.01034182, "balance_loss_clip": 1.05125546, "balance_loss_mlp": 1.02610552, "epoch": 0.4369626645824566, "flos": 25447127362560.0, "grad_norm": 2.0731942733404503, "language_loss": 0.79098916, "learning_rate": 2.500185986892045e-06, "loss": 0.81304705, "num_input_tokens_seen": 78288955, "step": 3634, "time_per_iteration": 2.682300090789795 }, { "auxiliary_loss_clip": 0.0116949, "auxiliary_loss_mlp": 0.010321, "balance_loss_clip": 1.04999232, "balance_loss_mlp": 1.02305841, "epoch": 0.43708290747309564, "flos": 25302694775040.0, "grad_norm": 2.126522284228904, "language_loss": 0.7747997, "learning_rate": 2.499431731625355e-06, "loss": 0.79681551, "num_input_tokens_seen": 78307980, "step": 3635, "time_per_iteration": 2.6690468788146973 }, { "auxiliary_loss_clip": 0.01190306, "auxiliary_loss_mlp": 0.01033377, "balance_loss_clip": 1.0533222, "balance_loss_mlp": 1.02421594, "epoch": 0.43720315036373475, "flos": 31575103344000.0, "grad_norm": 2.0671254077363006, "language_loss": 0.79612643, "learning_rate": 2.4986774005952686e-06, "loss": 0.81836331, "num_input_tokens_seen": 78330355, "step": 3636, "time_per_iteration": 2.749903917312622 }, { "auxiliary_loss_clip": 0.01171152, "auxiliary_loss_mlp": 0.01028063, "balance_loss_clip": 1.05230618, "balance_loss_mlp": 1.01956916, "epoch": 0.43732339325437386, "flos": 23112000195840.0, "grad_norm": 2.5440618583045054, "language_loss": 0.8481065, "learning_rate": 2.4979229939162166e-06, "loss": 0.87009865, "num_input_tokens_seen": 78349135, "step": 3637, "time_per_iteration": 2.6842153072357178 }, { "auxiliary_loss_clip": 0.01172421, "auxiliary_loss_mlp": 0.01027721, "balance_loss_clip": 1.05462885, "balance_loss_mlp": 1.01965666, "epoch": 0.4374436361450129, "flos": 27746272080000.0, "grad_norm": 1.8046246040435532, "language_loss": 0.80381012, "learning_rate": 2.4971685117026433e-06, "loss": 0.82581151, "num_input_tokens_seen": 78368900, "step": 3638, "time_per_iteration": 2.629162073135376 }, { "auxiliary_loss_clip": 0.01178834, "auxiliary_loss_mlp": 0.01041077, "balance_loss_clip": 1.05388379, "balance_loss_mlp": 1.03260136, "epoch": 0.437563879035652, "flos": 24172370616960.0, "grad_norm": 1.504283242781577, "language_loss": 0.76778221, "learning_rate": 2.4964139540690018e-06, "loss": 0.78998131, "num_input_tokens_seen": 78392235, "step": 3639, "time_per_iteration": 2.728782892227173 }, { "auxiliary_loss_clip": 0.01146862, "auxiliary_loss_mlp": 0.01038161, "balance_loss_clip": 1.04829431, "balance_loss_mlp": 1.02833855, "epoch": 0.4376841219262911, "flos": 23477211728640.0, "grad_norm": 2.013686258211536, "language_loss": 0.73139006, "learning_rate": 2.495659321129758e-06, "loss": 0.75324035, "num_input_tokens_seen": 78409980, "step": 3640, "time_per_iteration": 2.7193362712860107 }, { "auxiliary_loss_clip": 0.0117106, "auxiliary_loss_mlp": 0.01032054, "balance_loss_clip": 1.05028033, "balance_loss_mlp": 1.02328634, "epoch": 0.4378043648169302, "flos": 25447809720960.0, "grad_norm": 1.8934201955150876, "language_loss": 0.75248373, "learning_rate": 2.494904612999389e-06, "loss": 0.77451491, "num_input_tokens_seen": 78428690, "step": 3641, "time_per_iteration": 2.6830637454986572 }, { "auxiliary_loss_clip": 0.01071523, "auxiliary_loss_mlp": 0.01002095, "balance_loss_clip": 1.0180527, "balance_loss_mlp": 1.00046158, "epoch": 0.4379246077075693, "flos": 53914056986880.0, "grad_norm": 0.7676143853040126, "language_loss": 0.56518102, "learning_rate": 2.4941498297923843e-06, "loss": 0.58591723, "num_input_tokens_seen": 78489260, "step": 3642, "time_per_iteration": 3.246466636657715 }, { "auxiliary_loss_clip": 0.01171597, "auxiliary_loss_mlp": 0.01023963, "balance_loss_clip": 1.05152178, "balance_loss_mlp": 1.01531434, "epoch": 0.43804485059820836, "flos": 20588305605120.0, "grad_norm": 1.7455108822541578, "language_loss": 0.69837618, "learning_rate": 2.4933949716232424e-06, "loss": 0.72033179, "num_input_tokens_seen": 78506785, "step": 3643, "time_per_iteration": 2.6431987285614014 }, { "auxiliary_loss_clip": 0.0114624, "auxiliary_loss_mlp": 0.01028418, "balance_loss_clip": 1.05061781, "balance_loss_mlp": 1.02010942, "epoch": 0.43816509348884747, "flos": 23876214981120.0, "grad_norm": 1.9545658271162547, "language_loss": 0.7409637, "learning_rate": 2.492640038606476e-06, "loss": 0.76271027, "num_input_tokens_seen": 78525150, "step": 3644, "time_per_iteration": 2.7577614784240723 }, { "auxiliary_loss_clip": 0.01173514, "auxiliary_loss_mlp": 0.01037054, "balance_loss_clip": 1.05115438, "balance_loss_mlp": 1.02773166, "epoch": 0.4382853363794866, "flos": 14684448533760.0, "grad_norm": 2.1080819437874614, "language_loss": 0.78711116, "learning_rate": 2.491885030856608e-06, "loss": 0.80921686, "num_input_tokens_seen": 78543245, "step": 3645, "time_per_iteration": 2.6877214908599854 }, { "auxiliary_loss_clip": 0.01162733, "auxiliary_loss_mlp": 0.01031127, "balance_loss_clip": 1.05057263, "balance_loss_mlp": 1.02236485, "epoch": 0.43840557927012563, "flos": 17165301177600.0, "grad_norm": 2.1029898541907674, "language_loss": 0.82339597, "learning_rate": 2.4911299484881713e-06, "loss": 0.84533453, "num_input_tokens_seen": 78560775, "step": 3646, "time_per_iteration": 2.6882827281951904 }, { "auxiliary_loss_clip": 0.01153972, "auxiliary_loss_mlp": 0.01029091, "balance_loss_clip": 1.04748809, "balance_loss_mlp": 1.02111614, "epoch": 0.43852582216076474, "flos": 19390685316480.0, "grad_norm": 1.5973679921460024, "language_loss": 0.8084532, "learning_rate": 2.490374791615712e-06, "loss": 0.83028388, "num_input_tokens_seen": 78580800, "step": 3647, "time_per_iteration": 2.7253425121307373 }, { "auxiliary_loss_clip": 0.01196546, "auxiliary_loss_mlp": 0.00764299, "balance_loss_clip": 1.05665731, "balance_loss_mlp": 1.00035143, "epoch": 0.43864606505140386, "flos": 18075133699200.0, "grad_norm": 2.875957993142965, "language_loss": 0.77799833, "learning_rate": 2.4896195603537867e-06, "loss": 0.79760671, "num_input_tokens_seen": 78595410, "step": 3648, "time_per_iteration": 2.637045383453369 }, { "auxiliary_loss_clip": 0.01128237, "auxiliary_loss_mlp": 0.01033277, "balance_loss_clip": 1.05150843, "balance_loss_mlp": 1.02501607, "epoch": 0.4387663079420429, "flos": 19644896845440.0, "grad_norm": 2.0141729936908925, "language_loss": 0.73750466, "learning_rate": 2.488864254816964e-06, "loss": 0.75911975, "num_input_tokens_seen": 78614100, "step": 3649, "time_per_iteration": 2.7055630683898926 }, { "auxiliary_loss_clip": 0.01179039, "auxiliary_loss_mlp": 0.01033099, "balance_loss_clip": 1.05491269, "balance_loss_mlp": 1.02356803, "epoch": 0.438886550832682, "flos": 19719339782400.0, "grad_norm": 2.7609316107689157, "language_loss": 0.68331492, "learning_rate": 2.4881088751198218e-06, "loss": 0.70543635, "num_input_tokens_seen": 78632260, "step": 3650, "time_per_iteration": 3.6018640995025635 }, { "auxiliary_loss_clip": 0.01164122, "auxiliary_loss_mlp": 0.0103165, "balance_loss_clip": 1.05024052, "balance_loss_mlp": 1.0223037, "epoch": 0.43900679372332113, "flos": 14536675981440.0, "grad_norm": 2.846705354740708, "language_loss": 0.64311081, "learning_rate": 2.4873534213769517e-06, "loss": 0.66506851, "num_input_tokens_seen": 78647490, "step": 3651, "time_per_iteration": 2.677638053894043 }, { "auxiliary_loss_clip": 0.01142298, "auxiliary_loss_mlp": 0.0103067, "balance_loss_clip": 1.05200589, "balance_loss_mlp": 1.02216387, "epoch": 0.4391270366139602, "flos": 24056234968320.0, "grad_norm": 1.8309671877446523, "language_loss": 0.71860278, "learning_rate": 2.4865978937029547e-06, "loss": 0.74033248, "num_input_tokens_seen": 78666470, "step": 3652, "time_per_iteration": 3.7030718326568604 }, { "auxiliary_loss_clip": 0.01119676, "auxiliary_loss_mlp": 0.0103062, "balance_loss_clip": 1.04662538, "balance_loss_mlp": 1.02171516, "epoch": 0.4392472795045993, "flos": 31538510363520.0, "grad_norm": 1.8456448251091269, "language_loss": 0.66438723, "learning_rate": 2.485842292212445e-06, "loss": 0.6858902, "num_input_tokens_seen": 78687685, "step": 3653, "time_per_iteration": 2.832625389099121 }, { "auxiliary_loss_clip": 0.01193443, "auxiliary_loss_mlp": 0.01036519, "balance_loss_clip": 1.05676913, "balance_loss_mlp": 1.02716112, "epoch": 0.4393675223952384, "flos": 14866300114560.0, "grad_norm": 2.099772596910281, "language_loss": 0.80553383, "learning_rate": 2.485086617020045e-06, "loss": 0.82783341, "num_input_tokens_seen": 78706180, "step": 3654, "time_per_iteration": 2.5469486713409424 }, { "auxiliary_loss_clip": 0.01152483, "auxiliary_loss_mlp": 0.01031468, "balance_loss_clip": 1.04760838, "balance_loss_mlp": 1.0223366, "epoch": 0.43948776528587746, "flos": 14825900292480.0, "grad_norm": 2.036891662439873, "language_loss": 0.81894553, "learning_rate": 2.4843308682403903e-06, "loss": 0.84078509, "num_input_tokens_seen": 78723095, "step": 3655, "time_per_iteration": 2.6249020099639893 }, { "auxiliary_loss_clip": 0.01188766, "auxiliary_loss_mlp": 0.01030442, "balance_loss_clip": 1.05382478, "balance_loss_mlp": 1.02258003, "epoch": 0.4396080081765166, "flos": 13914523486080.0, "grad_norm": 1.75712339504953, "language_loss": 0.82804787, "learning_rate": 2.4835750459881294e-06, "loss": 0.85023993, "num_input_tokens_seen": 78739720, "step": 3656, "time_per_iteration": 4.3919665813446045 }, { "auxiliary_loss_clip": 0.01152246, "auxiliary_loss_mlp": 0.01037905, "balance_loss_clip": 1.04724336, "balance_loss_mlp": 1.02873826, "epoch": 0.43972825106715563, "flos": 18222978078720.0, "grad_norm": 1.71272184801934, "language_loss": 0.81906033, "learning_rate": 2.4828191503779177e-06, "loss": 0.84096181, "num_input_tokens_seen": 78757820, "step": 3657, "time_per_iteration": 2.6146507263183594 }, { "auxiliary_loss_clip": 0.01144836, "auxiliary_loss_mlp": 0.01030762, "balance_loss_clip": 1.04728198, "balance_loss_mlp": 1.02152896, "epoch": 0.43984849395779474, "flos": 16873239692160.0, "grad_norm": 1.9500005310713235, "language_loss": 0.8998602, "learning_rate": 2.482063181524425e-06, "loss": 0.9216162, "num_input_tokens_seen": 78773720, "step": 3658, "time_per_iteration": 2.7031266689300537 }, { "auxiliary_loss_clip": 0.01196731, "auxiliary_loss_mlp": 0.01036804, "balance_loss_clip": 1.05806053, "balance_loss_mlp": 1.0278933, "epoch": 0.43996873684843385, "flos": 18691504104960.0, "grad_norm": 2.5361305151582014, "language_loss": 0.81135154, "learning_rate": 2.4813071395423307e-06, "loss": 0.83368695, "num_input_tokens_seen": 78791285, "step": 3659, "time_per_iteration": 2.6063992977142334 }, { "auxiliary_loss_clip": 0.011761, "auxiliary_loss_mlp": 0.01030261, "balance_loss_clip": 1.05080235, "balance_loss_mlp": 1.0212431, "epoch": 0.4400889797390729, "flos": 23653460787840.0, "grad_norm": 2.864284903715874, "language_loss": 0.64751011, "learning_rate": 2.4805510245463263e-06, "loss": 0.66957366, "num_input_tokens_seen": 78811440, "step": 3660, "time_per_iteration": 2.7313833236694336 }, { "auxiliary_loss_clip": 0.01173397, "auxiliary_loss_mlp": 0.01038249, "balance_loss_clip": 1.04898262, "balance_loss_mlp": 1.02939141, "epoch": 0.440209222629712, "flos": 23149203707520.0, "grad_norm": 2.3341969009804413, "language_loss": 0.60399365, "learning_rate": 2.4797948366511137e-06, "loss": 0.62611008, "num_input_tokens_seen": 78831150, "step": 3661, "time_per_iteration": 2.6798031330108643 }, { "auxiliary_loss_clip": 0.01143137, "auxiliary_loss_mlp": 0.01029456, "balance_loss_clip": 1.04524183, "balance_loss_mlp": 1.02062893, "epoch": 0.4403294655203511, "flos": 24823394668800.0, "grad_norm": 6.603729715586676, "language_loss": 0.76626313, "learning_rate": 2.4790385759714055e-06, "loss": 0.78798908, "num_input_tokens_seen": 78850215, "step": 3662, "time_per_iteration": 2.7237746715545654 }, { "auxiliary_loss_clip": 0.01175686, "auxiliary_loss_mlp": 0.01027824, "balance_loss_clip": 1.05490923, "balance_loss_mlp": 1.01927686, "epoch": 0.4404497084109902, "flos": 22565080736640.0, "grad_norm": 1.781715182680308, "language_loss": 0.71464992, "learning_rate": 2.478282242621926e-06, "loss": 0.73668504, "num_input_tokens_seen": 78870675, "step": 3663, "time_per_iteration": 2.7357938289642334 }, { "auxiliary_loss_clip": 0.01050428, "auxiliary_loss_mlp": 0.01002372, "balance_loss_clip": 1.01416326, "balance_loss_mlp": 1.00063157, "epoch": 0.4405699513016293, "flos": 64967073448320.0, "grad_norm": 0.8377646876700964, "language_loss": 0.59570199, "learning_rate": 2.477525836717411e-06, "loss": 0.61623001, "num_input_tokens_seen": 78938440, "step": 3664, "time_per_iteration": 3.401033878326416 }, { "auxiliary_loss_clip": 0.01175752, "auxiliary_loss_mlp": 0.0103197, "balance_loss_clip": 1.05219877, "balance_loss_mlp": 1.02297592, "epoch": 0.4406901941922684, "flos": 35661952978560.0, "grad_norm": 2.963150216107812, "language_loss": 0.7974894, "learning_rate": 2.476769358372606e-06, "loss": 0.81956661, "num_input_tokens_seen": 78960090, "step": 3665, "time_per_iteration": 2.7882802486419678 }, { "auxiliary_loss_clip": 0.01147042, "auxiliary_loss_mlp": 0.01030053, "balance_loss_clip": 1.05353725, "balance_loss_mlp": 1.02164888, "epoch": 0.44081043708290746, "flos": 18040767361920.0, "grad_norm": 2.1932272231585777, "language_loss": 0.75011766, "learning_rate": 2.4760128077022683e-06, "loss": 0.77188861, "num_input_tokens_seen": 78978225, "step": 3666, "time_per_iteration": 2.646496295928955 }, { "auxiliary_loss_clip": 0.0112399, "auxiliary_loss_mlp": 0.01034193, "balance_loss_clip": 1.04677105, "balance_loss_mlp": 1.02546644, "epoch": 0.44093067997354657, "flos": 30153507799680.0, "grad_norm": 1.4289623974330963, "language_loss": 0.6834023, "learning_rate": 2.4752561848211672e-06, "loss": 0.70498413, "num_input_tokens_seen": 79000625, "step": 3667, "time_per_iteration": 2.8807568550109863 }, { "auxiliary_loss_clip": 0.01177002, "auxiliary_loss_mlp": 0.01028122, "balance_loss_clip": 1.05759525, "balance_loss_mlp": 1.01943803, "epoch": 0.4410509228641857, "flos": 23255068066560.0, "grad_norm": 1.9279639887968356, "language_loss": 0.7140187, "learning_rate": 2.4744994898440797e-06, "loss": 0.73606992, "num_input_tokens_seen": 79019415, "step": 3668, "time_per_iteration": 2.652148485183716 }, { "auxiliary_loss_clip": 0.0114929, "auxiliary_loss_mlp": 0.0102788, "balance_loss_clip": 1.04860258, "balance_loss_mlp": 1.01927269, "epoch": 0.44117116575482473, "flos": 19500571998720.0, "grad_norm": 2.1987294749154187, "language_loss": 0.83995783, "learning_rate": 2.473742722885797e-06, "loss": 0.8617295, "num_input_tokens_seen": 79038435, "step": 3669, "time_per_iteration": 2.6833221912384033 }, { "auxiliary_loss_clip": 0.01176356, "auxiliary_loss_mlp": 0.00764355, "balance_loss_clip": 1.05465055, "balance_loss_mlp": 1.00033581, "epoch": 0.44129140864546385, "flos": 27053124353280.0, "grad_norm": 2.2728954637744287, "language_loss": 0.65523899, "learning_rate": 2.4729858840611197e-06, "loss": 0.67464614, "num_input_tokens_seen": 79057345, "step": 3670, "time_per_iteration": 2.7127597332000732 }, { "auxiliary_loss_clip": 0.01189256, "auxiliary_loss_mlp": 0.01030352, "balance_loss_clip": 1.05443692, "balance_loss_mlp": 1.02215004, "epoch": 0.4414116515361029, "flos": 26102101910400.0, "grad_norm": 1.7786638519083433, "language_loss": 0.72827756, "learning_rate": 2.4722289734848605e-06, "loss": 0.75047362, "num_input_tokens_seen": 79077810, "step": 3671, "time_per_iteration": 2.663055419921875 }, { "auxiliary_loss_clip": 0.01145309, "auxiliary_loss_mlp": 0.01027758, "balance_loss_clip": 1.05255198, "balance_loss_mlp": 1.01946056, "epoch": 0.441531894426742, "flos": 21906083865600.0, "grad_norm": 1.9230970220592325, "language_loss": 0.77544498, "learning_rate": 2.471471991271841e-06, "loss": 0.79717571, "num_input_tokens_seen": 79094935, "step": 3672, "time_per_iteration": 2.7645130157470703 }, { "auxiliary_loss_clip": 0.01166021, "auxiliary_loss_mlp": 0.01036898, "balance_loss_clip": 1.04863834, "balance_loss_mlp": 1.02840459, "epoch": 0.4416521373173811, "flos": 23437099215360.0, "grad_norm": 3.171791582683192, "language_loss": 0.79453325, "learning_rate": 2.470714937536896e-06, "loss": 0.81656241, "num_input_tokens_seen": 79113660, "step": 3673, "time_per_iteration": 2.653517246246338 }, { "auxiliary_loss_clip": 0.0112897, "auxiliary_loss_mlp": 0.01027912, "balance_loss_clip": 1.04806066, "balance_loss_mlp": 1.01864934, "epoch": 0.4417723802080202, "flos": 20334345471360.0, "grad_norm": 1.7357159380252134, "language_loss": 0.703879, "learning_rate": 2.469957812394868e-06, "loss": 0.72544777, "num_input_tokens_seen": 79132470, "step": 3674, "time_per_iteration": 2.7627782821655273 }, { "auxiliary_loss_clip": 0.01191097, "auxiliary_loss_mlp": 0.01022861, "balance_loss_clip": 1.05695665, "balance_loss_mlp": 1.0142715, "epoch": 0.4418926230986593, "flos": 18880682060160.0, "grad_norm": 1.8906369535314556, "language_loss": 0.76415676, "learning_rate": 2.4692006159606148e-06, "loss": 0.78629637, "num_input_tokens_seen": 79150000, "step": 3675, "time_per_iteration": 2.6349289417266846 }, { "auxiliary_loss_clip": 0.01187429, "auxiliary_loss_mlp": 0.0103623, "balance_loss_clip": 1.0534029, "balance_loss_mlp": 1.02684855, "epoch": 0.4420128659892984, "flos": 19464409981440.0, "grad_norm": 1.764557602116056, "language_loss": 0.78231192, "learning_rate": 2.468443348349e-06, "loss": 0.8045485, "num_input_tokens_seen": 79167875, "step": 3676, "time_per_iteration": 2.654224157333374 }, { "auxiliary_loss_clip": 0.01130671, "auxiliary_loss_mlp": 0.01036314, "balance_loss_clip": 1.04563379, "balance_loss_mlp": 1.02708173, "epoch": 0.44213310887993745, "flos": 17894359526400.0, "grad_norm": 2.548508672115378, "language_loss": 0.82324702, "learning_rate": 2.467686009674902e-06, "loss": 0.84491688, "num_input_tokens_seen": 79182325, "step": 3677, "time_per_iteration": 3.705359935760498 }, { "auxiliary_loss_clip": 0.0116913, "auxiliary_loss_mlp": 0.01033282, "balance_loss_clip": 1.04970789, "balance_loss_mlp": 1.02457333, "epoch": 0.44225335177057656, "flos": 19204667758080.0, "grad_norm": 2.012377098272832, "language_loss": 0.85664546, "learning_rate": 2.466928600053209e-06, "loss": 0.8786695, "num_input_tokens_seen": 79197630, "step": 3678, "time_per_iteration": 3.6749086380004883 }, { "auxiliary_loss_clip": 0.01158071, "auxiliary_loss_mlp": 0.01028803, "balance_loss_clip": 1.04856491, "balance_loss_mlp": 1.02070832, "epoch": 0.4423735946612157, "flos": 23471321898240.0, "grad_norm": 1.916292391431073, "language_loss": 0.71573639, "learning_rate": 2.466171119598818e-06, "loss": 0.73760521, "num_input_tokens_seen": 79217600, "step": 3679, "time_per_iteration": 2.737898826599121 }, { "auxiliary_loss_clip": 0.01180921, "auxiliary_loss_mlp": 0.01028998, "balance_loss_clip": 1.05173624, "balance_loss_mlp": 1.01968813, "epoch": 0.44249383755185473, "flos": 26685398868480.0, "grad_norm": 1.885700049285115, "language_loss": 0.77584296, "learning_rate": 2.465413568426639e-06, "loss": 0.79794216, "num_input_tokens_seen": 79238550, "step": 3680, "time_per_iteration": 2.6661536693573 }, { "auxiliary_loss_clip": 0.01167261, "auxiliary_loss_mlp": 0.01030547, "balance_loss_clip": 1.05076897, "balance_loss_mlp": 1.02236927, "epoch": 0.44261408044249384, "flos": 23147659422720.0, "grad_norm": 1.572114119895612, "language_loss": 0.81549835, "learning_rate": 2.464655946651591e-06, "loss": 0.83747643, "num_input_tokens_seen": 79257555, "step": 3681, "time_per_iteration": 2.660778760910034 }, { "auxiliary_loss_clip": 0.01174918, "auxiliary_loss_mlp": 0.01031637, "balance_loss_clip": 1.05306196, "balance_loss_mlp": 1.02259481, "epoch": 0.44273432333313295, "flos": 24462564595200.0, "grad_norm": 1.8654908419979632, "language_loss": 0.80994827, "learning_rate": 2.4638982543886065e-06, "loss": 0.83201379, "num_input_tokens_seen": 79277595, "step": 3682, "time_per_iteration": 4.500851154327393 }, { "auxiliary_loss_clip": 0.01180309, "auxiliary_loss_mlp": 0.01031496, "balance_loss_clip": 1.05500889, "balance_loss_mlp": 1.02247238, "epoch": 0.442854566223772, "flos": 17528932512000.0, "grad_norm": 2.7293350687141738, "language_loss": 0.87106341, "learning_rate": 2.4631404917526254e-06, "loss": 0.8931815, "num_input_tokens_seen": 79294550, "step": 3683, "time_per_iteration": 2.5919923782348633 }, { "auxiliary_loss_clip": 0.0116482, "auxiliary_loss_mlp": 0.01035407, "balance_loss_clip": 1.04769373, "balance_loss_mlp": 1.02691913, "epoch": 0.4429748091144111, "flos": 24896293320960.0, "grad_norm": 1.50226329519889, "language_loss": 0.79706109, "learning_rate": 2.4623826588586e-06, "loss": 0.81906331, "num_input_tokens_seen": 79314820, "step": 3684, "time_per_iteration": 2.709543228149414 }, { "auxiliary_loss_clip": 0.01154547, "auxiliary_loss_mlp": 0.01029539, "balance_loss_clip": 1.0464673, "balance_loss_mlp": 1.02089, "epoch": 0.4430950520050502, "flos": 21614704738560.0, "grad_norm": 1.8300631747232492, "language_loss": 0.83053005, "learning_rate": 2.461624755821492e-06, "loss": 0.85237086, "num_input_tokens_seen": 79334300, "step": 3685, "time_per_iteration": 2.6639208793640137 }, { "auxiliary_loss_clip": 0.01150051, "auxiliary_loss_mlp": 0.01031635, "balance_loss_clip": 1.05169094, "balance_loss_mlp": 1.02252698, "epoch": 0.4432152948956893, "flos": 24572271709440.0, "grad_norm": 1.7863351606734668, "language_loss": 0.76662087, "learning_rate": 2.4608667827562763e-06, "loss": 0.78843772, "num_input_tokens_seen": 79353630, "step": 3686, "time_per_iteration": 2.727769613265991 }, { "auxiliary_loss_clip": 0.01181896, "auxiliary_loss_mlp": 0.01030952, "balance_loss_clip": 1.05472994, "balance_loss_mlp": 1.02230906, "epoch": 0.4433355377863284, "flos": 21762261809280.0, "grad_norm": 1.864729253474617, "language_loss": 0.90336102, "learning_rate": 2.460108739777936e-06, "loss": 0.92548954, "num_input_tokens_seen": 79372765, "step": 3687, "time_per_iteration": 2.6538448333740234 }, { "auxiliary_loss_clip": 0.01159539, "auxiliary_loss_mlp": 0.01035247, "balance_loss_clip": 1.05118704, "balance_loss_mlp": 1.02699792, "epoch": 0.44345578067696745, "flos": 20084479488000.0, "grad_norm": 1.9864796165648595, "language_loss": 0.76484847, "learning_rate": 2.4593506270014656e-06, "loss": 0.78679633, "num_input_tokens_seen": 79391735, "step": 3688, "time_per_iteration": 2.719886302947998 }, { "auxiliary_loss_clip": 0.0116022, "auxiliary_loss_mlp": 0.0103512, "balance_loss_clip": 1.04870105, "balance_loss_mlp": 1.02648926, "epoch": 0.44357602356760656, "flos": 24169497528960.0, "grad_norm": 1.4909973968486707, "language_loss": 0.81873405, "learning_rate": 2.45859244454187e-06, "loss": 0.84068745, "num_input_tokens_seen": 79411525, "step": 3689, "time_per_iteration": 2.784013509750366 }, { "auxiliary_loss_clip": 0.01170803, "auxiliary_loss_mlp": 0.01036745, "balance_loss_clip": 1.05049157, "balance_loss_mlp": 1.02842474, "epoch": 0.44369626645824567, "flos": 22707717644160.0, "grad_norm": 1.6195369754194644, "language_loss": 0.66389978, "learning_rate": 2.4578341925141655e-06, "loss": 0.68597531, "num_input_tokens_seen": 79430740, "step": 3690, "time_per_iteration": 2.6220197677612305 }, { "auxiliary_loss_clip": 0.0118093, "auxiliary_loss_mlp": 0.0103164, "balance_loss_clip": 1.05249512, "balance_loss_mlp": 1.02252054, "epoch": 0.4438165093488847, "flos": 38030225420160.0, "grad_norm": 1.9580520902409013, "language_loss": 0.72231179, "learning_rate": 2.457075871033378e-06, "loss": 0.74443746, "num_input_tokens_seen": 79452615, "step": 3691, "time_per_iteration": 2.910701274871826 }, { "auxiliary_loss_clip": 0.01143171, "auxiliary_loss_mlp": 0.01028273, "balance_loss_clip": 1.04935503, "balance_loss_mlp": 1.02041149, "epoch": 0.44393675223952384, "flos": 15523213996800.0, "grad_norm": 2.8604348363891234, "language_loss": 0.88606292, "learning_rate": 2.4563174802145445e-06, "loss": 0.90777743, "num_input_tokens_seen": 79469865, "step": 3692, "time_per_iteration": 2.7159037590026855 }, { "auxiliary_loss_clip": 0.01059512, "auxiliary_loss_mlp": 0.01003329, "balance_loss_clip": 1.01301098, "balance_loss_mlp": 1.00161254, "epoch": 0.44405699513016295, "flos": 64574893779840.0, "grad_norm": 0.6356160822279932, "language_loss": 0.48600069, "learning_rate": 2.455559020172712e-06, "loss": 0.50662911, "num_input_tokens_seen": 79537220, "step": 3693, "time_per_iteration": 3.408446788787842 }, { "auxiliary_loss_clip": 0.01136671, "auxiliary_loss_mlp": 0.0102945, "balance_loss_clip": 1.05126572, "balance_loss_mlp": 1.02115858, "epoch": 0.444177238020802, "flos": 23987394552960.0, "grad_norm": 1.987684780337597, "language_loss": 0.89591467, "learning_rate": 2.4548004910229385e-06, "loss": 0.91757596, "num_input_tokens_seen": 79554795, "step": 3694, "time_per_iteration": 2.761852741241455 }, { "auxiliary_loss_clip": 0.01174287, "auxiliary_loss_mlp": 0.00763444, "balance_loss_clip": 1.05029845, "balance_loss_mlp": 1.00036967, "epoch": 0.4442974809114411, "flos": 22563069575040.0, "grad_norm": 2.3475126399626487, "language_loss": 0.86876768, "learning_rate": 2.4540418928802913e-06, "loss": 0.88814497, "num_input_tokens_seen": 79573530, "step": 3695, "time_per_iteration": 2.7402238845825195 }, { "auxiliary_loss_clip": 0.01156689, "auxiliary_loss_mlp": 0.01027985, "balance_loss_clip": 1.04920638, "balance_loss_mlp": 1.01934862, "epoch": 0.4444177238020802, "flos": 17675699483520.0, "grad_norm": 2.0796532868567263, "language_loss": 0.66405356, "learning_rate": 2.4532832258598506e-06, "loss": 0.68590027, "num_input_tokens_seen": 79591360, "step": 3696, "time_per_iteration": 2.7102975845336914 }, { "auxiliary_loss_clip": 0.01185443, "auxiliary_loss_mlp": 0.01033549, "balance_loss_clip": 1.05217063, "balance_loss_mlp": 1.02516818, "epoch": 0.4445379666927193, "flos": 28621594609920.0, "grad_norm": 1.6977154941058177, "language_loss": 0.80404425, "learning_rate": 2.4525244900767047e-06, "loss": 0.8262341, "num_input_tokens_seen": 79612175, "step": 3697, "time_per_iteration": 2.714984178543091 }, { "auxiliary_loss_clip": 0.01065645, "auxiliary_loss_mlp": 0.01003383, "balance_loss_clip": 1.01315904, "balance_loss_mlp": 1.00159502, "epoch": 0.4446582095833584, "flos": 70487370115200.0, "grad_norm": 0.7775214546804061, "language_loss": 0.60492277, "learning_rate": 2.4517656856459536e-06, "loss": 0.62561309, "num_input_tokens_seen": 79678020, "step": 3698, "time_per_iteration": 3.319962739944458 }, { "auxiliary_loss_clip": 0.01169995, "auxiliary_loss_mlp": 0.01024554, "balance_loss_clip": 1.04904008, "balance_loss_mlp": 1.01596534, "epoch": 0.4447784524739975, "flos": 26505199313280.0, "grad_norm": 1.7230341897139447, "language_loss": 0.67932975, "learning_rate": 2.4510068126827073e-06, "loss": 0.70127523, "num_input_tokens_seen": 79699020, "step": 3699, "time_per_iteration": 2.699629783630371 }, { "auxiliary_loss_clip": 0.01158015, "auxiliary_loss_mlp": 0.01027966, "balance_loss_clip": 1.04895747, "balance_loss_mlp": 1.01928139, "epoch": 0.44489869536463655, "flos": 11656209553920.0, "grad_norm": 2.2501128830946806, "language_loss": 0.8231985, "learning_rate": 2.450247871302086e-06, "loss": 0.84505832, "num_input_tokens_seen": 79716795, "step": 3700, "time_per_iteration": 2.7203481197357178 }, { "auxiliary_loss_clip": 0.01178945, "auxiliary_loss_mlp": 0.01024791, "balance_loss_clip": 1.05246186, "balance_loss_mlp": 1.01605904, "epoch": 0.44501893825527566, "flos": 20448469958400.0, "grad_norm": 2.359187019720124, "language_loss": 0.8346411, "learning_rate": 2.44948886161922e-06, "loss": 0.85667849, "num_input_tokens_seen": 79735810, "step": 3701, "time_per_iteration": 2.612943410873413 }, { "auxiliary_loss_clip": 0.01175846, "auxiliary_loss_mlp": 0.01022259, "balance_loss_clip": 1.0510366, "balance_loss_mlp": 1.01436424, "epoch": 0.4451391811459148, "flos": 18261079430400.0, "grad_norm": 1.5677676291463951, "language_loss": 0.85075092, "learning_rate": 2.4487297837492524e-06, "loss": 0.87273204, "num_input_tokens_seen": 79754975, "step": 3702, "time_per_iteration": 2.7005562782287598 }, { "auxiliary_loss_clip": 0.01143918, "auxiliary_loss_mlp": 0.01028414, "balance_loss_clip": 1.04835701, "balance_loss_mlp": 1.01918757, "epoch": 0.44525942403655383, "flos": 16910155895040.0, "grad_norm": 1.8663613967310655, "language_loss": 0.62158978, "learning_rate": 2.4479706378073323e-06, "loss": 0.64331311, "num_input_tokens_seen": 79773515, "step": 3703, "time_per_iteration": 3.5774528980255127 }, { "auxiliary_loss_clip": 0.01133187, "auxiliary_loss_mlp": 0.01034003, "balance_loss_clip": 1.0434041, "balance_loss_mlp": 1.02599835, "epoch": 0.44537966692719294, "flos": 23258838994560.0, "grad_norm": 1.5586169113191524, "language_loss": 0.83846021, "learning_rate": 2.447211423908623e-06, "loss": 0.8601321, "num_input_tokens_seen": 79793560, "step": 3704, "time_per_iteration": 3.669285535812378 }, { "auxiliary_loss_clip": 0.01172487, "auxiliary_loss_mlp": 0.01025424, "balance_loss_clip": 1.0484066, "balance_loss_mlp": 1.01778841, "epoch": 0.445499909817832, "flos": 21724160457600.0, "grad_norm": 2.1163097233922805, "language_loss": 0.75000155, "learning_rate": 2.4464521421682966e-06, "loss": 0.77198064, "num_input_tokens_seen": 79811150, "step": 3705, "time_per_iteration": 2.6374497413635254 }, { "auxiliary_loss_clip": 0.01168456, "auxiliary_loss_mlp": 0.01029374, "balance_loss_clip": 1.05050969, "balance_loss_mlp": 1.02135754, "epoch": 0.4456201527084711, "flos": 23987969170560.0, "grad_norm": 1.4311894096633222, "language_loss": 0.87644976, "learning_rate": 2.4456927927015345e-06, "loss": 0.89842808, "num_input_tokens_seen": 79832190, "step": 3706, "time_per_iteration": 2.667948007583618 }, { "auxiliary_loss_clip": 0.01165953, "auxiliary_loss_mlp": 0.010351, "balance_loss_clip": 1.05316877, "balance_loss_mlp": 1.02567649, "epoch": 0.4457403955991102, "flos": 18807065136000.0, "grad_norm": 5.9274872244321894, "language_loss": 0.76259941, "learning_rate": 2.4449333756235307e-06, "loss": 0.78460991, "num_input_tokens_seen": 79848905, "step": 3707, "time_per_iteration": 3.643660545349121 }, { "auxiliary_loss_clip": 0.01177827, "auxiliary_loss_mlp": 0.01033823, "balance_loss_clip": 1.05269468, "balance_loss_mlp": 1.02577627, "epoch": 0.4458606384897493, "flos": 19207756327680.0, "grad_norm": 2.2977586470834828, "language_loss": 0.78487551, "learning_rate": 2.4441738910494876e-06, "loss": 0.80699199, "num_input_tokens_seen": 79863640, "step": 3708, "time_per_iteration": 3.550788640975952 }, { "auxiliary_loss_clip": 0.01165506, "auxiliary_loss_mlp": 0.01031935, "balance_loss_clip": 1.04889178, "balance_loss_mlp": 1.02341175, "epoch": 0.4459808813803884, "flos": 21361283308800.0, "grad_norm": 1.7981430834814152, "language_loss": 0.82355124, "learning_rate": 2.4434143390946176e-06, "loss": 0.84552562, "num_input_tokens_seen": 79882450, "step": 3709, "time_per_iteration": 2.7174766063690186 }, { "auxiliary_loss_clip": 0.01140289, "auxiliary_loss_mlp": 0.01031614, "balance_loss_clip": 1.0476203, "balance_loss_mlp": 1.02347481, "epoch": 0.4461011242710275, "flos": 23288967527040.0, "grad_norm": 1.891224166622154, "language_loss": 0.85978144, "learning_rate": 2.4426547198741457e-06, "loss": 0.88150048, "num_input_tokens_seen": 79900655, "step": 3710, "time_per_iteration": 2.746715545654297 }, { "auxiliary_loss_clip": 0.01129613, "auxiliary_loss_mlp": 0.01026547, "balance_loss_clip": 1.04963946, "balance_loss_mlp": 1.01747489, "epoch": 0.44622136716166655, "flos": 20193001453440.0, "grad_norm": 2.061965096083455, "language_loss": 0.7463305, "learning_rate": 2.441895033503305e-06, "loss": 0.76789212, "num_input_tokens_seen": 79918575, "step": 3711, "time_per_iteration": 2.7268617153167725 }, { "auxiliary_loss_clip": 0.01173085, "auxiliary_loss_mlp": 0.01031049, "balance_loss_clip": 1.05151701, "balance_loss_mlp": 1.02192354, "epoch": 0.44634161005230566, "flos": 21283033530240.0, "grad_norm": 1.7561245368745524, "language_loss": 0.82227176, "learning_rate": 2.4411352800973375e-06, "loss": 0.84431314, "num_input_tokens_seen": 79937010, "step": 3712, "time_per_iteration": 2.6689631938934326 }, { "auxiliary_loss_clip": 0.01138439, "auxiliary_loss_mlp": 0.01031024, "balance_loss_clip": 1.04519475, "balance_loss_mlp": 1.02263188, "epoch": 0.44646185294294477, "flos": 22929358515840.0, "grad_norm": 2.64742078489163, "language_loss": 0.7499373, "learning_rate": 2.4403754597715005e-06, "loss": 0.77163196, "num_input_tokens_seen": 79956455, "step": 3713, "time_per_iteration": 2.7410027980804443 }, { "auxiliary_loss_clip": 0.01160785, "auxiliary_loss_mlp": 0.01027201, "balance_loss_clip": 1.04634607, "balance_loss_mlp": 1.01799798, "epoch": 0.4465820958335838, "flos": 22637692080000.0, "grad_norm": 2.3638893793753826, "language_loss": 0.92977023, "learning_rate": 2.4396155726410553e-06, "loss": 0.95165014, "num_input_tokens_seen": 79975065, "step": 3714, "time_per_iteration": 2.687425374984741 }, { "auxiliary_loss_clip": 0.01177761, "auxiliary_loss_mlp": 0.01028566, "balance_loss_clip": 1.04959607, "balance_loss_mlp": 1.01993477, "epoch": 0.44670233872422294, "flos": 22672525294080.0, "grad_norm": 2.8461451588034317, "language_loss": 0.90624136, "learning_rate": 2.438855618821278e-06, "loss": 0.92830455, "num_input_tokens_seen": 79990865, "step": 3715, "time_per_iteration": 2.64646315574646 }, { "auxiliary_loss_clip": 0.0116278, "auxiliary_loss_mlp": 0.01027242, "balance_loss_clip": 1.04642415, "balance_loss_mlp": 1.01849198, "epoch": 0.44682258161486205, "flos": 23582178247680.0, "grad_norm": 1.6684953625130445, "language_loss": 0.66909754, "learning_rate": 2.4380955984274517e-06, "loss": 0.69099778, "num_input_tokens_seen": 80009520, "step": 3716, "time_per_iteration": 2.625842332839966 }, { "auxiliary_loss_clip": 0.01170924, "auxiliary_loss_mlp": 0.01028221, "balance_loss_clip": 1.04930568, "balance_loss_mlp": 1.019292, "epoch": 0.4469428245055011, "flos": 26501356558080.0, "grad_norm": 1.979159249626376, "language_loss": 0.76599419, "learning_rate": 2.4373355115748716e-06, "loss": 0.78798568, "num_input_tokens_seen": 80030350, "step": 3717, "time_per_iteration": 2.687736749649048 }, { "auxiliary_loss_clip": 0.01150266, "auxiliary_loss_mlp": 0.01029483, "balance_loss_clip": 1.04863071, "balance_loss_mlp": 1.0207808, "epoch": 0.4470630673961402, "flos": 21504925797120.0, "grad_norm": 1.5975646144150326, "language_loss": 0.72299576, "learning_rate": 2.436575358378842e-06, "loss": 0.7447933, "num_input_tokens_seen": 80049840, "step": 3718, "time_per_iteration": 2.679807424545288 }, { "auxiliary_loss_clip": 0.01163651, "auxiliary_loss_mlp": 0.01033644, "balance_loss_clip": 1.05053985, "balance_loss_mlp": 1.02523077, "epoch": 0.44718331028677927, "flos": 16173986653440.0, "grad_norm": 2.9096758162404157, "language_loss": 0.82978076, "learning_rate": 2.4358151389546782e-06, "loss": 0.85175371, "num_input_tokens_seen": 80066525, "step": 3719, "time_per_iteration": 2.8122947216033936 }, { "auxiliary_loss_clip": 0.01187144, "auxiliary_loss_mlp": 0.01032401, "balance_loss_clip": 1.05275917, "balance_loss_mlp": 1.02421701, "epoch": 0.4473035531774184, "flos": 19681238430720.0, "grad_norm": 4.630096588913423, "language_loss": 0.7538864, "learning_rate": 2.4350548534177035e-06, "loss": 0.77608186, "num_input_tokens_seen": 80083355, "step": 3720, "time_per_iteration": 2.6823770999908447 }, { "auxiliary_loss_clip": 0.01144634, "auxiliary_loss_mlp": 0.01032089, "balance_loss_clip": 1.04712725, "balance_loss_mlp": 1.02441764, "epoch": 0.4474237960680575, "flos": 41427590515200.0, "grad_norm": 1.6168103022164773, "language_loss": 0.66631538, "learning_rate": 2.434294501883254e-06, "loss": 0.68808258, "num_input_tokens_seen": 80106450, "step": 3721, "time_per_iteration": 2.9095029830932617 }, { "auxiliary_loss_clip": 0.01146952, "auxiliary_loss_mlp": 0.01028793, "balance_loss_clip": 1.04503858, "balance_loss_mlp": 1.020329, "epoch": 0.44754403895869654, "flos": 22891328991360.0, "grad_norm": 1.998662527345902, "language_loss": 0.65730309, "learning_rate": 2.433534084466674e-06, "loss": 0.67906052, "num_input_tokens_seen": 80125670, "step": 3722, "time_per_iteration": 2.7547476291656494 }, { "auxiliary_loss_clip": 0.01184137, "auxiliary_loss_mlp": 0.01028909, "balance_loss_clip": 1.05067444, "balance_loss_mlp": 1.02101767, "epoch": 0.44766428184933565, "flos": 25630271832960.0, "grad_norm": 1.780718511377368, "language_loss": 0.70666313, "learning_rate": 2.4327736012833178e-06, "loss": 0.72879356, "num_input_tokens_seen": 80147390, "step": 3723, "time_per_iteration": 2.653913974761963 }, { "auxiliary_loss_clip": 0.01171351, "auxiliary_loss_mlp": 0.01026896, "balance_loss_clip": 1.05061448, "balance_loss_mlp": 1.01832199, "epoch": 0.44778452473997477, "flos": 20448972748800.0, "grad_norm": 2.1773636712034814, "language_loss": 0.76705116, "learning_rate": 2.4320130524485506e-06, "loss": 0.78903359, "num_input_tokens_seen": 80166185, "step": 3724, "time_per_iteration": 2.678165912628174 }, { "auxiliary_loss_clip": 0.01150326, "auxiliary_loss_mlp": 0.01030189, "balance_loss_clip": 1.04945385, "balance_loss_mlp": 1.0222261, "epoch": 0.4479047676306138, "flos": 21975462984960.0, "grad_norm": 1.4793607652058478, "language_loss": 0.79697204, "learning_rate": 2.431252438077746e-06, "loss": 0.8187772, "num_input_tokens_seen": 80185685, "step": 3725, "time_per_iteration": 2.6977450847625732 }, { "auxiliary_loss_clip": 0.01176352, "auxiliary_loss_mlp": 0.00763426, "balance_loss_clip": 1.05042672, "balance_loss_mlp": 1.00031447, "epoch": 0.44802501052125293, "flos": 21467219495040.0, "grad_norm": 5.154584527475392, "language_loss": 0.77161348, "learning_rate": 2.4304917582862906e-06, "loss": 0.79101121, "num_input_tokens_seen": 80204865, "step": 3726, "time_per_iteration": 2.6747653484344482 }, { "auxiliary_loss_clip": 0.01187387, "auxiliary_loss_mlp": 0.01031923, "balance_loss_clip": 1.05290544, "balance_loss_mlp": 1.02413821, "epoch": 0.44814525341189204, "flos": 22126970551680.0, "grad_norm": 2.2023255207635164, "language_loss": 0.87729323, "learning_rate": 2.4297310131895774e-06, "loss": 0.8994863, "num_input_tokens_seen": 80223410, "step": 3727, "time_per_iteration": 2.581878185272217 }, { "auxiliary_loss_clip": 0.01169804, "auxiliary_loss_mlp": 0.01025962, "balance_loss_clip": 1.04938126, "balance_loss_mlp": 1.01784325, "epoch": 0.4482654963025311, "flos": 16653933204480.0, "grad_norm": 13.841496533601566, "language_loss": 0.74719018, "learning_rate": 2.4289702029030113e-06, "loss": 0.76914787, "num_input_tokens_seen": 80240880, "step": 3728, "time_per_iteration": 3.5480575561523438 }, { "auxiliary_loss_clip": 0.01175639, "auxiliary_loss_mlp": 0.01029412, "balance_loss_clip": 1.0536375, "balance_loss_mlp": 1.02147257, "epoch": 0.4483857391931702, "flos": 18841251905280.0, "grad_norm": 1.9264508614778388, "language_loss": 0.83130598, "learning_rate": 2.4282093275420057e-06, "loss": 0.85335648, "num_input_tokens_seen": 80259910, "step": 3729, "time_per_iteration": 2.671405792236328 }, { "auxiliary_loss_clip": 0.01177451, "auxiliary_loss_mlp": 0.01021788, "balance_loss_clip": 1.0516361, "balance_loss_mlp": 1.01355624, "epoch": 0.4485059820838093, "flos": 20372590477440.0, "grad_norm": 2.578670321438649, "language_loss": 0.70287108, "learning_rate": 2.4274483872219863e-06, "loss": 0.72486347, "num_input_tokens_seen": 80277270, "step": 3730, "time_per_iteration": 3.5934853553771973 }, { "auxiliary_loss_clip": 0.011696, "auxiliary_loss_mlp": 0.01027589, "balance_loss_clip": 1.04996026, "balance_loss_mlp": 1.01945853, "epoch": 0.4486262249744484, "flos": 20047742853120.0, "grad_norm": 2.061967806418435, "language_loss": 0.93652701, "learning_rate": 2.426687382058386e-06, "loss": 0.9584989, "num_input_tokens_seen": 80295550, "step": 3731, "time_per_iteration": 2.7246763706207275 }, { "auxiliary_loss_clip": 0.0106748, "auxiliary_loss_mlp": 0.01004634, "balance_loss_clip": 1.01476359, "balance_loss_mlp": 1.00291753, "epoch": 0.4487464678650875, "flos": 64595684776320.0, "grad_norm": 0.8673221143331336, "language_loss": 0.59838605, "learning_rate": 2.425926312166649e-06, "loss": 0.61910713, "num_input_tokens_seen": 80348425, "step": 3732, "time_per_iteration": 3.1514012813568115 }, { "auxiliary_loss_clip": 0.01163324, "auxiliary_loss_mlp": 0.01033687, "balance_loss_clip": 1.05027199, "balance_loss_mlp": 1.02499115, "epoch": 0.4488667107557266, "flos": 20769798049920.0, "grad_norm": 2.6126996734870844, "language_loss": 0.73397666, "learning_rate": 2.42516517766223e-06, "loss": 0.75594676, "num_input_tokens_seen": 80366505, "step": 3733, "time_per_iteration": 3.7910943031311035 }, { "auxiliary_loss_clip": 0.01189566, "auxiliary_loss_mlp": 0.01027154, "balance_loss_clip": 1.05509162, "balance_loss_mlp": 1.01897597, "epoch": 0.44898695364636565, "flos": 23951735326080.0, "grad_norm": 2.039417551397409, "language_loss": 0.67824233, "learning_rate": 2.4244039786605907e-06, "loss": 0.70040953, "num_input_tokens_seen": 80387510, "step": 3734, "time_per_iteration": 3.6253175735473633 }, { "auxiliary_loss_clip": 0.0112732, "auxiliary_loss_mlp": 0.0103317, "balance_loss_clip": 1.0429697, "balance_loss_mlp": 1.02465212, "epoch": 0.44910719653700476, "flos": 18624351628800.0, "grad_norm": 2.4740941299298798, "language_loss": 0.83188832, "learning_rate": 2.4236427152772055e-06, "loss": 0.85349321, "num_input_tokens_seen": 80405915, "step": 3735, "time_per_iteration": 2.7283754348754883 }, { "auxiliary_loss_clip": 0.01037301, "auxiliary_loss_mlp": 0.01002749, "balance_loss_clip": 1.01436007, "balance_loss_mlp": 1.00115168, "epoch": 0.4492274394276438, "flos": 57033435749760.0, "grad_norm": 0.838642635392877, "language_loss": 0.57317585, "learning_rate": 2.422881387627557e-06, "loss": 0.59357631, "num_input_tokens_seen": 80458365, "step": 3736, "time_per_iteration": 3.1052865982055664 }, { "auxiliary_loss_clip": 0.01158084, "auxiliary_loss_mlp": 0.01029693, "balance_loss_clip": 1.0480963, "balance_loss_mlp": 1.02102017, "epoch": 0.4493476823182829, "flos": 23254888498560.0, "grad_norm": 1.803218869283669, "language_loss": 0.77334529, "learning_rate": 2.422119995827139e-06, "loss": 0.79522306, "num_input_tokens_seen": 80478490, "step": 3737, "time_per_iteration": 2.6839122772216797 }, { "auxiliary_loss_clip": 0.01177546, "auxiliary_loss_mlp": 0.01030186, "balance_loss_clip": 1.05303955, "balance_loss_mlp": 1.02171016, "epoch": 0.44946792520892204, "flos": 15815131827840.0, "grad_norm": 3.386947649327146, "language_loss": 0.74842393, "learning_rate": 2.4213585399914528e-06, "loss": 0.77050126, "num_input_tokens_seen": 80495695, "step": 3738, "time_per_iteration": 2.662548065185547 }, { "auxiliary_loss_clip": 0.01172289, "auxiliary_loss_mlp": 0.01027591, "balance_loss_clip": 1.05266726, "balance_loss_mlp": 1.01979494, "epoch": 0.4495881680995611, "flos": 19610063631360.0, "grad_norm": 1.7531185098568636, "language_loss": 0.85532647, "learning_rate": 2.4205970202360113e-06, "loss": 0.87732524, "num_input_tokens_seen": 80515260, "step": 3739, "time_per_iteration": 2.6344974040985107 }, { "auxiliary_loss_clip": 0.01120667, "auxiliary_loss_mlp": 0.01025689, "balance_loss_clip": 1.04503083, "balance_loss_mlp": 1.01710558, "epoch": 0.4497084109902002, "flos": 26031465815040.0, "grad_norm": 1.9340890559043171, "language_loss": 0.78204107, "learning_rate": 2.4198354366763354e-06, "loss": 0.80350459, "num_input_tokens_seen": 80533900, "step": 3740, "time_per_iteration": 2.8254148960113525 }, { "auxiliary_loss_clip": 0.0116149, "auxiliary_loss_mlp": 0.01028992, "balance_loss_clip": 1.05013728, "balance_loss_mlp": 1.02148211, "epoch": 0.4498286538808393, "flos": 14793688771200.0, "grad_norm": 1.9243048671192629, "language_loss": 0.78278852, "learning_rate": 2.4190737894279587e-06, "loss": 0.80469334, "num_input_tokens_seen": 80551270, "step": 3741, "time_per_iteration": 2.7033510208129883 }, { "auxiliary_loss_clip": 0.01130105, "auxiliary_loss_mlp": 0.01032654, "balance_loss_clip": 1.04157627, "balance_loss_mlp": 1.02382028, "epoch": 0.44994889677147837, "flos": 15450171690240.0, "grad_norm": 2.012972342179157, "language_loss": 0.80854881, "learning_rate": 2.4183120786064203e-06, "loss": 0.83017641, "num_input_tokens_seen": 80568145, "step": 3742, "time_per_iteration": 2.6959125995635986 }, { "auxiliary_loss_clip": 0.01171189, "auxiliary_loss_mlp": 0.00763163, "balance_loss_clip": 1.05054927, "balance_loss_mlp": 1.00026524, "epoch": 0.4500691396621175, "flos": 21798316085760.0, "grad_norm": 2.749750298598797, "language_loss": 0.8568306, "learning_rate": 2.417550304327273e-06, "loss": 0.87617409, "num_input_tokens_seen": 80586185, "step": 3743, "time_per_iteration": 2.7928082942962646 }, { "auxiliary_loss_clip": 0.01189378, "auxiliary_loss_mlp": 0.01033277, "balance_loss_clip": 1.05354571, "balance_loss_mlp": 1.02394271, "epoch": 0.4501893825527566, "flos": 32382016421760.0, "grad_norm": 1.587820998816688, "language_loss": 0.75999749, "learning_rate": 2.4167884667060763e-06, "loss": 0.78222406, "num_input_tokens_seen": 80608895, "step": 3744, "time_per_iteration": 2.6924309730529785 }, { "auxiliary_loss_clip": 0.01158899, "auxiliary_loss_mlp": 0.01031549, "balance_loss_clip": 1.04904962, "balance_loss_mlp": 1.02303112, "epoch": 0.45030962544339564, "flos": 16544944362240.0, "grad_norm": 3.290643060498265, "language_loss": 0.87137222, "learning_rate": 2.4160265658584e-06, "loss": 0.89327663, "num_input_tokens_seen": 80623785, "step": 3745, "time_per_iteration": 2.659545660018921 }, { "auxiliary_loss_clip": 0.0117399, "auxiliary_loss_mlp": 0.01035937, "balance_loss_clip": 1.05082202, "balance_loss_mlp": 1.02771103, "epoch": 0.45042986833403476, "flos": 19573039687680.0, "grad_norm": 2.295046646712436, "language_loss": 0.68435204, "learning_rate": 2.4152646018998253e-06, "loss": 0.7064513, "num_input_tokens_seen": 80642735, "step": 3746, "time_per_iteration": 2.6366541385650635 }, { "auxiliary_loss_clip": 0.01168646, "auxiliary_loss_mlp": 0.01032356, "balance_loss_clip": 1.05047381, "balance_loss_mlp": 1.02414203, "epoch": 0.45055011122467387, "flos": 23112467072640.0, "grad_norm": 1.6962276933528193, "language_loss": 0.71977341, "learning_rate": 2.4145025749459403e-06, "loss": 0.74178344, "num_input_tokens_seen": 80663760, "step": 3747, "time_per_iteration": 2.7313196659088135 }, { "auxiliary_loss_clip": 0.01100632, "auxiliary_loss_mlp": 0.01027753, "balance_loss_clip": 1.04455078, "balance_loss_mlp": 1.01908016, "epoch": 0.4506703541153129, "flos": 19934623946880.0, "grad_norm": 2.31288153775069, "language_loss": 0.69944036, "learning_rate": 2.413740485112344e-06, "loss": 0.72072423, "num_input_tokens_seen": 80682100, "step": 3748, "time_per_iteration": 2.8301150798797607 }, { "auxiliary_loss_clip": 0.01151673, "auxiliary_loss_mlp": 0.01025586, "balance_loss_clip": 1.05000997, "balance_loss_mlp": 1.01782596, "epoch": 0.45079059700595203, "flos": 19499530504320.0, "grad_norm": 1.7311697912415969, "language_loss": 0.8192293, "learning_rate": 2.412978332514646e-06, "loss": 0.84100193, "num_input_tokens_seen": 80700880, "step": 3749, "time_per_iteration": 2.8018741607666016 }, { "auxiliary_loss_clip": 0.01159631, "auxiliary_loss_mlp": 0.01033265, "balance_loss_clip": 1.05004287, "balance_loss_mlp": 1.02415133, "epoch": 0.4509108398965911, "flos": 27636313570560.0, "grad_norm": 2.1296140080458223, "language_loss": 0.72336721, "learning_rate": 2.4122161172684623e-06, "loss": 0.74529612, "num_input_tokens_seen": 80721675, "step": 3750, "time_per_iteration": 2.841596841812134 }, { "auxiliary_loss_clip": 0.01158688, "auxiliary_loss_mlp": 0.01031077, "balance_loss_clip": 1.0487622, "balance_loss_mlp": 1.0226723, "epoch": 0.4510310827872302, "flos": 20995712640000.0, "grad_norm": 2.087078154673364, "language_loss": 0.84000063, "learning_rate": 2.4114538394894216e-06, "loss": 0.8618983, "num_input_tokens_seen": 80739315, "step": 3751, "time_per_iteration": 2.650799036026001 }, { "auxiliary_loss_clip": 0.01153704, "auxiliary_loss_mlp": 0.01026303, "balance_loss_clip": 1.0439868, "balance_loss_mlp": 1.01878715, "epoch": 0.4511513256778693, "flos": 16216684945920.0, "grad_norm": 1.8182877763830234, "language_loss": 0.83383983, "learning_rate": 2.410691499293161e-06, "loss": 0.85563993, "num_input_tokens_seen": 80757470, "step": 3752, "time_per_iteration": 2.846543550491333 }, { "auxiliary_loss_clip": 0.0116847, "auxiliary_loss_mlp": 0.01026367, "balance_loss_clip": 1.04698825, "balance_loss_mlp": 1.01836765, "epoch": 0.45127156856850836, "flos": 25186702780800.0, "grad_norm": 1.6314070629931532, "language_loss": 0.74215221, "learning_rate": 2.409929096795326e-06, "loss": 0.76410061, "num_input_tokens_seen": 80777840, "step": 3753, "time_per_iteration": 2.6440107822418213 }, { "auxiliary_loss_clip": 0.01169447, "auxiliary_loss_mlp": 0.01030674, "balance_loss_clip": 1.04726541, "balance_loss_mlp": 1.02244282, "epoch": 0.4513918114591475, "flos": 20412523422720.0, "grad_norm": 2.604885988786167, "language_loss": 0.79139578, "learning_rate": 2.409166632111573e-06, "loss": 0.81339699, "num_input_tokens_seen": 80795975, "step": 3754, "time_per_iteration": 2.7049014568328857 }, { "auxiliary_loss_clip": 0.0117901, "auxiliary_loss_mlp": 0.01031577, "balance_loss_clip": 1.05141616, "balance_loss_mlp": 1.02280927, "epoch": 0.4515120543497866, "flos": 26648482665600.0, "grad_norm": 1.9902238550495577, "language_loss": 0.80229127, "learning_rate": 2.4084041053575674e-06, "loss": 0.82439721, "num_input_tokens_seen": 80815395, "step": 3755, "time_per_iteration": 3.633913040161133 }, { "auxiliary_loss_clip": 0.01161433, "auxiliary_loss_mlp": 0.01025101, "balance_loss_clip": 1.04907346, "balance_loss_mlp": 1.01661885, "epoch": 0.45163229724042564, "flos": 20595093275520.0, "grad_norm": 1.987712151582793, "language_loss": 0.72094715, "learning_rate": 2.4076415166489834e-06, "loss": 0.74281251, "num_input_tokens_seen": 80834805, "step": 3756, "time_per_iteration": 3.5977869033813477 }, { "auxiliary_loss_clip": 0.01130867, "auxiliary_loss_mlp": 0.01027726, "balance_loss_clip": 1.04537892, "balance_loss_mlp": 1.02006078, "epoch": 0.45175254013106475, "flos": 21689004021120.0, "grad_norm": 1.5522171604790427, "language_loss": 0.79136974, "learning_rate": 2.406878866101506e-06, "loss": 0.81295568, "num_input_tokens_seen": 80853770, "step": 3757, "time_per_iteration": 2.797980785369873 }, { "auxiliary_loss_clip": 0.01185493, "auxiliary_loss_mlp": 0.01026674, "balance_loss_clip": 1.05370235, "balance_loss_mlp": 1.01829314, "epoch": 0.45187278302170386, "flos": 18878850466560.0, "grad_norm": 1.9422172811799012, "language_loss": 0.78264439, "learning_rate": 2.4061161538308273e-06, "loss": 0.80476606, "num_input_tokens_seen": 80870615, "step": 3758, "time_per_iteration": 2.583890676498413 }, { "auxiliary_loss_clip": 0.01169048, "auxiliary_loss_mlp": 0.0102518, "balance_loss_clip": 1.05010903, "balance_loss_mlp": 1.01711559, "epoch": 0.4519930259123429, "flos": 18582479349120.0, "grad_norm": 1.949067143845132, "language_loss": 0.88996321, "learning_rate": 2.4053533799526523e-06, "loss": 0.91190553, "num_input_tokens_seen": 80886335, "step": 3759, "time_per_iteration": 3.513416290283203 }, { "auxiliary_loss_clip": 0.01153704, "auxiliary_loss_mlp": 0.01024502, "balance_loss_clip": 1.05002058, "balance_loss_mlp": 1.01693845, "epoch": 0.452113268802982, "flos": 25192377129600.0, "grad_norm": 1.670694566229579, "language_loss": 0.86217773, "learning_rate": 2.404590544582691e-06, "loss": 0.88395977, "num_input_tokens_seen": 80904570, "step": 3760, "time_per_iteration": 3.612510919570923 }, { "auxiliary_loss_clip": 0.01131098, "auxiliary_loss_mlp": 0.01029318, "balance_loss_clip": 1.0424881, "balance_loss_mlp": 1.02067566, "epoch": 0.45223351169362114, "flos": 39378922312320.0, "grad_norm": 1.7394359063219174, "language_loss": 0.81384605, "learning_rate": 2.403827647836666e-06, "loss": 0.83545017, "num_input_tokens_seen": 80925125, "step": 3761, "time_per_iteration": 2.891256809234619 }, { "auxiliary_loss_clip": 0.01184293, "auxiliary_loss_mlp": 0.01029846, "balance_loss_clip": 1.05051017, "balance_loss_mlp": 1.02185869, "epoch": 0.4523537545842602, "flos": 21582169994880.0, "grad_norm": 2.056226691111121, "language_loss": 0.69536531, "learning_rate": 2.4030646898303075e-06, "loss": 0.71750671, "num_input_tokens_seen": 80946615, "step": 3762, "time_per_iteration": 2.6750059127807617 }, { "auxiliary_loss_clip": 0.01157653, "auxiliary_loss_mlp": 0.01031308, "balance_loss_clip": 1.04560137, "balance_loss_mlp": 1.02302849, "epoch": 0.4524739974748993, "flos": 28439527547520.0, "grad_norm": 1.9973584037012408, "language_loss": 0.81807506, "learning_rate": 2.4023016706793566e-06, "loss": 0.83996463, "num_input_tokens_seen": 80966410, "step": 3763, "time_per_iteration": 2.7446541786193848 }, { "auxiliary_loss_clip": 0.01056773, "auxiliary_loss_mlp": 0.0100427, "balance_loss_clip": 1.01553297, "balance_loss_mlp": 1.00254142, "epoch": 0.4525942403655384, "flos": 61556492148480.0, "grad_norm": 0.7690677125990084, "language_loss": 0.5687325, "learning_rate": 2.401538590499561e-06, "loss": 0.58934295, "num_input_tokens_seen": 81026865, "step": 3764, "time_per_iteration": 3.3574702739715576 }, { "auxiliary_loss_clip": 0.01170493, "auxiliary_loss_mlp": 0.00763614, "balance_loss_clip": 1.04831064, "balance_loss_mlp": 1.00025606, "epoch": 0.45271448325617747, "flos": 27529838680320.0, "grad_norm": 2.1747407383060153, "language_loss": 0.71750152, "learning_rate": 2.400775449406682e-06, "loss": 0.73684263, "num_input_tokens_seen": 81050060, "step": 3765, "time_per_iteration": 2.763550043106079 }, { "auxiliary_loss_clip": 0.01167059, "auxiliary_loss_mlp": 0.01025208, "balance_loss_clip": 1.04585242, "balance_loss_mlp": 1.01785231, "epoch": 0.4528347261468166, "flos": 22452608275200.0, "grad_norm": 1.9879067354228466, "language_loss": 0.7315942, "learning_rate": 2.400012247516485e-06, "loss": 0.75351691, "num_input_tokens_seen": 81070625, "step": 3766, "time_per_iteration": 2.654012680053711 }, { "auxiliary_loss_clip": 0.01145021, "auxiliary_loss_mlp": 0.01033792, "balance_loss_clip": 1.0464232, "balance_loss_mlp": 1.02535772, "epoch": 0.45295496903745563, "flos": 21103875469440.0, "grad_norm": 2.0004437472374974, "language_loss": 0.90366805, "learning_rate": 2.3992489849447484e-06, "loss": 0.92545617, "num_input_tokens_seen": 81089080, "step": 3767, "time_per_iteration": 2.763418197631836 }, { "auxiliary_loss_clip": 0.01146622, "auxiliary_loss_mlp": 0.01026655, "balance_loss_clip": 1.04809022, "balance_loss_mlp": 1.01768482, "epoch": 0.45307521192809475, "flos": 23221168606080.0, "grad_norm": 1.6992379321269822, "language_loss": 0.79000545, "learning_rate": 2.3984856618072584e-06, "loss": 0.81173825, "num_input_tokens_seen": 81109115, "step": 3768, "time_per_iteration": 2.7231523990631104 }, { "auxiliary_loss_clip": 0.0114943, "auxiliary_loss_mlp": 0.01030762, "balance_loss_clip": 1.04883504, "balance_loss_mlp": 1.02281642, "epoch": 0.45319545481873386, "flos": 15560094286080.0, "grad_norm": 1.8501080292220489, "language_loss": 0.74324691, "learning_rate": 2.3977222782198098e-06, "loss": 0.76504886, "num_input_tokens_seen": 81127750, "step": 3769, "time_per_iteration": 2.7318825721740723 }, { "auxiliary_loss_clip": 0.01134542, "auxiliary_loss_mlp": 0.010254, "balance_loss_clip": 1.04494858, "balance_loss_mlp": 1.01701999, "epoch": 0.4533156977093729, "flos": 21944759834880.0, "grad_norm": 2.0209836197747557, "language_loss": 0.75196445, "learning_rate": 2.3969588342982077e-06, "loss": 0.77356386, "num_input_tokens_seen": 81147125, "step": 3770, "time_per_iteration": 2.7186527252197266 }, { "auxiliary_loss_clip": 0.01167133, "auxiliary_loss_mlp": 0.01028307, "balance_loss_clip": 1.04968131, "balance_loss_mlp": 1.02040362, "epoch": 0.453435940600012, "flos": 24242180699520.0, "grad_norm": 1.653628939923085, "language_loss": 0.72739148, "learning_rate": 2.396195330158267e-06, "loss": 0.74934596, "num_input_tokens_seen": 81167015, "step": 3771, "time_per_iteration": 2.752680778503418 }, { "auxiliary_loss_clip": 0.01184276, "auxiliary_loss_mlp": 0.01032995, "balance_loss_clip": 1.05028069, "balance_loss_mlp": 1.025038, "epoch": 0.45355618349065113, "flos": 23440367352960.0, "grad_norm": 1.7424126397020594, "language_loss": 0.79460514, "learning_rate": 2.3954317659158094e-06, "loss": 0.81677788, "num_input_tokens_seen": 81187350, "step": 3772, "time_per_iteration": 2.6147401332855225 }, { "auxiliary_loss_clip": 0.01084079, "auxiliary_loss_mlp": 0.0100222, "balance_loss_clip": 1.01596618, "balance_loss_mlp": 1.00043201, "epoch": 0.4536764263812902, "flos": 66903161448960.0, "grad_norm": 0.992214202620696, "language_loss": 0.56943893, "learning_rate": 2.394668141686667e-06, "loss": 0.59030193, "num_input_tokens_seen": 81249315, "step": 3773, "time_per_iteration": 3.224578380584717 }, { "auxiliary_loss_clip": 0.01166225, "auxiliary_loss_mlp": 0.01026493, "balance_loss_clip": 1.04785931, "balance_loss_mlp": 1.01885128, "epoch": 0.4537966692719293, "flos": 42739766254080.0, "grad_norm": 2.1126769203219555, "language_loss": 0.69490826, "learning_rate": 2.3939044575866813e-06, "loss": 0.71683544, "num_input_tokens_seen": 81272065, "step": 3774, "time_per_iteration": 2.8738088607788086 }, { "auxiliary_loss_clip": 0.01150539, "auxiliary_loss_mlp": 0.00763557, "balance_loss_clip": 1.0472641, "balance_loss_mlp": 1.00020862, "epoch": 0.4539169121625684, "flos": 35549480517120.0, "grad_norm": 2.163951829345864, "language_loss": 0.75248575, "learning_rate": 2.3931407137317024e-06, "loss": 0.77162671, "num_input_tokens_seen": 81292220, "step": 3775, "time_per_iteration": 2.761521100997925 }, { "auxiliary_loss_clip": 0.01138765, "auxiliary_loss_mlp": 0.01030612, "balance_loss_clip": 1.04337525, "balance_loss_mlp": 1.02215993, "epoch": 0.45403715505320746, "flos": 18514716341760.0, "grad_norm": 1.7245377508497888, "language_loss": 0.84790313, "learning_rate": 2.3923769102375907e-06, "loss": 0.86959696, "num_input_tokens_seen": 81311085, "step": 3776, "time_per_iteration": 2.742717742919922 }, { "auxiliary_loss_clip": 0.01141108, "auxiliary_loss_mlp": 0.01033443, "balance_loss_clip": 1.04467249, "balance_loss_mlp": 1.02455044, "epoch": 0.4541573979438466, "flos": 25045825639680.0, "grad_norm": 2.2899578580626097, "language_loss": 0.78621304, "learning_rate": 2.391613047220213e-06, "loss": 0.80795848, "num_input_tokens_seen": 81330985, "step": 3777, "time_per_iteration": 2.668560028076172 }, { "auxiliary_loss_clip": 0.01136545, "auxiliary_loss_mlp": 0.01031711, "balance_loss_clip": 1.04527807, "balance_loss_mlp": 1.02334237, "epoch": 0.4542776408344857, "flos": 18332397884160.0, "grad_norm": 1.7960918455177004, "language_loss": 0.78776139, "learning_rate": 2.390849124795447e-06, "loss": 0.80944395, "num_input_tokens_seen": 81346985, "step": 3778, "time_per_iteration": 2.6834897994995117 }, { "auxiliary_loss_clip": 0.0118587, "auxiliary_loss_mlp": 0.01029058, "balance_loss_clip": 1.05212522, "balance_loss_mlp": 1.02073753, "epoch": 0.45439788372512474, "flos": 20701173116160.0, "grad_norm": 2.011187735931306, "language_loss": 0.84064734, "learning_rate": 2.3900851430791804e-06, "loss": 0.8627966, "num_input_tokens_seen": 81365005, "step": 3779, "time_per_iteration": 2.636666774749756 }, { "auxiliary_loss_clip": 0.01187287, "auxiliary_loss_mlp": 0.01029054, "balance_loss_clip": 1.05045247, "balance_loss_mlp": 1.02007127, "epoch": 0.45451812661576385, "flos": 22309432663680.0, "grad_norm": 2.2579276623578455, "language_loss": 0.84659874, "learning_rate": 2.389321102187307e-06, "loss": 0.86876214, "num_input_tokens_seen": 81383785, "step": 3780, "time_per_iteration": 2.577648639678955 }, { "auxiliary_loss_clip": 0.01153412, "auxiliary_loss_mlp": 0.00763631, "balance_loss_clip": 1.04808378, "balance_loss_mlp": 1.00027728, "epoch": 0.4546383695064029, "flos": 21763303303680.0, "grad_norm": 2.035230610942664, "language_loss": 0.81760383, "learning_rate": 2.3885570022357326e-06, "loss": 0.83677423, "num_input_tokens_seen": 81402915, "step": 3781, "time_per_iteration": 3.4947707653045654 }, { "auxiliary_loss_clip": 0.01056896, "auxiliary_loss_mlp": 0.01002267, "balance_loss_clip": 1.01551509, "balance_loss_mlp": 1.00039566, "epoch": 0.454758612397042, "flos": 64242755694720.0, "grad_norm": 0.7968785258889399, "language_loss": 0.60880476, "learning_rate": 2.38779284334037e-06, "loss": 0.62939644, "num_input_tokens_seen": 81467890, "step": 3782, "time_per_iteration": 4.178065061569214 }, { "auxiliary_loss_clip": 0.01110697, "auxiliary_loss_mlp": 0.01024176, "balance_loss_clip": 1.04062593, "balance_loss_mlp": 1.01581323, "epoch": 0.4548788552876811, "flos": 27304175485440.0, "grad_norm": 2.1072543104814443, "language_loss": 0.7842083, "learning_rate": 2.387028625617141e-06, "loss": 0.80555701, "num_input_tokens_seen": 81487105, "step": 3783, "time_per_iteration": 2.7396762371063232 }, { "auxiliary_loss_clip": 0.01143669, "auxiliary_loss_mlp": 0.01030043, "balance_loss_clip": 1.0443058, "balance_loss_mlp": 1.02196038, "epoch": 0.4549990981783202, "flos": 22857142222080.0, "grad_norm": 1.9047748447225141, "language_loss": 0.84932172, "learning_rate": 2.3862643491819766e-06, "loss": 0.87105882, "num_input_tokens_seen": 81505670, "step": 3784, "time_per_iteration": 2.640009641647339 }, { "auxiliary_loss_clip": 0.01164894, "auxiliary_loss_mlp": 0.0102707, "balance_loss_clip": 1.04534793, "balance_loss_mlp": 1.0191431, "epoch": 0.4551193410689593, "flos": 23258587599360.0, "grad_norm": 1.6857143769581937, "language_loss": 0.84770703, "learning_rate": 2.3855000141508186e-06, "loss": 0.8696267, "num_input_tokens_seen": 81525825, "step": 3785, "time_per_iteration": 3.5934386253356934 }, { "auxiliary_loss_clip": 0.01159447, "auxiliary_loss_mlp": 0.01030636, "balance_loss_clip": 1.04959106, "balance_loss_mlp": 1.0224998, "epoch": 0.4552395839595984, "flos": 20777519473920.0, "grad_norm": 2.509303787151921, "language_loss": 0.84749997, "learning_rate": 2.3847356206396143e-06, "loss": 0.86940074, "num_input_tokens_seen": 81543135, "step": 3786, "time_per_iteration": 3.662792205810547 }, { "auxiliary_loss_clip": 0.01186679, "auxiliary_loss_mlp": 0.01028849, "balance_loss_clip": 1.05290186, "balance_loss_mlp": 1.02079618, "epoch": 0.45535982685023746, "flos": 23257510191360.0, "grad_norm": 1.6815497593397553, "language_loss": 0.78570557, "learning_rate": 2.3839711687643227e-06, "loss": 0.80786085, "num_input_tokens_seen": 81564360, "step": 3787, "time_per_iteration": 2.6299681663513184 }, { "auxiliary_loss_clip": 0.01171462, "auxiliary_loss_mlp": 0.01032496, "balance_loss_clip": 1.05115628, "balance_loss_mlp": 1.0236032, "epoch": 0.45548006974087657, "flos": 19646117907840.0, "grad_norm": 2.302172489667698, "language_loss": 0.74058402, "learning_rate": 2.38320665864091e-06, "loss": 0.76262367, "num_input_tokens_seen": 81583710, "step": 3788, "time_per_iteration": 2.6754024028778076 }, { "auxiliary_loss_clip": 0.01112396, "auxiliary_loss_mlp": 0.01032987, "balance_loss_clip": 1.03972554, "balance_loss_mlp": 1.02348578, "epoch": 0.4556003126315157, "flos": 20047778766720.0, "grad_norm": 2.1263410381049077, "language_loss": 0.81948066, "learning_rate": 2.3824420903853516e-06, "loss": 0.84093451, "num_input_tokens_seen": 81602175, "step": 3789, "time_per_iteration": 2.8183977603912354 }, { "auxiliary_loss_clip": 0.01172468, "auxiliary_loss_mlp": 0.01025056, "balance_loss_clip": 1.05202723, "balance_loss_mlp": 1.01692557, "epoch": 0.45572055552215474, "flos": 22959738443520.0, "grad_norm": 2.3375679170963157, "language_loss": 0.81669652, "learning_rate": 2.3816774641136324e-06, "loss": 0.83867168, "num_input_tokens_seen": 81619430, "step": 3790, "time_per_iteration": 2.65752911567688 }, { "auxiliary_loss_clip": 0.01167366, "auxiliary_loss_mlp": 0.00763283, "balance_loss_clip": 1.04864526, "balance_loss_mlp": 1.00016356, "epoch": 0.45584079841279385, "flos": 33109925535360.0, "grad_norm": 1.7943307482343498, "language_loss": 0.71138734, "learning_rate": 2.380912779941745e-06, "loss": 0.73069382, "num_input_tokens_seen": 81642550, "step": 3791, "time_per_iteration": 2.8226561546325684 }, { "auxiliary_loss_clip": 0.01168272, "auxiliary_loss_mlp": 0.01031665, "balance_loss_clip": 1.04428411, "balance_loss_mlp": 1.02267087, "epoch": 0.45596104130343296, "flos": 27272179445760.0, "grad_norm": 4.1790098498457455, "language_loss": 0.8354494, "learning_rate": 2.3801480379856918e-06, "loss": 0.85744876, "num_input_tokens_seen": 81664260, "step": 3792, "time_per_iteration": 2.7123584747314453 }, { "auxiliary_loss_clip": 0.01156652, "auxiliary_loss_mlp": 0.01030704, "balance_loss_clip": 1.05001116, "balance_loss_mlp": 1.02251387, "epoch": 0.456081284194072, "flos": 21579799697280.0, "grad_norm": 2.0471353402476136, "language_loss": 0.838588, "learning_rate": 2.379383238361484e-06, "loss": 0.86046153, "num_input_tokens_seen": 81683620, "step": 3793, "time_per_iteration": 2.7634336948394775 }, { "auxiliary_loss_clip": 0.01167659, "auxiliary_loss_mlp": 0.01024496, "balance_loss_clip": 1.04732227, "balance_loss_mlp": 1.01703644, "epoch": 0.4562015270847111, "flos": 35918822113920.0, "grad_norm": 2.198276031758361, "language_loss": 0.79751611, "learning_rate": 2.3786183811851407e-06, "loss": 0.81943762, "num_input_tokens_seen": 81704325, "step": 3794, "time_per_iteration": 2.8194656372070312 }, { "auxiliary_loss_clip": 0.01186235, "auxiliary_loss_mlp": 0.01035554, "balance_loss_clip": 1.05191207, "balance_loss_mlp": 1.02711368, "epoch": 0.45632176997535023, "flos": 13589783602560.0, "grad_norm": 1.8426128131300044, "language_loss": 0.80074048, "learning_rate": 2.3778534665726892e-06, "loss": 0.82295835, "num_input_tokens_seen": 81721155, "step": 3795, "time_per_iteration": 2.684628963470459 }, { "auxiliary_loss_clip": 0.01162906, "auxiliary_loss_mlp": 0.01038914, "balance_loss_clip": 1.05065536, "balance_loss_mlp": 1.03092396, "epoch": 0.4564420128659893, "flos": 32635401937920.0, "grad_norm": 1.857027331481611, "language_loss": 0.72885811, "learning_rate": 2.377088494640168e-06, "loss": 0.75087637, "num_input_tokens_seen": 81742905, "step": 3796, "time_per_iteration": 2.6828017234802246 }, { "auxiliary_loss_clip": 0.01166717, "auxiliary_loss_mlp": 0.01031179, "balance_loss_clip": 1.05035233, "balance_loss_mlp": 1.02307236, "epoch": 0.4565622557566284, "flos": 20377690208640.0, "grad_norm": 1.7908306434636296, "language_loss": 0.78147316, "learning_rate": 2.3763234655036216e-06, "loss": 0.80345213, "num_input_tokens_seen": 81762105, "step": 3797, "time_per_iteration": 2.69478440284729 }, { "auxiliary_loss_clip": 0.01139073, "auxiliary_loss_mlp": 0.01030467, "balance_loss_clip": 1.04252326, "balance_loss_mlp": 1.0226469, "epoch": 0.45668249864726745, "flos": 25374372364800.0, "grad_norm": 2.289305841498056, "language_loss": 0.86790657, "learning_rate": 2.3755583792791046e-06, "loss": 0.88960201, "num_input_tokens_seen": 81781975, "step": 3798, "time_per_iteration": 2.7829136848449707 }, { "auxiliary_loss_clip": 0.01173688, "auxiliary_loss_mlp": 0.01028805, "balance_loss_clip": 1.05053592, "balance_loss_mlp": 1.02033532, "epoch": 0.45680274153790656, "flos": 15559806977280.0, "grad_norm": 2.092698032288108, "language_loss": 0.74330807, "learning_rate": 2.3747932360826803e-06, "loss": 0.765333, "num_input_tokens_seen": 81798905, "step": 3799, "time_per_iteration": 2.655580520629883 }, { "auxiliary_loss_clip": 0.01170217, "auxiliary_loss_mlp": 0.0102724, "balance_loss_clip": 1.05076432, "balance_loss_mlp": 1.01944971, "epoch": 0.4569229844285457, "flos": 19792884879360.0, "grad_norm": 1.922964460493744, "language_loss": 0.82423472, "learning_rate": 2.3740280360304205e-06, "loss": 0.84620929, "num_input_tokens_seen": 81816630, "step": 3800, "time_per_iteration": 2.6316416263580322 }, { "auxiliary_loss_clip": 0.01140988, "auxiliary_loss_mlp": 0.01026472, "balance_loss_clip": 1.04856706, "balance_loss_mlp": 1.01876771, "epoch": 0.45704322731918473, "flos": 24093941270400.0, "grad_norm": 1.734900450550298, "language_loss": 0.68390501, "learning_rate": 2.3732627792384038e-06, "loss": 0.70557964, "num_input_tokens_seen": 81837700, "step": 3801, "time_per_iteration": 2.7590556144714355 }, { "auxiliary_loss_clip": 0.01185689, "auxiliary_loss_mlp": 0.0102643, "balance_loss_clip": 1.05093622, "balance_loss_mlp": 1.01813889, "epoch": 0.45716347020982384, "flos": 31317803245440.0, "grad_norm": 1.986594110911609, "language_loss": 0.75203609, "learning_rate": 2.3724974658227207e-06, "loss": 0.77415729, "num_input_tokens_seen": 81858490, "step": 3802, "time_per_iteration": 2.6969940662384033 }, { "auxiliary_loss_clip": 0.01154483, "auxiliary_loss_mlp": 0.00762988, "balance_loss_clip": 1.04831457, "balance_loss_mlp": 1.00019026, "epoch": 0.45728371310046295, "flos": 26501392471680.0, "grad_norm": 2.1154809202072733, "language_loss": 0.71372551, "learning_rate": 2.3717320958994687e-06, "loss": 0.73290026, "num_input_tokens_seen": 81876050, "step": 3803, "time_per_iteration": 2.707887887954712 }, { "auxiliary_loss_clip": 0.01141777, "auxiliary_loss_mlp": 0.01029328, "balance_loss_clip": 1.04199564, "balance_loss_mlp": 1.02083397, "epoch": 0.457403955991102, "flos": 17929408222080.0, "grad_norm": 1.948001932407079, "language_loss": 0.70452142, "learning_rate": 2.3709666695847534e-06, "loss": 0.72623247, "num_input_tokens_seen": 81894230, "step": 3804, "time_per_iteration": 2.7754030227661133 }, { "auxiliary_loss_clip": 0.01118759, "auxiliary_loss_mlp": 0.01030172, "balance_loss_clip": 1.04261088, "balance_loss_mlp": 1.0226438, "epoch": 0.4575241988817411, "flos": 42230660837760.0, "grad_norm": 1.6378909739955605, "language_loss": 0.70500433, "learning_rate": 2.370201186994689e-06, "loss": 0.7264936, "num_input_tokens_seen": 81917915, "step": 3805, "time_per_iteration": 2.866208076477051 }, { "auxiliary_loss_clip": 0.01148134, "auxiliary_loss_mlp": 0.01028549, "balance_loss_clip": 1.04751372, "balance_loss_mlp": 1.02023447, "epoch": 0.45764444177238023, "flos": 30117309868800.0, "grad_norm": 1.8554025236356495, "language_loss": 0.69894546, "learning_rate": 2.369435648245399e-06, "loss": 0.7207123, "num_input_tokens_seen": 81938130, "step": 3806, "time_per_iteration": 2.690476417541504 }, { "auxiliary_loss_clip": 0.0115639, "auxiliary_loss_mlp": 0.01038349, "balance_loss_clip": 1.04814303, "balance_loss_mlp": 1.03045142, "epoch": 0.4577646846630193, "flos": 24060293205120.0, "grad_norm": 1.729903981343227, "language_loss": 0.85032493, "learning_rate": 2.368670053453015e-06, "loss": 0.87227225, "num_input_tokens_seen": 81959820, "step": 3807, "time_per_iteration": 3.644468307495117 }, { "auxiliary_loss_clip": 0.01178751, "auxiliary_loss_mlp": 0.01023622, "balance_loss_clip": 1.05355835, "balance_loss_mlp": 1.01568866, "epoch": 0.4578849275536584, "flos": 17418578952960.0, "grad_norm": 2.2202937536825145, "language_loss": 0.74259067, "learning_rate": 2.3679044027336757e-06, "loss": 0.7646144, "num_input_tokens_seen": 81975710, "step": 3808, "time_per_iteration": 3.5734386444091797 }, { "auxiliary_loss_clip": 0.01186022, "auxiliary_loss_mlp": 0.01031171, "balance_loss_clip": 1.05068493, "balance_loss_mlp": 1.02243948, "epoch": 0.4580051704442975, "flos": 13510169107200.0, "grad_norm": 3.275086906572407, "language_loss": 0.69558609, "learning_rate": 2.3671386962035326e-06, "loss": 0.717758, "num_input_tokens_seen": 81993180, "step": 3809, "time_per_iteration": 2.689065456390381 }, { "auxiliary_loss_clip": 0.01172707, "auxiliary_loss_mlp": 0.01030941, "balance_loss_clip": 1.05056989, "balance_loss_mlp": 1.02258432, "epoch": 0.45812541333493656, "flos": 18037606965120.0, "grad_norm": 2.3782803002006583, "language_loss": 0.68889087, "learning_rate": 2.3663729339787405e-06, "loss": 0.71092737, "num_input_tokens_seen": 82010115, "step": 3810, "time_per_iteration": 2.638394594192505 }, { "auxiliary_loss_clip": 0.01186051, "auxiliary_loss_mlp": 0.01034181, "balance_loss_clip": 1.05175996, "balance_loss_mlp": 1.02534795, "epoch": 0.45824565622557567, "flos": 20222196232320.0, "grad_norm": 2.722942157169025, "language_loss": 0.73177552, "learning_rate": 2.365607116175466e-06, "loss": 0.75397778, "num_input_tokens_seen": 82025540, "step": 3811, "time_per_iteration": 3.522378444671631 }, { "auxiliary_loss_clip": 0.01184483, "auxiliary_loss_mlp": 0.01024535, "balance_loss_clip": 1.05075645, "balance_loss_mlp": 1.01651788, "epoch": 0.4583658991162148, "flos": 19864885691520.0, "grad_norm": 3.3254266495646956, "language_loss": 0.67179221, "learning_rate": 2.3648412429098825e-06, "loss": 0.69388241, "num_input_tokens_seen": 82043890, "step": 3812, "time_per_iteration": 3.56215238571167 }, { "auxiliary_loss_clip": 0.01137894, "auxiliary_loss_mlp": 0.0103124, "balance_loss_clip": 1.04700923, "balance_loss_mlp": 1.0216496, "epoch": 0.45848614200685384, "flos": 21029935322880.0, "grad_norm": 2.5449367748011387, "language_loss": 0.82080263, "learning_rate": 2.364075314298172e-06, "loss": 0.84249401, "num_input_tokens_seen": 82061345, "step": 3813, "time_per_iteration": 2.653594493865967 }, { "auxiliary_loss_clip": 0.01173935, "auxiliary_loss_mlp": 0.00763785, "balance_loss_clip": 1.04968166, "balance_loss_mlp": 1.00021076, "epoch": 0.45860638489749295, "flos": 21069293650560.0, "grad_norm": 1.9660841603347303, "language_loss": 0.69973695, "learning_rate": 2.3633093304565267e-06, "loss": 0.71911418, "num_input_tokens_seen": 82080400, "step": 3814, "time_per_iteration": 2.733041763305664 }, { "auxiliary_loss_clip": 0.01192118, "auxiliary_loss_mlp": 0.01028955, "balance_loss_clip": 1.05523968, "balance_loss_mlp": 1.02044892, "epoch": 0.458726627788132, "flos": 26833889692800.0, "grad_norm": 1.9613259922265862, "language_loss": 0.63070661, "learning_rate": 2.3625432915011443e-06, "loss": 0.65291733, "num_input_tokens_seen": 82102310, "step": 3815, "time_per_iteration": 2.636105537414551 }, { "auxiliary_loss_clip": 0.01148989, "auxiliary_loss_mlp": 0.01033379, "balance_loss_clip": 1.04598403, "balance_loss_mlp": 1.02450347, "epoch": 0.4588468706787711, "flos": 24097927680000.0, "grad_norm": 1.7390973396102092, "language_loss": 0.65573537, "learning_rate": 2.3617771975482334e-06, "loss": 0.67755902, "num_input_tokens_seen": 82121140, "step": 3816, "time_per_iteration": 2.7017769813537598 }, { "auxiliary_loss_clip": 0.01124399, "auxiliary_loss_mlp": 0.01034733, "balance_loss_clip": 1.04464364, "balance_loss_mlp": 1.0265379, "epoch": 0.4589671135694102, "flos": 17889331622400.0, "grad_norm": 1.6910284386742453, "language_loss": 0.74327016, "learning_rate": 2.3610110487140083e-06, "loss": 0.76486152, "num_input_tokens_seen": 82139575, "step": 3817, "time_per_iteration": 2.7090156078338623 }, { "auxiliary_loss_clip": 0.01157608, "auxiliary_loss_mlp": 0.01027126, "balance_loss_clip": 1.04968405, "balance_loss_mlp": 1.01910329, "epoch": 0.4590873564600493, "flos": 25626967781760.0, "grad_norm": 1.7776623124767856, "language_loss": 0.80670756, "learning_rate": 2.360244845114695e-06, "loss": 0.82855487, "num_input_tokens_seen": 82159195, "step": 3818, "time_per_iteration": 2.7197089195251465 }, { "auxiliary_loss_clip": 0.0115057, "auxiliary_loss_mlp": 0.01028055, "balance_loss_clip": 1.0491209, "balance_loss_mlp": 1.02010369, "epoch": 0.4592075993506884, "flos": 18514788168960.0, "grad_norm": 2.3070480130475914, "language_loss": 0.68525219, "learning_rate": 2.3594785868665245e-06, "loss": 0.7070384, "num_input_tokens_seen": 82175500, "step": 3819, "time_per_iteration": 2.6559667587280273 }, { "auxiliary_loss_clip": 0.01145237, "auxiliary_loss_mlp": 0.00763428, "balance_loss_clip": 1.04859281, "balance_loss_mlp": 1.00016046, "epoch": 0.4593278422413275, "flos": 20631111638400.0, "grad_norm": 1.9523955743573738, "language_loss": 0.80428356, "learning_rate": 2.3587122740857386e-06, "loss": 0.82337022, "num_input_tokens_seen": 82192600, "step": 3820, "time_per_iteration": 2.7656543254852295 }, { "auxiliary_loss_clip": 0.01168806, "auxiliary_loss_mlp": 0.01023422, "balance_loss_clip": 1.04884219, "balance_loss_mlp": 1.01651955, "epoch": 0.45944808513196655, "flos": 21358517961600.0, "grad_norm": 1.7667110949097864, "language_loss": 0.7804119, "learning_rate": 2.357945906888586e-06, "loss": 0.80233419, "num_input_tokens_seen": 82212040, "step": 3821, "time_per_iteration": 2.7187771797180176 }, { "auxiliary_loss_clip": 0.0117185, "auxiliary_loss_mlp": 0.01033061, "balance_loss_clip": 1.05050504, "balance_loss_mlp": 1.0242393, "epoch": 0.45956832802260567, "flos": 21427789340160.0, "grad_norm": 2.680941359972959, "language_loss": 0.80118525, "learning_rate": 2.357179485391324e-06, "loss": 0.82323438, "num_input_tokens_seen": 82229895, "step": 3822, "time_per_iteration": 2.7255406379699707 }, { "auxiliary_loss_clip": 0.01184003, "auxiliary_loss_mlp": 0.01030312, "balance_loss_clip": 1.05197001, "balance_loss_mlp": 1.02208066, "epoch": 0.4596885709132448, "flos": 22382654538240.0, "grad_norm": 2.508324661546939, "language_loss": 0.86311257, "learning_rate": 2.3564130097102173e-06, "loss": 0.88525569, "num_input_tokens_seen": 82249550, "step": 3823, "time_per_iteration": 2.675727128982544 }, { "auxiliary_loss_clip": 0.01148263, "auxiliary_loss_mlp": 0.01032315, "balance_loss_clip": 1.04824567, "balance_loss_mlp": 1.02403545, "epoch": 0.45980881380388383, "flos": 28981957806720.0, "grad_norm": 1.7674743409379776, "language_loss": 0.75412929, "learning_rate": 2.355646479961541e-06, "loss": 0.77593505, "num_input_tokens_seen": 82268860, "step": 3824, "time_per_iteration": 2.7737581729888916 }, { "auxiliary_loss_clip": 0.01185078, "auxiliary_loss_mlp": 0.01024507, "balance_loss_clip": 1.05121303, "balance_loss_mlp": 1.01652026, "epoch": 0.45992905669452294, "flos": 33396599980800.0, "grad_norm": 1.8598177290271187, "language_loss": 0.71620399, "learning_rate": 2.354879896261576e-06, "loss": 0.73829985, "num_input_tokens_seen": 82289070, "step": 3825, "time_per_iteration": 2.731172800064087 }, { "auxiliary_loss_clip": 0.01137687, "auxiliary_loss_mlp": 0.01030767, "balance_loss_clip": 1.0481149, "balance_loss_mlp": 1.0222497, "epoch": 0.46004929958516205, "flos": 36318184502400.0, "grad_norm": 2.0361364384270644, "language_loss": 0.5669719, "learning_rate": 2.3541132587266133e-06, "loss": 0.58865643, "num_input_tokens_seen": 82311790, "step": 3826, "time_per_iteration": 2.8080198764801025 }, { "auxiliary_loss_clip": 0.01148223, "auxiliary_loss_mlp": 0.01030635, "balance_loss_clip": 1.04961205, "balance_loss_mlp": 1.0219568, "epoch": 0.4601695424758011, "flos": 17238451224960.0, "grad_norm": 4.180153879432144, "language_loss": 0.69292665, "learning_rate": 2.3533465674729515e-06, "loss": 0.71471524, "num_input_tokens_seen": 82329020, "step": 3827, "time_per_iteration": 2.694967746734619 }, { "auxiliary_loss_clip": 0.01187506, "auxiliary_loss_mlp": 0.01036195, "balance_loss_clip": 1.05341446, "balance_loss_mlp": 1.02759361, "epoch": 0.4602897853664402, "flos": 15888425529600.0, "grad_norm": 2.1342636477737753, "language_loss": 0.72766173, "learning_rate": 2.352579822616895e-06, "loss": 0.74989867, "num_input_tokens_seen": 82346455, "step": 3828, "time_per_iteration": 2.654542922973633 }, { "auxiliary_loss_clip": 0.0115907, "auxiliary_loss_mlp": 0.01021498, "balance_loss_clip": 1.05054283, "balance_loss_mlp": 1.01356745, "epoch": 0.4604100282570793, "flos": 25412617370880.0, "grad_norm": 1.8964156694111172, "language_loss": 0.77667391, "learning_rate": 2.351813024274761e-06, "loss": 0.79847962, "num_input_tokens_seen": 82367810, "step": 3829, "time_per_iteration": 2.6943414211273193 }, { "auxiliary_loss_clip": 0.01146484, "auxiliary_loss_mlp": 0.01025476, "balance_loss_clip": 1.04884255, "balance_loss_mlp": 1.01761127, "epoch": 0.4605302711477184, "flos": 27630711048960.0, "grad_norm": 1.8083484584285114, "language_loss": 0.73786402, "learning_rate": 2.3510461725628693e-06, "loss": 0.75958365, "num_input_tokens_seen": 82388275, "step": 3830, "time_per_iteration": 2.693460702896118 }, { "auxiliary_loss_clip": 0.0114455, "auxiliary_loss_mlp": 0.01031229, "balance_loss_clip": 1.04831338, "balance_loss_mlp": 1.0232892, "epoch": 0.4606505140383575, "flos": 23839657914240.0, "grad_norm": 2.15721949759543, "language_loss": 0.7107932, "learning_rate": 2.350279267597554e-06, "loss": 0.73255098, "num_input_tokens_seen": 82408915, "step": 3831, "time_per_iteration": 2.7996551990509033 }, { "auxiliary_loss_clip": 0.01171953, "auxiliary_loss_mlp": 0.01027899, "balance_loss_clip": 1.05049241, "balance_loss_mlp": 1.01926827, "epoch": 0.46077075692899655, "flos": 16107013745280.0, "grad_norm": 2.305715771050497, "language_loss": 0.83032143, "learning_rate": 2.3495123094951515e-06, "loss": 0.85231996, "num_input_tokens_seen": 82427260, "step": 3832, "time_per_iteration": 2.6804981231689453 }, { "auxiliary_loss_clip": 0.01148995, "auxiliary_loss_mlp": 0.01028715, "balance_loss_clip": 1.04835498, "balance_loss_mlp": 1.02032292, "epoch": 0.46089099981963566, "flos": 48798147634560.0, "grad_norm": 2.6011501881987216, "language_loss": 0.75793958, "learning_rate": 2.34874529837201e-06, "loss": 0.77971667, "num_input_tokens_seen": 82450805, "step": 3833, "time_per_iteration": 4.834528923034668 }, { "auxiliary_loss_clip": 0.01104998, "auxiliary_loss_mlp": 0.01029453, "balance_loss_clip": 1.04086947, "balance_loss_mlp": 1.02134073, "epoch": 0.46101124271027477, "flos": 19099234362240.0, "grad_norm": 2.123662846856711, "language_loss": 0.79130423, "learning_rate": 2.347978234344483e-06, "loss": 0.81264877, "num_input_tokens_seen": 82467010, "step": 3834, "time_per_iteration": 2.7935197353363037 }, { "auxiliary_loss_clip": 0.01177461, "auxiliary_loss_mlp": 0.01035178, "balance_loss_clip": 1.05286276, "balance_loss_mlp": 1.02589488, "epoch": 0.4611314856009138, "flos": 39347931853440.0, "grad_norm": 1.830981154374917, "language_loss": 0.69118893, "learning_rate": 2.347211117528935e-06, "loss": 0.71331537, "num_input_tokens_seen": 82489310, "step": 3835, "time_per_iteration": 2.8139333724975586 }, { "auxiliary_loss_clip": 0.01151984, "auxiliary_loss_mlp": 0.01034845, "balance_loss_clip": 1.0519768, "balance_loss_mlp": 1.02659583, "epoch": 0.46125172849155294, "flos": 20810772489600.0, "grad_norm": 1.5587743291800389, "language_loss": 0.7187106, "learning_rate": 2.3464439480417374e-06, "loss": 0.74057889, "num_input_tokens_seen": 82508830, "step": 3836, "time_per_iteration": 3.6322555541992188 }, { "auxiliary_loss_clip": 0.0117897, "auxiliary_loss_mlp": 0.01028747, "balance_loss_clip": 1.05469489, "balance_loss_mlp": 1.01920986, "epoch": 0.46137197138219205, "flos": 17930808852480.0, "grad_norm": 2.6436250763463542, "language_loss": 0.77405608, "learning_rate": 2.3456767259992676e-06, "loss": 0.79613328, "num_input_tokens_seen": 82526475, "step": 3837, "time_per_iteration": 2.6096034049987793 }, { "auxiliary_loss_clip": 0.0118727, "auxiliary_loss_mlp": 0.00763198, "balance_loss_clip": 1.05133927, "balance_loss_mlp": 1.00015545, "epoch": 0.4614922142728311, "flos": 16836610798080.0, "grad_norm": 2.547858527077774, "language_loss": 0.88318008, "learning_rate": 2.3449094515179135e-06, "loss": 0.90268475, "num_input_tokens_seen": 82543935, "step": 3838, "time_per_iteration": 3.5027754306793213 }, { "auxiliary_loss_clip": 0.01160092, "auxiliary_loss_mlp": 0.01026096, "balance_loss_clip": 1.04692447, "balance_loss_mlp": 1.01809108, "epoch": 0.4616124571634702, "flos": 26614906427520.0, "grad_norm": 1.568236967828726, "language_loss": 0.81784105, "learning_rate": 2.34414212471407e-06, "loss": 0.8397029, "num_input_tokens_seen": 82563730, "step": 3839, "time_per_iteration": 2.6916425228118896 }, { "auxiliary_loss_clip": 0.0117905, "auxiliary_loss_mlp": 0.01029896, "balance_loss_clip": 1.05133379, "balance_loss_mlp": 1.02062726, "epoch": 0.4617327000541093, "flos": 20340127560960.0, "grad_norm": 1.9965868580968595, "language_loss": 0.7295295, "learning_rate": 2.3433747457041394e-06, "loss": 0.75161886, "num_input_tokens_seen": 82582435, "step": 3840, "time_per_iteration": 2.6610267162323 }, { "auxiliary_loss_clip": 0.01143233, "auxiliary_loss_mlp": 0.01028695, "balance_loss_clip": 1.04903841, "balance_loss_mlp": 1.01955748, "epoch": 0.4618529429447484, "flos": 29570749545600.0, "grad_norm": 1.9347505496803592, "language_loss": 0.84917074, "learning_rate": 2.342607314604533e-06, "loss": 0.87089008, "num_input_tokens_seen": 82602185, "step": 3841, "time_per_iteration": 2.7545342445373535 }, { "auxiliary_loss_clip": 0.01171788, "auxiliary_loss_mlp": 0.01029459, "balance_loss_clip": 1.05355382, "balance_loss_mlp": 1.01995838, "epoch": 0.4619731858353875, "flos": 19787030962560.0, "grad_norm": 6.666626297566391, "language_loss": 0.84304082, "learning_rate": 2.3418398315316694e-06, "loss": 0.8650533, "num_input_tokens_seen": 82620005, "step": 3842, "time_per_iteration": 2.7001709938049316 }, { "auxiliary_loss_clip": 0.01187307, "auxiliary_loss_mlp": 0.01023824, "balance_loss_clip": 1.05411696, "balance_loss_mlp": 1.01577187, "epoch": 0.4620934287260266, "flos": 18951138587520.0, "grad_norm": 2.2766224116393405, "language_loss": 0.78259599, "learning_rate": 2.3410722966019755e-06, "loss": 0.80470729, "num_input_tokens_seen": 82635120, "step": 3843, "time_per_iteration": 2.647512674331665 }, { "auxiliary_loss_clip": 0.0116939, "auxiliary_loss_mlp": 0.01031142, "balance_loss_clip": 1.05019999, "balance_loss_mlp": 1.02339935, "epoch": 0.46221367161666566, "flos": 37341674634240.0, "grad_norm": 2.5564385581536007, "language_loss": 0.65524805, "learning_rate": 2.3403047099318848e-06, "loss": 0.67725337, "num_input_tokens_seen": 82659190, "step": 3844, "time_per_iteration": 2.783294677734375 }, { "auxiliary_loss_clip": 0.01120497, "auxiliary_loss_mlp": 0.01025188, "balance_loss_clip": 1.0418998, "balance_loss_mlp": 1.01696825, "epoch": 0.46233391450730477, "flos": 14428549065600.0, "grad_norm": 2.5797584752317313, "language_loss": 0.74743116, "learning_rate": 2.3395370716378405e-06, "loss": 0.768888, "num_input_tokens_seen": 82676635, "step": 3845, "time_per_iteration": 2.674377679824829 }, { "auxiliary_loss_clip": 0.01175237, "auxiliary_loss_mlp": 0.01025083, "balance_loss_clip": 1.05083787, "balance_loss_mlp": 1.01642871, "epoch": 0.4624541573979438, "flos": 22493044010880.0, "grad_norm": 2.2679206490784822, "language_loss": 0.72343349, "learning_rate": 2.338769381836292e-06, "loss": 0.74543667, "num_input_tokens_seen": 82696245, "step": 3846, "time_per_iteration": 2.857036828994751 }, { "auxiliary_loss_clip": 0.01143087, "auxiliary_loss_mlp": 0.01030077, "balance_loss_clip": 1.05101824, "balance_loss_mlp": 1.02217388, "epoch": 0.46257440028858293, "flos": 14465070218880.0, "grad_norm": 1.9066503631982015, "language_loss": 0.73297554, "learning_rate": 2.3380016406436984e-06, "loss": 0.75470716, "num_input_tokens_seen": 82713725, "step": 3847, "time_per_iteration": 2.679553747177124 }, { "auxiliary_loss_clip": 0.01127516, "auxiliary_loss_mlp": 0.0102585, "balance_loss_clip": 1.04882884, "balance_loss_mlp": 1.01668906, "epoch": 0.46269464317922204, "flos": 23332204523520.0, "grad_norm": 2.041135317237289, "language_loss": 0.81624365, "learning_rate": 2.337233848176524e-06, "loss": 0.83777732, "num_input_tokens_seen": 82731495, "step": 3848, "time_per_iteration": 2.729567766189575 }, { "auxiliary_loss_clip": 0.01120437, "auxiliary_loss_mlp": 0.01030813, "balance_loss_clip": 1.0438211, "balance_loss_mlp": 1.02203357, "epoch": 0.4628148860698611, "flos": 18552027594240.0, "grad_norm": 1.9321330338105165, "language_loss": 0.83513039, "learning_rate": 2.3364660045512435e-06, "loss": 0.85664296, "num_input_tokens_seen": 82750255, "step": 3849, "time_per_iteration": 2.7877683639526367 }, { "auxiliary_loss_clip": 0.01060181, "auxiliary_loss_mlp": 0.01009802, "balance_loss_clip": 1.01761532, "balance_loss_mlp": 1.00809693, "epoch": 0.4629351289605002, "flos": 70667569670400.0, "grad_norm": 0.8352963142381657, "language_loss": 0.58232033, "learning_rate": 2.335698109884337e-06, "loss": 0.60302019, "num_input_tokens_seen": 82815460, "step": 3850, "time_per_iteration": 3.4033608436584473 }, { "auxiliary_loss_clip": 0.01052099, "auxiliary_loss_mlp": 0.01004457, "balance_loss_clip": 1.02902687, "balance_loss_mlp": 1.00278759, "epoch": 0.4630553718511393, "flos": 59687200465920.0, "grad_norm": 0.7852880779529042, "language_loss": 0.59865332, "learning_rate": 2.334930164292294e-06, "loss": 0.61921889, "num_input_tokens_seen": 82878010, "step": 3851, "time_per_iteration": 3.459230899810791 }, { "auxiliary_loss_clip": 0.0112122, "auxiliary_loss_mlp": 0.01032099, "balance_loss_clip": 1.0427264, "balance_loss_mlp": 1.02396929, "epoch": 0.4631756147417784, "flos": 15960605909760.0, "grad_norm": 1.954813854944133, "language_loss": 0.79889292, "learning_rate": 2.334162167891612e-06, "loss": 0.82042611, "num_input_tokens_seen": 82895275, "step": 3852, "time_per_iteration": 2.933835029602051 }, { "auxiliary_loss_clip": 0.01162005, "auxiliary_loss_mlp": 0.01028227, "balance_loss_clip": 1.04913259, "balance_loss_mlp": 1.01957214, "epoch": 0.4632958576324175, "flos": 16472907636480.0, "grad_norm": 2.076889048683533, "language_loss": 0.75121981, "learning_rate": 2.333394120798795e-06, "loss": 0.77312213, "num_input_tokens_seen": 82914010, "step": 3853, "time_per_iteration": 2.6549072265625 }, { "auxiliary_loss_clip": 0.01156637, "auxiliary_loss_mlp": 0.01024321, "balance_loss_clip": 1.04721332, "balance_loss_mlp": 1.0164237, "epoch": 0.4634161005230566, "flos": 22346492520960.0, "grad_norm": 2.529322269152144, "language_loss": 0.72005647, "learning_rate": 2.3326260231303545e-06, "loss": 0.74186611, "num_input_tokens_seen": 82932610, "step": 3854, "time_per_iteration": 2.722343921661377 }, { "auxiliary_loss_clip": 0.01183604, "auxiliary_loss_mlp": 0.01026289, "balance_loss_clip": 1.0521425, "balance_loss_mlp": 1.01871669, "epoch": 0.46353634341369565, "flos": 15742233175680.0, "grad_norm": 1.6618001694861786, "language_loss": 0.87005103, "learning_rate": 2.331857875002811e-06, "loss": 0.89214998, "num_input_tokens_seen": 82951210, "step": 3855, "time_per_iteration": 2.6663739681243896 }, { "auxiliary_loss_clip": 0.01159065, "auxiliary_loss_mlp": 0.01024856, "balance_loss_clip": 1.05302477, "balance_loss_mlp": 1.01711345, "epoch": 0.46365658630433476, "flos": 28329820433280.0, "grad_norm": 1.6125485612152428, "language_loss": 0.76332641, "learning_rate": 2.3310896765326916e-06, "loss": 0.78516561, "num_input_tokens_seen": 82972210, "step": 3856, "time_per_iteration": 2.8275814056396484 }, { "auxiliary_loss_clip": 0.01137613, "auxiliary_loss_mlp": 0.01028114, "balance_loss_clip": 1.04733109, "balance_loss_mlp": 1.01976299, "epoch": 0.46377682919497387, "flos": 24608074590720.0, "grad_norm": 1.6634255282702313, "language_loss": 0.84311587, "learning_rate": 2.330321427836531e-06, "loss": 0.86477315, "num_input_tokens_seen": 82994080, "step": 3857, "time_per_iteration": 2.7586252689361572 }, { "auxiliary_loss_clip": 0.01169658, "auxiliary_loss_mlp": 0.01026824, "balance_loss_clip": 1.05126429, "balance_loss_mlp": 1.01908112, "epoch": 0.4638970720856129, "flos": 19060953442560.0, "grad_norm": 4.993732182188965, "language_loss": 0.8280341, "learning_rate": 2.3295531290308733e-06, "loss": 0.84999889, "num_input_tokens_seen": 83012230, "step": 3858, "time_per_iteration": 2.6841845512390137 }, { "auxiliary_loss_clip": 0.01189824, "auxiliary_loss_mlp": 0.00764034, "balance_loss_clip": 1.05464041, "balance_loss_mlp": 1.00021005, "epoch": 0.46401731497625204, "flos": 18471012468480.0, "grad_norm": 3.4836264901156975, "language_loss": 0.76062083, "learning_rate": 2.3287847802322678e-06, "loss": 0.78015941, "num_input_tokens_seen": 83027800, "step": 3859, "time_per_iteration": 3.5625555515289307 }, { "auxiliary_loss_clip": 0.0116461, "auxiliary_loss_mlp": 0.01029743, "balance_loss_clip": 1.05216444, "balance_loss_mlp": 1.02034307, "epoch": 0.4641375578668911, "flos": 26067053214720.0, "grad_norm": 1.925636636515387, "language_loss": 0.84229344, "learning_rate": 2.3280163815572723e-06, "loss": 0.86423695, "num_input_tokens_seen": 83048395, "step": 3860, "time_per_iteration": 2.7129099369049072 }, { "auxiliary_loss_clip": 0.01149464, "auxiliary_loss_mlp": 0.01029719, "balance_loss_clip": 1.04840922, "balance_loss_mlp": 1.02125514, "epoch": 0.4642578007575302, "flos": 19570382081280.0, "grad_norm": 2.3899367267100375, "language_loss": 0.77333945, "learning_rate": 2.3272479331224522e-06, "loss": 0.79513133, "num_input_tokens_seen": 83065825, "step": 3861, "time_per_iteration": 2.734492063522339 }, { "auxiliary_loss_clip": 0.01185802, "auxiliary_loss_mlp": 0.01030294, "balance_loss_clip": 1.0506587, "balance_loss_mlp": 1.0217886, "epoch": 0.4643780436481693, "flos": 28186249772160.0, "grad_norm": 1.7788738651644669, "language_loss": 0.78168547, "learning_rate": 2.3264794350443817e-06, "loss": 0.80384636, "num_input_tokens_seen": 83087920, "step": 3862, "time_per_iteration": 3.577354669570923 }, { "auxiliary_loss_clip": 0.01173997, "auxiliary_loss_mlp": 0.01029522, "balance_loss_clip": 1.04983401, "balance_loss_mlp": 1.02080178, "epoch": 0.46449828653880837, "flos": 25375270204800.0, "grad_norm": 1.8306698245970212, "language_loss": 0.78700507, "learning_rate": 2.3257108874396396e-06, "loss": 0.80904025, "num_input_tokens_seen": 83109015, "step": 3863, "time_per_iteration": 2.769343376159668 }, { "auxiliary_loss_clip": 0.01155549, "auxiliary_loss_mlp": 0.01035987, "balance_loss_clip": 1.0468142, "balance_loss_mlp": 1.02779722, "epoch": 0.4646185294294475, "flos": 16034330574720.0, "grad_norm": 1.9144599639290787, "language_loss": 0.74081886, "learning_rate": 2.3249422904248152e-06, "loss": 0.76273423, "num_input_tokens_seen": 83127450, "step": 3864, "time_per_iteration": 3.7400617599487305 }, { "auxiliary_loss_clip": 0.01175828, "auxiliary_loss_mlp": 0.01032971, "balance_loss_clip": 1.05170524, "balance_loss_mlp": 1.02504611, "epoch": 0.4647387723200866, "flos": 26363101109760.0, "grad_norm": 1.9812387411174233, "language_loss": 0.8695817, "learning_rate": 2.324173644116504e-06, "loss": 0.89166963, "num_input_tokens_seen": 83150300, "step": 3865, "time_per_iteration": 2.7115299701690674 }, { "auxiliary_loss_clip": 0.0116937, "auxiliary_loss_mlp": 0.01030796, "balance_loss_clip": 1.05199504, "balance_loss_mlp": 1.02215374, "epoch": 0.46485901521072565, "flos": 27160209774720.0, "grad_norm": 1.6341360138761267, "language_loss": 0.81403613, "learning_rate": 2.3234049486313087e-06, "loss": 0.83603781, "num_input_tokens_seen": 83171750, "step": 3866, "time_per_iteration": 2.852799415588379 }, { "auxiliary_loss_clip": 0.01171471, "auxiliary_loss_mlp": 0.01026814, "balance_loss_clip": 1.05132329, "balance_loss_mlp": 1.01922011, "epoch": 0.46497925810136476, "flos": 24279851088000.0, "grad_norm": 1.8405904263757935, "language_loss": 0.75671649, "learning_rate": 2.322636204085839e-06, "loss": 0.77869934, "num_input_tokens_seen": 83191820, "step": 3867, "time_per_iteration": 2.7998647689819336 }, { "auxiliary_loss_clip": 0.01146601, "auxiliary_loss_mlp": 0.01030021, "balance_loss_clip": 1.04451084, "balance_loss_mlp": 1.02163458, "epoch": 0.46509950099200387, "flos": 16253134272000.0, "grad_norm": 2.2200537038963937, "language_loss": 0.78956378, "learning_rate": 2.3218674105967143e-06, "loss": 0.81133002, "num_input_tokens_seen": 83210085, "step": 3868, "time_per_iteration": 2.7260873317718506 }, { "auxiliary_loss_clip": 0.01149234, "auxiliary_loss_mlp": 0.01035078, "balance_loss_clip": 1.04635096, "balance_loss_mlp": 1.02675152, "epoch": 0.4652197438826429, "flos": 23442270773760.0, "grad_norm": 1.522847998352799, "language_loss": 0.8342036, "learning_rate": 2.3210985682805593e-06, "loss": 0.85604674, "num_input_tokens_seen": 83231865, "step": 3869, "time_per_iteration": 2.742035150527954 }, { "auxiliary_loss_clip": 0.01188117, "auxiliary_loss_mlp": 0.01030858, "balance_loss_clip": 1.05451369, "balance_loss_mlp": 1.02225685, "epoch": 0.46533998677328203, "flos": 16216397637120.0, "grad_norm": 2.538426337038713, "language_loss": 0.68076175, "learning_rate": 2.320329677254007e-06, "loss": 0.70295149, "num_input_tokens_seen": 83249195, "step": 3870, "time_per_iteration": 2.589747190475464 }, { "auxiliary_loss_clip": 0.01189783, "auxiliary_loss_mlp": 0.01033064, "balance_loss_clip": 1.0565486, "balance_loss_mlp": 1.02467191, "epoch": 0.46546022966392114, "flos": 21141869080320.0, "grad_norm": 2.303044567066429, "language_loss": 0.72253466, "learning_rate": 2.319560737633697e-06, "loss": 0.74476314, "num_input_tokens_seen": 83267915, "step": 3871, "time_per_iteration": 2.602261781692505 }, { "auxiliary_loss_clip": 0.01148298, "auxiliary_loss_mlp": 0.0103108, "balance_loss_clip": 1.04580522, "balance_loss_mlp": 1.02233052, "epoch": 0.4655804725545602, "flos": 41171942442240.0, "grad_norm": 1.7225242059536656, "language_loss": 0.6830616, "learning_rate": 2.3187917495362775e-06, "loss": 0.70485544, "num_input_tokens_seen": 83292325, "step": 3872, "time_per_iteration": 2.8763175010681152 }, { "auxiliary_loss_clip": 0.01126956, "auxiliary_loss_mlp": 0.01028161, "balance_loss_clip": 1.04787397, "balance_loss_mlp": 1.02033174, "epoch": 0.4657007154451993, "flos": 19570956698880.0, "grad_norm": 3.226177377721573, "language_loss": 0.76722568, "learning_rate": 2.318022713078403e-06, "loss": 0.78877681, "num_input_tokens_seen": 83306905, "step": 3873, "time_per_iteration": 2.8014166355133057 }, { "auxiliary_loss_clip": 0.01159028, "auxiliary_loss_mlp": 0.01025266, "balance_loss_clip": 1.04997563, "balance_loss_mlp": 1.01739192, "epoch": 0.4658209583358384, "flos": 15517826956800.0, "grad_norm": 2.2893660919300047, "language_loss": 0.85500157, "learning_rate": 2.3172536283767354e-06, "loss": 0.87684453, "num_input_tokens_seen": 83320665, "step": 3874, "time_per_iteration": 2.661252021789551 }, { "auxiliary_loss_clip": 0.0113822, "auxiliary_loss_mlp": 0.01029447, "balance_loss_clip": 1.04788351, "balance_loss_mlp": 1.02074504, "epoch": 0.4659412012264775, "flos": 14903180403840.0, "grad_norm": 1.9936366509920724, "language_loss": 0.80923271, "learning_rate": 2.3164844955479447e-06, "loss": 0.83090937, "num_input_tokens_seen": 83336475, "step": 3875, "time_per_iteration": 2.809168577194214 }, { "auxiliary_loss_clip": 0.01134962, "auxiliary_loss_mlp": 0.01028867, "balance_loss_clip": 1.04584444, "balance_loss_mlp": 1.02044535, "epoch": 0.4660614441171166, "flos": 24425612478720.0, "grad_norm": 2.110800098459517, "language_loss": 0.70792925, "learning_rate": 2.3157153147087082e-06, "loss": 0.72956753, "num_input_tokens_seen": 83358365, "step": 3876, "time_per_iteration": 2.7619433403015137 }, { "auxiliary_loss_clip": 0.01138648, "auxiliary_loss_mlp": 0.0102929, "balance_loss_clip": 1.05070949, "balance_loss_mlp": 1.02080202, "epoch": 0.46618168700775564, "flos": 22091095843200.0, "grad_norm": 1.7118852573851393, "language_loss": 0.82956785, "learning_rate": 2.314946085975709e-06, "loss": 0.85124725, "num_input_tokens_seen": 83377345, "step": 3877, "time_per_iteration": 2.8471899032592773 }, { "auxiliary_loss_clip": 0.01132497, "auxiliary_loss_mlp": 0.01030182, "balance_loss_clip": 1.04869437, "balance_loss_mlp": 1.02229643, "epoch": 0.46630192989839475, "flos": 26176975810560.0, "grad_norm": 1.7311050698130053, "language_loss": 0.82439876, "learning_rate": 2.3141768094656393e-06, "loss": 0.84602559, "num_input_tokens_seen": 83395920, "step": 3878, "time_per_iteration": 2.7453835010528564 }, { "auxiliary_loss_clip": 0.01106367, "auxiliary_loss_mlp": 0.01027463, "balance_loss_clip": 1.0422852, "balance_loss_mlp": 1.01952052, "epoch": 0.46642217278903386, "flos": 11509622150400.0, "grad_norm": 2.34003994231913, "language_loss": 0.82827914, "learning_rate": 2.3134074852951966e-06, "loss": 0.84961748, "num_input_tokens_seen": 83412510, "step": 3879, "time_per_iteration": 2.842590808868408 }, { "auxiliary_loss_clip": 0.01121201, "auxiliary_loss_mlp": 0.01028459, "balance_loss_clip": 1.04191113, "balance_loss_mlp": 1.02057934, "epoch": 0.4665424156796729, "flos": 32306819299200.0, "grad_norm": 1.679315316127553, "language_loss": 0.77714789, "learning_rate": 2.312638113581088e-06, "loss": 0.79864454, "num_input_tokens_seen": 83432995, "step": 3880, "time_per_iteration": 2.899304151535034 }, { "auxiliary_loss_clip": 0.01168961, "auxiliary_loss_mlp": 0.01030593, "balance_loss_clip": 1.04736447, "balance_loss_mlp": 1.0219326, "epoch": 0.46666265857031203, "flos": 18436179254400.0, "grad_norm": 2.6972281801373064, "language_loss": 0.77824664, "learning_rate": 2.311868694440027e-06, "loss": 0.80024219, "num_input_tokens_seen": 83447415, "step": 3881, "time_per_iteration": 2.6214747428894043 }, { "auxiliary_loss_clip": 0.01082556, "auxiliary_loss_mlp": 0.0100399, "balance_loss_clip": 1.01549935, "balance_loss_mlp": 1.00236917, "epoch": 0.46678290146095114, "flos": 68438989221120.0, "grad_norm": 0.7409824968993908, "language_loss": 0.6245923, "learning_rate": 2.3110992279887323e-06, "loss": 0.6454578, "num_input_tokens_seen": 83519340, "step": 3882, "time_per_iteration": 3.3362624645233154 }, { "auxiliary_loss_clip": 0.0115097, "auxiliary_loss_mlp": 0.01031784, "balance_loss_clip": 1.05143702, "balance_loss_mlp": 1.02367163, "epoch": 0.4669031443515902, "flos": 17712507945600.0, "grad_norm": 2.449782917073229, "language_loss": 0.84478247, "learning_rate": 2.310329714343932e-06, "loss": 0.86661005, "num_input_tokens_seen": 83535490, "step": 3883, "time_per_iteration": 2.714383363723755 }, { "auxiliary_loss_clip": 0.01152324, "auxiliary_loss_mlp": 0.01027303, "balance_loss_clip": 1.04915118, "balance_loss_mlp": 1.01926255, "epoch": 0.4670233872422293, "flos": 23947748916480.0, "grad_norm": 2.8413399790418628, "language_loss": 0.81512868, "learning_rate": 2.309560153622361e-06, "loss": 0.83692497, "num_input_tokens_seen": 83552400, "step": 3884, "time_per_iteration": 2.730412483215332 }, { "auxiliary_loss_clip": 0.01144299, "auxiliary_loss_mlp": 0.01029861, "balance_loss_clip": 1.0492332, "balance_loss_mlp": 1.02127755, "epoch": 0.4671436301328684, "flos": 28111268131200.0, "grad_norm": 1.9750414302784336, "language_loss": 0.74525845, "learning_rate": 2.3087905459407602e-06, "loss": 0.76700008, "num_input_tokens_seen": 83571340, "step": 3885, "time_per_iteration": 3.6685636043548584 }, { "auxiliary_loss_clip": 0.01074422, "auxiliary_loss_mlp": 0.01002539, "balance_loss_clip": 1.01599216, "balance_loss_mlp": 1.00091779, "epoch": 0.46726387302350747, "flos": 69369684566400.0, "grad_norm": 0.8066923735779634, "language_loss": 0.62863255, "learning_rate": 2.3080208914158795e-06, "loss": 0.64940214, "num_input_tokens_seen": 83634340, "step": 3886, "time_per_iteration": 4.176991701126099 }, { "auxiliary_loss_clip": 0.01156356, "auxiliary_loss_mlp": 0.01027265, "balance_loss_clip": 1.05195284, "balance_loss_mlp": 1.01909351, "epoch": 0.4673841159141466, "flos": 25519666878720.0, "grad_norm": 2.944177526182004, "language_loss": 0.71992576, "learning_rate": 2.3072511901644753e-06, "loss": 0.74176192, "num_input_tokens_seen": 83653410, "step": 3887, "time_per_iteration": 2.726440668106079 }, { "auxiliary_loss_clip": 0.01182874, "auxiliary_loss_mlp": 0.01023682, "balance_loss_clip": 1.05224633, "balance_loss_mlp": 1.01617181, "epoch": 0.4675043588047857, "flos": 24499265316480.0, "grad_norm": 2.0444820183398753, "language_loss": 0.80899876, "learning_rate": 2.306481442303309e-06, "loss": 0.83106434, "num_input_tokens_seen": 83672985, "step": 3888, "time_per_iteration": 2.628535747528076 }, { "auxiliary_loss_clip": 0.01174107, "auxiliary_loss_mlp": 0.01030115, "balance_loss_clip": 1.0523684, "balance_loss_mlp": 1.02143621, "epoch": 0.46762460169542475, "flos": 20960771685120.0, "grad_norm": 1.8920606141268406, "language_loss": 0.73078066, "learning_rate": 2.3057116479491515e-06, "loss": 0.75282288, "num_input_tokens_seen": 83692395, "step": 3889, "time_per_iteration": 3.575392007827759 }, { "auxiliary_loss_clip": 0.01166545, "auxiliary_loss_mlp": 0.01026194, "balance_loss_clip": 1.04799247, "balance_loss_mlp": 1.01840973, "epoch": 0.46774484458606386, "flos": 19171666137600.0, "grad_norm": 2.166637837977406, "language_loss": 0.76016784, "learning_rate": 2.30494180721878e-06, "loss": 0.78209519, "num_input_tokens_seen": 83709735, "step": 3890, "time_per_iteration": 3.6049256324768066 }, { "auxiliary_loss_clip": 0.01168143, "auxiliary_loss_mlp": 0.01025975, "balance_loss_clip": 1.04869592, "balance_loss_mlp": 1.01849437, "epoch": 0.4678650874767029, "flos": 17967689141760.0, "grad_norm": 1.8541916985298201, "language_loss": 0.89638579, "learning_rate": 2.3041719202289794e-06, "loss": 0.91832691, "num_input_tokens_seen": 83725910, "step": 3891, "time_per_iteration": 2.5900015830993652 }, { "auxiliary_loss_clip": 0.0117206, "auxiliary_loss_mlp": 0.01029569, "balance_loss_clip": 1.05198574, "balance_loss_mlp": 1.0222789, "epoch": 0.467985330367342, "flos": 21360816432000.0, "grad_norm": 1.7649414595063042, "language_loss": 0.80657291, "learning_rate": 2.30340198709654e-06, "loss": 0.8285892, "num_input_tokens_seen": 83745745, "step": 3892, "time_per_iteration": 2.6073925495147705 }, { "auxiliary_loss_clip": 0.01157018, "auxiliary_loss_mlp": 0.01029083, "balance_loss_clip": 1.04576385, "balance_loss_mlp": 1.0211978, "epoch": 0.46810557325798113, "flos": 20521835487360.0, "grad_norm": 2.108018237107952, "language_loss": 0.74169803, "learning_rate": 2.3026320079382605e-06, "loss": 0.76355904, "num_input_tokens_seen": 83762680, "step": 3893, "time_per_iteration": 2.6797237396240234 }, { "auxiliary_loss_clip": 0.0118206, "auxiliary_loss_mlp": 0.01022861, "balance_loss_clip": 1.05103076, "balance_loss_mlp": 1.01473653, "epoch": 0.4682258161486202, "flos": 30117848572800.0, "grad_norm": 1.9168360487852627, "language_loss": 0.76515847, "learning_rate": 2.3018619828709454e-06, "loss": 0.78720766, "num_input_tokens_seen": 83784220, "step": 3894, "time_per_iteration": 2.694000244140625 }, { "auxiliary_loss_clip": 0.01169687, "auxiliary_loss_mlp": 0.00763253, "balance_loss_clip": 1.05257738, "balance_loss_mlp": 1.00016057, "epoch": 0.4683460590392593, "flos": 25293357239040.0, "grad_norm": 2.2012067370136905, "language_loss": 0.82024956, "learning_rate": 2.3010919120114084e-06, "loss": 0.83957899, "num_input_tokens_seen": 83800750, "step": 3895, "time_per_iteration": 2.7824513912200928 }, { "auxiliary_loss_clip": 0.01163691, "auxiliary_loss_mlp": 0.01029344, "balance_loss_clip": 1.04440665, "balance_loss_mlp": 1.0208745, "epoch": 0.4684663019298984, "flos": 15368330551680.0, "grad_norm": 2.1787387031238503, "language_loss": 0.6587981, "learning_rate": 2.3003217954764672e-06, "loss": 0.68072844, "num_input_tokens_seen": 83815455, "step": 3896, "time_per_iteration": 2.6696741580963135 }, { "auxiliary_loss_clip": 0.0117025, "auxiliary_loss_mlp": 0.01027902, "balance_loss_clip": 1.04634356, "balance_loss_mlp": 1.01986194, "epoch": 0.46858654482053747, "flos": 27778842737280.0, "grad_norm": 1.7695336474781151, "language_loss": 0.79493558, "learning_rate": 2.299551633382949e-06, "loss": 0.81691706, "num_input_tokens_seen": 83835765, "step": 3897, "time_per_iteration": 2.668097972869873 }, { "auxiliary_loss_clip": 0.01149287, "auxiliary_loss_mlp": 0.01027988, "balance_loss_clip": 1.04658496, "balance_loss_mlp": 1.01961315, "epoch": 0.4687067877111766, "flos": 18040623707520.0, "grad_norm": 1.778320475914632, "language_loss": 0.85807306, "learning_rate": 2.2987814258476854e-06, "loss": 0.8798458, "num_input_tokens_seen": 83853565, "step": 3898, "time_per_iteration": 2.605916976928711 }, { "auxiliary_loss_clip": 0.01130135, "auxiliary_loss_mlp": 0.01034382, "balance_loss_clip": 1.04291177, "balance_loss_mlp": 1.02641273, "epoch": 0.4688270306018157, "flos": 16977380198400.0, "grad_norm": 2.4184816217833056, "language_loss": 0.68383425, "learning_rate": 2.2980111729875177e-06, "loss": 0.7054795, "num_input_tokens_seen": 83869815, "step": 3899, "time_per_iteration": 2.7167932987213135 }, { "auxiliary_loss_clip": 0.01150649, "auxiliary_loss_mlp": 0.010267, "balance_loss_clip": 1.04990554, "balance_loss_mlp": 1.01883805, "epoch": 0.46894727349245474, "flos": 17821640442240.0, "grad_norm": 1.617673316916541, "language_loss": 0.82571316, "learning_rate": 2.2972408749192917e-06, "loss": 0.84748667, "num_input_tokens_seen": 83887545, "step": 3900, "time_per_iteration": 2.6457011699676514 }, { "auxiliary_loss_clip": 0.01165559, "auxiliary_loss_mlp": 0.00763178, "balance_loss_clip": 1.04956901, "balance_loss_mlp": 1.00016665, "epoch": 0.46906751638309385, "flos": 21471349559040.0, "grad_norm": 2.13158539933793, "language_loss": 0.67168552, "learning_rate": 2.296470531759861e-06, "loss": 0.69097292, "num_input_tokens_seen": 83905645, "step": 3901, "time_per_iteration": 2.69840407371521 }, { "auxiliary_loss_clip": 0.01134125, "auxiliary_loss_mlp": 0.01024341, "balance_loss_clip": 1.04497468, "balance_loss_mlp": 1.01633036, "epoch": 0.46918775927373296, "flos": 20337829090560.0, "grad_norm": 1.7854642589681655, "language_loss": 0.79367745, "learning_rate": 2.2957001436260866e-06, "loss": 0.81526214, "num_input_tokens_seen": 83922705, "step": 3902, "time_per_iteration": 2.6706223487854004 }, { "auxiliary_loss_clip": 0.01153413, "auxiliary_loss_mlp": 0.01028044, "balance_loss_clip": 1.04906332, "balance_loss_mlp": 1.01999712, "epoch": 0.469308002164372, "flos": 18403249461120.0, "grad_norm": 1.5498450378757413, "language_loss": 0.73447442, "learning_rate": 2.294929710634836e-06, "loss": 0.75628901, "num_input_tokens_seen": 83940795, "step": 3903, "time_per_iteration": 2.692601203918457 }, { "auxiliary_loss_clip": 0.01165487, "auxiliary_loss_mlp": 0.01025806, "balance_loss_clip": 1.04518104, "balance_loss_mlp": 1.01788425, "epoch": 0.46942824505501113, "flos": 37962067363200.0, "grad_norm": 1.786793108020436, "language_loss": 0.61168468, "learning_rate": 2.2941592329029823e-06, "loss": 0.63359761, "num_input_tokens_seen": 83961900, "step": 3904, "time_per_iteration": 2.735229015350342 }, { "auxiliary_loss_clip": 0.01166923, "auxiliary_loss_mlp": 0.01032009, "balance_loss_clip": 1.05068111, "balance_loss_mlp": 1.02405143, "epoch": 0.46954848794565024, "flos": 21872507627520.0, "grad_norm": 1.8922636607638492, "language_loss": 0.79251981, "learning_rate": 2.2933887105474067e-06, "loss": 0.81450915, "num_input_tokens_seen": 83980075, "step": 3905, "time_per_iteration": 2.6590332984924316 }, { "auxiliary_loss_clip": 0.01164129, "auxiliary_loss_mlp": 0.01032254, "balance_loss_clip": 1.04933083, "balance_loss_mlp": 1.02444553, "epoch": 0.4696687308362893, "flos": 22016545165440.0, "grad_norm": 1.634207241718594, "language_loss": 0.81484556, "learning_rate": 2.2926181436849974e-06, "loss": 0.8368094, "num_input_tokens_seen": 83999430, "step": 3906, "time_per_iteration": 2.6255295276641846 }, { "auxiliary_loss_clip": 0.01166856, "auxiliary_loss_mlp": 0.01027127, "balance_loss_clip": 1.04973722, "balance_loss_mlp": 1.01880598, "epoch": 0.4697889737269284, "flos": 21613663244160.0, "grad_norm": 1.6836581439967466, "language_loss": 0.72499716, "learning_rate": 2.2918475324326478e-06, "loss": 0.74693704, "num_input_tokens_seen": 84019150, "step": 3907, "time_per_iteration": 2.682119369506836 }, { "auxiliary_loss_clip": 0.01175315, "auxiliary_loss_mlp": 0.00763528, "balance_loss_clip": 1.05156171, "balance_loss_mlp": 1.00028419, "epoch": 0.46990921661756746, "flos": 25228323665280.0, "grad_norm": 2.0634771656570883, "language_loss": 0.91505826, "learning_rate": 2.2910768769072603e-06, "loss": 0.93444669, "num_input_tokens_seen": 84037930, "step": 3908, "time_per_iteration": 2.720750570297241 }, { "auxiliary_loss_clip": 0.01161812, "auxiliary_loss_mlp": 0.01029466, "balance_loss_clip": 1.04704618, "balance_loss_mlp": 1.02150273, "epoch": 0.47002945950820657, "flos": 13844031045120.0, "grad_norm": 1.9259247386896547, "language_loss": 0.7612645, "learning_rate": 2.2903061772257417e-06, "loss": 0.78317726, "num_input_tokens_seen": 84055915, "step": 3909, "time_per_iteration": 2.6470632553100586 }, { "auxiliary_loss_clip": 0.01168575, "auxiliary_loss_mlp": 0.01026477, "balance_loss_clip": 1.0502249, "balance_loss_mlp": 1.01810861, "epoch": 0.4701497023988457, "flos": 26247001374720.0, "grad_norm": 1.6581451009108852, "language_loss": 0.7870419, "learning_rate": 2.289535433505007e-06, "loss": 0.80899239, "num_input_tokens_seen": 84077270, "step": 3910, "time_per_iteration": 2.6944665908813477 }, { "auxiliary_loss_clip": 0.01156262, "auxiliary_loss_mlp": 0.01031856, "balance_loss_clip": 1.04566669, "balance_loss_mlp": 1.02371955, "epoch": 0.47026994528948474, "flos": 25629517647360.0, "grad_norm": 1.8233746763570382, "language_loss": 0.6375317, "learning_rate": 2.2887646458619767e-06, "loss": 0.65941286, "num_input_tokens_seen": 84098635, "step": 3911, "time_per_iteration": 3.6398494243621826 }, { "auxiliary_loss_clip": 0.01145502, "auxiliary_loss_mlp": 0.01036601, "balance_loss_clip": 1.04710388, "balance_loss_mlp": 1.02777982, "epoch": 0.47039018818012385, "flos": 20554406144640.0, "grad_norm": 3.035679892845977, "language_loss": 0.76374543, "learning_rate": 2.2879938144135797e-06, "loss": 0.78556645, "num_input_tokens_seen": 84114740, "step": 3912, "time_per_iteration": 2.7206196784973145 }, { "auxiliary_loss_clip": 0.01137661, "auxiliary_loss_mlp": 0.00762606, "balance_loss_clip": 1.04405713, "balance_loss_mlp": 1.00015223, "epoch": 0.47051043107076296, "flos": 21577249831680.0, "grad_norm": 1.5977673217300887, "language_loss": 0.7528193, "learning_rate": 2.2872229392767496e-06, "loss": 0.77182204, "num_input_tokens_seen": 84134845, "step": 3913, "time_per_iteration": 2.659719944000244 }, { "auxiliary_loss_clip": 0.01172406, "auxiliary_loss_mlp": 0.01029654, "balance_loss_clip": 1.04964089, "balance_loss_mlp": 1.02153611, "epoch": 0.470630673961402, "flos": 18953185662720.0, "grad_norm": 1.6367482835182665, "language_loss": 0.74737841, "learning_rate": 2.286452020568428e-06, "loss": 0.76939899, "num_input_tokens_seen": 84152920, "step": 3914, "time_per_iteration": 2.616288661956787 }, { "auxiliary_loss_clip": 0.01189779, "auxiliary_loss_mlp": 0.01031175, "balance_loss_clip": 1.05239248, "balance_loss_mlp": 1.02278233, "epoch": 0.4707509168520411, "flos": 19938969492480.0, "grad_norm": 1.7819579020675602, "language_loss": 0.73141491, "learning_rate": 2.2856810584055637e-06, "loss": 0.75362444, "num_input_tokens_seen": 84170455, "step": 3915, "time_per_iteration": 3.4468448162078857 }, { "auxiliary_loss_clip": 0.01170785, "auxiliary_loss_mlp": 0.01031332, "balance_loss_clip": 1.04952776, "balance_loss_mlp": 1.02316654, "epoch": 0.47087115974268023, "flos": 40118754741120.0, "grad_norm": 1.536206092465581, "language_loss": 0.67632687, "learning_rate": 2.2849100529051085e-06, "loss": 0.69834805, "num_input_tokens_seen": 84197390, "step": 3916, "time_per_iteration": 3.705092430114746 }, { "auxiliary_loss_clip": 0.01180099, "auxiliary_loss_mlp": 0.01025438, "balance_loss_clip": 1.04926872, "balance_loss_mlp": 1.01720631, "epoch": 0.4709914026333193, "flos": 13552723745280.0, "grad_norm": 2.342345401020367, "language_loss": 0.80124813, "learning_rate": 2.284139004184026e-06, "loss": 0.82330346, "num_input_tokens_seen": 84214620, "step": 3917, "time_per_iteration": 2.5804765224456787 }, { "auxiliary_loss_clip": 0.01183274, "auxiliary_loss_mlp": 0.01028961, "balance_loss_clip": 1.04957032, "balance_loss_mlp": 1.02077103, "epoch": 0.4711116455239584, "flos": 19974628719360.0, "grad_norm": 1.969366981393763, "language_loss": 0.74122071, "learning_rate": 2.2833679123592814e-06, "loss": 0.7633431, "num_input_tokens_seen": 84231880, "step": 3918, "time_per_iteration": 2.5867271423339844 }, { "auxiliary_loss_clip": 0.01153748, "auxiliary_loss_mlp": 0.01031479, "balance_loss_clip": 1.04872799, "balance_loss_mlp": 1.0234921, "epoch": 0.4712318884145975, "flos": 32124824064000.0, "grad_norm": 1.7794949584426258, "language_loss": 0.63665456, "learning_rate": 2.2825967775478508e-06, "loss": 0.65850687, "num_input_tokens_seen": 84252980, "step": 3919, "time_per_iteration": 2.921816349029541 }, { "auxiliary_loss_clip": 0.01180289, "auxiliary_loss_mlp": 0.01027683, "balance_loss_clip": 1.04776347, "balance_loss_mlp": 1.01981497, "epoch": 0.47135213130523657, "flos": 20047850593920.0, "grad_norm": 2.022245965816566, "language_loss": 0.83098698, "learning_rate": 2.2818255998667135e-06, "loss": 0.85306668, "num_input_tokens_seen": 84271490, "step": 3920, "time_per_iteration": 2.5784261226654053 }, { "auxiliary_loss_clip": 0.01168472, "auxiliary_loss_mlp": 0.01030307, "balance_loss_clip": 1.05042183, "balance_loss_mlp": 1.02193856, "epoch": 0.4714723741958757, "flos": 19426990988160.0, "grad_norm": 1.630558804238722, "language_loss": 0.79044384, "learning_rate": 2.2810543794328566e-06, "loss": 0.81243157, "num_input_tokens_seen": 84290525, "step": 3921, "time_per_iteration": 2.6516711711883545 }, { "auxiliary_loss_clip": 0.01173993, "auxiliary_loss_mlp": 0.0103224, "balance_loss_clip": 1.05035472, "balance_loss_mlp": 1.02370417, "epoch": 0.4715926170865148, "flos": 20373883367040.0, "grad_norm": 1.8886906200965397, "language_loss": 0.82782489, "learning_rate": 2.2802831163632735e-06, "loss": 0.84988725, "num_input_tokens_seen": 84309245, "step": 3922, "time_per_iteration": 2.7843987941741943 }, { "auxiliary_loss_clip": 0.01114266, "auxiliary_loss_mlp": 0.01031733, "balance_loss_clip": 1.04333627, "balance_loss_mlp": 1.02314067, "epoch": 0.47171285997715384, "flos": 22672884430080.0, "grad_norm": 2.0459884281006318, "language_loss": 0.74210852, "learning_rate": 2.279511810774965e-06, "loss": 0.76356852, "num_input_tokens_seen": 84330775, "step": 3923, "time_per_iteration": 2.799251079559326 }, { "auxiliary_loss_clip": 0.01183682, "auxiliary_loss_mlp": 0.01024621, "balance_loss_clip": 1.05063725, "balance_loss_mlp": 1.01675868, "epoch": 0.47183310286779295, "flos": 21105419754240.0, "grad_norm": 2.1014775191296775, "language_loss": 0.71568191, "learning_rate": 2.2787404627849364e-06, "loss": 0.73776489, "num_input_tokens_seen": 84349985, "step": 3924, "time_per_iteration": 2.628480911254883 }, { "auxiliary_loss_clip": 0.0115192, "auxiliary_loss_mlp": 0.01025295, "balance_loss_clip": 1.04588711, "balance_loss_mlp": 1.01751077, "epoch": 0.471953345758432, "flos": 21726566668800.0, "grad_norm": 2.170345459286011, "language_loss": 0.79207259, "learning_rate": 2.277969072510202e-06, "loss": 0.81384474, "num_input_tokens_seen": 84368965, "step": 3925, "time_per_iteration": 2.6631126403808594 }, { "auxiliary_loss_clip": 0.01152591, "auxiliary_loss_mlp": 0.01028011, "balance_loss_clip": 1.04680908, "balance_loss_mlp": 1.02054262, "epoch": 0.4720735886490711, "flos": 19861078849920.0, "grad_norm": 1.4996389393293528, "language_loss": 0.81531072, "learning_rate": 2.2771976400677803e-06, "loss": 0.83711672, "num_input_tokens_seen": 84387795, "step": 3926, "time_per_iteration": 2.7199976444244385 }, { "auxiliary_loss_clip": 0.01113857, "auxiliary_loss_mlp": 0.01029342, "balance_loss_clip": 1.04211104, "balance_loss_mlp": 1.0217185, "epoch": 0.47219383153971023, "flos": 19171809792000.0, "grad_norm": 1.7168396859768613, "language_loss": 0.79157591, "learning_rate": 2.2764261655746965e-06, "loss": 0.81300789, "num_input_tokens_seen": 84405290, "step": 3927, "time_per_iteration": 2.681540012359619 }, { "auxiliary_loss_clip": 0.01135496, "auxiliary_loss_mlp": 0.01033326, "balance_loss_clip": 1.04426098, "balance_loss_mlp": 1.02519011, "epoch": 0.4723140744303493, "flos": 23224005780480.0, "grad_norm": 1.627887260947625, "language_loss": 0.76053417, "learning_rate": 2.2756546491479832e-06, "loss": 0.78222239, "num_input_tokens_seen": 84426205, "step": 3928, "time_per_iteration": 2.713150978088379 }, { "auxiliary_loss_clip": 0.01183703, "auxiliary_loss_mlp": 0.00762944, "balance_loss_clip": 1.04982424, "balance_loss_mlp": 1.00019979, "epoch": 0.4724343173209884, "flos": 18223265387520.0, "grad_norm": 3.646009860075139, "language_loss": 0.80714303, "learning_rate": 2.274883090904679e-06, "loss": 0.82660949, "num_input_tokens_seen": 84443970, "step": 3929, "time_per_iteration": 2.5717389583587646 }, { "auxiliary_loss_clip": 0.01189094, "auxiliary_loss_mlp": 0.01028704, "balance_loss_clip": 1.05457163, "balance_loss_mlp": 1.02057958, "epoch": 0.4725545602116275, "flos": 21251037490560.0, "grad_norm": 2.3750285108622564, "language_loss": 0.67579138, "learning_rate": 2.2741114909618283e-06, "loss": 0.69796938, "num_input_tokens_seen": 84459865, "step": 3930, "time_per_iteration": 2.627019166946411 }, { "auxiliary_loss_clip": 0.01141065, "auxiliary_loss_mlp": 0.01027053, "balance_loss_clip": 1.04693627, "balance_loss_mlp": 1.01952529, "epoch": 0.47267480310226656, "flos": 21434002392960.0, "grad_norm": 1.8343322832021847, "language_loss": 0.72283751, "learning_rate": 2.2733398494364828e-06, "loss": 0.7445187, "num_input_tokens_seen": 84479110, "step": 3931, "time_per_iteration": 2.758167266845703 }, { "auxiliary_loss_clip": 0.01150549, "auxiliary_loss_mlp": 0.0102598, "balance_loss_clip": 1.05047798, "balance_loss_mlp": 1.01801121, "epoch": 0.47279504599290567, "flos": 18770508069120.0, "grad_norm": 1.8079465608232885, "language_loss": 0.84436619, "learning_rate": 2.272568166445699e-06, "loss": 0.86613148, "num_input_tokens_seen": 84497675, "step": 3932, "time_per_iteration": 2.7788946628570557 }, { "auxiliary_loss_clip": 0.01167735, "auxiliary_loss_mlp": 0.01023497, "balance_loss_clip": 1.04856563, "balance_loss_mlp": 1.01559949, "epoch": 0.4729152888835448, "flos": 21105742976640.0, "grad_norm": 3.557033031846401, "language_loss": 0.64503849, "learning_rate": 2.271796442106541e-06, "loss": 0.66695082, "num_input_tokens_seen": 84517030, "step": 3933, "time_per_iteration": 2.6185598373413086 }, { "auxiliary_loss_clip": 0.01052402, "auxiliary_loss_mlp": 0.01005275, "balance_loss_clip": 1.02016342, "balance_loss_mlp": 1.00347543, "epoch": 0.47303553177418384, "flos": 70201877840640.0, "grad_norm": 0.7957961616325003, "language_loss": 0.56526256, "learning_rate": 2.271024676536079e-06, "loss": 0.58583939, "num_input_tokens_seen": 84577290, "step": 3934, "time_per_iteration": 3.2556071281433105 }, { "auxiliary_loss_clip": 0.01163496, "auxiliary_loss_mlp": 0.01032635, "balance_loss_clip": 1.05332696, "balance_loss_mlp": 1.02358079, "epoch": 0.47315577466482295, "flos": 22455122227200.0, "grad_norm": 2.289915400589802, "language_loss": 0.73200274, "learning_rate": 2.2702528698513894e-06, "loss": 0.75396407, "num_input_tokens_seen": 84598415, "step": 3935, "time_per_iteration": 2.697964668273926 }, { "auxiliary_loss_clip": 0.01155503, "auxiliary_loss_mlp": 0.01025157, "balance_loss_clip": 1.04581928, "balance_loss_mlp": 1.01672268, "epoch": 0.47327601755546206, "flos": 24352857480960.0, "grad_norm": 4.047232993148734, "language_loss": 0.79255569, "learning_rate": 2.269481022169554e-06, "loss": 0.81436229, "num_input_tokens_seen": 84617010, "step": 3936, "time_per_iteration": 2.699193000793457 }, { "auxiliary_loss_clip": 0.01161139, "auxiliary_loss_mlp": 0.0102883, "balance_loss_clip": 1.04756975, "balance_loss_mlp": 1.0205034, "epoch": 0.4733962604461011, "flos": 22926772736640.0, "grad_norm": 1.914430432976261, "language_loss": 0.8071357, "learning_rate": 2.2687091336076614e-06, "loss": 0.8290354, "num_input_tokens_seen": 84636350, "step": 3937, "time_per_iteration": 3.6518943309783936 }, { "auxiliary_loss_clip": 0.01168252, "auxiliary_loss_mlp": 0.01029308, "balance_loss_clip": 1.0506041, "balance_loss_mlp": 1.02085054, "epoch": 0.4735165033367402, "flos": 18327369980160.0, "grad_norm": 2.0877700182174737, "language_loss": 0.80217516, "learning_rate": 2.267937204282807e-06, "loss": 0.8241508, "num_input_tokens_seen": 84653490, "step": 3938, "time_per_iteration": 2.6783976554870605 }, { "auxiliary_loss_clip": 0.01174044, "auxiliary_loss_mlp": 0.01030641, "balance_loss_clip": 1.05128074, "balance_loss_mlp": 1.02197409, "epoch": 0.4736367462273793, "flos": 23037018554880.0, "grad_norm": 2.1470910048598753, "language_loss": 0.79270804, "learning_rate": 2.2671652343120926e-06, "loss": 0.81475484, "num_input_tokens_seen": 84673965, "step": 3939, "time_per_iteration": 2.690129280090332 }, { "auxiliary_loss_clip": 0.0118392, "auxiliary_loss_mlp": 0.01026335, "balance_loss_clip": 1.05277848, "balance_loss_mlp": 1.01843762, "epoch": 0.4737569891180184, "flos": 25374336451200.0, "grad_norm": 1.6273739982711202, "language_loss": 0.80745655, "learning_rate": 2.2663932238126236e-06, "loss": 0.82955909, "num_input_tokens_seen": 84692525, "step": 3940, "time_per_iteration": 3.4611480236053467 }, { "auxiliary_loss_clip": 0.01166345, "auxiliary_loss_mlp": 0.01026468, "balance_loss_clip": 1.046597, "balance_loss_mlp": 1.01802766, "epoch": 0.4738772320086575, "flos": 25849326925440.0, "grad_norm": 1.5376727522629519, "language_loss": 0.80144477, "learning_rate": 2.265621172901515e-06, "loss": 0.82337296, "num_input_tokens_seen": 84715640, "step": 3941, "time_per_iteration": 2.720466136932373 }, { "auxiliary_loss_clip": 0.01189161, "auxiliary_loss_mlp": 0.01030734, "balance_loss_clip": 1.05593491, "balance_loss_mlp": 1.02232373, "epoch": 0.47399747489929656, "flos": 27564420499200.0, "grad_norm": 2.6342842132308184, "language_loss": 0.71447343, "learning_rate": 2.2648490816958854e-06, "loss": 0.7366724, "num_input_tokens_seen": 84736635, "step": 3942, "time_per_iteration": 3.5601305961608887 }, { "auxiliary_loss_clip": 0.01170294, "auxiliary_loss_mlp": 0.01031472, "balance_loss_clip": 1.05022335, "balance_loss_mlp": 1.02270448, "epoch": 0.47411771778993567, "flos": 24863650836480.0, "grad_norm": 2.3813178247295452, "language_loss": 0.73565108, "learning_rate": 2.264076950312861e-06, "loss": 0.75766873, "num_input_tokens_seen": 84755445, "step": 3943, "time_per_iteration": 2.700568199157715 }, { "auxiliary_loss_clip": 0.01161089, "auxiliary_loss_mlp": 0.01029582, "balance_loss_clip": 1.04977965, "balance_loss_mlp": 1.02170253, "epoch": 0.4742379606805748, "flos": 22748009725440.0, "grad_norm": 2.022951617531913, "language_loss": 0.82670105, "learning_rate": 2.2633047788695727e-06, "loss": 0.84860772, "num_input_tokens_seen": 84775750, "step": 3944, "time_per_iteration": 2.667673349380493 }, { "auxiliary_loss_clip": 0.01153035, "auxiliary_loss_mlp": 0.01023714, "balance_loss_clip": 1.04810643, "balance_loss_mlp": 1.01634097, "epoch": 0.47435820357121383, "flos": 19681130689920.0, "grad_norm": 1.8325208015627505, "language_loss": 0.64043617, "learning_rate": 2.262532567483159e-06, "loss": 0.66220361, "num_input_tokens_seen": 84794310, "step": 3945, "time_per_iteration": 2.670921564102173 }, { "auxiliary_loss_clip": 0.01186119, "auxiliary_loss_mlp": 0.00763127, "balance_loss_clip": 1.0538888, "balance_loss_mlp": 1.00016093, "epoch": 0.47447844646185294, "flos": 25228718714880.0, "grad_norm": 1.7756056218117124, "language_loss": 0.80099446, "learning_rate": 2.2617603162707635e-06, "loss": 0.8204869, "num_input_tokens_seen": 84814720, "step": 3946, "time_per_iteration": 2.6224801540374756 }, { "auxiliary_loss_clip": 0.01183934, "auxiliary_loss_mlp": 0.01026882, "balance_loss_clip": 1.05191612, "balance_loss_mlp": 1.01876378, "epoch": 0.47459868935249205, "flos": 24570619683840.0, "grad_norm": 1.782044247930579, "language_loss": 0.82608724, "learning_rate": 2.2609880253495363e-06, "loss": 0.84819543, "num_input_tokens_seen": 84834355, "step": 3947, "time_per_iteration": 2.621115207672119 }, { "auxiliary_loss_clip": 0.01150972, "auxiliary_loss_mlp": 0.01034206, "balance_loss_clip": 1.0473671, "balance_loss_mlp": 1.02577138, "epoch": 0.4747189322431311, "flos": 20558500295040.0, "grad_norm": 1.9013329601573432, "language_loss": 0.86016905, "learning_rate": 2.260215694836633e-06, "loss": 0.88202083, "num_input_tokens_seen": 84853530, "step": 3948, "time_per_iteration": 2.7053840160369873 }, { "auxiliary_loss_clip": 0.01130913, "auxiliary_loss_mlp": 0.00763306, "balance_loss_clip": 1.04472136, "balance_loss_mlp": 1.00015521, "epoch": 0.4748391751337702, "flos": 25995231970560.0, "grad_norm": 2.7956499669574817, "language_loss": 0.64882207, "learning_rate": 2.2594433248492157e-06, "loss": 0.66776431, "num_input_tokens_seen": 84872505, "step": 3949, "time_per_iteration": 2.7704782485961914 }, { "auxiliary_loss_clip": 0.01173397, "auxiliary_loss_mlp": 0.0102814, "balance_loss_clip": 1.04883182, "balance_loss_mlp": 1.01966977, "epoch": 0.47495941802440933, "flos": 22821052032000.0, "grad_norm": 2.6005216967421365, "language_loss": 0.80117553, "learning_rate": 2.2586709155044527e-06, "loss": 0.82319087, "num_input_tokens_seen": 84893105, "step": 3950, "time_per_iteration": 2.5835225582122803 }, { "auxiliary_loss_clip": 0.01185809, "auxiliary_loss_mlp": 0.01031311, "balance_loss_clip": 1.05322707, "balance_loss_mlp": 1.02321088, "epoch": 0.4750796609150484, "flos": 27891782075520.0, "grad_norm": 1.4885386052513598, "language_loss": 0.75740016, "learning_rate": 2.2578984669195167e-06, "loss": 0.77957141, "num_input_tokens_seen": 84914070, "step": 3951, "time_per_iteration": 2.6251516342163086 }, { "auxiliary_loss_clip": 0.01168615, "auxiliary_loss_mlp": 0.01029363, "balance_loss_clip": 1.04789221, "balance_loss_mlp": 1.02173972, "epoch": 0.4751999038056875, "flos": 35660085471360.0, "grad_norm": 1.9148108559994252, "language_loss": 0.67907393, "learning_rate": 2.2571259792115887e-06, "loss": 0.70105368, "num_input_tokens_seen": 84935290, "step": 3952, "time_per_iteration": 2.7447779178619385 }, { "auxiliary_loss_clip": 0.01164977, "auxiliary_loss_mlp": 0.01030755, "balance_loss_clip": 1.04939497, "balance_loss_mlp": 1.02336371, "epoch": 0.4753201466963266, "flos": 22090880361600.0, "grad_norm": 1.644245364624808, "language_loss": 0.79042131, "learning_rate": 2.2563534524978544e-06, "loss": 0.81237864, "num_input_tokens_seen": 84952760, "step": 3953, "time_per_iteration": 2.6966745853424072 }, { "auxiliary_loss_clip": 0.01138311, "auxiliary_loss_mlp": 0.01023986, "balance_loss_clip": 1.04926205, "balance_loss_mlp": 1.01620722, "epoch": 0.47544038958696566, "flos": 30190854965760.0, "grad_norm": 1.6878755648688595, "language_loss": 0.70576966, "learning_rate": 2.2555808868955052e-06, "loss": 0.72739261, "num_input_tokens_seen": 84974890, "step": 3954, "time_per_iteration": 2.7929909229278564 }, { "auxiliary_loss_clip": 0.0112712, "auxiliary_loss_mlp": 0.01029703, "balance_loss_clip": 1.04582274, "balance_loss_mlp": 1.02081048, "epoch": 0.47556063247760477, "flos": 23472219738240.0, "grad_norm": 2.202777393319252, "language_loss": 0.74172592, "learning_rate": 2.254808282521738e-06, "loss": 0.76329416, "num_input_tokens_seen": 84993640, "step": 3955, "time_per_iteration": 2.7400150299072266 }, { "auxiliary_loss_clip": 0.01145006, "auxiliary_loss_mlp": 0.00762155, "balance_loss_clip": 1.04661787, "balance_loss_mlp": 1.00016975, "epoch": 0.4756808753682438, "flos": 25155209531520.0, "grad_norm": 1.7148496679955048, "language_loss": 0.811948, "learning_rate": 2.2540356394937573e-06, "loss": 0.83101964, "num_input_tokens_seen": 85012340, "step": 3956, "time_per_iteration": 2.7114598751068115 }, { "auxiliary_loss_clip": 0.01145967, "auxiliary_loss_mlp": 0.01031017, "balance_loss_clip": 1.04621732, "balance_loss_mlp": 1.02252352, "epoch": 0.47580111825888294, "flos": 15669729573120.0, "grad_norm": 2.1763157251676803, "language_loss": 0.83755732, "learning_rate": 2.253262957928772e-06, "loss": 0.85932714, "num_input_tokens_seen": 85029225, "step": 3957, "time_per_iteration": 2.6541965007781982 }, { "auxiliary_loss_clip": 0.01150928, "auxiliary_loss_mlp": 0.0103156, "balance_loss_clip": 1.04681528, "balance_loss_mlp": 1.02266061, "epoch": 0.47592136114952205, "flos": 17636556637440.0, "grad_norm": 1.5872102878416015, "language_loss": 0.72143751, "learning_rate": 2.2524902379439976e-06, "loss": 0.74326241, "num_input_tokens_seen": 85047895, "step": 3958, "time_per_iteration": 2.69250226020813 }, { "auxiliary_loss_clip": 0.01037053, "auxiliary_loss_mlp": 0.01003883, "balance_loss_clip": 1.02920079, "balance_loss_mlp": 1.00219011, "epoch": 0.4760416040401611, "flos": 61417159292160.0, "grad_norm": 0.7428303450181923, "language_loss": 0.63686359, "learning_rate": 2.251717479656655e-06, "loss": 0.65727299, "num_input_tokens_seen": 85112690, "step": 3959, "time_per_iteration": 3.4690439701080322 }, { "auxiliary_loss_clip": 0.01186332, "auxiliary_loss_mlp": 0.01031048, "balance_loss_clip": 1.05289102, "balance_loss_mlp": 1.02258992, "epoch": 0.4761618469308002, "flos": 18405871153920.0, "grad_norm": 5.154605544053129, "language_loss": 0.76413631, "learning_rate": 2.2509446831839704e-06, "loss": 0.78631008, "num_input_tokens_seen": 85132130, "step": 3960, "time_per_iteration": 3.261329174041748 }, { "auxiliary_loss_clip": 0.01160005, "auxiliary_loss_mlp": 0.0102779, "balance_loss_clip": 1.04909039, "balance_loss_mlp": 1.01982713, "epoch": 0.4762820898214393, "flos": 18040911016320.0, "grad_norm": 2.2185557167431513, "language_loss": 0.82768124, "learning_rate": 2.250171848643177e-06, "loss": 0.84955919, "num_input_tokens_seen": 85149420, "step": 3961, "time_per_iteration": 2.6906702518463135 }, { "auxiliary_loss_clip": 0.01152141, "auxiliary_loss_mlp": 0.01026868, "balance_loss_clip": 1.05023694, "balance_loss_mlp": 1.01923835, "epoch": 0.4764023327120784, "flos": 19318253541120.0, "grad_norm": 1.6723456047537, "language_loss": 0.86036706, "learning_rate": 2.249398976151513e-06, "loss": 0.88215721, "num_input_tokens_seen": 85166970, "step": 3962, "time_per_iteration": 2.654501438140869 }, { "auxiliary_loss_clip": 0.01181643, "auxiliary_loss_mlp": 0.01029824, "balance_loss_clip": 1.05188119, "balance_loss_mlp": 1.02138948, "epoch": 0.4765225756027175, "flos": 22747255539840.0, "grad_norm": 2.0285725811347417, "language_loss": 0.78468478, "learning_rate": 2.248626065826223e-06, "loss": 0.80679941, "num_input_tokens_seen": 85185175, "step": 3963, "time_per_iteration": 4.569107294082642 }, { "auxiliary_loss_clip": 0.01088953, "auxiliary_loss_mlp": 0.01001974, "balance_loss_clip": 1.02148819, "balance_loss_mlp": 1.00017357, "epoch": 0.4766428184933566, "flos": 65933392106880.0, "grad_norm": 0.763417656487407, "language_loss": 0.62573355, "learning_rate": 2.2478531177845564e-06, "loss": 0.6466428, "num_input_tokens_seen": 85246170, "step": 3964, "time_per_iteration": 3.1997880935668945 }, { "auxiliary_loss_clip": 0.01159765, "auxiliary_loss_mlp": 0.01028209, "balance_loss_clip": 1.05250192, "balance_loss_mlp": 1.01988173, "epoch": 0.47676306138399566, "flos": 24136495908480.0, "grad_norm": 1.6581597361631368, "language_loss": 0.84920394, "learning_rate": 2.247080132143769e-06, "loss": 0.87108368, "num_input_tokens_seen": 85268525, "step": 3965, "time_per_iteration": 2.7295210361480713 }, { "auxiliary_loss_clip": 0.01140742, "auxiliary_loss_mlp": 0.01028754, "balance_loss_clip": 1.04327941, "balance_loss_mlp": 1.02027273, "epoch": 0.47688330427463477, "flos": 12604322995200.0, "grad_norm": 2.4499439906313856, "language_loss": 0.69355035, "learning_rate": 2.246307109021121e-06, "loss": 0.71524537, "num_input_tokens_seen": 85285930, "step": 3966, "time_per_iteration": 2.7127819061279297 }, { "auxiliary_loss_clip": 0.01150482, "auxiliary_loss_mlp": 0.01035425, "balance_loss_clip": 1.04583359, "balance_loss_mlp": 1.027843, "epoch": 0.4770035471652739, "flos": 21390585828480.0, "grad_norm": 1.811836212587798, "language_loss": 0.82751036, "learning_rate": 2.2455340485338817e-06, "loss": 0.84936947, "num_input_tokens_seen": 85303565, "step": 3967, "time_per_iteration": 4.940474033355713 }, { "auxiliary_loss_clip": 0.01171105, "auxiliary_loss_mlp": 0.01030459, "balance_loss_clip": 1.04934001, "balance_loss_mlp": 1.02257288, "epoch": 0.47712379005591293, "flos": 25156251025920.0, "grad_norm": 2.0412288515019514, "language_loss": 0.67681575, "learning_rate": 2.244760950799322e-06, "loss": 0.69883144, "num_input_tokens_seen": 85321835, "step": 3968, "time_per_iteration": 2.7316582202911377 }, { "auxiliary_loss_clip": 0.01128029, "auxiliary_loss_mlp": 0.01027356, "balance_loss_clip": 1.0465343, "balance_loss_mlp": 1.01929212, "epoch": 0.47724403294655204, "flos": 22054323294720.0, "grad_norm": 2.020654857760967, "language_loss": 0.72624123, "learning_rate": 2.2439878159347203e-06, "loss": 0.7477951, "num_input_tokens_seen": 85341260, "step": 3969, "time_per_iteration": 2.753474473953247 }, { "auxiliary_loss_clip": 0.01088554, "auxiliary_loss_mlp": 0.01001308, "balance_loss_clip": 1.02105451, "balance_loss_mlp": 0.99955606, "epoch": 0.4773642758371911, "flos": 70229387658240.0, "grad_norm": 0.7307679284108406, "language_loss": 0.55236185, "learning_rate": 2.2432146440573616e-06, "loss": 0.57326049, "num_input_tokens_seen": 85407220, "step": 3970, "time_per_iteration": 3.3317105770111084 }, { "auxiliary_loss_clip": 0.01154744, "auxiliary_loss_mlp": 0.0103099, "balance_loss_clip": 1.048967, "balance_loss_mlp": 1.02278197, "epoch": 0.4774845187278302, "flos": 23548602009600.0, "grad_norm": 1.835564883086054, "language_loss": 0.6651659, "learning_rate": 2.242441435284534e-06, "loss": 0.68702322, "num_input_tokens_seen": 85426095, "step": 3971, "time_per_iteration": 2.7350192070007324 }, { "auxiliary_loss_clip": 0.01172048, "auxiliary_loss_mlp": 0.01025186, "balance_loss_clip": 1.05150771, "balance_loss_mlp": 1.01732373, "epoch": 0.4776047616184693, "flos": 23075371301760.0, "grad_norm": 2.172577641383062, "language_loss": 0.85641766, "learning_rate": 2.2416681897335337e-06, "loss": 0.87838995, "num_input_tokens_seen": 85444245, "step": 3972, "time_per_iteration": 2.631610155105591 }, { "auxiliary_loss_clip": 0.01126753, "auxiliary_loss_mlp": 0.01029949, "balance_loss_clip": 1.04761493, "balance_loss_mlp": 1.02182519, "epoch": 0.4777250045091084, "flos": 31898119374720.0, "grad_norm": 2.0953834862242404, "language_loss": 0.67016757, "learning_rate": 2.240894907521661e-06, "loss": 0.69173467, "num_input_tokens_seen": 85463325, "step": 3973, "time_per_iteration": 2.8140227794647217 }, { "auxiliary_loss_clip": 0.01157635, "auxiliary_loss_mlp": 0.01025695, "balance_loss_clip": 1.04873371, "balance_loss_mlp": 1.01751089, "epoch": 0.4778452473997475, "flos": 24278163148800.0, "grad_norm": 1.5874780186685356, "language_loss": 0.64036131, "learning_rate": 2.240121588766223e-06, "loss": 0.66219461, "num_input_tokens_seen": 85483375, "step": 3974, "time_per_iteration": 2.704516887664795 }, { "auxiliary_loss_clip": 0.01147859, "auxiliary_loss_mlp": 0.01031583, "balance_loss_clip": 1.04514384, "balance_loss_mlp": 1.02382255, "epoch": 0.4779654902903866, "flos": 31575031516800.0, "grad_norm": 1.6885560856438555, "language_loss": 0.71547079, "learning_rate": 2.239348233584531e-06, "loss": 0.73726517, "num_input_tokens_seen": 85504230, "step": 3975, "time_per_iteration": 2.7292416095733643 }, { "auxiliary_loss_clip": 0.01170641, "auxiliary_loss_mlp": 0.01032767, "balance_loss_clip": 1.04973602, "balance_loss_mlp": 1.02439857, "epoch": 0.47808573318102565, "flos": 19500428344320.0, "grad_norm": 2.1692501315324764, "language_loss": 0.80703193, "learning_rate": 2.2385748420939013e-06, "loss": 0.82906604, "num_input_tokens_seen": 85523425, "step": 3976, "time_per_iteration": 2.6487655639648438 }, { "auxiliary_loss_clip": 0.01184951, "auxiliary_loss_mlp": 0.01029888, "balance_loss_clip": 1.05501246, "balance_loss_mlp": 1.02249086, "epoch": 0.47820597607166476, "flos": 22601135013120.0, "grad_norm": 1.7530449769399241, "language_loss": 0.72219914, "learning_rate": 2.2378014144116583e-06, "loss": 0.74434751, "num_input_tokens_seen": 85542235, "step": 3977, "time_per_iteration": 2.5624608993530273 }, { "auxiliary_loss_clip": 0.01187614, "auxiliary_loss_mlp": 0.01034714, "balance_loss_clip": 1.05328894, "balance_loss_mlp": 1.02633929, "epoch": 0.4783262189623039, "flos": 23003011353600.0, "grad_norm": 2.751280837987753, "language_loss": 0.79615599, "learning_rate": 2.23702795065513e-06, "loss": 0.81837928, "num_input_tokens_seen": 85561815, "step": 3978, "time_per_iteration": 2.655606985092163 }, { "auxiliary_loss_clip": 0.01077564, "auxiliary_loss_mlp": 0.01001803, "balance_loss_clip": 1.01915193, "balance_loss_mlp": 1.00001454, "epoch": 0.47844646185294293, "flos": 49772801226240.0, "grad_norm": 0.980017192752197, "language_loss": 0.67464799, "learning_rate": 2.2362544509416493e-06, "loss": 0.69544172, "num_input_tokens_seen": 85613930, "step": 3979, "time_per_iteration": 3.1027910709381104 }, { "auxiliary_loss_clip": 0.01147939, "auxiliary_loss_mlp": 0.0102483, "balance_loss_clip": 1.04466128, "balance_loss_mlp": 1.01717114, "epoch": 0.47856670474358204, "flos": 20229558520320.0, "grad_norm": 5.058814830068448, "language_loss": 0.82787633, "learning_rate": 2.2354809153885572e-06, "loss": 0.84960407, "num_input_tokens_seen": 85631000, "step": 3980, "time_per_iteration": 2.757737398147583 }, { "auxiliary_loss_clip": 0.01167614, "auxiliary_loss_mlp": 0.01026049, "balance_loss_clip": 1.04881454, "balance_loss_mlp": 1.01822329, "epoch": 0.47868694763422115, "flos": 20990936131200.0, "grad_norm": 2.672648321257097, "language_loss": 0.83217156, "learning_rate": 2.234707344113197e-06, "loss": 0.85410815, "num_input_tokens_seen": 85649095, "step": 3981, "time_per_iteration": 2.656545877456665 }, { "auxiliary_loss_clip": 0.01181481, "auxiliary_loss_mlp": 0.01025556, "balance_loss_clip": 1.05206263, "balance_loss_mlp": 1.01776826, "epoch": 0.4788071905248602, "flos": 19026551191680.0, "grad_norm": 1.9699259707428296, "language_loss": 0.77855366, "learning_rate": 2.233933737232919e-06, "loss": 0.80062401, "num_input_tokens_seen": 85666875, "step": 3982, "time_per_iteration": 2.6217846870422363 }, { "auxiliary_loss_clip": 0.01119425, "auxiliary_loss_mlp": 0.00762681, "balance_loss_clip": 1.04392695, "balance_loss_mlp": 1.00018632, "epoch": 0.4789274334154993, "flos": 23002221254400.0, "grad_norm": 1.6698070940386518, "language_loss": 0.78023398, "learning_rate": 2.2331600948650793e-06, "loss": 0.79905504, "num_input_tokens_seen": 85687020, "step": 3983, "time_per_iteration": 2.7054078578948975 }, { "auxiliary_loss_clip": 0.01131929, "auxiliary_loss_mlp": 0.00764043, "balance_loss_clip": 1.04641318, "balance_loss_mlp": 1.0002867, "epoch": 0.4790476763061384, "flos": 23075586783360.0, "grad_norm": 2.6743920430907537, "language_loss": 0.80032361, "learning_rate": 2.2323864171270386e-06, "loss": 0.81928331, "num_input_tokens_seen": 85708290, "step": 3984, "time_per_iteration": 2.742558717727661 }, { "auxiliary_loss_clip": 0.01143536, "auxiliary_loss_mlp": 0.0102948, "balance_loss_clip": 1.04518557, "balance_loss_mlp": 1.02111125, "epoch": 0.4791679191967775, "flos": 21179288073600.0, "grad_norm": 1.9244381340507446, "language_loss": 0.73060048, "learning_rate": 2.231612704136164e-06, "loss": 0.75233054, "num_input_tokens_seen": 85728660, "step": 3985, "time_per_iteration": 2.728558301925659 }, { "auxiliary_loss_clip": 0.0116403, "auxiliary_loss_mlp": 0.01028821, "balance_loss_clip": 1.0484364, "balance_loss_mlp": 1.02010715, "epoch": 0.4792881620874166, "flos": 22301495758080.0, "grad_norm": 3.829164013013086, "language_loss": 0.75314379, "learning_rate": 2.2308389560098253e-06, "loss": 0.77507228, "num_input_tokens_seen": 85745035, "step": 3986, "time_per_iteration": 2.6348764896392822 }, { "auxiliary_loss_clip": 0.01148059, "auxiliary_loss_mlp": 0.01027598, "balance_loss_clip": 1.05017591, "balance_loss_mlp": 1.01953959, "epoch": 0.47940840497805565, "flos": 17420877423360.0, "grad_norm": 2.750734560541953, "language_loss": 0.7718268, "learning_rate": 2.2300651728654008e-06, "loss": 0.79358339, "num_input_tokens_seen": 85760295, "step": 3987, "time_per_iteration": 2.7001709938049316 }, { "auxiliary_loss_clip": 0.01068292, "auxiliary_loss_mlp": 0.00754718, "balance_loss_clip": 1.01676297, "balance_loss_mlp": 1.0004065, "epoch": 0.47952864786869476, "flos": 65358175708800.0, "grad_norm": 0.7268700870409951, "language_loss": 0.60194296, "learning_rate": 2.229291354820272e-06, "loss": 0.6201731, "num_input_tokens_seen": 85821305, "step": 3988, "time_per_iteration": 3.2397308349609375 }, { "auxiliary_loss_clip": 0.01170038, "auxiliary_loss_mlp": 0.01029497, "balance_loss_clip": 1.05021763, "balance_loss_mlp": 1.02085423, "epoch": 0.47964889075933387, "flos": 16799802336000.0, "grad_norm": 2.1381829544750994, "language_loss": 0.76049715, "learning_rate": 2.228517501991828e-06, "loss": 0.78249252, "num_input_tokens_seen": 85840105, "step": 3989, "time_per_iteration": 3.608168840408325 }, { "auxiliary_loss_clip": 0.01058869, "auxiliary_loss_mlp": 0.01002994, "balance_loss_clip": 1.01711607, "balance_loss_mlp": 1.00132549, "epoch": 0.4797691336499729, "flos": 70079244808320.0, "grad_norm": 0.8109597628611122, "language_loss": 0.61052966, "learning_rate": 2.22774361449746e-06, "loss": 0.63114834, "num_input_tokens_seen": 85896585, "step": 3990, "time_per_iteration": 4.1977458000183105 }, { "auxiliary_loss_clip": 0.01111621, "auxiliary_loss_mlp": 0.01026588, "balance_loss_clip": 1.04346871, "balance_loss_mlp": 1.01833868, "epoch": 0.47988937654061203, "flos": 18953329317120.0, "grad_norm": 5.23016769848664, "language_loss": 0.70943284, "learning_rate": 2.2269696924545668e-06, "loss": 0.73081493, "num_input_tokens_seen": 85914415, "step": 3991, "time_per_iteration": 2.739497661590576 }, { "auxiliary_loss_clip": 0.01142637, "auxiliary_loss_mlp": 0.01026486, "balance_loss_clip": 1.0499649, "balance_loss_mlp": 1.018821, "epoch": 0.48000961943125114, "flos": 14461981649280.0, "grad_norm": 2.14403107632961, "language_loss": 0.78333652, "learning_rate": 2.2261957359805523e-06, "loss": 0.80502772, "num_input_tokens_seen": 85931650, "step": 3992, "time_per_iteration": 3.634110927581787 }, { "auxiliary_loss_clip": 0.01184436, "auxiliary_loss_mlp": 0.01029684, "balance_loss_clip": 1.05243218, "balance_loss_mlp": 1.02126753, "epoch": 0.4801298623218902, "flos": 27051149105280.0, "grad_norm": 1.973302859308407, "language_loss": 0.73703575, "learning_rate": 2.225421745192823e-06, "loss": 0.75917697, "num_input_tokens_seen": 85951805, "step": 3993, "time_per_iteration": 2.7539656162261963 }, { "auxiliary_loss_clip": 0.01168167, "auxiliary_loss_mlp": 0.01025787, "balance_loss_clip": 1.05122495, "balance_loss_mlp": 1.01758504, "epoch": 0.4802501052125293, "flos": 26355236031360.0, "grad_norm": 2.1761348384607913, "language_loss": 0.78555661, "learning_rate": 2.2246477202087955e-06, "loss": 0.80749619, "num_input_tokens_seen": 85972485, "step": 3994, "time_per_iteration": 3.605707883834839 }, { "auxiliary_loss_clip": 0.01153964, "auxiliary_loss_mlp": 0.0102613, "balance_loss_clip": 1.04631472, "balance_loss_mlp": 1.01794589, "epoch": 0.4803703481031684, "flos": 20993916960000.0, "grad_norm": 1.61016377580868, "language_loss": 0.83101672, "learning_rate": 2.223873661145887e-06, "loss": 0.85281765, "num_input_tokens_seen": 85992540, "step": 3995, "time_per_iteration": 2.804819107055664 }, { "auxiliary_loss_clip": 0.01154935, "auxiliary_loss_mlp": 0.00762709, "balance_loss_clip": 1.05184591, "balance_loss_mlp": 1.00023627, "epoch": 0.4804905909938075, "flos": 20703722981760.0, "grad_norm": 1.6291853471689601, "language_loss": 0.71206868, "learning_rate": 2.2230995681215226e-06, "loss": 0.7312451, "num_input_tokens_seen": 86012065, "step": 3996, "time_per_iteration": 2.6791675090789795 }, { "auxiliary_loss_clip": 0.01139133, "auxiliary_loss_mlp": 0.01027505, "balance_loss_clip": 1.04786348, "balance_loss_mlp": 1.01867783, "epoch": 0.4806108338844466, "flos": 16654831044480.0, "grad_norm": 2.390346104680326, "language_loss": 0.78125226, "learning_rate": 2.2223254412531305e-06, "loss": 0.80291861, "num_input_tokens_seen": 86029435, "step": 3997, "time_per_iteration": 2.6908318996429443 }, { "auxiliary_loss_clip": 0.01140905, "auxiliary_loss_mlp": 0.01023098, "balance_loss_clip": 1.04474521, "balance_loss_mlp": 1.01552272, "epoch": 0.4807310767750857, "flos": 20011329440640.0, "grad_norm": 1.8011850013812505, "language_loss": 0.82283044, "learning_rate": 2.221551280658146e-06, "loss": 0.84447044, "num_input_tokens_seen": 86048495, "step": 3998, "time_per_iteration": 2.680685520172119 }, { "auxiliary_loss_clip": 0.01122723, "auxiliary_loss_mlp": 0.01028304, "balance_loss_clip": 1.04557693, "balance_loss_mlp": 1.01973891, "epoch": 0.48085131966572475, "flos": 23185257984000.0, "grad_norm": 1.6444677118650723, "language_loss": 0.7424823, "learning_rate": 2.2207770864540085e-06, "loss": 0.76399267, "num_input_tokens_seen": 86067470, "step": 3999, "time_per_iteration": 2.7773759365081787 }, { "auxiliary_loss_clip": 0.01149662, "auxiliary_loss_mlp": 0.01031644, "balance_loss_clip": 1.04711819, "balance_loss_mlp": 1.02319252, "epoch": 0.48097156255636386, "flos": 20558643949440.0, "grad_norm": 1.7493508689121668, "language_loss": 0.73431826, "learning_rate": 2.220002858758162e-06, "loss": 0.75613129, "num_input_tokens_seen": 86085460, "step": 4000, "time_per_iteration": 2.7083816528320312 }, { "auxiliary_loss_clip": 0.01073497, "auxiliary_loss_mlp": 0.01003648, "balance_loss_clip": 1.01593125, "balance_loss_mlp": 1.00190783, "epoch": 0.481091805447003, "flos": 70511608817280.0, "grad_norm": 0.8872232156697745, "language_loss": 0.60895783, "learning_rate": 2.2192285976880573e-06, "loss": 0.62972927, "num_input_tokens_seen": 86149715, "step": 4001, "time_per_iteration": 3.266961097717285 }, { "auxiliary_loss_clip": 0.0114032, "auxiliary_loss_mlp": 0.00762086, "balance_loss_clip": 1.04431581, "balance_loss_mlp": 1.00026989, "epoch": 0.48121204833764203, "flos": 36428214839040.0, "grad_norm": 1.5587059273603099, "language_loss": 0.80702293, "learning_rate": 2.2184543033611485e-06, "loss": 0.826047, "num_input_tokens_seen": 86170795, "step": 4002, "time_per_iteration": 2.802813768386841 }, { "auxiliary_loss_clip": 0.01169445, "auxiliary_loss_mlp": 0.01022261, "balance_loss_clip": 1.04850149, "balance_loss_mlp": 1.01427388, "epoch": 0.48133229122828114, "flos": 27490264871040.0, "grad_norm": 2.5294114712240767, "language_loss": 0.81753689, "learning_rate": 2.2176799758948957e-06, "loss": 0.83945394, "num_input_tokens_seen": 86190955, "step": 4003, "time_per_iteration": 2.6529674530029297 }, { "auxiliary_loss_clip": 0.01151055, "auxiliary_loss_mlp": 0.01031331, "balance_loss_clip": 1.04929805, "balance_loss_mlp": 1.02317131, "epoch": 0.4814525341189202, "flos": 43072802179200.0, "grad_norm": 1.7445863543665725, "language_loss": 0.73163056, "learning_rate": 2.2169056154067635e-06, "loss": 0.75345445, "num_input_tokens_seen": 86214875, "step": 4004, "time_per_iteration": 2.8670263290405273 }, { "auxiliary_loss_clip": 0.01170415, "auxiliary_loss_mlp": 0.00762708, "balance_loss_clip": 1.051373, "balance_loss_mlp": 1.0002569, "epoch": 0.4815727770095593, "flos": 24236901400320.0, "grad_norm": 2.123498959401769, "language_loss": 0.82638252, "learning_rate": 2.216131222014222e-06, "loss": 0.84571379, "num_input_tokens_seen": 86232950, "step": 4005, "time_per_iteration": 2.5868144035339355 }, { "auxiliary_loss_clip": 0.01133225, "auxiliary_loss_mlp": 0.01026684, "balance_loss_clip": 1.04617095, "balance_loss_mlp": 1.01844072, "epoch": 0.4816930199001984, "flos": 18113630100480.0, "grad_norm": 1.8986730814434014, "language_loss": 0.80653071, "learning_rate": 2.2153567958347455e-06, "loss": 0.82812983, "num_input_tokens_seen": 86249160, "step": 4006, "time_per_iteration": 2.5640811920166016 }, { "auxiliary_loss_clip": 0.01155019, "auxiliary_loss_mlp": 0.01028073, "balance_loss_clip": 1.05075264, "balance_loss_mlp": 1.02035451, "epoch": 0.48181326279083747, "flos": 17274720983040.0, "grad_norm": 2.6420638123502944, "language_loss": 0.79467738, "learning_rate": 2.214582336985815e-06, "loss": 0.81650829, "num_input_tokens_seen": 86267060, "step": 4007, "time_per_iteration": 2.6288905143737793 }, { "auxiliary_loss_clip": 0.01144023, "auxiliary_loss_mlp": 0.01027914, "balance_loss_clip": 1.04482698, "balance_loss_mlp": 1.01980209, "epoch": 0.4819335056814766, "flos": 14903252231040.0, "grad_norm": 2.1000046605165297, "language_loss": 0.66474211, "learning_rate": 2.2138078455849142e-06, "loss": 0.68646145, "num_input_tokens_seen": 86285055, "step": 4008, "time_per_iteration": 2.5176854133605957 }, { "auxiliary_loss_clip": 0.01175884, "auxiliary_loss_mlp": 0.01030294, "balance_loss_clip": 1.05224943, "balance_loss_mlp": 1.0228374, "epoch": 0.4820537485721157, "flos": 19244888012160.0, "grad_norm": 1.880428823616472, "language_loss": 0.78999841, "learning_rate": 2.2130333217495334e-06, "loss": 0.81206024, "num_input_tokens_seen": 86304225, "step": 4009, "time_per_iteration": 2.668553590774536 }, { "auxiliary_loss_clip": 0.01151427, "auxiliary_loss_mlp": 0.01026201, "balance_loss_clip": 1.04801369, "balance_loss_mlp": 1.0180943, "epoch": 0.48217399146275475, "flos": 16033791870720.0, "grad_norm": 2.644887090549291, "language_loss": 0.68096864, "learning_rate": 2.2122587655971665e-06, "loss": 0.7027449, "num_input_tokens_seen": 86319170, "step": 4010, "time_per_iteration": 2.6296064853668213 }, { "auxiliary_loss_clip": 0.01157034, "auxiliary_loss_mlp": 0.01028644, "balance_loss_clip": 1.04918635, "balance_loss_mlp": 1.02071691, "epoch": 0.48229423435339386, "flos": 24134197438080.0, "grad_norm": 1.816593394125063, "language_loss": 0.64175367, "learning_rate": 2.211484177245314e-06, "loss": 0.66361046, "num_input_tokens_seen": 86338760, "step": 4011, "time_per_iteration": 2.6761059761047363 }, { "auxiliary_loss_clip": 0.01184775, "auxiliary_loss_mlp": 0.01025944, "balance_loss_clip": 1.0511173, "balance_loss_mlp": 1.01739073, "epoch": 0.48241447724403297, "flos": 23805435231360.0, "grad_norm": 2.0857295090690062, "language_loss": 0.72368407, "learning_rate": 2.21070955681148e-06, "loss": 0.74579126, "num_input_tokens_seen": 86357865, "step": 4012, "time_per_iteration": 2.5849668979644775 }, { "auxiliary_loss_clip": 0.01132483, "auxiliary_loss_mlp": 0.01031909, "balance_loss_clip": 1.04665709, "balance_loss_mlp": 1.02424335, "epoch": 0.482534720134672, "flos": 23110312256640.0, "grad_norm": 1.5864890525494755, "language_loss": 0.78041464, "learning_rate": 2.209934904413174e-06, "loss": 0.80205858, "num_input_tokens_seen": 86379470, "step": 4013, "time_per_iteration": 2.688859224319458 }, { "auxiliary_loss_clip": 0.01109082, "auxiliary_loss_mlp": 0.01028358, "balance_loss_clip": 1.03779888, "balance_loss_mlp": 1.02011514, "epoch": 0.48265496302531113, "flos": 20923819568640.0, "grad_norm": 2.148816583160264, "language_loss": 0.71608949, "learning_rate": 2.2091602201679095e-06, "loss": 0.73746389, "num_input_tokens_seen": 86399080, "step": 4014, "time_per_iteration": 2.77426815032959 }, { "auxiliary_loss_clip": 0.01145963, "auxiliary_loss_mlp": 0.01028677, "balance_loss_clip": 1.04844892, "balance_loss_mlp": 1.02033818, "epoch": 0.48277520591595025, "flos": 15231152511360.0, "grad_norm": 2.2333877824979167, "language_loss": 0.83488876, "learning_rate": 2.208385504193206e-06, "loss": 0.85663509, "num_input_tokens_seen": 86416580, "step": 4015, "time_per_iteration": 3.628854274749756 }, { "auxiliary_loss_clip": 0.01183848, "auxiliary_loss_mlp": 0.01023553, "balance_loss_clip": 1.04995871, "balance_loss_mlp": 1.01628768, "epoch": 0.4828954488065893, "flos": 17858664385920.0, "grad_norm": 2.583067409931057, "language_loss": 0.81409872, "learning_rate": 2.2076107566065873e-06, "loss": 0.8361727, "num_input_tokens_seen": 86434365, "step": 4016, "time_per_iteration": 3.478276252746582 }, { "auxiliary_loss_clip": 0.01173135, "auxiliary_loss_mlp": 0.01030578, "balance_loss_clip": 1.0499928, "balance_loss_mlp": 1.02276385, "epoch": 0.4830156916972284, "flos": 32087405070720.0, "grad_norm": 2.216864701325059, "language_loss": 0.75245148, "learning_rate": 2.2068359775255816e-06, "loss": 0.77448863, "num_input_tokens_seen": 86452675, "step": 4017, "time_per_iteration": 2.682450294494629 }, { "auxiliary_loss_clip": 0.01123514, "auxiliary_loss_mlp": 0.01025812, "balance_loss_clip": 1.04416704, "balance_loss_mlp": 1.01799202, "epoch": 0.48313593458786747, "flos": 21871717528320.0, "grad_norm": 4.301178459590197, "language_loss": 0.78650141, "learning_rate": 2.206061167067723e-06, "loss": 0.8079946, "num_input_tokens_seen": 86470785, "step": 4018, "time_per_iteration": 2.784757137298584 }, { "auxiliary_loss_clip": 0.01135636, "auxiliary_loss_mlp": 0.01030051, "balance_loss_clip": 1.04248333, "balance_loss_mlp": 1.02162921, "epoch": 0.4832561774785066, "flos": 22601206840320.0, "grad_norm": 2.854682435732161, "language_loss": 0.80168176, "learning_rate": 2.205286325350549e-06, "loss": 0.82333869, "num_input_tokens_seen": 86489850, "step": 4019, "time_per_iteration": 3.64235258102417 }, { "auxiliary_loss_clip": 0.0112618, "auxiliary_loss_mlp": 0.01030847, "balance_loss_clip": 1.04608572, "balance_loss_mlp": 1.02312207, "epoch": 0.4833764203691457, "flos": 13437342282240.0, "grad_norm": 2.087837826919939, "language_loss": 0.72712815, "learning_rate": 2.204511452491603e-06, "loss": 0.74869847, "num_input_tokens_seen": 86506475, "step": 4020, "time_per_iteration": 3.5980746746063232 }, { "auxiliary_loss_clip": 0.01181829, "auxiliary_loss_mlp": 0.01027554, "balance_loss_clip": 1.05286002, "balance_loss_mlp": 1.01978731, "epoch": 0.48349666325978474, "flos": 44128036955520.0, "grad_norm": 1.7214661952583437, "language_loss": 0.74762595, "learning_rate": 2.2037365486084316e-06, "loss": 0.76971984, "num_input_tokens_seen": 86529715, "step": 4021, "time_per_iteration": 2.8566391468048096 }, { "auxiliary_loss_clip": 0.01149325, "auxiliary_loss_mlp": 0.01027313, "balance_loss_clip": 1.04504895, "balance_loss_mlp": 1.01912344, "epoch": 0.48361690615042385, "flos": 26028377245440.0, "grad_norm": 2.5046338339307934, "language_loss": 0.78371829, "learning_rate": 2.2029616138185886e-06, "loss": 0.80548465, "num_input_tokens_seen": 86548715, "step": 4022, "time_per_iteration": 2.7638494968414307 }, { "auxiliary_loss_clip": 0.01138271, "auxiliary_loss_mlp": 0.01024333, "balance_loss_clip": 1.04861593, "balance_loss_mlp": 1.01652503, "epoch": 0.48373714904106296, "flos": 22273306560000.0, "grad_norm": 1.8705726760247465, "language_loss": 0.83069181, "learning_rate": 2.202186648239629e-06, "loss": 0.85231793, "num_input_tokens_seen": 86568650, "step": 4023, "time_per_iteration": 2.734017848968506 }, { "auxiliary_loss_clip": 0.01166767, "auxiliary_loss_mlp": 0.01023059, "balance_loss_clip": 1.04962552, "balance_loss_mlp": 1.01555502, "epoch": 0.483857391931702, "flos": 28292293699200.0, "grad_norm": 3.5257314723457016, "language_loss": 0.71644694, "learning_rate": 2.201411651989117e-06, "loss": 0.73834521, "num_input_tokens_seen": 86590630, "step": 4024, "time_per_iteration": 2.6969945430755615 }, { "auxiliary_loss_clip": 0.01153938, "auxiliary_loss_mlp": 0.00762983, "balance_loss_clip": 1.04872775, "balance_loss_mlp": 1.00028372, "epoch": 0.48397763482234113, "flos": 27418048577280.0, "grad_norm": 1.7917837161341128, "language_loss": 0.78060699, "learning_rate": 2.2006366251846167e-06, "loss": 0.7997762, "num_input_tokens_seen": 86611270, "step": 4025, "time_per_iteration": 2.7299904823303223 }, { "auxiliary_loss_clip": 0.01156554, "auxiliary_loss_mlp": 0.01033388, "balance_loss_clip": 1.05237484, "balance_loss_mlp": 1.02566338, "epoch": 0.48409787771298024, "flos": 16797252470400.0, "grad_norm": 1.8080034993995413, "language_loss": 0.75508255, "learning_rate": 2.1998615679436997e-06, "loss": 0.77698195, "num_input_tokens_seen": 86628810, "step": 4026, "time_per_iteration": 2.658656358718872 }, { "auxiliary_loss_clip": 0.01161764, "auxiliary_loss_mlp": 0.01026232, "balance_loss_clip": 1.04856467, "balance_loss_mlp": 1.01775575, "epoch": 0.4842181206036193, "flos": 25083496028160.0, "grad_norm": 2.537386682040976, "language_loss": 0.77197433, "learning_rate": 2.199086480383942e-06, "loss": 0.7938543, "num_input_tokens_seen": 86648185, "step": 4027, "time_per_iteration": 2.7470242977142334 }, { "auxiliary_loss_clip": 0.01171786, "auxiliary_loss_mlp": 0.01030089, "balance_loss_clip": 1.05187488, "balance_loss_mlp": 1.02081442, "epoch": 0.4843383634942584, "flos": 30372311496960.0, "grad_norm": 2.6421168180202974, "language_loss": 0.68155146, "learning_rate": 2.1983113626229234e-06, "loss": 0.70357025, "num_input_tokens_seen": 86667435, "step": 4028, "time_per_iteration": 2.7181034088134766 }, { "auxiliary_loss_clip": 0.01131626, "auxiliary_loss_mlp": 0.00763559, "balance_loss_clip": 1.04280436, "balance_loss_mlp": 1.00027466, "epoch": 0.4844586063848975, "flos": 20413564917120.0, "grad_norm": 1.6596570122240712, "language_loss": 0.78284335, "learning_rate": 2.1975362147782293e-06, "loss": 0.80179524, "num_input_tokens_seen": 86686630, "step": 4029, "time_per_iteration": 2.739306688308716 }, { "auxiliary_loss_clip": 0.01074319, "auxiliary_loss_mlp": 0.01004561, "balance_loss_clip": 1.02954292, "balance_loss_mlp": 1.00290382, "epoch": 0.48457884927553657, "flos": 70303722854400.0, "grad_norm": 0.6930981595139819, "language_loss": 0.54107738, "learning_rate": 2.196761036967448e-06, "loss": 0.56186616, "num_input_tokens_seen": 86754595, "step": 4030, "time_per_iteration": 3.435328483581543 }, { "auxiliary_loss_clip": 0.01163024, "auxiliary_loss_mlp": 0.01030579, "balance_loss_clip": 1.04795265, "balance_loss_mlp": 1.02317941, "epoch": 0.4846990921661757, "flos": 19934516206080.0, "grad_norm": 1.6577480761146417, "language_loss": 0.77309442, "learning_rate": 2.1959858293081743e-06, "loss": 0.79503036, "num_input_tokens_seen": 86773730, "step": 4031, "time_per_iteration": 2.5975069999694824 }, { "auxiliary_loss_clip": 0.01138498, "auxiliary_loss_mlp": 0.0103273, "balance_loss_clip": 1.04771006, "balance_loss_mlp": 1.02427173, "epoch": 0.4848193350568148, "flos": 23075945919360.0, "grad_norm": 1.5805565529417993, "language_loss": 0.75720906, "learning_rate": 2.1952105919180056e-06, "loss": 0.77892137, "num_input_tokens_seen": 86792985, "step": 4032, "time_per_iteration": 2.713076114654541 }, { "auxiliary_loss_clip": 0.01154359, "auxiliary_loss_mlp": 0.01027989, "balance_loss_clip": 1.05120957, "balance_loss_mlp": 1.01951325, "epoch": 0.48493957794745385, "flos": 22455481363200.0, "grad_norm": 2.930879580591001, "language_loss": 0.68032241, "learning_rate": 2.1944353249145456e-06, "loss": 0.70214593, "num_input_tokens_seen": 86812095, "step": 4033, "time_per_iteration": 2.7157299518585205 }, { "auxiliary_loss_clip": 0.01184976, "auxiliary_loss_mlp": 0.0102747, "balance_loss_clip": 1.05496073, "balance_loss_mlp": 1.01978731, "epoch": 0.48505982083809296, "flos": 25046112948480.0, "grad_norm": 1.6198906433400748, "language_loss": 0.74641109, "learning_rate": 2.193660028415401e-06, "loss": 0.76853549, "num_input_tokens_seen": 86832875, "step": 4034, "time_per_iteration": 2.64768385887146 }, { "auxiliary_loss_clip": 0.011466, "auxiliary_loss_mlp": 0.01024941, "balance_loss_clip": 1.04765022, "balance_loss_mlp": 1.01713252, "epoch": 0.485180063728732, "flos": 26761386090240.0, "grad_norm": 2.4037668526284928, "language_loss": 0.82192504, "learning_rate": 2.1928847025381852e-06, "loss": 0.84364045, "num_input_tokens_seen": 86853480, "step": 4035, "time_per_iteration": 2.7970049381256104 }, { "auxiliary_loss_clip": 0.01166532, "auxiliary_loss_mlp": 0.01032092, "balance_loss_clip": 1.04614067, "balance_loss_mlp": 1.02360404, "epoch": 0.4853003066193711, "flos": 24059143969920.0, "grad_norm": 1.7428120733805368, "language_loss": 0.83818257, "learning_rate": 2.192109347400512e-06, "loss": 0.86016881, "num_input_tokens_seen": 86873695, "step": 4036, "time_per_iteration": 2.659339666366577 }, { "auxiliary_loss_clip": 0.01157156, "auxiliary_loss_mlp": 0.01031771, "balance_loss_clip": 1.04878485, "balance_loss_mlp": 1.02308011, "epoch": 0.48542054951001024, "flos": 23076376882560.0, "grad_norm": 1.9160880132097506, "language_loss": 0.79171813, "learning_rate": 2.191333963120004e-06, "loss": 0.81360739, "num_input_tokens_seen": 86892675, "step": 4037, "time_per_iteration": 2.692756414413452 }, { "auxiliary_loss_clip": 0.01151619, "auxiliary_loss_mlp": 0.01024837, "balance_loss_clip": 1.04755783, "balance_loss_mlp": 1.01716614, "epoch": 0.4855407924006493, "flos": 25664889565440.0, "grad_norm": 3.853883549504259, "language_loss": 0.70284164, "learning_rate": 2.190558549814286e-06, "loss": 0.72460616, "num_input_tokens_seen": 86912835, "step": 4038, "time_per_iteration": 2.6974172592163086 }, { "auxiliary_loss_clip": 0.01151641, "auxiliary_loss_mlp": 0.01030727, "balance_loss_clip": 1.04630065, "balance_loss_mlp": 1.02282906, "epoch": 0.4856610352912884, "flos": 23987933256960.0, "grad_norm": 1.7107355179224146, "language_loss": 0.79485536, "learning_rate": 2.1897831076009872e-06, "loss": 0.816679, "num_input_tokens_seen": 86932475, "step": 4039, "time_per_iteration": 2.6743619441986084 }, { "auxiliary_loss_clip": 0.01171385, "auxiliary_loss_mlp": 0.01024287, "balance_loss_clip": 1.05165291, "balance_loss_mlp": 1.01706564, "epoch": 0.4857812781819275, "flos": 24096814358400.0, "grad_norm": 2.172559458236493, "language_loss": 0.79621327, "learning_rate": 2.1890076365977426e-06, "loss": 0.81816995, "num_input_tokens_seen": 86952300, "step": 4040, "time_per_iteration": 2.7648849487304688 }, { "auxiliary_loss_clip": 0.0105524, "auxiliary_loss_mlp": 0.01000906, "balance_loss_clip": 1.01220798, "balance_loss_mlp": 0.99926108, "epoch": 0.48590152107256657, "flos": 56266635185280.0, "grad_norm": 0.8508634600409317, "language_loss": 0.52804589, "learning_rate": 2.188232136922189e-06, "loss": 0.54860735, "num_input_tokens_seen": 87010420, "step": 4041, "time_per_iteration": 4.139225006103516 }, { "auxiliary_loss_clip": 0.01104808, "auxiliary_loss_mlp": 0.0102651, "balance_loss_clip": 1.04241312, "balance_loss_mlp": 1.01818943, "epoch": 0.4860217639632057, "flos": 20046988667520.0, "grad_norm": 2.8871550260839935, "language_loss": 0.75552833, "learning_rate": 2.187456608691971e-06, "loss": 0.77684152, "num_input_tokens_seen": 87029295, "step": 4042, "time_per_iteration": 3.672436237335205 }, { "auxiliary_loss_clip": 0.0114352, "auxiliary_loss_mlp": 0.01030332, "balance_loss_clip": 1.04946256, "balance_loss_mlp": 1.02271485, "epoch": 0.4861420068538448, "flos": 17822143232640.0, "grad_norm": 2.15221582687372, "language_loss": 0.87751949, "learning_rate": 2.1866810520247334e-06, "loss": 0.89925802, "num_input_tokens_seen": 87048165, "step": 4043, "time_per_iteration": 2.6838765144348145 }, { "auxiliary_loss_clip": 0.01171221, "auxiliary_loss_mlp": 0.01024632, "balance_loss_clip": 1.04741716, "balance_loss_mlp": 1.01635909, "epoch": 0.48626224974448384, "flos": 26250125857920.0, "grad_norm": 1.8724644620978044, "language_loss": 0.65077388, "learning_rate": 2.185905467038129e-06, "loss": 0.67273241, "num_input_tokens_seen": 87067070, "step": 4044, "time_per_iteration": 3.5988035202026367 }, { "auxiliary_loss_clip": 0.01181624, "auxiliary_loss_mlp": 0.01030943, "balance_loss_clip": 1.05307865, "balance_loss_mlp": 1.02295041, "epoch": 0.48638249263512295, "flos": 22054502862720.0, "grad_norm": 1.7471524914703611, "language_loss": 0.77643627, "learning_rate": 2.1851298538498127e-06, "loss": 0.79856193, "num_input_tokens_seen": 87086785, "step": 4045, "time_per_iteration": 3.494997262954712 }, { "auxiliary_loss_clip": 0.01178681, "auxiliary_loss_mlp": 0.00763714, "balance_loss_clip": 1.05436838, "balance_loss_mlp": 1.00034046, "epoch": 0.48650273552576206, "flos": 25119945354240.0, "grad_norm": 2.132679526766794, "language_loss": 0.80038488, "learning_rate": 2.184354212577446e-06, "loss": 0.81980884, "num_input_tokens_seen": 87107090, "step": 4046, "time_per_iteration": 2.6700541973114014 }, { "auxiliary_loss_clip": 0.01186008, "auxiliary_loss_mlp": 0.0103876, "balance_loss_clip": 1.05081666, "balance_loss_mlp": 1.02995634, "epoch": 0.4866229784164011, "flos": 17456931699840.0, "grad_norm": 3.0611790694552843, "language_loss": 0.62983227, "learning_rate": 2.1835785433386907e-06, "loss": 0.65207994, "num_input_tokens_seen": 87125905, "step": 4047, "time_per_iteration": 2.5410852432250977 }, { "auxiliary_loss_clip": 0.01131198, "auxiliary_loss_mlp": 0.01028811, "balance_loss_clip": 1.04712653, "balance_loss_mlp": 1.02106214, "epoch": 0.48674322130704023, "flos": 23331127115520.0, "grad_norm": 1.8452014127932634, "language_loss": 0.65748721, "learning_rate": 2.182802846251216e-06, "loss": 0.67908728, "num_input_tokens_seen": 87146175, "step": 4048, "time_per_iteration": 2.7243688106536865 }, { "auxiliary_loss_clip": 0.01142269, "auxiliary_loss_mlp": 0.01032323, "balance_loss_clip": 1.04423761, "balance_loss_mlp": 1.02371013, "epoch": 0.4868634641976793, "flos": 28804344030720.0, "grad_norm": 2.10343445917185, "language_loss": 0.72273695, "learning_rate": 2.182027121432696e-06, "loss": 0.74448287, "num_input_tokens_seen": 87166800, "step": 4049, "time_per_iteration": 2.7725679874420166 }, { "auxiliary_loss_clip": 0.01186364, "auxiliary_loss_mlp": 0.01032213, "balance_loss_clip": 1.05266011, "balance_loss_mlp": 1.0240171, "epoch": 0.4869837070883184, "flos": 19025976574080.0, "grad_norm": 2.387930326486471, "language_loss": 0.8237133, "learning_rate": 2.1812513690008054e-06, "loss": 0.84589899, "num_input_tokens_seen": 87185920, "step": 4050, "time_per_iteration": 2.645840883255005 }, { "auxiliary_loss_clip": 0.01175041, "auxiliary_loss_mlp": 0.01038297, "balance_loss_clip": 1.05133915, "balance_loss_mlp": 1.03019714, "epoch": 0.4871039499789575, "flos": 15121409483520.0, "grad_norm": 2.238659325994885, "language_loss": 0.80004877, "learning_rate": 2.180475589073227e-06, "loss": 0.82218218, "num_input_tokens_seen": 87203620, "step": 4051, "time_per_iteration": 2.67335844039917 }, { "auxiliary_loss_clip": 0.01158423, "auxiliary_loss_mlp": 0.01023688, "balance_loss_clip": 1.04617977, "balance_loss_mlp": 1.01591253, "epoch": 0.48722419286959656, "flos": 26174066808960.0, "grad_norm": 1.6307540998970063, "language_loss": 0.73240912, "learning_rate": 2.1796997817676456e-06, "loss": 0.75423026, "num_input_tokens_seen": 87224630, "step": 4052, "time_per_iteration": 2.661839246749878 }, { "auxiliary_loss_clip": 0.01169181, "auxiliary_loss_mlp": 0.00762449, "balance_loss_clip": 1.0495286, "balance_loss_mlp": 1.00037515, "epoch": 0.4873444357602357, "flos": 24026142349440.0, "grad_norm": 1.6739859958805676, "language_loss": 0.67673653, "learning_rate": 2.1789239472017494e-06, "loss": 0.69605279, "num_input_tokens_seen": 87246280, "step": 4053, "time_per_iteration": 2.6774613857269287 }, { "auxiliary_loss_clip": 0.01139192, "auxiliary_loss_mlp": 0.01031123, "balance_loss_clip": 1.04511666, "balance_loss_mlp": 1.02318394, "epoch": 0.4874646786508748, "flos": 22820441500800.0, "grad_norm": 2.2481442452957756, "language_loss": 0.73449916, "learning_rate": 2.1781480854932326e-06, "loss": 0.75620228, "num_input_tokens_seen": 87266045, "step": 4054, "time_per_iteration": 2.713327646255493 }, { "auxiliary_loss_clip": 0.01125139, "auxiliary_loss_mlp": 0.0102685, "balance_loss_clip": 1.04831004, "balance_loss_mlp": 1.01925039, "epoch": 0.48758492154151384, "flos": 21287594557440.0, "grad_norm": 1.870174536632044, "language_loss": 0.79251474, "learning_rate": 2.1773721967597933e-06, "loss": 0.81403458, "num_input_tokens_seen": 87284495, "step": 4055, "time_per_iteration": 2.7886545658111572 }, { "auxiliary_loss_clip": 0.01050493, "auxiliary_loss_mlp": 0.01004138, "balance_loss_clip": 1.0123229, "balance_loss_mlp": 1.00274372, "epoch": 0.48770516443215295, "flos": 62244109180800.0, "grad_norm": 2.0032302265915627, "language_loss": 0.57399118, "learning_rate": 2.1765962811191322e-06, "loss": 0.5945375, "num_input_tokens_seen": 87338960, "step": 4056, "time_per_iteration": 3.194542407989502 }, { "auxiliary_loss_clip": 0.01032762, "auxiliary_loss_mlp": 0.01009705, "balance_loss_clip": 1.01383853, "balance_loss_mlp": 1.00814342, "epoch": 0.48782540732279206, "flos": 66133451882880.0, "grad_norm": 0.8260342643057988, "language_loss": 0.61973536, "learning_rate": 2.1758203386889566e-06, "loss": 0.64015996, "num_input_tokens_seen": 87401730, "step": 4057, "time_per_iteration": 3.317780017852783 }, { "auxiliary_loss_clip": 0.01140614, "auxiliary_loss_mlp": 0.00763285, "balance_loss_clip": 1.04717565, "balance_loss_mlp": 1.00033641, "epoch": 0.4879456502134311, "flos": 14607922608000.0, "grad_norm": 2.0748690192236094, "language_loss": 0.84047055, "learning_rate": 2.1750443695869746e-06, "loss": 0.85950953, "num_input_tokens_seen": 87417300, "step": 4058, "time_per_iteration": 2.683527946472168 }, { "auxiliary_loss_clip": 0.01171917, "auxiliary_loss_mlp": 0.01027256, "balance_loss_clip": 1.04975867, "balance_loss_mlp": 1.01948309, "epoch": 0.4880658931040702, "flos": 19500464257920.0, "grad_norm": 4.2486047182720466, "language_loss": 0.8586669, "learning_rate": 2.174268373930901e-06, "loss": 0.88065869, "num_input_tokens_seen": 87434815, "step": 4059, "time_per_iteration": 2.7109627723693848 }, { "auxiliary_loss_clip": 0.01136301, "auxiliary_loss_mlp": 0.0076244, "balance_loss_clip": 1.04964387, "balance_loss_mlp": 1.00031102, "epoch": 0.48818613599470934, "flos": 16723060928640.0, "grad_norm": 2.424134868911688, "language_loss": 0.79826045, "learning_rate": 2.1734923518384537e-06, "loss": 0.81724787, "num_input_tokens_seen": 87451420, "step": 4060, "time_per_iteration": 2.6793675422668457 }, { "auxiliary_loss_clip": 0.01125834, "auxiliary_loss_mlp": 0.01026105, "balance_loss_clip": 1.04676449, "balance_loss_mlp": 1.01838613, "epoch": 0.4883063788853484, "flos": 26756932803840.0, "grad_norm": 1.9184589520619713, "language_loss": 0.82298797, "learning_rate": 2.1727163034273547e-06, "loss": 0.8445074, "num_input_tokens_seen": 87469585, "step": 4061, "time_per_iteration": 2.8239872455596924 }, { "auxiliary_loss_clip": 0.01169714, "auxiliary_loss_mlp": 0.01026837, "balance_loss_clip": 1.04814744, "balance_loss_mlp": 1.01856136, "epoch": 0.4884266217759875, "flos": 16763388923520.0, "grad_norm": 6.8719346175016245, "language_loss": 0.78837115, "learning_rate": 2.17194022881533e-06, "loss": 0.81033665, "num_input_tokens_seen": 87485675, "step": 4062, "time_per_iteration": 2.6239242553710938 }, { "auxiliary_loss_clip": 0.01158036, "auxiliary_loss_mlp": 0.01032839, "balance_loss_clip": 1.04695904, "balance_loss_mlp": 1.0241667, "epoch": 0.4885468646666266, "flos": 24207132003840.0, "grad_norm": 1.7972658523394556, "language_loss": 0.67880821, "learning_rate": 2.1711641281201092e-06, "loss": 0.70071697, "num_input_tokens_seen": 87505605, "step": 4063, "time_per_iteration": 2.725346565246582 }, { "auxiliary_loss_clip": 0.01168056, "auxiliary_loss_mlp": 0.01022814, "balance_loss_clip": 1.05154872, "balance_loss_mlp": 1.01516128, "epoch": 0.48866710755726567, "flos": 14610795696000.0, "grad_norm": 2.780857309557539, "language_loss": 0.79819584, "learning_rate": 2.1703880014594264e-06, "loss": 0.82010454, "num_input_tokens_seen": 87523195, "step": 4064, "time_per_iteration": 2.6397337913513184 }, { "auxiliary_loss_clip": 0.01122702, "auxiliary_loss_mlp": 0.01029489, "balance_loss_clip": 1.04771185, "balance_loss_mlp": 1.02193105, "epoch": 0.4887873504479048, "flos": 28804451771520.0, "grad_norm": 2.772972175880441, "language_loss": 0.73886847, "learning_rate": 2.1696118489510182e-06, "loss": 0.76039034, "num_input_tokens_seen": 87544125, "step": 4065, "time_per_iteration": 2.822624683380127 }, { "auxiliary_loss_clip": 0.01144688, "auxiliary_loss_mlp": 0.00762914, "balance_loss_clip": 1.04765344, "balance_loss_mlp": 1.00032198, "epoch": 0.48890759333854383, "flos": 22784387224320.0, "grad_norm": 1.7850552757384905, "language_loss": 0.72593641, "learning_rate": 2.1688356707126286e-06, "loss": 0.7450124, "num_input_tokens_seen": 87563745, "step": 4066, "time_per_iteration": 2.768155097961426 }, { "auxiliary_loss_clip": 0.01135444, "auxiliary_loss_mlp": 0.01032844, "balance_loss_clip": 1.04867554, "balance_loss_mlp": 1.02420151, "epoch": 0.48902783622918294, "flos": 17786088956160.0, "grad_norm": 2.0723441366417172, "language_loss": 0.69946212, "learning_rate": 2.168059466862001e-06, "loss": 0.72114503, "num_input_tokens_seen": 87581895, "step": 4067, "time_per_iteration": 3.64701771736145 }, { "auxiliary_loss_clip": 0.01156406, "auxiliary_loss_mlp": 0.01025681, "balance_loss_clip": 1.04686594, "balance_loss_mlp": 1.01837993, "epoch": 0.48914807911982205, "flos": 22310294590080.0, "grad_norm": 2.1219998122716723, "language_loss": 0.81901062, "learning_rate": 2.167283237516887e-06, "loss": 0.84083152, "num_input_tokens_seen": 87600170, "step": 4068, "time_per_iteration": 3.649001359939575 }, { "auxiliary_loss_clip": 0.01157933, "auxiliary_loss_mlp": 0.01030879, "balance_loss_clip": 1.04709888, "balance_loss_mlp": 1.02265406, "epoch": 0.4892683220104611, "flos": 16363020954240.0, "grad_norm": 2.0843105894751806, "language_loss": 0.74571669, "learning_rate": 2.1665069827950383e-06, "loss": 0.76760483, "num_input_tokens_seen": 87617455, "step": 4069, "time_per_iteration": 2.614064931869507 }, { "auxiliary_loss_clip": 0.0115021, "auxiliary_loss_mlp": 0.01032391, "balance_loss_clip": 1.04688334, "balance_loss_mlp": 1.02505946, "epoch": 0.4893885649011002, "flos": 15739144606080.0, "grad_norm": 4.776150042076431, "language_loss": 0.8672033, "learning_rate": 2.1657307028142126e-06, "loss": 0.88902926, "num_input_tokens_seen": 87634995, "step": 4070, "time_per_iteration": 3.5018348693847656 }, { "auxiliary_loss_clip": 0.01158114, "auxiliary_loss_mlp": 0.01029572, "balance_loss_clip": 1.05038381, "balance_loss_mlp": 1.02149808, "epoch": 0.48950880779173933, "flos": 28581984887040.0, "grad_norm": 2.660467727163867, "language_loss": 0.67480755, "learning_rate": 2.164954397692171e-06, "loss": 0.69668442, "num_input_tokens_seen": 87654420, "step": 4071, "time_per_iteration": 2.776988983154297 }, { "auxiliary_loss_clip": 0.01063568, "auxiliary_loss_mlp": 0.010024, "balance_loss_clip": 1.01569402, "balance_loss_mlp": 1.00092745, "epoch": 0.4896290506823784, "flos": 66186310746240.0, "grad_norm": 1.0706712495271482, "language_loss": 0.77326202, "learning_rate": 2.164178067546678e-06, "loss": 0.79392171, "num_input_tokens_seen": 87713585, "step": 4072, "time_per_iteration": 4.24094820022583 }, { "auxiliary_loss_clip": 0.01159782, "auxiliary_loss_mlp": 0.01027301, "balance_loss_clip": 1.04669738, "balance_loss_mlp": 1.01969528, "epoch": 0.4897492935730175, "flos": 12531065207040.0, "grad_norm": 1.8638707318715113, "language_loss": 0.90844709, "learning_rate": 2.163401712495504e-06, "loss": 0.93031794, "num_input_tokens_seen": 87731280, "step": 4073, "time_per_iteration": 2.6344895362854004 }, { "auxiliary_loss_clip": 0.01132353, "auxiliary_loss_mlp": 0.0103026, "balance_loss_clip": 1.04846847, "balance_loss_mlp": 1.02235055, "epoch": 0.4898695364636566, "flos": 23476816679040.0, "grad_norm": 1.710853189510822, "language_loss": 0.79091293, "learning_rate": 2.1626253326564194e-06, "loss": 0.81253904, "num_input_tokens_seen": 87750230, "step": 4074, "time_per_iteration": 2.7451934814453125 }, { "auxiliary_loss_clip": 0.01152877, "auxiliary_loss_mlp": 0.01029501, "balance_loss_clip": 1.04649282, "balance_loss_mlp": 1.02090561, "epoch": 0.48998977935429566, "flos": 27160209774720.0, "grad_norm": 1.9130167945045482, "language_loss": 0.7686376, "learning_rate": 2.161848928147201e-06, "loss": 0.79046136, "num_input_tokens_seen": 87770500, "step": 4075, "time_per_iteration": 2.6714491844177246 }, { "auxiliary_loss_clip": 0.01170404, "auxiliary_loss_mlp": 0.01030441, "balance_loss_clip": 1.05107105, "balance_loss_mlp": 1.02219176, "epoch": 0.4901100222449348, "flos": 20339588856960.0, "grad_norm": 2.034902183705985, "language_loss": 0.80558813, "learning_rate": 2.161072499085629e-06, "loss": 0.82759655, "num_input_tokens_seen": 87789495, "step": 4076, "time_per_iteration": 2.6522562503814697 }, { "auxiliary_loss_clip": 0.01143228, "auxiliary_loss_mlp": 0.01038525, "balance_loss_clip": 1.04693568, "balance_loss_mlp": 1.03016281, "epoch": 0.4902302651355739, "flos": 30446359384320.0, "grad_norm": 1.6363658860335009, "language_loss": 0.83440924, "learning_rate": 2.160296045589487e-06, "loss": 0.85622674, "num_input_tokens_seen": 87812955, "step": 4077, "time_per_iteration": 2.7512543201446533 }, { "auxiliary_loss_clip": 0.01170685, "auxiliary_loss_mlp": 0.01032587, "balance_loss_clip": 1.05168092, "balance_loss_mlp": 1.02388465, "epoch": 0.49035050802621294, "flos": 19174180089600.0, "grad_norm": 3.3759790351846024, "language_loss": 0.70004767, "learning_rate": 2.159519567776562e-06, "loss": 0.72208041, "num_input_tokens_seen": 87832605, "step": 4078, "time_per_iteration": 2.691570281982422 }, { "auxiliary_loss_clip": 0.0112854, "auxiliary_loss_mlp": 0.01031798, "balance_loss_clip": 1.04148269, "balance_loss_mlp": 1.02312589, "epoch": 0.49047075091685205, "flos": 22228489365120.0, "grad_norm": 2.8256192291519913, "language_loss": 0.71258569, "learning_rate": 2.1587430657646463e-06, "loss": 0.73418903, "num_input_tokens_seen": 87846040, "step": 4079, "time_per_iteration": 2.760190963745117 }, { "auxiliary_loss_clip": 0.01153701, "auxiliary_loss_mlp": 0.01028954, "balance_loss_clip": 1.04955506, "balance_loss_mlp": 1.02108061, "epoch": 0.4905909938074911, "flos": 20156516213760.0, "grad_norm": 2.5059736149971266, "language_loss": 0.77987838, "learning_rate": 2.157966539671533e-06, "loss": 0.80170494, "num_input_tokens_seen": 87865680, "step": 4080, "time_per_iteration": 2.7427022457122803 }, { "auxiliary_loss_clip": 0.01142696, "auxiliary_loss_mlp": 0.01029615, "balance_loss_clip": 1.04582715, "balance_loss_mlp": 1.02194357, "epoch": 0.4907112366981302, "flos": 17202217380480.0, "grad_norm": 2.12984616130428, "language_loss": 0.67401123, "learning_rate": 2.157189989615021e-06, "loss": 0.69573432, "num_input_tokens_seen": 87884270, "step": 4081, "time_per_iteration": 2.6889097690582275 }, { "auxiliary_loss_clip": 0.01172684, "auxiliary_loss_mlp": 0.0076413, "balance_loss_clip": 1.05009949, "balance_loss_mlp": 1.00033212, "epoch": 0.4908314795887693, "flos": 21688968107520.0, "grad_norm": 1.8568779859971267, "language_loss": 0.75280428, "learning_rate": 2.156413415712913e-06, "loss": 0.77217245, "num_input_tokens_seen": 87906320, "step": 4082, "time_per_iteration": 2.6974568367004395 }, { "auxiliary_loss_clip": 0.01162247, "auxiliary_loss_mlp": 0.00763191, "balance_loss_clip": 1.0506587, "balance_loss_mlp": 1.00033545, "epoch": 0.4909517224794084, "flos": 26213676531840.0, "grad_norm": 1.730208412106505, "language_loss": 0.78495967, "learning_rate": 2.155636818083014e-06, "loss": 0.804214, "num_input_tokens_seen": 87927690, "step": 4083, "time_per_iteration": 2.761655807495117 }, { "auxiliary_loss_clip": 0.01151961, "auxiliary_loss_mlp": 0.01032021, "balance_loss_clip": 1.05004764, "balance_loss_mlp": 1.02439451, "epoch": 0.4910719653700475, "flos": 23148377694720.0, "grad_norm": 1.7901627369877156, "language_loss": 0.84180713, "learning_rate": 2.154860196843134e-06, "loss": 0.86364698, "num_input_tokens_seen": 87946885, "step": 4084, "time_per_iteration": 2.696606159210205 }, { "auxiliary_loss_clip": 0.01184547, "auxiliary_loss_mlp": 0.01029311, "balance_loss_clip": 1.05186617, "balance_loss_mlp": 1.02088273, "epoch": 0.4911922082606866, "flos": 23331845387520.0, "grad_norm": 2.2982421754714935, "language_loss": 0.76897347, "learning_rate": 2.154083552111085e-06, "loss": 0.79111207, "num_input_tokens_seen": 87966055, "step": 4085, "time_per_iteration": 2.5587785243988037 }, { "auxiliary_loss_clip": 0.01182075, "auxiliary_loss_mlp": 0.01034172, "balance_loss_clip": 1.04827619, "balance_loss_mlp": 1.02608371, "epoch": 0.49131245115132566, "flos": 29203239542400.0, "grad_norm": 1.7360014315407375, "language_loss": 0.81835556, "learning_rate": 2.1533068840046834e-06, "loss": 0.840518, "num_input_tokens_seen": 87986320, "step": 4086, "time_per_iteration": 2.6501095294952393 }, { "auxiliary_loss_clip": 0.01148733, "auxiliary_loss_mlp": 0.00763144, "balance_loss_clip": 1.04698801, "balance_loss_mlp": 1.00035107, "epoch": 0.49143269404196477, "flos": 20147465986560.0, "grad_norm": 2.375628441513896, "language_loss": 0.61247587, "learning_rate": 2.152530192641749e-06, "loss": 0.63159466, "num_input_tokens_seen": 88001230, "step": 4087, "time_per_iteration": 2.6306891441345215 }, { "auxiliary_loss_clip": 0.01173331, "auxiliary_loss_mlp": 0.0102664, "balance_loss_clip": 1.05061734, "balance_loss_mlp": 1.01833737, "epoch": 0.4915529369326039, "flos": 24389809597440.0, "grad_norm": 1.9440137445605954, "language_loss": 0.72867507, "learning_rate": 2.1517534781401068e-06, "loss": 0.75067478, "num_input_tokens_seen": 88019110, "step": 4088, "time_per_iteration": 2.6390931606292725 }, { "auxiliary_loss_clip": 0.01166256, "auxiliary_loss_mlp": 0.01025407, "balance_loss_clip": 1.04957414, "balance_loss_mlp": 1.01737189, "epoch": 0.49167317982324293, "flos": 10524305197440.0, "grad_norm": 2.449773637474804, "language_loss": 0.69400454, "learning_rate": 2.150976740617581e-06, "loss": 0.71592116, "num_input_tokens_seen": 88035670, "step": 4089, "time_per_iteration": 2.6393792629241943 }, { "auxiliary_loss_clip": 0.01161179, "auxiliary_loss_mlp": 0.01032173, "balance_loss_clip": 1.05125523, "balance_loss_mlp": 1.02431703, "epoch": 0.49179342271388204, "flos": 25593427457280.0, "grad_norm": 1.8800139399571998, "language_loss": 0.71559882, "learning_rate": 2.150199980192006e-06, "loss": 0.73753232, "num_input_tokens_seen": 88054790, "step": 4090, "time_per_iteration": 2.6972641944885254 }, { "auxiliary_loss_clip": 0.01150665, "auxiliary_loss_mlp": 0.01026366, "balance_loss_clip": 1.04709995, "balance_loss_mlp": 1.01849198, "epoch": 0.49191366560452116, "flos": 21102043875840.0, "grad_norm": 2.1088631241190656, "language_loss": 0.80847597, "learning_rate": 2.1494231969812114e-06, "loss": 0.83024633, "num_input_tokens_seen": 88073780, "step": 4091, "time_per_iteration": 2.6933693885803223 }, { "auxiliary_loss_clip": 0.01146995, "auxiliary_loss_mlp": 0.01026095, "balance_loss_clip": 1.04856014, "balance_loss_mlp": 1.01836467, "epoch": 0.4920339084951602, "flos": 26067520091520.0, "grad_norm": 2.868231388971745, "language_loss": 0.80834234, "learning_rate": 2.1486463911030372e-06, "loss": 0.83007324, "num_input_tokens_seen": 88094430, "step": 4092, "time_per_iteration": 2.8007242679595947 }, { "auxiliary_loss_clip": 0.01153592, "auxiliary_loss_mlp": 0.0102721, "balance_loss_clip": 1.04745364, "balance_loss_mlp": 1.0193063, "epoch": 0.4921541513857993, "flos": 25081269384960.0, "grad_norm": 1.8997663034763077, "language_loss": 0.74524403, "learning_rate": 2.147869562675324e-06, "loss": 0.76705205, "num_input_tokens_seen": 88113400, "step": 4093, "time_per_iteration": 3.6766157150268555 }, { "auxiliary_loss_clip": 0.01173189, "auxiliary_loss_mlp": 0.01032337, "balance_loss_clip": 1.05245864, "balance_loss_mlp": 1.0239867, "epoch": 0.49227439427643843, "flos": 24389809597440.0, "grad_norm": 1.9531436105029472, "language_loss": 0.72161031, "learning_rate": 2.147092711815915e-06, "loss": 0.74366552, "num_input_tokens_seen": 88132750, "step": 4094, "time_per_iteration": 3.6719284057617188 }, { "auxiliary_loss_clip": 0.01136811, "auxiliary_loss_mlp": 0.0102926, "balance_loss_clip": 1.04805553, "balance_loss_mlp": 1.0208919, "epoch": 0.4923946371670775, "flos": 11363753018880.0, "grad_norm": 2.404865874371887, "language_loss": 0.86751288, "learning_rate": 2.1463158386426593e-06, "loss": 0.88917363, "num_input_tokens_seen": 88150560, "step": 4095, "time_per_iteration": 2.683340311050415 }, { "auxiliary_loss_clip": 0.01162571, "auxiliary_loss_mlp": 0.01026884, "balance_loss_clip": 1.05012369, "balance_loss_mlp": 1.0184412, "epoch": 0.4925148800577166, "flos": 30445964334720.0, "grad_norm": 2.3295843227400215, "language_loss": 0.77586412, "learning_rate": 2.145538943273407e-06, "loss": 0.7977587, "num_input_tokens_seen": 88170835, "step": 4096, "time_per_iteration": 2.772714376449585 }, { "auxiliary_loss_clip": 0.01188493, "auxiliary_loss_mlp": 0.01027659, "balance_loss_clip": 1.0555625, "balance_loss_mlp": 1.01989818, "epoch": 0.49263512294835565, "flos": 20850454039680.0, "grad_norm": 2.0921426172300897, "language_loss": 0.72061694, "learning_rate": 2.144762025826013e-06, "loss": 0.74277842, "num_input_tokens_seen": 88189925, "step": 4097, "time_per_iteration": 4.410549879074097 }, { "auxiliary_loss_clip": 0.0117341, "auxiliary_loss_mlp": 0.01024052, "balance_loss_clip": 1.0487628, "balance_loss_mlp": 1.01594603, "epoch": 0.49275536583899476, "flos": 23767477534080.0, "grad_norm": 2.1707610089838227, "language_loss": 0.86893791, "learning_rate": 2.143985086418334e-06, "loss": 0.89091253, "num_input_tokens_seen": 88205105, "step": 4098, "time_per_iteration": 2.6296422481536865 }, { "auxiliary_loss_clip": 0.01155232, "auxiliary_loss_mlp": 0.01029013, "balance_loss_clip": 1.04794836, "balance_loss_mlp": 1.02158034, "epoch": 0.4928756087296339, "flos": 22273522041600.0, "grad_norm": 1.331473670637252, "language_loss": 0.76684403, "learning_rate": 2.1432081251682324e-06, "loss": 0.78868645, "num_input_tokens_seen": 88225475, "step": 4099, "time_per_iteration": 2.684675931930542 }, { "auxiliary_loss_clip": 0.01168327, "auxiliary_loss_mlp": 0.01034698, "balance_loss_clip": 1.05156302, "balance_loss_mlp": 1.02676511, "epoch": 0.49299585162027293, "flos": 19645471463040.0, "grad_norm": 1.669869483234081, "language_loss": 0.87121487, "learning_rate": 2.142431142193572e-06, "loss": 0.8932451, "num_input_tokens_seen": 88243255, "step": 4100, "time_per_iteration": 2.5751090049743652 }, { "auxiliary_loss_clip": 0.01184298, "auxiliary_loss_mlp": 0.01026966, "balance_loss_clip": 1.05269432, "balance_loss_mlp": 1.01883602, "epoch": 0.49311609451091204, "flos": 38837138497920.0, "grad_norm": 2.6088800958001253, "language_loss": 0.71678621, "learning_rate": 2.1416541376122207e-06, "loss": 0.73889887, "num_input_tokens_seen": 88263435, "step": 4101, "time_per_iteration": 2.7626793384552 }, { "auxiliary_loss_clip": 0.01180541, "auxiliary_loss_mlp": 0.0102477, "balance_loss_clip": 1.04883456, "balance_loss_mlp": 1.01637197, "epoch": 0.49323633740155115, "flos": 28329102161280.0, "grad_norm": 2.381386229599501, "language_loss": 0.72925907, "learning_rate": 2.1408771115420496e-06, "loss": 0.7513122, "num_input_tokens_seen": 88283295, "step": 4102, "time_per_iteration": 2.6249895095825195 }, { "auxiliary_loss_clip": 0.01131568, "auxiliary_loss_mlp": 0.01027462, "balance_loss_clip": 1.05114698, "balance_loss_mlp": 1.01974893, "epoch": 0.4933565802921902, "flos": 21135584200320.0, "grad_norm": 11.304618949356206, "language_loss": 0.64868879, "learning_rate": 2.140100064100932e-06, "loss": 0.67027903, "num_input_tokens_seen": 88299270, "step": 4103, "time_per_iteration": 2.748246431350708 }, { "auxiliary_loss_clip": 0.01169092, "auxiliary_loss_mlp": 0.01031006, "balance_loss_clip": 1.05214405, "balance_loss_mlp": 1.02324522, "epoch": 0.4934768231828293, "flos": 18039007595520.0, "grad_norm": 1.8701455087462335, "language_loss": 0.76028866, "learning_rate": 2.139322995406746e-06, "loss": 0.78228962, "num_input_tokens_seen": 88316905, "step": 4104, "time_per_iteration": 2.6235668659210205 }, { "auxiliary_loss_clip": 0.01185977, "auxiliary_loss_mlp": 0.01034584, "balance_loss_clip": 1.05475068, "balance_loss_mlp": 1.02577174, "epoch": 0.4935970660734684, "flos": 23469957181440.0, "grad_norm": 2.4098479545284834, "language_loss": 0.79656112, "learning_rate": 2.1385459055773727e-06, "loss": 0.81876677, "num_input_tokens_seen": 88335095, "step": 4105, "time_per_iteration": 2.630225419998169 }, { "auxiliary_loss_clip": 0.01112821, "auxiliary_loss_mlp": 0.00762343, "balance_loss_clip": 1.04274297, "balance_loss_mlp": 1.00029659, "epoch": 0.4937173089641075, "flos": 64479258840960.0, "grad_norm": 2.0517292677830823, "language_loss": 0.73960066, "learning_rate": 2.137768794730696e-06, "loss": 0.75835228, "num_input_tokens_seen": 88358545, "step": 4106, "time_per_iteration": 3.2441718578338623 }, { "auxiliary_loss_clip": 0.01160863, "auxiliary_loss_mlp": 0.01027982, "balance_loss_clip": 1.05070603, "balance_loss_mlp": 1.0200367, "epoch": 0.4938375518547466, "flos": 22346025644160.0, "grad_norm": 1.7906919532505554, "language_loss": 0.80322134, "learning_rate": 2.1369916629846026e-06, "loss": 0.82510978, "num_input_tokens_seen": 88378295, "step": 4107, "time_per_iteration": 2.686285972595215 }, { "auxiliary_loss_clip": 0.01158142, "auxiliary_loss_mlp": 0.01031275, "balance_loss_clip": 1.04958606, "balance_loss_mlp": 1.02312088, "epoch": 0.4939577947453857, "flos": 17858700299520.0, "grad_norm": 1.8561989515385042, "language_loss": 0.75189698, "learning_rate": 2.136214510456983e-06, "loss": 0.77379119, "num_input_tokens_seen": 88396750, "step": 4108, "time_per_iteration": 2.732663631439209 }, { "auxiliary_loss_clip": 0.01045805, "auxiliary_loss_mlp": 0.0075438, "balance_loss_clip": 1.01581335, "balance_loss_mlp": 1.00030363, "epoch": 0.49407803763602476, "flos": 70066746875520.0, "grad_norm": 0.8892520658895272, "language_loss": 0.63100636, "learning_rate": 2.1354373372657296e-06, "loss": 0.64900815, "num_input_tokens_seen": 88455190, "step": 4109, "time_per_iteration": 3.344489097595215 }, { "auxiliary_loss_clip": 0.01185234, "auxiliary_loss_mlp": 0.01029891, "balance_loss_clip": 1.05351865, "balance_loss_mlp": 1.02161765, "epoch": 0.49419828052666387, "flos": 24317485562880.0, "grad_norm": 1.490218012607458, "language_loss": 0.71228987, "learning_rate": 2.1346601435287404e-06, "loss": 0.7344411, "num_input_tokens_seen": 88477460, "step": 4110, "time_per_iteration": 2.6385204792022705 }, { "auxiliary_loss_clip": 0.01153896, "auxiliary_loss_mlp": 0.01027967, "balance_loss_clip": 1.0473659, "balance_loss_mlp": 1.01969373, "epoch": 0.494318523417303, "flos": 29386060790400.0, "grad_norm": 1.8043204520351457, "language_loss": 0.80026066, "learning_rate": 2.1338829293639144e-06, "loss": 0.8220793, "num_input_tokens_seen": 88497820, "step": 4111, "time_per_iteration": 2.709716796875 }, { "auxiliary_loss_clip": 0.01125842, "auxiliary_loss_mlp": 0.01027986, "balance_loss_clip": 1.04519916, "balance_loss_mlp": 1.01980865, "epoch": 0.49443876630794203, "flos": 15268284195840.0, "grad_norm": 2.2363337158099106, "language_loss": 0.8325671, "learning_rate": 2.1331056948891547e-06, "loss": 0.85410535, "num_input_tokens_seen": 88514920, "step": 4112, "time_per_iteration": 2.722691774368286 }, { "auxiliary_loss_clip": 0.01150664, "auxiliary_loss_mlp": 0.01025535, "balance_loss_clip": 1.04771626, "balance_loss_mlp": 1.0171603, "epoch": 0.49455900919858115, "flos": 12347453859840.0, "grad_norm": 2.3500485954076185, "language_loss": 0.76494336, "learning_rate": 2.1323284402223666e-06, "loss": 0.78670526, "num_input_tokens_seen": 88530910, "step": 4113, "time_per_iteration": 2.5932061672210693 }, { "auxiliary_loss_clip": 0.01184368, "auxiliary_loss_mlp": 0.00762195, "balance_loss_clip": 1.05610955, "balance_loss_mlp": 1.0002811, "epoch": 0.4946792520892202, "flos": 22779610715520.0, "grad_norm": 1.847822677149436, "language_loss": 0.882128, "learning_rate": 2.1315511654814597e-06, "loss": 0.90159357, "num_input_tokens_seen": 88549320, "step": 4114, "time_per_iteration": 2.670654535293579 }, { "auxiliary_loss_clip": 0.01147064, "auxiliary_loss_mlp": 0.01024047, "balance_loss_clip": 1.04894948, "balance_loss_mlp": 1.01671004, "epoch": 0.4947994949798593, "flos": 23148126299520.0, "grad_norm": 1.9344431933000936, "language_loss": 0.78470993, "learning_rate": 2.1307738707843456e-06, "loss": 0.80642104, "num_input_tokens_seen": 88568985, "step": 4115, "time_per_iteration": 2.7016329765319824 }, { "auxiliary_loss_clip": 0.01177245, "auxiliary_loss_mlp": 0.01030549, "balance_loss_clip": 1.05352461, "balance_loss_mlp": 1.02185309, "epoch": 0.4949197378704984, "flos": 23659997063040.0, "grad_norm": 1.9981652842888369, "language_loss": 0.69469088, "learning_rate": 2.1299965562489385e-06, "loss": 0.71676886, "num_input_tokens_seen": 88588790, "step": 4116, "time_per_iteration": 2.6574652194976807 }, { "auxiliary_loss_clip": 0.01167965, "auxiliary_loss_mlp": 0.01023854, "balance_loss_clip": 1.04925251, "balance_loss_mlp": 1.01598024, "epoch": 0.4950399807611375, "flos": 26911493026560.0, "grad_norm": 1.5714116964184477, "language_loss": 0.79169607, "learning_rate": 2.129219221993158e-06, "loss": 0.81361425, "num_input_tokens_seen": 88613575, "step": 4117, "time_per_iteration": 2.7980785369873047 }, { "auxiliary_loss_clip": 0.01047248, "auxiliary_loss_mlp": 0.01003682, "balance_loss_clip": 1.0129602, "balance_loss_mlp": 1.00219786, "epoch": 0.4951602236517766, "flos": 67315270187520.0, "grad_norm": 0.8041599280679855, "language_loss": 0.59985292, "learning_rate": 2.128441868134924e-06, "loss": 0.62036216, "num_input_tokens_seen": 88675510, "step": 4118, "time_per_iteration": 3.2767646312713623 }, { "auxiliary_loss_clip": 0.01144108, "auxiliary_loss_mlp": 0.01031176, "balance_loss_clip": 1.04797006, "balance_loss_mlp": 1.02389836, "epoch": 0.4952804665424157, "flos": 19901442758400.0, "grad_norm": 2.0731981528984167, "language_loss": 0.82945895, "learning_rate": 2.1276644947921606e-06, "loss": 0.85121179, "num_input_tokens_seen": 88694425, "step": 4119, "time_per_iteration": 3.7208800315856934 }, { "auxiliary_loss_clip": 0.01169331, "auxiliary_loss_mlp": 0.01025335, "balance_loss_clip": 1.04958892, "balance_loss_mlp": 1.01623344, "epoch": 0.49540070943305475, "flos": 18806813740800.0, "grad_norm": 2.9288430585415, "language_loss": 0.82257468, "learning_rate": 2.126887102082795e-06, "loss": 0.84452128, "num_input_tokens_seen": 88714450, "step": 4120, "time_per_iteration": 3.619898796081543 }, { "auxiliary_loss_clip": 0.01141019, "auxiliary_loss_mlp": 0.01027955, "balance_loss_clip": 1.04374158, "balance_loss_mlp": 1.02048659, "epoch": 0.49552095232369386, "flos": 24934179191040.0, "grad_norm": 2.22454144235084, "language_loss": 0.70327747, "learning_rate": 2.126109690124757e-06, "loss": 0.72496712, "num_input_tokens_seen": 88735265, "step": 4121, "time_per_iteration": 2.8440003395080566 }, { "auxiliary_loss_clip": 0.01127625, "auxiliary_loss_mlp": 0.01030154, "balance_loss_clip": 1.0462184, "balance_loss_mlp": 1.02212512, "epoch": 0.495641195214333, "flos": 22857249962880.0, "grad_norm": 1.7034075690877302, "language_loss": 0.71123517, "learning_rate": 2.1253322590359786e-06, "loss": 0.732813, "num_input_tokens_seen": 88754600, "step": 4122, "time_per_iteration": 3.6207664012908936 }, { "auxiliary_loss_clip": 0.01166419, "auxiliary_loss_mlp": 0.01034411, "balance_loss_clip": 1.04917312, "balance_loss_mlp": 1.02631116, "epoch": 0.49576143810497203, "flos": 25769748343680.0, "grad_norm": 1.693893121970706, "language_loss": 0.7412222, "learning_rate": 2.124554808934397e-06, "loss": 0.76323044, "num_input_tokens_seen": 88775180, "step": 4123, "time_per_iteration": 3.6027958393096924 }, { "auxiliary_loss_clip": 0.01118748, "auxiliary_loss_mlp": 0.01024752, "balance_loss_clip": 1.04098845, "balance_loss_mlp": 1.01635408, "epoch": 0.49588168099561114, "flos": 22128838058880.0, "grad_norm": 1.8068187518465213, "language_loss": 0.72951543, "learning_rate": 2.1237773399379496e-06, "loss": 0.75095046, "num_input_tokens_seen": 88796145, "step": 4124, "time_per_iteration": 2.7604830265045166 }, { "auxiliary_loss_clip": 0.01158182, "auxiliary_loss_mlp": 0.01034498, "balance_loss_clip": 1.04438519, "balance_loss_mlp": 1.02677357, "epoch": 0.49600192388625025, "flos": 24387331559040.0, "grad_norm": 1.724789406572805, "language_loss": 0.86896372, "learning_rate": 2.122999852164578e-06, "loss": 0.89089048, "num_input_tokens_seen": 88816765, "step": 4125, "time_per_iteration": 2.7239248752593994 }, { "auxiliary_loss_clip": 0.0112345, "auxiliary_loss_mlp": 0.01025626, "balance_loss_clip": 1.0465157, "balance_loss_mlp": 1.0180738, "epoch": 0.4961221667768893, "flos": 22857429530880.0, "grad_norm": 2.253252959821838, "language_loss": 0.58345854, "learning_rate": 2.122222345732227e-06, "loss": 0.60494936, "num_input_tokens_seen": 88836680, "step": 4126, "time_per_iteration": 2.7814407348632812 }, { "auxiliary_loss_clip": 0.01142435, "auxiliary_loss_mlp": 0.01032125, "balance_loss_clip": 1.04801595, "balance_loss_mlp": 1.02371526, "epoch": 0.4962424096675284, "flos": 17858089768320.0, "grad_norm": 1.98822415818956, "language_loss": 0.83065629, "learning_rate": 2.121444820758843e-06, "loss": 0.85240191, "num_input_tokens_seen": 88855320, "step": 4127, "time_per_iteration": 2.7356035709381104 }, { "auxiliary_loss_clip": 0.01128978, "auxiliary_loss_mlp": 0.01037388, "balance_loss_clip": 1.04795361, "balance_loss_mlp": 1.02882862, "epoch": 0.49636265255816747, "flos": 21793611404160.0, "grad_norm": 1.9130037154146204, "language_loss": 0.78495699, "learning_rate": 2.120667277362376e-06, "loss": 0.80662066, "num_input_tokens_seen": 88874035, "step": 4128, "time_per_iteration": 2.678474187850952 }, { "auxiliary_loss_clip": 0.01187913, "auxiliary_loss_mlp": 0.01029497, "balance_loss_clip": 1.05517423, "balance_loss_mlp": 1.02111053, "epoch": 0.4964828954488066, "flos": 16358603581440.0, "grad_norm": 2.1228242837549813, "language_loss": 0.84747052, "learning_rate": 2.1198897156607796e-06, "loss": 0.86964464, "num_input_tokens_seen": 88891390, "step": 4129, "time_per_iteration": 2.5810461044311523 }, { "auxiliary_loss_clip": 0.01172821, "auxiliary_loss_mlp": 0.01024893, "balance_loss_clip": 1.04895592, "balance_loss_mlp": 1.01696014, "epoch": 0.4966031383394457, "flos": 24711101775360.0, "grad_norm": 2.0430825167533615, "language_loss": 0.73699498, "learning_rate": 2.1191121357720085e-06, "loss": 0.75897211, "num_input_tokens_seen": 88909450, "step": 4130, "time_per_iteration": 2.6485555171966553 }, { "auxiliary_loss_clip": 0.01118939, "auxiliary_loss_mlp": 0.01028717, "balance_loss_clip": 1.04429197, "balance_loss_mlp": 1.02052689, "epoch": 0.49672338123008475, "flos": 22930615491840.0, "grad_norm": 1.6946415783410982, "language_loss": 0.74953175, "learning_rate": 2.1183345378140206e-06, "loss": 0.77100837, "num_input_tokens_seen": 88929195, "step": 4131, "time_per_iteration": 2.7105612754821777 }, { "auxiliary_loss_clip": 0.01072935, "auxiliary_loss_mlp": 0.01001334, "balance_loss_clip": 1.01489091, "balance_loss_mlp": 0.99972516, "epoch": 0.49684362412072386, "flos": 65976736844160.0, "grad_norm": 0.8532066866492722, "language_loss": 0.61935806, "learning_rate": 2.1175569219047783e-06, "loss": 0.64010078, "num_input_tokens_seen": 88990635, "step": 4132, "time_per_iteration": 3.3240840435028076 }, { "auxiliary_loss_clip": 0.01184481, "auxiliary_loss_mlp": 0.010343, "balance_loss_clip": 1.05268133, "balance_loss_mlp": 1.02647412, "epoch": 0.49696386701136297, "flos": 19971288754560.0, "grad_norm": 1.5349231611445002, "language_loss": 0.73317599, "learning_rate": 2.1167792881622437e-06, "loss": 0.75536382, "num_input_tokens_seen": 89009655, "step": 4133, "time_per_iteration": 2.553307056427002 }, { "auxiliary_loss_clip": 0.01151527, "auxiliary_loss_mlp": 0.01028289, "balance_loss_clip": 1.04891539, "balance_loss_mlp": 1.02070165, "epoch": 0.497084109902002, "flos": 24750819239040.0, "grad_norm": 1.669046828320635, "language_loss": 0.81043762, "learning_rate": 2.116001636704384e-06, "loss": 0.83223581, "num_input_tokens_seen": 89030040, "step": 4134, "time_per_iteration": 2.734523296356201 }, { "auxiliary_loss_clip": 0.01135591, "auxiliary_loss_mlp": 0.01034417, "balance_loss_clip": 1.04701567, "balance_loss_mlp": 1.02666235, "epoch": 0.49720435279264114, "flos": 21871825269120.0, "grad_norm": 1.8379272816105148, "language_loss": 0.80255038, "learning_rate": 2.1152239676491685e-06, "loss": 0.82425046, "num_input_tokens_seen": 89048145, "step": 4135, "time_per_iteration": 2.7451138496398926 }, { "auxiliary_loss_clip": 0.01160341, "auxiliary_loss_mlp": 0.01029276, "balance_loss_clip": 1.04834509, "balance_loss_mlp": 1.02206111, "epoch": 0.49732459568328025, "flos": 23805794367360.0, "grad_norm": 2.362006028466471, "language_loss": 0.73669553, "learning_rate": 2.114446281114569e-06, "loss": 0.75859177, "num_input_tokens_seen": 89067165, "step": 4136, "time_per_iteration": 2.711787223815918 }, { "auxiliary_loss_clip": 0.01146488, "auxiliary_loss_mlp": 0.01027282, "balance_loss_clip": 1.04770374, "balance_loss_mlp": 1.01937842, "epoch": 0.4974448385739193, "flos": 20047742853120.0, "grad_norm": 1.864077435653133, "language_loss": 0.76158333, "learning_rate": 2.1136685772185587e-06, "loss": 0.78332102, "num_input_tokens_seen": 89086190, "step": 4137, "time_per_iteration": 2.697575807571411 }, { "auxiliary_loss_clip": 0.01153819, "auxiliary_loss_mlp": 0.00763665, "balance_loss_clip": 1.04264033, "balance_loss_mlp": 1.00022888, "epoch": 0.4975650814645584, "flos": 24821347593600.0, "grad_norm": 1.6234469831281209, "language_loss": 0.78013515, "learning_rate": 2.1128908560791163e-06, "loss": 0.79930997, "num_input_tokens_seen": 89106020, "step": 4138, "time_per_iteration": 2.766541004180908 }, { "auxiliary_loss_clip": 0.01182174, "auxiliary_loss_mlp": 0.01031447, "balance_loss_clip": 1.05120981, "balance_loss_mlp": 1.0228343, "epoch": 0.4976853243551975, "flos": 19829477859840.0, "grad_norm": 5.276923367354073, "language_loss": 0.78259593, "learning_rate": 2.1121131178142203e-06, "loss": 0.8047322, "num_input_tokens_seen": 89125385, "step": 4139, "time_per_iteration": 2.6261820793151855 }, { "auxiliary_loss_clip": 0.01151157, "auxiliary_loss_mlp": 0.01025314, "balance_loss_clip": 1.04424405, "balance_loss_mlp": 1.01801884, "epoch": 0.4978055672458366, "flos": 23142990654720.0, "grad_norm": 1.466620322438816, "language_loss": 0.82298654, "learning_rate": 2.1113353625418544e-06, "loss": 0.84475124, "num_input_tokens_seen": 89143935, "step": 4140, "time_per_iteration": 2.7037353515625 }, { "auxiliary_loss_clip": 0.01162682, "auxiliary_loss_mlp": 0.01027907, "balance_loss_clip": 1.05176997, "balance_loss_mlp": 1.02005076, "epoch": 0.4979258101364757, "flos": 15559914718080.0, "grad_norm": 1.7659175170905714, "language_loss": 0.79045534, "learning_rate": 2.1105575903800017e-06, "loss": 0.81236118, "num_input_tokens_seen": 89162655, "step": 4141, "time_per_iteration": 2.653367280960083 }, { "auxiliary_loss_clip": 0.01171719, "auxiliary_loss_mlp": 0.01037627, "balance_loss_clip": 1.04850841, "balance_loss_mlp": 1.02874637, "epoch": 0.4980460530271148, "flos": 26356169784960.0, "grad_norm": 2.1191618418759437, "language_loss": 0.84992719, "learning_rate": 2.1097798014466502e-06, "loss": 0.8720206, "num_input_tokens_seen": 89182255, "step": 4142, "time_per_iteration": 2.682246446609497 }, { "auxiliary_loss_clip": 0.01174761, "auxiliary_loss_mlp": 0.0102879, "balance_loss_clip": 1.05123329, "balance_loss_mlp": 1.02066255, "epoch": 0.49816629591775385, "flos": 17274541415040.0, "grad_norm": 2.7531281231478766, "language_loss": 0.59096885, "learning_rate": 2.109001995859791e-06, "loss": 0.61300439, "num_input_tokens_seen": 89201155, "step": 4143, "time_per_iteration": 2.597564697265625 }, { "auxiliary_loss_clip": 0.01060778, "auxiliary_loss_mlp": 0.01002222, "balance_loss_clip": 1.01620376, "balance_loss_mlp": 1.00047016, "epoch": 0.49828653880839296, "flos": 64930947344640.0, "grad_norm": 0.7890878730322277, "language_loss": 0.59993899, "learning_rate": 2.108224173737415e-06, "loss": 0.62056899, "num_input_tokens_seen": 89264455, "step": 4144, "time_per_iteration": 3.270787477493286 }, { "auxiliary_loss_clip": 0.01154391, "auxiliary_loss_mlp": 0.01029356, "balance_loss_clip": 1.04761541, "balance_loss_mlp": 1.02090418, "epoch": 0.498406781699032, "flos": 27484806003840.0, "grad_norm": 1.9128306623219133, "language_loss": 0.76483285, "learning_rate": 2.1074463351975183e-06, "loss": 0.78667033, "num_input_tokens_seen": 89283340, "step": 4145, "time_per_iteration": 3.6887989044189453 }, { "auxiliary_loss_clip": 0.0114795, "auxiliary_loss_mlp": 0.0102616, "balance_loss_clip": 1.04773796, "balance_loss_mlp": 1.01898062, "epoch": 0.49852702458967113, "flos": 31499870307840.0, "grad_norm": 2.1924228883175534, "language_loss": 0.71676558, "learning_rate": 2.106668480358098e-06, "loss": 0.73850667, "num_input_tokens_seen": 89303565, "step": 4146, "time_per_iteration": 3.7092318534851074 }, { "auxiliary_loss_clip": 0.01150471, "auxiliary_loss_mlp": 0.01032706, "balance_loss_clip": 1.04567719, "balance_loss_mlp": 1.02396834, "epoch": 0.49864726748031024, "flos": 22852868503680.0, "grad_norm": 1.6171487995231857, "language_loss": 0.71018374, "learning_rate": 2.105890609337154e-06, "loss": 0.73201549, "num_input_tokens_seen": 89322080, "step": 4147, "time_per_iteration": 2.684776544570923 }, { "auxiliary_loss_clip": 0.0108241, "auxiliary_loss_mlp": 0.01002219, "balance_loss_clip": 1.01567578, "balance_loss_mlp": 1.00063324, "epoch": 0.4987675103709493, "flos": 70405708544640.0, "grad_norm": 0.6913399730511093, "language_loss": 0.63762987, "learning_rate": 2.1051127222526883e-06, "loss": 0.65847611, "num_input_tokens_seen": 89394195, "step": 4148, "time_per_iteration": 4.229222297668457 }, { "auxiliary_loss_clip": 0.01171325, "auxiliary_loss_mlp": 0.0102894, "balance_loss_clip": 1.05511522, "balance_loss_mlp": 1.02146578, "epoch": 0.4988877532615884, "flos": 28767571482240.0, "grad_norm": 1.6897640103667007, "language_loss": 0.80697227, "learning_rate": 2.1043348192227067e-06, "loss": 0.82897496, "num_input_tokens_seen": 89414565, "step": 4149, "time_per_iteration": 3.769097089767456 }, { "auxiliary_loss_clip": 0.01132054, "auxiliary_loss_mlp": 0.01028432, "balance_loss_clip": 1.04610157, "balance_loss_mlp": 1.02106488, "epoch": 0.4990079961522275, "flos": 16872700988160.0, "grad_norm": 1.640237289477909, "language_loss": 0.61582917, "learning_rate": 2.1035569003652156e-06, "loss": 0.63743401, "num_input_tokens_seen": 89433195, "step": 4150, "time_per_iteration": 2.656050205230713 }, { "auxiliary_loss_clip": 0.01123935, "auxiliary_loss_mlp": 0.01036152, "balance_loss_clip": 1.04483581, "balance_loss_mlp": 1.02687716, "epoch": 0.4991282390428666, "flos": 13291042187520.0, "grad_norm": 2.299880548381162, "language_loss": 0.81880075, "learning_rate": 2.1027789657982255e-06, "loss": 0.84040159, "num_input_tokens_seen": 89447410, "step": 4151, "time_per_iteration": 2.7204132080078125 }, { "auxiliary_loss_clip": 0.01125344, "auxiliary_loss_mlp": 0.01031419, "balance_loss_clip": 1.04604363, "balance_loss_mlp": 1.02355146, "epoch": 0.4992484819335057, "flos": 21537496454400.0, "grad_norm": 2.248182486268081, "language_loss": 0.77384055, "learning_rate": 2.1020010156397482e-06, "loss": 0.79540813, "num_input_tokens_seen": 89464630, "step": 4152, "time_per_iteration": 2.736994743347168 }, { "auxiliary_loss_clip": 0.01168637, "auxiliary_loss_mlp": 0.01029052, "balance_loss_clip": 1.048208, "balance_loss_mlp": 1.02098131, "epoch": 0.4993687248241448, "flos": 24860095390080.0, "grad_norm": 1.6643205762803908, "language_loss": 0.77704865, "learning_rate": 2.101223050007797e-06, "loss": 0.79902554, "num_input_tokens_seen": 89483180, "step": 4153, "time_per_iteration": 2.64463472366333 }, { "auxiliary_loss_clip": 0.01081314, "auxiliary_loss_mlp": 0.0100216, "balance_loss_clip": 1.01501977, "balance_loss_mlp": 1.00062203, "epoch": 0.49948896771478385, "flos": 62941602453120.0, "grad_norm": 0.8270829298885973, "language_loss": 0.5379945, "learning_rate": 2.1004450690203904e-06, "loss": 0.55882919, "num_input_tokens_seen": 89539260, "step": 4154, "time_per_iteration": 3.226736545562744 }, { "auxiliary_loss_clip": 0.01080853, "auxiliary_loss_mlp": 0.01001258, "balance_loss_clip": 1.0148468, "balance_loss_mlp": 0.99972051, "epoch": 0.49960921060542296, "flos": 68284213516800.0, "grad_norm": 0.8511115716072202, "language_loss": 0.63316166, "learning_rate": 2.099667072795546e-06, "loss": 0.65398276, "num_input_tokens_seen": 89601380, "step": 4155, "time_per_iteration": 3.2274184226989746 }, { "auxiliary_loss_clip": 0.01165985, "auxiliary_loss_mlp": 0.01023924, "balance_loss_clip": 1.04810202, "balance_loss_mlp": 1.01596081, "epoch": 0.49972945349606207, "flos": 23659350618240.0, "grad_norm": 1.808042519103185, "language_loss": 0.79895514, "learning_rate": 2.0988890614512864e-06, "loss": 0.82085419, "num_input_tokens_seen": 89621270, "step": 4156, "time_per_iteration": 2.6885881423950195 }, { "auxiliary_loss_clip": 0.01158748, "auxiliary_loss_mlp": 0.01025466, "balance_loss_clip": 1.05113173, "balance_loss_mlp": 1.01777685, "epoch": 0.4998496963867011, "flos": 19755825022080.0, "grad_norm": 2.461082191365096, "language_loss": 0.83966863, "learning_rate": 2.098111035105635e-06, "loss": 0.86151081, "num_input_tokens_seen": 89639695, "step": 4157, "time_per_iteration": 2.6376068592071533 }, { "auxiliary_loss_clip": 0.01126954, "auxiliary_loss_mlp": 0.0102244, "balance_loss_clip": 1.04748511, "balance_loss_mlp": 1.01472092, "epoch": 0.49996993927734024, "flos": 22265728790400.0, "grad_norm": 1.6619227598615467, "language_loss": 0.73223007, "learning_rate": 2.0973329938766176e-06, "loss": 0.75372398, "num_input_tokens_seen": 89657125, "step": 4158, "time_per_iteration": 2.7323129177093506 }, { "auxiliary_loss_clip": 0.011762, "auxiliary_loss_mlp": 0.01031496, "balance_loss_clip": 1.05144787, "balance_loss_mlp": 1.02282333, "epoch": 0.5000901821679793, "flos": 23327212533120.0, "grad_norm": 1.8494821861107642, "language_loss": 0.79150236, "learning_rate": 2.0965549378822618e-06, "loss": 0.81357932, "num_input_tokens_seen": 89678415, "step": 4159, "time_per_iteration": 2.686502456665039 }, { "auxiliary_loss_clip": 0.01082774, "auxiliary_loss_mlp": 0.01032881, "balance_loss_clip": 1.04126978, "balance_loss_mlp": 1.02386308, "epoch": 0.5002104250586185, "flos": 20339014239360.0, "grad_norm": 2.024192513036943, "language_loss": 0.84326291, "learning_rate": 2.095776867240599e-06, "loss": 0.86441946, "num_input_tokens_seen": 89695405, "step": 4160, "time_per_iteration": 2.904794454574585 }, { "auxiliary_loss_clip": 0.01133451, "auxiliary_loss_mlp": 0.0103203, "balance_loss_clip": 1.04552686, "balance_loss_mlp": 1.0243113, "epoch": 0.5003306679492575, "flos": 13991372634240.0, "grad_norm": 1.9084147902697803, "language_loss": 0.82609308, "learning_rate": 2.094998782069661e-06, "loss": 0.84774792, "num_input_tokens_seen": 89713110, "step": 4161, "time_per_iteration": 3.0508813858032227 }, { "auxiliary_loss_clip": 0.01181044, "auxiliary_loss_mlp": 0.01033108, "balance_loss_clip": 1.05087996, "balance_loss_mlp": 1.02613986, "epoch": 0.5004509108398966, "flos": 27672762896640.0, "grad_norm": 2.15606363449673, "language_loss": 0.75440538, "learning_rate": 2.0942206824874845e-06, "loss": 0.77654696, "num_input_tokens_seen": 89735885, "step": 4162, "time_per_iteration": 2.6663448810577393 }, { "auxiliary_loss_clip": 0.01165395, "auxiliary_loss_mlp": 0.01029669, "balance_loss_clip": 1.04874802, "balance_loss_mlp": 1.0213542, "epoch": 0.5005711537305357, "flos": 14976186796800.0, "grad_norm": 2.0761966375045864, "language_loss": 0.7883929, "learning_rate": 2.093442568612105e-06, "loss": 0.8103435, "num_input_tokens_seen": 89753690, "step": 4163, "time_per_iteration": 2.6545279026031494 }, { "auxiliary_loss_clip": 0.0118563, "auxiliary_loss_mlp": 0.01029581, "balance_loss_clip": 1.05276918, "balance_loss_mlp": 1.02123046, "epoch": 0.5006913966211748, "flos": 26503259978880.0, "grad_norm": 1.4500630218557733, "language_loss": 0.84948397, "learning_rate": 2.0926644405615613e-06, "loss": 0.87163609, "num_input_tokens_seen": 89774590, "step": 4164, "time_per_iteration": 2.6310176849365234 }, { "auxiliary_loss_clip": 0.01136994, "auxiliary_loss_mlp": 0.01027543, "balance_loss_clip": 1.04649711, "balance_loss_mlp": 1.02018154, "epoch": 0.5008116395118138, "flos": 20449295971200.0, "grad_norm": 1.7313599828817534, "language_loss": 0.81139815, "learning_rate": 2.091886298453897e-06, "loss": 0.83304358, "num_input_tokens_seen": 89792775, "step": 4165, "time_per_iteration": 2.779615640640259 }, { "auxiliary_loss_clip": 0.01164322, "auxiliary_loss_mlp": 0.01027495, "balance_loss_clip": 1.04729486, "balance_loss_mlp": 1.01953816, "epoch": 0.500931882402453, "flos": 21579871524480.0, "grad_norm": 1.9985382828567275, "language_loss": 0.72407418, "learning_rate": 2.091108142407153e-06, "loss": 0.7459923, "num_input_tokens_seen": 89811515, "step": 4166, "time_per_iteration": 2.6181044578552246 }, { "auxiliary_loss_clip": 0.01065218, "auxiliary_loss_mlp": 0.01002943, "balance_loss_clip": 1.0224936, "balance_loss_mlp": 1.00130963, "epoch": 0.5010521252930921, "flos": 57785011925760.0, "grad_norm": 0.8416519672079961, "language_loss": 0.62400335, "learning_rate": 2.090329972539377e-06, "loss": 0.64468497, "num_input_tokens_seen": 89870080, "step": 4167, "time_per_iteration": 3.2728874683380127 }, { "auxiliary_loss_clip": 0.01073619, "auxiliary_loss_mlp": 0.01025154, "balance_loss_clip": 1.04024673, "balance_loss_mlp": 1.01720858, "epoch": 0.5011723681837311, "flos": 18625500864000.0, "grad_norm": 1.9485641665977913, "language_loss": 0.68765759, "learning_rate": 2.089551788968616e-06, "loss": 0.70864522, "num_input_tokens_seen": 89888045, "step": 4168, "time_per_iteration": 3.0246427059173584 }, { "auxiliary_loss_clip": 0.01079529, "auxiliary_loss_mlp": 0.0100193, "balance_loss_clip": 1.0137856, "balance_loss_mlp": 1.00033307, "epoch": 0.5012926110743702, "flos": 55883146608000.0, "grad_norm": 0.8401174593415519, "language_loss": 0.61073768, "learning_rate": 2.08877359181292e-06, "loss": 0.63155234, "num_input_tokens_seen": 89944610, "step": 4169, "time_per_iteration": 3.427090883255005 }, { "auxiliary_loss_clip": 0.01144372, "auxiliary_loss_mlp": 0.01028333, "balance_loss_clip": 1.04359949, "balance_loss_mlp": 1.02017343, "epoch": 0.5014128539650093, "flos": 24238266117120.0, "grad_norm": 2.85410478072569, "language_loss": 0.85753995, "learning_rate": 2.0879953811903396e-06, "loss": 0.87926698, "num_input_tokens_seen": 89959495, "step": 4170, "time_per_iteration": 2.7821552753448486 }, { "auxiliary_loss_clip": 0.01167086, "auxiliary_loss_mlp": 0.01027398, "balance_loss_clip": 1.05057847, "balance_loss_mlp": 1.01938105, "epoch": 0.5015330968556484, "flos": 27527468382720.0, "grad_norm": 2.0082013332474484, "language_loss": 0.78652883, "learning_rate": 2.08721715721893e-06, "loss": 0.80847359, "num_input_tokens_seen": 89978820, "step": 4171, "time_per_iteration": 4.076817989349365 }, { "auxiliary_loss_clip": 0.01164069, "auxiliary_loss_mlp": 0.01024575, "balance_loss_clip": 1.04790854, "balance_loss_mlp": 1.01689816, "epoch": 0.5016533397462875, "flos": 23800802376960.0, "grad_norm": 1.8411182860186295, "language_loss": 0.77069038, "learning_rate": 2.0864389200167477e-06, "loss": 0.79257679, "num_input_tokens_seen": 89997075, "step": 4172, "time_per_iteration": 3.622549295425415 }, { "auxiliary_loss_clip": 0.01171618, "auxiliary_loss_mlp": 0.00762622, "balance_loss_clip": 1.05156934, "balance_loss_mlp": 1.00029182, "epoch": 0.5017735826369266, "flos": 25295009264640.0, "grad_norm": 1.7759079661929333, "language_loss": 0.79145783, "learning_rate": 2.0856606697018504e-06, "loss": 0.81080019, "num_input_tokens_seen": 90015085, "step": 4173, "time_per_iteration": 2.875133514404297 }, { "auxiliary_loss_clip": 0.0114906, "auxiliary_loss_mlp": 0.01022289, "balance_loss_clip": 1.0460695, "balance_loss_mlp": 1.01399755, "epoch": 0.5018938255275657, "flos": 16873203778560.0, "grad_norm": 2.0826578663452366, "language_loss": 0.73306286, "learning_rate": 2.084882406392297e-06, "loss": 0.75477636, "num_input_tokens_seen": 90033045, "step": 4174, "time_per_iteration": 3.65132212638855 }, { "auxiliary_loss_clip": 0.01169301, "auxiliary_loss_mlp": 0.01020309, "balance_loss_clip": 1.0512166, "balance_loss_mlp": 1.01286471, "epoch": 0.5020140684182047, "flos": 25515429073920.0, "grad_norm": 5.101294750756256, "language_loss": 0.71419841, "learning_rate": 2.0841041302061496e-06, "loss": 0.73609447, "num_input_tokens_seen": 90052505, "step": 4175, "time_per_iteration": 2.6758530139923096 }, { "auxiliary_loss_clip": 0.01142854, "auxiliary_loss_mlp": 0.0102622, "balance_loss_clip": 1.04417467, "balance_loss_mlp": 1.01864457, "epoch": 0.5021343113088439, "flos": 23659278791040.0, "grad_norm": 2.1720853999616208, "language_loss": 0.75886285, "learning_rate": 2.083325841261473e-06, "loss": 0.78055364, "num_input_tokens_seen": 90071565, "step": 4176, "time_per_iteration": 3.606649398803711 }, { "auxiliary_loss_clip": 0.01148221, "auxiliary_loss_mlp": 0.01027483, "balance_loss_clip": 1.04543269, "balance_loss_mlp": 1.01915646, "epoch": 0.502254554199483, "flos": 24534673148160.0, "grad_norm": 2.115151919462895, "language_loss": 0.66928148, "learning_rate": 2.0825475396763322e-06, "loss": 0.69103849, "num_input_tokens_seen": 90092215, "step": 4177, "time_per_iteration": 2.692342758178711 }, { "auxiliary_loss_clip": 0.01076853, "auxiliary_loss_mlp": 0.0102899, "balance_loss_clip": 1.03926444, "balance_loss_mlp": 1.02063918, "epoch": 0.502374797090122, "flos": 34240285607040.0, "grad_norm": 1.5553888767649453, "language_loss": 0.65656286, "learning_rate": 2.081769225568796e-06, "loss": 0.67762125, "num_input_tokens_seen": 90114665, "step": 4178, "time_per_iteration": 3.0914196968078613 }, { "auxiliary_loss_clip": 0.01168602, "auxiliary_loss_mlp": 0.01030374, "balance_loss_clip": 1.04770446, "balance_loss_mlp": 1.02282214, "epoch": 0.5024950399807612, "flos": 26031106679040.0, "grad_norm": 1.611324435934562, "language_loss": 0.76205003, "learning_rate": 2.0809908990569327e-06, "loss": 0.78403974, "num_input_tokens_seen": 90136445, "step": 4179, "time_per_iteration": 3.021836757659912 }, { "auxiliary_loss_clip": 0.01150504, "auxiliary_loss_mlp": 0.01032866, "balance_loss_clip": 1.04673278, "balance_loss_mlp": 1.02533746, "epoch": 0.5026152828714002, "flos": 21252438120960.0, "grad_norm": 1.815330536111044, "language_loss": 0.79334056, "learning_rate": 2.0802125602588146e-06, "loss": 0.81517428, "num_input_tokens_seen": 90155710, "step": 4180, "time_per_iteration": 2.713195323944092 }, { "auxiliary_loss_clip": 0.01186318, "auxiliary_loss_mlp": 0.01024981, "balance_loss_clip": 1.05366898, "balance_loss_mlp": 1.01676798, "epoch": 0.5027355257620393, "flos": 30956111245440.0, "grad_norm": 3.4864962273628533, "language_loss": 0.66793501, "learning_rate": 2.0794342092925146e-06, "loss": 0.69004798, "num_input_tokens_seen": 90176845, "step": 4181, "time_per_iteration": 2.679734468460083 }, { "auxiliary_loss_clip": 0.01173341, "auxiliary_loss_mlp": 0.01030567, "balance_loss_clip": 1.05343735, "balance_loss_mlp": 1.02296185, "epoch": 0.5028557686526784, "flos": 24791147233920.0, "grad_norm": 1.9107429587290148, "language_loss": 0.68004799, "learning_rate": 2.078655846276108e-06, "loss": 0.70208704, "num_input_tokens_seen": 90197175, "step": 4182, "time_per_iteration": 2.657201051712036 }, { "auxiliary_loss_clip": 0.01149015, "auxiliary_loss_mlp": 0.01031978, "balance_loss_clip": 1.04682672, "balance_loss_mlp": 1.02359772, "epoch": 0.5029760115433175, "flos": 22966992990720.0, "grad_norm": 1.9810689739267082, "language_loss": 0.68829399, "learning_rate": 2.0778774713276727e-06, "loss": 0.71010393, "num_input_tokens_seen": 90216650, "step": 4183, "time_per_iteration": 2.720242738723755 }, { "auxiliary_loss_clip": 0.01165587, "auxiliary_loss_mlp": 0.01027417, "balance_loss_clip": 1.04706454, "balance_loss_mlp": 1.01940656, "epoch": 0.5030962544339566, "flos": 15305164485120.0, "grad_norm": 2.371071605341998, "language_loss": 0.67727637, "learning_rate": 2.077099084565287e-06, "loss": 0.69920641, "num_input_tokens_seen": 90234055, "step": 4184, "time_per_iteration": 2.599565267562866 }, { "auxiliary_loss_clip": 0.01147777, "auxiliary_loss_mlp": 0.01026985, "balance_loss_clip": 1.0446291, "balance_loss_mlp": 1.01849139, "epoch": 0.5032164973245957, "flos": 24494847943680.0, "grad_norm": 2.2661487330041608, "language_loss": 0.65069807, "learning_rate": 2.0763206861070313e-06, "loss": 0.67244571, "num_input_tokens_seen": 90253115, "step": 4185, "time_per_iteration": 2.7256710529327393 }, { "auxiliary_loss_clip": 0.01182704, "auxiliary_loss_mlp": 0.01029488, "balance_loss_clip": 1.05009592, "balance_loss_mlp": 1.02134025, "epoch": 0.5033367402152348, "flos": 16213452721920.0, "grad_norm": 1.9891549725504352, "language_loss": 0.75055361, "learning_rate": 2.0755422760709876e-06, "loss": 0.77267551, "num_input_tokens_seen": 90270515, "step": 4186, "time_per_iteration": 2.612501621246338 }, { "auxiliary_loss_clip": 0.01118138, "auxiliary_loss_mlp": 0.01023651, "balance_loss_clip": 1.04280448, "balance_loss_mlp": 1.01616502, "epoch": 0.5034569831058738, "flos": 21391375927680.0, "grad_norm": 1.8953053516344838, "language_loss": 0.76963735, "learning_rate": 2.0747638545752417e-06, "loss": 0.7910552, "num_input_tokens_seen": 90289075, "step": 4187, "time_per_iteration": 2.7648212909698486 }, { "auxiliary_loss_clip": 0.01150832, "auxiliary_loss_mlp": 0.01031836, "balance_loss_clip": 1.04811001, "balance_loss_mlp": 1.02381873, "epoch": 0.503577225996513, "flos": 20558751690240.0, "grad_norm": 2.2048373721697563, "language_loss": 0.83291066, "learning_rate": 2.073985421737878e-06, "loss": 0.85473734, "num_input_tokens_seen": 90306385, "step": 4188, "time_per_iteration": 2.725551128387451 }, { "auxiliary_loss_clip": 0.01172653, "auxiliary_loss_mlp": 0.01032955, "balance_loss_clip": 1.05143619, "balance_loss_mlp": 1.02441919, "epoch": 0.5036974688871521, "flos": 27229157930880.0, "grad_norm": 2.762331733466256, "language_loss": 0.74345297, "learning_rate": 2.0732069776769844e-06, "loss": 0.76550901, "num_input_tokens_seen": 90323795, "step": 4189, "time_per_iteration": 2.6788506507873535 }, { "auxiliary_loss_clip": 0.01185884, "auxiliary_loss_mlp": 0.01027899, "balance_loss_clip": 1.05404139, "balance_loss_mlp": 1.01951265, "epoch": 0.5038177117777911, "flos": 20412164286720.0, "grad_norm": 1.9776931605191572, "language_loss": 0.73219907, "learning_rate": 2.072428522510651e-06, "loss": 0.75433683, "num_input_tokens_seen": 90340360, "step": 4190, "time_per_iteration": 2.6441097259521484 }, { "auxiliary_loss_clip": 0.01137838, "auxiliary_loss_mlp": 0.01031637, "balance_loss_clip": 1.04736614, "balance_loss_mlp": 1.02329206, "epoch": 0.5039379546684303, "flos": 21907987286400.0, "grad_norm": 2.3819020572562697, "language_loss": 0.76073796, "learning_rate": 2.071650056356968e-06, "loss": 0.78243268, "num_input_tokens_seen": 90357900, "step": 4191, "time_per_iteration": 2.7456560134887695 }, { "auxiliary_loss_clip": 0.01183538, "auxiliary_loss_mlp": 0.01036449, "balance_loss_clip": 1.05217659, "balance_loss_mlp": 1.02814054, "epoch": 0.5040581975590693, "flos": 20010718909440.0, "grad_norm": 2.1482425177727915, "language_loss": 0.80078697, "learning_rate": 2.070871579334028e-06, "loss": 0.82298684, "num_input_tokens_seen": 90377010, "step": 4192, "time_per_iteration": 2.71087646484375 }, { "auxiliary_loss_clip": 0.01178815, "auxiliary_loss_mlp": 0.0103025, "balance_loss_clip": 1.04809928, "balance_loss_mlp": 1.0224061, "epoch": 0.5041784404497084, "flos": 20959837931520.0, "grad_norm": 3.0613769990018804, "language_loss": 0.71634221, "learning_rate": 2.0700930915599264e-06, "loss": 0.73843282, "num_input_tokens_seen": 90396740, "step": 4193, "time_per_iteration": 2.5917296409606934 }, { "auxiliary_loss_clip": 0.01181603, "auxiliary_loss_mlp": 0.01027026, "balance_loss_clip": 1.05084777, "balance_loss_mlp": 1.01962352, "epoch": 0.5042986833403476, "flos": 12495082757760.0, "grad_norm": 2.087851619360629, "language_loss": 0.78381133, "learning_rate": 2.0693145931527583e-06, "loss": 0.80589765, "num_input_tokens_seen": 90413220, "step": 4194, "time_per_iteration": 2.7896101474761963 }, { "auxiliary_loss_clip": 0.01150635, "auxiliary_loss_mlp": 0.01028189, "balance_loss_clip": 1.04835522, "balance_loss_mlp": 1.0203923, "epoch": 0.5044189262309866, "flos": 29202305788800.0, "grad_norm": 1.5224714572593459, "language_loss": 0.77984488, "learning_rate": 2.068536084230622e-06, "loss": 0.80163312, "num_input_tokens_seen": 90435085, "step": 4195, "time_per_iteration": 2.7925758361816406 }, { "auxiliary_loss_clip": 0.01169824, "auxiliary_loss_mlp": 0.01028026, "balance_loss_clip": 1.05130756, "balance_loss_mlp": 1.01928234, "epoch": 0.5045391691216257, "flos": 23873198238720.0, "grad_norm": 2.365633982538467, "language_loss": 0.89017838, "learning_rate": 2.067757564911616e-06, "loss": 0.91215694, "num_input_tokens_seen": 90453660, "step": 4196, "time_per_iteration": 2.708341360092163 }, { "auxiliary_loss_clip": 0.01163782, "auxiliary_loss_mlp": 0.00763246, "balance_loss_clip": 1.04948032, "balance_loss_mlp": 1.00026011, "epoch": 0.5046594120122648, "flos": 24644990793600.0, "grad_norm": 1.9052011721899877, "language_loss": 0.92615998, "learning_rate": 2.0669790353138407e-06, "loss": 0.94543028, "num_input_tokens_seen": 90472625, "step": 4197, "time_per_iteration": 3.6740736961364746 }, { "auxiliary_loss_clip": 0.01135106, "auxiliary_loss_mlp": 0.0076308, "balance_loss_clip": 1.04639912, "balance_loss_mlp": 1.00026917, "epoch": 0.5047796549029039, "flos": 23362835846400.0, "grad_norm": 2.3464352578273964, "language_loss": 0.73345006, "learning_rate": 2.0662004955553995e-06, "loss": 0.75243187, "num_input_tokens_seen": 90492325, "step": 4198, "time_per_iteration": 3.6061909198760986 }, { "auxiliary_loss_clip": 0.01151051, "auxiliary_loss_mlp": 0.01026173, "balance_loss_clip": 1.04735279, "balance_loss_mlp": 1.01865053, "epoch": 0.5048998977935429, "flos": 17304095329920.0, "grad_norm": 1.9213310851157395, "language_loss": 0.76631427, "learning_rate": 2.065421945754395e-06, "loss": 0.78808659, "num_input_tokens_seen": 90510055, "step": 4199, "time_per_iteration": 3.6012279987335205 }, { "auxiliary_loss_clip": 0.01128594, "auxiliary_loss_mlp": 0.01029984, "balance_loss_clip": 1.04696274, "balance_loss_mlp": 1.0226171, "epoch": 0.505020140684182, "flos": 34856979235200.0, "grad_norm": 1.7394713021140098, "language_loss": 0.78091085, "learning_rate": 2.0646433860289344e-06, "loss": 0.80249667, "num_input_tokens_seen": 90528980, "step": 4200, "time_per_iteration": 2.8401331901550293 }, { "auxiliary_loss_clip": 0.01173978, "auxiliary_loss_mlp": 0.00763338, "balance_loss_clip": 1.05033624, "balance_loss_mlp": 1.00025833, "epoch": 0.5051403835748212, "flos": 24863974058880.0, "grad_norm": 2.048517698415416, "language_loss": 0.82808638, "learning_rate": 2.0638648164971233e-06, "loss": 0.84745955, "num_input_tokens_seen": 90547445, "step": 4201, "time_per_iteration": 2.6215784549713135 }, { "auxiliary_loss_clip": 0.01153025, "auxiliary_loss_mlp": 0.01026425, "balance_loss_clip": 1.04839611, "balance_loss_mlp": 1.01908755, "epoch": 0.5052606264654602, "flos": 20959694277120.0, "grad_norm": 1.7309860575282148, "language_loss": 0.88424087, "learning_rate": 2.06308623727707e-06, "loss": 0.9060353, "num_input_tokens_seen": 90567545, "step": 4202, "time_per_iteration": 3.6678428649902344 }, { "auxiliary_loss_clip": 0.01167767, "auxiliary_loss_mlp": 0.01033012, "balance_loss_clip": 1.0504086, "balance_loss_mlp": 1.02381456, "epoch": 0.5053808693560993, "flos": 19642382893440.0, "grad_norm": 2.225811532672944, "language_loss": 0.76795447, "learning_rate": 2.0623076484868846e-06, "loss": 0.78996217, "num_input_tokens_seen": 90585000, "step": 4203, "time_per_iteration": 2.6137685775756836 }, { "auxiliary_loss_clip": 0.01051679, "auxiliary_loss_mlp": 0.01006888, "balance_loss_clip": 1.01127982, "balance_loss_mlp": 1.00532675, "epoch": 0.5055011122467384, "flos": 67504915019520.0, "grad_norm": 0.8311054218088607, "language_loss": 0.60642612, "learning_rate": 2.061529050244679e-06, "loss": 0.62701178, "num_input_tokens_seen": 90644745, "step": 4204, "time_per_iteration": 3.2554993629455566 }, { "auxiliary_loss_clip": 0.01147882, "auxiliary_loss_mlp": 0.01024327, "balance_loss_clip": 1.04707432, "balance_loss_mlp": 1.01631284, "epoch": 0.5056213551373775, "flos": 16872952383360.0, "grad_norm": 2.721617058483369, "language_loss": 0.7415899, "learning_rate": 2.060750442668565e-06, "loss": 0.76331198, "num_input_tokens_seen": 90662500, "step": 4205, "time_per_iteration": 2.721179246902466 }, { "auxiliary_loss_clip": 0.01170658, "auxiliary_loss_mlp": 0.01026199, "balance_loss_clip": 1.05385756, "balance_loss_mlp": 1.01824164, "epoch": 0.5057415980280165, "flos": 15334179696000.0, "grad_norm": 2.684867526835601, "language_loss": 0.63983607, "learning_rate": 2.059971825876657e-06, "loss": 0.66180456, "num_input_tokens_seen": 90677010, "step": 4206, "time_per_iteration": 2.6170902252197266 }, { "auxiliary_loss_clip": 0.0117163, "auxiliary_loss_mlp": 0.01024158, "balance_loss_clip": 1.05098903, "balance_loss_mlp": 1.01627207, "epoch": 0.5058618409186557, "flos": 19025976574080.0, "grad_norm": 2.0819972655753363, "language_loss": 0.76738554, "learning_rate": 2.0591931999870713e-06, "loss": 0.78934336, "num_input_tokens_seen": 90695935, "step": 4207, "time_per_iteration": 2.7252726554870605 }, { "auxiliary_loss_clip": 0.01061513, "auxiliary_loss_mlp": 0.01003424, "balance_loss_clip": 1.0118432, "balance_loss_mlp": 1.00181425, "epoch": 0.5059820838092948, "flos": 63453114080640.0, "grad_norm": 0.8291613101492425, "language_loss": 0.57600707, "learning_rate": 2.0584145651179234e-06, "loss": 0.59665644, "num_input_tokens_seen": 90751645, "step": 4208, "time_per_iteration": 3.241603374481201 }, { "auxiliary_loss_clip": 0.01153731, "auxiliary_loss_mlp": 0.00762731, "balance_loss_clip": 1.05026317, "balance_loss_mlp": 1.00032687, "epoch": 0.5061023266999338, "flos": 15441803821440.0, "grad_norm": 4.767999856636263, "language_loss": 0.80413616, "learning_rate": 2.0576359213873327e-06, "loss": 0.82330084, "num_input_tokens_seen": 90766795, "step": 4209, "time_per_iteration": 2.679694414138794 }, { "auxiliary_loss_clip": 0.01161757, "auxiliary_loss_mlp": 0.01026563, "balance_loss_clip": 1.04710221, "balance_loss_mlp": 1.01865911, "epoch": 0.506222569590573, "flos": 22451063990400.0, "grad_norm": 3.423705379713862, "language_loss": 0.70162678, "learning_rate": 2.056857268913419e-06, "loss": 0.72350997, "num_input_tokens_seen": 90786845, "step": 4210, "time_per_iteration": 2.680133581161499 }, { "auxiliary_loss_clip": 0.0117011, "auxiliary_loss_mlp": 0.01031419, "balance_loss_clip": 1.05205917, "balance_loss_mlp": 1.02315199, "epoch": 0.506342812481212, "flos": 17558665994880.0, "grad_norm": 2.4299161285522564, "language_loss": 0.84146369, "learning_rate": 2.056078607814303e-06, "loss": 0.8634789, "num_input_tokens_seen": 90802630, "step": 4211, "time_per_iteration": 2.6550557613372803 }, { "auxiliary_loss_clip": 0.01168368, "auxiliary_loss_mlp": 0.01033144, "balance_loss_clip": 1.05073667, "balance_loss_mlp": 1.02514493, "epoch": 0.5064630553718511, "flos": 23402050519680.0, "grad_norm": 2.117006350919607, "language_loss": 0.78300989, "learning_rate": 2.055299938208106e-06, "loss": 0.80502498, "num_input_tokens_seen": 90823620, "step": 4212, "time_per_iteration": 2.636911392211914 }, { "auxiliary_loss_clip": 0.01173655, "auxiliary_loss_mlp": 0.01032618, "balance_loss_clip": 1.05243528, "balance_loss_mlp": 1.02481592, "epoch": 0.5065832982624903, "flos": 23987035416960.0, "grad_norm": 1.6155203275529053, "language_loss": 0.86070669, "learning_rate": 2.0545212602129526e-06, "loss": 0.88276941, "num_input_tokens_seen": 90843475, "step": 4213, "time_per_iteration": 2.6521801948547363 }, { "auxiliary_loss_clip": 0.01149627, "auxiliary_loss_mlp": 0.01027832, "balance_loss_clip": 1.04679322, "balance_loss_mlp": 1.01963925, "epoch": 0.5067035411531293, "flos": 21503058289920.0, "grad_norm": 4.663001567306374, "language_loss": 0.66551018, "learning_rate": 2.0537425739469673e-06, "loss": 0.68728483, "num_input_tokens_seen": 90862410, "step": 4214, "time_per_iteration": 2.7132513523101807 }, { "auxiliary_loss_clip": 0.01068057, "auxiliary_loss_mlp": 0.01002848, "balance_loss_clip": 1.01238632, "balance_loss_mlp": 1.00128675, "epoch": 0.5068237840437684, "flos": 65934397687680.0, "grad_norm": 0.834543569772369, "language_loss": 0.59419799, "learning_rate": 2.052963879528276e-06, "loss": 0.61490703, "num_input_tokens_seen": 90922280, "step": 4215, "time_per_iteration": 3.1462759971618652 }, { "auxiliary_loss_clip": 0.01169314, "auxiliary_loss_mlp": 0.01029509, "balance_loss_clip": 1.05098259, "balance_loss_mlp": 1.02109861, "epoch": 0.5069440269344075, "flos": 27264206626560.0, "grad_norm": 2.096957256263172, "language_loss": 0.764, "learning_rate": 2.052185177075007e-06, "loss": 0.78598821, "num_input_tokens_seen": 90941850, "step": 4216, "time_per_iteration": 2.6854097843170166 }, { "auxiliary_loss_clip": 0.01170837, "auxiliary_loss_mlp": 0.01031429, "balance_loss_clip": 1.04981625, "balance_loss_mlp": 1.02272081, "epoch": 0.5070642698250466, "flos": 23366319465600.0, "grad_norm": 2.0454657097439877, "language_loss": 0.8280288, "learning_rate": 2.051406466705288e-06, "loss": 0.85005146, "num_input_tokens_seen": 90961390, "step": 4217, "time_per_iteration": 2.6286449432373047 }, { "auxiliary_loss_clip": 0.01180719, "auxiliary_loss_mlp": 0.01020828, "balance_loss_clip": 1.04968929, "balance_loss_mlp": 1.01329958, "epoch": 0.5071845127156857, "flos": 20340127560960.0, "grad_norm": 2.270308472242166, "language_loss": 0.81226552, "learning_rate": 2.0506277485372486e-06, "loss": 0.83428103, "num_input_tokens_seen": 90980215, "step": 4218, "time_per_iteration": 2.5990729331970215 }, { "auxiliary_loss_clip": 0.01161992, "auxiliary_loss_mlp": 0.01025293, "balance_loss_clip": 1.04863811, "balance_loss_mlp": 1.01716316, "epoch": 0.5073047556063248, "flos": 12092955022080.0, "grad_norm": 2.949332957722248, "language_loss": 0.66787112, "learning_rate": 2.04984902268902e-06, "loss": 0.689744, "num_input_tokens_seen": 90997415, "step": 4219, "time_per_iteration": 2.6079375743865967 }, { "auxiliary_loss_clip": 0.01175442, "auxiliary_loss_mlp": 0.01035279, "balance_loss_clip": 1.04986155, "balance_loss_mlp": 1.02634978, "epoch": 0.5074249984969639, "flos": 19682854542720.0, "grad_norm": 2.099156481898898, "language_loss": 0.75360185, "learning_rate": 2.0490702892787345e-06, "loss": 0.77570903, "num_input_tokens_seen": 91016475, "step": 4220, "time_per_iteration": 2.670884132385254 }, { "auxiliary_loss_clip": 0.01158816, "auxiliary_loss_mlp": 0.01029584, "balance_loss_clip": 1.04590034, "balance_loss_mlp": 1.02175808, "epoch": 0.5075452413876029, "flos": 28765703975040.0, "grad_norm": 1.5264643261504691, "language_loss": 0.62268585, "learning_rate": 2.0482915484245246e-06, "loss": 0.64456987, "num_input_tokens_seen": 91038095, "step": 4221, "time_per_iteration": 2.7097678184509277 }, { "auxiliary_loss_clip": 0.01121815, "auxiliary_loss_mlp": 0.0102915, "balance_loss_clip": 1.04657698, "balance_loss_mlp": 1.02019191, "epoch": 0.5076654842782421, "flos": 20339445202560.0, "grad_norm": 2.5321150910992993, "language_loss": 0.84112084, "learning_rate": 2.047512800244526e-06, "loss": 0.86263049, "num_input_tokens_seen": 91053360, "step": 4222, "time_per_iteration": 2.7460930347442627 }, { "auxiliary_loss_clip": 0.01168015, "auxiliary_loss_mlp": 0.01023536, "balance_loss_clip": 1.04994285, "balance_loss_mlp": 1.01514339, "epoch": 0.5077857271688812, "flos": 26359653404160.0, "grad_norm": 1.9381016844735832, "language_loss": 0.79083014, "learning_rate": 2.046734044856873e-06, "loss": 0.81274569, "num_input_tokens_seen": 91072770, "step": 4223, "time_per_iteration": 3.682727813720703 }, { "auxiliary_loss_clip": 0.01168261, "auxiliary_loss_mlp": 0.01029357, "balance_loss_clip": 1.05041146, "balance_loss_mlp": 1.02162039, "epoch": 0.5079059700595202, "flos": 21798962530560.0, "grad_norm": 2.011576481806382, "language_loss": 0.81113017, "learning_rate": 2.045955282379702e-06, "loss": 0.83310634, "num_input_tokens_seen": 91091430, "step": 4224, "time_per_iteration": 3.633483409881592 }, { "auxiliary_loss_clip": 0.01165101, "auxiliary_loss_mlp": 0.01030546, "balance_loss_clip": 1.04671836, "balance_loss_mlp": 1.02188516, "epoch": 0.5080262129501594, "flos": 13187943175680.0, "grad_norm": 3.604550938465008, "language_loss": 0.75323957, "learning_rate": 2.045176512931152e-06, "loss": 0.77519602, "num_input_tokens_seen": 91106060, "step": 4225, "time_per_iteration": 3.5453405380249023 }, { "auxiliary_loss_clip": 0.01143471, "auxiliary_loss_mlp": 0.01029414, "balance_loss_clip": 1.0465169, "balance_loss_mlp": 1.02180862, "epoch": 0.5081464558407984, "flos": 25301473712640.0, "grad_norm": 2.9211346177553517, "language_loss": 0.76338828, "learning_rate": 2.0443977366293604e-06, "loss": 0.78511709, "num_input_tokens_seen": 91124100, "step": 4226, "time_per_iteration": 2.7462334632873535 }, { "auxiliary_loss_clip": 0.01111123, "auxiliary_loss_mlp": 0.01028576, "balance_loss_clip": 1.04251766, "balance_loss_mlp": 1.02032661, "epoch": 0.5082666987314375, "flos": 30951226995840.0, "grad_norm": 2.256581258433177, "language_loss": 0.77009809, "learning_rate": 2.043618953592468e-06, "loss": 0.79149508, "num_input_tokens_seen": 91146555, "step": 4227, "time_per_iteration": 3.7615737915039062 }, { "auxiliary_loss_clip": 0.01156124, "auxiliary_loss_mlp": 0.01030895, "balance_loss_clip": 1.05103981, "balance_loss_mlp": 1.02214539, "epoch": 0.5083869416220766, "flos": 19682495406720.0, "grad_norm": 1.507943147901373, "language_loss": 0.81143141, "learning_rate": 2.0428401639386144e-06, "loss": 0.8333016, "num_input_tokens_seen": 91167120, "step": 4228, "time_per_iteration": 2.707005262374878 }, { "auxiliary_loss_clip": 0.01049131, "auxiliary_loss_mlp": 0.01001144, "balance_loss_clip": 1.01066899, "balance_loss_mlp": 0.99968398, "epoch": 0.5085071845127157, "flos": 71817535589760.0, "grad_norm": 0.8221350246816006, "language_loss": 0.58067518, "learning_rate": 2.042061367785943e-06, "loss": 0.60117793, "num_input_tokens_seen": 91220260, "step": 4229, "time_per_iteration": 3.201850175857544 }, { "auxiliary_loss_clip": 0.01140675, "auxiliary_loss_mlp": 0.01030989, "balance_loss_clip": 1.04289937, "balance_loss_mlp": 1.02231622, "epoch": 0.5086274274033548, "flos": 35951608252800.0, "grad_norm": 2.2674704311301874, "language_loss": 0.75350654, "learning_rate": 2.041282565252594e-06, "loss": 0.7752232, "num_input_tokens_seen": 91240425, "step": 4230, "time_per_iteration": 2.810241222381592 }, { "auxiliary_loss_clip": 0.01139042, "auxiliary_loss_mlp": 0.01025061, "balance_loss_clip": 1.04504919, "balance_loss_mlp": 1.01728857, "epoch": 0.5087476702939938, "flos": 23513732881920.0, "grad_norm": 1.7878090994965394, "language_loss": 0.77107334, "learning_rate": 2.040503756456714e-06, "loss": 0.79271448, "num_input_tokens_seen": 91259635, "step": 4231, "time_per_iteration": 2.7487435340881348 }, { "auxiliary_loss_clip": 0.01161135, "auxiliary_loss_mlp": 0.01031866, "balance_loss_clip": 1.04656458, "balance_loss_mlp": 1.02264464, "epoch": 0.508867913184633, "flos": 15122091841920.0, "grad_norm": 2.0470382258209954, "language_loss": 0.79175389, "learning_rate": 2.0397249415164456e-06, "loss": 0.81368387, "num_input_tokens_seen": 91276990, "step": 4232, "time_per_iteration": 2.602362632751465 }, { "auxiliary_loss_clip": 0.01143987, "auxiliary_loss_mlp": 0.01028278, "balance_loss_clip": 1.04245234, "balance_loss_mlp": 1.01982641, "epoch": 0.508988156075272, "flos": 25885309374720.0, "grad_norm": 1.7136886119577657, "language_loss": 0.80243319, "learning_rate": 2.0389461205499354e-06, "loss": 0.82415581, "num_input_tokens_seen": 91296125, "step": 4233, "time_per_iteration": 2.7469444274902344 }, { "auxiliary_loss_clip": 0.01143019, "auxiliary_loss_mlp": 0.01024384, "balance_loss_clip": 1.04567671, "balance_loss_mlp": 1.0165695, "epoch": 0.5091083989659111, "flos": 13844857057920.0, "grad_norm": 1.7901303361785612, "language_loss": 0.73562646, "learning_rate": 2.03816729367533e-06, "loss": 0.7573005, "num_input_tokens_seen": 91314280, "step": 4234, "time_per_iteration": 2.643343210220337 }, { "auxiliary_loss_clip": 0.0115789, "auxiliary_loss_mlp": 0.01028149, "balance_loss_clip": 1.04926968, "balance_loss_mlp": 1.02023327, "epoch": 0.5092286418565503, "flos": 21104881050240.0, "grad_norm": 3.8234780064007907, "language_loss": 0.71383798, "learning_rate": 2.0373884610107765e-06, "loss": 0.73569834, "num_input_tokens_seen": 91334595, "step": 4235, "time_per_iteration": 2.7530951499938965 }, { "auxiliary_loss_clip": 0.01171049, "auxiliary_loss_mlp": 0.01025056, "balance_loss_clip": 1.04745901, "balance_loss_mlp": 1.01751637, "epoch": 0.5093488847471893, "flos": 18621298972800.0, "grad_norm": 2.827427389745533, "language_loss": 0.69465226, "learning_rate": 2.0366096226744225e-06, "loss": 0.71661329, "num_input_tokens_seen": 91349790, "step": 4236, "time_per_iteration": 2.5716726779937744 }, { "auxiliary_loss_clip": 0.01159533, "auxiliary_loss_mlp": 0.01030529, "balance_loss_clip": 1.04710948, "balance_loss_mlp": 1.02255964, "epoch": 0.5094691276378284, "flos": 23803783205760.0, "grad_norm": 1.884613245947169, "language_loss": 0.76803505, "learning_rate": 2.035830778784418e-06, "loss": 0.78993565, "num_input_tokens_seen": 91370465, "step": 4237, "time_per_iteration": 2.666252374649048 }, { "auxiliary_loss_clip": 0.01159611, "auxiliary_loss_mlp": 0.01033369, "balance_loss_clip": 1.05236673, "balance_loss_mlp": 1.02573705, "epoch": 0.5095893705284675, "flos": 17420410546560.0, "grad_norm": 3.4601602087226313, "language_loss": 0.79902422, "learning_rate": 2.0350519294589134e-06, "loss": 0.82095408, "num_input_tokens_seen": 91388505, "step": 4238, "time_per_iteration": 2.6558332443237305 }, { "auxiliary_loss_clip": 0.0112326, "auxiliary_loss_mlp": 0.01030939, "balance_loss_clip": 1.04187131, "balance_loss_mlp": 1.02212381, "epoch": 0.5097096134191066, "flos": 25849362839040.0, "grad_norm": 1.6992148136169019, "language_loss": 0.82559937, "learning_rate": 2.0342730748160588e-06, "loss": 0.84714139, "num_input_tokens_seen": 91408970, "step": 4239, "time_per_iteration": 2.846330165863037 }, { "auxiliary_loss_clip": 0.0115084, "auxiliary_loss_mlp": 0.01030062, "balance_loss_clip": 1.04413962, "balance_loss_mlp": 1.02101374, "epoch": 0.5098298563097456, "flos": 27745122844800.0, "grad_norm": 2.0694718899715054, "language_loss": 0.70392418, "learning_rate": 2.033494214974006e-06, "loss": 0.72573316, "num_input_tokens_seen": 91430115, "step": 4240, "time_per_iteration": 2.7652196884155273 }, { "auxiliary_loss_clip": 0.0114402, "auxiliary_loss_mlp": 0.01031944, "balance_loss_clip": 1.04774833, "balance_loss_mlp": 1.02411199, "epoch": 0.5099500992003848, "flos": 21358913011200.0, "grad_norm": 1.794649802168663, "language_loss": 0.83729398, "learning_rate": 2.0327153500509067e-06, "loss": 0.85905361, "num_input_tokens_seen": 91449140, "step": 4241, "time_per_iteration": 2.647798538208008 }, { "auxiliary_loss_clip": 0.01158512, "auxiliary_loss_mlp": 0.01027116, "balance_loss_clip": 1.05075884, "balance_loss_mlp": 1.01874113, "epoch": 0.5100703420910239, "flos": 19865999013120.0, "grad_norm": 1.8614640784367724, "language_loss": 0.84557128, "learning_rate": 2.031936480164916e-06, "loss": 0.86742759, "num_input_tokens_seen": 91466880, "step": 4242, "time_per_iteration": 2.7399678230285645 }, { "auxiliary_loss_clip": 0.01154377, "auxiliary_loss_mlp": 0.0103076, "balance_loss_clip": 1.05145836, "balance_loss_mlp": 1.0228442, "epoch": 0.5101905849816629, "flos": 24648797635200.0, "grad_norm": 2.2683545006256565, "language_loss": 0.80711693, "learning_rate": 2.0311576054341857e-06, "loss": 0.82896829, "num_input_tokens_seen": 91487495, "step": 4243, "time_per_iteration": 2.7955117225646973 }, { "auxiliary_loss_clip": 0.0118789, "auxiliary_loss_mlp": 0.01023954, "balance_loss_clip": 1.05485356, "balance_loss_mlp": 1.01572871, "epoch": 0.5103108278723021, "flos": 22930076787840.0, "grad_norm": 1.7032508627711769, "language_loss": 0.62651908, "learning_rate": 2.0303787259768715e-06, "loss": 0.64863753, "num_input_tokens_seen": 91508395, "step": 4244, "time_per_iteration": 2.6356124877929688 }, { "auxiliary_loss_clip": 0.01156915, "auxiliary_loss_mlp": 0.01027332, "balance_loss_clip": 1.05094552, "balance_loss_mlp": 1.01904345, "epoch": 0.5104310707629411, "flos": 21506613736320.0, "grad_norm": 2.213187162513074, "language_loss": 0.68718481, "learning_rate": 2.0295998419111294e-06, "loss": 0.70902729, "num_input_tokens_seen": 91525685, "step": 4245, "time_per_iteration": 2.660735845565796 }, { "auxiliary_loss_clip": 0.01115358, "auxiliary_loss_mlp": 0.01034647, "balance_loss_clip": 1.04345536, "balance_loss_mlp": 1.02722585, "epoch": 0.5105513136535802, "flos": 14903180403840.0, "grad_norm": 3.027812203900172, "language_loss": 0.73523748, "learning_rate": 2.028820953355115e-06, "loss": 0.75673753, "num_input_tokens_seen": 91543785, "step": 4246, "time_per_iteration": 2.8027307987213135 }, { "auxiliary_loss_clip": 0.01162914, "auxiliary_loss_mlp": 0.01027048, "balance_loss_clip": 1.04992092, "balance_loss_mlp": 1.0181551, "epoch": 0.5106715565442194, "flos": 22602212421120.0, "grad_norm": 2.425300865988715, "language_loss": 0.78717911, "learning_rate": 2.0280420604269834e-06, "loss": 0.80907869, "num_input_tokens_seen": 91563325, "step": 4247, "time_per_iteration": 2.7292001247406006 }, { "auxiliary_loss_clip": 0.01066434, "auxiliary_loss_mlp": 0.01005766, "balance_loss_clip": 1.0131892, "balance_loss_mlp": 1.00419223, "epoch": 0.5107917994348584, "flos": 71027645558400.0, "grad_norm": 0.7044462418895434, "language_loss": 0.58905363, "learning_rate": 2.027263163244895e-06, "loss": 0.60977566, "num_input_tokens_seen": 91632450, "step": 4248, "time_per_iteration": 3.3702101707458496 }, { "auxiliary_loss_clip": 0.01166681, "auxiliary_loss_mlp": 0.01025452, "balance_loss_clip": 1.05047011, "balance_loss_mlp": 1.01773357, "epoch": 0.5109120423254975, "flos": 24827416992000.0, "grad_norm": 1.5559987202658407, "language_loss": 0.74623263, "learning_rate": 2.026484261927005e-06, "loss": 0.76815397, "num_input_tokens_seen": 91651945, "step": 4249, "time_per_iteration": 3.6684329509735107 }, { "auxiliary_loss_clip": 0.01177382, "auxiliary_loss_mlp": 0.01029518, "balance_loss_clip": 1.054039, "balance_loss_mlp": 1.02122676, "epoch": 0.5110322852161366, "flos": 21247661612160.0, "grad_norm": 2.0522762036906865, "language_loss": 0.73607242, "learning_rate": 2.025705356591475e-06, "loss": 0.75814146, "num_input_tokens_seen": 91669635, "step": 4250, "time_per_iteration": 3.6083621978759766 }, { "auxiliary_loss_clip": 0.01041524, "auxiliary_loss_mlp": 0.00754167, "balance_loss_clip": 1.01146543, "balance_loss_mlp": 1.00056577, "epoch": 0.5111525281067757, "flos": 66457114358400.0, "grad_norm": 0.7580895053274237, "language_loss": 0.57960117, "learning_rate": 2.024926447356462e-06, "loss": 0.59755814, "num_input_tokens_seen": 91731920, "step": 4251, "time_per_iteration": 4.156554222106934 }, { "auxiliary_loss_clip": 0.01171486, "auxiliary_loss_mlp": 0.01027407, "balance_loss_clip": 1.05207133, "balance_loss_mlp": 1.01897335, "epoch": 0.5112727709974147, "flos": 14866731077760.0, "grad_norm": 2.025099352185664, "language_loss": 0.79051709, "learning_rate": 2.024147534340127e-06, "loss": 0.81250602, "num_input_tokens_seen": 91749780, "step": 4252, "time_per_iteration": 2.684636116027832 }, { "auxiliary_loss_clip": 0.01149682, "auxiliary_loss_mlp": 0.01025875, "balance_loss_clip": 1.04406285, "balance_loss_mlp": 1.01788771, "epoch": 0.5113930138880539, "flos": 21177600134400.0, "grad_norm": 1.5745444452456543, "language_loss": 0.79480481, "learning_rate": 2.02336861766063e-06, "loss": 0.81656027, "num_input_tokens_seen": 91768840, "step": 4253, "time_per_iteration": 2.6903581619262695 }, { "auxiliary_loss_clip": 0.01180987, "auxiliary_loss_mlp": 0.01033267, "balance_loss_clip": 1.05561101, "balance_loss_mlp": 1.02468395, "epoch": 0.511513256778693, "flos": 20409111630720.0, "grad_norm": 1.8723186010908548, "language_loss": 0.7870307, "learning_rate": 2.0225896974361327e-06, "loss": 0.80917329, "num_input_tokens_seen": 91788945, "step": 4254, "time_per_iteration": 3.6285934448242188 }, { "auxiliary_loss_clip": 0.01042841, "auxiliary_loss_mlp": 0.01005618, "balance_loss_clip": 1.01201415, "balance_loss_mlp": 1.00399709, "epoch": 0.511633499669332, "flos": 69879975131520.0, "grad_norm": 0.8651359059432232, "language_loss": 0.59944272, "learning_rate": 2.0218107737847962e-06, "loss": 0.61992741, "num_input_tokens_seen": 91850990, "step": 4255, "time_per_iteration": 3.2887799739837646 }, { "auxiliary_loss_clip": 0.01185306, "auxiliary_loss_mlp": 0.01029474, "balance_loss_clip": 1.05386555, "balance_loss_mlp": 1.02108157, "epoch": 0.5117537425599712, "flos": 24097855852800.0, "grad_norm": 2.6342995059425514, "language_loss": 0.74327075, "learning_rate": 2.0210318468247826e-06, "loss": 0.76541853, "num_input_tokens_seen": 91869960, "step": 4256, "time_per_iteration": 2.632917881011963 }, { "auxiliary_loss_clip": 0.01157301, "auxiliary_loss_mlp": 0.01030208, "balance_loss_clip": 1.04865766, "balance_loss_mlp": 1.02174449, "epoch": 0.5118739854506102, "flos": 20959550622720.0, "grad_norm": 1.8295152265896288, "language_loss": 0.81785405, "learning_rate": 2.020252916674255e-06, "loss": 0.83972913, "num_input_tokens_seen": 91889075, "step": 4257, "time_per_iteration": 2.763042688369751 }, { "auxiliary_loss_clip": 0.0117242, "auxiliary_loss_mlp": 0.01028029, "balance_loss_clip": 1.05064988, "balance_loss_mlp": 1.02030396, "epoch": 0.5119942283412493, "flos": 17457326749440.0, "grad_norm": 1.9272404792603803, "language_loss": 0.80987304, "learning_rate": 2.019473983451375e-06, "loss": 0.83187753, "num_input_tokens_seen": 91907495, "step": 4258, "time_per_iteration": 2.6178691387176514 }, { "auxiliary_loss_clip": 0.0114696, "auxiliary_loss_mlp": 0.01026603, "balance_loss_clip": 1.04692769, "balance_loss_mlp": 1.01844335, "epoch": 0.5121144712318885, "flos": 21066743784960.0, "grad_norm": 1.7925899813285122, "language_loss": 0.71570003, "learning_rate": 2.0186950472743076e-06, "loss": 0.7374357, "num_input_tokens_seen": 91927400, "step": 4259, "time_per_iteration": 2.7691292762756348 }, { "auxiliary_loss_clip": 0.01186069, "auxiliary_loss_mlp": 0.01025641, "balance_loss_clip": 1.0533402, "balance_loss_mlp": 1.01753473, "epoch": 0.5122347141225275, "flos": 19860791541120.0, "grad_norm": 1.63305182996989, "language_loss": 0.7391718, "learning_rate": 2.0179161082612162e-06, "loss": 0.76128894, "num_input_tokens_seen": 91946790, "step": 4260, "time_per_iteration": 2.6032440662384033 }, { "auxiliary_loss_clip": 0.0114899, "auxiliary_loss_mlp": 0.01025605, "balance_loss_clip": 1.04504967, "balance_loss_mlp": 1.01705217, "epoch": 0.5123549570131666, "flos": 22528487756160.0, "grad_norm": 1.9810234116866055, "language_loss": 0.72685218, "learning_rate": 2.017137166530266e-06, "loss": 0.7485981, "num_input_tokens_seen": 91966325, "step": 4261, "time_per_iteration": 2.668117046356201 }, { "auxiliary_loss_clip": 0.01162472, "auxiliary_loss_mlp": 0.01028829, "balance_loss_clip": 1.05199933, "balance_loss_mlp": 1.02092516, "epoch": 0.5124751999038056, "flos": 20333375804160.0, "grad_norm": 5.1719832720519845, "language_loss": 0.80202305, "learning_rate": 2.0163582221996213e-06, "loss": 0.8239361, "num_input_tokens_seen": 91984700, "step": 4262, "time_per_iteration": 2.715456962585449 }, { "auxiliary_loss_clip": 0.01156224, "auxiliary_loss_mlp": 0.01025792, "balance_loss_clip": 1.04874611, "balance_loss_mlp": 1.01721513, "epoch": 0.5125954427944448, "flos": 39785970211200.0, "grad_norm": 2.1120652283810024, "language_loss": 0.68114227, "learning_rate": 2.015579275387446e-06, "loss": 0.70296246, "num_input_tokens_seen": 92010020, "step": 4263, "time_per_iteration": 2.794590711593628 }, { "auxiliary_loss_clip": 0.0114883, "auxiliary_loss_mlp": 0.01030601, "balance_loss_clip": 1.04952013, "balance_loss_mlp": 1.02246475, "epoch": 0.5127156856850839, "flos": 29205394358400.0, "grad_norm": 1.8700899212970121, "language_loss": 0.68795878, "learning_rate": 2.0148003262119085e-06, "loss": 0.7097531, "num_input_tokens_seen": 92030990, "step": 4264, "time_per_iteration": 2.7926645278930664 }, { "auxiliary_loss_clip": 0.0114204, "auxiliary_loss_mlp": 0.01026019, "balance_loss_clip": 1.0480423, "balance_loss_mlp": 1.01758456, "epoch": 0.5128359285757229, "flos": 13553693412480.0, "grad_norm": 1.9056512514797443, "language_loss": 0.7653532, "learning_rate": 2.0140213747911728e-06, "loss": 0.7870338, "num_input_tokens_seen": 92049525, "step": 4265, "time_per_iteration": 2.7287943363189697 }, { "auxiliary_loss_clip": 0.01142349, "auxiliary_loss_mlp": 0.01032716, "balance_loss_clip": 1.05145586, "balance_loss_mlp": 1.02441931, "epoch": 0.5129561714663621, "flos": 25192089820800.0, "grad_norm": 2.0986063084028856, "language_loss": 0.80794322, "learning_rate": 2.013242421243406e-06, "loss": 0.82969391, "num_input_tokens_seen": 92068430, "step": 4266, "time_per_iteration": 2.807281017303467 }, { "auxiliary_loss_clip": 0.01129523, "auxiliary_loss_mlp": 0.01029965, "balance_loss_clip": 1.04948401, "balance_loss_mlp": 1.02177811, "epoch": 0.5130764143570011, "flos": 18150223080960.0, "grad_norm": 1.7032575172625988, "language_loss": 0.789662, "learning_rate": 2.012463465686774e-06, "loss": 0.81125689, "num_input_tokens_seen": 92088180, "step": 4267, "time_per_iteration": 2.755438804626465 }, { "auxiliary_loss_clip": 0.01043825, "auxiliary_loss_mlp": 0.01002901, "balance_loss_clip": 1.01956868, "balance_loss_mlp": 1.00147653, "epoch": 0.5131966572476402, "flos": 59794896418560.0, "grad_norm": 0.766405452087811, "language_loss": 0.54804635, "learning_rate": 2.0116845082394446e-06, "loss": 0.56851363, "num_input_tokens_seen": 92153015, "step": 4268, "time_per_iteration": 3.319882392883301 }, { "auxiliary_loss_clip": 0.01170277, "auxiliary_loss_mlp": 0.01028815, "balance_loss_clip": 1.04777825, "balance_loss_mlp": 1.02083969, "epoch": 0.5133169001382794, "flos": 18515219132160.0, "grad_norm": 2.0464611180038585, "language_loss": 0.78935778, "learning_rate": 2.0109055490195836e-06, "loss": 0.81134868, "num_input_tokens_seen": 92171470, "step": 4269, "time_per_iteration": 2.6687822341918945 }, { "auxiliary_loss_clip": 0.0111896, "auxiliary_loss_mlp": 0.01029802, "balance_loss_clip": 1.04021096, "balance_loss_mlp": 1.0217073, "epoch": 0.5134371430289184, "flos": 15523537219200.0, "grad_norm": 2.1188305272805983, "language_loss": 0.64121878, "learning_rate": 2.0101265881453605e-06, "loss": 0.66270638, "num_input_tokens_seen": 92189945, "step": 4270, "time_per_iteration": 2.7287752628326416 }, { "auxiliary_loss_clip": 0.01148988, "auxiliary_loss_mlp": 0.01026841, "balance_loss_clip": 1.05000579, "balance_loss_mlp": 1.01932538, "epoch": 0.5135573859195575, "flos": 21433786911360.0, "grad_norm": 2.0910050434895533, "language_loss": 0.78196585, "learning_rate": 2.009347625734941e-06, "loss": 0.80372405, "num_input_tokens_seen": 92209855, "step": 4271, "time_per_iteration": 2.691169023513794 }, { "auxiliary_loss_clip": 0.01190299, "auxiliary_loss_mlp": 0.0103072, "balance_loss_clip": 1.05705631, "balance_loss_mlp": 1.02208352, "epoch": 0.5136776288101966, "flos": 17712651600000.0, "grad_norm": 2.645288233279411, "language_loss": 0.75510395, "learning_rate": 2.0085686619064954e-06, "loss": 0.77731407, "num_input_tokens_seen": 92226295, "step": 4272, "time_per_iteration": 2.5454154014587402 }, { "auxiliary_loss_clip": 0.01176812, "auxiliary_loss_mlp": 0.01037963, "balance_loss_clip": 1.05335832, "balance_loss_mlp": 1.02966011, "epoch": 0.5137978717008357, "flos": 16581680997120.0, "grad_norm": 2.083646639310859, "language_loss": 0.82764393, "learning_rate": 2.00778969677819e-06, "loss": 0.84979171, "num_input_tokens_seen": 92243330, "step": 4273, "time_per_iteration": 2.616377115249634 }, { "auxiliary_loss_clip": 0.01154381, "auxiliary_loss_mlp": 0.0102921, "balance_loss_clip": 1.04875863, "balance_loss_mlp": 1.0208658, "epoch": 0.5139181145914747, "flos": 20668243322880.0, "grad_norm": 1.7736662017803648, "language_loss": 0.63900268, "learning_rate": 2.0070107304681934e-06, "loss": 0.66083866, "num_input_tokens_seen": 92262285, "step": 4274, "time_per_iteration": 2.67352294921875 }, { "auxiliary_loss_clip": 0.0114056, "auxiliary_loss_mlp": 0.01024389, "balance_loss_clip": 1.04943752, "balance_loss_mlp": 1.017308, "epoch": 0.5140383574821139, "flos": 32926996546560.0, "grad_norm": 2.4687008746462564, "language_loss": 0.78343248, "learning_rate": 2.006231763094675e-06, "loss": 0.80508196, "num_input_tokens_seen": 92283305, "step": 4275, "time_per_iteration": 3.7122576236724854 }, { "auxiliary_loss_clip": 0.01155313, "auxiliary_loss_mlp": 0.01034527, "balance_loss_clip": 1.05520844, "balance_loss_mlp": 1.02665353, "epoch": 0.514158600372753, "flos": 19537093152000.0, "grad_norm": 2.3229218222063026, "language_loss": 0.88187015, "learning_rate": 2.0054527947758027e-06, "loss": 0.90376854, "num_input_tokens_seen": 92302105, "step": 4276, "time_per_iteration": 3.649789333343506 }, { "auxiliary_loss_clip": 0.01065533, "auxiliary_loss_mlp": 0.01002036, "balance_loss_clip": 1.01321971, "balance_loss_mlp": 1.00058126, "epoch": 0.514278843263392, "flos": 62523855279360.0, "grad_norm": 0.7242175184846665, "language_loss": 0.56000376, "learning_rate": 2.004673825629746e-06, "loss": 0.58067942, "num_input_tokens_seen": 92362885, "step": 4277, "time_per_iteration": 3.2330658435821533 }, { "auxiliary_loss_clip": 0.01148773, "auxiliary_loss_mlp": 0.01027744, "balance_loss_clip": 1.04590261, "balance_loss_mlp": 1.01944089, "epoch": 0.5143990861540312, "flos": 25882328545920.0, "grad_norm": 2.2620290086841344, "language_loss": 0.72838175, "learning_rate": 2.0038948557746744e-06, "loss": 0.75014687, "num_input_tokens_seen": 92384740, "step": 4278, "time_per_iteration": 2.771160364151001 }, { "auxiliary_loss_clip": 0.01166885, "auxiliary_loss_mlp": 0.01025159, "balance_loss_clip": 1.05053377, "balance_loss_mlp": 1.01723766, "epoch": 0.5145193290446702, "flos": 23330660238720.0, "grad_norm": 1.6727543139773986, "language_loss": 0.75154901, "learning_rate": 2.0031158853287558e-06, "loss": 0.77346945, "num_input_tokens_seen": 92405175, "step": 4279, "time_per_iteration": 2.7449469566345215 }, { "auxiliary_loss_clip": 0.01153678, "auxiliary_loss_mlp": 0.0102767, "balance_loss_clip": 1.05079222, "balance_loss_mlp": 1.01920605, "epoch": 0.5146395719353093, "flos": 22856603518080.0, "grad_norm": 2.4776488641643275, "language_loss": 0.70457232, "learning_rate": 2.0023369144101593e-06, "loss": 0.72638583, "num_input_tokens_seen": 92423345, "step": 4280, "time_per_iteration": 3.576746940612793 }, { "auxiliary_loss_clip": 0.01145833, "auxiliary_loss_mlp": 0.01027868, "balance_loss_clip": 1.04739356, "balance_loss_mlp": 1.02017856, "epoch": 0.5147598148259485, "flos": 26391577616640.0, "grad_norm": 2.057315456394534, "language_loss": 0.76820922, "learning_rate": 2.0015579431370555e-06, "loss": 0.78994626, "num_input_tokens_seen": 92445025, "step": 4281, "time_per_iteration": 2.7272138595581055 }, { "auxiliary_loss_clip": 0.01166879, "auxiliary_loss_mlp": 0.0102721, "balance_loss_clip": 1.05113935, "balance_loss_mlp": 1.01953876, "epoch": 0.5148800577165875, "flos": 29965694561280.0, "grad_norm": 2.380719225964806, "language_loss": 0.69996184, "learning_rate": 2.000778971627612e-06, "loss": 0.72190273, "num_input_tokens_seen": 92464490, "step": 4282, "time_per_iteration": 2.7115352153778076 }, { "auxiliary_loss_clip": 0.01149994, "auxiliary_loss_mlp": 0.01030496, "balance_loss_clip": 1.04831004, "balance_loss_mlp": 1.02262843, "epoch": 0.5150003006072266, "flos": 17931383470080.0, "grad_norm": 2.009446677089578, "language_loss": 0.89985871, "learning_rate": 2e-06, "loss": 0.92166364, "num_input_tokens_seen": 92482085, "step": 4283, "time_per_iteration": 2.6880173683166504 }, { "auxiliary_loss_clip": 0.01183246, "auxiliary_loss_mlp": 0.01029534, "balance_loss_clip": 1.05305302, "balance_loss_mlp": 1.02146983, "epoch": 0.5151205434978657, "flos": 18478733892480.0, "grad_norm": 1.9601885009184798, "language_loss": 0.85772777, "learning_rate": 1.9992210283723878e-06, "loss": 0.87985563, "num_input_tokens_seen": 92499325, "step": 4284, "time_per_iteration": 2.6190545558929443 }, { "auxiliary_loss_clip": 0.01182792, "auxiliary_loss_mlp": 0.01025867, "balance_loss_clip": 1.05268478, "balance_loss_mlp": 1.01837492, "epoch": 0.5152407863885048, "flos": 25341263003520.0, "grad_norm": 1.5385854226168054, "language_loss": 0.79308045, "learning_rate": 1.9984420568629448e-06, "loss": 0.81516701, "num_input_tokens_seen": 92522090, "step": 4285, "time_per_iteration": 2.717719793319702 }, { "auxiliary_loss_clip": 0.01171392, "auxiliary_loss_mlp": 0.01024383, "balance_loss_clip": 1.05194235, "balance_loss_mlp": 1.01690233, "epoch": 0.5153610292791438, "flos": 18329740277760.0, "grad_norm": 2.3473583069748356, "language_loss": 0.78541231, "learning_rate": 1.9976630855898405e-06, "loss": 0.80737001, "num_input_tokens_seen": 92539845, "step": 4286, "time_per_iteration": 2.619304656982422 }, { "auxiliary_loss_clip": 0.01148474, "auxiliary_loss_mlp": 0.01020987, "balance_loss_clip": 1.04450536, "balance_loss_mlp": 1.0143261, "epoch": 0.515481272169783, "flos": 30409945971840.0, "grad_norm": 2.2663077152240882, "language_loss": 0.74394208, "learning_rate": 1.9968841146712445e-06, "loss": 0.76563668, "num_input_tokens_seen": 92559460, "step": 4287, "time_per_iteration": 2.774848222732544 }, { "auxiliary_loss_clip": 0.01108183, "auxiliary_loss_mlp": 0.00762868, "balance_loss_clip": 1.04182935, "balance_loss_mlp": 1.00033283, "epoch": 0.5156015150604221, "flos": 23037305863680.0, "grad_norm": 1.693699981919337, "language_loss": 0.71627283, "learning_rate": 1.996105144225326e-06, "loss": 0.73498338, "num_input_tokens_seen": 92579695, "step": 4288, "time_per_iteration": 2.7832891941070557 }, { "auxiliary_loss_clip": 0.01166559, "auxiliary_loss_mlp": 0.01028194, "balance_loss_clip": 1.05139542, "balance_loss_mlp": 1.02072573, "epoch": 0.5157217579510611, "flos": 17858556645120.0, "grad_norm": 2.203241670002149, "language_loss": 0.78854704, "learning_rate": 1.995326174370254e-06, "loss": 0.8104946, "num_input_tokens_seen": 92598795, "step": 4289, "time_per_iteration": 2.73026180267334 }, { "auxiliary_loss_clip": 0.01165488, "auxiliary_loss_mlp": 0.00761947, "balance_loss_clip": 1.04841137, "balance_loss_mlp": 1.00019884, "epoch": 0.5158420008417003, "flos": 19171486569600.0, "grad_norm": 1.4380366010003816, "language_loss": 0.7294836, "learning_rate": 1.994547205224197e-06, "loss": 0.74875802, "num_input_tokens_seen": 92617700, "step": 4290, "time_per_iteration": 2.7143137454986572 }, { "auxiliary_loss_clip": 0.01150446, "auxiliary_loss_mlp": 0.01023819, "balance_loss_clip": 1.05000687, "balance_loss_mlp": 1.0162822, "epoch": 0.5159622437323393, "flos": 22419534827520.0, "grad_norm": 2.071379412065401, "language_loss": 0.67949325, "learning_rate": 1.993768236905325e-06, "loss": 0.70123589, "num_input_tokens_seen": 92638370, "step": 4291, "time_per_iteration": 2.7979423999786377 }, { "auxiliary_loss_clip": 0.01149913, "auxiliary_loss_mlp": 0.01030548, "balance_loss_clip": 1.04781842, "balance_loss_mlp": 1.02216792, "epoch": 0.5160824866229784, "flos": 24603010773120.0, "grad_norm": 3.8264167145017, "language_loss": 0.65815902, "learning_rate": 1.992989269531807e-06, "loss": 0.67996365, "num_input_tokens_seen": 92657180, "step": 4292, "time_per_iteration": 2.765042781829834 }, { "auxiliary_loss_clip": 0.01151664, "auxiliary_loss_mlp": 0.01022094, "balance_loss_clip": 1.04682732, "balance_loss_mlp": 1.01485193, "epoch": 0.5162027295136175, "flos": 18002737837440.0, "grad_norm": 3.3694371301829267, "language_loss": 0.6806519, "learning_rate": 1.99221030322181e-06, "loss": 0.70238948, "num_input_tokens_seen": 92673985, "step": 4293, "time_per_iteration": 2.665212392807007 }, { "auxiliary_loss_clip": 0.01157002, "auxiliary_loss_mlp": 0.01031287, "balance_loss_clip": 1.04738617, "balance_loss_mlp": 1.0238601, "epoch": 0.5163229724042566, "flos": 27344611221120.0, "grad_norm": 1.8073466057013952, "language_loss": 0.81111461, "learning_rate": 1.991431338093505e-06, "loss": 0.83299744, "num_input_tokens_seen": 92696340, "step": 4294, "time_per_iteration": 2.8028039932250977 }, { "auxiliary_loss_clip": 0.01154074, "auxiliary_loss_mlp": 0.01025265, "balance_loss_clip": 1.05084038, "balance_loss_mlp": 1.01848173, "epoch": 0.5164432152948957, "flos": 21762764599680.0, "grad_norm": 1.9014689419192339, "language_loss": 0.79511631, "learning_rate": 1.9906523742650587e-06, "loss": 0.81690967, "num_input_tokens_seen": 92715200, "step": 4295, "time_per_iteration": 2.685328245162964 }, { "auxiliary_loss_clip": 0.01183045, "auxiliary_loss_mlp": 0.01027933, "balance_loss_clip": 1.0509671, "balance_loss_mlp": 1.01949334, "epoch": 0.5165634581855347, "flos": 25550334115200.0, "grad_norm": 1.9879206144633594, "language_loss": 0.77237082, "learning_rate": 1.9898734118546397e-06, "loss": 0.79448056, "num_input_tokens_seen": 92735150, "step": 4296, "time_per_iteration": 2.6982617378234863 }, { "auxiliary_loss_clip": 0.01103422, "auxiliary_loss_mlp": 0.01029612, "balance_loss_clip": 1.04485273, "balance_loss_mlp": 1.0215776, "epoch": 0.5166837010761739, "flos": 19901191363200.0, "grad_norm": 1.530887177881446, "language_loss": 0.80446017, "learning_rate": 1.989094450980416e-06, "loss": 0.82579052, "num_input_tokens_seen": 92755250, "step": 4297, "time_per_iteration": 2.922832489013672 }, { "auxiliary_loss_clip": 0.01166279, "auxiliary_loss_mlp": 0.01021704, "balance_loss_clip": 1.04854882, "balance_loss_mlp": 1.01428354, "epoch": 0.516803943966813, "flos": 26646076454400.0, "grad_norm": 2.1565564190771274, "language_loss": 0.77001357, "learning_rate": 1.9883154917605556e-06, "loss": 0.79189342, "num_input_tokens_seen": 92774460, "step": 4298, "time_per_iteration": 3.5020172595977783 }, { "auxiliary_loss_clip": 0.01180732, "auxiliary_loss_mlp": 0.01031174, "balance_loss_clip": 1.05046129, "balance_loss_mlp": 1.02362549, "epoch": 0.516924186857452, "flos": 19682854542720.0, "grad_norm": 1.747453116700484, "language_loss": 0.83398557, "learning_rate": 1.9875365343132262e-06, "loss": 0.85610467, "num_input_tokens_seen": 92791580, "step": 4299, "time_per_iteration": 2.6476144790649414 }, { "auxiliary_loss_clip": 0.01169509, "auxiliary_loss_mlp": 0.00762452, "balance_loss_clip": 1.05179965, "balance_loss_mlp": 1.00024676, "epoch": 0.5170444297480912, "flos": 15956583586560.0, "grad_norm": 2.210165327965464, "language_loss": 0.8466984, "learning_rate": 1.9867575787565946e-06, "loss": 0.86601806, "num_input_tokens_seen": 92806240, "step": 4300, "time_per_iteration": 2.61966609954834 }, { "auxiliary_loss_clip": 0.01169294, "auxiliary_loss_mlp": 0.01035855, "balance_loss_clip": 1.05197835, "balance_loss_mlp": 1.02693784, "epoch": 0.5171646726387302, "flos": 14174157968640.0, "grad_norm": 2.2497316143525334, "language_loss": 0.8627699, "learning_rate": 1.9859786252088275e-06, "loss": 0.88482141, "num_input_tokens_seen": 92823420, "step": 4301, "time_per_iteration": 2.6327226161956787 }, { "auxiliary_loss_clip": 0.01145082, "auxiliary_loss_mlp": 0.01025552, "balance_loss_clip": 1.04860485, "balance_loss_mlp": 1.01705849, "epoch": 0.5172849155293693, "flos": 23578550974080.0, "grad_norm": 4.80721895216572, "language_loss": 0.66851074, "learning_rate": 1.9851996737880914e-06, "loss": 0.69021714, "num_input_tokens_seen": 92838605, "step": 4302, "time_per_iteration": 4.677529573440552 }, { "auxiliary_loss_clip": 0.01175581, "auxiliary_loss_mlp": 0.01031469, "balance_loss_clip": 1.05193019, "balance_loss_mlp": 1.02302265, "epoch": 0.5174051584200084, "flos": 14283541860480.0, "grad_norm": 2.764068710313437, "language_loss": 0.74488968, "learning_rate": 1.9844207246125537e-06, "loss": 0.76696014, "num_input_tokens_seen": 92855185, "step": 4303, "time_per_iteration": 2.5673739910125732 }, { "auxiliary_loss_clip": 0.01151678, "auxiliary_loss_mlp": 0.01029568, "balance_loss_clip": 1.04883265, "balance_loss_mlp": 1.02193236, "epoch": 0.5175254013106475, "flos": 37889384192640.0, "grad_norm": 1.7025420924983574, "language_loss": 0.68280017, "learning_rate": 1.983641777800379e-06, "loss": 0.70461267, "num_input_tokens_seen": 92877830, "step": 4304, "time_per_iteration": 2.850768804550171 }, { "auxiliary_loss_clip": 0.01058774, "auxiliary_loss_mlp": 0.01003957, "balance_loss_clip": 1.01201105, "balance_loss_mlp": 1.00253868, "epoch": 0.5176456442012866, "flos": 68549737829760.0, "grad_norm": 0.746463696035837, "language_loss": 0.5885644, "learning_rate": 1.9828628334697343e-06, "loss": 0.60919178, "num_input_tokens_seen": 92945040, "step": 4305, "time_per_iteration": 3.43418288230896 }, { "auxiliary_loss_clip": 0.01058337, "auxiliary_loss_mlp": 0.01002276, "balance_loss_clip": 1.01145315, "balance_loss_mlp": 1.00082743, "epoch": 0.5177658870919257, "flos": 64084137235200.0, "grad_norm": 0.7573182130819075, "language_loss": 0.54652345, "learning_rate": 1.982083891738784e-06, "loss": 0.56712961, "num_input_tokens_seen": 93005910, "step": 4306, "time_per_iteration": 4.546627759933472 }, { "auxiliary_loss_clip": 0.01149406, "auxiliary_loss_mlp": 0.01025831, "balance_loss_clip": 1.05183983, "balance_loss_mlp": 1.01866651, "epoch": 0.5178861299825648, "flos": 26651248012800.0, "grad_norm": 1.4791126017477254, "language_loss": 0.82854164, "learning_rate": 1.9813049527256923e-06, "loss": 0.85029399, "num_input_tokens_seen": 93026305, "step": 4307, "time_per_iteration": 2.794661283493042 }, { "auxiliary_loss_clip": 0.01138295, "auxiliary_loss_mlp": 0.01030477, "balance_loss_clip": 1.04468429, "balance_loss_mlp": 1.02276444, "epoch": 0.5180063728732038, "flos": 17931886260480.0, "grad_norm": 2.8471841404472817, "language_loss": 0.82032025, "learning_rate": 1.9805260165486252e-06, "loss": 0.84200799, "num_input_tokens_seen": 93045675, "step": 4308, "time_per_iteration": 2.731330633163452 }, { "auxiliary_loss_clip": 0.01170605, "auxiliary_loss_mlp": 0.01029754, "balance_loss_clip": 1.05362296, "balance_loss_mlp": 1.02166533, "epoch": 0.518126615763843, "flos": 19500895221120.0, "grad_norm": 2.05320601974293, "language_loss": 0.86155128, "learning_rate": 1.9797470833257457e-06, "loss": 0.88355482, "num_input_tokens_seen": 93065375, "step": 4309, "time_per_iteration": 2.7096712589263916 }, { "auxiliary_loss_clip": 0.01170917, "auxiliary_loss_mlp": 0.01032238, "balance_loss_clip": 1.05297089, "balance_loss_mlp": 1.0241611, "epoch": 0.5182468586544821, "flos": 20704082117760.0, "grad_norm": 2.015952670555153, "language_loss": 0.77566314, "learning_rate": 1.9789681531752177e-06, "loss": 0.79769468, "num_input_tokens_seen": 93085595, "step": 4310, "time_per_iteration": 2.6832869052886963 }, { "auxiliary_loss_clip": 0.01124798, "auxiliary_loss_mlp": 0.01029112, "balance_loss_clip": 1.0490123, "balance_loss_mlp": 1.02203107, "epoch": 0.5183671015451211, "flos": 23112107936640.0, "grad_norm": 1.7438015801676086, "language_loss": 0.72280329, "learning_rate": 1.978189226215204e-06, "loss": 0.74434245, "num_input_tokens_seen": 93106140, "step": 4311, "time_per_iteration": 2.7803261280059814 }, { "auxiliary_loss_clip": 0.01182343, "auxiliary_loss_mlp": 0.01031963, "balance_loss_clip": 1.05242205, "balance_loss_mlp": 1.02360058, "epoch": 0.5184873444357603, "flos": 17597090568960.0, "grad_norm": 1.709753032897899, "language_loss": 0.77218163, "learning_rate": 1.9774103025638675e-06, "loss": 0.79432464, "num_input_tokens_seen": 93124265, "step": 4312, "time_per_iteration": 2.6822755336761475 }, { "auxiliary_loss_clip": 0.01128756, "auxiliary_loss_mlp": 0.01030048, "balance_loss_clip": 1.05034888, "balance_loss_mlp": 1.02238917, "epoch": 0.5186075873263993, "flos": 24936800883840.0, "grad_norm": 1.8179613403278088, "language_loss": 0.76427615, "learning_rate": 1.9766313823393696e-06, "loss": 0.78586423, "num_input_tokens_seen": 93145130, "step": 4313, "time_per_iteration": 2.8908841609954834 }, { "auxiliary_loss_clip": 0.01117876, "auxiliary_loss_mlp": 0.0103459, "balance_loss_clip": 1.04202104, "balance_loss_mlp": 1.02634096, "epoch": 0.5187278302170384, "flos": 15190106244480.0, "grad_norm": 2.4885987604226325, "language_loss": 0.6930393, "learning_rate": 1.975852465659873e-06, "loss": 0.71456397, "num_input_tokens_seen": 93161110, "step": 4314, "time_per_iteration": 2.889552116394043 }, { "auxiliary_loss_clip": 0.01171871, "auxiliary_loss_mlp": 0.0103556, "balance_loss_clip": 1.05289745, "balance_loss_mlp": 1.02708375, "epoch": 0.5188480731076776, "flos": 25009412227200.0, "grad_norm": 2.20299208676975, "language_loss": 0.69978368, "learning_rate": 1.9750735526435377e-06, "loss": 0.72185802, "num_input_tokens_seen": 93178055, "step": 4315, "time_per_iteration": 2.708498477935791 }, { "auxiliary_loss_clip": 0.01154784, "auxiliary_loss_mlp": 0.01030183, "balance_loss_clip": 1.05182028, "balance_loss_mlp": 1.02254725, "epoch": 0.5189683159983166, "flos": 24790141653120.0, "grad_norm": 3.0159061290431795, "language_loss": 0.79643381, "learning_rate": 1.974294643408525e-06, "loss": 0.81828344, "num_input_tokens_seen": 93195850, "step": 4316, "time_per_iteration": 2.737149715423584 }, { "auxiliary_loss_clip": 0.01173919, "auxiliary_loss_mlp": 0.01025812, "balance_loss_clip": 1.05118871, "balance_loss_mlp": 1.01836109, "epoch": 0.5190885588889557, "flos": 24754266944640.0, "grad_norm": 3.3139351997769997, "language_loss": 0.67140168, "learning_rate": 1.9735157380729947e-06, "loss": 0.69339895, "num_input_tokens_seen": 93216260, "step": 4317, "time_per_iteration": 2.722264289855957 }, { "auxiliary_loss_clip": 0.0115436, "auxiliary_loss_mlp": 0.01024659, "balance_loss_clip": 1.04831576, "balance_loss_mlp": 1.01718163, "epoch": 0.5192088017795948, "flos": 24712646060160.0, "grad_norm": 1.8461758199598859, "language_loss": 0.84131515, "learning_rate": 1.9727368367551053e-06, "loss": 0.86310536, "num_input_tokens_seen": 93234810, "step": 4318, "time_per_iteration": 2.7310335636138916 }, { "auxiliary_loss_clip": 0.01143637, "auxiliary_loss_mlp": 0.01030053, "balance_loss_clip": 1.04699135, "balance_loss_mlp": 1.02148199, "epoch": 0.5193290446702339, "flos": 27229588894080.0, "grad_norm": 1.8255274639666053, "language_loss": 0.68338549, "learning_rate": 1.9719579395730164e-06, "loss": 0.70512241, "num_input_tokens_seen": 93254185, "step": 4319, "time_per_iteration": 2.806253433227539 }, { "auxiliary_loss_clip": 0.0118726, "auxiliary_loss_mlp": 0.01029105, "balance_loss_clip": 1.05721426, "balance_loss_mlp": 1.02136874, "epoch": 0.5194492875608729, "flos": 11473352392320.0, "grad_norm": 2.7463984416623513, "language_loss": 0.93919402, "learning_rate": 1.9711790466448854e-06, "loss": 0.96135759, "num_input_tokens_seen": 93268205, "step": 4320, "time_per_iteration": 2.645686626434326 }, { "auxiliary_loss_clip": 0.0113296, "auxiliary_loss_mlp": 0.01031342, "balance_loss_clip": 1.04853153, "balance_loss_mlp": 1.02287817, "epoch": 0.5195695304515121, "flos": 20338906498560.0, "grad_norm": 2.5002726231466137, "language_loss": 0.71550894, "learning_rate": 1.9704001580888704e-06, "loss": 0.73715186, "num_input_tokens_seen": 93286945, "step": 4321, "time_per_iteration": 2.86267352104187 }, { "auxiliary_loss_clip": 0.01147939, "auxiliary_loss_mlp": 0.0076269, "balance_loss_clip": 1.0453949, "balance_loss_mlp": 1.00021625, "epoch": 0.5196897733421512, "flos": 20048317470720.0, "grad_norm": 2.111072251528371, "language_loss": 0.87004125, "learning_rate": 1.9696212740231283e-06, "loss": 0.88914758, "num_input_tokens_seen": 93305595, "step": 4322, "time_per_iteration": 2.6753649711608887 }, { "auxiliary_loss_clip": 0.01175361, "auxiliary_loss_mlp": 0.01029223, "balance_loss_clip": 1.05108881, "balance_loss_mlp": 1.02101552, "epoch": 0.5198100162327902, "flos": 23805507058560.0, "grad_norm": 2.345426863982677, "language_loss": 0.82136631, "learning_rate": 1.9688423945658146e-06, "loss": 0.84341216, "num_input_tokens_seen": 93326460, "step": 4323, "time_per_iteration": 2.720966100692749 }, { "auxiliary_loss_clip": 0.01112175, "auxiliary_loss_mlp": 0.01033011, "balance_loss_clip": 1.03844762, "balance_loss_mlp": 1.02505374, "epoch": 0.5199302591234293, "flos": 24023951619840.0, "grad_norm": 2.3225183175107422, "language_loss": 0.71969616, "learning_rate": 1.9680635198350845e-06, "loss": 0.74114799, "num_input_tokens_seen": 93346170, "step": 4324, "time_per_iteration": 2.7868387699127197 }, { "auxiliary_loss_clip": 0.01170591, "auxiliary_loss_mlp": 0.01028561, "balance_loss_clip": 1.04966974, "balance_loss_mlp": 1.01951909, "epoch": 0.5200505020140684, "flos": 26359366095360.0, "grad_norm": 5.012314962184714, "language_loss": 0.72621071, "learning_rate": 1.967284649949093e-06, "loss": 0.7482022, "num_input_tokens_seen": 93365380, "step": 4325, "time_per_iteration": 2.737074136734009 }, { "auxiliary_loss_clip": 0.01136378, "auxiliary_loss_mlp": 0.01033084, "balance_loss_clip": 1.04363465, "balance_loss_mlp": 1.02503157, "epoch": 0.5201707449047075, "flos": 39604262284800.0, "grad_norm": 1.7904806334505594, "language_loss": 0.72611487, "learning_rate": 1.966505785025994e-06, "loss": 0.74780953, "num_input_tokens_seen": 93387285, "step": 4326, "time_per_iteration": 2.907452344894409 }, { "auxiliary_loss_clip": 0.01143885, "auxiliary_loss_mlp": 0.01032754, "balance_loss_clip": 1.04999483, "balance_loss_mlp": 1.02521396, "epoch": 0.5202909877953465, "flos": 53682788292480.0, "grad_norm": 1.8037083575201058, "language_loss": 0.76270509, "learning_rate": 1.965726925183941e-06, "loss": 0.78447151, "num_input_tokens_seen": 93410390, "step": 4327, "time_per_iteration": 3.934385299682617 }, { "auxiliary_loss_clip": 0.0118113, "auxiliary_loss_mlp": 0.0102987, "balance_loss_clip": 1.05100763, "balance_loss_mlp": 1.02183557, "epoch": 0.5204112306859857, "flos": 19537021324800.0, "grad_norm": 1.8865015122829725, "language_loss": 0.84989059, "learning_rate": 1.964948070541087e-06, "loss": 0.87200063, "num_input_tokens_seen": 93429050, "step": 4328, "time_per_iteration": 3.580063581466675 }, { "auxiliary_loss_clip": 0.01158431, "auxiliary_loss_mlp": 0.01029003, "balance_loss_clip": 1.04705024, "balance_loss_mlp": 1.02090263, "epoch": 0.5205314735766248, "flos": 15304697608320.0, "grad_norm": 2.3891939207417168, "language_loss": 0.69450378, "learning_rate": 1.9641692212155816e-06, "loss": 0.71637809, "num_input_tokens_seen": 93446815, "step": 4329, "time_per_iteration": 3.598076343536377 }, { "auxiliary_loss_clip": 0.01125171, "auxiliary_loss_mlp": 0.01029732, "balance_loss_clip": 1.04906058, "balance_loss_mlp": 1.02135468, "epoch": 0.5206517164672638, "flos": 59263701160320.0, "grad_norm": 2.1421411487096584, "language_loss": 0.72495884, "learning_rate": 1.9633903773255777e-06, "loss": 0.74650794, "num_input_tokens_seen": 93469130, "step": 4330, "time_per_iteration": 3.026902914047241 }, { "auxiliary_loss_clip": 0.01182787, "auxiliary_loss_mlp": 0.01030022, "balance_loss_clip": 1.05185056, "balance_loss_mlp": 1.02189744, "epoch": 0.520771959357903, "flos": 26871129118080.0, "grad_norm": 1.6255477718591331, "language_loss": 0.74670005, "learning_rate": 1.9626115389892237e-06, "loss": 0.76882815, "num_input_tokens_seen": 93489920, "step": 4331, "time_per_iteration": 2.6687910556793213 }, { "auxiliary_loss_clip": 0.01150195, "auxiliary_loss_mlp": 0.01035521, "balance_loss_clip": 1.05098319, "balance_loss_mlp": 1.02706659, "epoch": 0.520892202248542, "flos": 26907075653760.0, "grad_norm": 2.0392066531748396, "language_loss": 0.85562259, "learning_rate": 1.96183270632467e-06, "loss": 0.87747973, "num_input_tokens_seen": 93509770, "step": 4332, "time_per_iteration": 3.648815870285034 }, { "auxiliary_loss_clip": 0.01137319, "auxiliary_loss_mlp": 0.00763466, "balance_loss_clip": 1.0477457, "balance_loss_mlp": 1.00021482, "epoch": 0.5210124451391811, "flos": 25849434666240.0, "grad_norm": 1.6902652007666814, "language_loss": 0.79060745, "learning_rate": 1.9610538794500644e-06, "loss": 0.80961531, "num_input_tokens_seen": 93529320, "step": 4333, "time_per_iteration": 2.8012449741363525 }, { "auxiliary_loss_clip": 0.01046199, "auxiliary_loss_mlp": 0.01005182, "balance_loss_clip": 1.01138663, "balance_loss_mlp": 1.00375772, "epoch": 0.5211326880298203, "flos": 70553804319360.0, "grad_norm": 0.7742296792610236, "language_loss": 0.59436417, "learning_rate": 1.9602750584835542e-06, "loss": 0.614878, "num_input_tokens_seen": 93595255, "step": 4334, "time_per_iteration": 3.3547070026397705 }, { "auxiliary_loss_clip": 0.01150987, "auxiliary_loss_mlp": 0.01025737, "balance_loss_clip": 1.04618144, "balance_loss_mlp": 1.01836741, "epoch": 0.5212529309204593, "flos": 15628898787840.0, "grad_norm": 2.062345338681083, "language_loss": 0.83150995, "learning_rate": 1.959496243543286e-06, "loss": 0.85327721, "num_input_tokens_seen": 93613135, "step": 4335, "time_per_iteration": 2.639859676361084 }, { "auxiliary_loss_clip": 0.01174668, "auxiliary_loss_mlp": 0.01024683, "balance_loss_clip": 1.05596185, "balance_loss_mlp": 1.01701784, "epoch": 0.5213731738110984, "flos": 26242655829120.0, "grad_norm": 2.017832654779342, "language_loss": 0.79522741, "learning_rate": 1.9587174347474057e-06, "loss": 0.81722093, "num_input_tokens_seen": 93629645, "step": 4336, "time_per_iteration": 2.6603150367736816 }, { "auxiliary_loss_clip": 0.01109566, "auxiliary_loss_mlp": 0.01027815, "balance_loss_clip": 1.04176474, "balance_loss_mlp": 1.0190767, "epoch": 0.5214934167017375, "flos": 19418407637760.0, "grad_norm": 2.9674071410988767, "language_loss": 0.82009912, "learning_rate": 1.9579386322140574e-06, "loss": 0.84147292, "num_input_tokens_seen": 93645325, "step": 4337, "time_per_iteration": 2.690481424331665 }, { "auxiliary_loss_clip": 0.01187917, "auxiliary_loss_mlp": 0.00763831, "balance_loss_clip": 1.05446184, "balance_loss_mlp": 1.00034642, "epoch": 0.5216136595923766, "flos": 30955788023040.0, "grad_norm": 2.0843467730815646, "language_loss": 0.80960405, "learning_rate": 1.9571598360613854e-06, "loss": 0.82912159, "num_input_tokens_seen": 93668200, "step": 4338, "time_per_iteration": 2.7676608562469482 }, { "auxiliary_loss_clip": 0.01139934, "auxiliary_loss_mlp": 0.01028649, "balance_loss_clip": 1.04450655, "balance_loss_mlp": 1.02111459, "epoch": 0.5217339024830157, "flos": 21945047143680.0, "grad_norm": 5.202327789918528, "language_loss": 0.70378643, "learning_rate": 1.956381046407532e-06, "loss": 0.72547221, "num_input_tokens_seen": 93688495, "step": 4339, "time_per_iteration": 2.6835906505584717 }, { "auxiliary_loss_clip": 0.01138393, "auxiliary_loss_mlp": 0.01033857, "balance_loss_clip": 1.04643941, "balance_loss_mlp": 1.02585268, "epoch": 0.5218541453736548, "flos": 20923209037440.0, "grad_norm": 2.8551816841337168, "language_loss": 0.85940135, "learning_rate": 1.9556022633706394e-06, "loss": 0.8811239, "num_input_tokens_seen": 93707285, "step": 4340, "time_per_iteration": 2.737086296081543 }, { "auxiliary_loss_clip": 0.01147128, "auxiliary_loss_mlp": 0.01026259, "balance_loss_clip": 1.04916131, "balance_loss_mlp": 1.01785469, "epoch": 0.5219743882642939, "flos": 23951663498880.0, "grad_norm": 1.633871796446336, "language_loss": 0.7963289, "learning_rate": 1.954823487068848e-06, "loss": 0.81806278, "num_input_tokens_seen": 93727495, "step": 4341, "time_per_iteration": 2.6377642154693604 }, { "auxiliary_loss_clip": 0.0116859, "auxiliary_loss_mlp": 0.01028326, "balance_loss_clip": 1.05377078, "balance_loss_mlp": 1.02033854, "epoch": 0.5220946311549329, "flos": 28799280213120.0, "grad_norm": 1.630684455458344, "language_loss": 0.81293112, "learning_rate": 1.9540447176202976e-06, "loss": 0.8349002, "num_input_tokens_seen": 93748740, "step": 4342, "time_per_iteration": 2.721656560897827 }, { "auxiliary_loss_clip": 0.01065041, "auxiliary_loss_mlp": 0.01002506, "balance_loss_clip": 1.01378536, "balance_loss_mlp": 1.0010041, "epoch": 0.5222148740455721, "flos": 67189369017600.0, "grad_norm": 0.8791994028653938, "language_loss": 0.60699868, "learning_rate": 1.9532659551431272e-06, "loss": 0.62767422, "num_input_tokens_seen": 93815770, "step": 4343, "time_per_iteration": 3.38393497467041 }, { "auxiliary_loss_clip": 0.01172157, "auxiliary_loss_mlp": 0.01027308, "balance_loss_clip": 1.05186749, "balance_loss_mlp": 1.01953614, "epoch": 0.5223351169362112, "flos": 61856164339200.0, "grad_norm": 1.6187867247611154, "language_loss": 0.67715788, "learning_rate": 1.9524871997554744e-06, "loss": 0.69915253, "num_input_tokens_seen": 93843530, "step": 4344, "time_per_iteration": 3.065627098083496 }, { "auxiliary_loss_clip": 0.01169458, "auxiliary_loss_mlp": 0.01026217, "balance_loss_clip": 1.05166841, "balance_loss_mlp": 1.01815844, "epoch": 0.5224553598268502, "flos": 14647388676480.0, "grad_norm": 2.053701062657634, "language_loss": 0.80906737, "learning_rate": 1.951708451575475e-06, "loss": 0.83102417, "num_input_tokens_seen": 93860595, "step": 4345, "time_per_iteration": 2.6549623012542725 }, { "auxiliary_loss_clip": 0.011488, "auxiliary_loss_mlp": 0.01028488, "balance_loss_clip": 1.04643333, "balance_loss_mlp": 1.02028012, "epoch": 0.5225756027174894, "flos": 14826043946880.0, "grad_norm": 2.5589274779245352, "language_loss": 0.82239318, "learning_rate": 1.9509297107212657e-06, "loss": 0.84416604, "num_input_tokens_seen": 93877365, "step": 4346, "time_per_iteration": 2.650106430053711 }, { "auxiliary_loss_clip": 0.01182573, "auxiliary_loss_mlp": 0.01028503, "balance_loss_clip": 1.05226314, "balance_loss_mlp": 1.02062953, "epoch": 0.5226958456081284, "flos": 23512009029120.0, "grad_norm": 1.5930931813832145, "language_loss": 0.7935974, "learning_rate": 1.95015097731098e-06, "loss": 0.81570816, "num_input_tokens_seen": 93896855, "step": 4347, "time_per_iteration": 2.634014844894409 }, { "auxiliary_loss_clip": 0.01185438, "auxiliary_loss_mlp": 0.01028103, "balance_loss_clip": 1.0540669, "balance_loss_mlp": 1.01976991, "epoch": 0.5228160884987675, "flos": 19062928690560.0, "grad_norm": 2.038168117958015, "language_loss": 0.81896257, "learning_rate": 1.949372251462751e-06, "loss": 0.84109801, "num_input_tokens_seen": 93914270, "step": 4348, "time_per_iteration": 2.6558678150177 }, { "auxiliary_loss_clip": 0.01141856, "auxiliary_loss_mlp": 0.00762895, "balance_loss_clip": 1.04783893, "balance_loss_mlp": 1.00032616, "epoch": 0.5229363313894067, "flos": 21063224252160.0, "grad_norm": 1.827963527577306, "language_loss": 0.83261585, "learning_rate": 1.9485935332947124e-06, "loss": 0.85166335, "num_input_tokens_seen": 93932180, "step": 4349, "time_per_iteration": 2.773237943649292 }, { "auxiliary_loss_clip": 0.01150069, "auxiliary_loss_mlp": 0.01025608, "balance_loss_clip": 1.04896331, "balance_loss_mlp": 1.01812744, "epoch": 0.5230565742800457, "flos": 14830389492480.0, "grad_norm": 2.283413072049421, "language_loss": 0.83994591, "learning_rate": 1.947814822924993e-06, "loss": 0.86170262, "num_input_tokens_seen": 93949690, "step": 4350, "time_per_iteration": 2.6666791439056396 }, { "auxiliary_loss_clip": 0.01184874, "auxiliary_loss_mlp": 0.01028789, "balance_loss_clip": 1.05433583, "balance_loss_mlp": 1.02098036, "epoch": 0.5231768171706848, "flos": 25813021253760.0, "grad_norm": 2.0816664226167227, "language_loss": 0.82921231, "learning_rate": 1.9470361204717236e-06, "loss": 0.85134894, "num_input_tokens_seen": 93968830, "step": 4351, "time_per_iteration": 2.7068676948547363 }, { "auxiliary_loss_clip": 0.01144351, "auxiliary_loss_mlp": 0.00763875, "balance_loss_clip": 1.04693508, "balance_loss_mlp": 1.00033426, "epoch": 0.5232970600613239, "flos": 22743807834240.0, "grad_norm": 1.632024740308647, "language_loss": 0.80980182, "learning_rate": 1.9462574260530326e-06, "loss": 0.82888407, "num_input_tokens_seen": 93989110, "step": 4352, "time_per_iteration": 2.755526065826416 }, { "auxiliary_loss_clip": 0.01156773, "auxiliary_loss_mlp": 0.01028895, "balance_loss_clip": 1.04615664, "balance_loss_mlp": 1.02080035, "epoch": 0.523417302951963, "flos": 17310703432320.0, "grad_norm": 1.961091391878979, "language_loss": 0.80835783, "learning_rate": 1.9454787397870472e-06, "loss": 0.8302145, "num_input_tokens_seen": 94006430, "step": 4353, "time_per_iteration": 3.5751290321350098 }, { "auxiliary_loss_clip": 0.01106266, "auxiliary_loss_mlp": 0.01027946, "balance_loss_clip": 1.04615068, "balance_loss_mlp": 1.01968527, "epoch": 0.523537545842602, "flos": 18551740285440.0, "grad_norm": 2.193890051644502, "language_loss": 0.72261822, "learning_rate": 1.944700061791894e-06, "loss": 0.74396038, "num_input_tokens_seen": 94024825, "step": 4354, "time_per_iteration": 4.62663722038269 }, { "auxiliary_loss_clip": 0.01168091, "auxiliary_loss_mlp": 0.01024275, "balance_loss_clip": 1.05110693, "balance_loss_mlp": 1.01643062, "epoch": 0.5236577887332411, "flos": 19719267955200.0, "grad_norm": 2.220604218042637, "language_loss": 0.65336794, "learning_rate": 1.943921392185698e-06, "loss": 0.67529154, "num_input_tokens_seen": 94043450, "step": 4355, "time_per_iteration": 2.715052604675293 }, { "auxiliary_loss_clip": 0.01159456, "auxiliary_loss_mlp": 0.01030677, "balance_loss_clip": 1.04918337, "balance_loss_mlp": 1.02211809, "epoch": 0.5237780316238803, "flos": 23550218121600.0, "grad_norm": 2.024391726465528, "language_loss": 0.76995081, "learning_rate": 1.9431427310865814e-06, "loss": 0.79185212, "num_input_tokens_seen": 94063055, "step": 4356, "time_per_iteration": 2.685035467147827 }, { "auxiliary_loss_clip": 0.01120238, "auxiliary_loss_mlp": 0.0102618, "balance_loss_clip": 1.04327631, "balance_loss_mlp": 1.01850855, "epoch": 0.5238982745145193, "flos": 22491894775680.0, "grad_norm": 1.8083952270237356, "language_loss": 0.78515196, "learning_rate": 1.942364078612667e-06, "loss": 0.80661613, "num_input_tokens_seen": 94081785, "step": 4357, "time_per_iteration": 2.7654733657836914 }, { "auxiliary_loss_clip": 0.01144383, "auxiliary_loss_mlp": 0.01030796, "balance_loss_clip": 1.0453732, "balance_loss_mlp": 1.02257037, "epoch": 0.5240185174051584, "flos": 27088927234560.0, "grad_norm": 1.687236475496652, "language_loss": 0.75255591, "learning_rate": 1.9415854348820765e-06, "loss": 0.77430767, "num_input_tokens_seen": 94101635, "step": 4358, "time_per_iteration": 3.673182964324951 }, { "auxiliary_loss_clip": 0.01172674, "auxiliary_loss_mlp": 0.01033064, "balance_loss_clip": 1.04899645, "balance_loss_mlp": 1.02453434, "epoch": 0.5241387602957975, "flos": 22674680110080.0, "grad_norm": 2.688943752040632, "language_loss": 0.68549097, "learning_rate": 1.940806800012929e-06, "loss": 0.70754832, "num_input_tokens_seen": 94121705, "step": 4359, "time_per_iteration": 2.6536269187927246 }, { "auxiliary_loss_clip": 0.01124994, "auxiliary_loss_mlp": 0.00763609, "balance_loss_clip": 1.04687381, "balance_loss_mlp": 1.0003202, "epoch": 0.5242590031864366, "flos": 40553453134080.0, "grad_norm": 2.0971870061398126, "language_loss": 0.63679707, "learning_rate": 1.9400281741233432e-06, "loss": 0.6556831, "num_input_tokens_seen": 94146595, "step": 4360, "time_per_iteration": 2.947805166244507 }, { "auxiliary_loss_clip": 0.01038248, "auxiliary_loss_mlp": 0.01002253, "balance_loss_clip": 1.01176298, "balance_loss_mlp": 1.00069165, "epoch": 0.5243792460770756, "flos": 66676313105280.0, "grad_norm": 0.6512341783764806, "language_loss": 0.52556771, "learning_rate": 1.939249557331435e-06, "loss": 0.5459727, "num_input_tokens_seen": 94212410, "step": 4361, "time_per_iteration": 3.3155996799468994 }, { "auxiliary_loss_clip": 0.01150282, "auxiliary_loss_mlp": 0.0102616, "balance_loss_clip": 1.05023479, "balance_loss_mlp": 1.01783061, "epoch": 0.5244994889677148, "flos": 28183663992960.0, "grad_norm": 2.5713229585375084, "language_loss": 0.72902334, "learning_rate": 1.938470949755321e-06, "loss": 0.75078779, "num_input_tokens_seen": 94232290, "step": 4362, "time_per_iteration": 2.798473596572876 }, { "auxiliary_loss_clip": 0.01043246, "auxiliary_loss_mlp": 0.01004028, "balance_loss_clip": 1.01013207, "balance_loss_mlp": 1.00257921, "epoch": 0.5246197318583539, "flos": 65950379239680.0, "grad_norm": 0.8073214132040221, "language_loss": 0.55658919, "learning_rate": 1.937692351513115e-06, "loss": 0.57706195, "num_input_tokens_seen": 94291285, "step": 4363, "time_per_iteration": 3.254577398300171 }, { "auxiliary_loss_clip": 0.01174107, "auxiliary_loss_mlp": 0.01026547, "balance_loss_clip": 1.05096412, "balance_loss_mlp": 1.0184288, "epoch": 0.5247399747489929, "flos": 21033490769280.0, "grad_norm": 1.6269043785970971, "language_loss": 0.80721205, "learning_rate": 1.9369137627229297e-06, "loss": 0.82921863, "num_input_tokens_seen": 94309685, "step": 4364, "time_per_iteration": 2.6650190353393555 }, { "auxiliary_loss_clip": 0.01167283, "auxiliary_loss_mlp": 0.0102446, "balance_loss_clip": 1.05030978, "balance_loss_mlp": 1.01650858, "epoch": 0.5248602176396321, "flos": 19025940660480.0, "grad_norm": 2.461514790756503, "language_loss": 0.88343751, "learning_rate": 1.936135183502877e-06, "loss": 0.90535498, "num_input_tokens_seen": 94326985, "step": 4365, "time_per_iteration": 2.6811435222625732 }, { "auxiliary_loss_clip": 0.01145904, "auxiliary_loss_mlp": 0.01033814, "balance_loss_clip": 1.0488106, "balance_loss_mlp": 1.02499259, "epoch": 0.5249804605302711, "flos": 22200084685440.0, "grad_norm": 2.036694965775395, "language_loss": 0.8025319, "learning_rate": 1.935356613971066e-06, "loss": 0.82432902, "num_input_tokens_seen": 94347645, "step": 4366, "time_per_iteration": 2.7800798416137695 }, { "auxiliary_loss_clip": 0.01155993, "auxiliary_loss_mlp": 0.00763202, "balance_loss_clip": 1.05044365, "balance_loss_mlp": 1.00027013, "epoch": 0.5251007034209102, "flos": 23805686626560.0, "grad_norm": 1.7759401931312815, "language_loss": 0.76889539, "learning_rate": 1.9345780542456047e-06, "loss": 0.78808737, "num_input_tokens_seen": 94367020, "step": 4367, "time_per_iteration": 2.7514760494232178 }, { "auxiliary_loss_clip": 0.01160478, "auxiliary_loss_mlp": 0.01029583, "balance_loss_clip": 1.04903281, "balance_loss_mlp": 1.02107096, "epoch": 0.5252209463115494, "flos": 23294605962240.0, "grad_norm": 2.089035609182676, "language_loss": 0.71662956, "learning_rate": 1.9337995044446007e-06, "loss": 0.73853016, "num_input_tokens_seen": 94385860, "step": 4368, "time_per_iteration": 2.696051597595215 }, { "auxiliary_loss_clip": 0.01176647, "auxiliary_loss_mlp": 0.01033346, "balance_loss_clip": 1.05257964, "balance_loss_mlp": 1.0247035, "epoch": 0.5253411892021884, "flos": 19828687760640.0, "grad_norm": 2.0711986272542893, "language_loss": 0.7996586, "learning_rate": 1.9330209646861596e-06, "loss": 0.82175851, "num_input_tokens_seen": 94405010, "step": 4369, "time_per_iteration": 2.686502456665039 }, { "auxiliary_loss_clip": 0.01153936, "auxiliary_loss_mlp": 0.01024255, "balance_loss_clip": 1.04990518, "balance_loss_mlp": 1.01613665, "epoch": 0.5254614320928275, "flos": 24133730561280.0, "grad_norm": 1.6270279663297633, "language_loss": 0.78032243, "learning_rate": 1.9322424350883843e-06, "loss": 0.80210429, "num_input_tokens_seen": 94426845, "step": 4370, "time_per_iteration": 2.7543272972106934 }, { "auxiliary_loss_clip": 0.01155333, "auxiliary_loss_mlp": 0.0102767, "balance_loss_clip": 1.04756975, "balance_loss_mlp": 1.01991582, "epoch": 0.5255816749834666, "flos": 24644954880000.0, "grad_norm": 1.6592539420659862, "language_loss": 0.78915703, "learning_rate": 1.931463915769379e-06, "loss": 0.81098706, "num_input_tokens_seen": 94446960, "step": 4371, "time_per_iteration": 2.7543201446533203 }, { "auxiliary_loss_clip": 0.01125078, "auxiliary_loss_mlp": 0.01027632, "balance_loss_clip": 1.04586434, "balance_loss_mlp": 1.01904321, "epoch": 0.5257019178741057, "flos": 14136595320960.0, "grad_norm": 2.1673817482100968, "language_loss": 0.74022889, "learning_rate": 1.930685406847242e-06, "loss": 0.76175594, "num_input_tokens_seen": 94461535, "step": 4372, "time_per_iteration": 2.645777940750122 }, { "auxiliary_loss_clip": 0.01153001, "auxiliary_loss_mlp": 0.01029259, "balance_loss_clip": 1.05044758, "balance_loss_mlp": 1.02182627, "epoch": 0.5258221607647448, "flos": 23548961145600.0, "grad_norm": 1.4894360566837161, "language_loss": 0.81828511, "learning_rate": 1.9299069084400734e-06, "loss": 0.8401078, "num_input_tokens_seen": 94482395, "step": 4373, "time_per_iteration": 2.728212356567383 }, { "auxiliary_loss_clip": 0.011413, "auxiliary_loss_mlp": 0.01027199, "balance_loss_clip": 1.05192196, "balance_loss_mlp": 1.01847231, "epoch": 0.5259424036553839, "flos": 24966103403520.0, "grad_norm": 2.19954946984769, "language_loss": 0.69679385, "learning_rate": 1.9291284206659717e-06, "loss": 0.71847886, "num_input_tokens_seen": 94500580, "step": 4374, "time_per_iteration": 2.7593822479248047 }, { "auxiliary_loss_clip": 0.01187278, "auxiliary_loss_mlp": 0.01032698, "balance_loss_clip": 1.05484319, "balance_loss_mlp": 1.0237211, "epoch": 0.526062646546023, "flos": 28763908295040.0, "grad_norm": 2.111169143140802, "language_loss": 0.71761185, "learning_rate": 1.928349943643032e-06, "loss": 0.73981166, "num_input_tokens_seen": 94519680, "step": 4375, "time_per_iteration": 2.860269784927368 }, { "auxiliary_loss_clip": 0.01168791, "auxiliary_loss_mlp": 0.01029193, "balance_loss_clip": 1.0535152, "balance_loss_mlp": 1.02092624, "epoch": 0.526182889436662, "flos": 22821375254400.0, "grad_norm": 1.7062473921669083, "language_loss": 0.81933182, "learning_rate": 1.9275714774893493e-06, "loss": 0.84131169, "num_input_tokens_seen": 94539135, "step": 4376, "time_per_iteration": 2.7256040573120117 }, { "auxiliary_loss_clip": 0.01131616, "auxiliary_loss_mlp": 0.01035953, "balance_loss_clip": 1.04558349, "balance_loss_mlp": 1.0270009, "epoch": 0.5263031323273012, "flos": 22929466256640.0, "grad_norm": 2.3654260252469164, "language_loss": 0.72913843, "learning_rate": 1.9267930223230154e-06, "loss": 0.75081414, "num_input_tokens_seen": 94557610, "step": 4377, "time_per_iteration": 2.7389309406280518 }, { "auxiliary_loss_clip": 0.01158432, "auxiliary_loss_mlp": 0.01029701, "balance_loss_clip": 1.05097485, "balance_loss_mlp": 1.02135599, "epoch": 0.5264233752179402, "flos": 17748634049280.0, "grad_norm": 2.1659660277051693, "language_loss": 0.77929312, "learning_rate": 1.9260145782621224e-06, "loss": 0.80117446, "num_input_tokens_seen": 94575390, "step": 4378, "time_per_iteration": 2.6920933723449707 }, { "auxiliary_loss_clip": 0.01153243, "auxiliary_loss_mlp": 0.01033912, "balance_loss_clip": 1.0524745, "balance_loss_mlp": 1.02574027, "epoch": 0.5265436181085793, "flos": 24421626069120.0, "grad_norm": 2.0134292168063164, "language_loss": 0.88095212, "learning_rate": 1.925236145424758e-06, "loss": 0.90282369, "num_input_tokens_seen": 94594210, "step": 4379, "time_per_iteration": 3.6833786964416504 }, { "auxiliary_loss_clip": 0.01066381, "auxiliary_loss_mlp": 0.01004472, "balance_loss_clip": 1.01203322, "balance_loss_mlp": 1.00285089, "epoch": 0.5266638609992185, "flos": 69207298156800.0, "grad_norm": 0.6949042958182249, "language_loss": 0.57578075, "learning_rate": 1.924457723929012e-06, "loss": 0.59648925, "num_input_tokens_seen": 94665020, "step": 4380, "time_per_iteration": 4.267938852310181 }, { "auxiliary_loss_clip": 0.01170852, "auxiliary_loss_mlp": 0.01024177, "balance_loss_clip": 1.05234742, "balance_loss_mlp": 1.01609433, "epoch": 0.5267841038898575, "flos": 20738699850240.0, "grad_norm": 1.4522060553568614, "language_loss": 0.82875341, "learning_rate": 1.9236793138929685e-06, "loss": 0.85070366, "num_input_tokens_seen": 94684290, "step": 4381, "time_per_iteration": 3.551720142364502 }, { "auxiliary_loss_clip": 0.01174502, "auxiliary_loss_mlp": 0.01026156, "balance_loss_clip": 1.05168819, "balance_loss_mlp": 1.01774287, "epoch": 0.5269043467804966, "flos": 17234392988160.0, "grad_norm": 2.061035938917171, "language_loss": 0.81265992, "learning_rate": 1.9229009154347133e-06, "loss": 0.83466649, "num_input_tokens_seen": 94701880, "step": 4382, "time_per_iteration": 2.671293020248413 }, { "auxiliary_loss_clip": 0.01109695, "auxiliary_loss_mlp": 0.007629, "balance_loss_clip": 1.04307973, "balance_loss_mlp": 1.00027823, "epoch": 0.5270245896711357, "flos": 18223157646720.0, "grad_norm": 2.0914978789530445, "language_loss": 0.80632377, "learning_rate": 1.922122528672327e-06, "loss": 0.8250497, "num_input_tokens_seen": 94720545, "step": 4383, "time_per_iteration": 2.689232587814331 }, { "auxiliary_loss_clip": 0.01181097, "auxiliary_loss_mlp": 0.01030657, "balance_loss_clip": 1.05194163, "balance_loss_mlp": 1.02258086, "epoch": 0.5271448325617748, "flos": 21287558643840.0, "grad_norm": 2.2640248823425346, "language_loss": 0.78788418, "learning_rate": 1.9213441537238914e-06, "loss": 0.81000173, "num_input_tokens_seen": 94737420, "step": 4384, "time_per_iteration": 2.666966676712036 }, { "auxiliary_loss_clip": 0.01025582, "auxiliary_loss_mlp": 0.01001861, "balance_loss_clip": 1.01026821, "balance_loss_mlp": 1.00050187, "epoch": 0.5272650754524139, "flos": 65495497403520.0, "grad_norm": 0.8306524455551767, "language_loss": 0.57367992, "learning_rate": 1.920565790707485e-06, "loss": 0.59395432, "num_input_tokens_seen": 94802810, "step": 4385, "time_per_iteration": 4.380992889404297 }, { "auxiliary_loss_clip": 0.01137732, "auxiliary_loss_mlp": 0.0102936, "balance_loss_clip": 1.04746962, "balance_loss_mlp": 1.02027059, "epoch": 0.527385318343053, "flos": 19676426008320.0, "grad_norm": 1.9740825351373121, "language_loss": 0.65966463, "learning_rate": 1.9197874397411853e-06, "loss": 0.68133551, "num_input_tokens_seen": 94819440, "step": 4386, "time_per_iteration": 3.1066417694091797 }, { "auxiliary_loss_clip": 0.01142, "auxiliary_loss_mlp": 0.01026374, "balance_loss_clip": 1.04508615, "balance_loss_mlp": 1.0180589, "epoch": 0.5275055612336921, "flos": 12712018947840.0, "grad_norm": 3.043851194874605, "language_loss": 0.67320013, "learning_rate": 1.919009100943067e-06, "loss": 0.69488394, "num_input_tokens_seen": 94835130, "step": 4387, "time_per_iteration": 2.664076566696167 }, { "auxiliary_loss_clip": 0.01136227, "auxiliary_loss_mlp": 0.01036837, "balance_loss_clip": 1.04958749, "balance_loss_mlp": 1.02806306, "epoch": 0.5276258041243311, "flos": 17749029098880.0, "grad_norm": 1.9650062244272355, "language_loss": 0.6566239, "learning_rate": 1.9182307744312043e-06, "loss": 0.67835456, "num_input_tokens_seen": 94852235, "step": 4388, "time_per_iteration": 2.693253993988037 }, { "auxiliary_loss_clip": 0.01157511, "auxiliary_loss_mlp": 0.01029489, "balance_loss_clip": 1.04867744, "balance_loss_mlp": 1.02126324, "epoch": 0.5277460470149702, "flos": 22710447077760.0, "grad_norm": 1.8297587243454958, "language_loss": 0.76435232, "learning_rate": 1.9174524603236676e-06, "loss": 0.78622234, "num_input_tokens_seen": 94871185, "step": 4389, "time_per_iteration": 2.720808744430542 }, { "auxiliary_loss_clip": 0.01155141, "auxiliary_loss_mlp": 0.01034464, "balance_loss_clip": 1.04931474, "balance_loss_mlp": 1.02648902, "epoch": 0.5278662899056094, "flos": 19902699734400.0, "grad_norm": 1.9241423295290516, "language_loss": 0.76452714, "learning_rate": 1.916674158738527e-06, "loss": 0.78642315, "num_input_tokens_seen": 94890090, "step": 4390, "time_per_iteration": 2.6828348636627197 }, { "auxiliary_loss_clip": 0.01138739, "auxiliary_loss_mlp": 0.00764428, "balance_loss_clip": 1.04972649, "balance_loss_mlp": 1.00026357, "epoch": 0.5279865327962484, "flos": 18005215875840.0, "grad_norm": 1.9769914577373713, "language_loss": 0.60555744, "learning_rate": 1.9158958697938506e-06, "loss": 0.62458909, "num_input_tokens_seen": 94908470, "step": 4391, "time_per_iteration": 2.8245067596435547 }, { "auxiliary_loss_clip": 0.01149406, "auxiliary_loss_mlp": 0.01030181, "balance_loss_clip": 1.04926538, "balance_loss_mlp": 1.02186024, "epoch": 0.5281067756868875, "flos": 15924443892480.0, "grad_norm": 2.4181960079688074, "language_loss": 0.85856164, "learning_rate": 1.9151175936077032e-06, "loss": 0.8803575, "num_input_tokens_seen": 94923440, "step": 4392, "time_per_iteration": 2.66951584815979 }, { "auxiliary_loss_clip": 0.01167205, "auxiliary_loss_mlp": 0.01024641, "balance_loss_clip": 1.05156732, "balance_loss_mlp": 1.0164597, "epoch": 0.5282270185775266, "flos": 19426488197760.0, "grad_norm": 1.7120341317480272, "language_loss": 0.7950784, "learning_rate": 1.9143393302981507e-06, "loss": 0.81699687, "num_input_tokens_seen": 94941125, "step": 4393, "time_per_iteration": 2.6540300846099854 }, { "auxiliary_loss_clip": 0.01158701, "auxiliary_loss_mlp": 0.01030535, "balance_loss_clip": 1.05178297, "balance_loss_mlp": 1.02223849, "epoch": 0.5283472614681657, "flos": 16399613934720.0, "grad_norm": 2.053135989731018, "language_loss": 0.83608294, "learning_rate": 1.913561079983252e-06, "loss": 0.8579753, "num_input_tokens_seen": 94959950, "step": 4394, "time_per_iteration": 2.6335043907165527 }, { "auxiliary_loss_clip": 0.01161579, "auxiliary_loss_mlp": 0.01039394, "balance_loss_clip": 1.04936624, "balance_loss_mlp": 1.03010154, "epoch": 0.5284675043588047, "flos": 26760524163840.0, "grad_norm": 2.652303183336842, "language_loss": 0.75128841, "learning_rate": 1.9127828427810693e-06, "loss": 0.77329808, "num_input_tokens_seen": 94980515, "step": 4395, "time_per_iteration": 2.724336862564087 }, { "auxiliary_loss_clip": 0.01149058, "auxiliary_loss_mlp": 0.0103109, "balance_loss_clip": 1.04860711, "balance_loss_mlp": 1.02271557, "epoch": 0.5285877472494439, "flos": 19899898473600.0, "grad_norm": 2.1562572205101924, "language_loss": 0.81049806, "learning_rate": 1.9120046188096607e-06, "loss": 0.83229959, "num_input_tokens_seen": 94998560, "step": 4396, "time_per_iteration": 2.732729196548462 }, { "auxiliary_loss_clip": 0.01154072, "auxiliary_loss_mlp": 0.01028881, "balance_loss_clip": 1.05323601, "balance_loss_mlp": 1.02087867, "epoch": 0.528707990140083, "flos": 20011257613440.0, "grad_norm": 3.928881035084848, "language_loss": 0.74568033, "learning_rate": 1.9112264081870804e-06, "loss": 0.76750982, "num_input_tokens_seen": 95016950, "step": 4397, "time_per_iteration": 2.6809356212615967 }, { "auxiliary_loss_clip": 0.01138544, "auxiliary_loss_mlp": 0.01031983, "balance_loss_clip": 1.04953945, "balance_loss_mlp": 1.02332234, "epoch": 0.528828233030722, "flos": 20667956014080.0, "grad_norm": 2.5484886036110703, "language_loss": 0.75921869, "learning_rate": 1.9104482110313843e-06, "loss": 0.78092396, "num_input_tokens_seen": 95036540, "step": 4398, "time_per_iteration": 2.770430564880371 }, { "auxiliary_loss_clip": 0.01168589, "auxiliary_loss_mlp": 0.01033851, "balance_loss_clip": 1.05103195, "balance_loss_mlp": 1.02541089, "epoch": 0.5289484759213612, "flos": 25192448956800.0, "grad_norm": 1.9184905142822517, "language_loss": 0.74196064, "learning_rate": 1.909670027460623e-06, "loss": 0.76398504, "num_input_tokens_seen": 95053840, "step": 4399, "time_per_iteration": 2.6679933071136475 }, { "auxiliary_loss_clip": 0.01170924, "auxiliary_loss_mlp": 0.01029216, "balance_loss_clip": 1.0532136, "balance_loss_mlp": 1.0209192, "epoch": 0.5290687188120002, "flos": 31139255715840.0, "grad_norm": 1.914597062075373, "language_loss": 0.71729064, "learning_rate": 1.908891857592847e-06, "loss": 0.73929203, "num_input_tokens_seen": 95074910, "step": 4400, "time_per_iteration": 2.774463415145874 }, { "auxiliary_loss_clip": 0.01135038, "auxiliary_loss_mlp": 0.01021995, "balance_loss_clip": 1.04904532, "balance_loss_mlp": 1.01342392, "epoch": 0.5291889617026393, "flos": 20119851406080.0, "grad_norm": 2.252218094286598, "language_loss": 0.90295923, "learning_rate": 1.9081137015461034e-06, "loss": 0.92452955, "num_input_tokens_seen": 95090985, "step": 4401, "time_per_iteration": 2.7129905223846436 }, { "auxiliary_loss_clip": 0.01120178, "auxiliary_loss_mlp": 0.01031757, "balance_loss_clip": 1.04722142, "balance_loss_mlp": 1.02326286, "epoch": 0.5293092045932785, "flos": 19643747610240.0, "grad_norm": 1.858437256453434, "language_loss": 0.90481651, "learning_rate": 1.9073355594384383e-06, "loss": 0.92633581, "num_input_tokens_seen": 95109225, "step": 4402, "time_per_iteration": 2.790578603744507 }, { "auxiliary_loss_clip": 0.01133313, "auxiliary_loss_mlp": 0.0102652, "balance_loss_clip": 1.0469749, "balance_loss_mlp": 1.0178411, "epoch": 0.5294294474839175, "flos": 24317736958080.0, "grad_norm": 2.020619459800134, "language_loss": 0.8036145, "learning_rate": 1.906557431387895e-06, "loss": 0.82521284, "num_input_tokens_seen": 95128215, "step": 4403, "time_per_iteration": 2.721859931945801 }, { "auxiliary_loss_clip": 0.011381, "auxiliary_loss_mlp": 0.01027817, "balance_loss_clip": 1.05206418, "balance_loss_mlp": 1.01932371, "epoch": 0.5295496903745566, "flos": 18875941464960.0, "grad_norm": 2.0629777318904523, "language_loss": 0.78558904, "learning_rate": 1.905779317512516e-06, "loss": 0.80724823, "num_input_tokens_seen": 95145760, "step": 4404, "time_per_iteration": 2.7082860469818115 }, { "auxiliary_loss_clip": 0.01170443, "auxiliary_loss_mlp": 0.01029187, "balance_loss_clip": 1.05320728, "balance_loss_mlp": 1.02147388, "epoch": 0.5296699332651957, "flos": 20923101296640.0, "grad_norm": 1.9335089530208531, "language_loss": 0.80417323, "learning_rate": 1.9050012179303385e-06, "loss": 0.82616955, "num_input_tokens_seen": 95164270, "step": 4405, "time_per_iteration": 3.669764518737793 }, { "auxiliary_loss_clip": 0.01171603, "auxiliary_loss_mlp": 0.010256, "balance_loss_clip": 1.05097342, "balance_loss_mlp": 1.0171597, "epoch": 0.5297901761558348, "flos": 22046745525120.0, "grad_norm": 2.096785866707623, "language_loss": 0.68993914, "learning_rate": 1.904223132759401e-06, "loss": 0.71191108, "num_input_tokens_seen": 95182870, "step": 4406, "time_per_iteration": 4.579307556152344 }, { "auxiliary_loss_clip": 0.0117079, "auxiliary_loss_mlp": 0.01031436, "balance_loss_clip": 1.05150032, "balance_loss_mlp": 1.02325201, "epoch": 0.5299104190464738, "flos": 21798495653760.0, "grad_norm": 2.572511257112462, "language_loss": 0.68715513, "learning_rate": 1.9034450621177383e-06, "loss": 0.70917749, "num_input_tokens_seen": 95201190, "step": 4407, "time_per_iteration": 2.6572837829589844 }, { "auxiliary_loss_clip": 0.01171712, "auxiliary_loss_mlp": 0.01030504, "balance_loss_clip": 1.05432844, "balance_loss_mlp": 1.02161121, "epoch": 0.530030661937113, "flos": 14720790119040.0, "grad_norm": 2.1399127389271433, "language_loss": 0.70489264, "learning_rate": 1.9026670061233824e-06, "loss": 0.72691476, "num_input_tokens_seen": 95218625, "step": 4408, "time_per_iteration": 2.70247220993042 }, { "auxiliary_loss_clip": 0.01151804, "auxiliary_loss_mlp": 0.01029128, "balance_loss_clip": 1.05013728, "balance_loss_mlp": 1.021052, "epoch": 0.5301509048277521, "flos": 21251504367360.0, "grad_norm": 1.6838791657119094, "language_loss": 0.80763096, "learning_rate": 1.901888964894365e-06, "loss": 0.82944036, "num_input_tokens_seen": 95237665, "step": 4409, "time_per_iteration": 2.7121400833129883 }, { "auxiliary_loss_clip": 0.01186212, "auxiliary_loss_mlp": 0.0102942, "balance_loss_clip": 1.05323792, "balance_loss_mlp": 1.02128994, "epoch": 0.5302711477183911, "flos": 25957058791680.0, "grad_norm": 2.3018645924248173, "language_loss": 0.67912662, "learning_rate": 1.9011109385487134e-06, "loss": 0.70128286, "num_input_tokens_seen": 95258915, "step": 4410, "time_per_iteration": 3.552081823348999 }, { "auxiliary_loss_clip": 0.01186283, "auxiliary_loss_mlp": 0.01033024, "balance_loss_clip": 1.05359268, "balance_loss_mlp": 1.02480459, "epoch": 0.5303913906090303, "flos": 22273126992000.0, "grad_norm": 2.2629586042027947, "language_loss": 0.66364986, "learning_rate": 1.900332927204454e-06, "loss": 0.68584293, "num_input_tokens_seen": 95277365, "step": 4411, "time_per_iteration": 2.594653367996216 }, { "auxiliary_loss_clip": 0.01165185, "auxiliary_loss_mlp": 0.01033461, "balance_loss_clip": 1.05163598, "balance_loss_mlp": 1.02434754, "epoch": 0.5305116334996693, "flos": 24936010784640.0, "grad_norm": 1.8027620368650092, "language_loss": 0.7670294, "learning_rate": 1.8995549309796097e-06, "loss": 0.78901589, "num_input_tokens_seen": 95296670, "step": 4412, "time_per_iteration": 2.6839632987976074 }, { "auxiliary_loss_clip": 0.01177259, "auxiliary_loss_mlp": 0.01030637, "balance_loss_clip": 1.05464244, "balance_loss_mlp": 1.02227426, "epoch": 0.5306318763903084, "flos": 20189338266240.0, "grad_norm": 2.059473482375525, "language_loss": 0.76695418, "learning_rate": 1.8987769499922028e-06, "loss": 0.78903311, "num_input_tokens_seen": 95315640, "step": 4413, "time_per_iteration": 2.69919753074646 }, { "auxiliary_loss_clip": 0.01170063, "auxiliary_loss_mlp": 0.00763491, "balance_loss_clip": 1.05282259, "balance_loss_mlp": 1.00030041, "epoch": 0.5307521192809476, "flos": 20266366982400.0, "grad_norm": 2.1881085489188203, "language_loss": 0.70904779, "learning_rate": 1.897998984360252e-06, "loss": 0.7283833, "num_input_tokens_seen": 95334610, "step": 4414, "time_per_iteration": 2.665048599243164 }, { "auxiliary_loss_clip": 0.01152908, "auxiliary_loss_mlp": 0.01031252, "balance_loss_clip": 1.04969096, "balance_loss_mlp": 1.02313352, "epoch": 0.5308723621715866, "flos": 28844276976000.0, "grad_norm": 1.482583483614544, "language_loss": 0.78710443, "learning_rate": 1.897221034201775e-06, "loss": 0.80894601, "num_input_tokens_seen": 95358350, "step": 4415, "time_per_iteration": 2.737605094909668 }, { "auxiliary_loss_clip": 0.01142391, "auxiliary_loss_mlp": 0.01025286, "balance_loss_clip": 1.04715788, "balance_loss_mlp": 1.01757956, "epoch": 0.5309926050622257, "flos": 27457766040960.0, "grad_norm": 1.4817277788063077, "language_loss": 0.6648441, "learning_rate": 1.8964430996347842e-06, "loss": 0.68652081, "num_input_tokens_seen": 95379900, "step": 4416, "time_per_iteration": 2.7136640548706055 }, { "auxiliary_loss_clip": 0.01155662, "auxiliary_loss_mlp": 0.0102787, "balance_loss_clip": 1.04981947, "balance_loss_mlp": 1.01901841, "epoch": 0.5311128479528648, "flos": 20514545026560.0, "grad_norm": 1.6911891074210676, "language_loss": 0.8220048, "learning_rate": 1.8956651807772931e-06, "loss": 0.84384012, "num_input_tokens_seen": 95397935, "step": 4417, "time_per_iteration": 2.716836452484131 }, { "auxiliary_loss_clip": 0.01166874, "auxiliary_loss_mlp": 0.01025656, "balance_loss_clip": 1.05099237, "balance_loss_mlp": 1.01790714, "epoch": 0.5312330908435039, "flos": 21397660807680.0, "grad_norm": 1.829053089005621, "language_loss": 0.84315687, "learning_rate": 1.8948872777473115e-06, "loss": 0.86508214, "num_input_tokens_seen": 95415890, "step": 4418, "time_per_iteration": 2.619044542312622 }, { "auxiliary_loss_clip": 0.01156238, "auxiliary_loss_mlp": 0.01022893, "balance_loss_clip": 1.05198812, "balance_loss_mlp": 1.0148406, "epoch": 0.531353333734143, "flos": 24717350741760.0, "grad_norm": 1.6727673281825883, "language_loss": 0.63380563, "learning_rate": 1.8941093906628458e-06, "loss": 0.65559697, "num_input_tokens_seen": 95433675, "step": 4419, "time_per_iteration": 2.76678204536438 }, { "auxiliary_loss_clip": 0.01149875, "auxiliary_loss_mlp": 0.0102962, "balance_loss_clip": 1.04923999, "balance_loss_mlp": 1.02162123, "epoch": 0.531473576624782, "flos": 30480689808000.0, "grad_norm": 1.6773893156424067, "language_loss": 0.70996773, "learning_rate": 1.893331519641902e-06, "loss": 0.73176265, "num_input_tokens_seen": 95455820, "step": 4420, "time_per_iteration": 2.8477962017059326 }, { "auxiliary_loss_clip": 0.01129648, "auxiliary_loss_mlp": 0.01031061, "balance_loss_clip": 1.04522991, "balance_loss_mlp": 1.02222729, "epoch": 0.5315938195154212, "flos": 23002975440000.0, "grad_norm": 2.18397345109428, "language_loss": 0.73684978, "learning_rate": 1.8925536648024815e-06, "loss": 0.75845683, "num_input_tokens_seen": 95473240, "step": 4421, "time_per_iteration": 2.751095771789551 }, { "auxiliary_loss_clip": 0.0118688, "auxiliary_loss_mlp": 0.01027172, "balance_loss_clip": 1.05488527, "balance_loss_mlp": 1.01955414, "epoch": 0.5317140624060602, "flos": 22748584343040.0, "grad_norm": 2.199579337671364, "language_loss": 0.7597177, "learning_rate": 1.8917758262625849e-06, "loss": 0.78185821, "num_input_tokens_seen": 95493480, "step": 4422, "time_per_iteration": 2.5881690979003906 }, { "auxiliary_loss_clip": 0.01152644, "auxiliary_loss_mlp": 0.01028251, "balance_loss_clip": 1.05159187, "balance_loss_mlp": 1.02058291, "epoch": 0.5318343052966993, "flos": 22821087945600.0, "grad_norm": 1.7620824708031575, "language_loss": 0.81021965, "learning_rate": 1.8909980041402089e-06, "loss": 0.83202863, "num_input_tokens_seen": 95512075, "step": 4423, "time_per_iteration": 2.7035598754882812 }, { "auxiliary_loss_clip": 0.01162344, "auxiliary_loss_mlp": 0.01028501, "balance_loss_clip": 1.04918754, "balance_loss_mlp": 1.01980495, "epoch": 0.5319545481873384, "flos": 13626089274240.0, "grad_norm": 2.7121525223040317, "language_loss": 0.65725708, "learning_rate": 1.8902201985533494e-06, "loss": 0.67916554, "num_input_tokens_seen": 95529340, "step": 4424, "time_per_iteration": 2.577834129333496 }, { "auxiliary_loss_clip": 0.01155704, "auxiliary_loss_mlp": 0.01028781, "balance_loss_clip": 1.05215526, "balance_loss_mlp": 1.02022767, "epoch": 0.5320747910779775, "flos": 22162522037760.0, "grad_norm": 2.467131157895143, "language_loss": 0.74905348, "learning_rate": 1.8894424096199983e-06, "loss": 0.77089834, "num_input_tokens_seen": 95548545, "step": 4425, "time_per_iteration": 2.6902053356170654 }, { "auxiliary_loss_clip": 0.01173248, "auxiliary_loss_mlp": 0.01031864, "balance_loss_clip": 1.0546999, "balance_loss_mlp": 1.02365029, "epoch": 0.5321950339686166, "flos": 18588081870720.0, "grad_norm": 1.9860907063156943, "language_loss": 0.85985887, "learning_rate": 1.8886646374581463e-06, "loss": 0.88190997, "num_input_tokens_seen": 95567770, "step": 4426, "time_per_iteration": 2.664602041244507 }, { "auxiliary_loss_clip": 0.01166385, "auxiliary_loss_mlp": 0.01033825, "balance_loss_clip": 1.04935455, "balance_loss_mlp": 1.02509308, "epoch": 0.5323152768592557, "flos": 22856818999680.0, "grad_norm": 1.7533541527709298, "language_loss": 0.71279037, "learning_rate": 1.8878868821857795e-06, "loss": 0.73479247, "num_input_tokens_seen": 95587420, "step": 4427, "time_per_iteration": 2.630034923553467 }, { "auxiliary_loss_clip": 0.01122433, "auxiliary_loss_mlp": 0.01027543, "balance_loss_clip": 1.04486454, "balance_loss_mlp": 1.01891279, "epoch": 0.5324355197498948, "flos": 33948690998400.0, "grad_norm": 2.308206861050779, "language_loss": 0.75463855, "learning_rate": 1.8871091439208838e-06, "loss": 0.77613831, "num_input_tokens_seen": 95609030, "step": 4428, "time_per_iteration": 2.8556265830993652 }, { "auxiliary_loss_clip": 0.01124604, "auxiliary_loss_mlp": 0.01033957, "balance_loss_clip": 1.04716039, "balance_loss_mlp": 1.02511144, "epoch": 0.5325557626405338, "flos": 23256720092160.0, "grad_norm": 2.4529153823725505, "language_loss": 0.7719605, "learning_rate": 1.8863314227814414e-06, "loss": 0.7935462, "num_input_tokens_seen": 95627340, "step": 4429, "time_per_iteration": 2.743635892868042 }, { "auxiliary_loss_clip": 0.01173608, "auxiliary_loss_mlp": 0.01030988, "balance_loss_clip": 1.05248094, "balance_loss_mlp": 1.02233911, "epoch": 0.532676005531173, "flos": 26718687797760.0, "grad_norm": 2.752643607520451, "language_loss": 0.49089339, "learning_rate": 1.8855537188854313e-06, "loss": 0.51293939, "num_input_tokens_seen": 95646315, "step": 4430, "time_per_iteration": 2.712009906768799 }, { "auxiliary_loss_clip": 0.01173296, "auxiliary_loss_mlp": 0.01029126, "balance_loss_clip": 1.05087829, "balance_loss_mlp": 1.02149642, "epoch": 0.5327962484218121, "flos": 17894610921600.0, "grad_norm": 2.0093079489141816, "language_loss": 0.78212357, "learning_rate": 1.8847760323508315e-06, "loss": 0.80414772, "num_input_tokens_seen": 95665220, "step": 4431, "time_per_iteration": 3.576850175857544 }, { "auxiliary_loss_clip": 0.01151817, "auxiliary_loss_mlp": 0.0103114, "balance_loss_clip": 1.05039275, "balance_loss_mlp": 1.02327859, "epoch": 0.5329164913124511, "flos": 17925385898880.0, "grad_norm": 1.710311822583488, "language_loss": 0.76160765, "learning_rate": 1.883998363295616e-06, "loss": 0.78343719, "num_input_tokens_seen": 95682700, "step": 4432, "time_per_iteration": 4.582221031188965 }, { "auxiliary_loss_clip": 0.01055579, "auxiliary_loss_mlp": 0.01003699, "balance_loss_clip": 1.01461172, "balance_loss_mlp": 1.00213742, "epoch": 0.5330367342030903, "flos": 57254178781440.0, "grad_norm": 0.8747379982611112, "language_loss": 0.62657863, "learning_rate": 1.8832207118377565e-06, "loss": 0.64717138, "num_input_tokens_seen": 95738070, "step": 4433, "time_per_iteration": 3.1660594940185547 }, { "auxiliary_loss_clip": 0.01182488, "auxiliary_loss_mlp": 0.01023144, "balance_loss_clip": 1.05256152, "balance_loss_mlp": 1.01553583, "epoch": 0.5331569770937293, "flos": 17420518287360.0, "grad_norm": 2.5414307819962403, "language_loss": 0.69509089, "learning_rate": 1.882443078095222e-06, "loss": 0.71714723, "num_input_tokens_seen": 95756950, "step": 4434, "time_per_iteration": 2.629270315170288 }, { "auxiliary_loss_clip": 0.01036895, "auxiliary_loss_mlp": 0.01002943, "balance_loss_clip": 1.01392698, "balance_loss_mlp": 1.0013932, "epoch": 0.5332772199843684, "flos": 56750783627520.0, "grad_norm": 0.853604326992352, "language_loss": 0.66742158, "learning_rate": 1.8816654621859794e-06, "loss": 0.68781996, "num_input_tokens_seen": 95816615, "step": 4435, "time_per_iteration": 3.2092788219451904 }, { "auxiliary_loss_clip": 0.01181056, "auxiliary_loss_mlp": 0.01028414, "balance_loss_clip": 1.05097497, "balance_loss_mlp": 1.02023017, "epoch": 0.5333974628750076, "flos": 18697753071360.0, "grad_norm": 2.4864945874270874, "language_loss": 0.72649187, "learning_rate": 1.8808878642279915e-06, "loss": 0.7485866, "num_input_tokens_seen": 95832020, "step": 4436, "time_per_iteration": 3.5043752193450928 }, { "auxiliary_loss_clip": 0.01139375, "auxiliary_loss_mlp": 0.01030927, "balance_loss_clip": 1.04294443, "balance_loss_mlp": 1.02186763, "epoch": 0.5335177057656466, "flos": 23805507058560.0, "grad_norm": 2.1202774298500593, "language_loss": 0.65186739, "learning_rate": 1.8801102843392209e-06, "loss": 0.67357039, "num_input_tokens_seen": 95851425, "step": 4437, "time_per_iteration": 2.700889825820923 }, { "auxiliary_loss_clip": 0.01137082, "auxiliary_loss_mlp": 0.01032554, "balance_loss_clip": 1.04507887, "balance_loss_mlp": 1.02423882, "epoch": 0.5336379486562857, "flos": 25078683605760.0, "grad_norm": 1.5167107242185394, "language_loss": 0.85015595, "learning_rate": 1.8793327226376238e-06, "loss": 0.87185228, "num_input_tokens_seen": 95870745, "step": 4438, "time_per_iteration": 2.7530484199523926 }, { "auxiliary_loss_clip": 0.01162982, "auxiliary_loss_mlp": 0.0102803, "balance_loss_clip": 1.04927075, "balance_loss_mlp": 1.01984084, "epoch": 0.5337581915469248, "flos": 21396691140480.0, "grad_norm": 1.8922888923037553, "language_loss": 0.80271918, "learning_rate": 1.8785551792411569e-06, "loss": 0.82462931, "num_input_tokens_seen": 95889755, "step": 4439, "time_per_iteration": 2.6886472702026367 }, { "auxiliary_loss_clip": 0.01152606, "auxiliary_loss_mlp": 0.01028424, "balance_loss_clip": 1.04901922, "balance_loss_mlp": 1.02082479, "epoch": 0.5338784344375639, "flos": 14865905064960.0, "grad_norm": 2.057691034298841, "language_loss": 0.82768238, "learning_rate": 1.8777776542677733e-06, "loss": 0.84949267, "num_input_tokens_seen": 95907805, "step": 4440, "time_per_iteration": 2.6588706970214844 }, { "auxiliary_loss_clip": 0.01136957, "auxiliary_loss_mlp": 0.01029315, "balance_loss_clip": 1.04205573, "balance_loss_mlp": 1.02130389, "epoch": 0.5339986773282029, "flos": 20813501923200.0, "grad_norm": 1.726980074577696, "language_loss": 0.73207724, "learning_rate": 1.8770001478354216e-06, "loss": 0.75373995, "num_input_tokens_seen": 95927480, "step": 4441, "time_per_iteration": 2.729613780975342 }, { "auxiliary_loss_clip": 0.01166271, "auxiliary_loss_mlp": 0.01036738, "balance_loss_clip": 1.05007875, "balance_loss_mlp": 1.02752924, "epoch": 0.5341189202188421, "flos": 17969089772160.0, "grad_norm": 2.3174023448291634, "language_loss": 0.84282082, "learning_rate": 1.8762226600620504e-06, "loss": 0.86485088, "num_input_tokens_seen": 95946095, "step": 4442, "time_per_iteration": 2.6815185546875 }, { "auxiliary_loss_clip": 0.01162252, "auxiliary_loss_mlp": 0.01034024, "balance_loss_clip": 1.04784441, "balance_loss_mlp": 1.02488089, "epoch": 0.5342391631094812, "flos": 11031866328960.0, "grad_norm": 2.5302510654783923, "language_loss": 0.59623432, "learning_rate": 1.8754451910656031e-06, "loss": 0.61819708, "num_input_tokens_seen": 95959995, "step": 4443, "time_per_iteration": 2.645616292953491 }, { "auxiliary_loss_clip": 0.01135036, "auxiliary_loss_mlp": 0.01032738, "balance_loss_clip": 1.04778314, "balance_loss_mlp": 1.0243336, "epoch": 0.5343594060001202, "flos": 15339135772800.0, "grad_norm": 2.078024492829133, "language_loss": 0.82568276, "learning_rate": 1.8746677409640212e-06, "loss": 0.84736049, "num_input_tokens_seen": 95977095, "step": 4444, "time_per_iteration": 2.7675089836120605 }, { "auxiliary_loss_clip": 0.01175281, "auxiliary_loss_mlp": 0.01026214, "balance_loss_clip": 1.05335438, "balance_loss_mlp": 1.01837027, "epoch": 0.5344796488907594, "flos": 26900898514560.0, "grad_norm": 1.7536272519256981, "language_loss": 0.84726572, "learning_rate": 1.8738903098752432e-06, "loss": 0.86928058, "num_input_tokens_seen": 95996225, "step": 4445, "time_per_iteration": 2.6577117443084717 }, { "auxiliary_loss_clip": 0.01155015, "auxiliary_loss_mlp": 0.01029861, "balance_loss_clip": 1.04903984, "balance_loss_mlp": 1.0217185, "epoch": 0.5345998917813984, "flos": 25411216740480.0, "grad_norm": 2.17614825041103, "language_loss": 0.73181868, "learning_rate": 1.8731128979172052e-06, "loss": 0.75366741, "num_input_tokens_seen": 96015425, "step": 4446, "time_per_iteration": 2.679762125015259 }, { "auxiliary_loss_clip": 0.01152333, "auxiliary_loss_mlp": 0.01025195, "balance_loss_clip": 1.05085635, "balance_loss_mlp": 1.01705301, "epoch": 0.5347201346720375, "flos": 32853379622400.0, "grad_norm": 2.2795699027501986, "language_loss": 0.67264092, "learning_rate": 1.8723355052078394e-06, "loss": 0.69441617, "num_input_tokens_seen": 96035460, "step": 4447, "time_per_iteration": 2.78326416015625 }, { "auxiliary_loss_clip": 0.01165356, "auxiliary_loss_mlp": 0.01034606, "balance_loss_clip": 1.04850602, "balance_loss_mlp": 1.02595782, "epoch": 0.5348403775626767, "flos": 17967940536960.0, "grad_norm": 3.3570732539407198, "language_loss": 0.77177429, "learning_rate": 1.8715581318650765e-06, "loss": 0.79377389, "num_input_tokens_seen": 96054515, "step": 4448, "time_per_iteration": 2.6226115226745605 }, { "auxiliary_loss_clip": 0.01153836, "auxiliary_loss_mlp": 0.01027672, "balance_loss_clip": 1.05246913, "balance_loss_mlp": 1.01833475, "epoch": 0.5349606204533157, "flos": 17603339535360.0, "grad_norm": 2.392776368350053, "language_loss": 0.81891906, "learning_rate": 1.8707807780068422e-06, "loss": 0.84073412, "num_input_tokens_seen": 96072330, "step": 4449, "time_per_iteration": 2.747387647628784 }, { "auxiliary_loss_clip": 0.01156134, "auxiliary_loss_mlp": 0.01029046, "balance_loss_clip": 1.04984164, "balance_loss_mlp": 1.02018821, "epoch": 0.5350808633439548, "flos": 29167831710720.0, "grad_norm": 1.8743204529868835, "language_loss": 0.6663183, "learning_rate": 1.8700034437510611e-06, "loss": 0.68817008, "num_input_tokens_seen": 96092425, "step": 4450, "time_per_iteration": 2.7108876705169678 }, { "auxiliary_loss_clip": 0.01130611, "auxiliary_loss_mlp": 0.01029155, "balance_loss_clip": 1.04527211, "balance_loss_mlp": 1.02063131, "epoch": 0.5352011062345938, "flos": 19499997381120.0, "grad_norm": 2.2559811157254073, "language_loss": 0.81405675, "learning_rate": 1.8692261292156549e-06, "loss": 0.83565438, "num_input_tokens_seen": 96111660, "step": 4451, "time_per_iteration": 2.7242164611816406 }, { "auxiliary_loss_clip": 0.01181637, "auxiliary_loss_mlp": 0.01029847, "balance_loss_clip": 1.05338991, "balance_loss_mlp": 1.02190137, "epoch": 0.535321349125233, "flos": 23477642691840.0, "grad_norm": 1.8462635073243578, "language_loss": 0.81027043, "learning_rate": 1.8684488345185401e-06, "loss": 0.83238524, "num_input_tokens_seen": 96131835, "step": 4452, "time_per_iteration": 2.6357946395874023 }, { "auxiliary_loss_clip": 0.01186983, "auxiliary_loss_mlp": 0.01043369, "balance_loss_clip": 1.05477822, "balance_loss_mlp": 1.03449392, "epoch": 0.535441592015872, "flos": 20478059786880.0, "grad_norm": 2.1390902373440928, "language_loss": 0.78786927, "learning_rate": 1.8676715597776332e-06, "loss": 0.8101728, "num_input_tokens_seen": 96150180, "step": 4453, "time_per_iteration": 2.58256459236145 }, { "auxiliary_loss_clip": 0.01117192, "auxiliary_loss_mlp": 0.01023191, "balance_loss_clip": 1.04498363, "balance_loss_mlp": 1.01558876, "epoch": 0.5355618349065111, "flos": 19573147428480.0, "grad_norm": 1.8578144157536076, "language_loss": 0.76066208, "learning_rate": 1.8668943051108455e-06, "loss": 0.78206587, "num_input_tokens_seen": 96167485, "step": 4454, "time_per_iteration": 2.835299253463745 }, { "auxiliary_loss_clip": 0.01155457, "auxiliary_loss_mlp": 0.0102748, "balance_loss_clip": 1.04912901, "balance_loss_mlp": 1.01893258, "epoch": 0.5356820777971503, "flos": 24024633978240.0, "grad_norm": 1.8050102705231947, "language_loss": 0.76016104, "learning_rate": 1.8661170706360856e-06, "loss": 0.78199041, "num_input_tokens_seen": 96186650, "step": 4455, "time_per_iteration": 2.6151602268218994 }, { "auxiliary_loss_clip": 0.01171477, "auxiliary_loss_mlp": 0.01025177, "balance_loss_clip": 1.05402911, "balance_loss_mlp": 1.01748216, "epoch": 0.5358023206877893, "flos": 20884676722560.0, "grad_norm": 1.5906833679966104, "language_loss": 0.81332201, "learning_rate": 1.8653398564712594e-06, "loss": 0.83528858, "num_input_tokens_seen": 96205595, "step": 4456, "time_per_iteration": 2.6540868282318115 }, { "auxiliary_loss_clip": 0.011679, "auxiliary_loss_mlp": 0.01028573, "balance_loss_clip": 1.05240333, "balance_loss_mlp": 1.02021646, "epoch": 0.5359225635784284, "flos": 22418996123520.0, "grad_norm": 1.6292206450168432, "language_loss": 0.82466662, "learning_rate": 1.8645626627342704e-06, "loss": 0.84663129, "num_input_tokens_seen": 96226360, "step": 4457, "time_per_iteration": 3.6488094329833984 }, { "auxiliary_loss_clip": 0.01176575, "auxiliary_loss_mlp": 0.01025523, "balance_loss_clip": 1.05338883, "balance_loss_mlp": 1.01760769, "epoch": 0.5360428064690675, "flos": 24097784025600.0, "grad_norm": 2.231464768925505, "language_loss": 0.80882758, "learning_rate": 1.8637854895430172e-06, "loss": 0.83084857, "num_input_tokens_seen": 96245625, "step": 4458, "time_per_iteration": 3.5858066082000732 }, { "auxiliary_loss_clip": 0.0113247, "auxiliary_loss_mlp": 0.01028864, "balance_loss_clip": 1.04630351, "balance_loss_mlp": 1.02037024, "epoch": 0.5361630493597066, "flos": 21434505183360.0, "grad_norm": 2.0366759274736905, "language_loss": 0.69370139, "learning_rate": 1.8630083370153978e-06, "loss": 0.71531469, "num_input_tokens_seen": 96265265, "step": 4459, "time_per_iteration": 3.659310817718506 }, { "auxiliary_loss_clip": 0.01025198, "auxiliary_loss_mlp": 0.01002182, "balance_loss_clip": 1.01195049, "balance_loss_mlp": 1.00054932, "epoch": 0.5362832922503457, "flos": 68888696520960.0, "grad_norm": 0.7421671522865998, "language_loss": 0.55335021, "learning_rate": 1.8622312052693041e-06, "loss": 0.57362401, "num_input_tokens_seen": 96326445, "step": 4460, "time_per_iteration": 3.431311845779419 }, { "auxiliary_loss_clip": 0.01158904, "auxiliary_loss_mlp": 0.01025967, "balance_loss_clip": 1.04585373, "balance_loss_mlp": 1.01725888, "epoch": 0.5364035351409848, "flos": 9793702563840.0, "grad_norm": 2.2212954571542824, "language_loss": 0.71355683, "learning_rate": 1.8614540944226267e-06, "loss": 0.73540556, "num_input_tokens_seen": 96343115, "step": 4461, "time_per_iteration": 2.6211354732513428 }, { "auxiliary_loss_clip": 0.01151657, "auxiliary_loss_mlp": 0.01030833, "balance_loss_clip": 1.0502826, "balance_loss_mlp": 1.02244055, "epoch": 0.5365237780316239, "flos": 23290080848640.0, "grad_norm": 1.6926802908418481, "language_loss": 0.68141991, "learning_rate": 1.8606770045932537e-06, "loss": 0.70324481, "num_input_tokens_seen": 96362230, "step": 4462, "time_per_iteration": 3.543541431427002 }, { "auxiliary_loss_clip": 0.01130777, "auxiliary_loss_mlp": 0.01027629, "balance_loss_clip": 1.0424931, "balance_loss_mlp": 1.01897466, "epoch": 0.5366440209222629, "flos": 26578133879040.0, "grad_norm": 1.7669476062884524, "language_loss": 0.8159287, "learning_rate": 1.859899935899068e-06, "loss": 0.83751273, "num_input_tokens_seen": 96382085, "step": 4463, "time_per_iteration": 2.77738094329834 }, { "auxiliary_loss_clip": 0.01152432, "auxiliary_loss_mlp": 0.0103199, "balance_loss_clip": 1.05019653, "balance_loss_mlp": 1.02334082, "epoch": 0.5367642638129021, "flos": 19608052469760.0, "grad_norm": 1.7725349714344847, "language_loss": 0.7912401, "learning_rate": 1.8591228884579506e-06, "loss": 0.81308436, "num_input_tokens_seen": 96400580, "step": 4464, "time_per_iteration": 2.7159528732299805 }, { "auxiliary_loss_clip": 0.01140522, "auxiliary_loss_mlp": 0.01029181, "balance_loss_clip": 1.04504371, "balance_loss_mlp": 1.02128923, "epoch": 0.5368845067035412, "flos": 23915214172800.0, "grad_norm": 2.232577384643004, "language_loss": 0.82329154, "learning_rate": 1.8583458623877795e-06, "loss": 0.84498858, "num_input_tokens_seen": 96419680, "step": 4465, "time_per_iteration": 2.714412212371826 }, { "auxiliary_loss_clip": 0.01171737, "auxiliary_loss_mlp": 0.01024053, "balance_loss_clip": 1.0512259, "balance_loss_mlp": 1.01622677, "epoch": 0.5370047495941802, "flos": 16873131951360.0, "grad_norm": 1.8477766329767404, "language_loss": 0.74663341, "learning_rate": 1.8575688578064281e-06, "loss": 0.76859128, "num_input_tokens_seen": 96437805, "step": 4466, "time_per_iteration": 2.706969738006592 }, { "auxiliary_loss_clip": 0.01176219, "auxiliary_loss_mlp": 0.01034693, "balance_loss_clip": 1.05428028, "balance_loss_mlp": 1.02590716, "epoch": 0.5371249924848194, "flos": 20740926493440.0, "grad_norm": 1.7289069928955294, "language_loss": 0.76498938, "learning_rate": 1.8567918748317674e-06, "loss": 0.78709859, "num_input_tokens_seen": 96457155, "step": 4467, "time_per_iteration": 2.657076835632324 }, { "auxiliary_loss_clip": 0.01139387, "auxiliary_loss_mlp": 0.01030354, "balance_loss_clip": 1.04538572, "balance_loss_mlp": 1.0220269, "epoch": 0.5372452353754584, "flos": 17968120104960.0, "grad_norm": 2.1440872404973654, "language_loss": 0.83140898, "learning_rate": 1.8560149135816659e-06, "loss": 0.85310638, "num_input_tokens_seen": 96473990, "step": 4468, "time_per_iteration": 2.6917827129364014 }, { "auxiliary_loss_clip": 0.01164435, "auxiliary_loss_mlp": 0.01026411, "balance_loss_clip": 1.04822636, "balance_loss_mlp": 1.01837587, "epoch": 0.5373654782660975, "flos": 15377021642880.0, "grad_norm": 2.999371324091986, "language_loss": 0.84279168, "learning_rate": 1.8552379741739873e-06, "loss": 0.86470014, "num_input_tokens_seen": 96491335, "step": 4469, "time_per_iteration": 2.6227927207946777 }, { "auxiliary_loss_clip": 0.01042187, "auxiliary_loss_mlp": 0.00754113, "balance_loss_clip": 1.01108456, "balance_loss_mlp": 1.00042772, "epoch": 0.5374857211567367, "flos": 69000091574400.0, "grad_norm": 0.8929543015985278, "language_loss": 0.55655497, "learning_rate": 1.8544610567265935e-06, "loss": 0.57451802, "num_input_tokens_seen": 96545275, "step": 4470, "time_per_iteration": 3.204318046569824 }, { "auxiliary_loss_clip": 0.01155993, "auxiliary_loss_mlp": 0.00763671, "balance_loss_clip": 1.05192578, "balance_loss_mlp": 1.00031447, "epoch": 0.5376059640473757, "flos": 15085355207040.0, "grad_norm": 1.9564426822136085, "language_loss": 0.83454138, "learning_rate": 1.853684161357341e-06, "loss": 0.85373801, "num_input_tokens_seen": 96562935, "step": 4471, "time_per_iteration": 2.670901298522949 }, { "auxiliary_loss_clip": 0.01168134, "auxiliary_loss_mlp": 0.00763445, "balance_loss_clip": 1.051525, "balance_loss_mlp": 1.00024271, "epoch": 0.5377262069380148, "flos": 19792597570560.0, "grad_norm": 4.85384481487378, "language_loss": 0.76960313, "learning_rate": 1.852907288184085e-06, "loss": 0.78891897, "num_input_tokens_seen": 96581820, "step": 4472, "time_per_iteration": 2.6365270614624023 }, { "auxiliary_loss_clip": 0.01125355, "auxiliary_loss_mlp": 0.01030934, "balance_loss_clip": 1.04405355, "balance_loss_mlp": 1.02290571, "epoch": 0.5378464498286539, "flos": 30003077640960.0, "grad_norm": 2.106816650142174, "language_loss": 0.70295799, "learning_rate": 1.8521304373246762e-06, "loss": 0.72452092, "num_input_tokens_seen": 96602865, "step": 4473, "time_per_iteration": 2.80812406539917 }, { "auxiliary_loss_clip": 0.01172122, "auxiliary_loss_mlp": 0.01035677, "balance_loss_clip": 1.05004704, "balance_loss_mlp": 1.02709389, "epoch": 0.537966692719293, "flos": 21251217058560.0, "grad_norm": 2.580670376508372, "language_loss": 0.88572675, "learning_rate": 1.8513536088969626e-06, "loss": 0.90780473, "num_input_tokens_seen": 96620530, "step": 4474, "time_per_iteration": 2.608285903930664 }, { "auxiliary_loss_clip": 0.01171709, "auxiliary_loss_mlp": 0.01029283, "balance_loss_clip": 1.05249906, "balance_loss_mlp": 1.02042556, "epoch": 0.538086935609932, "flos": 21543170803200.0, "grad_norm": 1.6053895120399482, "language_loss": 0.80759811, "learning_rate": 1.8505768030187884e-06, "loss": 0.82960802, "num_input_tokens_seen": 96640660, "step": 4475, "time_per_iteration": 2.6804568767547607 }, { "auxiliary_loss_clip": 0.01153225, "auxiliary_loss_mlp": 0.01029064, "balance_loss_clip": 1.05153048, "balance_loss_mlp": 1.02087474, "epoch": 0.5382071785005712, "flos": 22747219626240.0, "grad_norm": 1.587945484226246, "language_loss": 0.79952997, "learning_rate": 1.849800019807995e-06, "loss": 0.82135284, "num_input_tokens_seen": 96661885, "step": 4476, "time_per_iteration": 2.6981992721557617 }, { "auxiliary_loss_clip": 0.01136979, "auxiliary_loss_mlp": 0.01032616, "balance_loss_clip": 1.04707539, "balance_loss_mlp": 1.02457523, "epoch": 0.5383274213912103, "flos": 24934574240640.0, "grad_norm": 2.2554900103437125, "language_loss": 0.71181369, "learning_rate": 1.8490232593824186e-06, "loss": 0.7335096, "num_input_tokens_seen": 96678340, "step": 4477, "time_per_iteration": 2.7794244289398193 }, { "auxiliary_loss_clip": 0.01149274, "auxiliary_loss_mlp": 0.01027399, "balance_loss_clip": 1.04782403, "balance_loss_mlp": 1.01932824, "epoch": 0.5384476642818493, "flos": 22310186849280.0, "grad_norm": 2.053523952397613, "language_loss": 0.84648877, "learning_rate": 1.8482465218598935e-06, "loss": 0.8682555, "num_input_tokens_seen": 96698285, "step": 4478, "time_per_iteration": 2.647535800933838 }, { "auxiliary_loss_clip": 0.01138561, "auxiliary_loss_mlp": 0.01037565, "balance_loss_clip": 1.04709291, "balance_loss_mlp": 1.02876782, "epoch": 0.5385679071724885, "flos": 22711021695360.0, "grad_norm": 2.3669212994290723, "language_loss": 0.83607042, "learning_rate": 1.8474698073582508e-06, "loss": 0.85783172, "num_input_tokens_seen": 96719655, "step": 4479, "time_per_iteration": 2.6946322917938232 }, { "auxiliary_loss_clip": 0.01142961, "auxiliary_loss_mlp": 0.01029669, "balance_loss_clip": 1.04496455, "balance_loss_mlp": 1.02093709, "epoch": 0.5386881500631275, "flos": 15953746412160.0, "grad_norm": 3.6155790141516784, "language_loss": 0.87705851, "learning_rate": 1.8466931159953166e-06, "loss": 0.89878482, "num_input_tokens_seen": 96736290, "step": 4480, "time_per_iteration": 2.6859610080718994 }, { "auxiliary_loss_clip": 0.01160311, "auxiliary_loss_mlp": 0.0102927, "balance_loss_clip": 1.05320072, "balance_loss_mlp": 1.02074671, "epoch": 0.5388083929537666, "flos": 24060041809920.0, "grad_norm": 1.787057636112188, "language_loss": 0.84246463, "learning_rate": 1.8459164478889158e-06, "loss": 0.86436045, "num_input_tokens_seen": 96757685, "step": 4481, "time_per_iteration": 2.6710474491119385 }, { "auxiliary_loss_clip": 0.01130914, "auxiliary_loss_mlp": 0.01024765, "balance_loss_clip": 1.04416919, "balance_loss_mlp": 1.01749659, "epoch": 0.5389286358444056, "flos": 22236893147520.0, "grad_norm": 1.6773970846725486, "language_loss": 0.76133442, "learning_rate": 1.8451398031568663e-06, "loss": 0.78289115, "num_input_tokens_seen": 96777310, "step": 4482, "time_per_iteration": 2.7020978927612305 }, { "auxiliary_loss_clip": 0.01139635, "auxiliary_loss_mlp": 0.01033065, "balance_loss_clip": 1.04823101, "balance_loss_mlp": 1.02433932, "epoch": 0.5390488787350448, "flos": 24281718595200.0, "grad_norm": 1.4841246704341133, "language_loss": 0.74601638, "learning_rate": 1.844363181916986e-06, "loss": 0.76774341, "num_input_tokens_seen": 96798035, "step": 4483, "time_per_iteration": 3.7384109497070312 }, { "auxiliary_loss_clip": 0.01164815, "auxiliary_loss_mlp": 0.0102718, "balance_loss_clip": 1.04854584, "balance_loss_mlp": 1.0182693, "epoch": 0.5391691216256839, "flos": 16581393688320.0, "grad_norm": 4.07311245129802, "language_loss": 0.83047181, "learning_rate": 1.8435865842870868e-06, "loss": 0.85239178, "num_input_tokens_seen": 96815975, "step": 4484, "time_per_iteration": 2.6335999965667725 }, { "auxiliary_loss_clip": 0.01144412, "auxiliary_loss_mlp": 0.00763913, "balance_loss_clip": 1.04543519, "balance_loss_mlp": 1.00019121, "epoch": 0.5392893645163229, "flos": 23330049707520.0, "grad_norm": 1.8326906848611888, "language_loss": 0.71699464, "learning_rate": 1.8428100103849787e-06, "loss": 0.73607779, "num_input_tokens_seen": 96835770, "step": 4485, "time_per_iteration": 3.536465883255005 }, { "auxiliary_loss_clip": 0.01153204, "auxiliary_loss_mlp": 0.01030125, "balance_loss_clip": 1.05030012, "balance_loss_mlp": 1.02144027, "epoch": 0.5394096074069621, "flos": 15669801400320.0, "grad_norm": 2.242677675524116, "language_loss": 0.73577929, "learning_rate": 1.842033460328467e-06, "loss": 0.75761259, "num_input_tokens_seen": 96854490, "step": 4486, "time_per_iteration": 2.6450300216674805 }, { "auxiliary_loss_clip": 0.01159056, "auxiliary_loss_mlp": 0.00763429, "balance_loss_clip": 1.04975605, "balance_loss_mlp": 1.00022733, "epoch": 0.5395298502976011, "flos": 22893447893760.0, "grad_norm": 2.404478916102859, "language_loss": 0.75182509, "learning_rate": 1.8412569342353541e-06, "loss": 0.77104998, "num_input_tokens_seen": 96874645, "step": 4487, "time_per_iteration": 2.744840621948242 }, { "auxiliary_loss_clip": 0.01161088, "auxiliary_loss_mlp": 0.01032203, "balance_loss_clip": 1.05171371, "balance_loss_mlp": 1.02288055, "epoch": 0.5396500931882402, "flos": 23842135952640.0, "grad_norm": 1.8751407604536472, "language_loss": 0.84506762, "learning_rate": 1.840480432223438e-06, "loss": 0.86700046, "num_input_tokens_seen": 96893650, "step": 4488, "time_per_iteration": 3.5878994464874268 }, { "auxiliary_loss_clip": 0.01157337, "auxiliary_loss_mlp": 0.01029756, "balance_loss_clip": 1.04824436, "balance_loss_mlp": 1.02138758, "epoch": 0.5397703360788794, "flos": 26322988596480.0, "grad_norm": 2.0213511915156377, "language_loss": 0.77587557, "learning_rate": 1.8397039544105131e-06, "loss": 0.79774642, "num_input_tokens_seen": 96912735, "step": 4489, "time_per_iteration": 2.757640838623047 }, { "auxiliary_loss_clip": 0.01147586, "auxiliary_loss_mlp": 0.01028005, "balance_loss_clip": 1.04481459, "balance_loss_mlp": 1.01942205, "epoch": 0.5398905789695184, "flos": 21214588164480.0, "grad_norm": 1.7151451309884629, "language_loss": 0.69785297, "learning_rate": 1.8389275009143711e-06, "loss": 0.71960878, "num_input_tokens_seen": 96932475, "step": 4490, "time_per_iteration": 2.6806225776672363 }, { "auxiliary_loss_clip": 0.01178963, "auxiliary_loss_mlp": 0.0102775, "balance_loss_clip": 1.04951239, "balance_loss_mlp": 1.01936984, "epoch": 0.5400108218601575, "flos": 25080335631360.0, "grad_norm": 1.6941582493664962, "language_loss": 0.73475015, "learning_rate": 1.8381510718527988e-06, "loss": 0.75681722, "num_input_tokens_seen": 96952085, "step": 4491, "time_per_iteration": 2.6398253440856934 }, { "auxiliary_loss_clip": 0.01156413, "auxiliary_loss_mlp": 0.01028473, "balance_loss_clip": 1.04691291, "balance_loss_mlp": 1.02058101, "epoch": 0.5401310647507966, "flos": 26357498588160.0, "grad_norm": 1.8162920823656636, "language_loss": 0.63757181, "learning_rate": 1.8373746673435812e-06, "loss": 0.65942061, "num_input_tokens_seen": 96973110, "step": 4492, "time_per_iteration": 2.801379442214966 }, { "auxiliary_loss_clip": 0.0118711, "auxiliary_loss_mlp": 0.01028073, "balance_loss_clip": 1.05333447, "balance_loss_mlp": 1.01952577, "epoch": 0.5402513076414357, "flos": 27855332749440.0, "grad_norm": 1.6407675708596272, "language_loss": 0.79135919, "learning_rate": 1.8365982875044964e-06, "loss": 0.81351101, "num_input_tokens_seen": 96993420, "step": 4493, "time_per_iteration": 2.661428689956665 }, { "auxiliary_loss_clip": 0.01176772, "auxiliary_loss_mlp": 0.00765003, "balance_loss_clip": 1.05268002, "balance_loss_mlp": 1.00021887, "epoch": 0.5403715505320748, "flos": 22893771116160.0, "grad_norm": 2.2836601906519816, "language_loss": 0.76513553, "learning_rate": 1.8358219324533217e-06, "loss": 0.78455329, "num_input_tokens_seen": 97013685, "step": 4494, "time_per_iteration": 2.679168224334717 }, { "auxiliary_loss_clip": 0.01147228, "auxiliary_loss_mlp": 0.01031702, "balance_loss_clip": 1.04581559, "balance_loss_mlp": 1.02407289, "epoch": 0.5404917934227139, "flos": 30224143895040.0, "grad_norm": 2.4634098873532246, "language_loss": 0.70466864, "learning_rate": 1.8350456023078292e-06, "loss": 0.72645795, "num_input_tokens_seen": 97036060, "step": 4495, "time_per_iteration": 2.748868942260742 }, { "auxiliary_loss_clip": 0.01188486, "auxiliary_loss_mlp": 0.01039458, "balance_loss_clip": 1.05249178, "balance_loss_mlp": 1.03049898, "epoch": 0.540612036313353, "flos": 19938502615680.0, "grad_norm": 2.842309072725374, "language_loss": 0.78224218, "learning_rate": 1.8342692971857874e-06, "loss": 0.80452156, "num_input_tokens_seen": 97055260, "step": 4496, "time_per_iteration": 2.5953657627105713 }, { "auxiliary_loss_clip": 0.01151484, "auxiliary_loss_mlp": 0.01027802, "balance_loss_clip": 1.04813075, "balance_loss_mlp": 1.01975489, "epoch": 0.540732279203992, "flos": 24279599692800.0, "grad_norm": 2.558862561593949, "language_loss": 0.71488833, "learning_rate": 1.833493017204962e-06, "loss": 0.73668116, "num_input_tokens_seen": 97075365, "step": 4497, "time_per_iteration": 2.689786911010742 }, { "auxiliary_loss_clip": 0.01182492, "auxiliary_loss_mlp": 0.01029527, "balance_loss_clip": 1.05131006, "balance_loss_mlp": 1.02156949, "epoch": 0.5408525220946312, "flos": 20193216935040.0, "grad_norm": 3.6136761814672953, "language_loss": 0.77928126, "learning_rate": 1.8327167624831134e-06, "loss": 0.80140144, "num_input_tokens_seen": 97093095, "step": 4498, "time_per_iteration": 2.707052230834961 }, { "auxiliary_loss_clip": 0.0118223, "auxiliary_loss_mlp": 0.0102517, "balance_loss_clip": 1.05299509, "balance_loss_mlp": 1.01653385, "epoch": 0.5409727649852702, "flos": 24134448833280.0, "grad_norm": 1.6025674411595683, "language_loss": 0.70945507, "learning_rate": 1.831940533137999e-06, "loss": 0.73152906, "num_input_tokens_seen": 97112000, "step": 4499, "time_per_iteration": 2.617994785308838 }, { "auxiliary_loss_clip": 0.0116607, "auxiliary_loss_mlp": 0.01025113, "balance_loss_clip": 1.05221725, "balance_loss_mlp": 1.01744783, "epoch": 0.5410930078759093, "flos": 23912700220800.0, "grad_norm": 1.716419231076312, "language_loss": 0.72629642, "learning_rate": 1.8311643292873718e-06, "loss": 0.74820822, "num_input_tokens_seen": 97130820, "step": 4500, "time_per_iteration": 2.643759250640869 }, { "auxiliary_loss_clip": 0.01166173, "auxiliary_loss_mlp": 0.01026369, "balance_loss_clip": 1.04992938, "balance_loss_mlp": 1.01855516, "epoch": 0.5412132507665485, "flos": 21105132445440.0, "grad_norm": 1.7765735358090062, "language_loss": 0.88022947, "learning_rate": 1.8303881510489818e-06, "loss": 0.90215492, "num_input_tokens_seen": 97149210, "step": 4501, "time_per_iteration": 2.5760936737060547 }, { "auxiliary_loss_clip": 0.01156291, "auxiliary_loss_mlp": 0.01028013, "balance_loss_clip": 1.05046296, "balance_loss_mlp": 1.01994824, "epoch": 0.5413334936571875, "flos": 30227340205440.0, "grad_norm": 5.375738852453281, "language_loss": 0.69373363, "learning_rate": 1.829611998540574e-06, "loss": 0.71557671, "num_input_tokens_seen": 97170415, "step": 4502, "time_per_iteration": 2.7537841796875 }, { "auxiliary_loss_clip": 0.01168647, "auxiliary_loss_mlp": 0.00763482, "balance_loss_clip": 1.05000401, "balance_loss_mlp": 1.00022483, "epoch": 0.5414537365478266, "flos": 24279635606400.0, "grad_norm": 1.8223699893858867, "language_loss": 0.79811013, "learning_rate": 1.8288358718798914e-06, "loss": 0.81743145, "num_input_tokens_seen": 97189605, "step": 4503, "time_per_iteration": 2.6696269512176514 }, { "auxiliary_loss_clip": 0.01163158, "auxiliary_loss_mlp": 0.00763062, "balance_loss_clip": 1.04963231, "balance_loss_mlp": 1.00022793, "epoch": 0.5415739794384657, "flos": 16654543735680.0, "grad_norm": 1.7694664099578135, "language_loss": 0.72766364, "learning_rate": 1.8280597711846703e-06, "loss": 0.74692583, "num_input_tokens_seen": 97207845, "step": 4504, "time_per_iteration": 2.732043981552124 }, { "auxiliary_loss_clip": 0.01166492, "auxiliary_loss_mlp": 0.01025417, "balance_loss_clip": 1.05205083, "balance_loss_mlp": 1.01717341, "epoch": 0.5416942223291048, "flos": 23185724860800.0, "grad_norm": 1.9143258249297963, "language_loss": 0.83254319, "learning_rate": 1.8272836965726455e-06, "loss": 0.85446233, "num_input_tokens_seen": 97226780, "step": 4505, "time_per_iteration": 2.6476781368255615 }, { "auxiliary_loss_clip": 0.01108364, "auxiliary_loss_mlp": 0.01035426, "balance_loss_clip": 1.04173112, "balance_loss_mlp": 1.02718902, "epoch": 0.5418144652197439, "flos": 20303247271680.0, "grad_norm": 1.7067042374171384, "language_loss": 0.77912915, "learning_rate": 1.8265076481615461e-06, "loss": 0.80056709, "num_input_tokens_seen": 97246695, "step": 4506, "time_per_iteration": 2.758835554122925 }, { "auxiliary_loss_clip": 0.01156675, "auxiliary_loss_mlp": 0.01026097, "balance_loss_clip": 1.05302978, "balance_loss_mlp": 1.01687074, "epoch": 0.541934708110383, "flos": 12458633431680.0, "grad_norm": 2.1650426962876375, "language_loss": 0.87106156, "learning_rate": 1.8257316260690987e-06, "loss": 0.89288926, "num_input_tokens_seen": 97264480, "step": 4507, "time_per_iteration": 2.654953718185425 }, { "auxiliary_loss_clip": 0.01168251, "auxiliary_loss_mlp": 0.01024309, "balance_loss_clip": 1.04996002, "balance_loss_mlp": 1.01687658, "epoch": 0.5420549510010221, "flos": 21253802837760.0, "grad_norm": 1.5857920742515368, "language_loss": 0.76206845, "learning_rate": 1.8249556304130254e-06, "loss": 0.78399408, "num_input_tokens_seen": 97285760, "step": 4508, "time_per_iteration": 2.6863386631011963 }, { "auxiliary_loss_clip": 0.01144567, "auxiliary_loss_mlp": 0.01033703, "balance_loss_clip": 1.04709375, "balance_loss_mlp": 1.02501822, "epoch": 0.5421751938916611, "flos": 29490524519040.0, "grad_norm": 2.1849595519581646, "language_loss": 0.68974149, "learning_rate": 1.824179661311044e-06, "loss": 0.71152425, "num_input_tokens_seen": 97304510, "step": 4509, "time_per_iteration": 3.8503623008728027 }, { "auxiliary_loss_clip": 0.01123665, "auxiliary_loss_mlp": 0.01029402, "balance_loss_clip": 1.0425185, "balance_loss_mlp": 1.02105188, "epoch": 0.5422954367823003, "flos": 18734238311040.0, "grad_norm": 1.8529497804113326, "language_loss": 0.80026221, "learning_rate": 1.823403718880868e-06, "loss": 0.82179284, "num_input_tokens_seen": 97323270, "step": 4510, "time_per_iteration": 3.6918458938598633 }, { "auxiliary_loss_clip": 0.01153703, "auxiliary_loss_mlp": 0.01025441, "balance_loss_clip": 1.0466814, "balance_loss_mlp": 1.01757312, "epoch": 0.5424156796729394, "flos": 39969006940800.0, "grad_norm": 1.6845400144598266, "language_loss": 0.66504961, "learning_rate": 1.822627803240207e-06, "loss": 0.68684107, "num_input_tokens_seen": 97345600, "step": 4511, "time_per_iteration": 3.7475943565368652 }, { "auxiliary_loss_clip": 0.01144557, "auxiliary_loss_mlp": 0.01034144, "balance_loss_clip": 1.0473423, "balance_loss_mlp": 1.025895, "epoch": 0.5425359225635784, "flos": 11546538353280.0, "grad_norm": 2.2147829896992226, "language_loss": 0.85110098, "learning_rate": 1.8218519145067675e-06, "loss": 0.87288797, "num_input_tokens_seen": 97361220, "step": 4512, "time_per_iteration": 2.69085693359375 }, { "auxiliary_loss_clip": 0.0113553, "auxiliary_loss_mlp": 0.01028299, "balance_loss_clip": 1.04559803, "balance_loss_mlp": 1.01988244, "epoch": 0.5426561654542175, "flos": 20229702174720.0, "grad_norm": 1.894324577942147, "language_loss": 0.89488119, "learning_rate": 1.8210760527982508e-06, "loss": 0.91651946, "num_input_tokens_seen": 97381505, "step": 4513, "time_per_iteration": 2.7229795455932617 }, { "auxiliary_loss_clip": 0.01157838, "auxiliary_loss_mlp": 0.0076316, "balance_loss_clip": 1.05284333, "balance_loss_mlp": 1.00021458, "epoch": 0.5427764083448566, "flos": 21871681614720.0, "grad_norm": 4.9002028909502195, "language_loss": 0.75170904, "learning_rate": 1.8203002182323552e-06, "loss": 0.77091897, "num_input_tokens_seen": 97399060, "step": 4514, "time_per_iteration": 3.5980536937713623 }, { "auxiliary_loss_clip": 0.01156577, "auxiliary_loss_mlp": 0.01029276, "balance_loss_clip": 1.05035305, "balance_loss_mlp": 1.02118218, "epoch": 0.5428966512354957, "flos": 19640946349440.0, "grad_norm": 1.7534062060695885, "language_loss": 0.75671422, "learning_rate": 1.819524410926773e-06, "loss": 0.77857274, "num_input_tokens_seen": 97416740, "step": 4515, "time_per_iteration": 2.7132346630096436 }, { "auxiliary_loss_clip": 0.01108971, "auxiliary_loss_mlp": 0.01033916, "balance_loss_clip": 1.04381204, "balance_loss_mlp": 1.02539229, "epoch": 0.5430168941261347, "flos": 22382187661440.0, "grad_norm": 1.684777980531034, "language_loss": 0.7695142, "learning_rate": 1.8187486309991944e-06, "loss": 0.79094303, "num_input_tokens_seen": 97437620, "step": 4516, "time_per_iteration": 2.7904083728790283 }, { "auxiliary_loss_clip": 0.01176665, "auxiliary_loss_mlp": 0.01028257, "balance_loss_clip": 1.05450773, "balance_loss_mlp": 1.02057362, "epoch": 0.5431371370167739, "flos": 18764187275520.0, "grad_norm": 1.649096604608107, "language_loss": 0.77478206, "learning_rate": 1.817972878567304e-06, "loss": 0.79683125, "num_input_tokens_seen": 97456275, "step": 4517, "time_per_iteration": 2.612614154815674 }, { "auxiliary_loss_clip": 0.01158041, "auxiliary_loss_mlp": 0.01028161, "balance_loss_clip": 1.04845107, "balance_loss_mlp": 1.02038264, "epoch": 0.543257379907413, "flos": 18806023641600.0, "grad_norm": 1.6846500083166898, "language_loss": 0.76326406, "learning_rate": 1.8171971537487834e-06, "loss": 0.78512609, "num_input_tokens_seen": 97474925, "step": 4518, "time_per_iteration": 2.720061779022217 }, { "auxiliary_loss_clip": 0.01184163, "auxiliary_loss_mlp": 0.01029855, "balance_loss_clip": 1.05149031, "balance_loss_mlp": 1.02153432, "epoch": 0.543377622798052, "flos": 17493381025920.0, "grad_norm": 1.8953825179276578, "language_loss": 0.80521321, "learning_rate": 1.8164214566613093e-06, "loss": 0.82735336, "num_input_tokens_seen": 97493550, "step": 4519, "time_per_iteration": 2.6553099155426025 }, { "auxiliary_loss_clip": 0.0118318, "auxiliary_loss_mlp": 0.01035449, "balance_loss_clip": 1.05283046, "balance_loss_mlp": 1.02687144, "epoch": 0.5434978656886912, "flos": 18989311766400.0, "grad_norm": 3.416232634415561, "language_loss": 0.65322006, "learning_rate": 1.8156457874225547e-06, "loss": 0.67540634, "num_input_tokens_seen": 97512010, "step": 4520, "time_per_iteration": 2.693605422973633 }, { "auxiliary_loss_clip": 0.01147457, "auxiliary_loss_mlp": 0.01026347, "balance_loss_clip": 1.04976714, "balance_loss_mlp": 1.01872957, "epoch": 0.5436181085793302, "flos": 17274936464640.0, "grad_norm": 2.1845858881761058, "language_loss": 0.80606848, "learning_rate": 1.814870146150187e-06, "loss": 0.82780653, "num_input_tokens_seen": 97530120, "step": 4521, "time_per_iteration": 2.7053284645080566 }, { "auxiliary_loss_clip": 0.01161907, "auxiliary_loss_mlp": 0.01027551, "balance_loss_clip": 1.04850864, "balance_loss_mlp": 1.01892626, "epoch": 0.5437383514699693, "flos": 19098587917440.0, "grad_norm": 2.7115003803420086, "language_loss": 0.78827161, "learning_rate": 1.814094532961871e-06, "loss": 0.81016624, "num_input_tokens_seen": 97548695, "step": 4522, "time_per_iteration": 2.6635892391204834 }, { "auxiliary_loss_clip": 0.01127147, "auxiliary_loss_mlp": 0.01027749, "balance_loss_clip": 1.04549885, "balance_loss_mlp": 1.01912451, "epoch": 0.5438585943606085, "flos": 22602715211520.0, "grad_norm": 4.723978148252467, "language_loss": 0.83982038, "learning_rate": 1.8133189479752666e-06, "loss": 0.86136937, "num_input_tokens_seen": 97567625, "step": 4523, "time_per_iteration": 2.7019596099853516 }, { "auxiliary_loss_clip": 0.01183196, "auxiliary_loss_mlp": 0.01030521, "balance_loss_clip": 1.05239224, "balance_loss_mlp": 1.02258182, "epoch": 0.5439788372512475, "flos": 21798495653760.0, "grad_norm": 2.3310760880464567, "language_loss": 0.82012916, "learning_rate": 1.8125433913080292e-06, "loss": 0.84226632, "num_input_tokens_seen": 97585325, "step": 4524, "time_per_iteration": 2.644388437271118 }, { "auxiliary_loss_clip": 0.01062181, "auxiliary_loss_mlp": 0.01028621, "balance_loss_clip": 1.03824019, "balance_loss_mlp": 1.02059817, "epoch": 0.5440990801418866, "flos": 16399362539520.0, "grad_norm": 2.008489841221986, "language_loss": 0.82773662, "learning_rate": 1.811767863077811e-06, "loss": 0.84864467, "num_input_tokens_seen": 97604275, "step": 4525, "time_per_iteration": 3.0538370609283447 }, { "auxiliary_loss_clip": 0.01104434, "auxiliary_loss_mlp": 0.0102878, "balance_loss_clip": 1.04565489, "balance_loss_mlp": 1.02119207, "epoch": 0.5442193230325257, "flos": 21615638492160.0, "grad_norm": 1.7610557598916503, "language_loss": 0.78276402, "learning_rate": 1.8109923634022577e-06, "loss": 0.80409616, "num_input_tokens_seen": 97624300, "step": 4526, "time_per_iteration": 3.1037790775299072 }, { "auxiliary_loss_clip": 0.01185069, "auxiliary_loss_mlp": 0.01029181, "balance_loss_clip": 1.05249953, "balance_loss_mlp": 1.02139115, "epoch": 0.5443395659231648, "flos": 15481198062720.0, "grad_norm": 2.613250103838892, "language_loss": 0.86778247, "learning_rate": 1.8102168923990128e-06, "loss": 0.889925, "num_input_tokens_seen": 97637845, "step": 4527, "time_per_iteration": 2.566715717315674 }, { "auxiliary_loss_clip": 0.01171183, "auxiliary_loss_mlp": 0.00763176, "balance_loss_clip": 1.05235362, "balance_loss_mlp": 1.00026202, "epoch": 0.5444598088138038, "flos": 18770436241920.0, "grad_norm": 2.2449723933140375, "language_loss": 0.79943764, "learning_rate": 1.809441450185714e-06, "loss": 0.81878126, "num_input_tokens_seen": 97656330, "step": 4528, "time_per_iteration": 2.611246347427368 }, { "auxiliary_loss_clip": 0.0115899, "auxiliary_loss_mlp": 0.01027683, "balance_loss_clip": 1.04768562, "balance_loss_mlp": 1.01949286, "epoch": 0.544580051704443, "flos": 21142335957120.0, "grad_norm": 2.4308110276056607, "language_loss": 0.732391, "learning_rate": 1.8086660368799958e-06, "loss": 0.75425774, "num_input_tokens_seen": 97674380, "step": 4529, "time_per_iteration": 2.6160969734191895 }, { "auxiliary_loss_clip": 0.0115846, "auxiliary_loss_mlp": 0.01027274, "balance_loss_clip": 1.05133915, "balance_loss_mlp": 1.0189178, "epoch": 0.5447002945950821, "flos": 32491508054400.0, "grad_norm": 1.879335170582177, "language_loss": 0.77392018, "learning_rate": 1.807890652599488e-06, "loss": 0.79577756, "num_input_tokens_seen": 97698765, "step": 4530, "time_per_iteration": 2.787216901779175 }, { "auxiliary_loss_clip": 0.01179685, "auxiliary_loss_mlp": 0.01029359, "balance_loss_clip": 1.05200505, "balance_loss_mlp": 1.02203381, "epoch": 0.5448205374857211, "flos": 11798307757440.0, "grad_norm": 1.901934687182875, "language_loss": 0.82570177, "learning_rate": 1.8071152974618156e-06, "loss": 0.84779221, "num_input_tokens_seen": 97716565, "step": 4531, "time_per_iteration": 2.5854718685150146 }, { "auxiliary_loss_clip": 0.01139922, "auxiliary_loss_mlp": 0.00762895, "balance_loss_clip": 1.04611719, "balance_loss_mlp": 1.00018883, "epoch": 0.5449407803763603, "flos": 24133766474880.0, "grad_norm": 2.110587676923799, "language_loss": 0.78649706, "learning_rate": 1.806339971584599e-06, "loss": 0.80552518, "num_input_tokens_seen": 97733225, "step": 4532, "time_per_iteration": 2.8077473640441895 }, { "auxiliary_loss_clip": 0.01185498, "auxiliary_loss_mlp": 0.01026353, "balance_loss_clip": 1.05445945, "balance_loss_mlp": 1.01893187, "epoch": 0.5450610232669993, "flos": 23258551685760.0, "grad_norm": 6.907644634360941, "language_loss": 0.85150373, "learning_rate": 1.8055646750854546e-06, "loss": 0.8736223, "num_input_tokens_seen": 97752735, "step": 4533, "time_per_iteration": 2.64298939704895 }, { "auxiliary_loss_clip": 0.01161976, "auxiliary_loss_mlp": 0.01032439, "balance_loss_clip": 1.05165577, "balance_loss_mlp": 1.02460647, "epoch": 0.5451812661576384, "flos": 17785083375360.0, "grad_norm": 2.9588510249166644, "language_loss": 0.81567597, "learning_rate": 1.8047894080819945e-06, "loss": 0.83762014, "num_input_tokens_seen": 97769985, "step": 4534, "time_per_iteration": 2.6212339401245117 }, { "auxiliary_loss_clip": 0.0107552, "auxiliary_loss_mlp": 0.01003324, "balance_loss_clip": 1.01361513, "balance_loss_mlp": 1.00194156, "epoch": 0.5453015090482776, "flos": 71062586513280.0, "grad_norm": 0.7303625197151306, "language_loss": 0.63176405, "learning_rate": 1.8040141706918258e-06, "loss": 0.65255249, "num_input_tokens_seen": 97831225, "step": 4535, "time_per_iteration": 4.239135265350342 }, { "auxiliary_loss_clip": 0.0115744, "auxiliary_loss_mlp": 0.01033493, "balance_loss_clip": 1.05194902, "balance_loss_mlp": 1.02526188, "epoch": 0.5454217519389166, "flos": 25552201622400.0, "grad_norm": 2.0817249009636942, "language_loss": 0.76880682, "learning_rate": 1.8032389630325525e-06, "loss": 0.79071617, "num_input_tokens_seen": 97849975, "step": 4536, "time_per_iteration": 4.225529432296753 }, { "auxiliary_loss_clip": 0.01152013, "auxiliary_loss_mlp": 0.01026975, "balance_loss_clip": 1.04570961, "balance_loss_mlp": 1.01870775, "epoch": 0.5455419948295557, "flos": 23658345037440.0, "grad_norm": 1.7680439576164435, "language_loss": 0.75388366, "learning_rate": 1.8024637852217707e-06, "loss": 0.77567351, "num_input_tokens_seen": 97869700, "step": 4537, "time_per_iteration": 3.617051601409912 }, { "auxiliary_loss_clip": 0.01155004, "auxiliary_loss_mlp": 0.01029881, "balance_loss_clip": 1.05147028, "balance_loss_mlp": 1.02186418, "epoch": 0.5456622377201948, "flos": 23403989854080.0, "grad_norm": 1.6896772536276168, "language_loss": 0.8450278, "learning_rate": 1.8016886373770766e-06, "loss": 0.86687666, "num_input_tokens_seen": 97888215, "step": 4538, "time_per_iteration": 2.7369158267974854 }, { "auxiliary_loss_clip": 0.01153455, "auxiliary_loss_mlp": 0.01032974, "balance_loss_clip": 1.05047536, "balance_loss_mlp": 1.02522492, "epoch": 0.5457824806108339, "flos": 23988040997760.0, "grad_norm": 2.4476252960590394, "language_loss": 0.78939152, "learning_rate": 1.8009135196160579e-06, "loss": 0.81125581, "num_input_tokens_seen": 97907090, "step": 4539, "time_per_iteration": 2.7281315326690674 }, { "auxiliary_loss_clip": 0.01136008, "auxiliary_loss_mlp": 0.01030114, "balance_loss_clip": 1.04636788, "balance_loss_mlp": 1.02219236, "epoch": 0.545902723501473, "flos": 22565870835840.0, "grad_norm": 1.862390761852363, "language_loss": 0.84493172, "learning_rate": 1.8001384320563e-06, "loss": 0.86659288, "num_input_tokens_seen": 97927345, "step": 4540, "time_per_iteration": 3.65763521194458 }, { "auxiliary_loss_clip": 0.01075121, "auxiliary_loss_mlp": 0.0100203, "balance_loss_clip": 1.01305199, "balance_loss_mlp": 1.00062335, "epoch": 0.5460229663921121, "flos": 55198399685760.0, "grad_norm": 0.7809450905760909, "language_loss": 0.57756686, "learning_rate": 1.7993633748153833e-06, "loss": 0.59833837, "num_input_tokens_seen": 97981950, "step": 4541, "time_per_iteration": 3.1040353775024414 }, { "auxiliary_loss_clip": 0.01173766, "auxiliary_loss_mlp": 0.0103115, "balance_loss_clip": 1.05298626, "balance_loss_mlp": 1.02263808, "epoch": 0.5461432092827512, "flos": 15413866018560.0, "grad_norm": 1.729082725606598, "language_loss": 0.7300936, "learning_rate": 1.7985883480108834e-06, "loss": 0.75214279, "num_input_tokens_seen": 97999585, "step": 4542, "time_per_iteration": 2.6568305492401123 }, { "auxiliary_loss_clip": 0.01163243, "auxiliary_loss_mlp": 0.01025186, "balance_loss_clip": 1.04979587, "balance_loss_mlp": 1.01770532, "epoch": 0.5462634521733902, "flos": 24024921287040.0, "grad_norm": 1.621367202525802, "language_loss": 0.72004926, "learning_rate": 1.797813351760371e-06, "loss": 0.74193358, "num_input_tokens_seen": 98021290, "step": 4543, "time_per_iteration": 2.6556859016418457 }, { "auxiliary_loss_clip": 0.01183194, "auxiliary_loss_mlp": 0.01032123, "balance_loss_clip": 1.05114889, "balance_loss_mlp": 1.02406406, "epoch": 0.5463836950640293, "flos": 22820944291200.0, "grad_norm": 1.6633343504205937, "language_loss": 0.78281111, "learning_rate": 1.7970383861814116e-06, "loss": 0.8049643, "num_input_tokens_seen": 98041060, "step": 4544, "time_per_iteration": 2.6639328002929688 }, { "auxiliary_loss_clip": 0.01170919, "auxiliary_loss_mlp": 0.01032012, "balance_loss_clip": 1.05248427, "balance_loss_mlp": 1.02394795, "epoch": 0.5465039379546685, "flos": 20448290390400.0, "grad_norm": 1.9790088211230228, "language_loss": 0.73938745, "learning_rate": 1.7962634513915684e-06, "loss": 0.76141679, "num_input_tokens_seen": 98058410, "step": 4545, "time_per_iteration": 2.64389705657959 }, { "auxiliary_loss_clip": 0.01180899, "auxiliary_loss_mlp": 0.01033344, "balance_loss_clip": 1.05189002, "balance_loss_mlp": 1.02514827, "epoch": 0.5466241808453075, "flos": 17343310003200.0, "grad_norm": 1.832875381525187, "language_loss": 0.79387629, "learning_rate": 1.7954885475083969e-06, "loss": 0.8160187, "num_input_tokens_seen": 98076080, "step": 4546, "time_per_iteration": 2.6242525577545166 }, { "auxiliary_loss_clip": 0.01188018, "auxiliary_loss_mlp": 0.01030913, "balance_loss_clip": 1.05474842, "balance_loss_mlp": 1.02303898, "epoch": 0.5467444237359466, "flos": 21617039122560.0, "grad_norm": 2.214701115257723, "language_loss": 0.72015762, "learning_rate": 1.7947136746494513e-06, "loss": 0.74234688, "num_input_tokens_seen": 98096995, "step": 4547, "time_per_iteration": 2.6596758365631104 }, { "auxiliary_loss_clip": 0.01168771, "auxiliary_loss_mlp": 0.01026795, "balance_loss_clip": 1.0522635, "balance_loss_mlp": 1.01890922, "epoch": 0.5468646666265857, "flos": 24170467196160.0, "grad_norm": 2.1203361590030942, "language_loss": 0.87858027, "learning_rate": 1.793938832932277e-06, "loss": 0.90053594, "num_input_tokens_seen": 98115105, "step": 4548, "time_per_iteration": 2.7142553329467773 }, { "auxiliary_loss_clip": 0.01183268, "auxiliary_loss_mlp": 0.01028493, "balance_loss_clip": 1.05294681, "balance_loss_mlp": 1.02055943, "epoch": 0.5469849095172248, "flos": 27527001505920.0, "grad_norm": 2.158356809372934, "language_loss": 0.70749694, "learning_rate": 1.7931640224744185e-06, "loss": 0.72961456, "num_input_tokens_seen": 98135655, "step": 4549, "time_per_iteration": 2.6343131065368652 }, { "auxiliary_loss_clip": 0.01129578, "auxiliary_loss_mlp": 0.01027375, "balance_loss_clip": 1.0429287, "balance_loss_mlp": 1.01944768, "epoch": 0.5471051524078638, "flos": 27964680727680.0, "grad_norm": 1.7834254866817754, "language_loss": 0.73573101, "learning_rate": 1.7923892433934127e-06, "loss": 0.7573005, "num_input_tokens_seen": 98156730, "step": 4550, "time_per_iteration": 2.7674665451049805 }, { "auxiliary_loss_clip": 0.01158711, "auxiliary_loss_mlp": 0.0076295, "balance_loss_clip": 1.05155396, "balance_loss_mlp": 1.00030696, "epoch": 0.547225395298503, "flos": 18150510389760.0, "grad_norm": 1.847272015928521, "language_loss": 0.78900027, "learning_rate": 1.7916144958067939e-06, "loss": 0.80821693, "num_input_tokens_seen": 98174590, "step": 4551, "time_per_iteration": 2.6805801391601562 }, { "auxiliary_loss_clip": 0.01172015, "auxiliary_loss_mlp": 0.01030884, "balance_loss_clip": 1.05178595, "balance_loss_mlp": 1.02285492, "epoch": 0.5473456381891421, "flos": 21361498790400.0, "grad_norm": 1.7096268688851672, "language_loss": 0.79019624, "learning_rate": 1.7908397798320905e-06, "loss": 0.81222522, "num_input_tokens_seen": 98194325, "step": 4552, "time_per_iteration": 2.776017189025879 }, { "auxiliary_loss_clip": 0.01171461, "auxiliary_loss_mlp": 0.00763405, "balance_loss_clip": 1.05235422, "balance_loss_mlp": 1.00030422, "epoch": 0.5474658810797811, "flos": 19932145908480.0, "grad_norm": 2.2476663014557485, "language_loss": 0.74315387, "learning_rate": 1.7900650955868265e-06, "loss": 0.76250255, "num_input_tokens_seen": 98213970, "step": 4553, "time_per_iteration": 2.6954760551452637 }, { "auxiliary_loss_clip": 0.01171459, "auxiliary_loss_mlp": 0.00762899, "balance_loss_clip": 1.05623746, "balance_loss_mlp": 1.00028205, "epoch": 0.5475861239704203, "flos": 50476217264640.0, "grad_norm": 1.4388617124099763, "language_loss": 0.76473844, "learning_rate": 1.7892904431885202e-06, "loss": 0.78408194, "num_input_tokens_seen": 98241145, "step": 4554, "time_per_iteration": 2.9159655570983887 }, { "auxiliary_loss_clip": 0.0112515, "auxiliary_loss_mlp": 0.01025334, "balance_loss_clip": 1.0422107, "balance_loss_mlp": 1.01776373, "epoch": 0.5477063668610593, "flos": 20705123612160.0, "grad_norm": 1.949644924001728, "language_loss": 0.75754976, "learning_rate": 1.788515822754686e-06, "loss": 0.77905464, "num_input_tokens_seen": 98261565, "step": 4555, "time_per_iteration": 2.7779712677001953 }, { "auxiliary_loss_clip": 0.01141499, "auxiliary_loss_mlp": 0.01030776, "balance_loss_clip": 1.04708934, "balance_loss_mlp": 1.02241969, "epoch": 0.5478266097516984, "flos": 19609740408960.0, "grad_norm": 2.1267169766157408, "language_loss": 0.78020424, "learning_rate": 1.7877412344028335e-06, "loss": 0.80192697, "num_input_tokens_seen": 98281370, "step": 4556, "time_per_iteration": 2.7188591957092285 }, { "auxiliary_loss_clip": 0.0117127, "auxiliary_loss_mlp": 0.01028882, "balance_loss_clip": 1.05191851, "balance_loss_mlp": 1.02141929, "epoch": 0.5479468526423376, "flos": 12896599962240.0, "grad_norm": 2.3325921088102066, "language_loss": 0.77402323, "learning_rate": 1.7869666782504668e-06, "loss": 0.7960248, "num_input_tokens_seen": 98297950, "step": 4557, "time_per_iteration": 2.7008039951324463 }, { "auxiliary_loss_clip": 0.01140197, "auxiliary_loss_mlp": 0.01027108, "balance_loss_clip": 1.04468465, "balance_loss_mlp": 1.01934743, "epoch": 0.5480670955329766, "flos": 18588800142720.0, "grad_norm": 2.6828276861388702, "language_loss": 0.68524241, "learning_rate": 1.7861921544150867e-06, "loss": 0.7069155, "num_input_tokens_seen": 98316800, "step": 4558, "time_per_iteration": 2.666175127029419 }, { "auxiliary_loss_clip": 0.0109982, "auxiliary_loss_mlp": 0.00762887, "balance_loss_clip": 1.04595375, "balance_loss_mlp": 1.00018752, "epoch": 0.5481873384236157, "flos": 15954608338560.0, "grad_norm": 1.9300026080164525, "language_loss": 0.7678709, "learning_rate": 1.7854176630141856e-06, "loss": 0.78649795, "num_input_tokens_seen": 98333935, "step": 4559, "time_per_iteration": 2.7772793769836426 }, { "auxiliary_loss_clip": 0.01185222, "auxiliary_loss_mlp": 0.01027723, "balance_loss_clip": 1.05295575, "balance_loss_mlp": 1.01909828, "epoch": 0.5483075813142548, "flos": 22783812606720.0, "grad_norm": 2.3093519149136963, "language_loss": 0.83958107, "learning_rate": 1.784643204165255e-06, "loss": 0.86171055, "num_input_tokens_seen": 98353255, "step": 4560, "time_per_iteration": 2.6628353595733643 }, { "auxiliary_loss_clip": 0.01165959, "auxiliary_loss_mlp": 0.01029378, "balance_loss_clip": 1.05320811, "balance_loss_mlp": 1.0216887, "epoch": 0.5484278242048939, "flos": 19317212046720.0, "grad_norm": 1.927334921178401, "language_loss": 0.77230191, "learning_rate": 1.7838687779857783e-06, "loss": 0.79425532, "num_input_tokens_seen": 98371130, "step": 4561, "time_per_iteration": 3.650987386703491 }, { "auxiliary_loss_clip": 0.01145757, "auxiliary_loss_mlp": 0.010332, "balance_loss_clip": 1.04698718, "balance_loss_mlp": 1.02559733, "epoch": 0.5485480670955329, "flos": 22816024128000.0, "grad_norm": 1.750582957506157, "language_loss": 0.6399399, "learning_rate": 1.7830943845932366e-06, "loss": 0.66172951, "num_input_tokens_seen": 98390455, "step": 4562, "time_per_iteration": 3.948986768722534 }, { "auxiliary_loss_clip": 0.01159143, "auxiliary_loss_mlp": 0.01029073, "balance_loss_clip": 1.05190825, "balance_loss_mlp": 1.0213306, "epoch": 0.5486683099861721, "flos": 22671304231680.0, "grad_norm": 1.6967465181504904, "language_loss": 0.74984336, "learning_rate": 1.7823200241051044e-06, "loss": 0.77172559, "num_input_tokens_seen": 98409370, "step": 4563, "time_per_iteration": 3.6697232723236084 }, { "auxiliary_loss_clip": 0.011843, "auxiliary_loss_mlp": 0.01027382, "balance_loss_clip": 1.05460441, "balance_loss_mlp": 1.01988339, "epoch": 0.5487885528768112, "flos": 23149383275520.0, "grad_norm": 2.0558343119048406, "language_loss": 0.80022717, "learning_rate": 1.7815456966388513e-06, "loss": 0.82234401, "num_input_tokens_seen": 98428465, "step": 4564, "time_per_iteration": 2.6580450534820557 }, { "auxiliary_loss_clip": 0.01141622, "auxiliary_loss_mlp": 0.01031508, "balance_loss_clip": 1.04683816, "balance_loss_mlp": 1.02285957, "epoch": 0.5489087957674502, "flos": 22053928245120.0, "grad_norm": 2.1046618929031395, "language_loss": 0.81191516, "learning_rate": 1.780771402311943e-06, "loss": 0.83364642, "num_input_tokens_seen": 98447300, "step": 4565, "time_per_iteration": 2.7053802013397217 }, { "auxiliary_loss_clip": 0.01156897, "auxiliary_loss_mlp": 0.01026204, "balance_loss_clip": 1.05154777, "balance_loss_mlp": 1.0174365, "epoch": 0.5490290386580894, "flos": 24315977191680.0, "grad_norm": 1.7679812929156262, "language_loss": 0.78456712, "learning_rate": 1.7799971412418374e-06, "loss": 0.80639815, "num_input_tokens_seen": 98468695, "step": 4566, "time_per_iteration": 3.5916011333465576 }, { "auxiliary_loss_clip": 0.01139046, "auxiliary_loss_mlp": 0.01027038, "balance_loss_clip": 1.04667687, "balance_loss_mlp": 1.01863945, "epoch": 0.5491492815487284, "flos": 18294942977280.0, "grad_norm": 2.887471909176422, "language_loss": 0.74158472, "learning_rate": 1.7792229135459918e-06, "loss": 0.76324558, "num_input_tokens_seen": 98485345, "step": 4567, "time_per_iteration": 2.649627923965454 }, { "auxiliary_loss_clip": 0.01036907, "auxiliary_loss_mlp": 0.01001141, "balance_loss_clip": 1.02226853, "balance_loss_mlp": 0.9998123, "epoch": 0.5492695244393675, "flos": 64550257050240.0, "grad_norm": 0.7402766086861442, "language_loss": 0.61564469, "learning_rate": 1.7784487193418538e-06, "loss": 0.63602519, "num_input_tokens_seen": 98543195, "step": 4568, "time_per_iteration": 3.183910369873047 }, { "auxiliary_loss_clip": 0.01123163, "auxiliary_loss_mlp": 0.01028216, "balance_loss_clip": 1.04024243, "balance_loss_mlp": 1.01927483, "epoch": 0.5493897673300067, "flos": 17379579761280.0, "grad_norm": 2.3374596128474385, "language_loss": 0.61091942, "learning_rate": 1.7776745587468698e-06, "loss": 0.63243324, "num_input_tokens_seen": 98560620, "step": 4569, "time_per_iteration": 2.707425594329834 }, { "auxiliary_loss_clip": 0.01179964, "auxiliary_loss_mlp": 0.01028095, "balance_loss_clip": 1.04986048, "balance_loss_mlp": 1.02018547, "epoch": 0.5495100102206457, "flos": 19901765980800.0, "grad_norm": 2.3719672218247805, "language_loss": 0.8204518, "learning_rate": 1.7769004318784776e-06, "loss": 0.8425324, "num_input_tokens_seen": 98578265, "step": 4570, "time_per_iteration": 2.6281685829162598 }, { "auxiliary_loss_clip": 0.01168434, "auxiliary_loss_mlp": 0.01022627, "balance_loss_clip": 1.05067277, "balance_loss_mlp": 1.01502168, "epoch": 0.5496302531112848, "flos": 16727190992640.0, "grad_norm": 1.700774211398538, "language_loss": 0.80817425, "learning_rate": 1.776126338854113e-06, "loss": 0.83008492, "num_input_tokens_seen": 98596055, "step": 4571, "time_per_iteration": 2.624265432357788 }, { "auxiliary_loss_clip": 0.01164698, "auxiliary_loss_mlp": 0.01027553, "balance_loss_clip": 1.05206084, "balance_loss_mlp": 1.01974809, "epoch": 0.5497504960019239, "flos": 24572343536640.0, "grad_norm": 1.7097697227246709, "language_loss": 0.84600002, "learning_rate": 1.7753522797912044e-06, "loss": 0.86792254, "num_input_tokens_seen": 98616140, "step": 4572, "time_per_iteration": 2.717280864715576 }, { "auxiliary_loss_clip": 0.01162467, "auxiliary_loss_mlp": 0.01032881, "balance_loss_clip": 1.04970026, "balance_loss_mlp": 1.02411938, "epoch": 0.549870738892563, "flos": 15450494912640.0, "grad_norm": 2.11000163455766, "language_loss": 0.69871449, "learning_rate": 1.7745782548071765e-06, "loss": 0.72066796, "num_input_tokens_seen": 98633035, "step": 4573, "time_per_iteration": 2.6669082641601562 }, { "auxiliary_loss_clip": 0.01136988, "auxiliary_loss_mlp": 0.01027152, "balance_loss_clip": 1.05155861, "balance_loss_mlp": 1.01945734, "epoch": 0.549990981783202, "flos": 21069114082560.0, "grad_norm": 1.7683869966856318, "language_loss": 0.74050939, "learning_rate": 1.7738042640194482e-06, "loss": 0.76215082, "num_input_tokens_seen": 98652700, "step": 4574, "time_per_iteration": 2.774610757827759 }, { "auxiliary_loss_clip": 0.01180657, "auxiliary_loss_mlp": 0.01026154, "balance_loss_clip": 1.05131233, "balance_loss_mlp": 1.01752973, "epoch": 0.5501112246738411, "flos": 21395901041280.0, "grad_norm": 1.8600651152152181, "language_loss": 0.70856273, "learning_rate": 1.7730303075454335e-06, "loss": 0.73063082, "num_input_tokens_seen": 98671590, "step": 4575, "time_per_iteration": 2.655052423477173 }, { "auxiliary_loss_clip": 0.01146434, "auxiliary_loss_mlp": 0.01026514, "balance_loss_clip": 1.04912949, "balance_loss_mlp": 1.01846719, "epoch": 0.5502314675644803, "flos": 17456931699840.0, "grad_norm": 1.870820588835222, "language_loss": 0.84624004, "learning_rate": 1.7722563855025402e-06, "loss": 0.86796951, "num_input_tokens_seen": 98689620, "step": 4576, "time_per_iteration": 2.760298252105713 }, { "auxiliary_loss_clip": 0.01153204, "auxiliary_loss_mlp": 0.01028006, "balance_loss_clip": 1.04416394, "balance_loss_mlp": 1.02021575, "epoch": 0.5503517104551193, "flos": 24310410583680.0, "grad_norm": 2.4447962244975723, "language_loss": 0.7098543, "learning_rate": 1.7714824980081721e-06, "loss": 0.73166645, "num_input_tokens_seen": 98708915, "step": 4577, "time_per_iteration": 2.6460909843444824 }, { "auxiliary_loss_clip": 0.01165777, "auxiliary_loss_mlp": 0.01023969, "balance_loss_clip": 1.05292392, "balance_loss_mlp": 1.0160594, "epoch": 0.5504719533457584, "flos": 22419427086720.0, "grad_norm": 1.6554825923371679, "language_loss": 0.73829013, "learning_rate": 1.7707086451797276e-06, "loss": 0.76018757, "num_input_tokens_seen": 98729790, "step": 4578, "time_per_iteration": 2.7348670959472656 }, { "auxiliary_loss_clip": 0.01040903, "auxiliary_loss_mlp": 0.01001147, "balance_loss_clip": 1.00943279, "balance_loss_mlp": 0.99981195, "epoch": 0.5505921962363975, "flos": 67294155968640.0, "grad_norm": 0.6973790928398105, "language_loss": 0.52266467, "learning_rate": 1.7699348271345993e-06, "loss": 0.54308522, "num_input_tokens_seen": 98792415, "step": 4579, "time_per_iteration": 3.2107629776000977 }, { "auxiliary_loss_clip": 0.01036275, "auxiliary_loss_mlp": 0.01003384, "balance_loss_clip": 1.00981236, "balance_loss_mlp": 1.00201893, "epoch": 0.5507124391270366, "flos": 45685125578880.0, "grad_norm": 0.9151903958255929, "language_loss": 0.54422748, "learning_rate": 1.7691610439901753e-06, "loss": 0.56462407, "num_input_tokens_seen": 98855350, "step": 4580, "time_per_iteration": 3.3387930393218994 }, { "auxiliary_loss_clip": 0.01168604, "auxiliary_loss_mlp": 0.01026798, "balance_loss_clip": 1.05030751, "balance_loss_mlp": 1.0189538, "epoch": 0.5508326820176757, "flos": 22273845264000.0, "grad_norm": 1.9330032795150789, "language_loss": 0.75688183, "learning_rate": 1.7683872958638367e-06, "loss": 0.77883589, "num_input_tokens_seen": 98874230, "step": 4581, "time_per_iteration": 2.6635684967041016 }, { "auxiliary_loss_clip": 0.01150411, "auxiliary_loss_mlp": 0.01027025, "balance_loss_clip": 1.04720318, "balance_loss_mlp": 1.01896596, "epoch": 0.5509529249083148, "flos": 20012442762240.0, "grad_norm": 2.723327461850464, "language_loss": 0.84253252, "learning_rate": 1.7676135828729614e-06, "loss": 0.86430687, "num_input_tokens_seen": 98893940, "step": 4582, "time_per_iteration": 2.7095084190368652 }, { "auxiliary_loss_clip": 0.0116634, "auxiliary_loss_mlp": 0.01028743, "balance_loss_clip": 1.05141902, "balance_loss_mlp": 1.02037442, "epoch": 0.5510731677989539, "flos": 21834801325440.0, "grad_norm": 1.9475738249660668, "language_loss": 0.8302815, "learning_rate": 1.7668399051349205e-06, "loss": 0.85223234, "num_input_tokens_seen": 98913620, "step": 4583, "time_per_iteration": 2.713707447052002 }, { "auxiliary_loss_clip": 0.01135973, "auxiliary_loss_mlp": 0.0102896, "balance_loss_clip": 1.04672432, "balance_loss_mlp": 1.02047276, "epoch": 0.5511934106895929, "flos": 21467901853440.0, "grad_norm": 1.9670512086049303, "language_loss": 0.83267999, "learning_rate": 1.766066262767081e-06, "loss": 0.85432929, "num_input_tokens_seen": 98931460, "step": 4584, "time_per_iteration": 2.701094388961792 }, { "auxiliary_loss_clip": 0.01150457, "auxiliary_loss_mlp": 0.01025034, "balance_loss_clip": 1.05073595, "balance_loss_mlp": 1.01755333, "epoch": 0.5513136535802321, "flos": 21068934514560.0, "grad_norm": 2.0095124304268137, "language_loss": 0.77069181, "learning_rate": 1.765292655886803e-06, "loss": 0.79244673, "num_input_tokens_seen": 98950105, "step": 4585, "time_per_iteration": 2.6748225688934326 }, { "auxiliary_loss_clip": 0.01150718, "auxiliary_loss_mlp": 0.01027011, "balance_loss_clip": 1.04962254, "balance_loss_mlp": 1.01936972, "epoch": 0.5514338964708712, "flos": 27815004754560.0, "grad_norm": 2.382590096800934, "language_loss": 0.71125662, "learning_rate": 1.764519084611443e-06, "loss": 0.7330339, "num_input_tokens_seen": 98970560, "step": 4586, "time_per_iteration": 2.7616167068481445 }, { "auxiliary_loss_clip": 0.01154655, "auxiliary_loss_mlp": 0.01028031, "balance_loss_clip": 1.04899561, "balance_loss_mlp": 1.01897752, "epoch": 0.5515541393615102, "flos": 21908525990400.0, "grad_norm": 2.0740705594237583, "language_loss": 0.77649224, "learning_rate": 1.7637455490583505e-06, "loss": 0.7983191, "num_input_tokens_seen": 98989885, "step": 4587, "time_per_iteration": 3.9394783973693848 }, { "auxiliary_loss_clip": 0.01166687, "auxiliary_loss_mlp": 0.01025321, "balance_loss_clip": 1.05013049, "balance_loss_mlp": 1.01757228, "epoch": 0.5516743822521494, "flos": 20485422074880.0, "grad_norm": 2.0272676315352447, "language_loss": 0.77459568, "learning_rate": 1.7629720493448701e-06, "loss": 0.79651576, "num_input_tokens_seen": 99007180, "step": 4588, "time_per_iteration": 3.579955816268921 }, { "auxiliary_loss_clip": 0.01161721, "auxiliary_loss_mlp": 0.01027529, "balance_loss_clip": 1.04913807, "balance_loss_mlp": 1.01957178, "epoch": 0.5517946251427884, "flos": 14940383915520.0, "grad_norm": 1.7318988537329254, "language_loss": 0.85487777, "learning_rate": 1.7621985855883418e-06, "loss": 0.87677026, "num_input_tokens_seen": 99023880, "step": 4589, "time_per_iteration": 3.6680572032928467 }, { "auxiliary_loss_clip": 0.011481, "auxiliary_loss_mlp": 0.0102468, "balance_loss_clip": 1.04896379, "balance_loss_mlp": 1.01708615, "epoch": 0.5519148680334275, "flos": 18404865573120.0, "grad_norm": 1.7779379178617607, "language_loss": 0.72686255, "learning_rate": 1.7614251579060983e-06, "loss": 0.74859035, "num_input_tokens_seen": 99042475, "step": 4590, "time_per_iteration": 2.613234043121338 }, { "auxiliary_loss_clip": 0.0114206, "auxiliary_loss_mlp": 0.01026784, "balance_loss_clip": 1.04733634, "balance_loss_mlp": 1.01854658, "epoch": 0.5520351109240667, "flos": 25113337251840.0, "grad_norm": 1.6529284590743913, "language_loss": 0.84686124, "learning_rate": 1.76065176641547e-06, "loss": 0.86854959, "num_input_tokens_seen": 99065185, "step": 4591, "time_per_iteration": 2.7440178394317627 }, { "auxiliary_loss_clip": 0.01168365, "auxiliary_loss_mlp": 0.01026271, "balance_loss_clip": 1.04851234, "balance_loss_mlp": 1.01809883, "epoch": 0.5521553538147057, "flos": 21069545045760.0, "grad_norm": 1.6310467440295227, "language_loss": 0.77907443, "learning_rate": 1.759878411233777e-06, "loss": 0.80102074, "num_input_tokens_seen": 99083645, "step": 4592, "time_per_iteration": 3.5074656009674072 }, { "auxiliary_loss_clip": 0.01167568, "auxiliary_loss_mlp": 0.01029003, "balance_loss_clip": 1.05117142, "balance_loss_mlp": 1.02132642, "epoch": 0.5522755967053448, "flos": 18879999701760.0, "grad_norm": 2.162108611188334, "language_loss": 0.75629735, "learning_rate": 1.7591050924783388e-06, "loss": 0.77826309, "num_input_tokens_seen": 99100835, "step": 4593, "time_per_iteration": 2.6860623359680176 }, { "auxiliary_loss_clip": 0.01028996, "auxiliary_loss_mlp": 0.01001393, "balance_loss_clip": 1.00992334, "balance_loss_mlp": 0.99985522, "epoch": 0.5523958395959839, "flos": 64675622494080.0, "grad_norm": 0.8426295675922896, "language_loss": 0.57951707, "learning_rate": 1.7583318102664661e-06, "loss": 0.59982097, "num_input_tokens_seen": 99168400, "step": 4594, "time_per_iteration": 3.382849931716919 }, { "auxiliary_loss_clip": 0.01168183, "auxiliary_loss_mlp": 0.01026158, "balance_loss_clip": 1.04707026, "balance_loss_mlp": 1.01831961, "epoch": 0.552516082486623, "flos": 10889732211840.0, "grad_norm": 1.8936759637587908, "language_loss": 0.78866303, "learning_rate": 1.757558564715466e-06, "loss": 0.81060648, "num_input_tokens_seen": 99186475, "step": 4595, "time_per_iteration": 2.853739023208618 }, { "auxiliary_loss_clip": 0.01167928, "auxiliary_loss_mlp": 0.01026018, "balance_loss_clip": 1.04928148, "balance_loss_mlp": 1.01750088, "epoch": 0.552636325377262, "flos": 22199797376640.0, "grad_norm": 2.70231832944461, "language_loss": 0.74091738, "learning_rate": 1.7567853559426386e-06, "loss": 0.76285684, "num_input_tokens_seen": 99203525, "step": 4596, "time_per_iteration": 2.660952568054199 }, { "auxiliary_loss_clip": 0.0116651, "auxiliary_loss_mlp": 0.01032361, "balance_loss_clip": 1.04919434, "balance_loss_mlp": 1.02439237, "epoch": 0.5527565682679012, "flos": 23988184652160.0, "grad_norm": 2.1591767732668488, "language_loss": 0.75077784, "learning_rate": 1.7560121840652797e-06, "loss": 0.77276653, "num_input_tokens_seen": 99222910, "step": 4597, "time_per_iteration": 2.659105062484741 }, { "auxiliary_loss_clip": 0.01128982, "auxiliary_loss_mlp": 0.01030429, "balance_loss_clip": 1.04729497, "balance_loss_mlp": 1.02201283, "epoch": 0.5528768111585403, "flos": 19719267955200.0, "grad_norm": 1.7458067378315032, "language_loss": 0.69421947, "learning_rate": 1.7552390492006782e-06, "loss": 0.71581364, "num_input_tokens_seen": 99241230, "step": 4598, "time_per_iteration": 2.7106409072875977 }, { "auxiliary_loss_clip": 0.01132858, "auxiliary_loss_mlp": 0.00762722, "balance_loss_clip": 1.04393899, "balance_loss_mlp": 1.00028527, "epoch": 0.5529970540491793, "flos": 26215975002240.0, "grad_norm": 1.7861366817599131, "language_loss": 0.65220028, "learning_rate": 1.7544659514661184e-06, "loss": 0.67115611, "num_input_tokens_seen": 99264320, "step": 4599, "time_per_iteration": 2.7875328063964844 }, { "auxiliary_loss_clip": 0.01149911, "auxiliary_loss_mlp": 0.01021289, "balance_loss_clip": 1.04735541, "balance_loss_mlp": 1.01336753, "epoch": 0.5531172969398185, "flos": 24425971614720.0, "grad_norm": 2.044359738319887, "language_loss": 0.79711807, "learning_rate": 1.7536928909788786e-06, "loss": 0.81883001, "num_input_tokens_seen": 99283625, "step": 4600, "time_per_iteration": 2.704087495803833 }, { "auxiliary_loss_clip": 0.01034077, "auxiliary_loss_mlp": 0.0100157, "balance_loss_clip": 1.01104271, "balance_loss_mlp": 1.00003183, "epoch": 0.5532375398304575, "flos": 64907316195840.0, "grad_norm": 0.8757320857935373, "language_loss": 0.61975348, "learning_rate": 1.752919867856231e-06, "loss": 0.6401099, "num_input_tokens_seen": 99335270, "step": 4601, "time_per_iteration": 3.1664230823516846 }, { "auxiliary_loss_clip": 0.01145958, "auxiliary_loss_mlp": 0.01024211, "balance_loss_clip": 1.04607272, "balance_loss_mlp": 1.01703477, "epoch": 0.5533577827210966, "flos": 19683105937920.0, "grad_norm": 1.6885803237073533, "language_loss": 0.79031229, "learning_rate": 1.7521468822154436e-06, "loss": 0.81201398, "num_input_tokens_seen": 99354185, "step": 4602, "time_per_iteration": 2.767529010772705 }, { "auxiliary_loss_clip": 0.01147079, "auxiliary_loss_mlp": 0.01025675, "balance_loss_clip": 1.04889083, "balance_loss_mlp": 1.01771736, "epoch": 0.5534780256117358, "flos": 32306496076800.0, "grad_norm": 2.0410691400504954, "language_loss": 0.75485134, "learning_rate": 1.751373934173777e-06, "loss": 0.7765789, "num_input_tokens_seen": 99376930, "step": 4603, "time_per_iteration": 2.8398706912994385 }, { "auxiliary_loss_clip": 0.01182593, "auxiliary_loss_mlp": 0.01035857, "balance_loss_clip": 1.0510397, "balance_loss_mlp": 1.02772105, "epoch": 0.5535982685023748, "flos": 23222425582080.0, "grad_norm": 1.5974547110112078, "language_loss": 0.7300936, "learning_rate": 1.750601023848487e-06, "loss": 0.75227809, "num_input_tokens_seen": 99397655, "step": 4604, "time_per_iteration": 2.69842267036438 }, { "auxiliary_loss_clip": 0.01180322, "auxiliary_loss_mlp": 0.00762841, "balance_loss_clip": 1.05138636, "balance_loss_mlp": 1.00025272, "epoch": 0.5537185113930139, "flos": 24352534258560.0, "grad_norm": 1.9817746293966638, "language_loss": 0.73789722, "learning_rate": 1.749828151356823e-06, "loss": 0.75732887, "num_input_tokens_seen": 99417850, "step": 4605, "time_per_iteration": 2.6311209201812744 }, { "auxiliary_loss_clip": 0.01153504, "auxiliary_loss_mlp": 0.01024457, "balance_loss_clip": 1.04943669, "balance_loss_mlp": 1.01721251, "epoch": 0.553838754283653, "flos": 23549068886400.0, "grad_norm": 1.642492212969575, "language_loss": 0.76012105, "learning_rate": 1.7490553168160297e-06, "loss": 0.78190064, "num_input_tokens_seen": 99438920, "step": 4606, "time_per_iteration": 2.6645493507385254 }, { "auxiliary_loss_clip": 0.01151513, "auxiliary_loss_mlp": 0.01023762, "balance_loss_clip": 1.04758084, "balance_loss_mlp": 1.01636553, "epoch": 0.5539589971742921, "flos": 17275044205440.0, "grad_norm": 2.0078681360448463, "language_loss": 0.76447642, "learning_rate": 1.748282520343345e-06, "loss": 0.78622913, "num_input_tokens_seen": 99457950, "step": 4607, "time_per_iteration": 2.7047903537750244 }, { "auxiliary_loss_clip": 0.01173308, "auxiliary_loss_mlp": 0.01030424, "balance_loss_clip": 1.04959357, "balance_loss_mlp": 1.02219319, "epoch": 0.5540792400649311, "flos": 27564169104000.0, "grad_norm": 1.6689292462961036, "language_loss": 0.78341734, "learning_rate": 1.7475097620560023e-06, "loss": 0.80545461, "num_input_tokens_seen": 99478015, "step": 4608, "time_per_iteration": 2.703479290008545 }, { "auxiliary_loss_clip": 0.01179599, "auxiliary_loss_mlp": 0.01024003, "balance_loss_clip": 1.05101037, "balance_loss_mlp": 1.01624823, "epoch": 0.5541994829555702, "flos": 23878657105920.0, "grad_norm": 2.054659598843556, "language_loss": 0.71066314, "learning_rate": 1.746737042071228e-06, "loss": 0.73269916, "num_input_tokens_seen": 99496520, "step": 4609, "time_per_iteration": 2.6678073406219482 }, { "auxiliary_loss_clip": 0.01146303, "auxiliary_loss_mlp": 0.01026708, "balance_loss_clip": 1.04665279, "balance_loss_mlp": 1.01940095, "epoch": 0.5543197258462094, "flos": 20115721342080.0, "grad_norm": 1.7763299925563623, "language_loss": 0.79068112, "learning_rate": 1.7459643605062424e-06, "loss": 0.81241119, "num_input_tokens_seen": 99513780, "step": 4610, "time_per_iteration": 2.64394474029541 }, { "auxiliary_loss_clip": 0.01119813, "auxiliary_loss_mlp": 0.01022554, "balance_loss_clip": 1.04517019, "balance_loss_mlp": 1.01460826, "epoch": 0.5544399687368484, "flos": 20916565021440.0, "grad_norm": 1.6245136035390824, "language_loss": 0.80567098, "learning_rate": 1.745191717478262e-06, "loss": 0.82709467, "num_input_tokens_seen": 99532360, "step": 4611, "time_per_iteration": 2.860574722290039 }, { "auxiliary_loss_clip": 0.01146385, "auxiliary_loss_mlp": 0.01026764, "balance_loss_clip": 1.04743195, "balance_loss_mlp": 1.01930177, "epoch": 0.5545602116274875, "flos": 25518661297920.0, "grad_norm": 1.968169901351404, "language_loss": 0.79358405, "learning_rate": 1.7444191131044948e-06, "loss": 0.81531554, "num_input_tokens_seen": 99552635, "step": 4612, "time_per_iteration": 2.7060670852661133 }, { "auxiliary_loss_clip": 0.01151889, "auxiliary_loss_mlp": 0.01024438, "balance_loss_clip": 1.04944777, "balance_loss_mlp": 1.01642704, "epoch": 0.5546804545181266, "flos": 20995568985600.0, "grad_norm": 1.6687913517338082, "language_loss": 0.73212349, "learning_rate": 1.7436465475021456e-06, "loss": 0.7538867, "num_input_tokens_seen": 99572685, "step": 4613, "time_per_iteration": 4.84477162361145 }, { "auxiliary_loss_clip": 0.01128245, "auxiliary_loss_mlp": 0.01025763, "balance_loss_clip": 1.04560685, "balance_loss_mlp": 1.0181272, "epoch": 0.5548006974087657, "flos": 26833638297600.0, "grad_norm": 2.3304272670695743, "language_loss": 0.71737158, "learning_rate": 1.7428740207884111e-06, "loss": 0.73891163, "num_input_tokens_seen": 99593565, "step": 4614, "time_per_iteration": 2.786623001098633 }, { "auxiliary_loss_clip": 0.01127705, "auxiliary_loss_mlp": 0.01025183, "balance_loss_clip": 1.0467, "balance_loss_mlp": 1.01641524, "epoch": 0.5549209402994048, "flos": 33656414031360.0, "grad_norm": 1.8998045727407282, "language_loss": 0.61149991, "learning_rate": 1.7421015330804833e-06, "loss": 0.63302875, "num_input_tokens_seen": 99613485, "step": 4615, "time_per_iteration": 3.6982786655426025 }, { "auxiliary_loss_clip": 0.0118029, "auxiliary_loss_mlp": 0.01026867, "balance_loss_clip": 1.05153728, "balance_loss_mlp": 1.01865935, "epoch": 0.5550411831900439, "flos": 23769524609280.0, "grad_norm": 2.1113036659079647, "language_loss": 0.72403216, "learning_rate": 1.7413290844955475e-06, "loss": 0.74610376, "num_input_tokens_seen": 99633515, "step": 4616, "time_per_iteration": 2.632690906524658 }, { "auxiliary_loss_clip": 0.01158771, "auxiliary_loss_mlp": 0.01024218, "balance_loss_clip": 1.05047047, "balance_loss_mlp": 1.01608157, "epoch": 0.555161426080683, "flos": 21651189978240.0, "grad_norm": 1.9560967485940315, "language_loss": 0.77978146, "learning_rate": 1.7405566751507843e-06, "loss": 0.8016113, "num_input_tokens_seen": 99651560, "step": 4617, "time_per_iteration": 3.631929874420166 }, { "auxiliary_loss_clip": 0.01135808, "auxiliary_loss_mlp": 0.01022922, "balance_loss_clip": 1.04593539, "balance_loss_mlp": 1.01574588, "epoch": 0.555281668971322, "flos": 49563116605440.0, "grad_norm": 1.474548622853772, "language_loss": 0.67555594, "learning_rate": 1.7397843051633668e-06, "loss": 0.6971432, "num_input_tokens_seen": 99674255, "step": 4618, "time_per_iteration": 3.020416736602783 }, { "auxiliary_loss_clip": 0.01161683, "auxiliary_loss_mlp": 0.01028247, "balance_loss_clip": 1.04923797, "balance_loss_mlp": 1.0207727, "epoch": 0.5554019118619612, "flos": 20741608851840.0, "grad_norm": 1.5675068297054797, "language_loss": 0.71718544, "learning_rate": 1.739011974650464e-06, "loss": 0.73908472, "num_input_tokens_seen": 99693585, "step": 4619, "time_per_iteration": 2.661813259124756 }, { "auxiliary_loss_clip": 0.01126693, "auxiliary_loss_mlp": 0.01028744, "balance_loss_clip": 1.04613137, "balance_loss_mlp": 1.02060831, "epoch": 0.5555221547526003, "flos": 25483217552640.0, "grad_norm": 2.1698073261659627, "language_loss": 0.76298428, "learning_rate": 1.7382396837292365e-06, "loss": 0.78453863, "num_input_tokens_seen": 99714045, "step": 4620, "time_per_iteration": 2.8197319507598877 }, { "auxiliary_loss_clip": 0.01182287, "auxiliary_loss_mlp": 0.01029718, "balance_loss_clip": 1.05214858, "balance_loss_mlp": 1.02110171, "epoch": 0.5556423976432393, "flos": 21762513204480.0, "grad_norm": 1.7394237933946188, "language_loss": 0.73829216, "learning_rate": 1.737467432516841e-06, "loss": 0.76041222, "num_input_tokens_seen": 99734145, "step": 4621, "time_per_iteration": 2.6147940158843994 }, { "auxiliary_loss_clip": 0.01150922, "auxiliary_loss_mlp": 0.01026253, "balance_loss_clip": 1.0450927, "balance_loss_mlp": 1.01811147, "epoch": 0.5557626405338785, "flos": 24900171989760.0, "grad_norm": 2.356186458649112, "language_loss": 0.74292362, "learning_rate": 1.7366952211304274e-06, "loss": 0.76469535, "num_input_tokens_seen": 99751990, "step": 4622, "time_per_iteration": 2.7667336463928223 }, { "auxiliary_loss_clip": 0.01141705, "auxiliary_loss_mlp": 0.01030081, "balance_loss_clip": 1.04553115, "balance_loss_mlp": 1.02224326, "epoch": 0.5558828834245175, "flos": 18697501676160.0, "grad_norm": 2.446681093163606, "language_loss": 0.83211637, "learning_rate": 1.735923049687139e-06, "loss": 0.85383415, "num_input_tokens_seen": 99768565, "step": 4623, "time_per_iteration": 2.6100871562957764 }, { "auxiliary_loss_clip": 0.01144066, "auxiliary_loss_mlp": 0.01024467, "balance_loss_clip": 1.04426479, "balance_loss_mlp": 1.01682615, "epoch": 0.5560031263151566, "flos": 27272179445760.0, "grad_norm": 1.5453904138823986, "language_loss": 0.73939252, "learning_rate": 1.7351509183041144e-06, "loss": 0.76107788, "num_input_tokens_seen": 99788895, "step": 4624, "time_per_iteration": 2.7044906616210938 }, { "auxiliary_loss_clip": 0.0118221, "auxiliary_loss_mlp": 0.01028218, "balance_loss_clip": 1.05257487, "balance_loss_mlp": 1.0209074, "epoch": 0.5561233692057957, "flos": 23403738458880.0, "grad_norm": 1.9658164247775558, "language_loss": 0.71557283, "learning_rate": 1.7343788270984852e-06, "loss": 0.73767716, "num_input_tokens_seen": 99808035, "step": 4625, "time_per_iteration": 2.595393657684326 }, { "auxiliary_loss_clip": 0.01153543, "auxiliary_loss_mlp": 0.01033603, "balance_loss_clip": 1.05037212, "balance_loss_mlp": 1.02506757, "epoch": 0.5562436120964348, "flos": 37670867804160.0, "grad_norm": 1.8252894454082575, "language_loss": 0.74412382, "learning_rate": 1.7336067761873764e-06, "loss": 0.76599526, "num_input_tokens_seen": 99830460, "step": 4626, "time_per_iteration": 2.769731283187866 }, { "auxiliary_loss_clip": 0.01171512, "auxiliary_loss_mlp": 0.01025581, "balance_loss_clip": 1.04885924, "balance_loss_mlp": 1.01736772, "epoch": 0.5563638549870739, "flos": 25155245445120.0, "grad_norm": 1.9031470493220044, "language_loss": 0.76179671, "learning_rate": 1.7328347656879076e-06, "loss": 0.7837677, "num_input_tokens_seen": 99850320, "step": 4627, "time_per_iteration": 2.6360702514648438 }, { "auxiliary_loss_clip": 0.01137538, "auxiliary_loss_mlp": 0.01033025, "balance_loss_clip": 1.04624057, "balance_loss_mlp": 1.02444768, "epoch": 0.556484097877713, "flos": 13581810783360.0, "grad_norm": 2.1717535785842914, "language_loss": 0.6874575, "learning_rate": 1.7320627957171927e-06, "loss": 0.70916319, "num_input_tokens_seen": 99864980, "step": 4628, "time_per_iteration": 2.650557041168213 }, { "auxiliary_loss_clip": 0.01180616, "auxiliary_loss_mlp": 0.01028965, "balance_loss_clip": 1.05287325, "balance_loss_mlp": 1.02083468, "epoch": 0.5566043407683521, "flos": 24681368292480.0, "grad_norm": 1.7929145190283247, "language_loss": 0.81170511, "learning_rate": 1.7312908663923382e-06, "loss": 0.83380091, "num_input_tokens_seen": 99881155, "step": 4629, "time_per_iteration": 2.6393423080444336 }, { "auxiliary_loss_clip": 0.01157388, "auxiliary_loss_mlp": 0.01024558, "balance_loss_clip": 1.04563093, "balance_loss_mlp": 1.0164932, "epoch": 0.5567245836589911, "flos": 20588161950720.0, "grad_norm": 2.3410795210458817, "language_loss": 0.67256689, "learning_rate": 1.7305189778304463e-06, "loss": 0.69438636, "num_input_tokens_seen": 99899330, "step": 4630, "time_per_iteration": 2.6303060054779053 }, { "auxiliary_loss_clip": 0.01152535, "auxiliary_loss_mlp": 0.01027456, "balance_loss_clip": 1.05128503, "balance_loss_mlp": 1.01942778, "epoch": 0.5568448265496303, "flos": 20704189858560.0, "grad_norm": 1.9390946825667246, "language_loss": 0.80391484, "learning_rate": 1.729747130148611e-06, "loss": 0.82571471, "num_input_tokens_seen": 99918525, "step": 4631, "time_per_iteration": 2.662760019302368 }, { "auxiliary_loss_clip": 0.0114447, "auxiliary_loss_mlp": 0.01033467, "balance_loss_clip": 1.0472877, "balance_loss_mlp": 1.0247829, "epoch": 0.5569650694402694, "flos": 25302910256640.0, "grad_norm": 2.0913873943411043, "language_loss": 0.76774192, "learning_rate": 1.7289753234639208e-06, "loss": 0.78952128, "num_input_tokens_seen": 99937500, "step": 4632, "time_per_iteration": 2.727421760559082 }, { "auxiliary_loss_clip": 0.01169804, "auxiliary_loss_mlp": 0.01036254, "balance_loss_clip": 1.0500617, "balance_loss_mlp": 1.02796292, "epoch": 0.5570853123309084, "flos": 19712623939200.0, "grad_norm": 1.9265301119105505, "language_loss": 0.76819205, "learning_rate": 1.7282035578934592e-06, "loss": 0.79025257, "num_input_tokens_seen": 99955665, "step": 4633, "time_per_iteration": 2.646646738052368 }, { "auxiliary_loss_clip": 0.01147354, "auxiliary_loss_mlp": 0.01028264, "balance_loss_clip": 1.05185533, "balance_loss_mlp": 1.02021766, "epoch": 0.5572055552215476, "flos": 16108091153280.0, "grad_norm": 2.120107796780374, "language_loss": 0.78819859, "learning_rate": 1.727431833554301e-06, "loss": 0.80995476, "num_input_tokens_seen": 99974140, "step": 4634, "time_per_iteration": 2.6311233043670654 }, { "auxiliary_loss_clip": 0.01115032, "auxiliary_loss_mlp": 0.01031136, "balance_loss_clip": 1.04166961, "balance_loss_mlp": 1.02357793, "epoch": 0.5573257981121866, "flos": 17128815937920.0, "grad_norm": 2.0318198295269982, "language_loss": 0.7733174, "learning_rate": 1.7266601505635175e-06, "loss": 0.79477906, "num_input_tokens_seen": 99991480, "step": 4635, "time_per_iteration": 3.0732650756835938 }, { "auxiliary_loss_clip": 0.01168505, "auxiliary_loss_mlp": 0.01032305, "balance_loss_clip": 1.05225945, "balance_loss_mlp": 1.02434754, "epoch": 0.5574460410028257, "flos": 18807029222400.0, "grad_norm": 2.051734748187882, "language_loss": 0.75603414, "learning_rate": 1.7258885090381717e-06, "loss": 0.77804226, "num_input_tokens_seen": 100009520, "step": 4636, "time_per_iteration": 2.8333001136779785 }, { "auxiliary_loss_clip": 0.01153956, "auxiliary_loss_mlp": 0.01027695, "balance_loss_clip": 1.04671133, "balance_loss_mlp": 1.02070284, "epoch": 0.5575662838934649, "flos": 29642678530560.0, "grad_norm": 2.1455819847401365, "language_loss": 0.78887147, "learning_rate": 1.7251169090953213e-06, "loss": 0.81068802, "num_input_tokens_seen": 100029995, "step": 4637, "time_per_iteration": 2.7316410541534424 }, { "auxiliary_loss_clip": 0.01165919, "auxiliary_loss_mlp": 0.01031207, "balance_loss_clip": 1.04917121, "balance_loss_mlp": 1.02302969, "epoch": 0.5576865267841039, "flos": 22054466949120.0, "grad_norm": 2.5213662132560435, "language_loss": 0.76683038, "learning_rate": 1.7243453508520168e-06, "loss": 0.78880167, "num_input_tokens_seen": 100046980, "step": 4638, "time_per_iteration": 3.64043927192688 }, { "auxiliary_loss_clip": 0.01149974, "auxiliary_loss_mlp": 0.010256, "balance_loss_clip": 1.04423034, "balance_loss_mlp": 1.01802146, "epoch": 0.557806769674743, "flos": 17196040241280.0, "grad_norm": 2.100843130555985, "language_loss": 0.8469733, "learning_rate": 1.7235738344253038e-06, "loss": 0.868729, "num_input_tokens_seen": 100060610, "step": 4639, "time_per_iteration": 2.651303768157959 }, { "auxiliary_loss_clip": 0.01164395, "auxiliary_loss_mlp": 0.01028526, "balance_loss_clip": 1.05033505, "balance_loss_mlp": 1.02056289, "epoch": 0.557927012565382, "flos": 24712717887360.0, "grad_norm": 2.3957266494216665, "language_loss": 0.82811922, "learning_rate": 1.72280235993222e-06, "loss": 0.85004842, "num_input_tokens_seen": 100078915, "step": 4640, "time_per_iteration": 3.607492446899414 }, { "auxiliary_loss_clip": 0.01163735, "auxiliary_loss_mlp": 0.00763398, "balance_loss_clip": 1.04929519, "balance_loss_mlp": 1.00032496, "epoch": 0.5580472554560212, "flos": 16983090460800.0, "grad_norm": 2.354066777053142, "language_loss": 0.69865334, "learning_rate": 1.722030927489798e-06, "loss": 0.71792471, "num_input_tokens_seen": 100096195, "step": 4641, "time_per_iteration": 3.5771536827087402 }, { "auxiliary_loss_clip": 0.01140014, "auxiliary_loss_mlp": 0.01027679, "balance_loss_clip": 1.04910231, "balance_loss_mlp": 1.0193758, "epoch": 0.5581674983466602, "flos": 23509100027520.0, "grad_norm": 1.958369524972769, "language_loss": 0.74102819, "learning_rate": 1.7212595372150634e-06, "loss": 0.76270509, "num_input_tokens_seen": 100116175, "step": 4642, "time_per_iteration": 2.793229579925537 }, { "auxiliary_loss_clip": 0.01181204, "auxiliary_loss_mlp": 0.01028764, "balance_loss_clip": 1.05236375, "balance_loss_mlp": 1.02100384, "epoch": 0.5582877412372993, "flos": 13480291969920.0, "grad_norm": 2.458259518248705, "language_loss": 0.7277, "learning_rate": 1.720488189225035e-06, "loss": 0.74979973, "num_input_tokens_seen": 100133875, "step": 4643, "time_per_iteration": 3.5307369232177734 }, { "auxiliary_loss_clip": 0.01168086, "auxiliary_loss_mlp": 0.01028608, "balance_loss_clip": 1.04888749, "balance_loss_mlp": 1.02081728, "epoch": 0.5584079841279385, "flos": 21903605827200.0, "grad_norm": 2.537920407523779, "language_loss": 0.79306775, "learning_rate": 1.7197168836367265e-06, "loss": 0.81503475, "num_input_tokens_seen": 100150685, "step": 4644, "time_per_iteration": 2.6668155193328857 }, { "auxiliary_loss_clip": 0.01162818, "auxiliary_loss_mlp": 0.00762955, "balance_loss_clip": 1.04837108, "balance_loss_mlp": 1.00024545, "epoch": 0.5585282270185775, "flos": 18843550375680.0, "grad_norm": 1.87766278363126, "language_loss": 0.82353479, "learning_rate": 1.7189456205671433e-06, "loss": 0.84279251, "num_input_tokens_seen": 100169530, "step": 4645, "time_per_iteration": 2.684387445449829 }, { "auxiliary_loss_clip": 0.01174311, "auxiliary_loss_mlp": 0.01027101, "balance_loss_clip": 1.05107927, "balance_loss_mlp": 1.01897693, "epoch": 0.5586484699092166, "flos": 21868449390720.0, "grad_norm": 2.896251298068004, "language_loss": 0.82434189, "learning_rate": 1.7181744001332866e-06, "loss": 0.84635592, "num_input_tokens_seen": 100188140, "step": 4646, "time_per_iteration": 2.802567481994629 }, { "auxiliary_loss_clip": 0.01179839, "auxiliary_loss_mlp": 0.0102776, "balance_loss_clip": 1.05185843, "balance_loss_mlp": 1.02044022, "epoch": 0.5587687127998557, "flos": 22893232412160.0, "grad_norm": 2.4784121648226165, "language_loss": 0.6319648, "learning_rate": 1.7174032224521493e-06, "loss": 0.65404081, "num_input_tokens_seen": 100206850, "step": 4647, "time_per_iteration": 2.6848559379577637 }, { "auxiliary_loss_clip": 0.01164569, "auxiliary_loss_mlp": 0.01024755, "balance_loss_clip": 1.04950297, "balance_loss_mlp": 1.01725984, "epoch": 0.5588889556904948, "flos": 20303067703680.0, "grad_norm": 1.6733308947972305, "language_loss": 0.69900656, "learning_rate": 1.7166320876407184e-06, "loss": 0.7208997, "num_input_tokens_seen": 100226270, "step": 4648, "time_per_iteration": 2.700667142868042 }, { "auxiliary_loss_clip": 0.01184823, "auxiliary_loss_mlp": 0.00763826, "balance_loss_clip": 1.05379474, "balance_loss_mlp": 1.00031018, "epoch": 0.5590091985811338, "flos": 16472153450880.0, "grad_norm": 1.8617661272089383, "language_loss": 0.67926246, "learning_rate": 1.7158609958159742e-06, "loss": 0.69874895, "num_input_tokens_seen": 100243675, "step": 4649, "time_per_iteration": 2.7556564807891846 }, { "auxiliary_loss_clip": 0.01118771, "auxiliary_loss_mlp": 0.01029824, "balance_loss_clip": 1.04342616, "balance_loss_mlp": 1.02144349, "epoch": 0.559129441471773, "flos": 14532186781440.0, "grad_norm": 2.2839195456866643, "language_loss": 0.78176105, "learning_rate": 1.7150899470948911e-06, "loss": 0.80324709, "num_input_tokens_seen": 100258940, "step": 4650, "time_per_iteration": 2.859292984008789 }, { "auxiliary_loss_clip": 0.0104734, "auxiliary_loss_mlp": 0.0100232, "balance_loss_clip": 1.00974369, "balance_loss_mlp": 1.00088978, "epoch": 0.5592496843624121, "flos": 60521009852160.0, "grad_norm": 0.8379854314193489, "language_loss": 0.56629443, "learning_rate": 1.7143189415944365e-06, "loss": 0.58679098, "num_input_tokens_seen": 100323400, "step": 4651, "time_per_iteration": 3.2557644844055176 }, { "auxiliary_loss_clip": 0.01161611, "auxiliary_loss_mlp": 0.01027006, "balance_loss_clip": 1.04864097, "balance_loss_mlp": 1.01972842, "epoch": 0.5593699272530511, "flos": 20886256920960.0, "grad_norm": 1.6281031580553942, "language_loss": 0.76374686, "learning_rate": 1.7135479794315714e-06, "loss": 0.78563309, "num_input_tokens_seen": 100340355, "step": 4652, "time_per_iteration": 2.701763868331909 }, { "auxiliary_loss_clip": 0.01134718, "auxiliary_loss_mlp": 0.01027109, "balance_loss_clip": 1.04591453, "balance_loss_mlp": 1.01942039, "epoch": 0.5594901701436903, "flos": 12896743616640.0, "grad_norm": 1.9499210337336101, "language_loss": 0.78937721, "learning_rate": 1.7127770607232502e-06, "loss": 0.81099546, "num_input_tokens_seen": 100358900, "step": 4653, "time_per_iteration": 2.674389362335205 }, { "auxiliary_loss_clip": 0.01144589, "auxiliary_loss_mlp": 0.01028376, "balance_loss_clip": 1.04791462, "balance_loss_mlp": 1.02079463, "epoch": 0.5596104130343293, "flos": 23112107936640.0, "grad_norm": 2.608515395794512, "language_loss": 0.8010847, "learning_rate": 1.7120061855864204e-06, "loss": 0.82281435, "num_input_tokens_seen": 100378910, "step": 4654, "time_per_iteration": 2.738274335861206 }, { "auxiliary_loss_clip": 0.01166806, "auxiliary_loss_mlp": 0.01030348, "balance_loss_clip": 1.05208302, "balance_loss_mlp": 1.02224183, "epoch": 0.5597306559249684, "flos": 25957812977280.0, "grad_norm": 1.9333335772754632, "language_loss": 0.70735943, "learning_rate": 1.7112353541380233e-06, "loss": 0.72933096, "num_input_tokens_seen": 100398770, "step": 4655, "time_per_iteration": 2.681320905685425 }, { "auxiliary_loss_clip": 0.01152852, "auxiliary_loss_mlp": 0.0103262, "balance_loss_clip": 1.04886746, "balance_loss_mlp": 1.02479959, "epoch": 0.5598508988156076, "flos": 22492289825280.0, "grad_norm": 1.474541127337974, "language_loss": 0.72149652, "learning_rate": 1.7104645664949931e-06, "loss": 0.74335122, "num_input_tokens_seen": 100421240, "step": 4656, "time_per_iteration": 2.6743571758270264 }, { "auxiliary_loss_clip": 0.01151169, "auxiliary_loss_mlp": 0.01024358, "balance_loss_clip": 1.0460943, "balance_loss_mlp": 1.01644301, "epoch": 0.5599711417062466, "flos": 23112538899840.0, "grad_norm": 1.758078232720619, "language_loss": 0.71497601, "learning_rate": 1.7096938227742584e-06, "loss": 0.73673129, "num_input_tokens_seen": 100442370, "step": 4657, "time_per_iteration": 2.704732894897461 }, { "auxiliary_loss_clip": 0.01182094, "auxiliary_loss_mlp": 0.01027535, "balance_loss_clip": 1.05188084, "balance_loss_mlp": 1.01903546, "epoch": 0.5600913845968857, "flos": 22339345714560.0, "grad_norm": 2.066011457892653, "language_loss": 0.84145939, "learning_rate": 1.70892312309274e-06, "loss": 0.86355567, "num_input_tokens_seen": 100460260, "step": 4658, "time_per_iteration": 2.5490872859954834 }, { "auxiliary_loss_clip": 0.0114917, "auxiliary_loss_mlp": 0.01030011, "balance_loss_clip": 1.04159856, "balance_loss_mlp": 1.02166033, "epoch": 0.5602116274875248, "flos": 17633791290240.0, "grad_norm": 2.2116496275065014, "language_loss": 0.68311489, "learning_rate": 1.7081524675673523e-06, "loss": 0.7049067, "num_input_tokens_seen": 100475750, "step": 4659, "time_per_iteration": 2.6439735889434814 }, { "auxiliary_loss_clip": 0.01050741, "auxiliary_loss_mlp": 0.01000847, "balance_loss_clip": 1.01016057, "balance_loss_mlp": 0.99941635, "epoch": 0.5603318703781639, "flos": 70115945529600.0, "grad_norm": 0.7782727895000924, "language_loss": 0.59588826, "learning_rate": 1.7073818563150026e-06, "loss": 0.61640406, "num_input_tokens_seen": 100537830, "step": 4660, "time_per_iteration": 3.3295950889587402 }, { "auxiliary_loss_clip": 0.01160054, "auxiliary_loss_mlp": 0.01032824, "balance_loss_clip": 1.04732049, "balance_loss_mlp": 1.02464962, "epoch": 0.560452113268803, "flos": 18545850455040.0, "grad_norm": 5.268178256964221, "language_loss": 0.86426491, "learning_rate": 1.7066112894525935e-06, "loss": 0.88619369, "num_input_tokens_seen": 100555910, "step": 4661, "time_per_iteration": 2.668867349624634 }, { "auxiliary_loss_clip": 0.01145843, "auxiliary_loss_mlp": 0.0103271, "balance_loss_clip": 1.04727221, "balance_loss_mlp": 1.02475834, "epoch": 0.5605723561594421, "flos": 25264665250560.0, "grad_norm": 1.7741031654416468, "language_loss": 0.72954839, "learning_rate": 1.7058407670970177e-06, "loss": 0.75133389, "num_input_tokens_seen": 100577385, "step": 4662, "time_per_iteration": 2.6764779090881348 }, { "auxiliary_loss_clip": 0.01173894, "auxiliary_loss_mlp": 0.01032304, "balance_loss_clip": 1.05087161, "balance_loss_mlp": 1.02390027, "epoch": 0.5606925990500812, "flos": 20594949621120.0, "grad_norm": 1.6488969702076917, "language_loss": 0.61241871, "learning_rate": 1.7050702893651643e-06, "loss": 0.63448071, "num_input_tokens_seen": 100596965, "step": 4663, "time_per_iteration": 2.6287930011749268 }, { "auxiliary_loss_clip": 0.0116943, "auxiliary_loss_mlp": 0.01033919, "balance_loss_clip": 1.05168545, "balance_loss_mlp": 1.02560973, "epoch": 0.5608128419407202, "flos": 35006044677120.0, "grad_norm": 2.2950334938136243, "language_loss": 0.75236171, "learning_rate": 1.7042998563739134e-06, "loss": 0.77439523, "num_input_tokens_seen": 100615315, "step": 4664, "time_per_iteration": 3.6370463371276855 }, { "auxiliary_loss_clip": 0.01160983, "auxiliary_loss_mlp": 0.01028869, "balance_loss_clip": 1.04973924, "balance_loss_mlp": 1.01970148, "epoch": 0.5609330848313594, "flos": 24639819235200.0, "grad_norm": 2.4689023634123486, "language_loss": 0.7220825, "learning_rate": 1.703529468240139e-06, "loss": 0.743981, "num_input_tokens_seen": 100634185, "step": 4665, "time_per_iteration": 2.5581390857696533 }, { "auxiliary_loss_clip": 0.0114681, "auxiliary_loss_mlp": 0.01021742, "balance_loss_clip": 1.04829717, "balance_loss_mlp": 1.01400208, "epoch": 0.5610533277219985, "flos": 18762894385920.0, "grad_norm": 2.2164941081385288, "language_loss": 0.73717344, "learning_rate": 1.7027591250807088e-06, "loss": 0.75885892, "num_input_tokens_seen": 100651360, "step": 4666, "time_per_iteration": 3.386622667312622 }, { "auxiliary_loss_clip": 0.01183849, "auxiliary_loss_mlp": 0.01025368, "balance_loss_clip": 1.05269206, "balance_loss_mlp": 1.01789069, "epoch": 0.5611735706126375, "flos": 15012384727680.0, "grad_norm": 2.437501892487151, "language_loss": 0.84596705, "learning_rate": 1.7019888270124825e-06, "loss": 0.86805928, "num_input_tokens_seen": 100668525, "step": 4667, "time_per_iteration": 3.397329568862915 }, { "auxiliary_loss_clip": 0.01173289, "auxiliary_loss_mlp": 0.01030441, "balance_loss_clip": 1.05320394, "balance_loss_mlp": 1.02253723, "epoch": 0.5612938135032767, "flos": 16468167041280.0, "grad_norm": 2.036207372689026, "language_loss": 0.81986332, "learning_rate": 1.7012185741523147e-06, "loss": 0.84190059, "num_input_tokens_seen": 100684850, "step": 4668, "time_per_iteration": 2.562749147415161 }, { "auxiliary_loss_clip": 0.01185799, "auxiliary_loss_mlp": 0.01033392, "balance_loss_clip": 1.05535316, "balance_loss_mlp": 1.02493978, "epoch": 0.5614140563939157, "flos": 25666433850240.0, "grad_norm": 3.9196329947582655, "language_loss": 0.62839359, "learning_rate": 1.7004483666170514e-06, "loss": 0.65058553, "num_input_tokens_seen": 100705345, "step": 4669, "time_per_iteration": 3.5400469303131104 }, { "auxiliary_loss_clip": 0.01164243, "auxiliary_loss_mlp": 0.01033787, "balance_loss_clip": 1.0495739, "balance_loss_mlp": 1.02543652, "epoch": 0.5615342992845548, "flos": 24717566223360.0, "grad_norm": 2.0679862629560186, "language_loss": 0.80247283, "learning_rate": 1.699678204523533e-06, "loss": 0.82445306, "num_input_tokens_seen": 100725210, "step": 4670, "time_per_iteration": 2.6882379055023193 }, { "auxiliary_loss_clip": 0.01156417, "auxiliary_loss_mlp": 0.01024919, "balance_loss_clip": 1.05084026, "balance_loss_mlp": 1.01667559, "epoch": 0.5616545421751938, "flos": 22015934634240.0, "grad_norm": 3.2723950052186495, "language_loss": 0.68959939, "learning_rate": 1.6989080879885918e-06, "loss": 0.71141279, "num_input_tokens_seen": 100743070, "step": 4671, "time_per_iteration": 2.662358283996582 }, { "auxiliary_loss_clip": 0.01043132, "auxiliary_loss_mlp": 0.0100246, "balance_loss_clip": 1.01328444, "balance_loss_mlp": 1.0011071, "epoch": 0.561774785065833, "flos": 53760358690560.0, "grad_norm": 0.891251952382261, "language_loss": 0.61067647, "learning_rate": 1.6981380171290544e-06, "loss": 0.63113236, "num_input_tokens_seen": 100804095, "step": 4672, "time_per_iteration": 3.283496141433716 }, { "auxiliary_loss_clip": 0.01147254, "auxiliary_loss_mlp": 0.01023748, "balance_loss_clip": 1.04557002, "balance_loss_mlp": 1.01554012, "epoch": 0.5618950279564721, "flos": 19750007018880.0, "grad_norm": 5.604544801833084, "language_loss": 0.74217898, "learning_rate": 1.6973679920617396e-06, "loss": 0.76388901, "num_input_tokens_seen": 100821630, "step": 4673, "time_per_iteration": 2.671304225921631 }, { "auxiliary_loss_clip": 0.01151647, "auxiliary_loss_mlp": 0.01023778, "balance_loss_clip": 1.05052567, "balance_loss_mlp": 1.01596999, "epoch": 0.5620152708471111, "flos": 16800592435200.0, "grad_norm": 2.8883879246362905, "language_loss": 0.84930885, "learning_rate": 1.6965980129034603e-06, "loss": 0.87106311, "num_input_tokens_seen": 100839015, "step": 4674, "time_per_iteration": 2.6611170768737793 }, { "auxiliary_loss_clip": 0.01154282, "auxiliary_loss_mlp": 0.01021975, "balance_loss_clip": 1.05035889, "balance_loss_mlp": 1.01380324, "epoch": 0.5621355137377503, "flos": 26797799502720.0, "grad_norm": 1.6686521712911004, "language_loss": 0.76802582, "learning_rate": 1.6958280797710209e-06, "loss": 0.78978837, "num_input_tokens_seen": 100860940, "step": 4675, "time_per_iteration": 2.8043601512908936 }, { "auxiliary_loss_clip": 0.01051341, "auxiliary_loss_mlp": 0.01004174, "balance_loss_clip": 1.01206183, "balance_loss_mlp": 1.00275505, "epoch": 0.5622557566283893, "flos": 61207046686080.0, "grad_norm": 0.7132453224407957, "language_loss": 0.54724866, "learning_rate": 1.6950581927812198e-06, "loss": 0.5678038, "num_input_tokens_seen": 100920510, "step": 4676, "time_per_iteration": 3.1277053356170654 }, { "auxiliary_loss_clip": 0.01167739, "auxiliary_loss_mlp": 0.01028551, "balance_loss_clip": 1.04948354, "balance_loss_mlp": 1.02059937, "epoch": 0.5623759995190284, "flos": 26468534505600.0, "grad_norm": 1.9940085196562543, "language_loss": 0.79634237, "learning_rate": 1.6942883520508486e-06, "loss": 0.81830531, "num_input_tokens_seen": 100939245, "step": 4677, "time_per_iteration": 2.6593196392059326 }, { "auxiliary_loss_clip": 0.01168193, "auxiliary_loss_mlp": 0.01027272, "balance_loss_clip": 1.05100834, "balance_loss_mlp": 1.01968479, "epoch": 0.5624962424096676, "flos": 19390900798080.0, "grad_norm": 1.9835901875616895, "language_loss": 0.76859868, "learning_rate": 1.693518557696691e-06, "loss": 0.79055333, "num_input_tokens_seen": 100958385, "step": 4678, "time_per_iteration": 2.6692769527435303 }, { "auxiliary_loss_clip": 0.01160561, "auxiliary_loss_mlp": 0.01029598, "balance_loss_clip": 1.04530823, "balance_loss_mlp": 1.02157521, "epoch": 0.5626164853003066, "flos": 20667345482880.0, "grad_norm": 2.1420473693537687, "language_loss": 0.88855827, "learning_rate": 1.6927488098355252e-06, "loss": 0.91045988, "num_input_tokens_seen": 100976015, "step": 4679, "time_per_iteration": 2.7635915279388428 }, { "auxiliary_loss_clip": 0.01036669, "auxiliary_loss_mlp": 0.0100294, "balance_loss_clip": 1.01089644, "balance_loss_mlp": 1.00146186, "epoch": 0.5627367281909457, "flos": 62766071665920.0, "grad_norm": 0.8960021728672598, "language_loss": 0.63151956, "learning_rate": 1.6919791085841201e-06, "loss": 0.65191555, "num_input_tokens_seen": 101033425, "step": 4680, "time_per_iteration": 3.240887403488159 }, { "auxiliary_loss_clip": 0.01159483, "auxiliary_loss_mlp": 0.01029354, "balance_loss_clip": 1.04633522, "balance_loss_mlp": 1.02097344, "epoch": 0.5628569710815848, "flos": 12787144243200.0, "grad_norm": 2.4119958018594754, "language_loss": 0.79237992, "learning_rate": 1.6912094540592396e-06, "loss": 0.81426823, "num_input_tokens_seen": 101048945, "step": 4681, "time_per_iteration": 2.721222162246704 }, { "auxiliary_loss_clip": 0.01163215, "auxiliary_loss_mlp": 0.01028208, "balance_loss_clip": 1.0475812, "balance_loss_mlp": 1.02011371, "epoch": 0.5629772139722239, "flos": 13762082165760.0, "grad_norm": 2.692766796325648, "language_loss": 0.81304324, "learning_rate": 1.6904398463776393e-06, "loss": 0.83495742, "num_input_tokens_seen": 101062745, "step": 4682, "time_per_iteration": 2.585064649581909 }, { "auxiliary_loss_clip": 0.01165716, "auxiliary_loss_mlp": 0.01027238, "balance_loss_clip": 1.04733729, "balance_loss_mlp": 1.01951039, "epoch": 0.5630974568628629, "flos": 21467830026240.0, "grad_norm": 1.986637955573854, "language_loss": 0.72595185, "learning_rate": 1.6896702856560683e-06, "loss": 0.74788141, "num_input_tokens_seen": 101081840, "step": 4683, "time_per_iteration": 2.6382105350494385 }, { "auxiliary_loss_clip": 0.01132419, "auxiliary_loss_mlp": 0.0102635, "balance_loss_clip": 1.04207706, "balance_loss_mlp": 1.01798153, "epoch": 0.5632176997535021, "flos": 14245907385600.0, "grad_norm": 2.969966718035667, "language_loss": 0.69508636, "learning_rate": 1.6889007720112677e-06, "loss": 0.71667409, "num_input_tokens_seen": 101099585, "step": 4684, "time_per_iteration": 2.649993419647217 }, { "auxiliary_loss_clip": 0.01168635, "auxiliary_loss_mlp": 0.01028722, "balance_loss_clip": 1.05064785, "balance_loss_mlp": 1.02104497, "epoch": 0.5633379426441412, "flos": 20812244947200.0, "grad_norm": 1.6983158852140172, "language_loss": 0.7726289, "learning_rate": 1.6881313055599734e-06, "loss": 0.79460239, "num_input_tokens_seen": 101119515, "step": 4685, "time_per_iteration": 2.6453113555908203 }, { "auxiliary_loss_clip": 0.01136038, "auxiliary_loss_mlp": 0.01028974, "balance_loss_clip": 1.04270315, "balance_loss_mlp": 1.02059317, "epoch": 0.5634581855347802, "flos": 22600883617920.0, "grad_norm": 2.6256302771073945, "language_loss": 0.82396293, "learning_rate": 1.6873618864189117e-06, "loss": 0.845613, "num_input_tokens_seen": 101135285, "step": 4686, "time_per_iteration": 2.67458438873291 }, { "auxiliary_loss_clip": 0.01165091, "auxiliary_loss_mlp": 0.01028278, "balance_loss_clip": 1.04937327, "balance_loss_mlp": 1.02001119, "epoch": 0.5635784284254194, "flos": 21506972872320.0, "grad_norm": 2.1970487918841926, "language_loss": 0.78234249, "learning_rate": 1.686592514704803e-06, "loss": 0.80427617, "num_input_tokens_seen": 101152680, "step": 4687, "time_per_iteration": 2.6899025440216064 }, { "auxiliary_loss_clip": 0.01153591, "auxiliary_loss_mlp": 0.01024617, "balance_loss_clip": 1.05143762, "balance_loss_mlp": 1.01756549, "epoch": 0.5636986713160584, "flos": 19827466698240.0, "grad_norm": 2.528030433191082, "language_loss": 0.71102542, "learning_rate": 1.685823190534361e-06, "loss": 0.73280746, "num_input_tokens_seen": 101170920, "step": 4688, "time_per_iteration": 2.645359516143799 }, { "auxiliary_loss_clip": 0.01184426, "auxiliary_loss_mlp": 0.01030629, "balance_loss_clip": 1.05192685, "balance_loss_mlp": 1.02236223, "epoch": 0.5638189142066975, "flos": 19792453916160.0, "grad_norm": 2.2168338892957564, "language_loss": 0.83887362, "learning_rate": 1.6850539140242907e-06, "loss": 0.8610242, "num_input_tokens_seen": 101190180, "step": 4689, "time_per_iteration": 2.644681692123413 }, { "auxiliary_loss_clip": 0.01168624, "auxiliary_loss_mlp": 0.01027291, "balance_loss_clip": 1.04818666, "balance_loss_mlp": 1.01949453, "epoch": 0.5639391570973367, "flos": 22893771116160.0, "grad_norm": 2.1056113559202476, "language_loss": 0.82170987, "learning_rate": 1.684284685291292e-06, "loss": 0.84366906, "num_input_tokens_seen": 101211825, "step": 4690, "time_per_iteration": 2.6609227657318115 }, { "auxiliary_loss_clip": 0.01179598, "auxiliary_loss_mlp": 0.01031688, "balance_loss_clip": 1.05088603, "balance_loss_mlp": 1.02232432, "epoch": 0.5640593999879757, "flos": 23727077712000.0, "grad_norm": 4.096138194442154, "language_loss": 0.81575257, "learning_rate": 1.683515504452055e-06, "loss": 0.83786547, "num_input_tokens_seen": 101229200, "step": 4691, "time_per_iteration": 3.6334943771362305 }, { "auxiliary_loss_clip": 0.01128929, "auxiliary_loss_mlp": 0.0103062, "balance_loss_clip": 1.04468203, "balance_loss_mlp": 1.02231741, "epoch": 0.5641796428786148, "flos": 22710123855360.0, "grad_norm": 1.6069116922170086, "language_loss": 0.66375524, "learning_rate": 1.6827463716232648e-06, "loss": 0.68535072, "num_input_tokens_seen": 101249860, "step": 4692, "time_per_iteration": 3.822901725769043 }, { "auxiliary_loss_clip": 0.01166748, "auxiliary_loss_mlp": 0.00762802, "balance_loss_clip": 1.04976726, "balance_loss_mlp": 1.0003252, "epoch": 0.5642998857692539, "flos": 19791987039360.0, "grad_norm": 1.9293441985654194, "language_loss": 0.75805914, "learning_rate": 1.6819772869215972e-06, "loss": 0.7773546, "num_input_tokens_seen": 101268940, "step": 4693, "time_per_iteration": 3.490962266921997 }, { "auxiliary_loss_clip": 0.01158255, "auxiliary_loss_mlp": 0.01028615, "balance_loss_clip": 1.04995346, "balance_loss_mlp": 1.02074718, "epoch": 0.564420128659893, "flos": 23185904428800.0, "grad_norm": 1.7077748959889714, "language_loss": 0.81998897, "learning_rate": 1.6812082504637228e-06, "loss": 0.84185767, "num_input_tokens_seen": 101290260, "step": 4694, "time_per_iteration": 2.6713359355926514 }, { "auxiliary_loss_clip": 0.01164338, "auxiliary_loss_mlp": 0.01029894, "balance_loss_clip": 1.05193126, "balance_loss_mlp": 1.02176952, "epoch": 0.564540371550532, "flos": 23258264376960.0, "grad_norm": 1.5772503225008376, "language_loss": 0.74576783, "learning_rate": 1.6804392623663025e-06, "loss": 0.76771009, "num_input_tokens_seen": 101311465, "step": 4695, "time_per_iteration": 3.593247413635254 }, { "auxiliary_loss_clip": 0.0116098, "auxiliary_loss_mlp": 0.01026545, "balance_loss_clip": 1.04912269, "balance_loss_mlp": 1.01896024, "epoch": 0.5646606144411712, "flos": 25010058672000.0, "grad_norm": 1.965626749814175, "language_loss": 0.78352535, "learning_rate": 1.6796703227459935e-06, "loss": 0.80540067, "num_input_tokens_seen": 101329420, "step": 4696, "time_per_iteration": 2.690711498260498 }, { "auxiliary_loss_clip": 0.01111703, "auxiliary_loss_mlp": 0.01027193, "balance_loss_clip": 1.04140484, "balance_loss_mlp": 1.02009428, "epoch": 0.5647808573318103, "flos": 36539645806080.0, "grad_norm": 2.0474080920370437, "language_loss": 0.76008427, "learning_rate": 1.6789014317194407e-06, "loss": 0.78147316, "num_input_tokens_seen": 101350900, "step": 4697, "time_per_iteration": 2.8789632320404053 }, { "auxiliary_loss_clip": 0.01158149, "auxiliary_loss_mlp": 0.01030813, "balance_loss_clip": 1.04880834, "balance_loss_mlp": 1.02274847, "epoch": 0.5649011002224493, "flos": 22528451842560.0, "grad_norm": 2.4050864590594156, "language_loss": 0.7240926, "learning_rate": 1.6781325894032853e-06, "loss": 0.74598217, "num_input_tokens_seen": 101369860, "step": 4698, "time_per_iteration": 2.6996679306030273 }, { "auxiliary_loss_clip": 0.01147108, "auxiliary_loss_mlp": 0.01029682, "balance_loss_clip": 1.04969847, "balance_loss_mlp": 1.02244353, "epoch": 0.5650213431130885, "flos": 18515147304960.0, "grad_norm": 1.8970440021676518, "language_loss": 0.91751039, "learning_rate": 1.6773637959141608e-06, "loss": 0.9392783, "num_input_tokens_seen": 101386835, "step": 4699, "time_per_iteration": 2.67732572555542 }, { "auxiliary_loss_clip": 0.01141069, "auxiliary_loss_mlp": 0.01024321, "balance_loss_clip": 1.04484487, "balance_loss_mlp": 1.01675081, "epoch": 0.5651415860037275, "flos": 17526310819200.0, "grad_norm": 2.002864459168587, "language_loss": 0.66388977, "learning_rate": 1.6765950513686915e-06, "loss": 0.68554366, "num_input_tokens_seen": 101404945, "step": 4700, "time_per_iteration": 2.657261848449707 }, { "auxiliary_loss_clip": 0.01123596, "auxiliary_loss_mlp": 0.01028952, "balance_loss_clip": 1.04238117, "balance_loss_mlp": 1.02044058, "epoch": 0.5652618288943666, "flos": 25520026014720.0, "grad_norm": 2.2199005516160755, "language_loss": 0.76225734, "learning_rate": 1.675826355883496e-06, "loss": 0.78378284, "num_input_tokens_seen": 101424160, "step": 4701, "time_per_iteration": 2.753084182739258 }, { "auxiliary_loss_clip": 0.0114633, "auxiliary_loss_mlp": 0.01028191, "balance_loss_clip": 1.04971194, "balance_loss_mlp": 1.02059126, "epoch": 0.5653820717850057, "flos": 19683105937920.0, "grad_norm": 2.0392810157581414, "language_loss": 0.792328, "learning_rate": 1.6750577095751848e-06, "loss": 0.8140732, "num_input_tokens_seen": 101443270, "step": 4702, "time_per_iteration": 2.7329561710357666 }, { "auxiliary_loss_clip": 0.01178658, "auxiliary_loss_mlp": 0.01025083, "balance_loss_clip": 1.05198693, "balance_loss_mlp": 1.01773429, "epoch": 0.5655023146756448, "flos": 26979722910720.0, "grad_norm": 1.8156720423021762, "language_loss": 0.72827089, "learning_rate": 1.6742891125603605e-06, "loss": 0.75030828, "num_input_tokens_seen": 101464175, "step": 4703, "time_per_iteration": 2.6545629501342773 }, { "auxiliary_loss_clip": 0.01165895, "auxiliary_loss_mlp": 0.01025287, "balance_loss_clip": 1.05004811, "balance_loss_mlp": 1.01667428, "epoch": 0.5656225575662839, "flos": 27669351104640.0, "grad_norm": 2.2336567512951695, "language_loss": 0.71963716, "learning_rate": 1.6735205649556185e-06, "loss": 0.74154902, "num_input_tokens_seen": 101484045, "step": 4704, "time_per_iteration": 2.7313594818115234 }, { "auxiliary_loss_clip": 0.01145256, "auxiliary_loss_mlp": 0.01024171, "balance_loss_clip": 1.05019951, "balance_loss_mlp": 1.01679826, "epoch": 0.5657428004569229, "flos": 24349732997760.0, "grad_norm": 1.5637498540681032, "language_loss": 0.85015428, "learning_rate": 1.6727520668775476e-06, "loss": 0.87184858, "num_input_tokens_seen": 101504330, "step": 4705, "time_per_iteration": 2.6997270584106445 }, { "auxiliary_loss_clip": 0.01181357, "auxiliary_loss_mlp": 0.01032041, "balance_loss_clip": 1.05035233, "balance_loss_mlp": 1.02394664, "epoch": 0.5658630433475621, "flos": 21944041562880.0, "grad_norm": 1.6281431111509017, "language_loss": 0.74962127, "learning_rate": 1.6719836184427275e-06, "loss": 0.77175528, "num_input_tokens_seen": 101524635, "step": 4706, "time_per_iteration": 2.674062967300415 }, { "auxiliary_loss_clip": 0.0114598, "auxiliary_loss_mlp": 0.01032561, "balance_loss_clip": 1.04451466, "balance_loss_mlp": 1.02433562, "epoch": 0.5659832862382012, "flos": 30409012218240.0, "grad_norm": 1.944619954142119, "language_loss": 0.64288545, "learning_rate": 1.671215219767733e-06, "loss": 0.66467083, "num_input_tokens_seen": 101544095, "step": 4707, "time_per_iteration": 2.6963918209075928 }, { "auxiliary_loss_clip": 0.01123936, "auxiliary_loss_mlp": 0.01027104, "balance_loss_clip": 1.04375541, "balance_loss_mlp": 1.01921797, "epoch": 0.5661035291288402, "flos": 13188194570880.0, "grad_norm": 2.6564815143361473, "language_loss": 0.76511264, "learning_rate": 1.670446870969127e-06, "loss": 0.786623, "num_input_tokens_seen": 101561760, "step": 4708, "time_per_iteration": 2.776554584503174 }, { "auxiliary_loss_clip": 0.0115417, "auxiliary_loss_mlp": 0.01026271, "balance_loss_clip": 1.04836178, "balance_loss_mlp": 1.01842988, "epoch": 0.5662237720194794, "flos": 16143032108160.0, "grad_norm": 2.021039342406061, "language_loss": 0.7978307, "learning_rate": 1.6696785721634685e-06, "loss": 0.81963515, "num_input_tokens_seen": 101576245, "step": 4709, "time_per_iteration": 2.6210153102874756 }, { "auxiliary_loss_clip": 0.01169614, "auxiliary_loss_mlp": 0.01029343, "balance_loss_clip": 1.05140567, "balance_loss_mlp": 1.02106953, "epoch": 0.5663440149101184, "flos": 17676848718720.0, "grad_norm": 7.392332701143294, "language_loss": 0.7351824, "learning_rate": 1.6689103234673086e-06, "loss": 0.75717199, "num_input_tokens_seen": 101594565, "step": 4710, "time_per_iteration": 2.618055820465088 }, { "auxiliary_loss_clip": 0.01151504, "auxiliary_loss_mlp": 0.01028069, "balance_loss_clip": 1.05026877, "balance_loss_mlp": 1.02013588, "epoch": 0.5664642578007575, "flos": 23368330627200.0, "grad_norm": 2.4630765834242996, "language_loss": 0.77050215, "learning_rate": 1.668142124997189e-06, "loss": 0.7922979, "num_input_tokens_seen": 101614225, "step": 4711, "time_per_iteration": 2.709811210632324 }, { "auxiliary_loss_clip": 0.01045409, "auxiliary_loss_mlp": 0.01005647, "balance_loss_clip": 1.01370883, "balance_loss_mlp": 1.00416303, "epoch": 0.5665845006913967, "flos": 65516470945920.0, "grad_norm": 0.7493948026478168, "language_loss": 0.59820884, "learning_rate": 1.6673739768696453e-06, "loss": 0.6187194, "num_input_tokens_seen": 101680795, "step": 4712, "time_per_iteration": 3.253933906555176 }, { "auxiliary_loss_clip": 0.01157574, "auxiliary_loss_mlp": 0.01027588, "balance_loss_clip": 1.04814649, "balance_loss_mlp": 1.01924896, "epoch": 0.5667047435820357, "flos": 26140885620480.0, "grad_norm": 1.780029599441437, "language_loss": 0.77186775, "learning_rate": 1.6666058792012052e-06, "loss": 0.79371935, "num_input_tokens_seen": 101701680, "step": 4713, "time_per_iteration": 2.714496612548828 }, { "auxiliary_loss_clip": 0.01062585, "auxiliary_loss_mlp": 0.01002736, "balance_loss_clip": 1.01085687, "balance_loss_mlp": 1.00138903, "epoch": 0.5668249864726748, "flos": 71866949725440.0, "grad_norm": 0.8779696345571035, "language_loss": 0.68687618, "learning_rate": 1.6658378321083878e-06, "loss": 0.70752943, "num_input_tokens_seen": 101766010, "step": 4714, "time_per_iteration": 3.24955677986145 }, { "auxiliary_loss_clip": 0.01112575, "auxiliary_loss_mlp": 0.0102566, "balance_loss_clip": 1.0449326, "balance_loss_mlp": 1.0174706, "epoch": 0.5669452293633139, "flos": 22195667312640.0, "grad_norm": 1.6704845474246885, "language_loss": 0.82267237, "learning_rate": 1.6650698357077055e-06, "loss": 0.8440547, "num_input_tokens_seen": 101783055, "step": 4715, "time_per_iteration": 2.9100146293640137 }, { "auxiliary_loss_clip": 0.01156, "auxiliary_loss_mlp": 0.01029063, "balance_loss_clip": 1.04740822, "balance_loss_mlp": 1.02086687, "epoch": 0.567065472253953, "flos": 18223193560320.0, "grad_norm": 3.245905905082006, "language_loss": 0.80695146, "learning_rate": 1.6643018901156632e-06, "loss": 0.82880211, "num_input_tokens_seen": 101802150, "step": 4716, "time_per_iteration": 2.96390962600708 }, { "auxiliary_loss_clip": 0.01154132, "auxiliary_loss_mlp": 0.01023372, "balance_loss_clip": 1.04751921, "balance_loss_mlp": 1.01546848, "epoch": 0.567185715144592, "flos": 20371548983040.0, "grad_norm": 2.8571552292920397, "language_loss": 0.79456997, "learning_rate": 1.6635339954487566e-06, "loss": 0.81634498, "num_input_tokens_seen": 101818025, "step": 4717, "time_per_iteration": 3.651630163192749 }, { "auxiliary_loss_clip": 0.01156929, "auxiliary_loss_mlp": 0.01030936, "balance_loss_clip": 1.04945445, "balance_loss_mlp": 1.0224725, "epoch": 0.5673059580352312, "flos": 23221348174080.0, "grad_norm": 1.95336007079537, "language_loss": 0.82057697, "learning_rate": 1.6627661518234765e-06, "loss": 0.84245563, "num_input_tokens_seen": 101837280, "step": 4718, "time_per_iteration": 3.7306814193725586 }, { "auxiliary_loss_clip": 0.01126826, "auxiliary_loss_mlp": 0.01026736, "balance_loss_clip": 1.04767931, "balance_loss_mlp": 1.01854038, "epoch": 0.5674262009258703, "flos": 21719599430400.0, "grad_norm": 1.6202535056846814, "language_loss": 0.85536981, "learning_rate": 1.661998359356302e-06, "loss": 0.87690544, "num_input_tokens_seen": 101856310, "step": 4719, "time_per_iteration": 3.6449577808380127 }, { "auxiliary_loss_clip": 0.01073219, "auxiliary_loss_mlp": 0.01001277, "balance_loss_clip": 1.01264477, "balance_loss_mlp": 0.99991763, "epoch": 0.5675464438165093, "flos": 67470369114240.0, "grad_norm": 0.7518157801377123, "language_loss": 0.55733144, "learning_rate": 1.6612306181637077e-06, "loss": 0.57807636, "num_input_tokens_seen": 101915635, "step": 4720, "time_per_iteration": 3.1676645278930664 }, { "auxiliary_loss_clip": 0.01137225, "auxiliary_loss_mlp": 0.01029907, "balance_loss_clip": 1.04608965, "balance_loss_mlp": 1.02264094, "epoch": 0.5676666867071485, "flos": 18879173688960.0, "grad_norm": 2.9805401531988096, "language_loss": 0.65901518, "learning_rate": 1.6604629283621598e-06, "loss": 0.68068647, "num_input_tokens_seen": 101933565, "step": 4721, "time_per_iteration": 3.547450065612793 }, { "auxiliary_loss_clip": 0.01184287, "auxiliary_loss_mlp": 0.01028436, "balance_loss_clip": 1.0530616, "balance_loss_mlp": 1.01973343, "epoch": 0.5677869295977875, "flos": 33546778744320.0, "grad_norm": 1.9998568794351688, "language_loss": 0.74333549, "learning_rate": 1.6596952900681152e-06, "loss": 0.7654627, "num_input_tokens_seen": 101954325, "step": 4722, "time_per_iteration": 2.729280948638916 }, { "auxiliary_loss_clip": 0.01115595, "auxiliary_loss_mlp": 0.01032343, "balance_loss_clip": 1.04742289, "balance_loss_mlp": 1.02336669, "epoch": 0.5679071724884266, "flos": 28037256157440.0, "grad_norm": 2.2360990370454807, "language_loss": 0.82481933, "learning_rate": 1.658927703398025e-06, "loss": 0.84629869, "num_input_tokens_seen": 101974390, "step": 4723, "time_per_iteration": 2.762680768966675 }, { "auxiliary_loss_clip": 0.01120654, "auxiliary_loss_mlp": 0.01025621, "balance_loss_clip": 1.03929734, "balance_loss_mlp": 1.01755118, "epoch": 0.5680274153790658, "flos": 23550110380800.0, "grad_norm": 2.86813818670903, "language_loss": 0.77606392, "learning_rate": 1.6581601684683309e-06, "loss": 0.7975266, "num_input_tokens_seen": 101994815, "step": 4724, "time_per_iteration": 2.7953555583953857 }, { "auxiliary_loss_clip": 0.01167497, "auxiliary_loss_mlp": 0.01029347, "balance_loss_clip": 1.05139279, "balance_loss_mlp": 1.02108586, "epoch": 0.5681476582697048, "flos": 22455158140800.0, "grad_norm": 3.094482368370397, "language_loss": 0.68461072, "learning_rate": 1.6573926853954674e-06, "loss": 0.70657915, "num_input_tokens_seen": 102012400, "step": 4725, "time_per_iteration": 2.6643662452697754 }, { "auxiliary_loss_clip": 0.01146093, "auxiliary_loss_mlp": 0.01030442, "balance_loss_clip": 1.04643941, "balance_loss_mlp": 1.02320051, "epoch": 0.5682679011603439, "flos": 19536913584000.0, "grad_norm": 2.129182663745878, "language_loss": 0.83072746, "learning_rate": 1.6566252542958608e-06, "loss": 0.85249281, "num_input_tokens_seen": 102031900, "step": 4726, "time_per_iteration": 2.685662269592285 }, { "auxiliary_loss_clip": 0.01128263, "auxiliary_loss_mlp": 0.01032073, "balance_loss_clip": 1.0448916, "balance_loss_mlp": 1.02338552, "epoch": 0.568388144050983, "flos": 28765488493440.0, "grad_norm": 2.2135270746356164, "language_loss": 0.78581095, "learning_rate": 1.6558578752859305e-06, "loss": 0.80741423, "num_input_tokens_seen": 102050860, "step": 4727, "time_per_iteration": 2.758790969848633 }, { "auxiliary_loss_clip": 0.01135694, "auxiliary_loss_mlp": 0.01029122, "balance_loss_clip": 1.04406118, "balance_loss_mlp": 1.02028227, "epoch": 0.5685083869416221, "flos": 21209452519680.0, "grad_norm": 1.7668884400671239, "language_loss": 0.79134822, "learning_rate": 1.6550905484820865e-06, "loss": 0.81299645, "num_input_tokens_seen": 102069320, "step": 4728, "time_per_iteration": 2.741384744644165 }, { "auxiliary_loss_clip": 0.01182285, "auxiliary_loss_mlp": 0.01037926, "balance_loss_clip": 1.05184817, "balance_loss_mlp": 1.02904439, "epoch": 0.5686286298322611, "flos": 24827021942400.0, "grad_norm": 2.262928547137723, "language_loss": 0.78927761, "learning_rate": 1.6543232740007328e-06, "loss": 0.81147981, "num_input_tokens_seen": 102086435, "step": 4729, "time_per_iteration": 2.6308155059814453 }, { "auxiliary_loss_clip": 0.01169339, "auxiliary_loss_mlp": 0.01032598, "balance_loss_clip": 1.04993403, "balance_loss_mlp": 1.02462316, "epoch": 0.5687488727229003, "flos": 26615121909120.0, "grad_norm": 2.494375921299094, "language_loss": 0.66834134, "learning_rate": 1.653556051958263e-06, "loss": 0.69036072, "num_input_tokens_seen": 102106115, "step": 4730, "time_per_iteration": 2.70350980758667 }, { "auxiliary_loss_clip": 0.01092268, "auxiliary_loss_mlp": 0.01028582, "balance_loss_clip": 1.04137075, "balance_loss_mlp": 1.01976013, "epoch": 0.5688691156135394, "flos": 20808725414400.0, "grad_norm": 1.7114310408710192, "language_loss": 0.73470879, "learning_rate": 1.6527888824710642e-06, "loss": 0.75591731, "num_input_tokens_seen": 102125715, "step": 4731, "time_per_iteration": 2.721170425415039 }, { "auxiliary_loss_clip": 0.01126342, "auxiliary_loss_mlp": 0.01031462, "balance_loss_clip": 1.04130578, "balance_loss_mlp": 1.02306914, "epoch": 0.5689893585041784, "flos": 25880963829120.0, "grad_norm": 2.5488976194060267, "language_loss": 0.76722813, "learning_rate": 1.6520217656555166e-06, "loss": 0.7888062, "num_input_tokens_seen": 102145005, "step": 4732, "time_per_iteration": 2.8119709491729736 }, { "auxiliary_loss_clip": 0.01137748, "auxiliary_loss_mlp": 0.01024202, "balance_loss_clip": 1.04397845, "balance_loss_mlp": 1.0160538, "epoch": 0.5691096013948175, "flos": 23477463123840.0, "grad_norm": 1.7618986310024225, "language_loss": 0.70529413, "learning_rate": 1.65125470162799e-06, "loss": 0.72691369, "num_input_tokens_seen": 102165360, "step": 4733, "time_per_iteration": 2.7242531776428223 }, { "auxiliary_loss_clip": 0.01140142, "auxiliary_loss_mlp": 0.01034352, "balance_loss_clip": 1.04557419, "balance_loss_mlp": 1.02618647, "epoch": 0.5692298442854566, "flos": 18075600576000.0, "grad_norm": 2.2049772232491325, "language_loss": 0.69626254, "learning_rate": 1.6504876905048485e-06, "loss": 0.7180075, "num_input_tokens_seen": 102182320, "step": 4734, "time_per_iteration": 2.664064645767212 }, { "auxiliary_loss_clip": 0.01179993, "auxiliary_loss_mlp": 0.01028844, "balance_loss_clip": 1.05296361, "balance_loss_mlp": 1.02129853, "epoch": 0.5693500871760957, "flos": 23039317025280.0, "grad_norm": 1.6142362854811159, "language_loss": 0.7223109, "learning_rate": 1.6497207324024464e-06, "loss": 0.74439925, "num_input_tokens_seen": 102201220, "step": 4735, "time_per_iteration": 2.7004737854003906 }, { "auxiliary_loss_clip": 0.01158031, "auxiliary_loss_mlp": 0.01028243, "balance_loss_clip": 1.04732978, "balance_loss_mlp": 1.02052426, "epoch": 0.5694703300667348, "flos": 18989670902400.0, "grad_norm": 1.8028209091249696, "language_loss": 0.82902896, "learning_rate": 1.6489538274371305e-06, "loss": 0.85089177, "num_input_tokens_seen": 102219825, "step": 4736, "time_per_iteration": 2.6663053035736084 }, { "auxiliary_loss_clip": 0.01160869, "auxiliary_loss_mlp": 0.01030054, "balance_loss_clip": 1.05115271, "balance_loss_mlp": 1.02215064, "epoch": 0.5695905729573739, "flos": 21908705558400.0, "grad_norm": 1.874104928532494, "language_loss": 0.83129978, "learning_rate": 1.6481869757252396e-06, "loss": 0.85320902, "num_input_tokens_seen": 102238160, "step": 4737, "time_per_iteration": 2.688802719116211 }, { "auxiliary_loss_clip": 0.01165494, "auxiliary_loss_mlp": 0.01030232, "balance_loss_clip": 1.04988134, "balance_loss_mlp": 1.0221374, "epoch": 0.569710815848013, "flos": 28476659232000.0, "grad_norm": 1.4912227449595794, "language_loss": 0.71969545, "learning_rate": 1.647420177383105e-06, "loss": 0.74165267, "num_input_tokens_seen": 102261030, "step": 4738, "time_per_iteration": 2.6924633979797363 }, { "auxiliary_loss_clip": 0.01164544, "auxiliary_loss_mlp": 0.01028642, "balance_loss_clip": 1.05369902, "balance_loss_mlp": 1.0210309, "epoch": 0.569831058738652, "flos": 28366162018560.0, "grad_norm": 1.9963237409571895, "language_loss": 0.72659218, "learning_rate": 1.646653432527049e-06, "loss": 0.74852395, "num_input_tokens_seen": 102281670, "step": 4739, "time_per_iteration": 2.8014814853668213 }, { "auxiliary_loss_clip": 0.01138414, "auxiliary_loss_mlp": 0.01025861, "balance_loss_clip": 1.04756534, "balance_loss_mlp": 1.01833904, "epoch": 0.5699513016292912, "flos": 25849973370240.0, "grad_norm": 1.4916160629304287, "language_loss": 0.74416494, "learning_rate": 1.645886741273387e-06, "loss": 0.76580769, "num_input_tokens_seen": 102303485, "step": 4740, "time_per_iteration": 2.767712116241455 }, { "auxiliary_loss_clip": 0.01132245, "auxiliary_loss_mlp": 0.01024948, "balance_loss_clip": 1.04987788, "balance_loss_mlp": 1.01679444, "epoch": 0.5700715445199303, "flos": 18037858360320.0, "grad_norm": 1.9387046903319267, "language_loss": 0.73816633, "learning_rate": 1.645120103738424e-06, "loss": 0.75973827, "num_input_tokens_seen": 102320995, "step": 4741, "time_per_iteration": 2.8393783569335938 }, { "auxiliary_loss_clip": 0.01152744, "auxiliary_loss_mlp": 0.00762209, "balance_loss_clip": 1.04685116, "balance_loss_mlp": 1.00022197, "epoch": 0.5701917874105693, "flos": 11473352392320.0, "grad_norm": 2.292097819540539, "language_loss": 0.83757287, "learning_rate": 1.6443535200384591e-06, "loss": 0.85672241, "num_input_tokens_seen": 102339170, "step": 4742, "time_per_iteration": 2.6524109840393066 }, { "auxiliary_loss_clip": 0.01182585, "auxiliary_loss_mlp": 0.01027454, "balance_loss_clip": 1.05522537, "balance_loss_mlp": 1.01943731, "epoch": 0.5703120303012085, "flos": 21761759018880.0, "grad_norm": 1.893381218909576, "language_loss": 0.70646393, "learning_rate": 1.6435869902897827e-06, "loss": 0.72856438, "num_input_tokens_seen": 102357750, "step": 4743, "time_per_iteration": 3.5527892112731934 }, { "auxiliary_loss_clip": 0.01040169, "auxiliary_loss_mlp": 0.01001466, "balance_loss_clip": 1.01211977, "balance_loss_mlp": 1.00003552, "epoch": 0.5704322731918475, "flos": 56746258513920.0, "grad_norm": 1.034349178747778, "language_loss": 0.61949646, "learning_rate": 1.6428205146086764e-06, "loss": 0.63991284, "num_input_tokens_seen": 102419730, "step": 4744, "time_per_iteration": 5.237643718719482 }, { "auxiliary_loss_clip": 0.01155123, "auxiliary_loss_mlp": 0.01025617, "balance_loss_clip": 1.0472393, "balance_loss_mlp": 1.01790428, "epoch": 0.5705525160824866, "flos": 20741141975040.0, "grad_norm": 1.4586842549074204, "language_loss": 0.70616704, "learning_rate": 1.6420540931114142e-06, "loss": 0.72797447, "num_input_tokens_seen": 102440320, "step": 4745, "time_per_iteration": 2.735504388809204 }, { "auxiliary_loss_clip": 0.01151085, "auxiliary_loss_mlp": 0.01029578, "balance_loss_clip": 1.04777765, "balance_loss_mlp": 1.02171052, "epoch": 0.5706727589731257, "flos": 18771262254720.0, "grad_norm": 1.811355173018474, "language_loss": 0.79025048, "learning_rate": 1.6412877259142616e-06, "loss": 0.81205714, "num_input_tokens_seen": 102460240, "step": 4746, "time_per_iteration": 2.655766725540161 }, { "auxiliary_loss_clip": 0.0114691, "auxiliary_loss_mlp": 0.01032386, "balance_loss_clip": 1.04662859, "balance_loss_mlp": 1.0243516, "epoch": 0.5707930018637648, "flos": 27634733372160.0, "grad_norm": 2.0610969293286874, "language_loss": 0.73927337, "learning_rate": 1.6405214131334757e-06, "loss": 0.76106632, "num_input_tokens_seen": 102478765, "step": 4747, "time_per_iteration": 3.6808650493621826 }, { "auxiliary_loss_clip": 0.01120672, "auxiliary_loss_mlp": 0.01026471, "balance_loss_clip": 1.04742873, "balance_loss_mlp": 1.01894021, "epoch": 0.5709132447544039, "flos": 27597673514880.0, "grad_norm": 2.407921030969635, "language_loss": 0.79225737, "learning_rate": 1.6397551548853052e-06, "loss": 0.81372881, "num_input_tokens_seen": 102496930, "step": 4748, "time_per_iteration": 2.8126418590545654 }, { "auxiliary_loss_clip": 0.0114901, "auxiliary_loss_mlp": 0.01023376, "balance_loss_clip": 1.04649997, "balance_loss_mlp": 1.01516211, "epoch": 0.571033487645043, "flos": 21686095019520.0, "grad_norm": 1.958676945865462, "language_loss": 0.7090112, "learning_rate": 1.6389889512859917e-06, "loss": 0.73073506, "num_input_tokens_seen": 102516590, "step": 4749, "time_per_iteration": 2.731142520904541 }, { "auxiliary_loss_clip": 0.01050565, "auxiliary_loss_mlp": 0.01001315, "balance_loss_clip": 1.01193082, "balance_loss_mlp": 0.99999231, "epoch": 0.5711537305356821, "flos": 70181445980160.0, "grad_norm": 0.8110282473493834, "language_loss": 0.60302418, "learning_rate": 1.638222802451767e-06, "loss": 0.62354302, "num_input_tokens_seen": 102578070, "step": 4750, "time_per_iteration": 3.2817327976226807 }, { "auxiliary_loss_clip": 0.01159368, "auxiliary_loss_mlp": 0.01027421, "balance_loss_clip": 1.05030334, "balance_loss_mlp": 1.01977324, "epoch": 0.5712739734263211, "flos": 24717494396160.0, "grad_norm": 1.9329624921447397, "language_loss": 0.75452483, "learning_rate": 1.6374567084988561e-06, "loss": 0.77639276, "num_input_tokens_seen": 102599255, "step": 4751, "time_per_iteration": 2.7464630603790283 }, { "auxiliary_loss_clip": 0.0115799, "auxiliary_loss_mlp": 0.01028624, "balance_loss_clip": 1.05236816, "balance_loss_mlp": 1.02029753, "epoch": 0.5713942163169603, "flos": 26578169792640.0, "grad_norm": 1.8112156398654655, "language_loss": 0.76546997, "learning_rate": 1.6366906695434738e-06, "loss": 0.78733611, "num_input_tokens_seen": 102621775, "step": 4752, "time_per_iteration": 2.7545156478881836 }, { "auxiliary_loss_clip": 0.01163674, "auxiliary_loss_mlp": 0.0103046, "balance_loss_clip": 1.04930389, "balance_loss_mlp": 1.02249694, "epoch": 0.5715144592075994, "flos": 21142443697920.0, "grad_norm": 2.099414425886561, "language_loss": 0.85860556, "learning_rate": 1.6359246857018275e-06, "loss": 0.88054693, "num_input_tokens_seen": 102639305, "step": 4753, "time_per_iteration": 2.59777569770813 }, { "auxiliary_loss_clip": 0.01122212, "auxiliary_loss_mlp": 0.01026896, "balance_loss_clip": 1.04305625, "balance_loss_mlp": 1.01936245, "epoch": 0.5716347020982384, "flos": 23330265189120.0, "grad_norm": 2.28662688857944, "language_loss": 0.78246939, "learning_rate": 1.6351587570901178e-06, "loss": 0.8039605, "num_input_tokens_seen": 102659430, "step": 4754, "time_per_iteration": 2.753528118133545 }, { "auxiliary_loss_clip": 0.01138322, "auxiliary_loss_mlp": 0.01030386, "balance_loss_clip": 1.04862046, "balance_loss_mlp": 1.02186894, "epoch": 0.5717549449888776, "flos": 17009555806080.0, "grad_norm": 2.6946333470368207, "language_loss": 0.75951552, "learning_rate": 1.634392883824534e-06, "loss": 0.78120255, "num_input_tokens_seen": 102671430, "step": 4755, "time_per_iteration": 2.6603033542633057 }, { "auxiliary_loss_clip": 0.01127162, "auxiliary_loss_mlp": 0.01028153, "balance_loss_clip": 1.04544652, "balance_loss_mlp": 1.0203743, "epoch": 0.5718751878795166, "flos": 35518130922240.0, "grad_norm": 1.8117781874392749, "language_loss": 0.6793381, "learning_rate": 1.6336270660212595e-06, "loss": 0.70089126, "num_input_tokens_seen": 102693025, "step": 4756, "time_per_iteration": 2.896963357925415 }, { "auxiliary_loss_clip": 0.01152108, "auxiliary_loss_mlp": 0.01034287, "balance_loss_clip": 1.05393362, "balance_loss_mlp": 1.02626467, "epoch": 0.5719954307701557, "flos": 38613989255040.0, "grad_norm": 2.83015982457641, "language_loss": 0.66014016, "learning_rate": 1.6328613037964676e-06, "loss": 0.68200409, "num_input_tokens_seen": 102716090, "step": 4757, "time_per_iteration": 2.857815742492676 }, { "auxiliary_loss_clip": 0.0116443, "auxiliary_loss_mlp": 0.01024632, "balance_loss_clip": 1.04841745, "balance_loss_mlp": 1.01702058, "epoch": 0.5721156736607949, "flos": 20631111638400.0, "grad_norm": 1.8347617046797027, "language_loss": 0.68173903, "learning_rate": 1.6320955972663241e-06, "loss": 0.70362961, "num_input_tokens_seen": 102735685, "step": 4758, "time_per_iteration": 2.626655101776123 }, { "auxiliary_loss_clip": 0.01162618, "auxiliary_loss_mlp": 0.01026185, "balance_loss_clip": 1.0480268, "balance_loss_mlp": 1.01837707, "epoch": 0.5722359165514339, "flos": 37415076076800.0, "grad_norm": 2.4449492373539727, "language_loss": 0.65299791, "learning_rate": 1.6313299465469857e-06, "loss": 0.67488593, "num_input_tokens_seen": 102758415, "step": 4759, "time_per_iteration": 2.8303604125976562 }, { "auxiliary_loss_clip": 0.01160888, "auxiliary_loss_mlp": 0.01029202, "balance_loss_clip": 1.04828072, "balance_loss_mlp": 1.02062488, "epoch": 0.572356159442073, "flos": 21972877205760.0, "grad_norm": 2.6416889958032757, "language_loss": 0.79556584, "learning_rate": 1.6305643517546014e-06, "loss": 0.81746674, "num_input_tokens_seen": 102773795, "step": 4760, "time_per_iteration": 2.6117725372314453 }, { "auxiliary_loss_clip": 0.01179032, "auxiliary_loss_mlp": 0.01028901, "balance_loss_clip": 1.05169535, "balance_loss_mlp": 1.0212183, "epoch": 0.5724764023327121, "flos": 19135540033920.0, "grad_norm": 1.826157504749068, "language_loss": 0.84550393, "learning_rate": 1.629798813005311e-06, "loss": 0.86758327, "num_input_tokens_seen": 102793515, "step": 4761, "time_per_iteration": 2.6745831966400146 }, { "auxiliary_loss_clip": 0.01120616, "auxiliary_loss_mlp": 0.01025577, "balance_loss_clip": 1.04600906, "balance_loss_mlp": 1.0177536, "epoch": 0.5725966452233512, "flos": 22819759142400.0, "grad_norm": 1.9917075510248097, "language_loss": 0.70698607, "learning_rate": 1.6290333304152473e-06, "loss": 0.72844803, "num_input_tokens_seen": 102813390, "step": 4762, "time_per_iteration": 2.7662999629974365 }, { "auxiliary_loss_clip": 0.01147307, "auxiliary_loss_mlp": 0.01030145, "balance_loss_clip": 1.05144298, "balance_loss_mlp": 1.02205706, "epoch": 0.5727168881139902, "flos": 41496610498560.0, "grad_norm": 1.784370124125701, "language_loss": 0.57096016, "learning_rate": 1.6282679041005314e-06, "loss": 0.59273475, "num_input_tokens_seen": 102838980, "step": 4763, "time_per_iteration": 2.90012264251709 }, { "auxiliary_loss_clip": 0.01139696, "auxiliary_loss_mlp": 0.01027915, "balance_loss_clip": 1.04210019, "balance_loss_mlp": 1.01998758, "epoch": 0.5728371310046293, "flos": 14647675985280.0, "grad_norm": 2.26006617253784, "language_loss": 0.87213522, "learning_rate": 1.6275025341772789e-06, "loss": 0.89381135, "num_input_tokens_seen": 102855285, "step": 4764, "time_per_iteration": 2.6794021129608154 }, { "auxiliary_loss_clip": 0.01151278, "auxiliary_loss_mlp": 0.01036406, "balance_loss_clip": 1.04751873, "balance_loss_mlp": 1.02834797, "epoch": 0.5729573738952685, "flos": 21506613736320.0, "grad_norm": 2.4060288070597085, "language_loss": 0.81941235, "learning_rate": 1.626737220761596e-06, "loss": 0.84128922, "num_input_tokens_seen": 102872750, "step": 4765, "time_per_iteration": 2.6775851249694824 }, { "auxiliary_loss_clip": 0.01164314, "auxiliary_loss_mlp": 0.01026416, "balance_loss_clip": 1.05143964, "balance_loss_mlp": 1.01870871, "epoch": 0.5730776167859075, "flos": 23621680229760.0, "grad_norm": 3.092610066699434, "language_loss": 0.7868886, "learning_rate": 1.62597196396958e-06, "loss": 0.80879593, "num_input_tokens_seen": 102890920, "step": 4766, "time_per_iteration": 2.658482789993286 }, { "auxiliary_loss_clip": 0.01165042, "auxiliary_loss_mlp": 0.01028582, "balance_loss_clip": 1.05115998, "balance_loss_mlp": 1.02062476, "epoch": 0.5731978596765466, "flos": 25739224761600.0, "grad_norm": 1.9985190408509372, "language_loss": 0.8589375, "learning_rate": 1.6252067639173197e-06, "loss": 0.8808738, "num_input_tokens_seen": 102912830, "step": 4767, "time_per_iteration": 2.6694159507751465 }, { "auxiliary_loss_clip": 0.01168266, "auxiliary_loss_mlp": 0.01029013, "balance_loss_clip": 1.05068624, "balance_loss_mlp": 1.02142262, "epoch": 0.5733181025671857, "flos": 26359509749760.0, "grad_norm": 1.7799905302689805, "language_loss": 0.69406056, "learning_rate": 1.6244416207208956e-06, "loss": 0.71603334, "num_input_tokens_seen": 102933765, "step": 4768, "time_per_iteration": 2.755201816558838 }, { "auxiliary_loss_clip": 0.01137898, "auxiliary_loss_mlp": 0.01030892, "balance_loss_clip": 1.04786551, "balance_loss_mlp": 1.02242827, "epoch": 0.5734383454578248, "flos": 29423874833280.0, "grad_norm": 1.6698256192247165, "language_loss": 0.73371649, "learning_rate": 1.6236765344963787e-06, "loss": 0.75540447, "num_input_tokens_seen": 102955025, "step": 4769, "time_per_iteration": 3.7249560356140137 }, { "auxiliary_loss_clip": 0.01151921, "auxiliary_loss_mlp": 0.01032209, "balance_loss_clip": 1.04821837, "balance_loss_mlp": 1.02436543, "epoch": 0.5735585883484638, "flos": 34969954487040.0, "grad_norm": 2.177647406318903, "language_loss": 0.69201505, "learning_rate": 1.6229115053598322e-06, "loss": 0.71385646, "num_input_tokens_seen": 102976780, "step": 4770, "time_per_iteration": 3.709921360015869 }, { "auxiliary_loss_clip": 0.01165618, "auxiliary_loss_mlp": 0.01025208, "balance_loss_clip": 1.05149174, "balance_loss_mlp": 1.01732254, "epoch": 0.573678831239103, "flos": 18770759464320.0, "grad_norm": 2.221817620577284, "language_loss": 0.72385871, "learning_rate": 1.6221465334273108e-06, "loss": 0.745767, "num_input_tokens_seen": 102995990, "step": 4771, "time_per_iteration": 3.536027431488037 }, { "auxiliary_loss_clip": 0.01141759, "auxiliary_loss_mlp": 0.0102992, "balance_loss_clip": 1.04757071, "balance_loss_mlp": 1.02129507, "epoch": 0.5737990741297421, "flos": 25702883176320.0, "grad_norm": 1.9283236277087668, "language_loss": 0.61365873, "learning_rate": 1.6213816188148593e-06, "loss": 0.63537556, "num_input_tokens_seen": 103014695, "step": 4772, "time_per_iteration": 2.6991376876831055 }, { "auxiliary_loss_clip": 0.0114112, "auxiliary_loss_mlp": 0.01023175, "balance_loss_clip": 1.04793286, "balance_loss_mlp": 1.01563537, "epoch": 0.5739193170203811, "flos": 27269234530560.0, "grad_norm": 1.6764218683516148, "language_loss": 0.77354246, "learning_rate": 1.6206167616385162e-06, "loss": 0.79518539, "num_input_tokens_seen": 103035760, "step": 4773, "time_per_iteration": 3.6526262760162354 }, { "auxiliary_loss_clip": 0.01157894, "auxiliary_loss_mlp": 0.01028996, "balance_loss_clip": 1.05059326, "balance_loss_mlp": 1.02038932, "epoch": 0.5740395599110203, "flos": 12239721993600.0, "grad_norm": 2.095504114987213, "language_loss": 0.73755515, "learning_rate": 1.6198519620143078e-06, "loss": 0.75942397, "num_input_tokens_seen": 103052915, "step": 4774, "time_per_iteration": 2.7699227333068848 }, { "auxiliary_loss_clip": 0.01140627, "auxiliary_loss_mlp": 0.01030166, "balance_loss_clip": 1.04859126, "balance_loss_mlp": 1.02193499, "epoch": 0.5741598028016593, "flos": 25921399564800.0, "grad_norm": 1.5537592664489628, "language_loss": 0.78041452, "learning_rate": 1.6190872200582546e-06, "loss": 0.80212247, "num_input_tokens_seen": 103074655, "step": 4775, "time_per_iteration": 2.7176008224487305 }, { "auxiliary_loss_clip": 0.01144064, "auxiliary_loss_mlp": 0.00762969, "balance_loss_clip": 1.0453341, "balance_loss_mlp": 1.00039148, "epoch": 0.5742800456922984, "flos": 19244133826560.0, "grad_norm": 3.417168084251413, "language_loss": 0.78439486, "learning_rate": 1.6183225358863676e-06, "loss": 0.80346519, "num_input_tokens_seen": 103091550, "step": 4776, "time_per_iteration": 2.697296619415283 }, { "auxiliary_loss_clip": 0.01140934, "auxiliary_loss_mlp": 0.01028771, "balance_loss_clip": 1.04415965, "balance_loss_mlp": 1.02068865, "epoch": 0.5744002885829376, "flos": 30920487932160.0, "grad_norm": 2.095863713194777, "language_loss": 0.71682191, "learning_rate": 1.617557909614648e-06, "loss": 0.73851895, "num_input_tokens_seen": 103110985, "step": 4777, "time_per_iteration": 2.7670600414276123 }, { "auxiliary_loss_clip": 0.01134273, "auxiliary_loss_mlp": 0.01034729, "balance_loss_clip": 1.04433179, "balance_loss_mlp": 1.02717113, "epoch": 0.5745205314735766, "flos": 23840017050240.0, "grad_norm": 1.9315257643681496, "language_loss": 0.86138296, "learning_rate": 1.6167933413590899e-06, "loss": 0.88307303, "num_input_tokens_seen": 103129890, "step": 4778, "time_per_iteration": 2.758100986480713 }, { "auxiliary_loss_clip": 0.01164307, "auxiliary_loss_mlp": 0.01025369, "balance_loss_clip": 1.04871023, "balance_loss_mlp": 1.01729262, "epoch": 0.5746407743642157, "flos": 12311902373760.0, "grad_norm": 2.3543268769487913, "language_loss": 0.90495336, "learning_rate": 1.6160288312356773e-06, "loss": 0.92685014, "num_input_tokens_seen": 103147020, "step": 4779, "time_per_iteration": 2.584770679473877 }, { "auxiliary_loss_clip": 0.01167986, "auxiliary_loss_mlp": 0.01028317, "balance_loss_clip": 1.04870319, "balance_loss_mlp": 1.0207119, "epoch": 0.5747610172548548, "flos": 24133658734080.0, "grad_norm": 1.7234452507953941, "language_loss": 0.81762356, "learning_rate": 1.6152643793603857e-06, "loss": 0.83958656, "num_input_tokens_seen": 103167370, "step": 4780, "time_per_iteration": 2.7057905197143555 }, { "auxiliary_loss_clip": 0.01178458, "auxiliary_loss_mlp": 0.01031713, "balance_loss_clip": 1.05167627, "balance_loss_mlp": 1.02368414, "epoch": 0.5748812601454939, "flos": 25408451393280.0, "grad_norm": 1.7070949083966485, "language_loss": 0.87604463, "learning_rate": 1.6144999858491815e-06, "loss": 0.89814633, "num_input_tokens_seen": 103186000, "step": 4781, "time_per_iteration": 2.617354393005371 }, { "auxiliary_loss_clip": 0.01153906, "auxiliary_loss_mlp": 0.01021744, "balance_loss_clip": 1.04643893, "balance_loss_mlp": 1.01344109, "epoch": 0.575001503036133, "flos": 30624942827520.0, "grad_norm": 1.642124066481995, "language_loss": 0.85716397, "learning_rate": 1.6137356508180232e-06, "loss": 0.87892044, "num_input_tokens_seen": 103207710, "step": 4782, "time_per_iteration": 2.7888691425323486 }, { "auxiliary_loss_clip": 0.01176687, "auxiliary_loss_mlp": 0.00763158, "balance_loss_clip": 1.04846954, "balance_loss_mlp": 1.00030005, "epoch": 0.5751217459267721, "flos": 21726566668800.0, "grad_norm": 1.9522228511530828, "language_loss": 0.81645799, "learning_rate": 1.6129713743828593e-06, "loss": 0.83585644, "num_input_tokens_seen": 103226720, "step": 4783, "time_per_iteration": 2.636209726333618 }, { "auxiliary_loss_clip": 0.01149869, "auxiliary_loss_mlp": 0.01024267, "balance_loss_clip": 1.04473197, "balance_loss_mlp": 1.01631534, "epoch": 0.5752419888174112, "flos": 21651620941440.0, "grad_norm": 1.4487592488173378, "language_loss": 0.75577283, "learning_rate": 1.6122071566596306e-06, "loss": 0.77751422, "num_input_tokens_seen": 103246995, "step": 4784, "time_per_iteration": 2.7845652103424072 }, { "auxiliary_loss_clip": 0.01168928, "auxiliary_loss_mlp": 0.01026422, "balance_loss_clip": 1.05127776, "balance_loss_mlp": 1.01827431, "epoch": 0.5753622317080502, "flos": 17775997234560.0, "grad_norm": 2.2516838698757895, "language_loss": 0.83491778, "learning_rate": 1.6114429977642674e-06, "loss": 0.85687125, "num_input_tokens_seen": 103261500, "step": 4785, "time_per_iteration": 2.6625168323516846 }, { "auxiliary_loss_clip": 0.01165802, "auxiliary_loss_mlp": 0.01029312, "balance_loss_clip": 1.05305219, "balance_loss_mlp": 1.02105653, "epoch": 0.5754824745986894, "flos": 19789616741760.0, "grad_norm": 1.7282142493476607, "language_loss": 0.73689878, "learning_rate": 1.6106788978126926e-06, "loss": 0.75884992, "num_input_tokens_seen": 103280475, "step": 4786, "time_per_iteration": 2.656543493270874 }, { "auxiliary_loss_clip": 0.01116733, "auxiliary_loss_mlp": 0.01026896, "balance_loss_clip": 1.04239845, "balance_loss_mlp": 1.01893866, "epoch": 0.5756027174893285, "flos": 30985665160320.0, "grad_norm": 2.3024469624386543, "language_loss": 0.79120374, "learning_rate": 1.6099148569208196e-06, "loss": 0.81264007, "num_input_tokens_seen": 103297695, "step": 4787, "time_per_iteration": 2.88704252243042 }, { "auxiliary_loss_clip": 0.0115041, "auxiliary_loss_mlp": 0.01030075, "balance_loss_clip": 1.04919481, "balance_loss_mlp": 1.02194452, "epoch": 0.5757229603799675, "flos": 28546864364160.0, "grad_norm": 2.7075001635951264, "language_loss": 0.625754, "learning_rate": 1.6091508752045523e-06, "loss": 0.64755881, "num_input_tokens_seen": 103318575, "step": 4788, "time_per_iteration": 2.7798895835876465 }, { "auxiliary_loss_clip": 0.01126065, "auxiliary_loss_mlp": 0.01032161, "balance_loss_clip": 1.04167163, "balance_loss_mlp": 1.02381599, "epoch": 0.5758432032706067, "flos": 22999024944000.0, "grad_norm": 1.648430164990783, "language_loss": 0.86553705, "learning_rate": 1.608386952779787e-06, "loss": 0.88711929, "num_input_tokens_seen": 103337945, "step": 4789, "time_per_iteration": 2.839604616165161 }, { "auxiliary_loss_clip": 0.0115481, "auxiliary_loss_mlp": 0.01029522, "balance_loss_clip": 1.04789758, "balance_loss_mlp": 1.02136767, "epoch": 0.5759634461612457, "flos": 25739727552000.0, "grad_norm": 1.6721436864373276, "language_loss": 0.75079656, "learning_rate": 1.6076230897624098e-06, "loss": 0.77263981, "num_input_tokens_seen": 103360150, "step": 4790, "time_per_iteration": 2.770280361175537 }, { "auxiliary_loss_clip": 0.01165567, "auxiliary_loss_mlp": 0.01027046, "balance_loss_clip": 1.04670358, "balance_loss_mlp": 1.01929176, "epoch": 0.5760836890518848, "flos": 30591761639040.0, "grad_norm": 2.2616303685058328, "language_loss": 0.77408433, "learning_rate": 1.6068592862682974e-06, "loss": 0.79601049, "num_input_tokens_seen": 103378305, "step": 4791, "time_per_iteration": 2.7082903385162354 }, { "auxiliary_loss_clip": 0.01151264, "auxiliary_loss_mlp": 0.01030546, "balance_loss_clip": 1.04694104, "balance_loss_mlp": 1.02296388, "epoch": 0.576203931942524, "flos": 36538963447680.0, "grad_norm": 1.7701982789470414, "language_loss": 0.73684037, "learning_rate": 1.6060955424133187e-06, "loss": 0.75865847, "num_input_tokens_seen": 103399230, "step": 4792, "time_per_iteration": 2.9551002979278564 }, { "auxiliary_loss_clip": 0.01164441, "auxiliary_loss_mlp": 0.01027504, "balance_loss_clip": 1.0506537, "balance_loss_mlp": 1.01914167, "epoch": 0.576324174833163, "flos": 25516937445120.0, "grad_norm": 1.6864632674856155, "language_loss": 0.89452451, "learning_rate": 1.6053318583133332e-06, "loss": 0.91644394, "num_input_tokens_seen": 103420100, "step": 4793, "time_per_iteration": 2.668419599533081 }, { "auxiliary_loss_clip": 0.01162224, "auxiliary_loss_mlp": 0.01026347, "balance_loss_clip": 1.0489701, "balance_loss_mlp": 1.01864028, "epoch": 0.5764444177238021, "flos": 25119262995840.0, "grad_norm": 2.2683321821463682, "language_loss": 0.74831522, "learning_rate": 1.6045682340841907e-06, "loss": 0.77020091, "num_input_tokens_seen": 103439025, "step": 4794, "time_per_iteration": 2.7149057388305664 }, { "auxiliary_loss_clip": 0.01041976, "auxiliary_loss_mlp": 0.00754358, "balance_loss_clip": 1.0117383, "balance_loss_mlp": 1.00044274, "epoch": 0.5765646606144411, "flos": 62212687758720.0, "grad_norm": 0.7519051442805301, "language_loss": 0.57912672, "learning_rate": 1.6038046698417336e-06, "loss": 0.59709007, "num_input_tokens_seen": 103499920, "step": 4795, "time_per_iteration": 4.166090726852417 }, { "auxiliary_loss_clip": 0.01164881, "auxiliary_loss_mlp": 0.01025988, "balance_loss_clip": 1.04835653, "balance_loss_mlp": 1.01820421, "epoch": 0.5766849035050803, "flos": 25118760205440.0, "grad_norm": 2.6579629314622926, "language_loss": 0.69013107, "learning_rate": 1.6030411657017919e-06, "loss": 0.71203971, "num_input_tokens_seen": 103519575, "step": 4796, "time_per_iteration": 4.642420291900635 }, { "auxiliary_loss_clip": 0.01155057, "auxiliary_loss_mlp": 0.01027465, "balance_loss_clip": 1.04710126, "balance_loss_mlp": 1.01986504, "epoch": 0.5768051463957193, "flos": 15991093578240.0, "grad_norm": 1.781648522493346, "language_loss": 0.84323764, "learning_rate": 1.6022777217801903e-06, "loss": 0.86506283, "num_input_tokens_seen": 103536530, "step": 4797, "time_per_iteration": 2.662346363067627 }, { "auxiliary_loss_clip": 0.0113691, "auxiliary_loss_mlp": 0.01030625, "balance_loss_clip": 1.0493927, "balance_loss_mlp": 1.02257288, "epoch": 0.5769253892863584, "flos": 22163635359360.0, "grad_norm": 1.9763608687826577, "language_loss": 0.73670685, "learning_rate": 1.601514338192742e-06, "loss": 0.7583822, "num_input_tokens_seen": 103556460, "step": 4798, "time_per_iteration": 2.889357805252075 }, { "auxiliary_loss_clip": 0.01175207, "auxiliary_loss_mlp": 0.01023944, "balance_loss_clip": 1.04895258, "balance_loss_mlp": 1.01691628, "epoch": 0.5770456321769976, "flos": 22856388036480.0, "grad_norm": 2.0093607511826237, "language_loss": 0.71475172, "learning_rate": 1.6007510150552514e-06, "loss": 0.73674327, "num_input_tokens_seen": 103574520, "step": 4799, "time_per_iteration": 3.549248695373535 }, { "auxiliary_loss_clip": 0.01169663, "auxiliary_loss_mlp": 0.01032923, "balance_loss_clip": 1.04862952, "balance_loss_mlp": 1.02482247, "epoch": 0.5771658750676366, "flos": 46353672489600.0, "grad_norm": 2.357252598134425, "language_loss": 0.62375629, "learning_rate": 1.599987752483515e-06, "loss": 0.64578223, "num_input_tokens_seen": 103598965, "step": 4800, "time_per_iteration": 2.9047908782958984 }, { "auxiliary_loss_clip": 0.01129376, "auxiliary_loss_mlp": 0.01024747, "balance_loss_clip": 1.04389632, "balance_loss_mlp": 1.01680768, "epoch": 0.5772861179582757, "flos": 22159972172160.0, "grad_norm": 1.606018328135046, "language_loss": 0.68362385, "learning_rate": 1.5992245505933184e-06, "loss": 0.70516515, "num_input_tokens_seen": 103618665, "step": 4801, "time_per_iteration": 2.7833058834075928 }, { "auxiliary_loss_clip": 0.01179653, "auxiliary_loss_mlp": 0.01028443, "balance_loss_clip": 1.05195022, "balance_loss_mlp": 1.02071786, "epoch": 0.5774063608489148, "flos": 31248926916480.0, "grad_norm": 1.9964211535834067, "language_loss": 0.71462095, "learning_rate": 1.5984614095004388e-06, "loss": 0.73670197, "num_input_tokens_seen": 103639800, "step": 4802, "time_per_iteration": 2.7912344932556152 }, { "auxiliary_loss_clip": 0.01158333, "auxiliary_loss_mlp": 0.01030387, "balance_loss_clip": 1.04756069, "balance_loss_mlp": 1.02209592, "epoch": 0.5775266037395539, "flos": 22527123039360.0, "grad_norm": 2.2694279212731385, "language_loss": 0.8096993, "learning_rate": 1.5976983293206438e-06, "loss": 0.83158648, "num_input_tokens_seen": 103655605, "step": 4803, "time_per_iteration": 2.6690542697906494 }, { "auxiliary_loss_clip": 0.01143855, "auxiliary_loss_mlp": 0.01025673, "balance_loss_clip": 1.0429554, "balance_loss_mlp": 1.01790023, "epoch": 0.577646846630193, "flos": 21068790860160.0, "grad_norm": 1.837381740565245, "language_loss": 0.71404636, "learning_rate": 1.5969353101696928e-06, "loss": 0.73574162, "num_input_tokens_seen": 103674045, "step": 4804, "time_per_iteration": 2.7176706790924072 }, { "auxiliary_loss_clip": 0.01163037, "auxiliary_loss_mlp": 0.01025682, "balance_loss_clip": 1.04773378, "balance_loss_mlp": 1.01810324, "epoch": 0.5777670895208321, "flos": 29714284293120.0, "grad_norm": 3.610257481967702, "language_loss": 0.79829144, "learning_rate": 1.5961723521633341e-06, "loss": 0.82017857, "num_input_tokens_seen": 103695285, "step": 4805, "time_per_iteration": 2.842996835708618 }, { "auxiliary_loss_clip": 0.0114601, "auxiliary_loss_mlp": 0.01021914, "balance_loss_clip": 1.04530549, "balance_loss_mlp": 1.01463056, "epoch": 0.5778873324114712, "flos": 19500428344320.0, "grad_norm": 2.3407323841831826, "language_loss": 0.9093622, "learning_rate": 1.5954094554173097e-06, "loss": 0.93104148, "num_input_tokens_seen": 103713275, "step": 4806, "time_per_iteration": 2.729781150817871 }, { "auxiliary_loss_clip": 0.01155824, "auxiliary_loss_mlp": 0.01028364, "balance_loss_clip": 1.04847491, "balance_loss_mlp": 1.0211246, "epoch": 0.5780075753021102, "flos": 14136846716160.0, "grad_norm": 2.098557728275986, "language_loss": 0.79145223, "learning_rate": 1.5946466200473482e-06, "loss": 0.81329411, "num_input_tokens_seen": 103731185, "step": 4807, "time_per_iteration": 2.7304961681365967 }, { "auxiliary_loss_clip": 0.01154822, "auxiliary_loss_mlp": 0.01025423, "balance_loss_clip": 1.04680276, "balance_loss_mlp": 1.01835942, "epoch": 0.5781278181927494, "flos": 15262178883840.0, "grad_norm": 1.7911139202453887, "language_loss": 0.83283859, "learning_rate": 1.5938838461691723e-06, "loss": 0.85464102, "num_input_tokens_seen": 103748095, "step": 4808, "time_per_iteration": 2.645071506500244 }, { "auxiliary_loss_clip": 0.01178625, "auxiliary_loss_mlp": 0.0102844, "balance_loss_clip": 1.05026555, "balance_loss_mlp": 1.02057242, "epoch": 0.5782480610833884, "flos": 16726831856640.0, "grad_norm": 2.9820301336252872, "language_loss": 0.82881004, "learning_rate": 1.593121133898494e-06, "loss": 0.85088074, "num_input_tokens_seen": 103765300, "step": 4809, "time_per_iteration": 2.6404664516448975 }, { "auxiliary_loss_clip": 0.0117144, "auxiliary_loss_mlp": 0.01029424, "balance_loss_clip": 1.05091047, "balance_loss_mlp": 1.0213598, "epoch": 0.5783683039740275, "flos": 25482140144640.0, "grad_norm": 3.915205444579454, "language_loss": 0.7931546, "learning_rate": 1.592358483351016e-06, "loss": 0.8151632, "num_input_tokens_seen": 103785475, "step": 4810, "time_per_iteration": 2.7308287620544434 }, { "auxiliary_loss_clip": 0.01161339, "auxiliary_loss_mlp": 0.0102376, "balance_loss_clip": 1.04904461, "balance_loss_mlp": 1.01611841, "epoch": 0.5784885468646667, "flos": 18405835240320.0, "grad_norm": 1.8673580119218698, "language_loss": 0.72515088, "learning_rate": 1.5915958946424326e-06, "loss": 0.74700189, "num_input_tokens_seen": 103804160, "step": 4811, "time_per_iteration": 2.646707773208618 }, { "auxiliary_loss_clip": 0.01137858, "auxiliary_loss_mlp": 0.00763696, "balance_loss_clip": 1.04701853, "balance_loss_mlp": 1.00031853, "epoch": 0.5786087897553057, "flos": 46100717936640.0, "grad_norm": 1.7628471907730814, "language_loss": 0.74400818, "learning_rate": 1.5908333678884271e-06, "loss": 0.76302373, "num_input_tokens_seen": 103830580, "step": 4812, "time_per_iteration": 2.9720335006713867 }, { "auxiliary_loss_clip": 0.01165115, "auxiliary_loss_mlp": 0.01031334, "balance_loss_clip": 1.05095243, "balance_loss_mlp": 1.02342486, "epoch": 0.5787290326459448, "flos": 12385950261120.0, "grad_norm": 1.8009008410000176, "language_loss": 0.74025285, "learning_rate": 1.5900709032046743e-06, "loss": 0.7622174, "num_input_tokens_seen": 103848655, "step": 4813, "time_per_iteration": 2.715083122253418 }, { "auxiliary_loss_clip": 0.01147256, "auxiliary_loss_mlp": 0.01028773, "balance_loss_clip": 1.05154037, "balance_loss_mlp": 1.0210247, "epoch": 0.5788492755365839, "flos": 23290332243840.0, "grad_norm": 2.399356385328847, "language_loss": 0.7821824, "learning_rate": 1.5893085007068391e-06, "loss": 0.80394268, "num_input_tokens_seen": 103866215, "step": 4814, "time_per_iteration": 2.7025232315063477 }, { "auxiliary_loss_clip": 0.01136455, "auxiliary_loss_mlp": 0.01028521, "balance_loss_clip": 1.04385185, "balance_loss_mlp": 1.02038479, "epoch": 0.578969518427223, "flos": 24061047390720.0, "grad_norm": 1.9928238413938981, "language_loss": 0.71054566, "learning_rate": 1.5885461605105786e-06, "loss": 0.7321955, "num_input_tokens_seen": 103887815, "step": 4815, "time_per_iteration": 2.8755176067352295 }, { "auxiliary_loss_clip": 0.01152273, "auxiliary_loss_mlp": 0.01024204, "balance_loss_clip": 1.04823208, "balance_loss_mlp": 1.0157994, "epoch": 0.579089761317862, "flos": 21871825269120.0, "grad_norm": 1.8462444993773925, "language_loss": 0.76932299, "learning_rate": 1.5877838827315375e-06, "loss": 0.79108781, "num_input_tokens_seen": 103906360, "step": 4816, "time_per_iteration": 2.744380474090576 }, { "auxiliary_loss_clip": 0.0117663, "auxiliary_loss_mlp": 0.01027976, "balance_loss_clip": 1.05016804, "balance_loss_mlp": 1.01953578, "epoch": 0.5792100042085012, "flos": 22929681738240.0, "grad_norm": 2.745596647260364, "language_loss": 0.70155478, "learning_rate": 1.587021667485355e-06, "loss": 0.72360086, "num_input_tokens_seen": 103925730, "step": 4817, "time_per_iteration": 2.7295403480529785 }, { "auxiliary_loss_clip": 0.01150966, "auxiliary_loss_mlp": 0.01029794, "balance_loss_clip": 1.04439664, "balance_loss_mlp": 1.02252245, "epoch": 0.5793302470991403, "flos": 21470056669440.0, "grad_norm": 1.7999477665836339, "language_loss": 0.7851429, "learning_rate": 1.5862595148876559e-06, "loss": 0.80695045, "num_input_tokens_seen": 103945835, "step": 4818, "time_per_iteration": 2.769815444946289 }, { "auxiliary_loss_clip": 0.01120777, "auxiliary_loss_mlp": 0.01026441, "balance_loss_clip": 1.04301167, "balance_loss_mlp": 1.01849258, "epoch": 0.5794504899897793, "flos": 12711013367040.0, "grad_norm": 2.1494615963444113, "language_loss": 0.76276273, "learning_rate": 1.58549742505406e-06, "loss": 0.78423488, "num_input_tokens_seen": 103960580, "step": 4819, "time_per_iteration": 2.751699924468994 }, { "auxiliary_loss_clip": 0.01178647, "auxiliary_loss_mlp": 0.01027829, "balance_loss_clip": 1.05050313, "balance_loss_mlp": 1.01969278, "epoch": 0.5795707328804185, "flos": 14867054300160.0, "grad_norm": 2.1074461119388865, "language_loss": 0.75822258, "learning_rate": 1.5847353981001747e-06, "loss": 0.78028733, "num_input_tokens_seen": 103977760, "step": 4820, "time_per_iteration": 2.610646963119507 }, { "auxiliary_loss_clip": 0.01143103, "auxiliary_loss_mlp": 0.01024807, "balance_loss_clip": 1.04395545, "balance_loss_mlp": 1.01694536, "epoch": 0.5796909757710575, "flos": 36430046432640.0, "grad_norm": 1.8129513760307927, "language_loss": 0.69826722, "learning_rate": 1.5839734341415993e-06, "loss": 0.71994638, "num_input_tokens_seen": 103999960, "step": 4821, "time_per_iteration": 3.7006351947784424 }, { "auxiliary_loss_clip": 0.01160454, "auxiliary_loss_mlp": 0.01032292, "balance_loss_clip": 1.05262184, "balance_loss_mlp": 1.02488327, "epoch": 0.5798112186616966, "flos": 23039891642880.0, "grad_norm": 1.8559977177210367, "language_loss": 0.76569676, "learning_rate": 1.5832115332939238e-06, "loss": 0.78762424, "num_input_tokens_seen": 104018400, "step": 4822, "time_per_iteration": 4.611956596374512 }, { "auxiliary_loss_clip": 0.01167165, "auxiliary_loss_mlp": 0.01029806, "balance_loss_clip": 1.05064332, "balance_loss_mlp": 1.02168822, "epoch": 0.5799314615523358, "flos": 16652604401280.0, "grad_norm": 2.245662271165546, "language_loss": 0.74971801, "learning_rate": 1.5824496956727272e-06, "loss": 0.77168775, "num_input_tokens_seen": 104035605, "step": 4823, "time_per_iteration": 2.710479736328125 }, { "auxiliary_loss_clip": 0.01149858, "auxiliary_loss_mlp": 0.01020901, "balance_loss_clip": 1.04832518, "balance_loss_mlp": 1.0136385, "epoch": 0.5800517044429748, "flos": 20485673470080.0, "grad_norm": 2.1425306122115217, "language_loss": 0.73133302, "learning_rate": 1.5816879213935797e-06, "loss": 0.75304061, "num_input_tokens_seen": 104054415, "step": 4824, "time_per_iteration": 2.694781541824341 }, { "auxiliary_loss_clip": 0.01160843, "auxiliary_loss_mlp": 0.01024574, "balance_loss_clip": 1.05021083, "balance_loss_mlp": 1.01734662, "epoch": 0.5801719473336139, "flos": 31538258968320.0, "grad_norm": 1.510273454970695, "language_loss": 0.79766512, "learning_rate": 1.5809262105720416e-06, "loss": 0.81951928, "num_input_tokens_seen": 104075455, "step": 4825, "time_per_iteration": 3.609889030456543 }, { "auxiliary_loss_clip": 0.01174474, "auxiliary_loss_mlp": 0.01024978, "balance_loss_clip": 1.04920864, "balance_loss_mlp": 1.0177542, "epoch": 0.580292190224253, "flos": 20375966355840.0, "grad_norm": 1.562397784236009, "language_loss": 0.79597008, "learning_rate": 1.5801645633236644e-06, "loss": 0.81796455, "num_input_tokens_seen": 104096440, "step": 4826, "time_per_iteration": 2.6447839736938477 }, { "auxiliary_loss_clip": 0.01141881, "auxiliary_loss_mlp": 0.01027988, "balance_loss_clip": 1.04517055, "balance_loss_mlp": 1.02096033, "epoch": 0.5804124331148921, "flos": 26615373304320.0, "grad_norm": 3.177937797559018, "language_loss": 0.77271628, "learning_rate": 1.579402979763989e-06, "loss": 0.794415, "num_input_tokens_seen": 104116775, "step": 4827, "time_per_iteration": 2.6867330074310303 }, { "auxiliary_loss_clip": 0.0112008, "auxiliary_loss_mlp": 0.01027708, "balance_loss_clip": 1.04607582, "balance_loss_mlp": 1.01976299, "epoch": 0.5805326760055312, "flos": 13478496289920.0, "grad_norm": 2.1589701146353546, "language_loss": 0.80955815, "learning_rate": 1.578641460008548e-06, "loss": 0.83103609, "num_input_tokens_seen": 104134510, "step": 4828, "time_per_iteration": 2.8279311656951904 }, { "auxiliary_loss_clip": 0.01161144, "auxiliary_loss_mlp": 0.01023436, "balance_loss_clip": 1.05047703, "balance_loss_mlp": 1.01594353, "epoch": 0.5806529188961702, "flos": 12091374823680.0, "grad_norm": 1.9776188956979768, "language_loss": 0.68005401, "learning_rate": 1.5778800041728613e-06, "loss": 0.70189977, "num_input_tokens_seen": 104150800, "step": 4829, "time_per_iteration": 2.6512575149536133 }, { "auxiliary_loss_clip": 0.01156771, "auxiliary_loss_mlp": 0.01028171, "balance_loss_clip": 1.04892397, "balance_loss_mlp": 1.02069616, "epoch": 0.5807731617868094, "flos": 26214107495040.0, "grad_norm": 1.5685985644723586, "language_loss": 0.66465056, "learning_rate": 1.577118612372443e-06, "loss": 0.68649995, "num_input_tokens_seen": 104172640, "step": 4830, "time_per_iteration": 2.7201194763183594 }, { "auxiliary_loss_clip": 0.01142599, "auxiliary_loss_mlp": 0.00763781, "balance_loss_clip": 1.04394054, "balance_loss_mlp": 1.00036252, "epoch": 0.5808934046774484, "flos": 37962139190400.0, "grad_norm": 1.950952780857524, "language_loss": 0.70375872, "learning_rate": 1.5763572847227943e-06, "loss": 0.72282243, "num_input_tokens_seen": 104193525, "step": 4831, "time_per_iteration": 2.8177454471588135 }, { "auxiliary_loss_clip": 0.01161378, "auxiliary_loss_mlp": 0.01026498, "balance_loss_clip": 1.04649568, "balance_loss_mlp": 1.01909494, "epoch": 0.5810136475680875, "flos": 20485853038080.0, "grad_norm": 1.7001054571471879, "language_loss": 0.81632513, "learning_rate": 1.5755960213394091e-06, "loss": 0.83820391, "num_input_tokens_seen": 104210625, "step": 4832, "time_per_iteration": 2.6602303981781006 }, { "auxiliary_loss_clip": 0.01140388, "auxiliary_loss_mlp": 0.01027765, "balance_loss_clip": 1.04698825, "balance_loss_mlp": 1.01990306, "epoch": 0.5811338904587267, "flos": 17530153574400.0, "grad_norm": 1.857175395675574, "language_loss": 0.7815336, "learning_rate": 1.5748348223377703e-06, "loss": 0.80321515, "num_input_tokens_seen": 104228180, "step": 4833, "time_per_iteration": 2.7245676517486572 }, { "auxiliary_loss_clip": 0.01150114, "auxiliary_loss_mlp": 0.01029238, "balance_loss_clip": 1.05032301, "balance_loss_mlp": 1.02169204, "epoch": 0.5812541333493657, "flos": 19458017360640.0, "grad_norm": 1.831640634637841, "language_loss": 0.77856165, "learning_rate": 1.5740736878333507e-06, "loss": 0.80035514, "num_input_tokens_seen": 104246020, "step": 4834, "time_per_iteration": 2.700939178466797 }, { "auxiliary_loss_clip": 0.01153504, "auxiliary_loss_mlp": 0.01024923, "balance_loss_clip": 1.0467515, "balance_loss_mlp": 1.01710916, "epoch": 0.5813743762400048, "flos": 20594949621120.0, "grad_norm": 4.046233416423454, "language_loss": 0.78068811, "learning_rate": 1.5733126179416143e-06, "loss": 0.80247241, "num_input_tokens_seen": 104260505, "step": 4835, "time_per_iteration": 2.6953349113464355 }, { "auxiliary_loss_clip": 0.01161623, "auxiliary_loss_mlp": 0.0102635, "balance_loss_clip": 1.04966497, "balance_loss_mlp": 1.01857138, "epoch": 0.5814946191306439, "flos": 33178227246720.0, "grad_norm": 2.305735061835053, "language_loss": 0.72547507, "learning_rate": 1.5725516127780137e-06, "loss": 0.74735475, "num_input_tokens_seen": 104282640, "step": 4836, "time_per_iteration": 2.7221062183380127 }, { "auxiliary_loss_clip": 0.01171033, "auxiliary_loss_mlp": 0.01028576, "balance_loss_clip": 1.04948449, "balance_loss_mlp": 1.02045822, "epoch": 0.581614862021283, "flos": 16143283503360.0, "grad_norm": 2.2501332324962293, "language_loss": 0.88714397, "learning_rate": 1.5717906724579943e-06, "loss": 0.90913999, "num_input_tokens_seen": 104299700, "step": 4837, "time_per_iteration": 2.6497535705566406 }, { "auxiliary_loss_clip": 0.01142625, "auxiliary_loss_mlp": 0.0103001, "balance_loss_clip": 1.04559803, "balance_loss_mlp": 1.02189779, "epoch": 0.581735104911922, "flos": 33802642298880.0, "grad_norm": 2.721232335176114, "language_loss": 0.67619133, "learning_rate": 1.571029797096989e-06, "loss": 0.6979177, "num_input_tokens_seen": 104320805, "step": 4838, "time_per_iteration": 2.866564989089966 }, { "auxiliary_loss_clip": 0.01175611, "auxiliary_loss_mlp": 0.01023148, "balance_loss_clip": 1.05003917, "balance_loss_mlp": 1.01585293, "epoch": 0.5818553478025612, "flos": 23331163029120.0, "grad_norm": 1.755628989243359, "language_loss": 0.78863204, "learning_rate": 1.570268986810423e-06, "loss": 0.81061971, "num_input_tokens_seen": 104340700, "step": 4839, "time_per_iteration": 2.800551176071167 }, { "auxiliary_loss_clip": 0.01146608, "auxiliary_loss_mlp": 0.01025244, "balance_loss_clip": 1.04734325, "balance_loss_mlp": 1.01801407, "epoch": 0.5819755906932003, "flos": 20996143603200.0, "grad_norm": 1.941383028692232, "language_loss": 0.74885219, "learning_rate": 1.5695082417137096e-06, "loss": 0.7705707, "num_input_tokens_seen": 104358575, "step": 4840, "time_per_iteration": 2.6957130432128906 }, { "auxiliary_loss_clip": 0.01144875, "auxiliary_loss_mlp": 0.01022881, "balance_loss_clip": 1.04437149, "balance_loss_mlp": 1.01602316, "epoch": 0.5820958335838393, "flos": 21431668008960.0, "grad_norm": 2.0925431665375447, "language_loss": 0.75186765, "learning_rate": 1.5687475619222539e-06, "loss": 0.77354521, "num_input_tokens_seen": 104378530, "step": 4841, "time_per_iteration": 2.75512957572937 }, { "auxiliary_loss_clip": 0.01143064, "auxiliary_loss_mlp": 0.01028599, "balance_loss_clip": 1.04422164, "balance_loss_mlp": 1.02014995, "epoch": 0.5822160764744785, "flos": 17967473660160.0, "grad_norm": 2.292013531357256, "language_loss": 0.73779178, "learning_rate": 1.5679869475514496e-06, "loss": 0.75950837, "num_input_tokens_seen": 104395465, "step": 4842, "time_per_iteration": 2.64642596244812 }, { "auxiliary_loss_clip": 0.01162252, "auxiliary_loss_mlp": 0.01020357, "balance_loss_clip": 1.04755747, "balance_loss_mlp": 1.01215529, "epoch": 0.5823363193651175, "flos": 23033858158080.0, "grad_norm": 2.1018675092715027, "language_loss": 0.81454873, "learning_rate": 1.567226398716682e-06, "loss": 0.83637482, "num_input_tokens_seen": 104415380, "step": 4843, "time_per_iteration": 2.6630876064300537 }, { "auxiliary_loss_clip": 0.0115735, "auxiliary_loss_mlp": 0.01022976, "balance_loss_clip": 1.04851866, "balance_loss_mlp": 1.01489961, "epoch": 0.5824565622557566, "flos": 32891840110080.0, "grad_norm": 1.5817796973552598, "language_loss": 0.61647075, "learning_rate": 1.566465915533326e-06, "loss": 0.63827395, "num_input_tokens_seen": 104437410, "step": 4844, "time_per_iteration": 2.7603211402893066 }, { "auxiliary_loss_clip": 0.01162473, "auxiliary_loss_mlp": 0.01026621, "balance_loss_clip": 1.04945803, "balance_loss_mlp": 1.01884818, "epoch": 0.5825768051463958, "flos": 22229674513920.0, "grad_norm": 2.308391537125453, "language_loss": 0.88281685, "learning_rate": 1.5657054981167458e-06, "loss": 0.90470773, "num_input_tokens_seen": 104456305, "step": 4845, "time_per_iteration": 2.638922929763794 }, { "auxiliary_loss_clip": 0.01159996, "auxiliary_loss_mlp": 0.01026409, "balance_loss_clip": 1.04828835, "balance_loss_mlp": 1.01936722, "epoch": 0.5826970480370348, "flos": 28001561016960.0, "grad_norm": 2.3806884686418597, "language_loss": 0.67396128, "learning_rate": 1.5649451465822965e-06, "loss": 0.69582534, "num_input_tokens_seen": 104477695, "step": 4846, "time_per_iteration": 3.6124820709228516 }, { "auxiliary_loss_clip": 0.01120036, "auxiliary_loss_mlp": 0.01027694, "balance_loss_clip": 1.04741037, "balance_loss_mlp": 1.02040982, "epoch": 0.5828172909276739, "flos": 17858053854720.0, "grad_norm": 1.67778973283529, "language_loss": 0.83771163, "learning_rate": 1.5641848610453218e-06, "loss": 0.85918891, "num_input_tokens_seen": 104496355, "step": 4847, "time_per_iteration": 2.720186948776245 }, { "auxiliary_loss_clip": 0.01162565, "auxiliary_loss_mlp": 0.01030085, "balance_loss_clip": 1.05101657, "balance_loss_mlp": 1.0220685, "epoch": 0.582937533818313, "flos": 19865244827520.0, "grad_norm": 2.0890814243998106, "language_loss": 0.86056793, "learning_rate": 1.563424641621158e-06, "loss": 0.88249439, "num_input_tokens_seen": 104515535, "step": 4848, "time_per_iteration": 3.6739563941955566 }, { "auxiliary_loss_clip": 0.01151093, "auxiliary_loss_mlp": 0.01022186, "balance_loss_clip": 1.0477705, "balance_loss_mlp": 1.01428819, "epoch": 0.5830577767089521, "flos": 26870734068480.0, "grad_norm": 2.0946185914631186, "language_loss": 0.6994496, "learning_rate": 1.5626644884251282e-06, "loss": 0.72118235, "num_input_tokens_seen": 104535055, "step": 4849, "time_per_iteration": 2.759951114654541 }, { "auxiliary_loss_clip": 0.01174509, "auxiliary_loss_mlp": 0.01029545, "balance_loss_clip": 1.04913974, "balance_loss_mlp": 1.02189779, "epoch": 0.5831780195995911, "flos": 25298205575040.0, "grad_norm": 1.6862776510407782, "language_loss": 0.88074148, "learning_rate": 1.5619044015725488e-06, "loss": 0.90278208, "num_input_tokens_seen": 104554745, "step": 4850, "time_per_iteration": 2.6001181602478027 }, { "auxiliary_loss_clip": 0.0118552, "auxiliary_loss_mlp": 0.01029903, "balance_loss_clip": 1.05467308, "balance_loss_mlp": 1.02113557, "epoch": 0.5832982624902303, "flos": 14756988049920.0, "grad_norm": 2.20730254942022, "language_loss": 0.86969006, "learning_rate": 1.5611443811787224e-06, "loss": 0.89184439, "num_input_tokens_seen": 104568870, "step": 4851, "time_per_iteration": 3.550734758377075 }, { "auxiliary_loss_clip": 0.01159454, "auxiliary_loss_mlp": 0.01023568, "balance_loss_clip": 1.05040145, "balance_loss_mlp": 1.01651645, "epoch": 0.5834185053808694, "flos": 20444555376000.0, "grad_norm": 2.265616943615762, "language_loss": 0.69282818, "learning_rate": 1.560384427358945e-06, "loss": 0.71465838, "num_input_tokens_seen": 104588415, "step": 4852, "time_per_iteration": 2.6184380054473877 }, { "auxiliary_loss_clip": 0.01142898, "auxiliary_loss_mlp": 0.01024222, "balance_loss_clip": 1.04310727, "balance_loss_mlp": 1.01637232, "epoch": 0.5835387482715084, "flos": 27200394115200.0, "grad_norm": 1.622122838907256, "language_loss": 0.73052275, "learning_rate": 1.5596245402284998e-06, "loss": 0.75219399, "num_input_tokens_seen": 104611940, "step": 4853, "time_per_iteration": 2.73319935798645 }, { "auxiliary_loss_clip": 0.01166556, "auxiliary_loss_mlp": 0.01030183, "balance_loss_clip": 1.05300987, "balance_loss_mlp": 1.02269113, "epoch": 0.5836589911621476, "flos": 16654615562880.0, "grad_norm": 1.7878507873432576, "language_loss": 0.82146072, "learning_rate": 1.5588647199026619e-06, "loss": 0.84342813, "num_input_tokens_seen": 104629675, "step": 4854, "time_per_iteration": 2.618955612182617 }, { "auxiliary_loss_clip": 0.01182003, "auxiliary_loss_mlp": 0.01027153, "balance_loss_clip": 1.05317163, "balance_loss_mlp": 1.01904082, "epoch": 0.5837792340527866, "flos": 20446817932800.0, "grad_norm": 2.002784173198503, "language_loss": 0.87576073, "learning_rate": 1.5581049664966956e-06, "loss": 0.8978523, "num_input_tokens_seen": 104647435, "step": 4855, "time_per_iteration": 2.633305311203003 }, { "auxiliary_loss_clip": 0.01022941, "auxiliary_loss_mlp": 0.01001231, "balance_loss_clip": 1.01093793, "balance_loss_mlp": 0.99984771, "epoch": 0.5838994769434257, "flos": 65995480765440.0, "grad_norm": 2.189559679819557, "language_loss": 0.65154046, "learning_rate": 1.5573452801258545e-06, "loss": 0.6717822, "num_input_tokens_seen": 104694605, "step": 4856, "time_per_iteration": 3.1412153244018555 }, { "auxiliary_loss_clip": 0.01170113, "auxiliary_loss_mlp": 0.01028709, "balance_loss_clip": 1.05144763, "balance_loss_mlp": 1.02102602, "epoch": 0.5840197198340649, "flos": 21470523546240.0, "grad_norm": 1.933447614807768, "language_loss": 0.63370502, "learning_rate": 1.5565856609053824e-06, "loss": 0.65569317, "num_input_tokens_seen": 104713400, "step": 4857, "time_per_iteration": 2.6318531036376953 }, { "auxiliary_loss_clip": 0.01180361, "auxiliary_loss_mlp": 0.01028871, "balance_loss_clip": 1.05309498, "balance_loss_mlp": 1.02071738, "epoch": 0.5841399627247039, "flos": 19135144984320.0, "grad_norm": 1.8286063410427205, "language_loss": 0.80105531, "learning_rate": 1.5558261089505127e-06, "loss": 0.82314765, "num_input_tokens_seen": 104732130, "step": 4858, "time_per_iteration": 2.612337112426758 }, { "auxiliary_loss_clip": 0.01164647, "auxiliary_loss_mlp": 0.01028455, "balance_loss_clip": 1.05064762, "balance_loss_mlp": 1.02037287, "epoch": 0.584260205615343, "flos": 26425692558720.0, "grad_norm": 2.1609520629328665, "language_loss": 0.80105972, "learning_rate": 1.5550666243764697e-06, "loss": 0.82299078, "num_input_tokens_seen": 104750290, "step": 4859, "time_per_iteration": 2.6635963916778564 }, { "auxiliary_loss_clip": 0.01163559, "auxiliary_loss_mlp": 0.01026781, "balance_loss_clip": 1.04901218, "balance_loss_mlp": 1.01965857, "epoch": 0.584380448505982, "flos": 13881809174400.0, "grad_norm": 1.927599509881764, "language_loss": 0.76820534, "learning_rate": 1.554307207298465e-06, "loss": 0.79010874, "num_input_tokens_seen": 104768550, "step": 4860, "time_per_iteration": 2.7117021083831787 }, { "auxiliary_loss_clip": 0.01183015, "auxiliary_loss_mlp": 0.01028546, "balance_loss_clip": 1.05350721, "balance_loss_mlp": 1.02046072, "epoch": 0.5845006913966212, "flos": 21543709507200.0, "grad_norm": 1.8958881826774852, "language_loss": 0.78429496, "learning_rate": 1.553547857831704e-06, "loss": 0.80641055, "num_input_tokens_seen": 104785060, "step": 4861, "time_per_iteration": 2.5912201404571533 }, { "auxiliary_loss_clip": 0.01073935, "auxiliary_loss_mlp": 0.01002639, "balance_loss_clip": 1.01305985, "balance_loss_mlp": 1.00107753, "epoch": 0.5846209342872603, "flos": 58375452712320.0, "grad_norm": 0.8801084609693695, "language_loss": 0.64158285, "learning_rate": 1.5527885760913771e-06, "loss": 0.66234857, "num_input_tokens_seen": 104834950, "step": 4862, "time_per_iteration": 3.0212576389312744 }, { "auxiliary_loss_clip": 0.01147365, "auxiliary_loss_mlp": 0.01023602, "balance_loss_clip": 1.04914606, "balance_loss_mlp": 1.01701832, "epoch": 0.5847411771778993, "flos": 18588045957120.0, "grad_norm": 1.7950834296259093, "language_loss": 0.76532221, "learning_rate": 1.552029362192668e-06, "loss": 0.78703189, "num_input_tokens_seen": 104854210, "step": 4863, "time_per_iteration": 2.644284248352051 }, { "auxiliary_loss_clip": 0.01130239, "auxiliary_loss_mlp": 0.0102612, "balance_loss_clip": 1.04676723, "balance_loss_mlp": 1.01803434, "epoch": 0.5848614200685385, "flos": 24240780069120.0, "grad_norm": 8.084568101077373, "language_loss": 0.72532213, "learning_rate": 1.5512702162507478e-06, "loss": 0.74688578, "num_input_tokens_seen": 104874525, "step": 4864, "time_per_iteration": 2.798851251602173 }, { "auxiliary_loss_clip": 0.01052198, "auxiliary_loss_mlp": 0.01001656, "balance_loss_clip": 1.01240718, "balance_loss_mlp": 1.00008285, "epoch": 0.5849816629591775, "flos": 71660245933440.0, "grad_norm": 1.1001091077984224, "language_loss": 0.55748677, "learning_rate": 1.5505111383807792e-06, "loss": 0.57802528, "num_input_tokens_seen": 104937195, "step": 4865, "time_per_iteration": 3.295311689376831 }, { "auxiliary_loss_clip": 0.01123318, "auxiliary_loss_mlp": 0.01027844, "balance_loss_clip": 1.04301214, "balance_loss_mlp": 1.01995802, "epoch": 0.5851019058498166, "flos": 23802095266560.0, "grad_norm": 1.8713735851802469, "language_loss": 0.80758423, "learning_rate": 1.5497521286979138e-06, "loss": 0.82909584, "num_input_tokens_seen": 104957435, "step": 4866, "time_per_iteration": 2.743245840072632 }, { "auxiliary_loss_clip": 0.01138018, "auxiliary_loss_mlp": 0.01031754, "balance_loss_clip": 1.04480863, "balance_loss_mlp": 1.023368, "epoch": 0.5852221487404557, "flos": 24388516707840.0, "grad_norm": 2.268170688300956, "language_loss": 0.74260116, "learning_rate": 1.5489931873172927e-06, "loss": 0.76429886, "num_input_tokens_seen": 104978755, "step": 4867, "time_per_iteration": 2.7442164421081543 }, { "auxiliary_loss_clip": 0.01089365, "auxiliary_loss_mlp": 0.01033475, "balance_loss_clip": 1.03775907, "balance_loss_mlp": 1.0260725, "epoch": 0.5853423916310948, "flos": 27271425260160.0, "grad_norm": 1.902790732696549, "language_loss": 0.79090607, "learning_rate": 1.5482343143540467e-06, "loss": 0.81213444, "num_input_tokens_seen": 105000020, "step": 4868, "time_per_iteration": 2.8656864166259766 }, { "auxiliary_loss_clip": 0.01137375, "auxiliary_loss_mlp": 0.00762379, "balance_loss_clip": 1.04541636, "balance_loss_mlp": 1.00024939, "epoch": 0.5854626345217339, "flos": 11983786611840.0, "grad_norm": 2.9349628190071324, "language_loss": 0.83326346, "learning_rate": 1.547475509923295e-06, "loss": 0.85226101, "num_input_tokens_seen": 105017060, "step": 4869, "time_per_iteration": 2.733159303665161 }, { "auxiliary_loss_clip": 0.01032039, "auxiliary_loss_mlp": 0.0100089, "balance_loss_clip": 1.01062, "balance_loss_mlp": 0.99932796, "epoch": 0.585582877412373, "flos": 64342335173760.0, "grad_norm": 0.7375390063164634, "language_loss": 0.56005239, "learning_rate": 1.5467167741401495e-06, "loss": 0.58038163, "num_input_tokens_seen": 105078540, "step": 4870, "time_per_iteration": 3.279709815979004 }, { "auxiliary_loss_clip": 0.01147132, "auxiliary_loss_mlp": 0.01029377, "balance_loss_clip": 1.04452181, "balance_loss_mlp": 1.02158093, "epoch": 0.5857031203030121, "flos": 17011926103680.0, "grad_norm": 2.396602183375033, "language_loss": 0.71902311, "learning_rate": 1.5459581071197083e-06, "loss": 0.74078828, "num_input_tokens_seen": 105094200, "step": 4871, "time_per_iteration": 2.7566263675689697 }, { "auxiliary_loss_clip": 0.01169157, "auxiliary_loss_mlp": 0.01027255, "balance_loss_clip": 1.05324793, "balance_loss_mlp": 1.01958358, "epoch": 0.5858233631936511, "flos": 20885682303360.0, "grad_norm": 2.048398434874254, "language_loss": 0.83093679, "learning_rate": 1.5451995089770624e-06, "loss": 0.85290086, "num_input_tokens_seen": 105113985, "step": 4872, "time_per_iteration": 2.677126169204712 }, { "auxiliary_loss_clip": 0.01175727, "auxiliary_loss_mlp": 0.01025464, "balance_loss_clip": 1.05097032, "balance_loss_mlp": 1.01841331, "epoch": 0.5859436060842903, "flos": 23191902000000.0, "grad_norm": 1.3783326311436244, "language_loss": 0.71918356, "learning_rate": 1.5444409798272885e-06, "loss": 0.74119544, "num_input_tokens_seen": 105138075, "step": 4873, "time_per_iteration": 3.6415350437164307 }, { "auxiliary_loss_clip": 0.01134155, "auxiliary_loss_mlp": 0.01033337, "balance_loss_clip": 1.04340005, "balance_loss_mlp": 1.02503371, "epoch": 0.5860638489749294, "flos": 22492648961280.0, "grad_norm": 2.2647533303839147, "language_loss": 0.81057906, "learning_rate": 1.543682519785456e-06, "loss": 0.83225399, "num_input_tokens_seen": 105156555, "step": 4874, "time_per_iteration": 4.581416606903076 }, { "auxiliary_loss_clip": 0.01147777, "auxiliary_loss_mlp": 0.01027988, "balance_loss_clip": 1.04628372, "balance_loss_mlp": 1.02074003, "epoch": 0.5861840918655684, "flos": 17566243764480.0, "grad_norm": 2.41548406527139, "language_loss": 0.80605137, "learning_rate": 1.5429241289666219e-06, "loss": 0.82780904, "num_input_tokens_seen": 105174055, "step": 4875, "time_per_iteration": 2.7073020935058594 }, { "auxiliary_loss_clip": 0.01141776, "auxiliary_loss_mlp": 0.01023555, "balance_loss_clip": 1.04656863, "balance_loss_mlp": 1.01615238, "epoch": 0.5863043347562076, "flos": 25556152118400.0, "grad_norm": 2.1651528219474248, "language_loss": 0.69708472, "learning_rate": 1.5421658074858342e-06, "loss": 0.71873808, "num_input_tokens_seen": 105192160, "step": 4876, "time_per_iteration": 3.641961097717285 }, { "auxiliary_loss_clip": 0.01144479, "auxiliary_loss_mlp": 0.01032734, "balance_loss_clip": 1.04801714, "balance_loss_mlp": 1.02463961, "epoch": 0.5864245776468466, "flos": 20667525050880.0, "grad_norm": 2.294436141596231, "language_loss": 0.66547793, "learning_rate": 1.5414075554581298e-06, "loss": 0.68725002, "num_input_tokens_seen": 105210205, "step": 4877, "time_per_iteration": 2.721721887588501 }, { "auxiliary_loss_clip": 0.0118009, "auxiliary_loss_mlp": 0.01030779, "balance_loss_clip": 1.05042291, "balance_loss_mlp": 1.02305412, "epoch": 0.5865448205374857, "flos": 28913907490560.0, "grad_norm": 3.094594771709935, "language_loss": 0.78146303, "learning_rate": 1.5406493729985348e-06, "loss": 0.8035717, "num_input_tokens_seen": 105229400, "step": 4878, "time_per_iteration": 2.736154317855835 }, { "auxiliary_loss_clip": 0.0112676, "auxiliary_loss_mlp": 0.00762911, "balance_loss_clip": 1.04672551, "balance_loss_mlp": 1.00028408, "epoch": 0.5866650634281249, "flos": 25842575168640.0, "grad_norm": 2.258639984969489, "language_loss": 0.71790022, "learning_rate": 1.5398912602220644e-06, "loss": 0.73679698, "num_input_tokens_seen": 105248675, "step": 4879, "time_per_iteration": 2.7802629470825195 }, { "auxiliary_loss_clip": 0.01130757, "auxiliary_loss_mlp": 0.01031489, "balance_loss_clip": 1.04451823, "balance_loss_mlp": 1.02344203, "epoch": 0.5867853063187639, "flos": 17052325925760.0, "grad_norm": 2.0021809142525355, "language_loss": 0.78226912, "learning_rate": 1.539133217243724e-06, "loss": 0.80389154, "num_input_tokens_seen": 105265695, "step": 4880, "time_per_iteration": 2.745915174484253 }, { "auxiliary_loss_clip": 0.01139778, "auxiliary_loss_mlp": 0.01025145, "balance_loss_clip": 1.04634535, "balance_loss_mlp": 1.01661575, "epoch": 0.586905549209403, "flos": 24645026707200.0, "grad_norm": 2.174076298590697, "language_loss": 0.75996447, "learning_rate": 1.5383752441785081e-06, "loss": 0.78161371, "num_input_tokens_seen": 105284920, "step": 4881, "time_per_iteration": 2.727510690689087 }, { "auxiliary_loss_clip": 0.01169218, "auxiliary_loss_mlp": 0.01029351, "balance_loss_clip": 1.0503068, "balance_loss_mlp": 1.02147985, "epoch": 0.5870257921000421, "flos": 14720538723840.0, "grad_norm": 2.132414687422191, "language_loss": 0.85416007, "learning_rate": 1.5376173411414003e-06, "loss": 0.87614572, "num_input_tokens_seen": 105302960, "step": 4882, "time_per_iteration": 2.6639392375946045 }, { "auxiliary_loss_clip": 0.01148191, "auxiliary_loss_mlp": 0.01031938, "balance_loss_clip": 1.04354846, "balance_loss_mlp": 1.02417755, "epoch": 0.5871460349906812, "flos": 23914998691200.0, "grad_norm": 2.3908345344027877, "language_loss": 0.78946471, "learning_rate": 1.5368595082473753e-06, "loss": 0.81126606, "num_input_tokens_seen": 105321260, "step": 4883, "time_per_iteration": 2.722381830215454 }, { "auxiliary_loss_clip": 0.01166381, "auxiliary_loss_mlp": 0.01024678, "balance_loss_clip": 1.04916143, "balance_loss_mlp": 1.0176034, "epoch": 0.5872662778813202, "flos": 22164174063360.0, "grad_norm": 1.7575592420640564, "language_loss": 0.78024781, "learning_rate": 1.5361017456113935e-06, "loss": 0.80215836, "num_input_tokens_seen": 105341610, "step": 4884, "time_per_iteration": 2.770350933074951 }, { "auxiliary_loss_clip": 0.01166021, "auxiliary_loss_mlp": 0.01028591, "balance_loss_clip": 1.04845893, "balance_loss_mlp": 1.02069354, "epoch": 0.5873865207719594, "flos": 18441925430400.0, "grad_norm": 2.018502238382439, "language_loss": 0.86187297, "learning_rate": 1.5353440533484085e-06, "loss": 0.8838191, "num_input_tokens_seen": 105360465, "step": 4885, "time_per_iteration": 2.592813491821289 }, { "auxiliary_loss_clip": 0.0115423, "auxiliary_loss_mlp": 0.01034073, "balance_loss_clip": 1.04866517, "balance_loss_mlp": 1.02588367, "epoch": 0.5875067636625985, "flos": 54015321427200.0, "grad_norm": 1.9171547990166948, "language_loss": 0.65921986, "learning_rate": 1.534586431573361e-06, "loss": 0.68110293, "num_input_tokens_seen": 105385405, "step": 4886, "time_per_iteration": 3.0140206813812256 }, { "auxiliary_loss_clip": 0.01105396, "auxiliary_loss_mlp": 0.01029527, "balance_loss_clip": 1.0393337, "balance_loss_mlp": 1.02094412, "epoch": 0.5876270065532375, "flos": 27995707100160.0, "grad_norm": 1.8282061836307484, "language_loss": 0.79051685, "learning_rate": 1.5338288804011817e-06, "loss": 0.8118661, "num_input_tokens_seen": 105404905, "step": 4887, "time_per_iteration": 2.9724414348602295 }, { "auxiliary_loss_clip": 0.01144558, "auxiliary_loss_mlp": 0.01030943, "balance_loss_clip": 1.04511142, "balance_loss_mlp": 1.02282536, "epoch": 0.5877472494438767, "flos": 21361462876800.0, "grad_norm": 1.9772791264624847, "language_loss": 0.71108431, "learning_rate": 1.533071399946791e-06, "loss": 0.73283935, "num_input_tokens_seen": 105423650, "step": 4888, "time_per_iteration": 2.7016940116882324 }, { "auxiliary_loss_clip": 0.01150393, "auxiliary_loss_mlp": 0.01027211, "balance_loss_clip": 1.04580975, "balance_loss_mlp": 1.0193851, "epoch": 0.5878674923345157, "flos": 22383013674240.0, "grad_norm": 1.9241131294930416, "language_loss": 0.57447851, "learning_rate": 1.5323139903250977e-06, "loss": 0.59625459, "num_input_tokens_seen": 105444255, "step": 4889, "time_per_iteration": 2.770261287689209 }, { "auxiliary_loss_clip": 0.01153826, "auxiliary_loss_mlp": 0.01033139, "balance_loss_clip": 1.05153656, "balance_loss_mlp": 1.02526546, "epoch": 0.5879877352251548, "flos": 21868664872320.0, "grad_norm": 1.6462654499554887, "language_loss": 0.76992106, "learning_rate": 1.5315566516510002e-06, "loss": 0.79179072, "num_input_tokens_seen": 105462425, "step": 4890, "time_per_iteration": 2.6702401638031006 }, { "auxiliary_loss_clip": 0.01177036, "auxiliary_loss_mlp": 0.01027899, "balance_loss_clip": 1.05088675, "balance_loss_mlp": 1.01954865, "epoch": 0.5881079781157939, "flos": 17493811989120.0, "grad_norm": 1.740136546501356, "language_loss": 0.67610127, "learning_rate": 1.5307993840393857e-06, "loss": 0.69815058, "num_input_tokens_seen": 105480505, "step": 4891, "time_per_iteration": 2.6186251640319824 }, { "auxiliary_loss_clip": 0.0117642, "auxiliary_loss_mlp": 0.01028234, "balance_loss_clip": 1.04962587, "balance_loss_mlp": 1.02096796, "epoch": 0.588228221006433, "flos": 22601853285120.0, "grad_norm": 1.8055722014968005, "language_loss": 0.8039943, "learning_rate": 1.530042187605132e-06, "loss": 0.82604086, "num_input_tokens_seen": 105499760, "step": 4892, "time_per_iteration": 2.594836950302124 }, { "auxiliary_loss_clip": 0.01163768, "auxiliary_loss_mlp": 0.00761743, "balance_loss_clip": 1.05042768, "balance_loss_mlp": 1.00035954, "epoch": 0.5883484638970721, "flos": 26176939896960.0, "grad_norm": 1.4150408690419767, "language_loss": 0.84261572, "learning_rate": 1.5292850624631044e-06, "loss": 0.86187088, "num_input_tokens_seen": 105521955, "step": 4893, "time_per_iteration": 2.7300970554351807 }, { "auxiliary_loss_clip": 0.01160627, "auxiliary_loss_mlp": 0.01028033, "balance_loss_clip": 1.05011797, "balance_loss_mlp": 1.02040935, "epoch": 0.5884687067877111, "flos": 30443737691520.0, "grad_norm": 2.018345781513594, "language_loss": 0.80277693, "learning_rate": 1.5285280087281593e-06, "loss": 0.82466352, "num_input_tokens_seen": 105542685, "step": 4894, "time_per_iteration": 2.6546895503997803 }, { "auxiliary_loss_clip": 0.01051908, "auxiliary_loss_mlp": 0.01001132, "balance_loss_clip": 1.01054955, "balance_loss_mlp": 0.99970764, "epoch": 0.5885889496783503, "flos": 70507550580480.0, "grad_norm": 0.6358789448993087, "language_loss": 0.56587797, "learning_rate": 1.5277710265151398e-06, "loss": 0.58640838, "num_input_tokens_seen": 105612165, "step": 4895, "time_per_iteration": 3.4039573669433594 }, { "auxiliary_loss_clip": 0.01162819, "auxiliary_loss_mlp": 0.01032156, "balance_loss_clip": 1.048195, "balance_loss_mlp": 1.02380586, "epoch": 0.5887091925689893, "flos": 19098767485440.0, "grad_norm": 2.7807579329350807, "language_loss": 0.77577746, "learning_rate": 1.5270141159388803e-06, "loss": 0.79772723, "num_input_tokens_seen": 105629185, "step": 4896, "time_per_iteration": 2.709073305130005 }, { "auxiliary_loss_clip": 0.01177912, "auxiliary_loss_mlp": 0.01027159, "balance_loss_clip": 1.04981828, "balance_loss_mlp": 1.01954758, "epoch": 0.5888294354596284, "flos": 23294282739840.0, "grad_norm": 1.7935306765304087, "language_loss": 0.8040964, "learning_rate": 1.526257277114203e-06, "loss": 0.82614714, "num_input_tokens_seen": 105650260, "step": 4897, "time_per_iteration": 2.633680820465088 }, { "auxiliary_loss_clip": 0.01145715, "auxiliary_loss_mlp": 0.01027383, "balance_loss_clip": 1.04839361, "balance_loss_mlp": 1.01958919, "epoch": 0.5889496783502676, "flos": 21981532383360.0, "grad_norm": 2.1112585436742797, "language_loss": 0.79186845, "learning_rate": 1.5255005101559201e-06, "loss": 0.81359947, "num_input_tokens_seen": 105667870, "step": 4898, "time_per_iteration": 2.6852736473083496 }, { "auxiliary_loss_clip": 0.01165181, "auxiliary_loss_mlp": 0.01022731, "balance_loss_clip": 1.04886043, "balance_loss_mlp": 1.01518834, "epoch": 0.5890699212409066, "flos": 21685233093120.0, "grad_norm": 2.290658708120417, "language_loss": 0.76640528, "learning_rate": 1.524743815178833e-06, "loss": 0.78828442, "num_input_tokens_seen": 105685830, "step": 4899, "time_per_iteration": 3.539773464202881 }, { "auxiliary_loss_clip": 0.011465, "auxiliary_loss_mlp": 0.01030106, "balance_loss_clip": 1.04282618, "balance_loss_mlp": 1.02295291, "epoch": 0.5891901641315457, "flos": 19464553635840.0, "grad_norm": 2.1326674253242395, "language_loss": 0.80868244, "learning_rate": 1.5239871922977315e-06, "loss": 0.83044851, "num_input_tokens_seen": 105705745, "step": 4900, "time_per_iteration": 4.574876308441162 }, { "auxiliary_loss_clip": 0.01146709, "auxiliary_loss_mlp": 0.01029448, "balance_loss_clip": 1.04512024, "balance_loss_mlp": 1.02192307, "epoch": 0.5893104070221848, "flos": 19609884063360.0, "grad_norm": 1.925169698697133, "language_loss": 0.89542276, "learning_rate": 1.523230641627394e-06, "loss": 0.91718435, "num_input_tokens_seen": 105724730, "step": 4901, "time_per_iteration": 2.739413261413574 }, { "auxiliary_loss_clip": 0.01122173, "auxiliary_loss_mlp": 0.01029063, "balance_loss_clip": 1.04100609, "balance_loss_mlp": 1.02143931, "epoch": 0.5894306499128239, "flos": 29060063930880.0, "grad_norm": 1.8915484756659846, "language_loss": 0.72912544, "learning_rate": 1.5224741632825888e-06, "loss": 0.75063783, "num_input_tokens_seen": 105744920, "step": 4902, "time_per_iteration": 3.7648000717163086 }, { "auxiliary_loss_clip": 0.01182866, "auxiliary_loss_mlp": 0.01034713, "balance_loss_clip": 1.05384839, "balance_loss_mlp": 1.02732766, "epoch": 0.589550892803463, "flos": 42298890721920.0, "grad_norm": 1.833636762237935, "language_loss": 0.69204426, "learning_rate": 1.521717757378074e-06, "loss": 0.71422005, "num_input_tokens_seen": 105765465, "step": 4903, "time_per_iteration": 2.8651678562164307 }, { "auxiliary_loss_clip": 0.01167432, "auxiliary_loss_mlp": 0.01026972, "balance_loss_clip": 1.0501194, "balance_loss_mlp": 1.0192827, "epoch": 0.5896711356941021, "flos": 14137062197760.0, "grad_norm": 1.7163651512574383, "language_loss": 0.68985522, "learning_rate": 1.5209614240285943e-06, "loss": 0.71179926, "num_input_tokens_seen": 105783120, "step": 4904, "time_per_iteration": 2.6265976428985596 }, { "auxiliary_loss_clip": 0.01175242, "auxiliary_loss_mlp": 0.0076233, "balance_loss_clip": 1.04918015, "balance_loss_mlp": 1.00035501, "epoch": 0.5897913785847412, "flos": 17201355454080.0, "grad_norm": 2.1005658132755274, "language_loss": 0.8489967, "learning_rate": 1.520205163348887e-06, "loss": 0.86837244, "num_input_tokens_seen": 105801055, "step": 4905, "time_per_iteration": 2.5811798572540283 }, { "auxiliary_loss_clip": 0.01041798, "auxiliary_loss_mlp": 0.01001614, "balance_loss_clip": 1.00921726, "balance_loss_mlp": 1.00014818, "epoch": 0.5899116214753802, "flos": 48794164202880.0, "grad_norm": 0.7261276204999577, "language_loss": 0.56957895, "learning_rate": 1.519448975453674e-06, "loss": 0.59001309, "num_input_tokens_seen": 105856155, "step": 4906, "time_per_iteration": 3.1605563163757324 }, { "auxiliary_loss_clip": 0.01165105, "auxiliary_loss_mlp": 0.00763033, "balance_loss_clip": 1.05154741, "balance_loss_mlp": 1.00043154, "epoch": 0.5900318643660194, "flos": 21103659987840.0, "grad_norm": 3.8432336646899756, "language_loss": 0.75601995, "learning_rate": 1.5186928604576696e-06, "loss": 0.77530134, "num_input_tokens_seen": 105873350, "step": 4907, "time_per_iteration": 2.608877420425415 }, { "auxiliary_loss_clip": 0.01148758, "auxiliary_loss_mlp": 0.01024283, "balance_loss_clip": 1.04598475, "balance_loss_mlp": 1.01702309, "epoch": 0.5901521072566585, "flos": 21178390233600.0, "grad_norm": 2.058329891845543, "language_loss": 0.7744028, "learning_rate": 1.5179368184755752e-06, "loss": 0.79613316, "num_input_tokens_seen": 105891435, "step": 4908, "time_per_iteration": 2.6908013820648193 }, { "auxiliary_loss_clip": 0.01147499, "auxiliary_loss_mlp": 0.01025387, "balance_loss_clip": 1.04723144, "balance_loss_mlp": 1.0181632, "epoch": 0.5902723501472975, "flos": 20225967160320.0, "grad_norm": 1.887621407512652, "language_loss": 0.82585454, "learning_rate": 1.5171808496220821e-06, "loss": 0.84758341, "num_input_tokens_seen": 105910190, "step": 4909, "time_per_iteration": 2.6149702072143555 }, { "auxiliary_loss_clip": 0.01152232, "auxiliary_loss_mlp": 0.01024601, "balance_loss_clip": 1.04662359, "balance_loss_mlp": 1.01751375, "epoch": 0.5903925930379367, "flos": 22964407211520.0, "grad_norm": 1.7860935293149574, "language_loss": 0.81485391, "learning_rate": 1.5164249540118708e-06, "loss": 0.83662224, "num_input_tokens_seen": 105929315, "step": 4910, "time_per_iteration": 2.7021431922912598 }, { "auxiliary_loss_clip": 0.01109853, "auxiliary_loss_mlp": 0.0102498, "balance_loss_clip": 1.04196167, "balance_loss_mlp": 1.01764011, "epoch": 0.5905128359285757, "flos": 23367720096000.0, "grad_norm": 1.886022707984701, "language_loss": 0.83078909, "learning_rate": 1.5156691317596093e-06, "loss": 0.85213745, "num_input_tokens_seen": 105950740, "step": 4911, "time_per_iteration": 2.777646541595459 }, { "auxiliary_loss_clip": 0.01166011, "auxiliary_loss_mlp": 0.00762197, "balance_loss_clip": 1.04849696, "balance_loss_mlp": 1.00036001, "epoch": 0.5906330788192148, "flos": 28032335994240.0, "grad_norm": 2.2961028825192056, "language_loss": 0.67074621, "learning_rate": 1.5149133829799556e-06, "loss": 0.69002825, "num_input_tokens_seen": 105968735, "step": 4912, "time_per_iteration": 2.699658155441284 }, { "auxiliary_loss_clip": 0.01156572, "auxiliary_loss_mlp": 0.01025575, "balance_loss_clip": 1.04766798, "balance_loss_mlp": 1.01787746, "epoch": 0.590753321709854, "flos": 18477943793280.0, "grad_norm": 2.018146604083056, "language_loss": 0.80802184, "learning_rate": 1.5141577077875556e-06, "loss": 0.82984328, "num_input_tokens_seen": 105986060, "step": 4913, "time_per_iteration": 2.79193377494812 }, { "auxiliary_loss_clip": 0.0116595, "auxiliary_loss_mlp": 0.01022296, "balance_loss_clip": 1.04949164, "balance_loss_mlp": 1.01510501, "epoch": 0.590873564600493, "flos": 16873706568960.0, "grad_norm": 1.9865414966011417, "language_loss": 0.72472531, "learning_rate": 1.5134021062970451e-06, "loss": 0.74660778, "num_input_tokens_seen": 106004440, "step": 4914, "time_per_iteration": 2.6209969520568848 }, { "auxiliary_loss_clip": 0.01128058, "auxiliary_loss_mlp": 0.01029723, "balance_loss_clip": 1.04965913, "balance_loss_mlp": 1.02274966, "epoch": 0.5909938074911321, "flos": 13516166678400.0, "grad_norm": 2.098946525312931, "language_loss": 0.81389976, "learning_rate": 1.5126465786230483e-06, "loss": 0.83547753, "num_input_tokens_seen": 106021215, "step": 4915, "time_per_iteration": 2.693582534790039 }, { "auxiliary_loss_clip": 0.01176807, "auxiliary_loss_mlp": 0.01024616, "balance_loss_clip": 1.05118632, "balance_loss_mlp": 1.01747251, "epoch": 0.5911140503817712, "flos": 26024067613440.0, "grad_norm": 1.6509665836582343, "language_loss": 0.82004726, "learning_rate": 1.5118911248801787e-06, "loss": 0.84206146, "num_input_tokens_seen": 106039225, "step": 4916, "time_per_iteration": 2.644364833831787 }, { "auxiliary_loss_clip": 0.01155007, "auxiliary_loss_mlp": 0.01026253, "balance_loss_clip": 1.04569554, "balance_loss_mlp": 1.0192914, "epoch": 0.5912342932724103, "flos": 23258731253760.0, "grad_norm": 2.000289607260502, "language_loss": 0.79614651, "learning_rate": 1.5111357451830364e-06, "loss": 0.81795913, "num_input_tokens_seen": 106057920, "step": 4917, "time_per_iteration": 2.6517252922058105 }, { "auxiliary_loss_clip": 0.01160195, "auxiliary_loss_mlp": 0.01027498, "balance_loss_clip": 1.04629052, "balance_loss_mlp": 1.02046514, "epoch": 0.5913545361630493, "flos": 19573039687680.0, "grad_norm": 2.1177890270499704, "language_loss": 0.71032518, "learning_rate": 1.5103804396462131e-06, "loss": 0.73220211, "num_input_tokens_seen": 106077855, "step": 4918, "time_per_iteration": 2.6090574264526367 }, { "auxiliary_loss_clip": 0.01164928, "auxiliary_loss_mlp": 0.01028274, "balance_loss_clip": 1.04671907, "balance_loss_mlp": 1.02043056, "epoch": 0.5914747790536885, "flos": 26213532877440.0, "grad_norm": 2.1281737835293675, "language_loss": 0.80131173, "learning_rate": 1.5096252083842877e-06, "loss": 0.82324386, "num_input_tokens_seen": 106097065, "step": 4919, "time_per_iteration": 2.747882843017578 }, { "auxiliary_loss_clip": 0.0115797, "auxiliary_loss_mlp": 0.01027714, "balance_loss_clip": 1.04389763, "balance_loss_mlp": 1.02044857, "epoch": 0.5915950219443276, "flos": 27417545786880.0, "grad_norm": 1.7752138463293996, "language_loss": 0.85031021, "learning_rate": 1.5088700515118285e-06, "loss": 0.87216711, "num_input_tokens_seen": 106116385, "step": 4920, "time_per_iteration": 2.7212016582489014 }, { "auxiliary_loss_clip": 0.01132175, "auxiliary_loss_mlp": 0.01026164, "balance_loss_clip": 1.04796767, "balance_loss_mlp": 1.01828766, "epoch": 0.5917152648349666, "flos": 21907879545600.0, "grad_norm": 1.8616438125388663, "language_loss": 0.66644305, "learning_rate": 1.508114969143392e-06, "loss": 0.68802643, "num_input_tokens_seen": 106136370, "step": 4921, "time_per_iteration": 2.708624839782715 }, { "auxiliary_loss_clip": 0.01148213, "auxiliary_loss_mlp": 0.01029241, "balance_loss_clip": 1.04404163, "balance_loss_mlp": 1.02233303, "epoch": 0.5918355077256057, "flos": 28109185142400.0, "grad_norm": 1.528157886800069, "language_loss": 0.77279973, "learning_rate": 1.5073599613935238e-06, "loss": 0.79457426, "num_input_tokens_seen": 106158490, "step": 4922, "time_per_iteration": 2.7133371829986572 }, { "auxiliary_loss_clip": 0.01148971, "auxiliary_loss_mlp": 0.01023706, "balance_loss_clip": 1.04647875, "balance_loss_mlp": 1.0164727, "epoch": 0.5919557506162448, "flos": 28183807647360.0, "grad_norm": 2.7564064509657142, "language_loss": 0.57801664, "learning_rate": 1.5066050283767574e-06, "loss": 0.59974337, "num_input_tokens_seen": 106179170, "step": 4923, "time_per_iteration": 2.7322793006896973 }, { "auxiliary_loss_clip": 0.0114453, "auxiliary_loss_mlp": 0.01024185, "balance_loss_clip": 1.04692149, "balance_loss_mlp": 1.01744032, "epoch": 0.5920759935068839, "flos": 12094355652480.0, "grad_norm": 1.9951460732544322, "language_loss": 0.82682478, "learning_rate": 1.505850170207616e-06, "loss": 0.84851193, "num_input_tokens_seen": 106196035, "step": 4924, "time_per_iteration": 3.5459909439086914 }, { "auxiliary_loss_clip": 0.01147451, "auxiliary_loss_mlp": 0.01029908, "balance_loss_clip": 1.04619503, "balance_loss_mlp": 1.02268445, "epoch": 0.592196236397523, "flos": 29424772673280.0, "grad_norm": 2.1070301641481244, "language_loss": 0.7789886, "learning_rate": 1.505095387000611e-06, "loss": 0.80076224, "num_input_tokens_seen": 106218335, "step": 4925, "time_per_iteration": 3.671443462371826 }, { "auxiliary_loss_clip": 0.01139889, "auxiliary_loss_mlp": 0.01026907, "balance_loss_clip": 1.04665816, "balance_loss_mlp": 1.01957595, "epoch": 0.5923164792881621, "flos": 24384709866240.0, "grad_norm": 1.9849252037190008, "language_loss": 0.74387193, "learning_rate": 1.504340678870242e-06, "loss": 0.76553988, "num_input_tokens_seen": 106236550, "step": 4926, "time_per_iteration": 2.7268197536468506 }, { "auxiliary_loss_clip": 0.01158723, "auxiliary_loss_mlp": 0.01025898, "balance_loss_clip": 1.04713011, "balance_loss_mlp": 1.01853693, "epoch": 0.5924367221788012, "flos": 24024238928640.0, "grad_norm": 2.1989745276419184, "language_loss": 0.89875001, "learning_rate": 1.5035860459309989e-06, "loss": 0.92059624, "num_input_tokens_seen": 106254265, "step": 4927, "time_per_iteration": 3.714533567428589 }, { "auxiliary_loss_clip": 0.01143474, "auxiliary_loss_mlp": 0.01027204, "balance_loss_clip": 1.04550195, "balance_loss_mlp": 1.01916885, "epoch": 0.5925569650694402, "flos": 26870590414080.0, "grad_norm": 1.7374245404197874, "language_loss": 0.63824964, "learning_rate": 1.5028314882973568e-06, "loss": 0.65995634, "num_input_tokens_seen": 106274670, "step": 4928, "time_per_iteration": 3.6416409015655518 }, { "auxiliary_loss_clip": 0.01146805, "auxiliary_loss_mlp": 0.01034879, "balance_loss_clip": 1.0461942, "balance_loss_mlp": 1.02706468, "epoch": 0.5926772079600794, "flos": 22302788647680.0, "grad_norm": 1.8351227075015912, "language_loss": 0.84849453, "learning_rate": 1.502077006083783e-06, "loss": 0.87031138, "num_input_tokens_seen": 106293330, "step": 4929, "time_per_iteration": 2.684135913848877 }, { "auxiliary_loss_clip": 0.01166988, "auxiliary_loss_mlp": 0.00762061, "balance_loss_clip": 1.05127478, "balance_loss_mlp": 1.00030804, "epoch": 0.5927974508507184, "flos": 19865244827520.0, "grad_norm": 1.929855842175919, "language_loss": 0.76628655, "learning_rate": 1.5013225994047315e-06, "loss": 0.78557706, "num_input_tokens_seen": 106310960, "step": 4930, "time_per_iteration": 2.664365291595459 }, { "auxiliary_loss_clip": 0.01164566, "auxiliary_loss_mlp": 0.00762391, "balance_loss_clip": 1.05053985, "balance_loss_mlp": 1.00041056, "epoch": 0.5929176937413575, "flos": 15776743167360.0, "grad_norm": 1.5601533452751828, "language_loss": 0.80694544, "learning_rate": 1.5005682683746452e-06, "loss": 0.82621503, "num_input_tokens_seen": 106329475, "step": 4931, "time_per_iteration": 2.655264139175415 }, { "auxiliary_loss_clip": 0.01165671, "auxiliary_loss_mlp": 0.01023696, "balance_loss_clip": 1.05232048, "balance_loss_mlp": 1.01661181, "epoch": 0.5930379366319967, "flos": 17601472028160.0, "grad_norm": 3.473202059069269, "language_loss": 0.73014712, "learning_rate": 1.4998140131079553e-06, "loss": 0.75204086, "num_input_tokens_seen": 106345565, "step": 4932, "time_per_iteration": 2.645610809326172 }, { "auxiliary_loss_clip": 0.01102625, "auxiliary_loss_mlp": 0.00762328, "balance_loss_clip": 1.04204106, "balance_loss_mlp": 1.00031996, "epoch": 0.5931581795226357, "flos": 17704283731200.0, "grad_norm": 1.7952848941786759, "language_loss": 0.73371428, "learning_rate": 1.4990598337190821e-06, "loss": 0.7523638, "num_input_tokens_seen": 106361920, "step": 4933, "time_per_iteration": 2.869096517562866 }, { "auxiliary_loss_clip": 0.01174698, "auxiliary_loss_mlp": 0.0076212, "balance_loss_clip": 1.04931426, "balance_loss_mlp": 1.00038075, "epoch": 0.5932784224132748, "flos": 24280102483200.0, "grad_norm": 1.8609330816637781, "language_loss": 0.68177021, "learning_rate": 1.4983057303224338e-06, "loss": 0.70113838, "num_input_tokens_seen": 106381735, "step": 4934, "time_per_iteration": 2.7019543647766113 }, { "auxiliary_loss_clip": 0.01117831, "auxiliary_loss_mlp": 0.01027254, "balance_loss_clip": 1.04206979, "balance_loss_mlp": 1.02008104, "epoch": 0.5933986653039139, "flos": 22926700909440.0, "grad_norm": 1.6330041852577895, "language_loss": 0.87480175, "learning_rate": 1.4975517030324072e-06, "loss": 0.89625263, "num_input_tokens_seen": 106399745, "step": 4935, "time_per_iteration": 2.879046678543091 }, { "auxiliary_loss_clip": 0.0106941, "auxiliary_loss_mlp": 0.00753789, "balance_loss_clip": 1.01016378, "balance_loss_mlp": 1.00033402, "epoch": 0.593518908194553, "flos": 71121730256640.0, "grad_norm": 0.7834552091149868, "language_loss": 0.61766863, "learning_rate": 1.4967977519633882e-06, "loss": 0.63590062, "num_input_tokens_seen": 106457205, "step": 4936, "time_per_iteration": 3.375359058380127 }, { "auxiliary_loss_clip": 0.01133161, "auxiliary_loss_mlp": 0.01025138, "balance_loss_clip": 1.04591322, "balance_loss_mlp": 1.01717472, "epoch": 0.593639151085192, "flos": 20448649526400.0, "grad_norm": 1.8996652854987477, "language_loss": 0.78097266, "learning_rate": 1.4960438772297494e-06, "loss": 0.80255568, "num_input_tokens_seen": 106474250, "step": 4937, "time_per_iteration": 2.631826162338257 }, { "auxiliary_loss_clip": 0.01151835, "auxiliary_loss_mlp": 0.01025471, "balance_loss_clip": 1.04609859, "balance_loss_mlp": 1.01803565, "epoch": 0.5937593939758312, "flos": 30883428074880.0, "grad_norm": 2.406045999110213, "language_loss": 0.73724508, "learning_rate": 1.495290078945855e-06, "loss": 0.75901812, "num_input_tokens_seen": 106494015, "step": 4938, "time_per_iteration": 2.7481181621551514 }, { "auxiliary_loss_clip": 0.01175109, "auxiliary_loss_mlp": 0.01029255, "balance_loss_clip": 1.05077052, "balance_loss_mlp": 1.02201557, "epoch": 0.5938796368664703, "flos": 36898069668480.0, "grad_norm": 3.1565330882268703, "language_loss": 0.74553788, "learning_rate": 1.4945363572260529e-06, "loss": 0.76758146, "num_input_tokens_seen": 106515010, "step": 4939, "time_per_iteration": 2.7523868083953857 }, { "auxiliary_loss_clip": 0.0115798, "auxiliary_loss_mlp": 0.01026481, "balance_loss_clip": 1.04702652, "balance_loss_mlp": 1.01943564, "epoch": 0.5939998797571093, "flos": 23842926051840.0, "grad_norm": 2.07074245523757, "language_loss": 0.67881203, "learning_rate": 1.4937827121846845e-06, "loss": 0.70065665, "num_input_tokens_seen": 106535265, "step": 4940, "time_per_iteration": 2.6404669284820557 }, { "auxiliary_loss_clip": 0.0112937, "auxiliary_loss_mlp": 0.01029876, "balance_loss_clip": 1.04728198, "balance_loss_mlp": 1.02221644, "epoch": 0.5941201226477485, "flos": 25191407462400.0, "grad_norm": 1.528919300996716, "language_loss": 0.73509103, "learning_rate": 1.4930291439360755e-06, "loss": 0.75668347, "num_input_tokens_seen": 106557830, "step": 4941, "time_per_iteration": 2.7574355602264404 }, { "auxiliary_loss_clip": 0.01162922, "auxiliary_loss_mlp": 0.01032008, "balance_loss_clip": 1.04863036, "balance_loss_mlp": 1.02381253, "epoch": 0.5942403655383875, "flos": 22418996123520.0, "grad_norm": 1.8821525518320121, "language_loss": 0.79288542, "learning_rate": 1.4922756525945427e-06, "loss": 0.81483471, "num_input_tokens_seen": 106577140, "step": 4942, "time_per_iteration": 2.6362712383270264 }, { "auxiliary_loss_clip": 0.01059847, "auxiliary_loss_mlp": 0.01000862, "balance_loss_clip": 1.00977492, "balance_loss_mlp": 0.99940729, "epoch": 0.5943606084290266, "flos": 67629310796160.0, "grad_norm": 0.7745454027183941, "language_loss": 0.59575236, "learning_rate": 1.4915222382743894e-06, "loss": 0.61635947, "num_input_tokens_seen": 106635975, "step": 4943, "time_per_iteration": 3.3216826915740967 }, { "auxiliary_loss_clip": 0.01164709, "auxiliary_loss_mlp": 0.01031785, "balance_loss_clip": 1.05090916, "balance_loss_mlp": 1.02457869, "epoch": 0.5944808513196658, "flos": 18223157646720.0, "grad_norm": 2.4145949082932643, "language_loss": 0.71859765, "learning_rate": 1.4907689010899085e-06, "loss": 0.74056262, "num_input_tokens_seen": 106653555, "step": 4944, "time_per_iteration": 2.696063995361328 }, { "auxiliary_loss_clip": 0.01147255, "auxiliary_loss_mlp": 0.01022284, "balance_loss_clip": 1.04736722, "balance_loss_mlp": 1.01478899, "epoch": 0.5946010942103048, "flos": 24790824011520.0, "grad_norm": 1.9970512468044177, "language_loss": 0.62627369, "learning_rate": 1.4900156411553804e-06, "loss": 0.64796913, "num_input_tokens_seen": 106673385, "step": 4945, "time_per_iteration": 2.755937099456787 }, { "auxiliary_loss_clip": 0.01151707, "auxiliary_loss_mlp": 0.01033008, "balance_loss_clip": 1.04963422, "balance_loss_mlp": 1.02566469, "epoch": 0.5947213371009439, "flos": 15231619388160.0, "grad_norm": 2.217266874433972, "language_loss": 0.85504138, "learning_rate": 1.4892624585850739e-06, "loss": 0.87688857, "num_input_tokens_seen": 106691740, "step": 4946, "time_per_iteration": 2.626225471496582 }, { "auxiliary_loss_clip": 0.01178728, "auxiliary_loss_mlp": 0.01025544, "balance_loss_clip": 1.05095673, "balance_loss_mlp": 1.01769459, "epoch": 0.594841579991583, "flos": 25848069949440.0, "grad_norm": 3.7621189102549693, "language_loss": 0.7968986, "learning_rate": 1.4885093534932465e-06, "loss": 0.81894135, "num_input_tokens_seen": 106709705, "step": 4947, "time_per_iteration": 2.6524109840393066 }, { "auxiliary_loss_clip": 0.01149233, "auxiliary_loss_mlp": 0.01031908, "balance_loss_clip": 1.05021203, "balance_loss_mlp": 1.02427268, "epoch": 0.5949618228822221, "flos": 23981109672960.0, "grad_norm": 2.17079063834989, "language_loss": 0.71543622, "learning_rate": 1.4877563259941433e-06, "loss": 0.73724759, "num_input_tokens_seen": 106727560, "step": 4948, "time_per_iteration": 2.695841073989868 }, { "auxiliary_loss_clip": 0.01173635, "auxiliary_loss_mlp": 0.01027662, "balance_loss_clip": 1.05417943, "balance_loss_mlp": 1.01959193, "epoch": 0.5950820657728612, "flos": 40547491476480.0, "grad_norm": 2.5168648517024703, "language_loss": 0.68135923, "learning_rate": 1.4870033762019988e-06, "loss": 0.70337224, "num_input_tokens_seen": 106747725, "step": 4949, "time_per_iteration": 2.7337679862976074 }, { "auxiliary_loss_clip": 0.01144199, "auxiliary_loss_mlp": 0.01024535, "balance_loss_clip": 1.04590869, "balance_loss_mlp": 1.01675701, "epoch": 0.5952023086635003, "flos": 23184467884800.0, "grad_norm": 1.5581550825109456, "language_loss": 0.73364913, "learning_rate": 1.4862505042310334e-06, "loss": 0.75533652, "num_input_tokens_seen": 106767010, "step": 4950, "time_per_iteration": 3.6990439891815186 }, { "auxiliary_loss_clip": 0.0113992, "auxiliary_loss_mlp": 0.01028723, "balance_loss_clip": 1.04666436, "balance_loss_mlp": 1.02115345, "epoch": 0.5953225515541394, "flos": 33653289548160.0, "grad_norm": 1.6561506438949212, "language_loss": 0.69615495, "learning_rate": 1.4854977101954587e-06, "loss": 0.71784139, "num_input_tokens_seen": 106789230, "step": 4951, "time_per_iteration": 3.6728219985961914 }, { "auxiliary_loss_clip": 0.01161938, "auxiliary_loss_mlp": 0.0102755, "balance_loss_clip": 1.04527819, "balance_loss_mlp": 1.01977444, "epoch": 0.5954427944447784, "flos": 24459619680000.0, "grad_norm": 1.8769045931652921, "language_loss": 0.86637646, "learning_rate": 1.4847449942094716e-06, "loss": 0.88827133, "num_input_tokens_seen": 106808110, "step": 4952, "time_per_iteration": 3.5909407138824463 }, { "auxiliary_loss_clip": 0.01142818, "auxiliary_loss_mlp": 0.0102228, "balance_loss_clip": 1.04623818, "balance_loss_mlp": 1.01478219, "epoch": 0.5955630373354175, "flos": 18551848026240.0, "grad_norm": 2.338284585982738, "language_loss": 0.8639586, "learning_rate": 1.4839923563872598e-06, "loss": 0.88560957, "num_input_tokens_seen": 106826650, "step": 4953, "time_per_iteration": 2.668748140335083 }, { "auxiliary_loss_clip": 0.01134512, "auxiliary_loss_mlp": 0.01032523, "balance_loss_clip": 1.04762721, "balance_loss_mlp": 1.02506077, "epoch": 0.5956832802260567, "flos": 19791699730560.0, "grad_norm": 1.6809302522707876, "language_loss": 0.75966096, "learning_rate": 1.483239796842997e-06, "loss": 0.7813313, "num_input_tokens_seen": 106844680, "step": 4954, "time_per_iteration": 2.658351421356201 }, { "auxiliary_loss_clip": 0.01134189, "auxiliary_loss_mlp": 0.01028189, "balance_loss_clip": 1.04570949, "balance_loss_mlp": 1.02060688, "epoch": 0.5958035231166957, "flos": 19750868945280.0, "grad_norm": 1.622825129218234, "language_loss": 0.83730525, "learning_rate": 1.4824873156908462e-06, "loss": 0.85892904, "num_input_tokens_seen": 106862605, "step": 4955, "time_per_iteration": 3.5932064056396484 }, { "auxiliary_loss_clip": 0.01166817, "auxiliary_loss_mlp": 0.00763133, "balance_loss_clip": 1.05168653, "balance_loss_mlp": 1.00038528, "epoch": 0.5959237660073348, "flos": 21652806090240.0, "grad_norm": 1.5194661797305027, "language_loss": 0.75879288, "learning_rate": 1.4817349130449584e-06, "loss": 0.77809238, "num_input_tokens_seen": 106882325, "step": 4956, "time_per_iteration": 2.6862120628356934 }, { "auxiliary_loss_clip": 0.01159185, "auxiliary_loss_mlp": 0.01025453, "balance_loss_clip": 1.04763031, "balance_loss_mlp": 1.01839542, "epoch": 0.5960440088979739, "flos": 21171207513600.0, "grad_norm": 2.6989827599012455, "language_loss": 0.82674837, "learning_rate": 1.4809825890194717e-06, "loss": 0.84859478, "num_input_tokens_seen": 106900995, "step": 4957, "time_per_iteration": 2.630902051925659 }, { "auxiliary_loss_clip": 0.01142236, "auxiliary_loss_mlp": 0.01026093, "balance_loss_clip": 1.04552639, "balance_loss_mlp": 1.0185771, "epoch": 0.596164251788613, "flos": 14757526753920.0, "grad_norm": 1.8605523188671738, "language_loss": 0.77308142, "learning_rate": 1.4802303437285139e-06, "loss": 0.7947647, "num_input_tokens_seen": 106918265, "step": 4958, "time_per_iteration": 2.8147284984588623 }, { "auxiliary_loss_clip": 0.01144943, "auxiliary_loss_mlp": 0.01022297, "balance_loss_clip": 1.04495668, "balance_loss_mlp": 1.01477754, "epoch": 0.596284494679252, "flos": 20485924865280.0, "grad_norm": 3.1164179526804237, "language_loss": 0.80827016, "learning_rate": 1.4794781772861994e-06, "loss": 0.82994258, "num_input_tokens_seen": 106934760, "step": 4959, "time_per_iteration": 2.6869027614593506 }, { "auxiliary_loss_clip": 0.01146989, "auxiliary_loss_mlp": 0.00762262, "balance_loss_clip": 1.04625249, "balance_loss_mlp": 1.00038064, "epoch": 0.5964047375698912, "flos": 31212262108800.0, "grad_norm": 1.948902827572513, "language_loss": 0.66892266, "learning_rate": 1.4787260898066324e-06, "loss": 0.68801522, "num_input_tokens_seen": 106954760, "step": 4960, "time_per_iteration": 2.771940231323242 }, { "auxiliary_loss_clip": 0.01174262, "auxiliary_loss_mlp": 0.01024873, "balance_loss_clip": 1.0502882, "balance_loss_mlp": 1.01764321, "epoch": 0.5965249804605303, "flos": 27483620855040.0, "grad_norm": 2.1605123964985116, "language_loss": 0.85705084, "learning_rate": 1.4779740814039023e-06, "loss": 0.87904227, "num_input_tokens_seen": 106974845, "step": 4961, "time_per_iteration": 2.6879870891571045 }, { "auxiliary_loss_clip": 0.01174596, "auxiliary_loss_mlp": 0.01021949, "balance_loss_clip": 1.04902303, "balance_loss_mlp": 1.01386094, "epoch": 0.5966452233511693, "flos": 30773936442240.0, "grad_norm": 1.9437018735303446, "language_loss": 0.6871255, "learning_rate": 1.4772221521920894e-06, "loss": 0.70909095, "num_input_tokens_seen": 106994870, "step": 4962, "time_per_iteration": 2.7051305770874023 }, { "auxiliary_loss_clip": 0.01147628, "auxiliary_loss_mlp": 0.01029238, "balance_loss_clip": 1.04878104, "balance_loss_mlp": 1.02145314, "epoch": 0.5967654662418085, "flos": 25481170477440.0, "grad_norm": 3.496652556396209, "language_loss": 0.74320066, "learning_rate": 1.4764703022852598e-06, "loss": 0.76496929, "num_input_tokens_seen": 107015390, "step": 4963, "time_per_iteration": 2.7165348529815674 }, { "auxiliary_loss_clip": 0.01092845, "auxiliary_loss_mlp": 0.0102811, "balance_loss_clip": 1.03999472, "balance_loss_mlp": 1.02144647, "epoch": 0.5968857091324475, "flos": 19099126621440.0, "grad_norm": 1.9836007547313463, "language_loss": 0.77198255, "learning_rate": 1.4757185317974696e-06, "loss": 0.79319215, "num_input_tokens_seen": 107033775, "step": 4964, "time_per_iteration": 2.7762391567230225 }, { "auxiliary_loss_clip": 0.01163314, "auxiliary_loss_mlp": 0.01026683, "balance_loss_clip": 1.04860091, "balance_loss_mlp": 1.01860702, "epoch": 0.5970059520230866, "flos": 23692711374720.0, "grad_norm": 2.2274890348095715, "language_loss": 0.70681584, "learning_rate": 1.474966840842761e-06, "loss": 0.72871578, "num_input_tokens_seen": 107053355, "step": 4965, "time_per_iteration": 2.6759989261627197 }, { "auxiliary_loss_clip": 0.01162876, "auxiliary_loss_mlp": 0.010249, "balance_loss_clip": 1.04797935, "balance_loss_mlp": 1.01751161, "epoch": 0.5971261949137258, "flos": 23185545292800.0, "grad_norm": 1.8216510722485046, "language_loss": 0.87092376, "learning_rate": 1.4742152295351655e-06, "loss": 0.89280146, "num_input_tokens_seen": 107072510, "step": 4966, "time_per_iteration": 2.662252187728882 }, { "auxiliary_loss_clip": 0.01162492, "auxiliary_loss_mlp": 0.00762976, "balance_loss_clip": 1.0482918, "balance_loss_mlp": 1.00030446, "epoch": 0.5972464378043648, "flos": 20557710195840.0, "grad_norm": 2.310148593338156, "language_loss": 0.64144266, "learning_rate": 1.4734636979887016e-06, "loss": 0.66069734, "num_input_tokens_seen": 107089970, "step": 4967, "time_per_iteration": 2.656702995300293 }, { "auxiliary_loss_clip": 0.01138861, "auxiliary_loss_mlp": 0.01024341, "balance_loss_clip": 1.04556227, "balance_loss_mlp": 1.01686072, "epoch": 0.5973666806950039, "flos": 29387030457600.0, "grad_norm": 1.9570763154506534, "language_loss": 0.9030174, "learning_rate": 1.4727122463173755e-06, "loss": 0.92464948, "num_input_tokens_seen": 107108500, "step": 4968, "time_per_iteration": 2.8027780055999756 }, { "auxiliary_loss_clip": 0.01145023, "auxiliary_loss_mlp": 0.01025826, "balance_loss_clip": 1.0479207, "balance_loss_mlp": 1.01850414, "epoch": 0.597486923585643, "flos": 22273522041600.0, "grad_norm": 1.8692096463166925, "language_loss": 0.64752495, "learning_rate": 1.471960874635183e-06, "loss": 0.66923344, "num_input_tokens_seen": 107128060, "step": 4969, "time_per_iteration": 2.91204571723938 }, { "auxiliary_loss_clip": 0.01140117, "auxiliary_loss_mlp": 0.01023938, "balance_loss_clip": 1.04353905, "balance_loss_mlp": 1.01637435, "epoch": 0.5976071664762821, "flos": 13772461196160.0, "grad_norm": 2.4784733591170847, "language_loss": 0.70519096, "learning_rate": 1.4712095830561055e-06, "loss": 0.7268315, "num_input_tokens_seen": 107146550, "step": 4970, "time_per_iteration": 2.671529769897461 }, { "auxiliary_loss_clip": 0.01146363, "auxiliary_loss_mlp": 0.01022786, "balance_loss_clip": 1.04487586, "balance_loss_mlp": 1.01563644, "epoch": 0.5977274093669211, "flos": 19098623831040.0, "grad_norm": 1.7337669579252672, "language_loss": 0.80786115, "learning_rate": 1.4704583716941147e-06, "loss": 0.82955265, "num_input_tokens_seen": 107165415, "step": 4971, "time_per_iteration": 2.663783073425293 }, { "auxiliary_loss_clip": 0.01155808, "auxiliary_loss_mlp": 0.01026793, "balance_loss_clip": 1.04874134, "balance_loss_mlp": 1.01850832, "epoch": 0.5978476522575603, "flos": 20376002269440.0, "grad_norm": 1.7485822365449648, "language_loss": 0.72292298, "learning_rate": 1.4697072406631672e-06, "loss": 0.74474895, "num_input_tokens_seen": 107185320, "step": 4972, "time_per_iteration": 2.6577064990997314 }, { "auxiliary_loss_clip": 0.01126295, "auxiliary_loss_mlp": 0.01037081, "balance_loss_clip": 1.04964566, "balance_loss_mlp": 1.02931464, "epoch": 0.5979678951481994, "flos": 29023147728000.0, "grad_norm": 1.6442295515928538, "language_loss": 0.7258296, "learning_rate": 1.4689561900772097e-06, "loss": 0.74746335, "num_input_tokens_seen": 107205380, "step": 4973, "time_per_iteration": 2.7852942943573 }, { "auxiliary_loss_clip": 0.01144557, "auxiliary_loss_mlp": 0.01034888, "balance_loss_clip": 1.04388666, "balance_loss_mlp": 1.02733016, "epoch": 0.5980881380388384, "flos": 17967689141760.0, "grad_norm": 2.3590652531219525, "language_loss": 0.72582763, "learning_rate": 1.4682052200501758e-06, "loss": 0.74762213, "num_input_tokens_seen": 107222585, "step": 4974, "time_per_iteration": 2.664034128189087 }, { "auxiliary_loss_clip": 0.01173494, "auxiliary_loss_mlp": 0.01029593, "balance_loss_clip": 1.04869485, "balance_loss_mlp": 1.02242565, "epoch": 0.5982083809294776, "flos": 22962827013120.0, "grad_norm": 1.742813449056899, "language_loss": 0.80094767, "learning_rate": 1.4674543306959876e-06, "loss": 0.8229785, "num_input_tokens_seen": 107242055, "step": 4975, "time_per_iteration": 2.72810959815979 }, { "auxiliary_loss_clip": 0.0115124, "auxiliary_loss_mlp": 0.01028598, "balance_loss_clip": 1.04658675, "balance_loss_mlp": 1.02006805, "epoch": 0.5983286238201166, "flos": 20991941712000.0, "grad_norm": 2.2135592536200397, "language_loss": 0.84194541, "learning_rate": 1.4667035221285535e-06, "loss": 0.8637439, "num_input_tokens_seen": 107259695, "step": 4976, "time_per_iteration": 3.6211395263671875 }, { "auxiliary_loss_clip": 0.01159763, "auxiliary_loss_mlp": 0.0102519, "balance_loss_clip": 1.04996049, "balance_loss_mlp": 1.01801991, "epoch": 0.5984488667107557, "flos": 28183448511360.0, "grad_norm": 1.8937107112323335, "language_loss": 0.74422824, "learning_rate": 1.4659527944617715e-06, "loss": 0.76607776, "num_input_tokens_seen": 107279640, "step": 4977, "time_per_iteration": 3.544301986694336 }, { "auxiliary_loss_clip": 0.01098939, "auxiliary_loss_mlp": 0.01027912, "balance_loss_clip": 1.04045033, "balance_loss_mlp": 1.0204674, "epoch": 0.5985691096013949, "flos": 16471794314880.0, "grad_norm": 1.9855193615362312, "language_loss": 0.76120651, "learning_rate": 1.465202147809526e-06, "loss": 0.78247505, "num_input_tokens_seen": 107298135, "step": 4978, "time_per_iteration": 3.659019947052002 }, { "auxiliary_loss_clip": 0.01176185, "auxiliary_loss_mlp": 0.01026107, "balance_loss_clip": 1.05002856, "balance_loss_mlp": 1.01907349, "epoch": 0.5986893524920339, "flos": 26719046933760.0, "grad_norm": 2.126129869938563, "language_loss": 0.76334977, "learning_rate": 1.4644515822856888e-06, "loss": 0.78537267, "num_input_tokens_seen": 107316570, "step": 4979, "time_per_iteration": 2.617230176925659 }, { "auxiliary_loss_clip": 0.01037436, "auxiliary_loss_mlp": 0.0100291, "balance_loss_clip": 1.00956309, "balance_loss_mlp": 1.00145578, "epoch": 0.598809595382673, "flos": 61608061100160.0, "grad_norm": 0.7577283473028841, "language_loss": 0.56525201, "learning_rate": 1.4637010980041215e-06, "loss": 0.58565545, "num_input_tokens_seen": 107378680, "step": 4980, "time_per_iteration": 4.172622442245483 }, { "auxiliary_loss_clip": 0.01177762, "auxiliary_loss_mlp": 0.01032541, "balance_loss_clip": 1.05156636, "balance_loss_mlp": 1.02447009, "epoch": 0.5989298382733121, "flos": 11801719549440.0, "grad_norm": 2.2015109682280674, "language_loss": 0.89729786, "learning_rate": 1.4629506950786707e-06, "loss": 0.91940081, "num_input_tokens_seen": 107394860, "step": 4981, "time_per_iteration": 2.6379129886627197 }, { "auxiliary_loss_clip": 0.01069123, "auxiliary_loss_mlp": 0.01001658, "balance_loss_clip": 1.01047826, "balance_loss_mlp": 1.00028121, "epoch": 0.5990500811639512, "flos": 60025800021120.0, "grad_norm": 0.8155934540320864, "language_loss": 0.56046015, "learning_rate": 1.4622003736231733e-06, "loss": 0.58116794, "num_input_tokens_seen": 107453850, "step": 4982, "time_per_iteration": 3.2368595600128174 }, { "auxiliary_loss_clip": 0.0115867, "auxiliary_loss_mlp": 0.01026704, "balance_loss_clip": 1.04893994, "balance_loss_mlp": 1.0190922, "epoch": 0.5991703240545903, "flos": 18222726683520.0, "grad_norm": 1.8040983779012478, "language_loss": 0.80395114, "learning_rate": 1.461450133751451e-06, "loss": 0.82580489, "num_input_tokens_seen": 107471920, "step": 4983, "time_per_iteration": 2.6203761100769043 }, { "auxiliary_loss_clip": 0.01164115, "auxiliary_loss_mlp": 0.0102446, "balance_loss_clip": 1.04836094, "balance_loss_mlp": 1.01723886, "epoch": 0.5992905669452293, "flos": 27709894581120.0, "grad_norm": 1.699729858697749, "language_loss": 0.75607479, "learning_rate": 1.4606999755773153e-06, "loss": 0.77796054, "num_input_tokens_seen": 107493125, "step": 4984, "time_per_iteration": 2.673377275466919 }, { "auxiliary_loss_clip": 0.01174456, "auxiliary_loss_mlp": 0.01025381, "balance_loss_clip": 1.05015087, "balance_loss_mlp": 1.01764393, "epoch": 0.5994108098358685, "flos": 20449008662400.0, "grad_norm": 1.7105095461652378, "language_loss": 0.82045698, "learning_rate": 1.4599498992145643e-06, "loss": 0.84245539, "num_input_tokens_seen": 107513150, "step": 4985, "time_per_iteration": 2.657057046890259 }, { "auxiliary_loss_clip": 0.01151239, "auxiliary_loss_mlp": 0.0076247, "balance_loss_clip": 1.04745626, "balance_loss_mlp": 1.00044823, "epoch": 0.5995310527265075, "flos": 22269966595200.0, "grad_norm": 2.821284256225329, "language_loss": 0.70468926, "learning_rate": 1.4591999047769846e-06, "loss": 0.72382629, "num_input_tokens_seen": 107532005, "step": 4986, "time_per_iteration": 2.6651856899261475 }, { "auxiliary_loss_clip": 0.01098931, "auxiliary_loss_mlp": 0.01028446, "balance_loss_clip": 1.03836322, "balance_loss_mlp": 1.02073312, "epoch": 0.5996512956171466, "flos": 18916951818240.0, "grad_norm": 1.6655067413566855, "language_loss": 0.7570219, "learning_rate": 1.4584499923783486e-06, "loss": 0.77829564, "num_input_tokens_seen": 107550585, "step": 4987, "time_per_iteration": 2.6733927726745605 }, { "auxiliary_loss_clip": 0.01148702, "auxiliary_loss_mlp": 0.01027656, "balance_loss_clip": 1.04833603, "balance_loss_mlp": 1.0199734, "epoch": 0.5997715385077858, "flos": 15370916330880.0, "grad_norm": 2.4458279903328397, "language_loss": 0.75861722, "learning_rate": 1.457700162132419e-06, "loss": 0.78038085, "num_input_tokens_seen": 107567575, "step": 4988, "time_per_iteration": 2.647803783416748 }, { "auxiliary_loss_clip": 0.01116751, "auxiliary_loss_mlp": 0.01025305, "balance_loss_clip": 1.04427564, "balance_loss_mlp": 1.01784861, "epoch": 0.5998917813984248, "flos": 25264844818560.0, "grad_norm": 2.0274138832116915, "language_loss": 0.72447371, "learning_rate": 1.4569504141529433e-06, "loss": 0.74589431, "num_input_tokens_seen": 107585410, "step": 4989, "time_per_iteration": 2.7681922912597656 }, { "auxiliary_loss_clip": 0.01160683, "auxiliary_loss_mlp": 0.01024908, "balance_loss_clip": 1.05025816, "balance_loss_mlp": 1.01673007, "epoch": 0.6000120242890639, "flos": 22054502862720.0, "grad_norm": 2.229542913503612, "language_loss": 0.72293025, "learning_rate": 1.456200748553658e-06, "loss": 0.74478614, "num_input_tokens_seen": 107603405, "step": 4990, "time_per_iteration": 2.7586252689361572 }, { "auxiliary_loss_clip": 0.01177822, "auxiliary_loss_mlp": 0.01034564, "balance_loss_clip": 1.05173874, "balance_loss_mlp": 1.02647591, "epoch": 0.600132267179703, "flos": 29863421562240.0, "grad_norm": 1.8311269296003176, "language_loss": 0.78879142, "learning_rate": 1.455451165448287e-06, "loss": 0.81091529, "num_input_tokens_seen": 107626060, "step": 4991, "time_per_iteration": 2.7449238300323486 }, { "auxiliary_loss_clip": 0.01146881, "auxiliary_loss_mlp": 0.01031124, "balance_loss_clip": 1.04713464, "balance_loss_mlp": 1.0230056, "epoch": 0.6002525100703421, "flos": 25045358762880.0, "grad_norm": 2.8142612497468256, "language_loss": 0.73995018, "learning_rate": 1.4547016649505407e-06, "loss": 0.76173019, "num_input_tokens_seen": 107644070, "step": 4992, "time_per_iteration": 2.712965250015259 }, { "auxiliary_loss_clip": 0.01129807, "auxiliary_loss_mlp": 0.01030074, "balance_loss_clip": 1.04079795, "balance_loss_mlp": 1.02222979, "epoch": 0.6003727529609811, "flos": 20849592113280.0, "grad_norm": 2.066832787943725, "language_loss": 0.84808391, "learning_rate": 1.4539522471741193e-06, "loss": 0.86968267, "num_input_tokens_seen": 107661495, "step": 4993, "time_per_iteration": 2.9016408920288086 }, { "auxiliary_loss_clip": 0.01168101, "auxiliary_loss_mlp": 0.01031301, "balance_loss_clip": 1.05004084, "balance_loss_mlp": 1.02296257, "epoch": 0.6004929958516203, "flos": 15594604277760.0, "grad_norm": 2.1687986983771927, "language_loss": 0.71459174, "learning_rate": 1.4532029122327067e-06, "loss": 0.73658574, "num_input_tokens_seen": 107678280, "step": 4994, "time_per_iteration": 2.6248929500579834 }, { "auxiliary_loss_clip": 0.01128294, "auxiliary_loss_mlp": 0.01028773, "balance_loss_clip": 1.04761577, "balance_loss_mlp": 1.02173376, "epoch": 0.6006132387422594, "flos": 21763267390080.0, "grad_norm": 1.9993562021188211, "language_loss": 0.75652814, "learning_rate": 1.4524536602399783e-06, "loss": 0.77809882, "num_input_tokens_seen": 107697370, "step": 4995, "time_per_iteration": 2.6576426029205322 }, { "auxiliary_loss_clip": 0.0114764, "auxiliary_loss_mlp": 0.01027208, "balance_loss_clip": 1.05090714, "balance_loss_mlp": 1.01967132, "epoch": 0.6007334816328984, "flos": 22858542852480.0, "grad_norm": 1.5962312636885647, "language_loss": 0.77501786, "learning_rate": 1.4517044913095938e-06, "loss": 0.79676634, "num_input_tokens_seen": 107717790, "step": 4996, "time_per_iteration": 2.6031856536865234 }, { "auxiliary_loss_clip": 0.01162213, "auxiliary_loss_mlp": 0.0102866, "balance_loss_clip": 1.04840517, "balance_loss_mlp": 1.02079201, "epoch": 0.6008537245235376, "flos": 28324577047680.0, "grad_norm": 2.171677519191441, "language_loss": 0.81636995, "learning_rate": 1.4509554055552022e-06, "loss": 0.83827877, "num_input_tokens_seen": 107738020, "step": 4997, "time_per_iteration": 2.614917278289795 }, { "auxiliary_loss_clip": 0.01146737, "auxiliary_loss_mlp": 0.01030528, "balance_loss_clip": 1.04679215, "balance_loss_mlp": 1.02280068, "epoch": 0.6009739674141766, "flos": 20886113266560.0, "grad_norm": 2.234984670630521, "language_loss": 0.83850694, "learning_rate": 1.450206403090439e-06, "loss": 0.86027956, "num_input_tokens_seen": 107756215, "step": 4998, "time_per_iteration": 2.6205227375030518 }, { "auxiliary_loss_clip": 0.01159455, "auxiliary_loss_mlp": 0.01027715, "balance_loss_clip": 1.04985797, "balance_loss_mlp": 1.02055037, "epoch": 0.6010942103048157, "flos": 20481004702080.0, "grad_norm": 1.923039522096354, "language_loss": 0.86052781, "learning_rate": 1.4494574840289274e-06, "loss": 0.8823995, "num_input_tokens_seen": 107773330, "step": 4999, "time_per_iteration": 2.6754627227783203 }, { "auxiliary_loss_clip": 0.0116861, "auxiliary_loss_mlp": 0.01032258, "balance_loss_clip": 1.04965281, "balance_loss_mlp": 1.02378213, "epoch": 0.6012144531954549, "flos": 23805973935360.0, "grad_norm": 1.6824643289282981, "language_loss": 0.73743987, "learning_rate": 1.4487086484842782e-06, "loss": 0.75944853, "num_input_tokens_seen": 107791975, "step": 5000, "time_per_iteration": 2.7889316082000732 }, { "auxiliary_loss_clip": 0.01173817, "auxiliary_loss_mlp": 0.0102346, "balance_loss_clip": 1.04940665, "balance_loss_mlp": 1.01603937, "epoch": 0.6013346960860939, "flos": 18988378012800.0, "grad_norm": 1.909405005234516, "language_loss": 0.60089952, "learning_rate": 1.4479598965700878e-06, "loss": 0.62287229, "num_input_tokens_seen": 107809240, "step": 5001, "time_per_iteration": 3.501450538635254 }, { "auxiliary_loss_clip": 0.01130531, "auxiliary_loss_mlp": 0.01029853, "balance_loss_clip": 1.04376078, "balance_loss_mlp": 1.02243805, "epoch": 0.601454938976733, "flos": 24025316336640.0, "grad_norm": 2.1552275029686183, "language_loss": 0.69116771, "learning_rate": 1.4472112283999427e-06, "loss": 0.71277153, "num_input_tokens_seen": 107827895, "step": 5002, "time_per_iteration": 2.737783908843994 }, { "auxiliary_loss_clip": 0.01160277, "auxiliary_loss_mlp": 0.01026484, "balance_loss_clip": 1.05239105, "balance_loss_mlp": 1.0189743, "epoch": 0.6015751818673721, "flos": 26427129102720.0, "grad_norm": 2.1411522917084462, "language_loss": 0.68869638, "learning_rate": 1.4464626440874143e-06, "loss": 0.71056396, "num_input_tokens_seen": 107847010, "step": 5003, "time_per_iteration": 3.5529987812042236 }, { "auxiliary_loss_clip": 0.01127504, "auxiliary_loss_mlp": 0.01028814, "balance_loss_clip": 1.0415628, "balance_loss_mlp": 1.02068698, "epoch": 0.6016954247580112, "flos": 13115260005120.0, "grad_norm": 2.421227621092797, "language_loss": 0.74458504, "learning_rate": 1.4457141437460636e-06, "loss": 0.76614821, "num_input_tokens_seen": 107864235, "step": 5004, "time_per_iteration": 3.5994081497192383 }, { "auxiliary_loss_clip": 0.01147862, "auxiliary_loss_mlp": 0.01031073, "balance_loss_clip": 1.04659247, "balance_loss_mlp": 1.02281165, "epoch": 0.6018156676486502, "flos": 23768447201280.0, "grad_norm": 1.9783213232247108, "language_loss": 0.73283482, "learning_rate": 1.444965727489436e-06, "loss": 0.75462419, "num_input_tokens_seen": 107883680, "step": 5005, "time_per_iteration": 2.5895891189575195 }, { "auxiliary_loss_clip": 0.01129648, "auxiliary_loss_mlp": 0.01029378, "balance_loss_clip": 1.04296708, "balance_loss_mlp": 1.02116489, "epoch": 0.6019359105392894, "flos": 26469360518400.0, "grad_norm": 2.368925015647772, "language_loss": 0.63183415, "learning_rate": 1.444217395431066e-06, "loss": 0.65342438, "num_input_tokens_seen": 107906220, "step": 5006, "time_per_iteration": 3.4323251247406006 }, { "auxiliary_loss_clip": 0.01033215, "auxiliary_loss_mlp": 0.01001745, "balance_loss_clip": 1.00969291, "balance_loss_mlp": 1.000458, "epoch": 0.6020561534299285, "flos": 69190849728000.0, "grad_norm": 0.8309960699152448, "language_loss": 0.55832732, "learning_rate": 1.4434691476844755e-06, "loss": 0.57867694, "num_input_tokens_seen": 107967195, "step": 5007, "time_per_iteration": 3.2029309272766113 }, { "auxiliary_loss_clip": 0.01146857, "auxiliary_loss_mlp": 0.01029797, "balance_loss_clip": 1.05058956, "balance_loss_mlp": 1.02233434, "epoch": 0.6021763963205675, "flos": 21835304115840.0, "grad_norm": 2.2505375170237847, "language_loss": 0.66802537, "learning_rate": 1.4427209843631729e-06, "loss": 0.68979192, "num_input_tokens_seen": 107984245, "step": 5008, "time_per_iteration": 2.703042984008789 }, { "auxiliary_loss_clip": 0.01175423, "auxiliary_loss_mlp": 0.00762405, "balance_loss_clip": 1.05111337, "balance_loss_mlp": 1.00055385, "epoch": 0.6022966392112067, "flos": 26578636669440.0, "grad_norm": 1.7959321209807007, "language_loss": 0.80555946, "learning_rate": 1.4419729055806534e-06, "loss": 0.82493782, "num_input_tokens_seen": 108003680, "step": 5009, "time_per_iteration": 2.637044668197632 }, { "auxiliary_loss_clip": 0.01141918, "auxiliary_loss_mlp": 0.00762525, "balance_loss_clip": 1.04756641, "balance_loss_mlp": 1.00039804, "epoch": 0.6024168821018457, "flos": 20703722981760.0, "grad_norm": 1.7704001005201455, "language_loss": 0.82227659, "learning_rate": 1.441224911450401e-06, "loss": 0.84132105, "num_input_tokens_seen": 108019635, "step": 5010, "time_per_iteration": 2.773890256881714 }, { "auxiliary_loss_clip": 0.01167267, "auxiliary_loss_mlp": 0.01028078, "balance_loss_clip": 1.05093133, "balance_loss_mlp": 1.0205915, "epoch": 0.6025371249924848, "flos": 24680973242880.0, "grad_norm": 1.7712176978128895, "language_loss": 0.82064855, "learning_rate": 1.4404770020858851e-06, "loss": 0.84260201, "num_input_tokens_seen": 108039120, "step": 5011, "time_per_iteration": 2.675382137298584 }, { "auxiliary_loss_clip": 0.01156024, "auxiliary_loss_mlp": 0.01028804, "balance_loss_clip": 1.0477078, "balance_loss_mlp": 1.02049494, "epoch": 0.602657367883124, "flos": 25955801815680.0, "grad_norm": 1.7996071502404007, "language_loss": 0.85822195, "learning_rate": 1.439729177600563e-06, "loss": 0.88007027, "num_input_tokens_seen": 108059615, "step": 5012, "time_per_iteration": 2.691541910171509 }, { "auxiliary_loss_clip": 0.01159479, "auxiliary_loss_mlp": 0.01027763, "balance_loss_clip": 1.04908085, "balance_loss_mlp": 1.02044332, "epoch": 0.602777610773763, "flos": 16690633925760.0, "grad_norm": 1.8141227878972637, "language_loss": 0.73402786, "learning_rate": 1.4389814381078793e-06, "loss": 0.75590026, "num_input_tokens_seen": 108078855, "step": 5013, "time_per_iteration": 2.692416191101074 }, { "auxiliary_loss_clip": 0.01068671, "auxiliary_loss_mlp": 0.01030576, "balance_loss_clip": 1.04226673, "balance_loss_mlp": 1.02327406, "epoch": 0.6028978536644021, "flos": 13334243270400.0, "grad_norm": 2.1902465090951853, "language_loss": 0.79799104, "learning_rate": 1.438233783721265e-06, "loss": 0.81898355, "num_input_tokens_seen": 108095020, "step": 5014, "time_per_iteration": 3.179271936416626 }, { "auxiliary_loss_clip": 0.01144687, "auxiliary_loss_mlp": 0.01030523, "balance_loss_clip": 1.05154192, "balance_loss_mlp": 1.02315021, "epoch": 0.6030180965550412, "flos": 19644825018240.0, "grad_norm": 3.5764874742502526, "language_loss": 0.78025657, "learning_rate": 1.43748621455414e-06, "loss": 0.80200863, "num_input_tokens_seen": 108111455, "step": 5015, "time_per_iteration": 3.1983656883239746 }, { "auxiliary_loss_clip": 0.01146933, "auxiliary_loss_mlp": 0.01027396, "balance_loss_clip": 1.04933667, "balance_loss_mlp": 1.01962352, "epoch": 0.6031383394456803, "flos": 14458390289280.0, "grad_norm": 2.8112249118798, "language_loss": 0.80876547, "learning_rate": 1.4367387307199082e-06, "loss": 0.83050877, "num_input_tokens_seen": 108128305, "step": 5016, "time_per_iteration": 2.7225799560546875 }, { "auxiliary_loss_clip": 0.01154943, "auxiliary_loss_mlp": 0.01028221, "balance_loss_clip": 1.04549909, "balance_loss_mlp": 1.02070808, "epoch": 0.6032585823363193, "flos": 13917791623680.0, "grad_norm": 1.8663937249686873, "language_loss": 0.82447815, "learning_rate": 1.4359913323319632e-06, "loss": 0.84630978, "num_input_tokens_seen": 108145475, "step": 5017, "time_per_iteration": 2.6768887042999268 }, { "auxiliary_loss_clip": 0.01093841, "auxiliary_loss_mlp": 0.01022309, "balance_loss_clip": 1.03929496, "balance_loss_mlp": 1.01433432, "epoch": 0.6033788252269584, "flos": 24353252530560.0, "grad_norm": 1.7472455463410574, "language_loss": 0.77557904, "learning_rate": 1.4352440195036847e-06, "loss": 0.79674053, "num_input_tokens_seen": 108165650, "step": 5018, "time_per_iteration": 3.2117042541503906 }, { "auxiliary_loss_clip": 0.0109535, "auxiliary_loss_mlp": 0.01026661, "balance_loss_clip": 1.0391748, "balance_loss_mlp": 1.01880825, "epoch": 0.6034990681175976, "flos": 25521247077120.0, "grad_norm": 1.5560127143919007, "language_loss": 0.80187523, "learning_rate": 1.4344967923484395e-06, "loss": 0.82309532, "num_input_tokens_seen": 108187620, "step": 5019, "time_per_iteration": 3.056807041168213 }, { "auxiliary_loss_clip": 0.01157242, "auxiliary_loss_mlp": 0.01026494, "balance_loss_clip": 1.04708815, "balance_loss_mlp": 1.01885271, "epoch": 0.6036193110082366, "flos": 25958387594880.0, "grad_norm": 2.72340220139867, "language_loss": 0.7199797, "learning_rate": 1.433749650979581e-06, "loss": 0.74181706, "num_input_tokens_seen": 108207605, "step": 5020, "time_per_iteration": 3.377457857131958 }, { "auxiliary_loss_clip": 0.01136726, "auxiliary_loss_mlp": 0.01026435, "balance_loss_clip": 1.0458076, "balance_loss_mlp": 1.01943731, "epoch": 0.6037395538988757, "flos": 25593427457280.0, "grad_norm": 1.9977401522236844, "language_loss": 0.68038237, "learning_rate": 1.433002595510451e-06, "loss": 0.70201397, "num_input_tokens_seen": 108226385, "step": 5021, "time_per_iteration": 2.830561637878418 }, { "auxiliary_loss_clip": 0.01142272, "auxiliary_loss_mlp": 0.0076261, "balance_loss_clip": 1.0456481, "balance_loss_mlp": 1.00040078, "epoch": 0.6038597967895148, "flos": 17816253402240.0, "grad_norm": 1.9160488157094748, "language_loss": 0.7213906, "learning_rate": 1.4322556260543757e-06, "loss": 0.74043936, "num_input_tokens_seen": 108242960, "step": 5022, "time_per_iteration": 2.7014102935791016 }, { "auxiliary_loss_clip": 0.01039405, "auxiliary_loss_mlp": 0.01002491, "balance_loss_clip": 1.01143622, "balance_loss_mlp": 1.00112629, "epoch": 0.6039800396801539, "flos": 65169213235200.0, "grad_norm": 0.8920757920305581, "language_loss": 0.62775153, "learning_rate": 1.4315087427246703e-06, "loss": 0.64817047, "num_input_tokens_seen": 108296785, "step": 5023, "time_per_iteration": 3.167895793914795 }, { "auxiliary_loss_clip": 0.01070121, "auxiliary_loss_mlp": 0.01001596, "balance_loss_clip": 1.01145077, "balance_loss_mlp": 1.00029612, "epoch": 0.604100282570793, "flos": 67386409073280.0, "grad_norm": 0.871283349322447, "language_loss": 0.5850395, "learning_rate": 1.4307619456346372e-06, "loss": 0.60575664, "num_input_tokens_seen": 108341090, "step": 5024, "time_per_iteration": 2.9324069023132324 }, { "auxiliary_loss_clip": 0.01164063, "auxiliary_loss_mlp": 0.01029076, "balance_loss_clip": 1.04659295, "balance_loss_mlp": 1.02074945, "epoch": 0.6042205254614321, "flos": 35297495631360.0, "grad_norm": 1.7358304422334627, "language_loss": 0.74176627, "learning_rate": 1.430015234897564e-06, "loss": 0.76369762, "num_input_tokens_seen": 108364370, "step": 5025, "time_per_iteration": 2.809781074523926 }, { "auxiliary_loss_clip": 0.01172838, "auxiliary_loss_mlp": 0.00762664, "balance_loss_clip": 1.04788756, "balance_loss_mlp": 1.00050139, "epoch": 0.6043407683520712, "flos": 45658262206080.0, "grad_norm": 1.7952190703201352, "language_loss": 0.66457057, "learning_rate": 1.4292686106267274e-06, "loss": 0.68392557, "num_input_tokens_seen": 108387220, "step": 5026, "time_per_iteration": 2.8330507278442383 }, { "auxiliary_loss_clip": 0.0116611, "auxiliary_loss_mlp": 0.01026289, "balance_loss_clip": 1.04958248, "balance_loss_mlp": 1.01766372, "epoch": 0.6044610112427102, "flos": 16180020138240.0, "grad_norm": 1.5784952717699534, "language_loss": 0.77065784, "learning_rate": 1.4285220729353876e-06, "loss": 0.7925818, "num_input_tokens_seen": 108405760, "step": 5027, "time_per_iteration": 3.78674054145813 }, { "auxiliary_loss_clip": 0.01145114, "auxiliary_loss_mlp": 0.0102228, "balance_loss_clip": 1.04386473, "balance_loss_mlp": 1.01489472, "epoch": 0.6045812541333494, "flos": 13804062186240.0, "grad_norm": 1.8915746205369093, "language_loss": 0.77806807, "learning_rate": 1.4277756219367957e-06, "loss": 0.79974198, "num_input_tokens_seen": 108422785, "step": 5028, "time_per_iteration": 3.630660057067871 }, { "auxiliary_loss_clip": 0.01142081, "auxiliary_loss_mlp": 0.01027116, "balance_loss_clip": 1.04860473, "balance_loss_mlp": 1.0191108, "epoch": 0.6047014970239885, "flos": 19975059682560.0, "grad_norm": 2.1383705176383923, "language_loss": 0.7957648, "learning_rate": 1.4270292577441864e-06, "loss": 0.81745678, "num_input_tokens_seen": 108442290, "step": 5029, "time_per_iteration": 3.69783878326416 }, { "auxiliary_loss_clip": 0.01166192, "auxiliary_loss_mlp": 0.01022857, "balance_loss_clip": 1.04838204, "balance_loss_mlp": 1.01523936, "epoch": 0.6048217399146275, "flos": 25337097025920.0, "grad_norm": 1.8633701952750807, "language_loss": 0.72020757, "learning_rate": 1.4262829804707836e-06, "loss": 0.74209809, "num_input_tokens_seen": 108464280, "step": 5030, "time_per_iteration": 2.735646963119507 }, { "auxiliary_loss_clip": 0.01162327, "auxiliary_loss_mlp": 0.01027867, "balance_loss_clip": 1.04591656, "balance_loss_mlp": 1.01946902, "epoch": 0.6049419828052667, "flos": 26030819370240.0, "grad_norm": 1.436724671093019, "language_loss": 0.69779861, "learning_rate": 1.4255367902297958e-06, "loss": 0.71970057, "num_input_tokens_seen": 108485610, "step": 5031, "time_per_iteration": 2.6805918216705322 }, { "auxiliary_loss_clip": 0.01173022, "auxiliary_loss_mlp": 0.01028795, "balance_loss_clip": 1.04991245, "balance_loss_mlp": 1.02146673, "epoch": 0.6050622256959057, "flos": 14648106948480.0, "grad_norm": 2.3174314075281623, "language_loss": 0.78477192, "learning_rate": 1.4247906871344215e-06, "loss": 0.80679011, "num_input_tokens_seen": 108501005, "step": 5032, "time_per_iteration": 3.5489003658294678 }, { "auxiliary_loss_clip": 0.01142258, "auxiliary_loss_mlp": 0.0102801, "balance_loss_clip": 1.04445446, "balance_loss_mlp": 1.02055359, "epoch": 0.6051824685865448, "flos": 23331450337920.0, "grad_norm": 2.219451323212368, "language_loss": 0.75520462, "learning_rate": 1.4240446712978415e-06, "loss": 0.77690732, "num_input_tokens_seen": 108519990, "step": 5033, "time_per_iteration": 2.7841176986694336 }, { "auxiliary_loss_clip": 0.01163547, "auxiliary_loss_mlp": 0.01031102, "balance_loss_clip": 1.04678488, "balance_loss_mlp": 1.02282894, "epoch": 0.605302711477184, "flos": 27563307177600.0, "grad_norm": 2.580800873220544, "language_loss": 0.74725306, "learning_rate": 1.423298742833227e-06, "loss": 0.76919961, "num_input_tokens_seen": 108538650, "step": 5034, "time_per_iteration": 2.6969282627105713 }, { "auxiliary_loss_clip": 0.01138149, "auxiliary_loss_mlp": 0.0102633, "balance_loss_clip": 1.04428577, "balance_loss_mlp": 1.01853657, "epoch": 0.605422954367823, "flos": 15154698412800.0, "grad_norm": 2.1444130029078923, "language_loss": 0.7167697, "learning_rate": 1.4225529018537352e-06, "loss": 0.73841447, "num_input_tokens_seen": 108554155, "step": 5035, "time_per_iteration": 2.683643341064453 }, { "auxiliary_loss_clip": 0.01173727, "auxiliary_loss_mlp": 0.01029431, "balance_loss_clip": 1.04915404, "balance_loss_mlp": 1.02258897, "epoch": 0.6055431972584621, "flos": 27673912131840.0, "grad_norm": 1.516246265500322, "language_loss": 0.77885175, "learning_rate": 1.4218071484725082e-06, "loss": 0.80088329, "num_input_tokens_seen": 108576275, "step": 5036, "time_per_iteration": 2.6514716148376465 }, { "auxiliary_loss_clip": 0.01145106, "auxiliary_loss_mlp": 0.01031715, "balance_loss_clip": 1.04908001, "balance_loss_mlp": 1.02429152, "epoch": 0.6056634401491012, "flos": 19387489006080.0, "grad_norm": 2.0720834826300414, "language_loss": 0.76459736, "learning_rate": 1.4210614828026786e-06, "loss": 0.78636557, "num_input_tokens_seen": 108594125, "step": 5037, "time_per_iteration": 2.6622183322906494 }, { "auxiliary_loss_clip": 0.01171795, "auxiliary_loss_mlp": 0.01024076, "balance_loss_clip": 1.04733539, "balance_loss_mlp": 1.01644039, "epoch": 0.6057836830397403, "flos": 24789459294720.0, "grad_norm": 1.524051763954337, "language_loss": 0.7449702, "learning_rate": 1.4203159049573605e-06, "loss": 0.76692891, "num_input_tokens_seen": 108615360, "step": 5038, "time_per_iteration": 2.600682258605957 }, { "auxiliary_loss_clip": 0.0115294, "auxiliary_loss_mlp": 0.01027105, "balance_loss_clip": 1.04479194, "balance_loss_mlp": 1.01910925, "epoch": 0.6059039259303793, "flos": 20558248899840.0, "grad_norm": 2.9608739983378656, "language_loss": 0.87261182, "learning_rate": 1.4195704150496593e-06, "loss": 0.89441228, "num_input_tokens_seen": 108633075, "step": 5039, "time_per_iteration": 2.6481616497039795 }, { "auxiliary_loss_clip": 0.01148466, "auxiliary_loss_mlp": 0.01024184, "balance_loss_clip": 1.04885888, "balance_loss_mlp": 1.01651621, "epoch": 0.6060241688210185, "flos": 21069724613760.0, "grad_norm": 1.7074773104448981, "language_loss": 0.73631048, "learning_rate": 1.4188250131926639e-06, "loss": 0.75803697, "num_input_tokens_seen": 108651875, "step": 5040, "time_per_iteration": 2.671645164489746 }, { "auxiliary_loss_clip": 0.01147493, "auxiliary_loss_mlp": 0.01029991, "balance_loss_clip": 1.04692221, "balance_loss_mlp": 1.02231133, "epoch": 0.6061444117116576, "flos": 16361081619840.0, "grad_norm": 1.7665053704913436, "language_loss": 0.80622667, "learning_rate": 1.4180796994994525e-06, "loss": 0.8280015, "num_input_tokens_seen": 108669290, "step": 5041, "time_per_iteration": 2.643491744995117 }, { "auxiliary_loss_clip": 0.01143462, "auxiliary_loss_mlp": 0.01025126, "balance_loss_clip": 1.04528451, "balance_loss_mlp": 1.01740694, "epoch": 0.6062646546022966, "flos": 21507296094720.0, "grad_norm": 1.7459314059254187, "language_loss": 0.71880239, "learning_rate": 1.4173344740830877e-06, "loss": 0.74048823, "num_input_tokens_seen": 108688420, "step": 5042, "time_per_iteration": 2.678483724594116 }, { "auxiliary_loss_clip": 0.01142361, "auxiliary_loss_mlp": 0.01028009, "balance_loss_clip": 1.0493567, "balance_loss_mlp": 1.02086878, "epoch": 0.6063848974929358, "flos": 38983151283840.0, "grad_norm": 1.4407724471679442, "language_loss": 0.704252, "learning_rate": 1.4165893370566206e-06, "loss": 0.72595561, "num_input_tokens_seen": 108712175, "step": 5043, "time_per_iteration": 2.8046371936798096 }, { "auxiliary_loss_clip": 0.01156498, "auxiliary_loss_mlp": 0.01031443, "balance_loss_clip": 1.04605865, "balance_loss_mlp": 1.02392054, "epoch": 0.6065051403835748, "flos": 19646584784640.0, "grad_norm": 1.8692023053347995, "language_loss": 0.77849877, "learning_rate": 1.4158442885330865e-06, "loss": 0.8003782, "num_input_tokens_seen": 108730745, "step": 5044, "time_per_iteration": 2.7018954753875732 }, { "auxiliary_loss_clip": 0.01154779, "auxiliary_loss_mlp": 0.01027086, "balance_loss_clip": 1.04605782, "balance_loss_mlp": 1.01946294, "epoch": 0.6066253832742139, "flos": 23513086437120.0, "grad_norm": 2.109540476549584, "language_loss": 0.78734219, "learning_rate": 1.4150993286255094e-06, "loss": 0.80916083, "num_input_tokens_seen": 108749995, "step": 5045, "time_per_iteration": 2.67759108543396 }, { "auxiliary_loss_clip": 0.01174402, "auxiliary_loss_mlp": 0.0102799, "balance_loss_clip": 1.04901123, "balance_loss_mlp": 1.02096295, "epoch": 0.6067456261648531, "flos": 19133708440320.0, "grad_norm": 1.8607932955318516, "language_loss": 0.79663706, "learning_rate": 1.4143544574468993e-06, "loss": 0.81866097, "num_input_tokens_seen": 108768355, "step": 5046, "time_per_iteration": 2.552079200744629 }, { "auxiliary_loss_clip": 0.01157104, "auxiliary_loss_mlp": 0.01024584, "balance_loss_clip": 1.04752088, "balance_loss_mlp": 1.01752138, "epoch": 0.6068658690554921, "flos": 20520614424960.0, "grad_norm": 1.7135791702772045, "language_loss": 0.82690108, "learning_rate": 1.4136096751102523e-06, "loss": 0.84871793, "num_input_tokens_seen": 108786685, "step": 5047, "time_per_iteration": 2.6374855041503906 }, { "auxiliary_loss_clip": 0.01148636, "auxiliary_loss_mlp": 0.01026794, "balance_loss_clip": 1.0469892, "balance_loss_mlp": 1.01933193, "epoch": 0.6069861119461312, "flos": 27374560185600.0, "grad_norm": 2.3090417849371567, "language_loss": 0.82972741, "learning_rate": 1.4128649817285516e-06, "loss": 0.8514818, "num_input_tokens_seen": 108804820, "step": 5048, "time_per_iteration": 2.7611138820648193 }, { "auxiliary_loss_clip": 0.01148089, "auxiliary_loss_mlp": 0.01031954, "balance_loss_clip": 1.04520452, "balance_loss_mlp": 1.02516532, "epoch": 0.6071063548367702, "flos": 25626500904960.0, "grad_norm": 1.7521350697924771, "language_loss": 0.63174343, "learning_rate": 1.412120377414766e-06, "loss": 0.65354389, "num_input_tokens_seen": 108825010, "step": 5049, "time_per_iteration": 2.704341173171997 }, { "auxiliary_loss_clip": 0.01174819, "auxiliary_loss_mlp": 0.01025709, "balance_loss_clip": 1.05021524, "balance_loss_mlp": 1.01901865, "epoch": 0.6072265977274094, "flos": 24460517520000.0, "grad_norm": 1.6190112310216156, "language_loss": 0.71375883, "learning_rate": 1.4113758622818522e-06, "loss": 0.73576403, "num_input_tokens_seen": 108845075, "step": 5050, "time_per_iteration": 2.6372625827789307 }, { "auxiliary_loss_clip": 0.01153387, "auxiliary_loss_mlp": 0.00762509, "balance_loss_clip": 1.04972267, "balance_loss_mlp": 1.00047648, "epoch": 0.6073468406180484, "flos": 18149253413760.0, "grad_norm": 1.9603853427703537, "language_loss": 0.83132088, "learning_rate": 1.410631436442751e-06, "loss": 0.85047984, "num_input_tokens_seen": 108863870, "step": 5051, "time_per_iteration": 2.694211483001709 }, { "auxiliary_loss_clip": 0.01164616, "auxiliary_loss_mlp": 0.01029381, "balance_loss_clip": 1.04818773, "balance_loss_mlp": 1.02175772, "epoch": 0.6074670835086875, "flos": 20697617669760.0, "grad_norm": 2.453851160473683, "language_loss": 0.86446846, "learning_rate": 1.4098871000103936e-06, "loss": 0.88640845, "num_input_tokens_seen": 108882470, "step": 5052, "time_per_iteration": 2.6802048683166504 }, { "auxiliary_loss_clip": 0.01147787, "auxiliary_loss_mlp": 0.01026399, "balance_loss_clip": 1.04683042, "balance_loss_mlp": 1.01904345, "epoch": 0.6075873263993267, "flos": 23769955572480.0, "grad_norm": 1.787758566022627, "language_loss": 0.82585704, "learning_rate": 1.409142853097693e-06, "loss": 0.84759879, "num_input_tokens_seen": 108902710, "step": 5053, "time_per_iteration": 3.6108081340789795 }, { "auxiliary_loss_clip": 0.01150986, "auxiliary_loss_mlp": 0.01026829, "balance_loss_clip": 1.05033302, "balance_loss_mlp": 1.01953888, "epoch": 0.6077075692899657, "flos": 24454484035200.0, "grad_norm": 2.344193048659633, "language_loss": 0.79538631, "learning_rate": 1.408398695817553e-06, "loss": 0.81716442, "num_input_tokens_seen": 108919935, "step": 5054, "time_per_iteration": 2.7133491039276123 }, { "auxiliary_loss_clip": 0.0114766, "auxiliary_loss_mlp": 0.01029482, "balance_loss_clip": 1.04584837, "balance_loss_mlp": 1.02116668, "epoch": 0.6078278121806048, "flos": 27382102041600.0, "grad_norm": 1.8877527791977893, "language_loss": 0.70078266, "learning_rate": 1.4076546282828593e-06, "loss": 0.72255403, "num_input_tokens_seen": 108942790, "step": 5055, "time_per_iteration": 3.7269246578216553 }, { "auxiliary_loss_clip": 0.01149758, "auxiliary_loss_mlp": 0.01028353, "balance_loss_clip": 1.04358816, "balance_loss_mlp": 1.02080429, "epoch": 0.6079480550712439, "flos": 38436447306240.0, "grad_norm": 2.249194779709867, "language_loss": 0.6628319, "learning_rate": 1.4069106506064874e-06, "loss": 0.68461305, "num_input_tokens_seen": 108964215, "step": 5056, "time_per_iteration": 3.8192877769470215 }, { "auxiliary_loss_clip": 0.01143703, "auxiliary_loss_mlp": 0.01029376, "balance_loss_clip": 1.04892921, "balance_loss_mlp": 1.02214003, "epoch": 0.608068297961883, "flos": 25336271013120.0, "grad_norm": 1.7641820975107054, "language_loss": 0.7845726, "learning_rate": 1.4061667629012989e-06, "loss": 0.80630344, "num_input_tokens_seen": 108984885, "step": 5057, "time_per_iteration": 2.7895429134368896 }, { "auxiliary_loss_clip": 0.01137217, "auxiliary_loss_mlp": 0.01026114, "balance_loss_clip": 1.04505432, "balance_loss_mlp": 1.01847243, "epoch": 0.608188540852522, "flos": 24202463235840.0, "grad_norm": 1.6356214173453085, "language_loss": 0.8318857, "learning_rate": 1.40542296528014e-06, "loss": 0.85351896, "num_input_tokens_seen": 109004545, "step": 5058, "time_per_iteration": 3.6207730770111084 }, { "auxiliary_loss_clip": 0.01161907, "auxiliary_loss_mlp": 0.01030035, "balance_loss_clip": 1.04841208, "balance_loss_mlp": 1.02209568, "epoch": 0.6083087837431612, "flos": 21284146851840.0, "grad_norm": 2.321070280158195, "language_loss": 0.76475817, "learning_rate": 1.4046792578558452e-06, "loss": 0.7866776, "num_input_tokens_seen": 109022440, "step": 5059, "time_per_iteration": 2.5691776275634766 }, { "auxiliary_loss_clip": 0.01143611, "auxiliary_loss_mlp": 0.01026184, "balance_loss_clip": 1.04632556, "balance_loss_mlp": 1.01861715, "epoch": 0.6084290266338003, "flos": 16471435178880.0, "grad_norm": 2.271413106923837, "language_loss": 0.76071185, "learning_rate": 1.4039356407412325e-06, "loss": 0.78240979, "num_input_tokens_seen": 109035680, "step": 5060, "time_per_iteration": 2.6533749103546143 }, { "auxiliary_loss_clip": 0.0106177, "auxiliary_loss_mlp": 0.01001418, "balance_loss_clip": 1.01234055, "balance_loss_mlp": 1.00008917, "epoch": 0.6085492695244393, "flos": 66443574931200.0, "grad_norm": 0.7748442699825008, "language_loss": 0.57030928, "learning_rate": 1.40319211404911e-06, "loss": 0.59094119, "num_input_tokens_seen": 109090680, "step": 5061, "time_per_iteration": 3.1475319862365723 }, { "auxiliary_loss_clip": 0.0117584, "auxiliary_loss_mlp": 0.01028767, "balance_loss_clip": 1.05060458, "balance_loss_mlp": 1.02113783, "epoch": 0.6086695124150785, "flos": 23618986709760.0, "grad_norm": 1.9276146166952546, "language_loss": 0.90525162, "learning_rate": 1.4024486778922691e-06, "loss": 0.92729777, "num_input_tokens_seen": 109108995, "step": 5062, "time_per_iteration": 2.609812021255493 }, { "auxiliary_loss_clip": 0.0115025, "auxiliary_loss_mlp": 0.01025512, "balance_loss_clip": 1.04448915, "balance_loss_mlp": 1.01795375, "epoch": 0.6087897553057176, "flos": 20157054917760.0, "grad_norm": 1.842942928713752, "language_loss": 0.77807277, "learning_rate": 1.4017053323834884e-06, "loss": 0.79983038, "num_input_tokens_seen": 109128825, "step": 5063, "time_per_iteration": 2.672015905380249 }, { "auxiliary_loss_clip": 0.01149884, "auxiliary_loss_mlp": 0.01021811, "balance_loss_clip": 1.04573703, "balance_loss_mlp": 1.01462245, "epoch": 0.6089099981963566, "flos": 25482535194240.0, "grad_norm": 1.7970566532738677, "language_loss": 0.75936437, "learning_rate": 1.4009620776355333e-06, "loss": 0.78108132, "num_input_tokens_seen": 109150425, "step": 5064, "time_per_iteration": 2.6676270961761475 }, { "auxiliary_loss_clip": 0.01157104, "auxiliary_loss_mlp": 0.01026599, "balance_loss_clip": 1.04716527, "balance_loss_mlp": 1.01939559, "epoch": 0.6090302410869958, "flos": 25332895134720.0, "grad_norm": 2.594178801401346, "language_loss": 0.7913332, "learning_rate": 1.4002189137611553e-06, "loss": 0.81317019, "num_input_tokens_seen": 109169765, "step": 5065, "time_per_iteration": 2.722817897796631 }, { "auxiliary_loss_clip": 0.01158854, "auxiliary_loss_mlp": 0.01026248, "balance_loss_clip": 1.04678607, "balance_loss_mlp": 1.01905417, "epoch": 0.6091504839776348, "flos": 23987358639360.0, "grad_norm": 1.8833345537736377, "language_loss": 0.69624871, "learning_rate": 1.3994758408730901e-06, "loss": 0.71809971, "num_input_tokens_seen": 109188950, "step": 5066, "time_per_iteration": 2.6628191471099854 }, { "auxiliary_loss_clip": 0.01149809, "auxiliary_loss_mlp": 0.01028617, "balance_loss_clip": 1.049577, "balance_loss_mlp": 1.01989055, "epoch": 0.6092707268682739, "flos": 29643037666560.0, "grad_norm": 2.162622121316223, "language_loss": 0.77105331, "learning_rate": 1.3987328590840629e-06, "loss": 0.79283762, "num_input_tokens_seen": 109209895, "step": 5067, "time_per_iteration": 2.7814409732818604 }, { "auxiliary_loss_clip": 0.011569, "auxiliary_loss_mlp": 0.01024915, "balance_loss_clip": 1.04719818, "balance_loss_mlp": 1.01671338, "epoch": 0.609390969758913, "flos": 24024957200640.0, "grad_norm": 2.0301867093589117, "language_loss": 0.8616277, "learning_rate": 1.397989968506783e-06, "loss": 0.88344586, "num_input_tokens_seen": 109228905, "step": 5068, "time_per_iteration": 2.6547365188598633 }, { "auxiliary_loss_clip": 0.01181412, "auxiliary_loss_mlp": 0.01027678, "balance_loss_clip": 1.0524075, "balance_loss_mlp": 1.0201081, "epoch": 0.6095112126495521, "flos": 11102143288320.0, "grad_norm": 2.1621678281581276, "language_loss": 0.72205263, "learning_rate": 1.3972471692539458e-06, "loss": 0.74414355, "num_input_tokens_seen": 109243620, "step": 5069, "time_per_iteration": 2.622789144515991 }, { "auxiliary_loss_clip": 0.01147022, "auxiliary_loss_mlp": 0.01029499, "balance_loss_clip": 1.05071831, "balance_loss_mlp": 1.02187002, "epoch": 0.6096314555401912, "flos": 17265491187840.0, "grad_norm": 2.031216404375011, "language_loss": 0.75300646, "learning_rate": 1.3965044614382348e-06, "loss": 0.77477163, "num_input_tokens_seen": 109259070, "step": 5070, "time_per_iteration": 2.7041192054748535 }, { "auxiliary_loss_clip": 0.01180223, "auxiliary_loss_mlp": 0.01030644, "balance_loss_clip": 1.05330431, "balance_loss_mlp": 1.02234149, "epoch": 0.6097516984308303, "flos": 21645910679040.0, "grad_norm": 2.7998282383892255, "language_loss": 0.75246078, "learning_rate": 1.3957618451723162e-06, "loss": 0.77456951, "num_input_tokens_seen": 109275100, "step": 5071, "time_per_iteration": 2.677096366882324 }, { "auxiliary_loss_clip": 0.01149189, "auxiliary_loss_mlp": 0.01034728, "balance_loss_clip": 1.047755, "balance_loss_mlp": 1.02676439, "epoch": 0.6098719413214694, "flos": 27199208966400.0, "grad_norm": 2.0534722133160237, "language_loss": 0.71625698, "learning_rate": 1.3950193205688457e-06, "loss": 0.73809612, "num_input_tokens_seen": 109294825, "step": 5072, "time_per_iteration": 2.692213535308838 }, { "auxiliary_loss_clip": 0.011461, "auxiliary_loss_mlp": 0.0102669, "balance_loss_clip": 1.04750872, "balance_loss_mlp": 1.01937652, "epoch": 0.6099921842121084, "flos": 20412954385920.0, "grad_norm": 1.7917431273889666, "language_loss": 0.83486664, "learning_rate": 1.3942768877404627e-06, "loss": 0.85659456, "num_input_tokens_seen": 109313790, "step": 5073, "time_per_iteration": 2.6535658836364746 }, { "auxiliary_loss_clip": 0.01176467, "auxiliary_loss_mlp": 0.0102377, "balance_loss_clip": 1.05078244, "balance_loss_mlp": 1.01623857, "epoch": 0.6101124271027476, "flos": 23366139897600.0, "grad_norm": 1.5057480141620876, "language_loss": 0.73475677, "learning_rate": 1.393534546799795e-06, "loss": 0.75675917, "num_input_tokens_seen": 109333490, "step": 5074, "time_per_iteration": 2.6030752658843994 }, { "auxiliary_loss_clip": 0.01138666, "auxiliary_loss_mlp": 0.01025661, "balance_loss_clip": 1.04588211, "balance_loss_mlp": 1.0178647, "epoch": 0.6102326699933867, "flos": 26687840993280.0, "grad_norm": 1.9068559341800866, "language_loss": 0.68138719, "learning_rate": 1.3927922978594536e-06, "loss": 0.70303047, "num_input_tokens_seen": 109354575, "step": 5075, "time_per_iteration": 2.6772894859313965 }, { "auxiliary_loss_clip": 0.01055281, "auxiliary_loss_mlp": 0.01001862, "balance_loss_clip": 1.00990701, "balance_loss_mlp": 1.00052643, "epoch": 0.6103529128840257, "flos": 60644612551680.0, "grad_norm": 0.7702837971750532, "language_loss": 0.57381576, "learning_rate": 1.3920501410320387e-06, "loss": 0.59438723, "num_input_tokens_seen": 109410690, "step": 5076, "time_per_iteration": 3.154648780822754 }, { "auxiliary_loss_clip": 0.01147369, "auxiliary_loss_mlp": 0.01027679, "balance_loss_clip": 1.04604065, "balance_loss_mlp": 1.01955533, "epoch": 0.6104731557746649, "flos": 19021307806080.0, "grad_norm": 2.39989036774582, "language_loss": 0.76184165, "learning_rate": 1.3913080764301333e-06, "loss": 0.7835921, "num_input_tokens_seen": 109427650, "step": 5077, "time_per_iteration": 2.6623387336730957 }, { "auxiliary_loss_clip": 0.01129943, "auxiliary_loss_mlp": 0.01027616, "balance_loss_clip": 1.04371595, "balance_loss_mlp": 1.01984918, "epoch": 0.6105933986653039, "flos": 23366894083200.0, "grad_norm": 2.0208728968143848, "language_loss": 0.71377897, "learning_rate": 1.3905661041663085e-06, "loss": 0.73535454, "num_input_tokens_seen": 109448835, "step": 5078, "time_per_iteration": 2.8191845417022705 }, { "auxiliary_loss_clip": 0.01164369, "auxiliary_loss_mlp": 0.01028451, "balance_loss_clip": 1.05028796, "balance_loss_mlp": 1.0204165, "epoch": 0.610713641555943, "flos": 34637565006720.0, "grad_norm": 2.270242190261541, "language_loss": 0.65188146, "learning_rate": 1.389824224353122e-06, "loss": 0.67380965, "num_input_tokens_seen": 109470425, "step": 5079, "time_per_iteration": 3.7860398292541504 }, { "auxiliary_loss_clip": 0.01162766, "auxiliary_loss_mlp": 0.01032972, "balance_loss_clip": 1.05072021, "balance_loss_mlp": 1.02565265, "epoch": 0.610833884446582, "flos": 26646471504000.0, "grad_norm": 1.653751615546049, "language_loss": 0.77024817, "learning_rate": 1.389082437103115e-06, "loss": 0.79220557, "num_input_tokens_seen": 109489695, "step": 5080, "time_per_iteration": 3.5545904636383057 }, { "auxiliary_loss_clip": 0.01132976, "auxiliary_loss_mlp": 0.0102374, "balance_loss_clip": 1.04384708, "balance_loss_mlp": 1.01531768, "epoch": 0.6109541273372212, "flos": 21215126868480.0, "grad_norm": 2.7956545335016814, "language_loss": 0.78495049, "learning_rate": 1.3883407425288172e-06, "loss": 0.8065176, "num_input_tokens_seen": 109510030, "step": 5081, "time_per_iteration": 3.6652467250823975 }, { "auxiliary_loss_clip": 0.01141827, "auxiliary_loss_mlp": 0.01021814, "balance_loss_clip": 1.04370642, "balance_loss_mlp": 1.01345181, "epoch": 0.6110743702278603, "flos": 20084084438400.0, "grad_norm": 2.482333674879216, "language_loss": 0.79745674, "learning_rate": 1.3875991407427417e-06, "loss": 0.81909323, "num_input_tokens_seen": 109528255, "step": 5082, "time_per_iteration": 2.70442795753479 }, { "auxiliary_loss_clip": 0.01040299, "auxiliary_loss_mlp": 0.01003432, "balance_loss_clip": 1.00915051, "balance_loss_mlp": 1.00205481, "epoch": 0.6111946131184993, "flos": 68302957438080.0, "grad_norm": 0.769428636143955, "language_loss": 0.58209747, "learning_rate": 1.38685763185739e-06, "loss": 0.60253477, "num_input_tokens_seen": 109581915, "step": 5083, "time_per_iteration": 3.323903799057007 }, { "auxiliary_loss_clip": 0.01173394, "auxiliary_loss_mlp": 0.01026508, "balance_loss_clip": 1.0482645, "balance_loss_mlp": 1.01920342, "epoch": 0.6113148560091385, "flos": 19937676602880.0, "grad_norm": 2.4608168522728966, "language_loss": 0.67672396, "learning_rate": 1.3861162159852476e-06, "loss": 0.69872296, "num_input_tokens_seen": 109600050, "step": 5084, "time_per_iteration": 3.6494317054748535 }, { "auxiliary_loss_clip": 0.01153066, "auxiliary_loss_mlp": 0.01025282, "balance_loss_clip": 1.04939508, "balance_loss_mlp": 1.01726544, "epoch": 0.6114350988997775, "flos": 23731854220800.0, "grad_norm": 1.6287432313150219, "language_loss": 0.80081099, "learning_rate": 1.3853748932387875e-06, "loss": 0.82259452, "num_input_tokens_seen": 109620690, "step": 5085, "time_per_iteration": 2.806335210800171 }, { "auxiliary_loss_clip": 0.01136545, "auxiliary_loss_mlp": 0.01023107, "balance_loss_clip": 1.04368234, "balance_loss_mlp": 1.01506972, "epoch": 0.6115553417904166, "flos": 24023700224640.0, "grad_norm": 2.4478155433616817, "language_loss": 0.74719644, "learning_rate": 1.3846336637304671e-06, "loss": 0.76879299, "num_input_tokens_seen": 109638960, "step": 5086, "time_per_iteration": 2.724245548248291 }, { "auxiliary_loss_clip": 0.01139949, "auxiliary_loss_mlp": 0.01024411, "balance_loss_clip": 1.04611671, "balance_loss_mlp": 1.01659691, "epoch": 0.6116755846810558, "flos": 23733542160000.0, "grad_norm": 2.3017752554114383, "language_loss": 0.83155388, "learning_rate": 1.3838925275727316e-06, "loss": 0.85319746, "num_input_tokens_seen": 109659700, "step": 5087, "time_per_iteration": 2.7186381816864014 }, { "auxiliary_loss_clip": 0.01176454, "auxiliary_loss_mlp": 0.01026411, "balance_loss_clip": 1.05118906, "balance_loss_mlp": 1.0190382, "epoch": 0.6117958275716948, "flos": 18661626967680.0, "grad_norm": 2.0108947975046227, "language_loss": 0.78928006, "learning_rate": 1.3831514848780089e-06, "loss": 0.81130874, "num_input_tokens_seen": 109679275, "step": 5088, "time_per_iteration": 2.6995601654052734 }, { "auxiliary_loss_clip": 0.0115396, "auxiliary_loss_mlp": 0.01024832, "balance_loss_clip": 1.04651999, "balance_loss_mlp": 1.01781046, "epoch": 0.6119160704623339, "flos": 16471183783680.0, "grad_norm": 2.4922608323486832, "language_loss": 0.92110914, "learning_rate": 1.3824105357587152e-06, "loss": 0.94289702, "num_input_tokens_seen": 109696380, "step": 5089, "time_per_iteration": 2.6134634017944336 }, { "auxiliary_loss_clip": 0.01139732, "auxiliary_loss_mlp": 0.01022683, "balance_loss_clip": 1.04336262, "balance_loss_mlp": 1.0157094, "epoch": 0.612036313352973, "flos": 23915465568000.0, "grad_norm": 1.5731645071657891, "language_loss": 0.82481557, "learning_rate": 1.381669680327253e-06, "loss": 0.84643972, "num_input_tokens_seen": 109718060, "step": 5090, "time_per_iteration": 2.6760735511779785 }, { "auxiliary_loss_clip": 0.01141311, "auxiliary_loss_mlp": 0.01028626, "balance_loss_clip": 1.04810929, "balance_loss_mlp": 1.02051735, "epoch": 0.6121565562436121, "flos": 26974766833920.0, "grad_norm": 2.097683624069703, "language_loss": 0.70889509, "learning_rate": 1.380928918696008e-06, "loss": 0.7305944, "num_input_tokens_seen": 109736830, "step": 5091, "time_per_iteration": 2.739025592803955 }, { "auxiliary_loss_clip": 0.0116169, "auxiliary_loss_mlp": 0.01027159, "balance_loss_clip": 1.04928732, "balance_loss_mlp": 1.01893318, "epoch": 0.6122767991342511, "flos": 15668867646720.0, "grad_norm": 2.4341000921912648, "language_loss": 0.71157807, "learning_rate": 1.3801882509773548e-06, "loss": 0.73346651, "num_input_tokens_seen": 109754690, "step": 5092, "time_per_iteration": 2.592524528503418 }, { "auxiliary_loss_clip": 0.01154421, "auxiliary_loss_mlp": 0.01022444, "balance_loss_clip": 1.0443747, "balance_loss_mlp": 1.01509476, "epoch": 0.6123970420248903, "flos": 27964321591680.0, "grad_norm": 1.6194449802237743, "language_loss": 0.81641567, "learning_rate": 1.3794476772836503e-06, "loss": 0.8381843, "num_input_tokens_seen": 109775790, "step": 5093, "time_per_iteration": 2.6756527423858643 }, { "auxiliary_loss_clip": 0.011261, "auxiliary_loss_mlp": 0.01024741, "balance_loss_clip": 1.04578793, "balance_loss_mlp": 1.01699877, "epoch": 0.6125172849155294, "flos": 21468727866240.0, "grad_norm": 1.7176692259859787, "language_loss": 0.84585953, "learning_rate": 1.3787071977272402e-06, "loss": 0.86736798, "num_input_tokens_seen": 109795050, "step": 5094, "time_per_iteration": 2.724889039993286 }, { "auxiliary_loss_clip": 0.01119136, "auxiliary_loss_mlp": 0.0102821, "balance_loss_clip": 1.04806328, "balance_loss_mlp": 1.02052438, "epoch": 0.6126375278061684, "flos": 16248321849600.0, "grad_norm": 2.510817459522866, "language_loss": 0.7186352, "learning_rate": 1.3779668124204535e-06, "loss": 0.74010867, "num_input_tokens_seen": 109811465, "step": 5095, "time_per_iteration": 2.8334970474243164 }, { "auxiliary_loss_clip": 0.0114417, "auxiliary_loss_mlp": 0.01025372, "balance_loss_clip": 1.05047488, "balance_loss_mlp": 1.01695633, "epoch": 0.6127577706968076, "flos": 20448865008000.0, "grad_norm": 1.6342850178437835, "language_loss": 0.80676025, "learning_rate": 1.3772265214756074e-06, "loss": 0.82845569, "num_input_tokens_seen": 109831225, "step": 5096, "time_per_iteration": 2.7553205490112305 }, { "auxiliary_loss_clip": 0.01164355, "auxiliary_loss_mlp": 0.0102673, "balance_loss_clip": 1.04688895, "balance_loss_mlp": 1.01851106, "epoch": 0.6128780135874466, "flos": 18260397072000.0, "grad_norm": 1.8692480290703006, "language_loss": 0.75007999, "learning_rate": 1.3764863250050025e-06, "loss": 0.77199078, "num_input_tokens_seen": 109849465, "step": 5097, "time_per_iteration": 2.6290128231048584 }, { "auxiliary_loss_clip": 0.01132758, "auxiliary_loss_mlp": 0.01026601, "balance_loss_clip": 1.04358256, "balance_loss_mlp": 1.0193944, "epoch": 0.6129982564780857, "flos": 24937088192640.0, "grad_norm": 1.871237289758781, "language_loss": 0.80239207, "learning_rate": 1.3757462231209272e-06, "loss": 0.82398564, "num_input_tokens_seen": 109869770, "step": 5098, "time_per_iteration": 2.75150465965271 }, { "auxiliary_loss_clip": 0.01140279, "auxiliary_loss_mlp": 0.01030947, "balance_loss_clip": 1.04477584, "balance_loss_mlp": 1.0231564, "epoch": 0.6131184993687249, "flos": 22492038430080.0, "grad_norm": 2.324017785406244, "language_loss": 0.88887429, "learning_rate": 1.3750062159356525e-06, "loss": 0.91058648, "num_input_tokens_seen": 109889120, "step": 5099, "time_per_iteration": 2.9186325073242188 }, { "auxiliary_loss_clip": 0.01122898, "auxiliary_loss_mlp": 0.01030371, "balance_loss_clip": 1.04250598, "balance_loss_mlp": 1.02265787, "epoch": 0.6132387422593639, "flos": 15885839750400.0, "grad_norm": 1.737963350495826, "language_loss": 0.83472532, "learning_rate": 1.3742663035614382e-06, "loss": 0.85625803, "num_input_tokens_seen": 109906490, "step": 5100, "time_per_iteration": 2.8609015941619873 }, { "auxiliary_loss_clip": 0.01176137, "auxiliary_loss_mlp": 0.01033252, "balance_loss_clip": 1.05032015, "balance_loss_mlp": 1.02523565, "epoch": 0.613358985150003, "flos": 25411539962880.0, "grad_norm": 2.0833523320931184, "language_loss": 0.79908377, "learning_rate": 1.3735264861105283e-06, "loss": 0.82117772, "num_input_tokens_seen": 109927130, "step": 5101, "time_per_iteration": 2.795558214187622 }, { "auxiliary_loss_clip": 0.01133871, "auxiliary_loss_mlp": 0.01023488, "balance_loss_clip": 1.04453278, "balance_loss_mlp": 1.01651382, "epoch": 0.6134792280406421, "flos": 21361283308800.0, "grad_norm": 1.9460032681867656, "language_loss": 0.78066307, "learning_rate": 1.372786763695152e-06, "loss": 0.80223668, "num_input_tokens_seen": 109945890, "step": 5102, "time_per_iteration": 2.713202953338623 }, { "auxiliary_loss_clip": 0.01160516, "auxiliary_loss_mlp": 0.01023166, "balance_loss_clip": 1.04578972, "balance_loss_mlp": 1.01459455, "epoch": 0.6135994709312812, "flos": 21211248199680.0, "grad_norm": 1.885747920346991, "language_loss": 0.7741785, "learning_rate": 1.3720471364275257e-06, "loss": 0.79601526, "num_input_tokens_seen": 109965535, "step": 5103, "time_per_iteration": 2.6315817832946777 }, { "auxiliary_loss_clip": 0.01128751, "auxiliary_loss_mlp": 0.00762925, "balance_loss_clip": 1.04404199, "balance_loss_mlp": 1.00054479, "epoch": 0.6137197138219203, "flos": 14794047907200.0, "grad_norm": 2.078619914517779, "language_loss": 0.78045285, "learning_rate": 1.3713076044198486e-06, "loss": 0.79936957, "num_input_tokens_seen": 109982345, "step": 5104, "time_per_iteration": 3.7225918769836426 }, { "auxiliary_loss_clip": 0.01140278, "auxiliary_loss_mlp": 0.01024253, "balance_loss_clip": 1.04538965, "balance_loss_mlp": 1.01722848, "epoch": 0.6138399567125594, "flos": 20084515401600.0, "grad_norm": 2.249859640336811, "language_loss": 0.8084814, "learning_rate": 1.3705681677843086e-06, "loss": 0.8301267, "num_input_tokens_seen": 110000940, "step": 5105, "time_per_iteration": 2.668070077896118 }, { "auxiliary_loss_clip": 0.01068321, "auxiliary_loss_mlp": 0.01001546, "balance_loss_clip": 1.00991249, "balance_loss_mlp": 1.00017476, "epoch": 0.6139601996031985, "flos": 60123838193280.0, "grad_norm": 0.7707510659849622, "language_loss": 0.60561806, "learning_rate": 1.3698288266330768e-06, "loss": 0.62631673, "num_input_tokens_seen": 110061565, "step": 5106, "time_per_iteration": 3.295222043991089 }, { "auxiliary_loss_clip": 0.01143712, "auxiliary_loss_mlp": 0.01023923, "balance_loss_clip": 1.04974627, "balance_loss_mlp": 1.01676416, "epoch": 0.6140804424938375, "flos": 23586703361280.0, "grad_norm": 2.1696506615365614, "language_loss": 0.72314322, "learning_rate": 1.3690895810783113e-06, "loss": 0.74481964, "num_input_tokens_seen": 110080360, "step": 5107, "time_per_iteration": 4.557645559310913 }, { "auxiliary_loss_clip": 0.01110479, "auxiliary_loss_mlp": 0.00762466, "balance_loss_clip": 1.04161072, "balance_loss_mlp": 1.00047958, "epoch": 0.6142006853844767, "flos": 21398199511680.0, "grad_norm": 2.1720846546211283, "language_loss": 0.71711397, "learning_rate": 1.3683504312321543e-06, "loss": 0.73584342, "num_input_tokens_seen": 110100695, "step": 5108, "time_per_iteration": 2.819596529006958 }, { "auxiliary_loss_clip": 0.0116452, "auxiliary_loss_mlp": 0.0102933, "balance_loss_clip": 1.04813349, "balance_loss_mlp": 1.02166224, "epoch": 0.6143209282751158, "flos": 12057367622400.0, "grad_norm": 2.30332009779576, "language_loss": 0.79731083, "learning_rate": 1.3676113772067355e-06, "loss": 0.81924939, "num_input_tokens_seen": 110117750, "step": 5109, "time_per_iteration": 2.6225333213806152 }, { "auxiliary_loss_clip": 0.01123948, "auxiliary_loss_mlp": 0.01030795, "balance_loss_clip": 1.04387593, "balance_loss_mlp": 1.02310598, "epoch": 0.6144411711657548, "flos": 25082274965760.0, "grad_norm": 2.2727888892407364, "language_loss": 0.72822255, "learning_rate": 1.3668724191141671e-06, "loss": 0.74976999, "num_input_tokens_seen": 110137020, "step": 5110, "time_per_iteration": 3.8047516345977783 }, { "auxiliary_loss_clip": 0.01129745, "auxiliary_loss_mlp": 0.01030089, "balance_loss_clip": 1.05066681, "balance_loss_mlp": 1.0224719, "epoch": 0.6145614140563939, "flos": 20114069316480.0, "grad_norm": 2.3710671031094783, "language_loss": 0.66740304, "learning_rate": 1.3661335570665493e-06, "loss": 0.68900138, "num_input_tokens_seen": 110154930, "step": 5111, "time_per_iteration": 2.732600212097168 }, { "auxiliary_loss_clip": 0.01150803, "auxiliary_loss_mlp": 0.01022768, "balance_loss_clip": 1.04987216, "balance_loss_mlp": 1.01472449, "epoch": 0.614681656947033, "flos": 16800376953600.0, "grad_norm": 2.454539298026532, "language_loss": 0.69910383, "learning_rate": 1.3653947911759676e-06, "loss": 0.72083956, "num_input_tokens_seen": 110172480, "step": 5112, "time_per_iteration": 2.7454352378845215 }, { "auxiliary_loss_clip": 0.01112345, "auxiliary_loss_mlp": 0.01032091, "balance_loss_clip": 1.04450762, "balance_loss_mlp": 1.02378798, "epoch": 0.6148018998376721, "flos": 38801587011840.0, "grad_norm": 1.6626911804290145, "language_loss": 0.74597621, "learning_rate": 1.3646561215544904e-06, "loss": 0.76742059, "num_input_tokens_seen": 110197120, "step": 5113, "time_per_iteration": 2.903869152069092 }, { "auxiliary_loss_clip": 0.0116132, "auxiliary_loss_mlp": 0.01029143, "balance_loss_clip": 1.04836977, "balance_loss_mlp": 1.02139699, "epoch": 0.6149221427283111, "flos": 23327032965120.0, "grad_norm": 2.155760607759631, "language_loss": 0.79388016, "learning_rate": 1.363917548314176e-06, "loss": 0.81578481, "num_input_tokens_seen": 110216385, "step": 5114, "time_per_iteration": 2.6495800018310547 }, { "auxiliary_loss_clip": 0.01167049, "auxiliary_loss_mlp": 0.01024809, "balance_loss_clip": 1.0486176, "balance_loss_mlp": 1.01697111, "epoch": 0.6150423856189503, "flos": 22379494141440.0, "grad_norm": 1.8074752968636894, "language_loss": 0.7340253, "learning_rate": 1.3631790715670626e-06, "loss": 0.75594389, "num_input_tokens_seen": 110234790, "step": 5115, "time_per_iteration": 2.7059402465820312 }, { "auxiliary_loss_clip": 0.01081114, "auxiliary_loss_mlp": 0.01024788, "balance_loss_clip": 1.04267943, "balance_loss_mlp": 1.01799858, "epoch": 0.6151626285095894, "flos": 18692078722560.0, "grad_norm": 2.0241632661351323, "language_loss": 0.85746992, "learning_rate": 1.3624406914251783e-06, "loss": 0.87852895, "num_input_tokens_seen": 110251910, "step": 5116, "time_per_iteration": 2.946479558944702 }, { "auxiliary_loss_clip": 0.01162808, "auxiliary_loss_mlp": 0.01027112, "balance_loss_clip": 1.04800773, "balance_loss_mlp": 1.01993012, "epoch": 0.6152828714002284, "flos": 15851688894720.0, "grad_norm": 2.264189305527427, "language_loss": 0.88305652, "learning_rate": 1.3617024080005335e-06, "loss": 0.90495569, "num_input_tokens_seen": 110268810, "step": 5117, "time_per_iteration": 2.9764926433563232 }, { "auxiliary_loss_clip": 0.01150594, "auxiliary_loss_mlp": 0.00762533, "balance_loss_clip": 1.04590368, "balance_loss_mlp": 1.00051939, "epoch": 0.6154031142908676, "flos": 24869792062080.0, "grad_norm": 1.5552460830434998, "language_loss": 0.74627233, "learning_rate": 1.3609642214051266e-06, "loss": 0.76540357, "num_input_tokens_seen": 110293035, "step": 5118, "time_per_iteration": 2.760932207107544 }, { "auxiliary_loss_clip": 0.01142714, "auxiliary_loss_mlp": 0.0102906, "balance_loss_clip": 1.04994798, "balance_loss_mlp": 1.0214988, "epoch": 0.6155233571815066, "flos": 19244744357760.0, "grad_norm": 2.8883832057335392, "language_loss": 0.66001266, "learning_rate": 1.3602261317509385e-06, "loss": 0.68173045, "num_input_tokens_seen": 110309695, "step": 5119, "time_per_iteration": 2.714329719543457 }, { "auxiliary_loss_clip": 0.01161808, "auxiliary_loss_mlp": 0.01027964, "balance_loss_clip": 1.04709339, "balance_loss_mlp": 1.01938105, "epoch": 0.6156436000721457, "flos": 18770077105920.0, "grad_norm": 3.5471802356062923, "language_loss": 0.82612252, "learning_rate": 1.3594881391499387e-06, "loss": 0.8480202, "num_input_tokens_seen": 110328610, "step": 5120, "time_per_iteration": 2.6424176692962646 }, { "auxiliary_loss_clip": 0.01148762, "auxiliary_loss_mlp": 0.01023737, "balance_loss_clip": 1.04847682, "balance_loss_mlp": 1.015571, "epoch": 0.6157638429627849, "flos": 18041198325120.0, "grad_norm": 1.6626917953618883, "language_loss": 0.7944755, "learning_rate": 1.3587502437140778e-06, "loss": 0.81620049, "num_input_tokens_seen": 110346775, "step": 5121, "time_per_iteration": 2.727005958557129 }, { "auxiliary_loss_clip": 0.01150782, "auxiliary_loss_mlp": 0.01029938, "balance_loss_clip": 1.04663897, "balance_loss_mlp": 1.02268457, "epoch": 0.6158840858534239, "flos": 25556726736000.0, "grad_norm": 2.579619000881958, "language_loss": 0.85027325, "learning_rate": 1.3580124455552952e-06, "loss": 0.87208045, "num_input_tokens_seen": 110366140, "step": 5122, "time_per_iteration": 2.7320878505706787 }, { "auxiliary_loss_clip": 0.01161399, "auxiliary_loss_mlp": 0.00762314, "balance_loss_clip": 1.05099094, "balance_loss_mlp": 1.00053, "epoch": 0.616004328744063, "flos": 24640788902400.0, "grad_norm": 1.8620717534832194, "language_loss": 0.87536311, "learning_rate": 1.3572747447855148e-06, "loss": 0.89460027, "num_input_tokens_seen": 110386550, "step": 5123, "time_per_iteration": 2.6589739322662354 }, { "auxiliary_loss_clip": 0.01177117, "auxiliary_loss_mlp": 0.01024955, "balance_loss_clip": 1.05057037, "balance_loss_mlp": 1.01718283, "epoch": 0.6161245716347021, "flos": 21689686379520.0, "grad_norm": 3.702941604161871, "language_loss": 0.6914674, "learning_rate": 1.356537141516644e-06, "loss": 0.7134881, "num_input_tokens_seen": 110403970, "step": 5124, "time_per_iteration": 2.632977247238159 }, { "auxiliary_loss_clip": 0.01162501, "auxiliary_loss_mlp": 0.0102718, "balance_loss_clip": 1.05165327, "balance_loss_mlp": 1.02010226, "epoch": 0.6162448145253412, "flos": 35189225061120.0, "grad_norm": 1.867792406663666, "language_loss": 0.61886847, "learning_rate": 1.3557996358605775e-06, "loss": 0.64076531, "num_input_tokens_seen": 110423890, "step": 5125, "time_per_iteration": 2.7215487957000732 }, { "auxiliary_loss_clip": 0.01159071, "auxiliary_loss_mlp": 0.01026366, "balance_loss_clip": 1.04682517, "balance_loss_mlp": 1.01922572, "epoch": 0.6163650574159802, "flos": 21615279356160.0, "grad_norm": 2.1043129745283395, "language_loss": 0.70459473, "learning_rate": 1.3550622279291941e-06, "loss": 0.72644913, "num_input_tokens_seen": 110442035, "step": 5126, "time_per_iteration": 2.7399542331695557 }, { "auxiliary_loss_clip": 0.01107545, "auxiliary_loss_mlp": 0.01025932, "balance_loss_clip": 1.03985453, "balance_loss_mlp": 1.01851678, "epoch": 0.6164853003066194, "flos": 24572163968640.0, "grad_norm": 1.4354359396100267, "language_loss": 0.83301657, "learning_rate": 1.354324917834358e-06, "loss": 0.85435128, "num_input_tokens_seen": 110463280, "step": 5127, "time_per_iteration": 2.8185513019561768 }, { "auxiliary_loss_clip": 0.01099227, "auxiliary_loss_mlp": 0.00762034, "balance_loss_clip": 1.04035056, "balance_loss_mlp": 1.0004878, "epoch": 0.6166055431972585, "flos": 21835986474240.0, "grad_norm": 1.7987626333025601, "language_loss": 0.77002358, "learning_rate": 1.353587705687918e-06, "loss": 0.78863621, "num_input_tokens_seen": 110481455, "step": 5128, "time_per_iteration": 2.834465265274048 }, { "auxiliary_loss_clip": 0.01152916, "auxiliary_loss_mlp": 0.01027727, "balance_loss_clip": 1.05034673, "balance_loss_mlp": 1.01952493, "epoch": 0.6167257860878975, "flos": 17785262943360.0, "grad_norm": 2.6202025211606315, "language_loss": 0.71916336, "learning_rate": 1.3528505916017096e-06, "loss": 0.74096978, "num_input_tokens_seen": 110499155, "step": 5129, "time_per_iteration": 2.698446273803711 }, { "auxiliary_loss_clip": 0.0116033, "auxiliary_loss_mlp": 0.01026512, "balance_loss_clip": 1.04727125, "balance_loss_mlp": 1.01884079, "epoch": 0.6168460289785367, "flos": 23214811898880.0, "grad_norm": 2.2220748649869635, "language_loss": 0.88745308, "learning_rate": 1.3521135756875514e-06, "loss": 0.90932149, "num_input_tokens_seen": 110515470, "step": 5130, "time_per_iteration": 3.5330092906951904 }, { "auxiliary_loss_clip": 0.01097084, "auxiliary_loss_mlp": 0.01030017, "balance_loss_clip": 1.04067314, "balance_loss_mlp": 1.02290964, "epoch": 0.6169662718691757, "flos": 26213281482240.0, "grad_norm": 1.4608311824689204, "language_loss": 0.86417389, "learning_rate": 1.3513766580572496e-06, "loss": 0.88544488, "num_input_tokens_seen": 110538290, "step": 5131, "time_per_iteration": 2.853149175643921 }, { "auxiliary_loss_clip": 0.01161211, "auxiliary_loss_mlp": 0.01029321, "balance_loss_clip": 1.05042601, "balance_loss_mlp": 1.02235329, "epoch": 0.6170865147598148, "flos": 19026120228480.0, "grad_norm": 2.009584382608736, "language_loss": 0.77313226, "learning_rate": 1.3506398388225924e-06, "loss": 0.79503757, "num_input_tokens_seen": 110555610, "step": 5132, "time_per_iteration": 2.675783634185791 }, { "auxiliary_loss_clip": 0.01172692, "auxiliary_loss_mlp": 0.01028692, "balance_loss_clip": 1.05023241, "balance_loss_mlp": 1.02124763, "epoch": 0.617206757650454, "flos": 18260361158400.0, "grad_norm": 1.7631389328091054, "language_loss": 0.71904421, "learning_rate": 1.349903118095355e-06, "loss": 0.74105799, "num_input_tokens_seen": 110574745, "step": 5133, "time_per_iteration": 4.711911916732788 }, { "auxiliary_loss_clip": 0.01165047, "auxiliary_loss_mlp": 0.01020641, "balance_loss_clip": 1.0494678, "balance_loss_mlp": 1.01375365, "epoch": 0.617327000541093, "flos": 18186959715840.0, "grad_norm": 1.6579777536516793, "language_loss": 0.73521966, "learning_rate": 1.349166495987298e-06, "loss": 0.75707662, "num_input_tokens_seen": 110593310, "step": 5134, "time_per_iteration": 2.6439762115478516 }, { "auxiliary_loss_clip": 0.01060006, "auxiliary_loss_mlp": 0.01007764, "balance_loss_clip": 1.02513003, "balance_loss_mlp": 1.00639307, "epoch": 0.6174472434317321, "flos": 61833796122240.0, "grad_norm": 0.8653160513871059, "language_loss": 0.60845083, "learning_rate": 1.348429972610166e-06, "loss": 0.62912852, "num_input_tokens_seen": 110657615, "step": 5135, "time_per_iteration": 3.3337912559509277 }, { "auxiliary_loss_clip": 0.01030661, "auxiliary_loss_mlp": 0.01004099, "balance_loss_clip": 1.02168107, "balance_loss_mlp": 1.00278139, "epoch": 0.6175674863223712, "flos": 71230970494080.0, "grad_norm": 0.8468687473811993, "language_loss": 0.57846498, "learning_rate": 1.3476935480756897e-06, "loss": 0.59881258, "num_input_tokens_seen": 110714365, "step": 5136, "time_per_iteration": 4.095571517944336 }, { "auxiliary_loss_clip": 0.01125402, "auxiliary_loss_mlp": 0.01027025, "balance_loss_clip": 1.04418933, "balance_loss_mlp": 1.01957095, "epoch": 0.6176877292130103, "flos": 21835447770240.0, "grad_norm": 2.457109644593805, "language_loss": 0.75352645, "learning_rate": 1.346957222495583e-06, "loss": 0.77505076, "num_input_tokens_seen": 110732160, "step": 5137, "time_per_iteration": 2.793405294418335 }, { "auxiliary_loss_clip": 0.01154325, "auxiliary_loss_mlp": 0.00762366, "balance_loss_clip": 1.05132031, "balance_loss_mlp": 1.0005796, "epoch": 0.6178079721036493, "flos": 17741738638080.0, "grad_norm": 2.382259330288585, "language_loss": 0.71433252, "learning_rate": 1.3462209959815466e-06, "loss": 0.73349941, "num_input_tokens_seen": 110746900, "step": 5138, "time_per_iteration": 2.6927008628845215 }, { "auxiliary_loss_clip": 0.01149046, "auxiliary_loss_mlp": 0.0102464, "balance_loss_clip": 1.04829824, "balance_loss_mlp": 1.01737666, "epoch": 0.6179282149942885, "flos": 22633131052800.0, "grad_norm": 1.9263983054730647, "language_loss": 0.74305111, "learning_rate": 1.345484868645265e-06, "loss": 0.76478791, "num_input_tokens_seen": 110765710, "step": 5139, "time_per_iteration": 2.680694341659546 }, { "auxiliary_loss_clip": 0.01139694, "auxiliary_loss_mlp": 0.01027052, "balance_loss_clip": 1.04524279, "balance_loss_mlp": 1.01967323, "epoch": 0.6180484578849276, "flos": 22310330503680.0, "grad_norm": 1.9648647488793296, "language_loss": 0.78480124, "learning_rate": 1.3447488405984088e-06, "loss": 0.80646867, "num_input_tokens_seen": 110783970, "step": 5140, "time_per_iteration": 2.714421510696411 }, { "auxiliary_loss_clip": 0.01145149, "auxiliary_loss_mlp": 0.01031502, "balance_loss_clip": 1.0466876, "balance_loss_mlp": 1.02375305, "epoch": 0.6181687007755666, "flos": 35225458905600.0, "grad_norm": 4.9941183153593975, "language_loss": 0.69981349, "learning_rate": 1.3440129119526322e-06, "loss": 0.72158003, "num_input_tokens_seen": 110806395, "step": 5141, "time_per_iteration": 2.8164448738098145 }, { "auxiliary_loss_clip": 0.01072076, "auxiliary_loss_mlp": 0.01000906, "balance_loss_clip": 1.01357698, "balance_loss_mlp": 0.99964792, "epoch": 0.6182889436662057, "flos": 61547370094080.0, "grad_norm": 0.8323921930977222, "language_loss": 0.51232994, "learning_rate": 1.3432770828195762e-06, "loss": 0.53305972, "num_input_tokens_seen": 110867380, "step": 5142, "time_per_iteration": 3.3947131633758545 }, { "auxiliary_loss_clip": 0.01123162, "auxiliary_loss_mlp": 0.01031551, "balance_loss_clip": 1.04216599, "balance_loss_mlp": 1.02389216, "epoch": 0.6184091865568448, "flos": 19609991804160.0, "grad_norm": 2.7321161052111216, "language_loss": 0.70511115, "learning_rate": 1.3425413533108635e-06, "loss": 0.72665828, "num_input_tokens_seen": 110885980, "step": 5143, "time_per_iteration": 2.7420411109924316 }, { "auxiliary_loss_clip": 0.01120259, "auxiliary_loss_mlp": 0.01025662, "balance_loss_clip": 1.04775834, "balance_loss_mlp": 1.01791334, "epoch": 0.6185294294474839, "flos": 23586882929280.0, "grad_norm": 2.737316670582121, "language_loss": 0.70746505, "learning_rate": 1.341805723538105e-06, "loss": 0.72892427, "num_input_tokens_seen": 110906085, "step": 5144, "time_per_iteration": 2.7991676330566406 }, { "auxiliary_loss_clip": 0.01153917, "auxiliary_loss_mlp": 0.01030303, "balance_loss_clip": 1.04771972, "balance_loss_mlp": 1.02249515, "epoch": 0.618649672338123, "flos": 26762032535040.0, "grad_norm": 1.541682840690723, "language_loss": 0.77484572, "learning_rate": 1.3410701936128948e-06, "loss": 0.7966879, "num_input_tokens_seen": 110928865, "step": 5145, "time_per_iteration": 2.706430435180664 }, { "auxiliary_loss_clip": 0.01161378, "auxiliary_loss_mlp": 0.01024312, "balance_loss_clip": 1.05000198, "balance_loss_mlp": 1.01665878, "epoch": 0.6187699152287621, "flos": 14456630522880.0, "grad_norm": 3.820765396652624, "language_loss": 0.84926009, "learning_rate": 1.340334763646812e-06, "loss": 0.871117, "num_input_tokens_seen": 110943000, "step": 5146, "time_per_iteration": 2.6816675662994385 }, { "auxiliary_loss_clip": 0.01176813, "auxiliary_loss_mlp": 0.010299, "balance_loss_clip": 1.05001867, "balance_loss_mlp": 1.02202344, "epoch": 0.6188901581194012, "flos": 20084766796800.0, "grad_norm": 1.8258164335627944, "language_loss": 0.74608648, "learning_rate": 1.3395994337514218e-06, "loss": 0.76815367, "num_input_tokens_seen": 110963170, "step": 5147, "time_per_iteration": 2.6112060546875 }, { "auxiliary_loss_clip": 0.01151305, "auxiliary_loss_mlp": 0.01024837, "balance_loss_clip": 1.04456997, "balance_loss_mlp": 1.01725543, "epoch": 0.6190104010100402, "flos": 25700728360320.0, "grad_norm": 3.0328764077602215, "language_loss": 0.78643072, "learning_rate": 1.3388642040382725e-06, "loss": 0.80819213, "num_input_tokens_seen": 110983595, "step": 5148, "time_per_iteration": 2.791912317276001 }, { "auxiliary_loss_clip": 0.01133079, "auxiliary_loss_mlp": 0.01029548, "balance_loss_clip": 1.0410192, "balance_loss_mlp": 1.02157903, "epoch": 0.6191306439006794, "flos": 30442372974720.0, "grad_norm": 1.6924065875120395, "language_loss": 0.84288317, "learning_rate": 1.3381290746188975e-06, "loss": 0.86450946, "num_input_tokens_seen": 111002965, "step": 5149, "time_per_iteration": 2.743757486343384 }, { "auxiliary_loss_clip": 0.0116188, "auxiliary_loss_mlp": 0.01026591, "balance_loss_clip": 1.05081749, "balance_loss_mlp": 1.01903272, "epoch": 0.6192508867913185, "flos": 26685793918080.0, "grad_norm": 5.877035890663604, "language_loss": 0.6691348, "learning_rate": 1.3373940456048152e-06, "loss": 0.69101948, "num_input_tokens_seen": 111022990, "step": 5150, "time_per_iteration": 2.6878085136413574 }, { "auxiliary_loss_clip": 0.01174737, "auxiliary_loss_mlp": 0.01026089, "balance_loss_clip": 1.05128264, "balance_loss_mlp": 1.01905012, "epoch": 0.6193711296819575, "flos": 36722036090880.0, "grad_norm": 1.645514157551669, "language_loss": 0.59308267, "learning_rate": 1.3366591171075299e-06, "loss": 0.61509091, "num_input_tokens_seen": 111046495, "step": 5151, "time_per_iteration": 2.712066650390625 }, { "auxiliary_loss_clip": 0.0114515, "auxiliary_loss_mlp": 0.0102444, "balance_loss_clip": 1.04765224, "balance_loss_mlp": 1.01746583, "epoch": 0.6194913725725967, "flos": 25192556697600.0, "grad_norm": 1.9540831206298006, "language_loss": 0.91262162, "learning_rate": 1.335924289238529e-06, "loss": 0.93431753, "num_input_tokens_seen": 111065705, "step": 5152, "time_per_iteration": 2.7269067764282227 }, { "auxiliary_loss_clip": 0.01163307, "auxiliary_loss_mlp": 0.00762953, "balance_loss_clip": 1.05466592, "balance_loss_mlp": 1.00056088, "epoch": 0.6196116154632357, "flos": 21178821196800.0, "grad_norm": 1.7868151830255712, "language_loss": 0.76638234, "learning_rate": 1.3351895621092859e-06, "loss": 0.78564501, "num_input_tokens_seen": 111086050, "step": 5153, "time_per_iteration": 2.6874210834503174 }, { "auxiliary_loss_clip": 0.01063397, "auxiliary_loss_mlp": 0.01026038, "balance_loss_clip": 1.03350067, "balance_loss_mlp": 1.01854539, "epoch": 0.6197318583538748, "flos": 16253744803200.0, "grad_norm": 2.0866012968309926, "language_loss": 0.76468241, "learning_rate": 1.3344549358312567e-06, "loss": 0.7855767, "num_input_tokens_seen": 111104450, "step": 5154, "time_per_iteration": 3.1413373947143555 }, { "auxiliary_loss_clip": 0.01166316, "auxiliary_loss_mlp": 0.01025907, "balance_loss_clip": 1.05267453, "balance_loss_mlp": 1.01863837, "epoch": 0.619852101244514, "flos": 24425612478720.0, "grad_norm": 2.06225268436024, "language_loss": 0.78358239, "learning_rate": 1.3337204105158852e-06, "loss": 0.80550462, "num_input_tokens_seen": 111123320, "step": 5155, "time_per_iteration": 3.256554126739502 }, { "auxiliary_loss_clip": 0.01118667, "auxiliary_loss_mlp": 0.0102378, "balance_loss_clip": 1.03822875, "balance_loss_mlp": 1.01642776, "epoch": 0.619972344135153, "flos": 16727298733440.0, "grad_norm": 2.365523723372955, "language_loss": 0.72707713, "learning_rate": 1.332985986274597e-06, "loss": 0.74850154, "num_input_tokens_seen": 111140950, "step": 5156, "time_per_iteration": 4.3413472175598145 }, { "auxiliary_loss_clip": 0.01096911, "auxiliary_loss_mlp": 0.00762288, "balance_loss_clip": 1.04372835, "balance_loss_mlp": 1.00052714, "epoch": 0.6200925870257921, "flos": 12495190498560.0, "grad_norm": 2.0140275519881734, "language_loss": 0.7538327, "learning_rate": 1.3322516632188047e-06, "loss": 0.7724247, "num_input_tokens_seen": 111157845, "step": 5157, "time_per_iteration": 2.9348976612091064 }, { "auxiliary_loss_clip": 0.01130172, "auxiliary_loss_mlp": 0.01029314, "balance_loss_clip": 1.0449698, "balance_loss_mlp": 1.0216161, "epoch": 0.6202128299164312, "flos": 26539350168960.0, "grad_norm": 1.812767283664021, "language_loss": 0.67067498, "learning_rate": 1.3315174414599045e-06, "loss": 0.6922698, "num_input_tokens_seen": 111179165, "step": 5158, "time_per_iteration": 3.926490068435669 }, { "auxiliary_loss_clip": 0.01155158, "auxiliary_loss_mlp": 0.0102782, "balance_loss_clip": 1.04667997, "balance_loss_mlp": 1.0199703, "epoch": 0.6203330728070703, "flos": 18770508069120.0, "grad_norm": 1.8924795807099115, "language_loss": 0.75291669, "learning_rate": 1.3307833211092768e-06, "loss": 0.77474654, "num_input_tokens_seen": 111197830, "step": 5159, "time_per_iteration": 3.597700834274292 }, { "auxiliary_loss_clip": 0.01176576, "auxiliary_loss_mlp": 0.01027594, "balance_loss_clip": 1.05287266, "balance_loss_mlp": 1.02007794, "epoch": 0.6204533156977093, "flos": 20629782835200.0, "grad_norm": 1.570854426727099, "language_loss": 0.75267327, "learning_rate": 1.3300493022782873e-06, "loss": 0.77471501, "num_input_tokens_seen": 111218400, "step": 5160, "time_per_iteration": 2.6456613540649414 }, { "auxiliary_loss_clip": 0.01107017, "auxiliary_loss_mlp": 0.00762731, "balance_loss_clip": 1.04198146, "balance_loss_mlp": 1.00056338, "epoch": 0.6205735585883485, "flos": 17348050598400.0, "grad_norm": 2.0307614736758546, "language_loss": 0.72599739, "learning_rate": 1.3293153850782855e-06, "loss": 0.74469483, "num_input_tokens_seen": 111236720, "step": 5161, "time_per_iteration": 2.7428035736083984 }, { "auxiliary_loss_clip": 0.01123322, "auxiliary_loss_mlp": 0.01030347, "balance_loss_clip": 1.04416251, "balance_loss_mlp": 1.02244997, "epoch": 0.6206938014789876, "flos": 22965017742720.0, "grad_norm": 1.8346152092028212, "language_loss": 0.71383893, "learning_rate": 1.3285815696206069e-06, "loss": 0.73537564, "num_input_tokens_seen": 111258265, "step": 5162, "time_per_iteration": 3.656492233276367 }, { "auxiliary_loss_clip": 0.01133309, "auxiliary_loss_mlp": 0.01026491, "balance_loss_clip": 1.04159045, "balance_loss_mlp": 1.01884413, "epoch": 0.6208140443696266, "flos": 23983192661760.0, "grad_norm": 2.2130039300193154, "language_loss": 0.76883346, "learning_rate": 1.32784785601657e-06, "loss": 0.7904315, "num_input_tokens_seen": 111277675, "step": 5163, "time_per_iteration": 2.732396125793457 }, { "auxiliary_loss_clip": 0.01146921, "auxiliary_loss_mlp": 0.01022169, "balance_loss_clip": 1.04423451, "balance_loss_mlp": 1.01469469, "epoch": 0.6209342872602658, "flos": 35077291303680.0, "grad_norm": 1.8443039108672714, "language_loss": 0.73685312, "learning_rate": 1.3271142443774798e-06, "loss": 0.75854409, "num_input_tokens_seen": 111299910, "step": 5164, "time_per_iteration": 2.831143617630005 }, { "auxiliary_loss_clip": 0.0114633, "auxiliary_loss_mlp": 0.01026232, "balance_loss_clip": 1.05025291, "balance_loss_mlp": 1.01880503, "epoch": 0.6210545301509048, "flos": 26979327861120.0, "grad_norm": 2.3081532449357764, "language_loss": 0.81722677, "learning_rate": 1.3263807348146228e-06, "loss": 0.83895236, "num_input_tokens_seen": 111319765, "step": 5165, "time_per_iteration": 2.6854279041290283 }, { "auxiliary_loss_clip": 0.01142811, "auxiliary_loss_mlp": 0.01021469, "balance_loss_clip": 1.04205418, "balance_loss_mlp": 1.01388168, "epoch": 0.6211747730415439, "flos": 33618240852480.0, "grad_norm": 2.181876568621823, "language_loss": 0.73586744, "learning_rate": 1.3256473274392733e-06, "loss": 0.7575103, "num_input_tokens_seen": 111341110, "step": 5166, "time_per_iteration": 2.748460292816162 }, { "auxiliary_loss_clip": 0.01174755, "auxiliary_loss_mlp": 0.01024205, "balance_loss_clip": 1.04950666, "balance_loss_mlp": 1.01690984, "epoch": 0.6212950159321831, "flos": 34167099646080.0, "grad_norm": 1.980631046904707, "language_loss": 0.70065355, "learning_rate": 1.3249140223626873e-06, "loss": 0.72264314, "num_input_tokens_seen": 111362730, "step": 5167, "time_per_iteration": 2.692476987838745 }, { "auxiliary_loss_clip": 0.01159617, "auxiliary_loss_mlp": 0.01027262, "balance_loss_clip": 1.04907072, "balance_loss_mlp": 1.01967096, "epoch": 0.6214152588228221, "flos": 27965758135680.0, "grad_norm": 1.5624993942910204, "language_loss": 0.75369394, "learning_rate": 1.3241808196961077e-06, "loss": 0.7755627, "num_input_tokens_seen": 111383855, "step": 5168, "time_per_iteration": 2.6380221843719482 }, { "auxiliary_loss_clip": 0.01135069, "auxiliary_loss_mlp": 0.01023357, "balance_loss_clip": 1.04453683, "balance_loss_mlp": 1.01642823, "epoch": 0.6215355017134612, "flos": 20230204965120.0, "grad_norm": 1.7124997014721468, "language_loss": 0.70922452, "learning_rate": 1.3234477195507608e-06, "loss": 0.73080873, "num_input_tokens_seen": 111402685, "step": 5169, "time_per_iteration": 2.654358148574829 }, { "auxiliary_loss_clip": 0.01129365, "auxiliary_loss_mlp": 0.010343, "balance_loss_clip": 1.04677236, "balance_loss_mlp": 1.02673578, "epoch": 0.6216557446041003, "flos": 41428129219200.0, "grad_norm": 2.2504697232453, "language_loss": 0.62461686, "learning_rate": 1.322714722037857e-06, "loss": 0.64625347, "num_input_tokens_seen": 111424130, "step": 5170, "time_per_iteration": 2.8941967487335205 }, { "auxiliary_loss_clip": 0.01139505, "auxiliary_loss_mlp": 0.01028087, "balance_loss_clip": 1.04474247, "balance_loss_mlp": 1.01943803, "epoch": 0.6217759874947394, "flos": 27928770105600.0, "grad_norm": 2.5693995895181247, "language_loss": 0.77336812, "learning_rate": 1.321981827268591e-06, "loss": 0.79504406, "num_input_tokens_seen": 111444785, "step": 5171, "time_per_iteration": 2.760979413986206 }, { "auxiliary_loss_clip": 0.01148668, "auxiliary_loss_mlp": 0.01026339, "balance_loss_clip": 1.04642022, "balance_loss_mlp": 1.0190165, "epoch": 0.6218962303853784, "flos": 21765673601280.0, "grad_norm": 1.6678952378089609, "language_loss": 0.81035626, "learning_rate": 1.3212490353541426e-06, "loss": 0.83210635, "num_input_tokens_seen": 111467045, "step": 5172, "time_per_iteration": 2.803497314453125 }, { "auxiliary_loss_clip": 0.01177497, "auxiliary_loss_mlp": 0.01028316, "balance_loss_clip": 1.05136752, "balance_loss_mlp": 1.02037096, "epoch": 0.6220164732760175, "flos": 21246260981760.0, "grad_norm": 1.9245050051904422, "language_loss": 0.80549318, "learning_rate": 1.3205163464056762e-06, "loss": 0.82755131, "num_input_tokens_seen": 111483650, "step": 5173, "time_per_iteration": 2.6149773597717285 }, { "auxiliary_loss_clip": 0.01158666, "auxiliary_loss_mlp": 0.01024773, "balance_loss_clip": 1.04725862, "balance_loss_mlp": 1.01745331, "epoch": 0.6221367161666567, "flos": 26136360506880.0, "grad_norm": 1.8001800949161348, "language_loss": 0.73201573, "learning_rate": 1.319783760534339e-06, "loss": 0.7538501, "num_input_tokens_seen": 111502895, "step": 5174, "time_per_iteration": 2.7096168994903564 }, { "auxiliary_loss_clip": 0.01162368, "auxiliary_loss_mlp": 0.01029032, "balance_loss_clip": 1.05127883, "balance_loss_mlp": 1.0208962, "epoch": 0.6222569590572957, "flos": 16284196558080.0, "grad_norm": 2.035815446723046, "language_loss": 0.75302672, "learning_rate": 1.319051277851266e-06, "loss": 0.77494073, "num_input_tokens_seen": 111519180, "step": 5175, "time_per_iteration": 2.7719216346740723 }, { "auxiliary_loss_clip": 0.01162779, "auxiliary_loss_mlp": 0.0102455, "balance_loss_clip": 1.04808187, "balance_loss_mlp": 1.01745141, "epoch": 0.6223772019479348, "flos": 18223840005120.0, "grad_norm": 2.07951943974655, "language_loss": 0.84048992, "learning_rate": 1.3183188984675716e-06, "loss": 0.86236322, "num_input_tokens_seen": 111537545, "step": 5176, "time_per_iteration": 2.5973520278930664 }, { "auxiliary_loss_clip": 0.01147935, "auxiliary_loss_mlp": 0.01027359, "balance_loss_clip": 1.04959393, "balance_loss_mlp": 1.01965773, "epoch": 0.6224974448385739, "flos": 27489797994240.0, "grad_norm": 2.5753177652824943, "language_loss": 0.71426469, "learning_rate": 1.3175866224943586e-06, "loss": 0.73601758, "num_input_tokens_seen": 111556265, "step": 5177, "time_per_iteration": 2.760357618331909 }, { "auxiliary_loss_clip": 0.01155958, "auxiliary_loss_mlp": 0.01023986, "balance_loss_clip": 1.05131912, "balance_loss_mlp": 1.01616549, "epoch": 0.622617687729213, "flos": 19791951125760.0, "grad_norm": 2.2975033495438195, "language_loss": 0.73807549, "learning_rate": 1.316854450042712e-06, "loss": 0.75987494, "num_input_tokens_seen": 111574205, "step": 5178, "time_per_iteration": 2.654616594314575 }, { "auxiliary_loss_clip": 0.01164808, "auxiliary_loss_mlp": 0.01029906, "balance_loss_clip": 1.04970396, "balance_loss_mlp": 1.02213669, "epoch": 0.622737930619852, "flos": 23038886062080.0, "grad_norm": 2.1564565912319695, "language_loss": 0.74528873, "learning_rate": 1.3161223812237024e-06, "loss": 0.76723582, "num_input_tokens_seen": 111593560, "step": 5179, "time_per_iteration": 2.644167423248291 }, { "auxiliary_loss_clip": 0.01175026, "auxiliary_loss_mlp": 0.01022256, "balance_loss_clip": 1.04954433, "balance_loss_mlp": 1.01454866, "epoch": 0.6228581735104912, "flos": 12634271959680.0, "grad_norm": 2.947748484244013, "language_loss": 0.85330629, "learning_rate": 1.3153904161483842e-06, "loss": 0.87527907, "num_input_tokens_seen": 111608860, "step": 5180, "time_per_iteration": 2.5517916679382324 }, { "auxiliary_loss_clip": 0.01130153, "auxiliary_loss_mlp": 0.01025272, "balance_loss_clip": 1.04341209, "balance_loss_mlp": 1.01750588, "epoch": 0.6229784164011303, "flos": 23802813538560.0, "grad_norm": 1.8610368907706776, "language_loss": 0.85174799, "learning_rate": 1.3146585549277953e-06, "loss": 0.87330222, "num_input_tokens_seen": 111627500, "step": 5181, "time_per_iteration": 2.672966480255127 }, { "auxiliary_loss_clip": 0.01157957, "auxiliary_loss_mlp": 0.01029277, "balance_loss_clip": 1.05072725, "balance_loss_mlp": 1.02143288, "epoch": 0.6230986592917693, "flos": 22414219614720.0, "grad_norm": 2.0581206312410703, "language_loss": 0.78416359, "learning_rate": 1.3139267976729591e-06, "loss": 0.80603588, "num_input_tokens_seen": 111647690, "step": 5182, "time_per_iteration": 3.6487162113189697 }, { "auxiliary_loss_clip": 0.0116575, "auxiliary_loss_mlp": 0.01024736, "balance_loss_clip": 1.05092168, "balance_loss_mlp": 1.01705289, "epoch": 0.6232189021824085, "flos": 34528217028480.0, "grad_norm": 1.684923432070248, "language_loss": 0.71615148, "learning_rate": 1.3131951444948815e-06, "loss": 0.7380563, "num_input_tokens_seen": 111667090, "step": 5183, "time_per_iteration": 2.6832220554351807 }, { "auxiliary_loss_clip": 0.01153532, "auxiliary_loss_mlp": 0.01024675, "balance_loss_clip": 1.05125415, "balance_loss_mlp": 1.01702785, "epoch": 0.6233391450730476, "flos": 22237000888320.0, "grad_norm": 1.7913109425262055, "language_loss": 0.76439404, "learning_rate": 1.3124635955045546e-06, "loss": 0.78617609, "num_input_tokens_seen": 111686905, "step": 5184, "time_per_iteration": 3.648263931274414 }, { "auxiliary_loss_clip": 0.01107494, "auxiliary_loss_mlp": 0.00762866, "balance_loss_clip": 1.04059398, "balance_loss_mlp": 1.00046921, "epoch": 0.6234593879636866, "flos": 20332693445760.0, "grad_norm": 1.9731241328755287, "language_loss": 0.84278369, "learning_rate": 1.3117321508129537e-06, "loss": 0.86148727, "num_input_tokens_seen": 111704985, "step": 5185, "time_per_iteration": 3.660752773284912 }, { "auxiliary_loss_clip": 0.01152168, "auxiliary_loss_mlp": 0.01027828, "balance_loss_clip": 1.05036891, "balance_loss_mlp": 1.01977515, "epoch": 0.6235796308543258, "flos": 20664903358080.0, "grad_norm": 1.5043872166648897, "language_loss": 0.76786864, "learning_rate": 1.3110008105310388e-06, "loss": 0.78966868, "num_input_tokens_seen": 111724805, "step": 5186, "time_per_iteration": 2.6329448223114014 }, { "auxiliary_loss_clip": 0.01175454, "auxiliary_loss_mlp": 0.01031102, "balance_loss_clip": 1.04825175, "balance_loss_mlp": 1.02335382, "epoch": 0.6236998737449648, "flos": 26618641441920.0, "grad_norm": 1.6734060642014357, "language_loss": 0.77943474, "learning_rate": 1.3102695747697526e-06, "loss": 0.80150032, "num_input_tokens_seen": 111747675, "step": 5187, "time_per_iteration": 2.7031948566436768 }, { "auxiliary_loss_clip": 0.01110067, "auxiliary_loss_mlp": 0.0102718, "balance_loss_clip": 1.04765773, "balance_loss_mlp": 1.01888275, "epoch": 0.6238201166356039, "flos": 12674599954560.0, "grad_norm": 7.351997014559598, "language_loss": 0.90637249, "learning_rate": 1.3095384436400237e-06, "loss": 0.92774487, "num_input_tokens_seen": 111759205, "step": 5188, "time_per_iteration": 2.764676809310913 }, { "auxiliary_loss_clip": 0.01156372, "auxiliary_loss_mlp": 0.01026805, "balance_loss_clip": 1.04867148, "balance_loss_mlp": 1.0187645, "epoch": 0.623940359526243, "flos": 10452160730880.0, "grad_norm": 2.5278912759688414, "language_loss": 0.82281864, "learning_rate": 1.3088074172527633e-06, "loss": 0.84465039, "num_input_tokens_seen": 111776335, "step": 5189, "time_per_iteration": 3.652198553085327 }, { "auxiliary_loss_clip": 0.01150537, "auxiliary_loss_mlp": 0.01027177, "balance_loss_clip": 1.04525208, "balance_loss_mlp": 1.01908255, "epoch": 0.6240606024168821, "flos": 29059525226880.0, "grad_norm": 2.481427498585943, "language_loss": 0.71805513, "learning_rate": 1.3080764957188684e-06, "loss": 0.73983228, "num_input_tokens_seen": 111796580, "step": 5190, "time_per_iteration": 2.73006534576416 }, { "auxiliary_loss_clip": 0.01123631, "auxiliary_loss_mlp": 0.01023763, "balance_loss_clip": 1.04337144, "balance_loss_mlp": 1.01650906, "epoch": 0.6241808453075212, "flos": 22018089450240.0, "grad_norm": 1.7651650770097989, "language_loss": 0.70727539, "learning_rate": 1.3073456791492192e-06, "loss": 0.72874933, "num_input_tokens_seen": 111816290, "step": 5191, "time_per_iteration": 2.7985713481903076 }, { "auxiliary_loss_clip": 0.01148586, "auxiliary_loss_mlp": 0.01027986, "balance_loss_clip": 1.0457468, "balance_loss_mlp": 1.02090526, "epoch": 0.6243010881981603, "flos": 21138708683520.0, "grad_norm": 2.010945957806756, "language_loss": 0.7859416, "learning_rate": 1.3066149676546801e-06, "loss": 0.80770731, "num_input_tokens_seen": 111834470, "step": 5192, "time_per_iteration": 2.6733171939849854 }, { "auxiliary_loss_clip": 0.01147444, "auxiliary_loss_mlp": 0.01025748, "balance_loss_clip": 1.05209756, "balance_loss_mlp": 1.01877761, "epoch": 0.6244213310887994, "flos": 22344948236160.0, "grad_norm": 2.2152246134327926, "language_loss": 0.66046417, "learning_rate": 1.3058843613460985e-06, "loss": 0.68219602, "num_input_tokens_seen": 111852410, "step": 5193, "time_per_iteration": 2.7289328575134277 }, { "auxiliary_loss_clip": 0.01141743, "auxiliary_loss_mlp": 0.01025657, "balance_loss_clip": 1.04586291, "balance_loss_mlp": 1.01801527, "epoch": 0.6245415739794384, "flos": 15231978524160.0, "grad_norm": 2.172803302578828, "language_loss": 0.74817193, "learning_rate": 1.3051538603343075e-06, "loss": 0.76984596, "num_input_tokens_seen": 111870340, "step": 5194, "time_per_iteration": 2.7779204845428467 }, { "auxiliary_loss_clip": 0.01163068, "auxiliary_loss_mlp": 0.01031473, "balance_loss_clip": 1.05117345, "balance_loss_mlp": 1.02423751, "epoch": 0.6246618168700776, "flos": 18879891960960.0, "grad_norm": 1.9060897301181912, "language_loss": 0.68065524, "learning_rate": 1.3044234647301235e-06, "loss": 0.70260066, "num_input_tokens_seen": 111888365, "step": 5195, "time_per_iteration": 2.59181809425354 }, { "auxiliary_loss_clip": 0.0115676, "auxiliary_loss_mlp": 0.01028203, "balance_loss_clip": 1.04832053, "balance_loss_mlp": 1.02154207, "epoch": 0.6247820597607167, "flos": 14319201087360.0, "grad_norm": 1.7474291677334377, "language_loss": 0.72415298, "learning_rate": 1.303693174644347e-06, "loss": 0.74600261, "num_input_tokens_seen": 111905840, "step": 5196, "time_per_iteration": 2.6396660804748535 }, { "auxiliary_loss_clip": 0.01141313, "auxiliary_loss_mlp": 0.01028211, "balance_loss_clip": 1.04483187, "balance_loss_mlp": 1.0203253, "epoch": 0.6249023026513557, "flos": 22637979388800.0, "grad_norm": 2.057619012703494, "language_loss": 0.80634171, "learning_rate": 1.3029629901877625e-06, "loss": 0.82803696, "num_input_tokens_seen": 111925215, "step": 5197, "time_per_iteration": 2.822543144226074 }, { "auxiliary_loss_clip": 0.01168712, "auxiliary_loss_mlp": 0.01026888, "balance_loss_clip": 1.05055368, "balance_loss_mlp": 1.01928854, "epoch": 0.6250225455419949, "flos": 20266690204800.0, "grad_norm": 4.077896614186699, "language_loss": 0.77503157, "learning_rate": 1.3022329114711376e-06, "loss": 0.79698765, "num_input_tokens_seen": 111943925, "step": 5198, "time_per_iteration": 2.6620635986328125 }, { "auxiliary_loss_clip": 0.01143509, "auxiliary_loss_mlp": 0.01029366, "balance_loss_clip": 1.04625988, "balance_loss_mlp": 1.02206469, "epoch": 0.6251427884326339, "flos": 23437853400960.0, "grad_norm": 1.7732169399525648, "language_loss": 0.69599688, "learning_rate": 1.3015029386052256e-06, "loss": 0.71772563, "num_input_tokens_seen": 111964095, "step": 5199, "time_per_iteration": 2.6389286518096924 }, { "auxiliary_loss_clip": 0.01145449, "auxiliary_loss_mlp": 0.01026667, "balance_loss_clip": 1.04775286, "balance_loss_mlp": 1.01921082, "epoch": 0.625263031323273, "flos": 31723055464320.0, "grad_norm": 2.2451072770929525, "language_loss": 0.72779465, "learning_rate": 1.3007730717007622e-06, "loss": 0.74951577, "num_input_tokens_seen": 111984910, "step": 5200, "time_per_iteration": 2.7686476707458496 }, { "auxiliary_loss_clip": 0.011808, "auxiliary_loss_mlp": 0.01025315, "balance_loss_clip": 1.05303085, "balance_loss_mlp": 1.01677048, "epoch": 0.6253832742139122, "flos": 24134341092480.0, "grad_norm": 1.8365324390972673, "language_loss": 0.75136322, "learning_rate": 1.3000433108684676e-06, "loss": 0.77342439, "num_input_tokens_seen": 112005410, "step": 5201, "time_per_iteration": 2.596453905105591 }, { "auxiliary_loss_clip": 0.01158517, "auxiliary_loss_mlp": 0.01026276, "balance_loss_clip": 1.04875398, "balance_loss_mlp": 1.01866746, "epoch": 0.6255035171045512, "flos": 27668812400640.0, "grad_norm": 2.4304844407984403, "language_loss": 0.80104089, "learning_rate": 1.2993136562190467e-06, "loss": 0.82288885, "num_input_tokens_seen": 112024530, "step": 5202, "time_per_iteration": 2.7395575046539307 }, { "auxiliary_loss_clip": 0.01153827, "auxiliary_loss_mlp": 0.01025926, "balance_loss_clip": 1.0493331, "balance_loss_mlp": 1.01827884, "epoch": 0.6256237599951903, "flos": 20227798753920.0, "grad_norm": 1.500772126087787, "language_loss": 0.70430833, "learning_rate": 1.2985841078631871e-06, "loss": 0.72610581, "num_input_tokens_seen": 112043850, "step": 5203, "time_per_iteration": 2.7329280376434326 }, { "auxiliary_loss_clip": 0.01101444, "auxiliary_loss_mlp": 0.01030242, "balance_loss_clip": 1.03908992, "balance_loss_mlp": 1.02232659, "epoch": 0.6257440028858293, "flos": 24170574936960.0, "grad_norm": 1.7638513433153822, "language_loss": 0.78384078, "learning_rate": 1.2978546659115608e-06, "loss": 0.80515766, "num_input_tokens_seen": 112061930, "step": 5204, "time_per_iteration": 2.819208860397339 }, { "auxiliary_loss_clip": 0.01153573, "auxiliary_loss_mlp": 0.01026053, "balance_loss_clip": 1.05015826, "balance_loss_mlp": 1.01839948, "epoch": 0.6258642457764685, "flos": 15851940289920.0, "grad_norm": 1.8078146928488974, "language_loss": 0.8520838, "learning_rate": 1.2971253304748228e-06, "loss": 0.87388003, "num_input_tokens_seen": 112079645, "step": 5205, "time_per_iteration": 2.7224647998809814 }, { "auxiliary_loss_clip": 0.01165357, "auxiliary_loss_mlp": 0.0103153, "balance_loss_clip": 1.05045915, "balance_loss_mlp": 1.02331066, "epoch": 0.6259844886671075, "flos": 11911354836480.0, "grad_norm": 1.7830712440868621, "language_loss": 0.75054866, "learning_rate": 1.296396101663614e-06, "loss": 0.77251756, "num_input_tokens_seen": 112096205, "step": 5206, "time_per_iteration": 2.543144702911377 }, { "auxiliary_loss_clip": 0.01165621, "auxiliary_loss_mlp": 0.01023112, "balance_loss_clip": 1.05072117, "balance_loss_mlp": 1.01579833, "epoch": 0.6261047315577466, "flos": 15887958652800.0, "grad_norm": 2.0749665865859446, "language_loss": 0.84210312, "learning_rate": 1.2956669795885565e-06, "loss": 0.86399043, "num_input_tokens_seen": 112112835, "step": 5207, "time_per_iteration": 2.603273630142212 }, { "auxiliary_loss_clip": 0.01125188, "auxiliary_loss_mlp": 0.01032843, "balance_loss_clip": 1.04599261, "balance_loss_mlp": 1.02442145, "epoch": 0.6262249744483858, "flos": 31248926916480.0, "grad_norm": 1.8166920335252112, "language_loss": 0.68329263, "learning_rate": 1.294937964360259e-06, "loss": 0.70487297, "num_input_tokens_seen": 112133105, "step": 5208, "time_per_iteration": 3.712355852127075 }, { "auxiliary_loss_clip": 0.01155005, "auxiliary_loss_mlp": 0.01025781, "balance_loss_clip": 1.04639602, "balance_loss_mlp": 1.01737678, "epoch": 0.6263452173390248, "flos": 27198598435200.0, "grad_norm": 2.281220222198203, "language_loss": 0.71568096, "learning_rate": 1.2942090560893108e-06, "loss": 0.73748875, "num_input_tokens_seen": 112152510, "step": 5209, "time_per_iteration": 2.7455599308013916 }, { "auxiliary_loss_clip": 0.01175449, "auxiliary_loss_mlp": 0.01028369, "balance_loss_clip": 1.05128515, "balance_loss_mlp": 1.02109146, "epoch": 0.6264654602296639, "flos": 37342069683840.0, "grad_norm": 1.7018628874425628, "language_loss": 0.60670936, "learning_rate": 1.2934802548862882e-06, "loss": 0.62874758, "num_input_tokens_seen": 112175295, "step": 5210, "time_per_iteration": 3.6763200759887695 }, { "auxiliary_loss_clip": 0.01144593, "auxiliary_loss_mlp": 0.01025707, "balance_loss_clip": 1.0454756, "balance_loss_mlp": 1.01853704, "epoch": 0.626585703120303, "flos": 14756952136320.0, "grad_norm": 2.3622471547907335, "language_loss": 0.82763648, "learning_rate": 1.292751560861749e-06, "loss": 0.84933943, "num_input_tokens_seen": 112190200, "step": 5211, "time_per_iteration": 3.6164357662200928 }, { "auxiliary_loss_clip": 0.01179718, "auxiliary_loss_mlp": 0.01031843, "balance_loss_clip": 1.05306745, "balance_loss_mlp": 1.02432692, "epoch": 0.6267059460109421, "flos": 22347318533760.0, "grad_norm": 1.8672179672466032, "language_loss": 0.79616684, "learning_rate": 1.2920229741262354e-06, "loss": 0.81828243, "num_input_tokens_seen": 112208205, "step": 5212, "time_per_iteration": 2.6227216720581055 }, { "auxiliary_loss_clip": 0.01150897, "auxiliary_loss_mlp": 0.01026817, "balance_loss_clip": 1.04796553, "balance_loss_mlp": 1.01946747, "epoch": 0.6268261889015811, "flos": 17748813617280.0, "grad_norm": 2.315121145441959, "language_loss": 0.75635123, "learning_rate": 1.2912944947902739e-06, "loss": 0.77812839, "num_input_tokens_seen": 112224690, "step": 5213, "time_per_iteration": 2.6239869594573975 }, { "auxiliary_loss_clip": 0.01156345, "auxiliary_loss_mlp": 0.01029407, "balance_loss_clip": 1.04920197, "balance_loss_mlp": 1.02136648, "epoch": 0.6269464317922203, "flos": 32846484211200.0, "grad_norm": 2.0096879998881616, "language_loss": 0.71103835, "learning_rate": 1.2905661229643742e-06, "loss": 0.73289585, "num_input_tokens_seen": 112244450, "step": 5214, "time_per_iteration": 3.7325282096862793 }, { "auxiliary_loss_clip": 0.01172677, "auxiliary_loss_mlp": 0.01029654, "balance_loss_clip": 1.04732442, "balance_loss_mlp": 1.02176869, "epoch": 0.6270666746828594, "flos": 17929192740480.0, "grad_norm": 3.32403892533118, "language_loss": 0.84107375, "learning_rate": 1.2898378587590299e-06, "loss": 0.86309701, "num_input_tokens_seen": 112261050, "step": 5215, "time_per_iteration": 2.598836660385132 }, { "auxiliary_loss_clip": 0.01158782, "auxiliary_loss_mlp": 0.01025534, "balance_loss_clip": 1.04887509, "balance_loss_mlp": 1.01795256, "epoch": 0.6271869175734984, "flos": 17457326749440.0, "grad_norm": 1.8791439894665605, "language_loss": 0.87720788, "learning_rate": 1.2891097022847173e-06, "loss": 0.89905107, "num_input_tokens_seen": 112278395, "step": 5216, "time_per_iteration": 2.6292872428894043 }, { "auxiliary_loss_clip": 0.01149583, "auxiliary_loss_mlp": 0.01027441, "balance_loss_clip": 1.0477066, "balance_loss_mlp": 1.01926947, "epoch": 0.6273071604641376, "flos": 26868615166080.0, "grad_norm": 6.044129248194314, "language_loss": 0.66519767, "learning_rate": 1.2883816536518978e-06, "loss": 0.68696797, "num_input_tokens_seen": 112299535, "step": 5217, "time_per_iteration": 2.7323081493377686 }, { "auxiliary_loss_clip": 0.01159115, "auxiliary_loss_mlp": 0.01028934, "balance_loss_clip": 1.04924166, "balance_loss_mlp": 1.02130413, "epoch": 0.6274274033547766, "flos": 26062384446720.0, "grad_norm": 2.8208762930993565, "language_loss": 0.81801426, "learning_rate": 1.2876537129710155e-06, "loss": 0.83989471, "num_input_tokens_seen": 112317265, "step": 5218, "time_per_iteration": 2.6719772815704346 }, { "auxiliary_loss_clip": 0.01149144, "auxiliary_loss_mlp": 0.01030074, "balance_loss_clip": 1.05345035, "balance_loss_mlp": 1.02173567, "epoch": 0.6275476462454157, "flos": 20266259241600.0, "grad_norm": 2.0773537865330267, "language_loss": 0.75413013, "learning_rate": 1.286925880352499e-06, "loss": 0.7759223, "num_input_tokens_seen": 112336125, "step": 5219, "time_per_iteration": 2.7685234546661377 }, { "auxiliary_loss_clip": 0.01146292, "auxiliary_loss_mlp": 0.01027125, "balance_loss_clip": 1.04904974, "balance_loss_mlp": 1.01919115, "epoch": 0.6276678891360549, "flos": 26320402817280.0, "grad_norm": 2.307175765344511, "language_loss": 0.71384776, "learning_rate": 1.2861981559067592e-06, "loss": 0.73558193, "num_input_tokens_seen": 112356730, "step": 5220, "time_per_iteration": 2.7136261463165283 }, { "auxiliary_loss_clip": 0.0110953, "auxiliary_loss_mlp": 0.01027795, "balance_loss_clip": 1.04138422, "balance_loss_mlp": 1.02002883, "epoch": 0.6277881320266939, "flos": 13912512324480.0, "grad_norm": 1.9620119408392362, "language_loss": 0.80192918, "learning_rate": 1.2854705397441917e-06, "loss": 0.82330245, "num_input_tokens_seen": 112372270, "step": 5221, "time_per_iteration": 2.751511812210083 }, { "auxiliary_loss_clip": 0.0112986, "auxiliary_loss_mlp": 0.01026121, "balance_loss_clip": 1.04419732, "balance_loss_mlp": 1.01902747, "epoch": 0.627908374917333, "flos": 27048922462080.0, "grad_norm": 2.37649303434929, "language_loss": 0.77555919, "learning_rate": 1.2847430319751747e-06, "loss": 0.79711902, "num_input_tokens_seen": 112390365, "step": 5222, "time_per_iteration": 2.7419698238372803 }, { "auxiliary_loss_clip": 0.01157314, "auxiliary_loss_mlp": 0.01030934, "balance_loss_clip": 1.05008256, "balance_loss_mlp": 1.02383494, "epoch": 0.6280286178079721, "flos": 23769201386880.0, "grad_norm": 2.458621915238166, "language_loss": 0.67458189, "learning_rate": 1.2840156327100712e-06, "loss": 0.69646436, "num_input_tokens_seen": 112407490, "step": 5223, "time_per_iteration": 2.592947006225586 }, { "auxiliary_loss_clip": 0.0117493, "auxiliary_loss_mlp": 0.01030571, "balance_loss_clip": 1.04985166, "balance_loss_mlp": 1.02289653, "epoch": 0.6281488606986112, "flos": 26359150613760.0, "grad_norm": 1.7826663993509597, "language_loss": 0.72112054, "learning_rate": 1.2832883420592272e-06, "loss": 0.74317551, "num_input_tokens_seen": 112426385, "step": 5224, "time_per_iteration": 2.6519529819488525 }, { "auxiliary_loss_clip": 0.0114247, "auxiliary_loss_mlp": 0.0103333, "balance_loss_clip": 1.04614985, "balance_loss_mlp": 1.02498829, "epoch": 0.6282691035892503, "flos": 36137194848000.0, "grad_norm": 2.554779050018085, "language_loss": 0.64855945, "learning_rate": 1.282561160132972e-06, "loss": 0.67031741, "num_input_tokens_seen": 112446905, "step": 5225, "time_per_iteration": 2.7584292888641357 }, { "auxiliary_loss_clip": 0.01153404, "auxiliary_loss_mlp": 0.01024563, "balance_loss_clip": 1.04640102, "balance_loss_mlp": 1.01682067, "epoch": 0.6283893464798894, "flos": 26537231266560.0, "grad_norm": 1.6691281558115578, "language_loss": 0.80983418, "learning_rate": 1.2818340870416186e-06, "loss": 0.83161384, "num_input_tokens_seen": 112468040, "step": 5226, "time_per_iteration": 2.750824451446533 }, { "auxiliary_loss_clip": 0.01141892, "auxiliary_loss_mlp": 0.01030663, "balance_loss_clip": 1.04439676, "balance_loss_mlp": 1.02253914, "epoch": 0.6285095893705285, "flos": 22237216369920.0, "grad_norm": 2.037206297775577, "language_loss": 0.76030457, "learning_rate": 1.2811071228954626e-06, "loss": 0.78203017, "num_input_tokens_seen": 112486675, "step": 5227, "time_per_iteration": 2.6922826766967773 }, { "auxiliary_loss_clip": 0.01147831, "auxiliary_loss_mlp": 0.01026966, "balance_loss_clip": 1.04923499, "balance_loss_mlp": 1.01957798, "epoch": 0.6286298322611675, "flos": 26542259170560.0, "grad_norm": 2.1808902380168718, "language_loss": 0.80902052, "learning_rate": 1.2803802678047846e-06, "loss": 0.83076847, "num_input_tokens_seen": 112506825, "step": 5228, "time_per_iteration": 2.759744882583618 }, { "auxiliary_loss_clip": 0.01152803, "auxiliary_loss_mlp": 0.01028514, "balance_loss_clip": 1.04938722, "balance_loss_mlp": 1.02030897, "epoch": 0.6287500751518067, "flos": 21795227516160.0, "grad_norm": 2.050991718841182, "language_loss": 0.74043691, "learning_rate": 1.279653521879848e-06, "loss": 0.76225007, "num_input_tokens_seen": 112526890, "step": 5229, "time_per_iteration": 2.684418201446533 }, { "auxiliary_loss_clip": 0.0108458, "auxiliary_loss_mlp": 0.01027877, "balance_loss_clip": 1.04181671, "balance_loss_mlp": 1.0203073, "epoch": 0.6288703180424458, "flos": 20009605587840.0, "grad_norm": 2.4005424501728703, "language_loss": 0.83924627, "learning_rate": 1.2789268852308997e-06, "loss": 0.86037081, "num_input_tokens_seen": 112542100, "step": 5230, "time_per_iteration": 2.9936957359313965 }, { "auxiliary_loss_clip": 0.01156493, "auxiliary_loss_mlp": 0.01026439, "balance_loss_clip": 1.04915404, "balance_loss_mlp": 1.01836205, "epoch": 0.6289905609330848, "flos": 22124923476480.0, "grad_norm": 2.189892330644738, "language_loss": 0.70461547, "learning_rate": 1.2782003579681688e-06, "loss": 0.72644484, "num_input_tokens_seen": 112561630, "step": 5231, "time_per_iteration": 2.779982089996338 }, { "auxiliary_loss_clip": 0.0117839, "auxiliary_loss_mlp": 0.01029455, "balance_loss_clip": 1.05192947, "balance_loss_mlp": 1.02189088, "epoch": 0.629110803823724, "flos": 25518481729920.0, "grad_norm": 1.6585234432446059, "language_loss": 0.74230003, "learning_rate": 1.2774739402018701e-06, "loss": 0.76437849, "num_input_tokens_seen": 112582465, "step": 5232, "time_per_iteration": 2.6644949913024902 }, { "auxiliary_loss_clip": 0.01163727, "auxiliary_loss_mlp": 0.01021944, "balance_loss_clip": 1.0535953, "balance_loss_mlp": 1.01415372, "epoch": 0.629231046714363, "flos": 20886616056960.0, "grad_norm": 1.798591138454584, "language_loss": 0.72998798, "learning_rate": 1.2767476320422002e-06, "loss": 0.75184464, "num_input_tokens_seen": 112602390, "step": 5233, "time_per_iteration": 2.781033992767334 }, { "auxiliary_loss_clip": 0.01045926, "auxiliary_loss_mlp": 0.0100241, "balance_loss_clip": 1.01368403, "balance_loss_mlp": 1.00098538, "epoch": 0.6293512896050021, "flos": 65050027908480.0, "grad_norm": 0.690127895417718, "language_loss": 0.57232964, "learning_rate": 1.2760214335993392e-06, "loss": 0.59281296, "num_input_tokens_seen": 112669035, "step": 5234, "time_per_iteration": 4.272091627120972 }, { "auxiliary_loss_clip": 0.01155224, "auxiliary_loss_mlp": 0.0103189, "balance_loss_clip": 1.04823685, "balance_loss_mlp": 1.02464843, "epoch": 0.6294715324956413, "flos": 34677857088000.0, "grad_norm": 2.3643171864412356, "language_loss": 0.58890367, "learning_rate": 1.2752953449834514e-06, "loss": 0.61077487, "num_input_tokens_seen": 112691485, "step": 5235, "time_per_iteration": 2.738816738128662 }, { "auxiliary_loss_clip": 0.0117726, "auxiliary_loss_mlp": 0.01025984, "balance_loss_clip": 1.05239344, "balance_loss_mlp": 1.01926661, "epoch": 0.6295917753862803, "flos": 22784207656320.0, "grad_norm": 1.6878301766201351, "language_loss": 0.80188656, "learning_rate": 1.2745693663046836e-06, "loss": 0.82391906, "num_input_tokens_seen": 112710555, "step": 5236, "time_per_iteration": 2.6195952892303467 }, { "auxiliary_loss_clip": 0.01156498, "auxiliary_loss_mlp": 0.01023607, "balance_loss_clip": 1.04722357, "balance_loss_mlp": 1.01639521, "epoch": 0.6297120182769194, "flos": 20850454039680.0, "grad_norm": 2.108978826833143, "language_loss": 0.80994105, "learning_rate": 1.2738434976731662e-06, "loss": 0.83174217, "num_input_tokens_seen": 112728740, "step": 5237, "time_per_iteration": 4.480233669281006 }, { "auxiliary_loss_clip": 0.01148959, "auxiliary_loss_mlp": 0.01028184, "balance_loss_clip": 1.0490222, "balance_loss_mlp": 1.01988137, "epoch": 0.6298322611675584, "flos": 19497662997120.0, "grad_norm": 1.508154106745284, "language_loss": 0.75040549, "learning_rate": 1.2731177391990125e-06, "loss": 0.77217692, "num_input_tokens_seen": 112748665, "step": 5238, "time_per_iteration": 2.7086920738220215 }, { "auxiliary_loss_clip": 0.01148212, "auxiliary_loss_mlp": 0.01022329, "balance_loss_clip": 1.04619277, "balance_loss_mlp": 1.0144608, "epoch": 0.6299525040581976, "flos": 12604466649600.0, "grad_norm": 2.108745013756051, "language_loss": 0.82074475, "learning_rate": 1.2723920909923203e-06, "loss": 0.84245014, "num_input_tokens_seen": 112764410, "step": 5239, "time_per_iteration": 2.717611789703369 }, { "auxiliary_loss_clip": 0.01072532, "auxiliary_loss_mlp": 0.01000497, "balance_loss_clip": 1.01363659, "balance_loss_mlp": 0.99911994, "epoch": 0.6300727469488366, "flos": 57725685636480.0, "grad_norm": 1.0493702367673794, "language_loss": 0.60468912, "learning_rate": 1.2716665531631688e-06, "loss": 0.62541938, "num_input_tokens_seen": 112818695, "step": 5240, "time_per_iteration": 4.189661741256714 }, { "auxiliary_loss_clip": 0.01166235, "auxiliary_loss_mlp": 0.01030145, "balance_loss_clip": 1.04791868, "balance_loss_mlp": 1.02272463, "epoch": 0.6301929898394757, "flos": 22527302607360.0, "grad_norm": 1.8343127121692446, "language_loss": 0.77563882, "learning_rate": 1.270941125821623e-06, "loss": 0.79760253, "num_input_tokens_seen": 112839120, "step": 5241, "time_per_iteration": 2.686007022857666 }, { "auxiliary_loss_clip": 0.01158262, "auxiliary_loss_mlp": 0.01024479, "balance_loss_clip": 1.047539, "balance_loss_mlp": 1.01692724, "epoch": 0.6303132327301149, "flos": 28293550675200.0, "grad_norm": 1.8888613216288033, "language_loss": 0.75380123, "learning_rate": 1.2702158090777278e-06, "loss": 0.77562863, "num_input_tokens_seen": 112860210, "step": 5242, "time_per_iteration": 2.679980516433716 }, { "auxiliary_loss_clip": 0.01127413, "auxiliary_loss_mlp": 0.0102773, "balance_loss_clip": 1.04370379, "balance_loss_mlp": 1.02006447, "epoch": 0.6304334756207539, "flos": 25264521596160.0, "grad_norm": 1.9347447441526566, "language_loss": 0.74932843, "learning_rate": 1.2694906030415148e-06, "loss": 0.77087986, "num_input_tokens_seen": 112877955, "step": 5243, "time_per_iteration": 2.742440700531006 }, { "auxiliary_loss_clip": 0.0115399, "auxiliary_loss_mlp": 0.01026938, "balance_loss_clip": 1.04743659, "balance_loss_mlp": 1.01895058, "epoch": 0.630553718511393, "flos": 18033548728320.0, "grad_norm": 3.322860633765409, "language_loss": 0.82389295, "learning_rate": 1.2687655078229958e-06, "loss": 0.84570217, "num_input_tokens_seen": 112892285, "step": 5244, "time_per_iteration": 2.61022686958313 }, { "auxiliary_loss_clip": 0.01147147, "auxiliary_loss_mlp": 0.01030941, "balance_loss_clip": 1.04939878, "balance_loss_mlp": 1.02288294, "epoch": 0.6306739614020321, "flos": 27304103658240.0, "grad_norm": 2.176585482787546, "language_loss": 0.69295555, "learning_rate": 1.2680405235321678e-06, "loss": 0.71473646, "num_input_tokens_seen": 112913620, "step": 5245, "time_per_iteration": 2.7868378162384033 }, { "auxiliary_loss_clip": 0.01150801, "auxiliary_loss_mlp": 0.00763275, "balance_loss_clip": 1.0500865, "balance_loss_mlp": 1.00050354, "epoch": 0.6307942042926712, "flos": 15341434243200.0, "grad_norm": 2.2064035664222206, "language_loss": 0.78862047, "learning_rate": 1.267315650279011e-06, "loss": 0.80776125, "num_input_tokens_seen": 112932090, "step": 5246, "time_per_iteration": 2.7240219116210938 }, { "auxiliary_loss_clip": 0.01130011, "auxiliary_loss_mlp": 0.01035133, "balance_loss_clip": 1.05047488, "balance_loss_mlp": 1.02696133, "epoch": 0.6309144471833102, "flos": 19606400444160.0, "grad_norm": 1.9053650983814128, "language_loss": 0.73962617, "learning_rate": 1.2665908881734874e-06, "loss": 0.76127762, "num_input_tokens_seen": 112950925, "step": 5247, "time_per_iteration": 2.735856056213379 }, { "auxiliary_loss_clip": 0.01162981, "auxiliary_loss_mlp": 0.01025973, "balance_loss_clip": 1.04940271, "balance_loss_mlp": 1.01863599, "epoch": 0.6310346900739494, "flos": 17493345112320.0, "grad_norm": 2.261371385351243, "language_loss": 0.84830868, "learning_rate": 1.2658662373255432e-06, "loss": 0.87019825, "num_input_tokens_seen": 112969315, "step": 5248, "time_per_iteration": 2.684905767440796 }, { "auxiliary_loss_clip": 0.0105003, "auxiliary_loss_mlp": 0.01005135, "balance_loss_clip": 1.01348805, "balance_loss_mlp": 1.00382996, "epoch": 0.6311549329645885, "flos": 55070164131840.0, "grad_norm": 0.7128106871617246, "language_loss": 0.52250278, "learning_rate": 1.2651416978451063e-06, "loss": 0.5430544, "num_input_tokens_seen": 113034700, "step": 5249, "time_per_iteration": 3.3005452156066895 }, { "auxiliary_loss_clip": 0.01179635, "auxiliary_loss_mlp": 0.01024327, "balance_loss_clip": 1.05264282, "balance_loss_mlp": 1.01595271, "epoch": 0.6312751758552275, "flos": 41902545075840.0, "grad_norm": 1.9224013915453904, "language_loss": 0.65360707, "learning_rate": 1.2644172698420903e-06, "loss": 0.67564666, "num_input_tokens_seen": 113056805, "step": 5250, "time_per_iteration": 2.886029005050659 }, { "auxiliary_loss_clip": 0.01136411, "auxiliary_loss_mlp": 0.01025545, "balance_loss_clip": 1.04705024, "balance_loss_mlp": 1.01734388, "epoch": 0.6313954187458667, "flos": 19646800266240.0, "grad_norm": 1.9284398155322893, "language_loss": 0.85078079, "learning_rate": 1.2636929534263892e-06, "loss": 0.8724004, "num_input_tokens_seen": 113075790, "step": 5251, "time_per_iteration": 2.803936004638672 }, { "auxiliary_loss_clip": 0.01134319, "auxiliary_loss_mlp": 0.01026733, "balance_loss_clip": 1.04151607, "balance_loss_mlp": 1.01918721, "epoch": 0.6315156616365057, "flos": 22894273906560.0, "grad_norm": 1.9408359147568504, "language_loss": 0.77854294, "learning_rate": 1.2629687487078821e-06, "loss": 0.80015349, "num_input_tokens_seen": 113094600, "step": 5252, "time_per_iteration": 2.7692813873291016 }, { "auxiliary_loss_clip": 0.01164661, "auxiliary_loss_mlp": 0.01019851, "balance_loss_clip": 1.04753828, "balance_loss_mlp": 1.01208496, "epoch": 0.6316359045271448, "flos": 23726251699200.0, "grad_norm": 2.098749712941411, "language_loss": 0.76709986, "learning_rate": 1.2622446557964293e-06, "loss": 0.78894496, "num_input_tokens_seen": 113112605, "step": 5253, "time_per_iteration": 2.636350631713867 }, { "auxiliary_loss_clip": 0.01144815, "auxiliary_loss_mlp": 0.01025104, "balance_loss_clip": 1.04222322, "balance_loss_mlp": 1.01767683, "epoch": 0.631756147417784, "flos": 33108417164160.0, "grad_norm": 1.7103046256413188, "language_loss": 0.71296573, "learning_rate": 1.261520674801876e-06, "loss": 0.73466492, "num_input_tokens_seen": 113133200, "step": 5254, "time_per_iteration": 2.6760008335113525 }, { "auxiliary_loss_clip": 0.01146138, "auxiliary_loss_mlp": 0.01025407, "balance_loss_clip": 1.04990578, "balance_loss_mlp": 1.01778996, "epoch": 0.631876390308423, "flos": 31248424126080.0, "grad_norm": 4.681023361472439, "language_loss": 0.72130442, "learning_rate": 1.2607968058340488e-06, "loss": 0.74301988, "num_input_tokens_seen": 113152895, "step": 5255, "time_per_iteration": 2.677609920501709 }, { "auxiliary_loss_clip": 0.01142429, "auxiliary_loss_mlp": 0.01028297, "balance_loss_clip": 1.04555833, "balance_loss_mlp": 1.02105463, "epoch": 0.6319966331990621, "flos": 24681152810880.0, "grad_norm": 1.7014455637467025, "language_loss": 0.73283833, "learning_rate": 1.2600730490027583e-06, "loss": 0.75454557, "num_input_tokens_seen": 113173135, "step": 5256, "time_per_iteration": 2.563051700592041 }, { "auxiliary_loss_clip": 0.01129929, "auxiliary_loss_mlp": 0.01029177, "balance_loss_clip": 1.04426682, "balance_loss_mlp": 1.0216012, "epoch": 0.6321168760897012, "flos": 17491764913920.0, "grad_norm": 1.8450099376041216, "language_loss": 0.80588508, "learning_rate": 1.2593494044177984e-06, "loss": 0.82747614, "num_input_tokens_seen": 113191440, "step": 5257, "time_per_iteration": 2.5518555641174316 }, { "auxiliary_loss_clip": 0.01178447, "auxiliary_loss_mlp": 0.01024552, "balance_loss_clip": 1.04919982, "balance_loss_mlp": 1.01673138, "epoch": 0.6322371189803403, "flos": 18295373940480.0, "grad_norm": 2.7308715307837548, "language_loss": 0.8097809, "learning_rate": 1.2586258721889448e-06, "loss": 0.83181089, "num_input_tokens_seen": 113208790, "step": 5258, "time_per_iteration": 2.4225099086761475 }, { "auxiliary_loss_clip": 0.01110858, "auxiliary_loss_mlp": 0.01029097, "balance_loss_clip": 1.04311728, "balance_loss_mlp": 1.02094269, "epoch": 0.6323573618709794, "flos": 20157270399360.0, "grad_norm": 1.9399142037700474, "language_loss": 0.81856251, "learning_rate": 1.2579024524259573e-06, "loss": 0.83996212, "num_input_tokens_seen": 113225050, "step": 5259, "time_per_iteration": 2.8477590084075928 }, { "auxiliary_loss_clip": 0.01141431, "auxiliary_loss_mlp": 0.01029612, "balance_loss_clip": 1.04236722, "balance_loss_mlp": 1.02234626, "epoch": 0.6324776047616185, "flos": 20042391726720.0, "grad_norm": 1.7595675280694352, "language_loss": 0.91455519, "learning_rate": 1.2571791452385768e-06, "loss": 0.93626565, "num_input_tokens_seen": 113242315, "step": 5260, "time_per_iteration": 3.546295166015625 }, { "auxiliary_loss_clip": 0.01144551, "auxiliary_loss_mlp": 0.01027154, "balance_loss_clip": 1.04651225, "balance_loss_mlp": 1.02019799, "epoch": 0.6325978476522576, "flos": 30848235724800.0, "grad_norm": 1.5980799780731958, "language_loss": 0.7708559, "learning_rate": 1.2564559507365301e-06, "loss": 0.79257298, "num_input_tokens_seen": 113264720, "step": 5261, "time_per_iteration": 2.7284038066864014 }, { "auxiliary_loss_clip": 0.01149378, "auxiliary_loss_mlp": 0.01031292, "balance_loss_clip": 1.04798865, "balance_loss_mlp": 1.0232811, "epoch": 0.6327180905428966, "flos": 24535104111360.0, "grad_norm": 2.479612934332986, "language_loss": 0.7888611, "learning_rate": 1.2557328690295244e-06, "loss": 0.81066775, "num_input_tokens_seen": 113282910, "step": 5262, "time_per_iteration": 3.6722707748413086 }, { "auxiliary_loss_clip": 0.01132748, "auxiliary_loss_mlp": 0.01025153, "balance_loss_clip": 1.0434922, "balance_loss_mlp": 1.01814926, "epoch": 0.6328383334335358, "flos": 21575274583680.0, "grad_norm": 1.7783285867579794, "language_loss": 0.76302314, "learning_rate": 1.255009900227251e-06, "loss": 0.78460217, "num_input_tokens_seen": 113301935, "step": 5263, "time_per_iteration": 3.60206937789917 }, { "auxiliary_loss_clip": 0.01173593, "auxiliary_loss_mlp": 0.01024664, "balance_loss_clip": 1.04965472, "balance_loss_mlp": 1.01757371, "epoch": 0.6329585763241748, "flos": 22929861306240.0, "grad_norm": 1.9708972384726533, "language_loss": 0.79302883, "learning_rate": 1.254287044439383e-06, "loss": 0.81501138, "num_input_tokens_seen": 113321540, "step": 5264, "time_per_iteration": 2.653999090194702 }, { "auxiliary_loss_clip": 0.01072333, "auxiliary_loss_mlp": 0.01000968, "balance_loss_clip": 1.01354587, "balance_loss_mlp": 0.99969286, "epoch": 0.6330788192148139, "flos": 70936897847040.0, "grad_norm": 0.7685258385029217, "language_loss": 0.54443771, "learning_rate": 1.2535643017755776e-06, "loss": 0.56517076, "num_input_tokens_seen": 113383730, "step": 5265, "time_per_iteration": 3.2321953773498535 }, { "auxiliary_loss_clip": 0.01131465, "auxiliary_loss_mlp": 0.01028915, "balance_loss_clip": 1.04325342, "balance_loss_mlp": 1.02184558, "epoch": 0.6331990621054531, "flos": 21244501215360.0, "grad_norm": 2.179287839555915, "language_loss": 0.72550017, "learning_rate": 1.2528416723454737e-06, "loss": 0.74710393, "num_input_tokens_seen": 113400400, "step": 5266, "time_per_iteration": 2.6771693229675293 }, { "auxiliary_loss_clip": 0.01172674, "auxiliary_loss_mlp": 0.01026536, "balance_loss_clip": 1.05031252, "balance_loss_mlp": 1.01929116, "epoch": 0.6333193049960921, "flos": 34459412526720.0, "grad_norm": 1.4101493332025945, "language_loss": 0.7122401, "learning_rate": 1.2521191562586945e-06, "loss": 0.73423219, "num_input_tokens_seen": 113424050, "step": 5267, "time_per_iteration": 3.644361734390259 }, { "auxiliary_loss_clip": 0.01173985, "auxiliary_loss_mlp": 0.00762369, "balance_loss_clip": 1.04910004, "balance_loss_mlp": 1.00046945, "epoch": 0.6334395478867312, "flos": 18329883932160.0, "grad_norm": 1.9412396535274732, "language_loss": 0.77111453, "learning_rate": 1.2513967536248445e-06, "loss": 0.79047805, "num_input_tokens_seen": 113440370, "step": 5268, "time_per_iteration": 2.5923566818237305 }, { "auxiliary_loss_clip": 0.01157413, "auxiliary_loss_mlp": 0.01030381, "balance_loss_clip": 1.04869974, "balance_loss_mlp": 1.02293646, "epoch": 0.6335597907773702, "flos": 23623152687360.0, "grad_norm": 1.9655436693651789, "language_loss": 0.81391728, "learning_rate": 1.2506744645535117e-06, "loss": 0.83579528, "num_input_tokens_seen": 113460800, "step": 5269, "time_per_iteration": 2.668367385864258 }, { "auxiliary_loss_clip": 0.01136196, "auxiliary_loss_mlp": 0.01022819, "balance_loss_clip": 1.03963041, "balance_loss_mlp": 1.01532686, "epoch": 0.6336800336680094, "flos": 22710913954560.0, "grad_norm": 1.9176739924810726, "language_loss": 0.60074353, "learning_rate": 1.249952289154267e-06, "loss": 0.62233377, "num_input_tokens_seen": 113480840, "step": 5270, "time_per_iteration": 2.6531097888946533 }, { "auxiliary_loss_clip": 0.01087948, "auxiliary_loss_mlp": 0.01028041, "balance_loss_clip": 1.03855634, "balance_loss_mlp": 1.02114725, "epoch": 0.6338002765586485, "flos": 23622757637760.0, "grad_norm": 1.9081654793991867, "language_loss": 0.76470029, "learning_rate": 1.2492302275366635e-06, "loss": 0.78586018, "num_input_tokens_seen": 113500515, "step": 5271, "time_per_iteration": 2.7984304428100586 }, { "auxiliary_loss_clip": 0.01154056, "auxiliary_loss_mlp": 0.01023897, "balance_loss_clip": 1.04560518, "balance_loss_mlp": 1.01595807, "epoch": 0.6339205194492875, "flos": 26505450708480.0, "grad_norm": 2.847661151612084, "language_loss": 0.64872015, "learning_rate": 1.2485082798102377e-06, "loss": 0.67049968, "num_input_tokens_seen": 113520930, "step": 5272, "time_per_iteration": 2.6851444244384766 }, { "auxiliary_loss_clip": 0.01136714, "auxiliary_loss_mlp": 0.01030417, "balance_loss_clip": 1.04423308, "balance_loss_mlp": 1.02225685, "epoch": 0.6340407623399267, "flos": 18544306170240.0, "grad_norm": 2.136818037044044, "language_loss": 0.68726349, "learning_rate": 1.2477864460845084e-06, "loss": 0.70893478, "num_input_tokens_seen": 113537330, "step": 5273, "time_per_iteration": 2.7614822387695312 }, { "auxiliary_loss_clip": 0.01145213, "auxiliary_loss_mlp": 0.01034647, "balance_loss_clip": 1.04568005, "balance_loss_mlp": 1.02652907, "epoch": 0.6341610052305657, "flos": 17712579772800.0, "grad_norm": 2.899265461942339, "language_loss": 0.73344773, "learning_rate": 1.2470647264689776e-06, "loss": 0.75524634, "num_input_tokens_seen": 113555810, "step": 5274, "time_per_iteration": 2.6677842140197754 }, { "auxiliary_loss_clip": 0.01107197, "auxiliary_loss_mlp": 0.01024417, "balance_loss_clip": 1.03973627, "balance_loss_mlp": 1.01625156, "epoch": 0.6342812481212048, "flos": 23587026583680.0, "grad_norm": 1.9968879115327172, "language_loss": 0.70827264, "learning_rate": 1.2463431210731282e-06, "loss": 0.72958875, "num_input_tokens_seen": 113575395, "step": 5275, "time_per_iteration": 2.7782480716705322 }, { "auxiliary_loss_clip": 0.01124148, "auxiliary_loss_mlp": 0.01028423, "balance_loss_clip": 1.0402149, "balance_loss_mlp": 1.02074647, "epoch": 0.634401491011844, "flos": 17821927751040.0, "grad_norm": 2.374643647540472, "language_loss": 0.76300782, "learning_rate": 1.2456216300064289e-06, "loss": 0.7845335, "num_input_tokens_seen": 113592945, "step": 5276, "time_per_iteration": 2.7298896312713623 }, { "auxiliary_loss_clip": 0.01140898, "auxiliary_loss_mlp": 0.01030407, "balance_loss_clip": 1.04537344, "balance_loss_mlp": 1.0228554, "epoch": 0.634521733902483, "flos": 21358158825600.0, "grad_norm": 1.6397897082300514, "language_loss": 0.78284878, "learning_rate": 1.244900253378328e-06, "loss": 0.80456185, "num_input_tokens_seen": 113613000, "step": 5277, "time_per_iteration": 2.6677749156951904 }, { "auxiliary_loss_clip": 0.01076397, "auxiliary_loss_mlp": 0.01030292, "balance_loss_clip": 1.04171216, "balance_loss_mlp": 1.02345276, "epoch": 0.6346419767931221, "flos": 16545052103040.0, "grad_norm": 1.9961195474176063, "language_loss": 0.69258094, "learning_rate": 1.2441789912982583e-06, "loss": 0.71364784, "num_input_tokens_seen": 113630085, "step": 5278, "time_per_iteration": 2.8389453887939453 }, { "auxiliary_loss_clip": 0.01163363, "auxiliary_loss_mlp": 0.01032075, "balance_loss_clip": 1.05043542, "balance_loss_mlp": 1.02368236, "epoch": 0.6347622196837612, "flos": 24350989973760.0, "grad_norm": 1.7637818613266685, "language_loss": 0.64792824, "learning_rate": 1.2434578438756346e-06, "loss": 0.6698826, "num_input_tokens_seen": 113650515, "step": 5279, "time_per_iteration": 2.6771962642669678 }, { "auxiliary_loss_clip": 0.01161409, "auxiliary_loss_mlp": 0.01026124, "balance_loss_clip": 1.04601741, "balance_loss_mlp": 1.01887941, "epoch": 0.6348824625744003, "flos": 64523178195840.0, "grad_norm": 2.3303269093291763, "language_loss": 0.78152448, "learning_rate": 1.242736811219855e-06, "loss": 0.8033998, "num_input_tokens_seen": 113676475, "step": 5280, "time_per_iteration": 2.9890549182891846 }, { "auxiliary_loss_clip": 0.01154543, "auxiliary_loss_mlp": 0.01024242, "balance_loss_clip": 1.04732609, "balance_loss_mlp": 1.01726866, "epoch": 0.6350027054650393, "flos": 28622133313920.0, "grad_norm": 1.675415837955028, "language_loss": 0.81703269, "learning_rate": 1.2420158934402988e-06, "loss": 0.83882052, "num_input_tokens_seen": 113697090, "step": 5281, "time_per_iteration": 2.7611465454101562 }, { "auxiliary_loss_clip": 0.01117522, "auxiliary_loss_mlp": 0.01024911, "balance_loss_clip": 1.04075909, "balance_loss_mlp": 1.01731431, "epoch": 0.6351229483556785, "flos": 23002544476800.0, "grad_norm": 2.1143692431512067, "language_loss": 0.85161644, "learning_rate": 1.2412950906463286e-06, "loss": 0.87304074, "num_input_tokens_seen": 113714395, "step": 5282, "time_per_iteration": 2.688275098800659 }, { "auxiliary_loss_clip": 0.01115977, "auxiliary_loss_mlp": 0.01030229, "balance_loss_clip": 1.04239368, "balance_loss_mlp": 1.02332079, "epoch": 0.6352431912463176, "flos": 21939300967680.0, "grad_norm": 2.0164715919051486, "language_loss": 0.89986688, "learning_rate": 1.2405744029472902e-06, "loss": 0.92132896, "num_input_tokens_seen": 113733880, "step": 5283, "time_per_iteration": 2.73823881149292 }, { "auxiliary_loss_clip": 0.01142669, "auxiliary_loss_mlp": 0.01025198, "balance_loss_clip": 1.04418111, "balance_loss_mlp": 1.01763439, "epoch": 0.6353634341369566, "flos": 13735257684480.0, "grad_norm": 1.8310045981907783, "language_loss": 0.76113969, "learning_rate": 1.2398538304525108e-06, "loss": 0.78281838, "num_input_tokens_seen": 113752505, "step": 5284, "time_per_iteration": 2.7103374004364014 }, { "auxiliary_loss_clip": 0.01126208, "auxiliary_loss_mlp": 0.01031939, "balance_loss_clip": 1.0457195, "balance_loss_mlp": 1.0232904, "epoch": 0.6354836770275958, "flos": 19316170552320.0, "grad_norm": 2.1330436067971297, "language_loss": 0.75468421, "learning_rate": 1.2391333732713016e-06, "loss": 0.77626568, "num_input_tokens_seen": 113770310, "step": 5285, "time_per_iteration": 2.7233896255493164 }, { "auxiliary_loss_clip": 0.0113029, "auxiliary_loss_mlp": 0.0103802, "balance_loss_clip": 1.04296708, "balance_loss_mlp": 1.02971148, "epoch": 0.6356039199182348, "flos": 21613375935360.0, "grad_norm": 2.0871409990848093, "language_loss": 0.79010153, "learning_rate": 1.2384130315129543e-06, "loss": 0.81178463, "num_input_tokens_seen": 113788635, "step": 5286, "time_per_iteration": 3.587177038192749 }, { "auxiliary_loss_clip": 0.01066996, "auxiliary_loss_mlp": 0.01027311, "balance_loss_clip": 1.03541219, "balance_loss_mlp": 1.01989055, "epoch": 0.6357241628088739, "flos": 18111978074880.0, "grad_norm": 2.120276243448093, "language_loss": 0.73289919, "learning_rate": 1.2376928052867447e-06, "loss": 0.75384223, "num_input_tokens_seen": 113807755, "step": 5287, "time_per_iteration": 3.038170576095581 }, { "auxiliary_loss_clip": 0.01148154, "auxiliary_loss_mlp": 0.01030738, "balance_loss_clip": 1.04968822, "balance_loss_mlp": 1.02346015, "epoch": 0.6358444056995131, "flos": 24935256599040.0, "grad_norm": 2.2837784572725206, "language_loss": 0.77486622, "learning_rate": 1.2369726947019299e-06, "loss": 0.79665518, "num_input_tokens_seen": 113828230, "step": 5288, "time_per_iteration": 4.634884595870972 }, { "auxiliary_loss_clip": 0.01156791, "auxiliary_loss_mlp": 0.01025436, "balance_loss_clip": 1.04478097, "balance_loss_mlp": 1.01846206, "epoch": 0.6359646485901521, "flos": 23293348986240.0, "grad_norm": 2.29941443290014, "language_loss": 0.67297822, "learning_rate": 1.2362526998677511e-06, "loss": 0.6948005, "num_input_tokens_seen": 113844595, "step": 5289, "time_per_iteration": 3.5291662216186523 }, { "auxiliary_loss_clip": 0.01148948, "auxiliary_loss_mlp": 0.01029778, "balance_loss_clip": 1.04633701, "balance_loss_mlp": 1.0230819, "epoch": 0.6360848914807912, "flos": 20887442069760.0, "grad_norm": 2.557268739667188, "language_loss": 0.84437758, "learning_rate": 1.2355328208934301e-06, "loss": 0.8661648, "num_input_tokens_seen": 113863470, "step": 5290, "time_per_iteration": 2.65129017829895 }, { "auxiliary_loss_clip": 0.01159182, "auxiliary_loss_mlp": 0.0076231, "balance_loss_clip": 1.04500699, "balance_loss_mlp": 1.00049663, "epoch": 0.6362051343714303, "flos": 18479775386880.0, "grad_norm": 1.77972467008367, "language_loss": 0.72071528, "learning_rate": 1.2348130578881728e-06, "loss": 0.73993021, "num_input_tokens_seen": 113881690, "step": 5291, "time_per_iteration": 2.618036985397339 }, { "auxiliary_loss_clip": 0.01177379, "auxiliary_loss_mlp": 0.01030144, "balance_loss_clip": 1.05040383, "balance_loss_mlp": 1.02203155, "epoch": 0.6363253772620694, "flos": 24389594115840.0, "grad_norm": 2.057771843033151, "language_loss": 0.75667763, "learning_rate": 1.2340934109611664e-06, "loss": 0.7787528, "num_input_tokens_seen": 113902450, "step": 5292, "time_per_iteration": 3.573744773864746 }, { "auxiliary_loss_clip": 0.01152606, "auxiliary_loss_mlp": 0.01027618, "balance_loss_clip": 1.04839814, "balance_loss_mlp": 1.01967573, "epoch": 0.6364456201527084, "flos": 25958243940480.0, "grad_norm": 2.181494360196889, "language_loss": 0.68380755, "learning_rate": 1.2333738802215798e-06, "loss": 0.7056098, "num_input_tokens_seen": 113922670, "step": 5293, "time_per_iteration": 2.7434213161468506 }, { "auxiliary_loss_clip": 0.01110469, "auxiliary_loss_mlp": 0.01022387, "balance_loss_clip": 1.039469, "balance_loss_mlp": 1.0154345, "epoch": 0.6365658630433476, "flos": 20740711011840.0, "grad_norm": 2.2689876735538337, "language_loss": 0.81064498, "learning_rate": 1.2326544657785668e-06, "loss": 0.83197355, "num_input_tokens_seen": 113942360, "step": 5294, "time_per_iteration": 2.8029656410217285 }, { "auxiliary_loss_clip": 0.0112284, "auxiliary_loss_mlp": 0.01035096, "balance_loss_clip": 1.04234719, "balance_loss_mlp": 1.02670979, "epoch": 0.6366861059339867, "flos": 21434146047360.0, "grad_norm": 2.5436829478612086, "language_loss": 0.74066776, "learning_rate": 1.2319351677412608e-06, "loss": 0.76224715, "num_input_tokens_seen": 113959405, "step": 5295, "time_per_iteration": 2.727306365966797 }, { "auxiliary_loss_clip": 0.01142147, "auxiliary_loss_mlp": 0.01025837, "balance_loss_clip": 1.04753351, "balance_loss_mlp": 1.01822233, "epoch": 0.6368063488246257, "flos": 22267093507200.0, "grad_norm": 1.8415800188896279, "language_loss": 0.74934649, "learning_rate": 1.2312159862187796e-06, "loss": 0.77102637, "num_input_tokens_seen": 113977815, "step": 5296, "time_per_iteration": 2.7386586666107178 }, { "auxiliary_loss_clip": 0.01180077, "auxiliary_loss_mlp": 0.01028374, "balance_loss_clip": 1.05303597, "balance_loss_mlp": 1.02061319, "epoch": 0.6369265917152649, "flos": 22420719976320.0, "grad_norm": 1.5819765176360219, "language_loss": 0.76343715, "learning_rate": 1.2304969213202217e-06, "loss": 0.78552169, "num_input_tokens_seen": 113999075, "step": 5297, "time_per_iteration": 2.6726162433624268 }, { "auxiliary_loss_clip": 0.01141602, "auxiliary_loss_mlp": 0.01029542, "balance_loss_clip": 1.04410934, "balance_loss_mlp": 1.0221715, "epoch": 0.6370468346059039, "flos": 24718176754560.0, "grad_norm": 2.4925276795690294, "language_loss": 0.79390979, "learning_rate": 1.2297779731546692e-06, "loss": 0.81562126, "num_input_tokens_seen": 114018170, "step": 5298, "time_per_iteration": 2.7346549034118652 }, { "auxiliary_loss_clip": 0.01143444, "auxiliary_loss_mlp": 0.01029182, "balance_loss_clip": 1.04820967, "balance_loss_mlp": 1.02209508, "epoch": 0.637167077496543, "flos": 25296589463040.0, "grad_norm": 2.067276268599477, "language_loss": 0.77789748, "learning_rate": 1.2290591418311853e-06, "loss": 0.79962373, "num_input_tokens_seen": 114035565, "step": 5299, "time_per_iteration": 2.7106876373291016 }, { "auxiliary_loss_clip": 0.01161208, "auxiliary_loss_mlp": 0.01027847, "balance_loss_clip": 1.05019212, "balance_loss_mlp": 1.02036107, "epoch": 0.637287320387182, "flos": 27671110871040.0, "grad_norm": 1.5693646145175868, "language_loss": 0.72306943, "learning_rate": 1.2283404274588172e-06, "loss": 0.74495995, "num_input_tokens_seen": 114054510, "step": 5300, "time_per_iteration": 2.7469778060913086 }, { "auxiliary_loss_clip": 0.00994892, "auxiliary_loss_mlp": 0.01000833, "balance_loss_clip": 1.00952315, "balance_loss_mlp": 0.9996528, "epoch": 0.6374075632778212, "flos": 63173406873600.0, "grad_norm": 0.7415873848415078, "language_loss": 0.52883971, "learning_rate": 1.227621830146592e-06, "loss": 0.54879701, "num_input_tokens_seen": 114109875, "step": 5301, "time_per_iteration": 3.404991388320923 }, { "auxiliary_loss_clip": 0.01136921, "auxiliary_loss_mlp": 0.0102631, "balance_loss_clip": 1.04707646, "balance_loss_mlp": 1.01862669, "epoch": 0.6375278061684603, "flos": 25558127366400.0, "grad_norm": 1.9585651414638527, "language_loss": 0.79148632, "learning_rate": 1.2269033500035217e-06, "loss": 0.81311864, "num_input_tokens_seen": 114130010, "step": 5302, "time_per_iteration": 3.368680000305176 }, { "auxiliary_loss_clip": 0.01131449, "auxiliary_loss_mlp": 0.01025915, "balance_loss_clip": 1.04551506, "balance_loss_mlp": 1.01802659, "epoch": 0.6376480490590993, "flos": 25666362023040.0, "grad_norm": 2.0675034852357927, "language_loss": 0.73615062, "learning_rate": 1.2261849871385988e-06, "loss": 0.75772429, "num_input_tokens_seen": 114151115, "step": 5303, "time_per_iteration": 2.760967493057251 }, { "auxiliary_loss_clip": 0.01176325, "auxiliary_loss_mlp": 0.0102894, "balance_loss_clip": 1.0501461, "balance_loss_mlp": 1.02156949, "epoch": 0.6377682919497385, "flos": 31537684350720.0, "grad_norm": 2.246283143761021, "language_loss": 0.62398255, "learning_rate": 1.2254667416607972e-06, "loss": 0.64603519, "num_input_tokens_seen": 114172715, "step": 5304, "time_per_iteration": 2.687709093093872 }, { "auxiliary_loss_clip": 0.01161412, "auxiliary_loss_mlp": 0.0102548, "balance_loss_clip": 1.05033386, "balance_loss_mlp": 1.01786828, "epoch": 0.6378885348403776, "flos": 23039209284480.0, "grad_norm": 1.925133563341257, "language_loss": 0.82811928, "learning_rate": 1.2247486136790756e-06, "loss": 0.84998822, "num_input_tokens_seen": 114192195, "step": 5305, "time_per_iteration": 2.841505527496338 }, { "auxiliary_loss_clip": 0.01162759, "auxiliary_loss_mlp": 0.01029764, "balance_loss_clip": 1.04940081, "balance_loss_mlp": 1.0223788, "epoch": 0.6380087777310166, "flos": 18697070712960.0, "grad_norm": 2.1775179487553835, "language_loss": 0.80852914, "learning_rate": 1.2240306033023726e-06, "loss": 0.83045435, "num_input_tokens_seen": 114210020, "step": 5306, "time_per_iteration": 2.594621181488037 }, { "auxiliary_loss_clip": 0.01131688, "auxiliary_loss_mlp": 0.01026027, "balance_loss_clip": 1.04118454, "balance_loss_mlp": 1.01871324, "epoch": 0.6381290206216558, "flos": 23331558078720.0, "grad_norm": 1.7488616917789837, "language_loss": 0.72020495, "learning_rate": 1.223312710639611e-06, "loss": 0.74178207, "num_input_tokens_seen": 114228740, "step": 5307, "time_per_iteration": 2.68925142288208 }, { "auxiliary_loss_clip": 0.01142872, "auxiliary_loss_mlp": 0.01029347, "balance_loss_clip": 1.04559803, "balance_loss_mlp": 1.02173531, "epoch": 0.6382492635122948, "flos": 18880466578560.0, "grad_norm": 2.1993259722054153, "language_loss": 0.86659992, "learning_rate": 1.2225949357996928e-06, "loss": 0.88832211, "num_input_tokens_seen": 114246865, "step": 5308, "time_per_iteration": 2.6075196266174316 }, { "auxiliary_loss_clip": 0.01156911, "auxiliary_loss_mlp": 0.01032299, "balance_loss_clip": 1.04963899, "balance_loss_mlp": 1.02519393, "epoch": 0.6383695064029339, "flos": 27819134818560.0, "grad_norm": 1.4832887252028344, "language_loss": 0.80379075, "learning_rate": 1.221877278891505e-06, "loss": 0.82568276, "num_input_tokens_seen": 114266120, "step": 5309, "time_per_iteration": 2.715855598449707 }, { "auxiliary_loss_clip": 0.01165209, "auxiliary_loss_mlp": 0.01033828, "balance_loss_clip": 1.04855645, "balance_loss_mlp": 1.02567101, "epoch": 0.638489749293573, "flos": 26395635853440.0, "grad_norm": 2.018405432019353, "language_loss": 0.71561974, "learning_rate": 1.221159740023915e-06, "loss": 0.7376101, "num_input_tokens_seen": 114285950, "step": 5310, "time_per_iteration": 2.655057907104492 }, { "auxiliary_loss_clip": 0.01141862, "auxiliary_loss_mlp": 0.00762883, "balance_loss_clip": 1.04819918, "balance_loss_mlp": 1.00053453, "epoch": 0.6386099921842121, "flos": 23988328306560.0, "grad_norm": 6.035588048634325, "language_loss": 0.7234509, "learning_rate": 1.2204423193057735e-06, "loss": 0.74249834, "num_input_tokens_seen": 114304780, "step": 5311, "time_per_iteration": 2.8233413696289062 }, { "auxiliary_loss_clip": 0.01053852, "auxiliary_loss_mlp": 0.01001003, "balance_loss_clip": 1.01542854, "balance_loss_mlp": 0.99974567, "epoch": 0.6387302350748512, "flos": 71731169337600.0, "grad_norm": 0.8452672690425748, "language_loss": 0.63308752, "learning_rate": 1.2197250168459122e-06, "loss": 0.65363598, "num_input_tokens_seen": 114361180, "step": 5312, "time_per_iteration": 4.199880838394165 }, { "auxiliary_loss_clip": 0.01163399, "auxiliary_loss_mlp": 0.01027068, "balance_loss_clip": 1.04858959, "balance_loss_mlp": 1.02010036, "epoch": 0.6388504779654903, "flos": 14535778141440.0, "grad_norm": 1.691616405903377, "language_loss": 0.7419101, "learning_rate": 1.2190078327531454e-06, "loss": 0.76381481, "num_input_tokens_seen": 114377425, "step": 5313, "time_per_iteration": 2.603571653366089 }, { "auxiliary_loss_clip": 0.01161222, "auxiliary_loss_mlp": 0.01025379, "balance_loss_clip": 1.04836512, "balance_loss_mlp": 1.01854503, "epoch": 0.6389707208561294, "flos": 22346133384960.0, "grad_norm": 1.4173063366601204, "language_loss": 0.7275964, "learning_rate": 1.2182907671362697e-06, "loss": 0.74946243, "num_input_tokens_seen": 114398120, "step": 5314, "time_per_iteration": 3.656221628189087 }, { "auxiliary_loss_clip": 0.01161402, "auxiliary_loss_mlp": 0.01028432, "balance_loss_clip": 1.05037165, "balance_loss_mlp": 1.0211457, "epoch": 0.6390909637467684, "flos": 19426883247360.0, "grad_norm": 2.7112823466930536, "language_loss": 0.78783345, "learning_rate": 1.2175738201040626e-06, "loss": 0.80973178, "num_input_tokens_seen": 114415160, "step": 5315, "time_per_iteration": 3.570990800857544 }, { "auxiliary_loss_clip": 0.01160563, "auxiliary_loss_mlp": 0.01027233, "balance_loss_clip": 1.04875159, "balance_loss_mlp": 1.01973462, "epoch": 0.6392112066374076, "flos": 24090852700800.0, "grad_norm": 2.4512719865386874, "language_loss": 0.78191501, "learning_rate": 1.2168569917652855e-06, "loss": 0.80379295, "num_input_tokens_seen": 114435015, "step": 5316, "time_per_iteration": 2.711521863937378 }, { "auxiliary_loss_clip": 0.01158919, "auxiliary_loss_mlp": 0.01025774, "balance_loss_clip": 1.04927981, "balance_loss_mlp": 1.01870775, "epoch": 0.6393314495280467, "flos": 26795141896320.0, "grad_norm": 1.5665602098075946, "language_loss": 0.64286017, "learning_rate": 1.2161402822286797e-06, "loss": 0.66470706, "num_input_tokens_seen": 114455700, "step": 5317, "time_per_iteration": 2.690436601638794 }, { "auxiliary_loss_clip": 0.01129294, "auxiliary_loss_mlp": 0.01023429, "balance_loss_clip": 1.04329109, "balance_loss_mlp": 1.0158056, "epoch": 0.6394516924186857, "flos": 20260692633600.0, "grad_norm": 1.9781297407167873, "language_loss": 0.79439914, "learning_rate": 1.2154236916029703e-06, "loss": 0.81592643, "num_input_tokens_seen": 114473675, "step": 5318, "time_per_iteration": 3.6103367805480957 }, { "auxiliary_loss_clip": 0.01114, "auxiliary_loss_mlp": 0.01024496, "balance_loss_clip": 1.03892767, "balance_loss_mlp": 1.01703084, "epoch": 0.6395719353093249, "flos": 18368847210240.0, "grad_norm": 2.2051832447757045, "language_loss": 0.73096967, "learning_rate": 1.2147072199968627e-06, "loss": 0.75235468, "num_input_tokens_seen": 114492310, "step": 5319, "time_per_iteration": 2.7295048236846924 }, { "auxiliary_loss_clip": 0.01155768, "auxiliary_loss_mlp": 0.01021281, "balance_loss_clip": 1.04699922, "balance_loss_mlp": 1.01420879, "epoch": 0.6396921781999639, "flos": 17566315591680.0, "grad_norm": 1.8858108512048277, "language_loss": 0.71759582, "learning_rate": 1.2139908675190454e-06, "loss": 0.73936629, "num_input_tokens_seen": 114511520, "step": 5320, "time_per_iteration": 2.731945753097534 }, { "auxiliary_loss_clip": 0.01090251, "auxiliary_loss_mlp": 0.01026906, "balance_loss_clip": 1.03541028, "balance_loss_mlp": 1.01953852, "epoch": 0.639812421090603, "flos": 21251252972160.0, "grad_norm": 1.9917184137324324, "language_loss": 0.74906778, "learning_rate": 1.2132746342781883e-06, "loss": 0.77023935, "num_input_tokens_seen": 114532680, "step": 5321, "time_per_iteration": 2.7771174907684326 }, { "auxiliary_loss_clip": 0.01174724, "auxiliary_loss_mlp": 0.01029606, "balance_loss_clip": 1.04901111, "balance_loss_mlp": 1.02220654, "epoch": 0.6399326639812422, "flos": 11180967684480.0, "grad_norm": 2.5922901615688914, "language_loss": 0.79438376, "learning_rate": 1.2125585203829442e-06, "loss": 0.81642711, "num_input_tokens_seen": 114548320, "step": 5322, "time_per_iteration": 2.5808699131011963 }, { "auxiliary_loss_clip": 0.01119064, "auxiliary_loss_mlp": 0.01028691, "balance_loss_clip": 1.04365969, "balance_loss_mlp": 1.02037048, "epoch": 0.6400529068718812, "flos": 23911048195200.0, "grad_norm": 2.04856855257734, "language_loss": 0.74466324, "learning_rate": 1.211842525941946e-06, "loss": 0.76614082, "num_input_tokens_seen": 114568115, "step": 5323, "time_per_iteration": 2.723644971847534 }, { "auxiliary_loss_clip": 0.01114824, "auxiliary_loss_mlp": 0.0102218, "balance_loss_clip": 1.04454517, "balance_loss_mlp": 1.01557875, "epoch": 0.6401731497625203, "flos": 44018724890880.0, "grad_norm": 2.113475347509435, "language_loss": 0.78656989, "learning_rate": 1.2111266510638105e-06, "loss": 0.80794001, "num_input_tokens_seen": 114591040, "step": 5324, "time_per_iteration": 2.922257661819458 }, { "auxiliary_loss_clip": 0.01096338, "auxiliary_loss_mlp": 0.0103017, "balance_loss_clip": 1.0416286, "balance_loss_mlp": 1.02291, "epoch": 0.6402933926531594, "flos": 20662209838080.0, "grad_norm": 1.8445528298622493, "language_loss": 0.80207598, "learning_rate": 1.2104108958571346e-06, "loss": 0.82334113, "num_input_tokens_seen": 114609310, "step": 5325, "time_per_iteration": 2.795203685760498 }, { "auxiliary_loss_clip": 0.01157155, "auxiliary_loss_mlp": 0.01026778, "balance_loss_clip": 1.04815531, "balance_loss_mlp": 1.01999474, "epoch": 0.6404136355437985, "flos": 24863327614080.0, "grad_norm": 1.4590001563982964, "language_loss": 0.75836217, "learning_rate": 1.2096952604304975e-06, "loss": 0.78020144, "num_input_tokens_seen": 114629740, "step": 5326, "time_per_iteration": 2.6378660202026367 }, { "auxiliary_loss_clip": 0.0115992, "auxiliary_loss_mlp": 0.01028004, "balance_loss_clip": 1.04558539, "balance_loss_mlp": 1.02039886, "epoch": 0.6405338784344375, "flos": 40479548901120.0, "grad_norm": 1.938829284381412, "language_loss": 0.70430952, "learning_rate": 1.2089797448924616e-06, "loss": 0.72618878, "num_input_tokens_seen": 114653615, "step": 5327, "time_per_iteration": 2.9087090492248535 }, { "auxiliary_loss_clip": 0.01120455, "auxiliary_loss_mlp": 0.01028842, "balance_loss_clip": 1.04040921, "balance_loss_mlp": 1.02136779, "epoch": 0.6406541213250767, "flos": 20886041439360.0, "grad_norm": 2.656040116407189, "language_loss": 0.65939748, "learning_rate": 1.2082643493515692e-06, "loss": 0.68089044, "num_input_tokens_seen": 114671935, "step": 5328, "time_per_iteration": 2.7894093990325928 }, { "auxiliary_loss_clip": 0.01159272, "auxiliary_loss_mlp": 0.01025569, "balance_loss_clip": 1.04792619, "balance_loss_mlp": 1.01824045, "epoch": 0.6407743642157158, "flos": 23295970679040.0, "grad_norm": 2.059808614781452, "language_loss": 0.8161419, "learning_rate": 1.207549073916346e-06, "loss": 0.83799028, "num_input_tokens_seen": 114692870, "step": 5329, "time_per_iteration": 2.7692503929138184 }, { "auxiliary_loss_clip": 0.01136793, "auxiliary_loss_mlp": 0.01023226, "balance_loss_clip": 1.04543293, "balance_loss_mlp": 1.01607907, "epoch": 0.6408946071063548, "flos": 15012636122880.0, "grad_norm": 2.5858881972164154, "language_loss": 0.77797782, "learning_rate": 1.2068339186952976e-06, "loss": 0.79957807, "num_input_tokens_seen": 114710410, "step": 5330, "time_per_iteration": 2.664731502532959 }, { "auxiliary_loss_clip": 0.01164942, "auxiliary_loss_mlp": 0.01025934, "balance_loss_clip": 1.04909468, "balance_loss_mlp": 1.01810765, "epoch": 0.6410148499969939, "flos": 22528595496960.0, "grad_norm": 2.022033454527087, "language_loss": 0.73591536, "learning_rate": 1.2061188837969136e-06, "loss": 0.75782406, "num_input_tokens_seen": 114730020, "step": 5331, "time_per_iteration": 2.6846861839294434 }, { "auxiliary_loss_clip": 0.01126116, "auxiliary_loss_mlp": 0.01025968, "balance_loss_clip": 1.04050219, "balance_loss_mlp": 1.01835036, "epoch": 0.641135092887633, "flos": 12422004537600.0, "grad_norm": 2.342157508519514, "language_loss": 0.84378564, "learning_rate": 1.2054039693296631e-06, "loss": 0.86530644, "num_input_tokens_seen": 114748015, "step": 5332, "time_per_iteration": 2.660792350769043 }, { "auxiliary_loss_clip": 0.0112665, "auxiliary_loss_mlp": 0.01024171, "balance_loss_clip": 1.04358208, "balance_loss_mlp": 1.01672626, "epoch": 0.6412553357782721, "flos": 22127329687680.0, "grad_norm": 1.639156255112616, "language_loss": 0.81725526, "learning_rate": 1.2046891754019992e-06, "loss": 0.83876348, "num_input_tokens_seen": 114768625, "step": 5333, "time_per_iteration": 2.844221353530884 }, { "auxiliary_loss_clip": 0.01159995, "auxiliary_loss_mlp": 0.01028302, "balance_loss_clip": 1.04634595, "balance_loss_mlp": 1.0206461, "epoch": 0.6413755786689112, "flos": 15888605097600.0, "grad_norm": 2.40033997367305, "language_loss": 0.82681113, "learning_rate": 1.2039745021223548e-06, "loss": 0.84869409, "num_input_tokens_seen": 114786045, "step": 5334, "time_per_iteration": 2.5942797660827637 }, { "auxiliary_loss_clip": 0.0102952, "auxiliary_loss_mlp": 0.01002306, "balance_loss_clip": 1.0182972, "balance_loss_mlp": 1.00113201, "epoch": 0.6414958215595503, "flos": 68039159955840.0, "grad_norm": 0.788256646346243, "language_loss": 0.57024866, "learning_rate": 1.2032599495991456e-06, "loss": 0.59056693, "num_input_tokens_seen": 114850785, "step": 5335, "time_per_iteration": 3.3964450359344482 }, { "auxiliary_loss_clip": 0.011604, "auxiliary_loss_mlp": 0.01022576, "balance_loss_clip": 1.04789495, "balance_loss_mlp": 1.01513433, "epoch": 0.6416160644501894, "flos": 44091300320640.0, "grad_norm": 1.76425795870613, "language_loss": 0.69726133, "learning_rate": 1.2025455179407685e-06, "loss": 0.71909112, "num_input_tokens_seen": 114871945, "step": 5336, "time_per_iteration": 2.86095929145813 }, { "auxiliary_loss_clip": 0.01155821, "auxiliary_loss_mlp": 0.00762644, "balance_loss_clip": 1.0478338, "balance_loss_mlp": 1.00054431, "epoch": 0.6417363073408284, "flos": 20959837931520.0, "grad_norm": 2.063697684325237, "language_loss": 0.73905039, "learning_rate": 1.2018312072556022e-06, "loss": 0.75823504, "num_input_tokens_seen": 114890445, "step": 5337, "time_per_iteration": 3.5729243755340576 }, { "auxiliary_loss_clip": 0.01170587, "auxiliary_loss_mlp": 0.00762284, "balance_loss_clip": 1.04721975, "balance_loss_mlp": 1.00055599, "epoch": 0.6418565502314676, "flos": 22455122227200.0, "grad_norm": 2.1030914912776986, "language_loss": 0.74635285, "learning_rate": 1.2011170176520077e-06, "loss": 0.76568151, "num_input_tokens_seen": 114911360, "step": 5338, "time_per_iteration": 2.669750690460205 }, { "auxiliary_loss_clip": 0.01085651, "auxiliary_loss_mlp": 0.01027062, "balance_loss_clip": 1.03789282, "balance_loss_mlp": 1.01955819, "epoch": 0.6419767931221066, "flos": 25045502417280.0, "grad_norm": 1.9703900969457737, "language_loss": 0.81288373, "learning_rate": 1.2004029492383256e-06, "loss": 0.8340109, "num_input_tokens_seen": 114932700, "step": 5339, "time_per_iteration": 2.810675859451294 }, { "auxiliary_loss_clip": 0.01156756, "auxiliary_loss_mlp": 0.01030442, "balance_loss_clip": 1.04779112, "balance_loss_mlp": 1.02319729, "epoch": 0.6420970360127457, "flos": 19463691709440.0, "grad_norm": 2.170590560732831, "language_loss": 0.73677218, "learning_rate": 1.1996890021228814e-06, "loss": 0.75864422, "num_input_tokens_seen": 114949475, "step": 5340, "time_per_iteration": 4.567188739776611 }, { "auxiliary_loss_clip": 0.0113898, "auxiliary_loss_mlp": 0.01026927, "balance_loss_clip": 1.04254699, "balance_loss_mlp": 1.01958919, "epoch": 0.6422172789033849, "flos": 40406147458560.0, "grad_norm": 1.5906500217097792, "language_loss": 0.69832337, "learning_rate": 1.1989751764139785e-06, "loss": 0.71998245, "num_input_tokens_seen": 114973125, "step": 5341, "time_per_iteration": 2.8053700923919678 }, { "auxiliary_loss_clip": 0.01112559, "auxiliary_loss_mlp": 0.0102459, "balance_loss_clip": 1.03769433, "balance_loss_mlp": 1.01711881, "epoch": 0.6423375217940239, "flos": 27672870637440.0, "grad_norm": 1.7448467086093649, "language_loss": 0.83006179, "learning_rate": 1.1982614722199044e-06, "loss": 0.85143328, "num_input_tokens_seen": 114994300, "step": 5342, "time_per_iteration": 2.772854804992676 }, { "auxiliary_loss_clip": 0.01149223, "auxiliary_loss_mlp": 0.01025057, "balance_loss_clip": 1.04457331, "balance_loss_mlp": 1.01786304, "epoch": 0.642457764684663, "flos": 18369242259840.0, "grad_norm": 2.269767469478735, "language_loss": 0.7776804, "learning_rate": 1.1975478896489276e-06, "loss": 0.79942322, "num_input_tokens_seen": 115012135, "step": 5343, "time_per_iteration": 2.7478439807891846 }, { "auxiliary_loss_clip": 0.01170525, "auxiliary_loss_mlp": 0.01029527, "balance_loss_clip": 1.04794669, "balance_loss_mlp": 1.02266681, "epoch": 0.6425780075753021, "flos": 19750509809280.0, "grad_norm": 2.042513485887519, "language_loss": 0.76595056, "learning_rate": 1.1968344288092981e-06, "loss": 0.78795111, "num_input_tokens_seen": 115028715, "step": 5344, "time_per_iteration": 3.436119318008423 }, { "auxiliary_loss_clip": 0.0115733, "auxiliary_loss_mlp": 0.00762572, "balance_loss_clip": 1.04792917, "balance_loss_mlp": 1.00066936, "epoch": 0.6426982504659412, "flos": 20558536208640.0, "grad_norm": 1.7391774110332807, "language_loss": 0.64619517, "learning_rate": 1.1961210898092468e-06, "loss": 0.66539419, "num_input_tokens_seen": 115047665, "step": 5345, "time_per_iteration": 2.6262311935424805 }, { "auxiliary_loss_clip": 0.01150191, "auxiliary_loss_mlp": 0.01026694, "balance_loss_clip": 1.04644859, "balance_loss_mlp": 1.01834619, "epoch": 0.6428184933565803, "flos": 17851984456320.0, "grad_norm": 2.676841641594589, "language_loss": 0.7957865, "learning_rate": 1.1954078727569874e-06, "loss": 0.81755531, "num_input_tokens_seen": 115064965, "step": 5346, "time_per_iteration": 2.70442533493042 }, { "auxiliary_loss_clip": 0.01137285, "auxiliary_loss_mlp": 0.00762464, "balance_loss_clip": 1.04552233, "balance_loss_mlp": 1.00056553, "epoch": 0.6429387362472194, "flos": 22456953820800.0, "grad_norm": 1.5705360765861687, "language_loss": 0.78144526, "learning_rate": 1.1946947777607141e-06, "loss": 0.80044276, "num_input_tokens_seen": 115086100, "step": 5347, "time_per_iteration": 2.7072107791900635 }, { "auxiliary_loss_clip": 0.01108173, "auxiliary_loss_mlp": 0.01025181, "balance_loss_clip": 1.03998256, "balance_loss_mlp": 1.01758099, "epoch": 0.6430589791378585, "flos": 24752579005440.0, "grad_norm": 2.0372635490273057, "language_loss": 0.80356467, "learning_rate": 1.1939818049286024e-06, "loss": 0.82489812, "num_input_tokens_seen": 115104260, "step": 5348, "time_per_iteration": 2.7729461193084717 }, { "auxiliary_loss_clip": 0.01088804, "auxiliary_loss_mlp": 0.01025379, "balance_loss_clip": 1.03917503, "balance_loss_mlp": 1.01836908, "epoch": 0.6431792220284975, "flos": 24901249397760.0, "grad_norm": 1.563965092637992, "language_loss": 0.75819218, "learning_rate": 1.1932689543688101e-06, "loss": 0.77933395, "num_input_tokens_seen": 115125365, "step": 5349, "time_per_iteration": 2.7750625610351562 }, { "auxiliary_loss_clip": 0.01145894, "auxiliary_loss_mlp": 0.01023075, "balance_loss_clip": 1.04834282, "balance_loss_mlp": 1.0155623, "epoch": 0.6432994649191367, "flos": 21032305620480.0, "grad_norm": 2.0100338099152957, "language_loss": 0.72439885, "learning_rate": 1.1925562261894756e-06, "loss": 0.7460885, "num_input_tokens_seen": 115144445, "step": 5350, "time_per_iteration": 2.7003867626190186 }, { "auxiliary_loss_clip": 0.01138003, "auxiliary_loss_mlp": 0.01026935, "balance_loss_clip": 1.04225433, "balance_loss_mlp": 1.020015, "epoch": 0.6434197078097758, "flos": 30884433655680.0, "grad_norm": 1.8184370871729474, "language_loss": 0.77702719, "learning_rate": 1.1918436204987207e-06, "loss": 0.79867655, "num_input_tokens_seen": 115166305, "step": 5351, "time_per_iteration": 2.7806246280670166 }, { "auxiliary_loss_clip": 0.01154874, "auxiliary_loss_mlp": 0.01025233, "balance_loss_clip": 1.04820895, "balance_loss_mlp": 1.01785064, "epoch": 0.6435399507004148, "flos": 15012492468480.0, "grad_norm": 2.056899739662769, "language_loss": 0.81682646, "learning_rate": 1.191131137404645e-06, "loss": 0.83862752, "num_input_tokens_seen": 115183045, "step": 5352, "time_per_iteration": 2.649717092514038 }, { "auxiliary_loss_clip": 0.01118949, "auxiliary_loss_mlp": 0.01027643, "balance_loss_clip": 1.04208505, "balance_loss_mlp": 1.02077365, "epoch": 0.643660193591054, "flos": 19901981462400.0, "grad_norm": 1.78680452105505, "language_loss": 0.7746706, "learning_rate": 1.190418777015333e-06, "loss": 0.79613656, "num_input_tokens_seen": 115201955, "step": 5353, "time_per_iteration": 2.694457530975342 }, { "auxiliary_loss_clip": 0.01144403, "auxiliary_loss_mlp": 0.01021693, "balance_loss_clip": 1.04655051, "balance_loss_mlp": 1.01486814, "epoch": 0.643780436481693, "flos": 24133622820480.0, "grad_norm": 1.4943752766375038, "language_loss": 0.73403805, "learning_rate": 1.1897065394388487e-06, "loss": 0.75569898, "num_input_tokens_seen": 115222395, "step": 5354, "time_per_iteration": 2.702378511428833 }, { "auxiliary_loss_clip": 0.01148265, "auxiliary_loss_mlp": 0.01028956, "balance_loss_clip": 1.05039942, "balance_loss_mlp": 1.02148151, "epoch": 0.6439006793723321, "flos": 23148808657920.0, "grad_norm": 1.4912080869991633, "language_loss": 0.76575661, "learning_rate": 1.1889944247832385e-06, "loss": 0.78752887, "num_input_tokens_seen": 115242635, "step": 5355, "time_per_iteration": 2.7087044715881348 }, { "auxiliary_loss_clip": 0.01161436, "auxiliary_loss_mlp": 0.01027851, "balance_loss_clip": 1.04634571, "balance_loss_mlp": 1.02054954, "epoch": 0.6440209222629713, "flos": 23617909301760.0, "grad_norm": 2.0959938159232556, "language_loss": 0.70790398, "learning_rate": 1.1882824331565283e-06, "loss": 0.72979689, "num_input_tokens_seen": 115262095, "step": 5356, "time_per_iteration": 2.683272361755371 }, { "auxiliary_loss_clip": 0.01124464, "auxiliary_loss_mlp": 0.01024784, "balance_loss_clip": 1.04017138, "balance_loss_mlp": 1.01733327, "epoch": 0.6441411651536103, "flos": 16544872535040.0, "grad_norm": 2.062048874571363, "language_loss": 0.89189792, "learning_rate": 1.1875705646667287e-06, "loss": 0.9133904, "num_input_tokens_seen": 115279985, "step": 5357, "time_per_iteration": 2.7670819759368896 }, { "auxiliary_loss_clip": 0.01153797, "auxiliary_loss_mlp": 0.01025196, "balance_loss_clip": 1.04364038, "balance_loss_mlp": 1.01761711, "epoch": 0.6442614080442494, "flos": 25410965345280.0, "grad_norm": 3.400425634250684, "language_loss": 0.75669146, "learning_rate": 1.1868588194218282e-06, "loss": 0.77848148, "num_input_tokens_seen": 115300365, "step": 5358, "time_per_iteration": 2.672905445098877 }, { "auxiliary_loss_clip": 0.01149685, "auxiliary_loss_mlp": 0.01031557, "balance_loss_clip": 1.04421461, "balance_loss_mlp": 1.02336156, "epoch": 0.6443816509348885, "flos": 28294017552000.0, "grad_norm": 1.4901082172100037, "language_loss": 0.73926723, "learning_rate": 1.1861471975297979e-06, "loss": 0.76107961, "num_input_tokens_seen": 115322060, "step": 5359, "time_per_iteration": 2.826045274734497 }, { "auxiliary_loss_clip": 0.01127298, "auxiliary_loss_mlp": 0.01026704, "balance_loss_clip": 1.04403734, "balance_loss_mlp": 1.01946807, "epoch": 0.6445018938255276, "flos": 36690075964800.0, "grad_norm": 2.3990169971496846, "language_loss": 0.71077365, "learning_rate": 1.185435699098591e-06, "loss": 0.73231363, "num_input_tokens_seen": 115348255, "step": 5360, "time_per_iteration": 2.884958267211914 }, { "auxiliary_loss_clip": 0.01147157, "auxiliary_loss_mlp": 0.01023726, "balance_loss_clip": 1.04547536, "balance_loss_mlp": 1.01559031, "epoch": 0.6446221367161666, "flos": 14501411804160.0, "grad_norm": 2.2496068432067675, "language_loss": 0.78449708, "learning_rate": 1.1847243242361403e-06, "loss": 0.80620587, "num_input_tokens_seen": 115366845, "step": 5361, "time_per_iteration": 2.6301448345184326 }, { "auxiliary_loss_clip": 0.01145422, "auxiliary_loss_mlp": 0.01021795, "balance_loss_clip": 1.04507196, "balance_loss_mlp": 1.01356339, "epoch": 0.6447423796068057, "flos": 24609367480320.0, "grad_norm": 1.5649113824567857, "language_loss": 0.78285044, "learning_rate": 1.1840130730503624e-06, "loss": 0.80452263, "num_input_tokens_seen": 115388125, "step": 5362, "time_per_iteration": 2.726356267929077 }, { "auxiliary_loss_clip": 0.01171805, "auxiliary_loss_mlp": 0.01022534, "balance_loss_clip": 1.04729223, "balance_loss_mlp": 1.01476693, "epoch": 0.6448626224974449, "flos": 25047298097280.0, "grad_norm": 1.9315023608156254, "language_loss": 0.75241446, "learning_rate": 1.1833019456491518e-06, "loss": 0.7743578, "num_input_tokens_seen": 115409655, "step": 5363, "time_per_iteration": 3.5432562828063965 }, { "auxiliary_loss_clip": 0.0115861, "auxiliary_loss_mlp": 0.01028924, "balance_loss_clip": 1.04617751, "balance_loss_mlp": 1.02152693, "epoch": 0.6449828653880839, "flos": 22530355263360.0, "grad_norm": 2.5329935574136164, "language_loss": 0.79068565, "learning_rate": 1.1825909421403871e-06, "loss": 0.81256104, "num_input_tokens_seen": 115428750, "step": 5364, "time_per_iteration": 2.677765130996704 }, { "auxiliary_loss_clip": 0.01160076, "auxiliary_loss_mlp": 0.01023817, "balance_loss_clip": 1.04730439, "balance_loss_mlp": 1.01676846, "epoch": 0.645103108278723, "flos": 25695736369920.0, "grad_norm": 1.7365938867513604, "language_loss": 0.76530206, "learning_rate": 1.181880062631926e-06, "loss": 0.78714097, "num_input_tokens_seen": 115448085, "step": 5365, "time_per_iteration": 2.6400818824768066 }, { "auxiliary_loss_clip": 0.01137887, "auxiliary_loss_mlp": 0.01031518, "balance_loss_clip": 1.04567862, "balance_loss_mlp": 1.02371609, "epoch": 0.6452233511693621, "flos": 27450331925760.0, "grad_norm": 2.0865732231088625, "language_loss": 0.84499347, "learning_rate": 1.1811693072316093e-06, "loss": 0.86668754, "num_input_tokens_seen": 115465765, "step": 5366, "time_per_iteration": 3.6165668964385986 }, { "auxiliary_loss_clip": 0.01172568, "auxiliary_loss_mlp": 0.00762807, "balance_loss_clip": 1.04796934, "balance_loss_mlp": 1.0004921, "epoch": 0.6453435940600012, "flos": 19208618254080.0, "grad_norm": 2.5388547274568136, "language_loss": 0.84377205, "learning_rate": 1.1804586760472574e-06, "loss": 0.86312586, "num_input_tokens_seen": 115482230, "step": 5367, "time_per_iteration": 2.614454984664917 }, { "auxiliary_loss_clip": 0.01127642, "auxiliary_loss_mlp": 0.0102078, "balance_loss_clip": 1.04290581, "balance_loss_mlp": 1.01383281, "epoch": 0.6454638369506402, "flos": 25737680476800.0, "grad_norm": 2.199759009682726, "language_loss": 0.80076301, "learning_rate": 1.1797481691866736e-06, "loss": 0.82224733, "num_input_tokens_seen": 115499455, "step": 5368, "time_per_iteration": 2.783536434173584 }, { "auxiliary_loss_clip": 0.01135192, "auxiliary_loss_mlp": 0.01025577, "balance_loss_clip": 1.04511857, "balance_loss_mlp": 1.0185411, "epoch": 0.6455840798412794, "flos": 20989176364800.0, "grad_norm": 1.9244119840657354, "language_loss": 0.83499479, "learning_rate": 1.1790377867576393e-06, "loss": 0.85660255, "num_input_tokens_seen": 115517205, "step": 5369, "time_per_iteration": 2.707237958908081 }, { "auxiliary_loss_clip": 0.01145322, "auxiliary_loss_mlp": 0.01024524, "balance_loss_clip": 1.04304314, "balance_loss_mlp": 1.01680529, "epoch": 0.6457043227319185, "flos": 26067556005120.0, "grad_norm": 2.1214243686687397, "language_loss": 0.76618493, "learning_rate": 1.1783275288679203e-06, "loss": 0.78788334, "num_input_tokens_seen": 115534370, "step": 5370, "time_per_iteration": 3.6010429859161377 }, { "auxiliary_loss_clip": 0.01065607, "auxiliary_loss_mlp": 0.01000759, "balance_loss_clip": 1.01591372, "balance_loss_mlp": 0.99952519, "epoch": 0.6458245656225575, "flos": 60370831088640.0, "grad_norm": 0.8434385549710722, "language_loss": 0.57112616, "learning_rate": 1.177617395625262e-06, "loss": 0.59178984, "num_input_tokens_seen": 115592345, "step": 5371, "time_per_iteration": 3.1792173385620117 }, { "auxiliary_loss_clip": 0.01157239, "auxiliary_loss_mlp": 0.0102442, "balance_loss_clip": 1.04744673, "balance_loss_mlp": 1.01751781, "epoch": 0.6459448085131967, "flos": 23076771932160.0, "grad_norm": 1.7551207272230125, "language_loss": 0.75131011, "learning_rate": 1.1769073871373908e-06, "loss": 0.77312672, "num_input_tokens_seen": 115612550, "step": 5372, "time_per_iteration": 2.7512664794921875 }, { "auxiliary_loss_clip": 0.01126117, "auxiliary_loss_mlp": 0.01026109, "balance_loss_clip": 1.04085863, "balance_loss_mlp": 1.01887631, "epoch": 0.6460650514038357, "flos": 22598190097920.0, "grad_norm": 1.6600787171452684, "language_loss": 0.8398149, "learning_rate": 1.176197503512015e-06, "loss": 0.86133718, "num_input_tokens_seen": 115632265, "step": 5373, "time_per_iteration": 2.6576781272888184 }, { "auxiliary_loss_clip": 0.01140494, "auxiliary_loss_mlp": 0.0102508, "balance_loss_clip": 1.04493272, "balance_loss_mlp": 1.0184257, "epoch": 0.6461852942944748, "flos": 20266726118400.0, "grad_norm": 2.1542696555323886, "language_loss": 0.82502747, "learning_rate": 1.1754877448568223e-06, "loss": 0.8466832, "num_input_tokens_seen": 115651720, "step": 5374, "time_per_iteration": 2.673281192779541 }, { "auxiliary_loss_clip": 0.01145753, "auxiliary_loss_mlp": 0.01023188, "balance_loss_clip": 1.04633498, "balance_loss_mlp": 1.01609468, "epoch": 0.646305537185114, "flos": 23367109564800.0, "grad_norm": 2.0571233315915074, "language_loss": 0.89899641, "learning_rate": 1.1747781112794837e-06, "loss": 0.92068577, "num_input_tokens_seen": 115668215, "step": 5375, "time_per_iteration": 2.6856021881103516 }, { "auxiliary_loss_clip": 0.0112873, "auxiliary_loss_mlp": 0.01029758, "balance_loss_clip": 1.04537749, "balance_loss_mlp": 1.02239394, "epoch": 0.646425780075753, "flos": 24277480790400.0, "grad_norm": 1.6753893931179573, "language_loss": 0.83064592, "learning_rate": 1.1740686028876487e-06, "loss": 0.85223079, "num_input_tokens_seen": 115687080, "step": 5376, "time_per_iteration": 2.7336153984069824 }, { "auxiliary_loss_clip": 0.01153482, "auxiliary_loss_mlp": 0.01023194, "balance_loss_clip": 1.04701662, "balance_loss_mlp": 1.01630116, "epoch": 0.6465460229663921, "flos": 20813968800000.0, "grad_norm": 2.5946284431008664, "language_loss": 0.75111806, "learning_rate": 1.1733592197889507e-06, "loss": 0.77288485, "num_input_tokens_seen": 115703990, "step": 5377, "time_per_iteration": 2.651505708694458 }, { "auxiliary_loss_clip": 0.01152227, "auxiliary_loss_mlp": 0.01030671, "balance_loss_clip": 1.04684377, "balance_loss_mlp": 1.02424288, "epoch": 0.6466662658570312, "flos": 22853299466880.0, "grad_norm": 1.8094205101710146, "language_loss": 0.72773337, "learning_rate": 1.1726499620910014e-06, "loss": 0.74956238, "num_input_tokens_seen": 115724270, "step": 5378, "time_per_iteration": 2.6381213665008545 }, { "auxiliary_loss_clip": 0.01155724, "auxiliary_loss_mlp": 0.01024297, "balance_loss_clip": 1.04727411, "balance_loss_mlp": 1.01688802, "epoch": 0.6467865087476703, "flos": 15304553953920.0, "grad_norm": 2.0303361256069183, "language_loss": 0.777053, "learning_rate": 1.1719408299013955e-06, "loss": 0.79885322, "num_input_tokens_seen": 115742995, "step": 5379, "time_per_iteration": 2.5652832984924316 }, { "auxiliary_loss_clip": 0.0117156, "auxiliary_loss_mlp": 0.01023965, "balance_loss_clip": 1.04996824, "balance_loss_mlp": 1.01718473, "epoch": 0.6469067516383094, "flos": 19573650218880.0, "grad_norm": 2.762577765048556, "language_loss": 0.75946212, "learning_rate": 1.1712318233277067e-06, "loss": 0.78141737, "num_input_tokens_seen": 115762015, "step": 5380, "time_per_iteration": 2.6073129177093506 }, { "auxiliary_loss_clip": 0.01064556, "auxiliary_loss_mlp": 0.010011, "balance_loss_clip": 1.01540661, "balance_loss_mlp": 0.99994355, "epoch": 0.6470269945289485, "flos": 65098002522240.0, "grad_norm": 0.7589278556937867, "language_loss": 0.57853854, "learning_rate": 1.1705229424774916e-06, "loss": 0.59919512, "num_input_tokens_seen": 115816285, "step": 5381, "time_per_iteration": 3.082566738128662 }, { "auxiliary_loss_clip": 0.01139932, "auxiliary_loss_mlp": 0.01026484, "balance_loss_clip": 1.04369426, "balance_loss_mlp": 1.01919484, "epoch": 0.6471472374195876, "flos": 30696943639680.0, "grad_norm": 1.7766920374217674, "language_loss": 0.64240742, "learning_rate": 1.1698141874582867e-06, "loss": 0.66407156, "num_input_tokens_seen": 115837330, "step": 5382, "time_per_iteration": 2.7807562351226807 }, { "auxiliary_loss_clip": 0.01168423, "auxiliary_loss_mlp": 0.01029899, "balance_loss_clip": 1.04828072, "balance_loss_mlp": 1.02307475, "epoch": 0.6472674803102266, "flos": 20521835487360.0, "grad_norm": 2.622466362224973, "language_loss": 0.72253215, "learning_rate": 1.169105558377609e-06, "loss": 0.74451542, "num_input_tokens_seen": 115857420, "step": 5383, "time_per_iteration": 2.6052772998809814 }, { "auxiliary_loss_clip": 0.01114848, "auxiliary_loss_mlp": 0.00762527, "balance_loss_clip": 1.04684377, "balance_loss_mlp": 1.00050974, "epoch": 0.6473877232008658, "flos": 24715447320960.0, "grad_norm": 1.745143412095823, "language_loss": 0.78389323, "learning_rate": 1.1683970553429587e-06, "loss": 0.8026669, "num_input_tokens_seen": 115878875, "step": 5384, "time_per_iteration": 2.7730660438537598 }, { "auxiliary_loss_clip": 0.01132576, "auxiliary_loss_mlp": 0.01031975, "balance_loss_clip": 1.04480267, "balance_loss_mlp": 1.02465534, "epoch": 0.6475079660915048, "flos": 15885552441600.0, "grad_norm": 1.7826656202278064, "language_loss": 0.81935829, "learning_rate": 1.1676886784618128e-06, "loss": 0.84100378, "num_input_tokens_seen": 115895540, "step": 5385, "time_per_iteration": 2.6778366565704346 }, { "auxiliary_loss_clip": 0.01156847, "auxiliary_loss_mlp": 0.01022849, "balance_loss_clip": 1.04699314, "balance_loss_mlp": 1.0151124, "epoch": 0.6476282089821439, "flos": 17381590922880.0, "grad_norm": 2.785706232897928, "language_loss": 0.84173679, "learning_rate": 1.1669804278416332e-06, "loss": 0.86353374, "num_input_tokens_seen": 115910265, "step": 5386, "time_per_iteration": 2.603102207183838 }, { "auxiliary_loss_clip": 0.01149008, "auxiliary_loss_mlp": 0.01027174, "balance_loss_clip": 1.04705393, "balance_loss_mlp": 1.0190084, "epoch": 0.6477484518727831, "flos": 20194078861440.0, "grad_norm": 1.8791672616888675, "language_loss": 0.71299517, "learning_rate": 1.1662723035898602e-06, "loss": 0.73475701, "num_input_tokens_seen": 115930025, "step": 5387, "time_per_iteration": 2.6967480182647705 }, { "auxiliary_loss_clip": 0.01158085, "auxiliary_loss_mlp": 0.01026651, "balance_loss_clip": 1.04805946, "balance_loss_mlp": 1.01870561, "epoch": 0.6478686947634221, "flos": 25410426641280.0, "grad_norm": 1.9906512472541673, "language_loss": 0.82071292, "learning_rate": 1.165564305813915e-06, "loss": 0.84256029, "num_input_tokens_seen": 115949025, "step": 5388, "time_per_iteration": 2.6486003398895264 }, { "auxiliary_loss_clip": 0.01154544, "auxiliary_loss_mlp": 0.01023304, "balance_loss_clip": 1.0463239, "balance_loss_mlp": 1.01638651, "epoch": 0.6479889376540612, "flos": 20083581648000.0, "grad_norm": 1.724494940142618, "language_loss": 0.81310701, "learning_rate": 1.1648564346212019e-06, "loss": 0.83488548, "num_input_tokens_seen": 115968145, "step": 5389, "time_per_iteration": 3.5773353576660156 }, { "auxiliary_loss_clip": 0.01155133, "auxiliary_loss_mlp": 0.01023854, "balance_loss_clip": 1.04952407, "balance_loss_mlp": 1.01712155, "epoch": 0.6481091805447003, "flos": 26758082039040.0, "grad_norm": 2.0638957706907353, "language_loss": 0.76039481, "learning_rate": 1.164148690119104e-06, "loss": 0.78218472, "num_input_tokens_seen": 115989425, "step": 5390, "time_per_iteration": 2.6782243251800537 }, { "auxiliary_loss_clip": 0.01170245, "auxiliary_loss_mlp": 0.01027856, "balance_loss_clip": 1.0484283, "balance_loss_mlp": 1.02068591, "epoch": 0.6482294234353394, "flos": 23952094462080.0, "grad_norm": 2.50375965284962, "language_loss": 0.7417559, "learning_rate": 1.163441072414985e-06, "loss": 0.76373696, "num_input_tokens_seen": 116009630, "step": 5391, "time_per_iteration": 2.625136137008667 }, { "auxiliary_loss_clip": 0.01158311, "auxiliary_loss_mlp": 0.01027435, "balance_loss_clip": 1.04985511, "balance_loss_mlp": 1.02055645, "epoch": 0.6483496663259785, "flos": 26209833776640.0, "grad_norm": 1.8415004086431124, "language_loss": 0.69824743, "learning_rate": 1.16273358161619e-06, "loss": 0.72010493, "num_input_tokens_seen": 116029965, "step": 5392, "time_per_iteration": 3.6492624282836914 }, { "auxiliary_loss_clip": 0.01150966, "auxiliary_loss_mlp": 0.01027096, "balance_loss_clip": 1.04654264, "balance_loss_mlp": 1.01960969, "epoch": 0.6484699092166175, "flos": 20922239370240.0, "grad_norm": 3.1760407728827507, "language_loss": 0.83272099, "learning_rate": 1.1620262178300446e-06, "loss": 0.85450161, "num_input_tokens_seen": 116048580, "step": 5393, "time_per_iteration": 3.5717647075653076 }, { "auxiliary_loss_clip": 0.01129393, "auxiliary_loss_mlp": 0.01024892, "balance_loss_clip": 1.04260683, "balance_loss_mlp": 1.01779866, "epoch": 0.6485901521072567, "flos": 33072865678080.0, "grad_norm": 1.6846624131637096, "language_loss": 0.75879163, "learning_rate": 1.1613189811638563e-06, "loss": 0.78033447, "num_input_tokens_seen": 116070305, "step": 5394, "time_per_iteration": 2.795661449432373 }, { "auxiliary_loss_clip": 0.01160925, "auxiliary_loss_mlp": 0.01020659, "balance_loss_clip": 1.04954934, "balance_loss_mlp": 1.01436806, "epoch": 0.6487103949978957, "flos": 22274060745600.0, "grad_norm": 1.5979339611823231, "language_loss": 0.78150076, "learning_rate": 1.1606118717249117e-06, "loss": 0.80331659, "num_input_tokens_seen": 116090405, "step": 5395, "time_per_iteration": 2.6128180027008057 }, { "auxiliary_loss_clip": 0.01173326, "auxiliary_loss_mlp": 0.01025245, "balance_loss_clip": 1.04685473, "balance_loss_mlp": 1.01746702, "epoch": 0.6488306378885348, "flos": 22930400010240.0, "grad_norm": 1.905316553607197, "language_loss": 0.67845178, "learning_rate": 1.1599048896204787e-06, "loss": 0.70043749, "num_input_tokens_seen": 116110285, "step": 5396, "time_per_iteration": 2.5943233966827393 }, { "auxiliary_loss_clip": 0.0113266, "auxiliary_loss_mlp": 0.01029409, "balance_loss_clip": 1.04477215, "balance_loss_mlp": 1.02202415, "epoch": 0.648950880779174, "flos": 20376110010240.0, "grad_norm": 1.7319305926143758, "language_loss": 0.80941182, "learning_rate": 1.1591980349578061e-06, "loss": 0.83103251, "num_input_tokens_seen": 116128955, "step": 5397, "time_per_iteration": 3.5727624893188477 }, { "auxiliary_loss_clip": 0.01038084, "auxiliary_loss_mlp": 0.0100091, "balance_loss_clip": 1.01132655, "balance_loss_mlp": 0.99975318, "epoch": 0.649071123669813, "flos": 59930889310080.0, "grad_norm": 0.735480428262205, "language_loss": 0.54254055, "learning_rate": 1.158491307844123e-06, "loss": 0.56293058, "num_input_tokens_seen": 116188875, "step": 5398, "time_per_iteration": 3.2304584980010986 }, { "auxiliary_loss_clip": 0.01143708, "auxiliary_loss_mlp": 0.01027668, "balance_loss_clip": 1.04708064, "balance_loss_mlp": 1.02061391, "epoch": 0.6491913665604521, "flos": 20446566537600.0, "grad_norm": 1.6317134464993608, "language_loss": 0.83845091, "learning_rate": 1.1577847083866387e-06, "loss": 0.8601647, "num_input_tokens_seen": 116207910, "step": 5399, "time_per_iteration": 2.709592580795288 }, { "auxiliary_loss_clip": 0.01132397, "auxiliary_loss_mlp": 0.01020527, "balance_loss_clip": 1.04224432, "balance_loss_mlp": 1.01310015, "epoch": 0.6493116094510912, "flos": 16946820702720.0, "grad_norm": 1.7623246428376285, "language_loss": 0.7204386, "learning_rate": 1.1570782366925453e-06, "loss": 0.74196786, "num_input_tokens_seen": 116226425, "step": 5400, "time_per_iteration": 2.642521858215332 }, { "auxiliary_loss_clip": 0.0114257, "auxiliary_loss_mlp": 0.01023367, "balance_loss_clip": 1.04100692, "balance_loss_mlp": 1.01579142, "epoch": 0.6494318523417303, "flos": 18802935072000.0, "grad_norm": 1.715697018064535, "language_loss": 0.75334656, "learning_rate": 1.1563718928690132e-06, "loss": 0.77500594, "num_input_tokens_seen": 116243860, "step": 5401, "time_per_iteration": 2.6233150959014893 }, { "auxiliary_loss_clip": 0.01128021, "auxiliary_loss_mlp": 0.01028948, "balance_loss_clip": 1.04357743, "balance_loss_mlp": 1.02189422, "epoch": 0.6495520952323693, "flos": 18982847318400.0, "grad_norm": 2.011850520844283, "language_loss": 0.71412146, "learning_rate": 1.1556656770231942e-06, "loss": 0.73569107, "num_input_tokens_seen": 116260055, "step": 5402, "time_per_iteration": 2.649190902709961 }, { "auxiliary_loss_clip": 0.01157813, "auxiliary_loss_mlp": 0.01024573, "balance_loss_clip": 1.04456699, "balance_loss_mlp": 1.01775169, "epoch": 0.6496723381230085, "flos": 22745388032640.0, "grad_norm": 2.3851315034547227, "language_loss": 0.76308608, "learning_rate": 1.1549595892622207e-06, "loss": 0.78490996, "num_input_tokens_seen": 116278825, "step": 5403, "time_per_iteration": 2.653712034225464 }, { "auxiliary_loss_clip": 0.01025039, "auxiliary_loss_mlp": 0.01004934, "balance_loss_clip": 1.01533735, "balance_loss_mlp": 1.00368822, "epoch": 0.6497925810136476, "flos": 62145283887360.0, "grad_norm": 0.872415412008942, "language_loss": 0.59044087, "learning_rate": 1.1542536296932047e-06, "loss": 0.61074066, "num_input_tokens_seen": 116342360, "step": 5404, "time_per_iteration": 3.2951741218566895 }, { "auxiliary_loss_clip": 0.01134161, "auxiliary_loss_mlp": 0.01032397, "balance_loss_clip": 1.04115021, "balance_loss_mlp": 1.02464259, "epoch": 0.6499128239042866, "flos": 20156731695360.0, "grad_norm": 1.7964754462553298, "language_loss": 0.70241433, "learning_rate": 1.1535477984232414e-06, "loss": 0.72407991, "num_input_tokens_seen": 116362235, "step": 5405, "time_per_iteration": 2.6729917526245117 }, { "auxiliary_loss_clip": 0.01115545, "auxiliary_loss_mlp": 0.01025985, "balance_loss_clip": 1.03858578, "balance_loss_mlp": 1.01922286, "epoch": 0.6500330667949258, "flos": 24462420940800.0, "grad_norm": 1.9955711547701482, "language_loss": 0.76900685, "learning_rate": 1.152842095559404e-06, "loss": 0.7904222, "num_input_tokens_seen": 116382895, "step": 5406, "time_per_iteration": 2.769484043121338 }, { "auxiliary_loss_clip": 0.01145621, "auxiliary_loss_mlp": 0.01023956, "balance_loss_clip": 1.04306126, "balance_loss_mlp": 1.01708984, "epoch": 0.6501533096855648, "flos": 25477399549440.0, "grad_norm": 1.726778454656137, "language_loss": 0.76670277, "learning_rate": 1.1521365212087474e-06, "loss": 0.7883985, "num_input_tokens_seen": 116402880, "step": 5407, "time_per_iteration": 2.660686492919922 }, { "auxiliary_loss_clip": 0.01158551, "auxiliary_loss_mlp": 0.01019927, "balance_loss_clip": 1.04721785, "balance_loss_mlp": 1.01285779, "epoch": 0.6502735525762039, "flos": 44819245347840.0, "grad_norm": 1.563196949250548, "language_loss": 0.70448095, "learning_rate": 1.1514310754783062e-06, "loss": 0.72626573, "num_input_tokens_seen": 116425830, "step": 5408, "time_per_iteration": 2.8571908473968506 }, { "auxiliary_loss_clip": 0.01147057, "auxiliary_loss_mlp": 0.01026382, "balance_loss_clip": 1.04704273, "balance_loss_mlp": 1.01957548, "epoch": 0.6503937954668431, "flos": 28658546726400.0, "grad_norm": 4.01134766643033, "language_loss": 0.7306217, "learning_rate": 1.1507257584750964e-06, "loss": 0.75235611, "num_input_tokens_seen": 116446010, "step": 5409, "time_per_iteration": 2.6980931758880615 }, { "auxiliary_loss_clip": 0.01172516, "auxiliary_loss_mlp": 0.01028999, "balance_loss_clip": 1.04956305, "balance_loss_mlp": 1.02189422, "epoch": 0.6505140383574821, "flos": 20922562592640.0, "grad_norm": 1.8660198188386592, "language_loss": 0.77320361, "learning_rate": 1.150020570306113e-06, "loss": 0.79521871, "num_input_tokens_seen": 116465150, "step": 5410, "time_per_iteration": 2.6570024490356445 }, { "auxiliary_loss_clip": 0.01135957, "auxiliary_loss_mlp": 0.01026596, "balance_loss_clip": 1.04110074, "balance_loss_mlp": 1.01956892, "epoch": 0.6506342812481212, "flos": 20595236929920.0, "grad_norm": 2.4755844118668, "language_loss": 0.74568045, "learning_rate": 1.1493155110783338e-06, "loss": 0.76730597, "num_input_tokens_seen": 116483675, "step": 5411, "time_per_iteration": 2.6703391075134277 }, { "auxiliary_loss_clip": 0.01158435, "auxiliary_loss_mlp": 0.01024718, "balance_loss_clip": 1.04702044, "balance_loss_mlp": 1.01734543, "epoch": 0.6507545241387603, "flos": 30226478279040.0, "grad_norm": 2.4042505292161764, "language_loss": 0.70207345, "learning_rate": 1.1486105808987155e-06, "loss": 0.72390497, "num_input_tokens_seen": 116505165, "step": 5412, "time_per_iteration": 2.636512517929077 }, { "auxiliary_loss_clip": 0.01160066, "auxiliary_loss_mlp": 0.01027762, "balance_loss_clip": 1.04919744, "balance_loss_mlp": 1.01984644, "epoch": 0.6508747670293994, "flos": 17128241320320.0, "grad_norm": 1.9826676412316855, "language_loss": 0.81387025, "learning_rate": 1.1479057798741947e-06, "loss": 0.83574855, "num_input_tokens_seen": 116523220, "step": 5413, "time_per_iteration": 2.604240894317627 }, { "auxiliary_loss_clip": 0.01057215, "auxiliary_loss_mlp": 0.01001098, "balance_loss_clip": 1.02328157, "balance_loss_mlp": 0.99967939, "epoch": 0.6509950099200384, "flos": 68559826573440.0, "grad_norm": 0.8052093363843152, "language_loss": 0.53339481, "learning_rate": 1.14720110811169e-06, "loss": 0.55397785, "num_input_tokens_seen": 116580450, "step": 5414, "time_per_iteration": 3.2446889877319336 }, { "auxiliary_loss_clip": 0.0116139, "auxiliary_loss_mlp": 0.01027527, "balance_loss_clip": 1.04973483, "balance_loss_mlp": 1.01975131, "epoch": 0.6511152528106776, "flos": 22347462188160.0, "grad_norm": 2.204017208741017, "language_loss": 0.76576138, "learning_rate": 1.146496565718098e-06, "loss": 0.78765059, "num_input_tokens_seen": 116601020, "step": 5415, "time_per_iteration": 3.6857950687408447 }, { "auxiliary_loss_clip": 0.01145992, "auxiliary_loss_mlp": 0.01023834, "balance_loss_clip": 1.04879451, "balance_loss_mlp": 1.01604915, "epoch": 0.6512354957013167, "flos": 20522158709760.0, "grad_norm": 1.8332592368788272, "language_loss": 0.75680029, "learning_rate": 1.1457921528002996e-06, "loss": 0.77849853, "num_input_tokens_seen": 116619455, "step": 5416, "time_per_iteration": 2.700971841812134 }, { "auxiliary_loss_clip": 0.01171104, "auxiliary_loss_mlp": 0.00762073, "balance_loss_clip": 1.04754615, "balance_loss_mlp": 1.00050199, "epoch": 0.6513557385919557, "flos": 32337342881280.0, "grad_norm": 2.6279831354721908, "language_loss": 0.72310966, "learning_rate": 1.1450878694651522e-06, "loss": 0.74244142, "num_input_tokens_seen": 116640020, "step": 5417, "time_per_iteration": 2.7004945278167725 }, { "auxiliary_loss_clip": 0.01115631, "auxiliary_loss_mlp": 0.01023467, "balance_loss_clip": 1.04102659, "balance_loss_mlp": 1.01631427, "epoch": 0.6514759814825949, "flos": 12093206417280.0, "grad_norm": 2.328681465638885, "language_loss": 0.63656169, "learning_rate": 1.1443837158194954e-06, "loss": 0.65795267, "num_input_tokens_seen": 116655165, "step": 5418, "time_per_iteration": 3.667316198348999 }, { "auxiliary_loss_clip": 0.0113202, "auxiliary_loss_mlp": 0.01027569, "balance_loss_clip": 1.04764938, "balance_loss_mlp": 1.02004123, "epoch": 0.651596224373234, "flos": 22526907557760.0, "grad_norm": 1.598869245555437, "language_loss": 0.74887311, "learning_rate": 1.1436796919701484e-06, "loss": 0.77046895, "num_input_tokens_seen": 116673880, "step": 5419, "time_per_iteration": 2.7325007915496826 }, { "auxiliary_loss_clip": 0.01144146, "auxiliary_loss_mlp": 0.01023724, "balance_loss_clip": 1.04707742, "balance_loss_mlp": 1.01643133, "epoch": 0.651716467263873, "flos": 27818955250560.0, "grad_norm": 2.1714434054568383, "language_loss": 0.61942703, "learning_rate": 1.1429757980239115e-06, "loss": 0.64110571, "num_input_tokens_seen": 116694305, "step": 5420, "time_per_iteration": 2.7146098613739014 }, { "auxiliary_loss_clip": 0.01175955, "auxiliary_loss_mlp": 0.0102877, "balance_loss_clip": 1.05051994, "balance_loss_mlp": 1.0206821, "epoch": 0.6518367101545122, "flos": 24316300414080.0, "grad_norm": 3.130492621938327, "language_loss": 0.82160842, "learning_rate": 1.1422720340875636e-06, "loss": 0.84365565, "num_input_tokens_seen": 116713055, "step": 5421, "time_per_iteration": 2.6725056171417236 }, { "auxiliary_loss_clip": 0.0116417, "auxiliary_loss_mlp": 0.01031291, "balance_loss_clip": 1.04841805, "balance_loss_mlp": 1.02323806, "epoch": 0.6519569530451512, "flos": 20011939971840.0, "grad_norm": 2.1480051088584795, "language_loss": 0.78879976, "learning_rate": 1.1415684002678671e-06, "loss": 0.81075442, "num_input_tokens_seen": 116731815, "step": 5422, "time_per_iteration": 3.566349983215332 }, { "auxiliary_loss_clip": 0.01146094, "auxiliary_loss_mlp": 0.01024068, "balance_loss_clip": 1.04419553, "balance_loss_mlp": 1.01646852, "epoch": 0.6520771959357903, "flos": 21576064682880.0, "grad_norm": 2.973947916144434, "language_loss": 0.77853322, "learning_rate": 1.1408648966715617e-06, "loss": 0.80023479, "num_input_tokens_seen": 116749335, "step": 5423, "time_per_iteration": 2.665269613265991 }, { "auxiliary_loss_clip": 0.01140427, "auxiliary_loss_mlp": 0.01030264, "balance_loss_clip": 1.04054153, "balance_loss_mlp": 1.0223124, "epoch": 0.6521974388264293, "flos": 22711021695360.0, "grad_norm": 2.0724395507587934, "language_loss": 0.72666824, "learning_rate": 1.1401615234053683e-06, "loss": 0.74837506, "num_input_tokens_seen": 116768155, "step": 5424, "time_per_iteration": 2.69214129447937 }, { "auxiliary_loss_clip": 0.01144465, "auxiliary_loss_mlp": 0.01024124, "balance_loss_clip": 1.04468095, "balance_loss_mlp": 1.0165422, "epoch": 0.6523176817170685, "flos": 23002939526400.0, "grad_norm": 2.256307187130822, "language_loss": 0.75869381, "learning_rate": 1.1394582805759885e-06, "loss": 0.78037965, "num_input_tokens_seen": 116787435, "step": 5425, "time_per_iteration": 2.7005515098571777 }, { "auxiliary_loss_clip": 0.01158407, "auxiliary_loss_mlp": 0.0102455, "balance_loss_clip": 1.04829323, "balance_loss_mlp": 1.01689935, "epoch": 0.6524379246077076, "flos": 21688249835520.0, "grad_norm": 14.192135172050383, "language_loss": 0.75780183, "learning_rate": 1.1387551682901022e-06, "loss": 0.77963144, "num_input_tokens_seen": 116808040, "step": 5426, "time_per_iteration": 2.585571765899658 }, { "auxiliary_loss_clip": 0.01128893, "auxiliary_loss_mlp": 0.01024659, "balance_loss_clip": 1.04469848, "balance_loss_mlp": 1.01715469, "epoch": 0.6525581674983466, "flos": 19390936711680.0, "grad_norm": 1.7720465234051483, "language_loss": 0.70572406, "learning_rate": 1.138052186654373e-06, "loss": 0.72725958, "num_input_tokens_seen": 116825510, "step": 5427, "time_per_iteration": 2.826972246170044 }, { "auxiliary_loss_clip": 0.01144647, "auxiliary_loss_mlp": 0.01029423, "balance_loss_clip": 1.04493654, "balance_loss_mlp": 1.02131665, "epoch": 0.6526784103889858, "flos": 17165444832000.0, "grad_norm": 2.048287840030225, "language_loss": 0.88153243, "learning_rate": 1.1373493357754417e-06, "loss": 0.90327311, "num_input_tokens_seen": 116844415, "step": 5428, "time_per_iteration": 2.656383991241455 }, { "auxiliary_loss_clip": 0.01171794, "auxiliary_loss_mlp": 0.01023824, "balance_loss_clip": 1.04709828, "balance_loss_mlp": 1.01684463, "epoch": 0.6527986532796248, "flos": 18989168112000.0, "grad_norm": 3.12947405051866, "language_loss": 0.7733174, "learning_rate": 1.1366466157599303e-06, "loss": 0.7952736, "num_input_tokens_seen": 116863690, "step": 5429, "time_per_iteration": 2.6151983737945557 }, { "auxiliary_loss_clip": 0.01110877, "auxiliary_loss_mlp": 0.00762361, "balance_loss_clip": 1.04040897, "balance_loss_mlp": 1.00063884, "epoch": 0.6529188961702639, "flos": 14238581011200.0, "grad_norm": 1.9820861923337434, "language_loss": 0.76243013, "learning_rate": 1.1359440267144412e-06, "loss": 0.7811625, "num_input_tokens_seen": 116881145, "step": 5430, "time_per_iteration": 2.710097312927246 }, { "auxiliary_loss_clip": 0.01159032, "auxiliary_loss_mlp": 0.01029199, "balance_loss_clip": 1.04650855, "balance_loss_mlp": 1.02196002, "epoch": 0.653039139060903, "flos": 36682929158400.0, "grad_norm": 1.9852106600436288, "language_loss": 0.74370182, "learning_rate": 1.1352415687455556e-06, "loss": 0.76558411, "num_input_tokens_seen": 116902405, "step": 5431, "time_per_iteration": 2.7089178562164307 }, { "auxiliary_loss_clip": 0.01158992, "auxiliary_loss_mlp": 0.01030857, "balance_loss_clip": 1.04902005, "balance_loss_mlp": 1.0231142, "epoch": 0.6531593819515421, "flos": 25376275785600.0, "grad_norm": 1.9541790218140125, "language_loss": 0.62939543, "learning_rate": 1.1345392419598362e-06, "loss": 0.65129387, "num_input_tokens_seen": 116921285, "step": 5432, "time_per_iteration": 2.644500732421875 }, { "auxiliary_loss_clip": 0.01152708, "auxiliary_loss_mlp": 0.01025972, "balance_loss_clip": 1.04670835, "balance_loss_mlp": 1.01852751, "epoch": 0.6532796248421812, "flos": 21178533888000.0, "grad_norm": 2.4586799907585264, "language_loss": 0.71873152, "learning_rate": 1.1338370464638263e-06, "loss": 0.74051833, "num_input_tokens_seen": 116940685, "step": 5433, "time_per_iteration": 2.6236720085144043 }, { "auxiliary_loss_clip": 0.01170687, "auxiliary_loss_mlp": 0.01024599, "balance_loss_clip": 1.04627526, "balance_loss_mlp": 1.01722264, "epoch": 0.6533998677328203, "flos": 17675950878720.0, "grad_norm": 2.3490334832223594, "language_loss": 0.63760912, "learning_rate": 1.1331349823640474e-06, "loss": 0.65956199, "num_input_tokens_seen": 116958115, "step": 5434, "time_per_iteration": 2.526662588119507 }, { "auxiliary_loss_clip": 0.01161081, "auxiliary_loss_mlp": 0.00761872, "balance_loss_clip": 1.04820263, "balance_loss_mlp": 1.00054562, "epoch": 0.6535201106234594, "flos": 28400384701440.0, "grad_norm": 2.0875325874195707, "language_loss": 0.77917004, "learning_rate": 1.132433049767003e-06, "loss": 0.79839957, "num_input_tokens_seen": 116976030, "step": 5435, "time_per_iteration": 2.6820363998413086 }, { "auxiliary_loss_clip": 0.01139829, "auxiliary_loss_mlp": 0.01021041, "balance_loss_clip": 1.04427826, "balance_loss_mlp": 1.01446319, "epoch": 0.6536403535140984, "flos": 23586667447680.0, "grad_norm": 2.4718569205573404, "language_loss": 0.81296974, "learning_rate": 1.1317312487791748e-06, "loss": 0.83457845, "num_input_tokens_seen": 116997680, "step": 5436, "time_per_iteration": 2.684680700302124 }, { "auxiliary_loss_clip": 0.01150623, "auxiliary_loss_mlp": 0.01024661, "balance_loss_clip": 1.04379344, "balance_loss_mlp": 1.01702011, "epoch": 0.6537605964047376, "flos": 21579476474880.0, "grad_norm": 2.1886455924998667, "language_loss": 0.72962093, "learning_rate": 1.1310295795070253e-06, "loss": 0.75137377, "num_input_tokens_seen": 117017620, "step": 5437, "time_per_iteration": 2.611844062805176 }, { "auxiliary_loss_clip": 0.01118673, "auxiliary_loss_mlp": 0.01023476, "balance_loss_clip": 1.04174495, "balance_loss_mlp": 1.01654983, "epoch": 0.6538808392953767, "flos": 26833997433600.0, "grad_norm": 1.8512680484387822, "language_loss": 0.80941689, "learning_rate": 1.1303280420569982e-06, "loss": 0.83083838, "num_input_tokens_seen": 117039505, "step": 5438, "time_per_iteration": 2.762617826461792 }, { "auxiliary_loss_clip": 0.01149272, "auxiliary_loss_mlp": 0.01023871, "balance_loss_clip": 1.04278326, "balance_loss_mlp": 1.01659274, "epoch": 0.6540010821860157, "flos": 30738241301760.0, "grad_norm": 1.6508634136155322, "language_loss": 0.77544975, "learning_rate": 1.1296266365355158e-06, "loss": 0.79718113, "num_input_tokens_seen": 117062890, "step": 5439, "time_per_iteration": 2.7710511684417725 }, { "auxiliary_loss_clip": 0.01133261, "auxiliary_loss_mlp": 0.01027653, "balance_loss_clip": 1.0453527, "balance_loss_mlp": 1.0199579, "epoch": 0.6541213250766549, "flos": 26907147480960.0, "grad_norm": 1.9747510129372408, "language_loss": 0.73850209, "learning_rate": 1.1289253630489806e-06, "loss": 0.76011121, "num_input_tokens_seen": 117083940, "step": 5440, "time_per_iteration": 2.748257637023926 }, { "auxiliary_loss_clip": 0.01164017, "auxiliary_loss_mlp": 0.01027785, "balance_loss_clip": 1.0468595, "balance_loss_mlp": 1.0198338, "epoch": 0.6542415679672939, "flos": 19172384409600.0, "grad_norm": 7.235445362496793, "language_loss": 0.72368771, "learning_rate": 1.1282242217037753e-06, "loss": 0.74560571, "num_input_tokens_seen": 117101440, "step": 5441, "time_per_iteration": 3.5673255920410156 }, { "auxiliary_loss_clip": 0.01106425, "auxiliary_loss_mlp": 0.0103019, "balance_loss_clip": 1.03682756, "balance_loss_mlp": 1.02189291, "epoch": 0.654361810857933, "flos": 48173517100800.0, "grad_norm": 2.039002463636506, "language_loss": 0.61901879, "learning_rate": 1.127523212606262e-06, "loss": 0.64038491, "num_input_tokens_seen": 117124265, "step": 5442, "time_per_iteration": 3.021934986114502 }, { "auxiliary_loss_clip": 0.01155248, "auxiliary_loss_mlp": 0.01023198, "balance_loss_clip": 1.0454073, "balance_loss_mlp": 1.01627851, "epoch": 0.6544820537485722, "flos": 26943165843840.0, "grad_norm": 1.562868577499215, "language_loss": 0.73021519, "learning_rate": 1.1268223358627835e-06, "loss": 0.75199974, "num_input_tokens_seen": 117146755, "step": 5443, "time_per_iteration": 2.7337329387664795 }, { "auxiliary_loss_clip": 0.01173182, "auxiliary_loss_mlp": 0.01024453, "balance_loss_clip": 1.04843426, "balance_loss_mlp": 1.01687121, "epoch": 0.6546022966392112, "flos": 20886328748160.0, "grad_norm": 1.92902295729455, "language_loss": 0.71977317, "learning_rate": 1.126121591579663e-06, "loss": 0.74174953, "num_input_tokens_seen": 117165960, "step": 5444, "time_per_iteration": 3.609884262084961 }, { "auxiliary_loss_clip": 0.01156524, "auxiliary_loss_mlp": 0.01028184, "balance_loss_clip": 1.04848206, "balance_loss_mlp": 1.02129102, "epoch": 0.6547225395298503, "flos": 24936693143040.0, "grad_norm": 1.8844845001010355, "language_loss": 0.69023132, "learning_rate": 1.1254209798632018e-06, "loss": 0.71207845, "num_input_tokens_seen": 117186980, "step": 5445, "time_per_iteration": 3.633784532546997 }, { "auxiliary_loss_clip": 0.01091987, "auxiliary_loss_mlp": 0.01024385, "balance_loss_clip": 1.03946352, "balance_loss_mlp": 1.01717305, "epoch": 0.6548427824204894, "flos": 22565942663040.0, "grad_norm": 1.8656957040198157, "language_loss": 0.84995329, "learning_rate": 1.124720500819683e-06, "loss": 0.87111706, "num_input_tokens_seen": 117205135, "step": 5446, "time_per_iteration": 2.8088340759277344 }, { "auxiliary_loss_clip": 0.01171817, "auxiliary_loss_mlp": 0.01028724, "balance_loss_clip": 1.04843223, "balance_loss_mlp": 1.02078414, "epoch": 0.6549630253111285, "flos": 18442500048000.0, "grad_norm": 1.979403183696688, "language_loss": 0.81863779, "learning_rate": 1.1240201545553682e-06, "loss": 0.84064317, "num_input_tokens_seen": 117222935, "step": 5447, "time_per_iteration": 2.6181750297546387 }, { "auxiliary_loss_clip": 0.01127831, "auxiliary_loss_mlp": 0.01023802, "balance_loss_clip": 1.04406905, "balance_loss_mlp": 1.01694751, "epoch": 0.6550832682017675, "flos": 25187313312000.0, "grad_norm": 1.8741119284952055, "language_loss": 0.73022419, "learning_rate": 1.1233199411764987e-06, "loss": 0.75174052, "num_input_tokens_seen": 117242370, "step": 5448, "time_per_iteration": 3.6163389682769775 }, { "auxiliary_loss_clip": 0.01114659, "auxiliary_loss_mlp": 0.01024741, "balance_loss_clip": 1.03919482, "balance_loss_mlp": 1.01771379, "epoch": 0.6552035110924067, "flos": 22748153379840.0, "grad_norm": 1.87866204952456, "language_loss": 0.69066441, "learning_rate": 1.1226198607892978e-06, "loss": 0.71205837, "num_input_tokens_seen": 117262930, "step": 5449, "time_per_iteration": 2.655757188796997 }, { "auxiliary_loss_clip": 0.01116352, "auxiliary_loss_mlp": 0.0102607, "balance_loss_clip": 1.04289889, "balance_loss_mlp": 1.01896548, "epoch": 0.6553237539830458, "flos": 21799178012160.0, "grad_norm": 1.8271886080493103, "language_loss": 0.79738712, "learning_rate": 1.1219199134999664e-06, "loss": 0.81881136, "num_input_tokens_seen": 117281430, "step": 5450, "time_per_iteration": 2.7710142135620117 }, { "auxiliary_loss_clip": 0.01146221, "auxiliary_loss_mlp": 0.01030534, "balance_loss_clip": 1.0451349, "balance_loss_mlp": 1.02238011, "epoch": 0.6554439968736848, "flos": 20887226588160.0, "grad_norm": 2.643118109770891, "language_loss": 0.78683543, "learning_rate": 1.1212200994146863e-06, "loss": 0.80860293, "num_input_tokens_seen": 117299185, "step": 5451, "time_per_iteration": 2.628506660461426 }, { "auxiliary_loss_clip": 0.01123432, "auxiliary_loss_mlp": 0.01023022, "balance_loss_clip": 1.03740525, "balance_loss_mlp": 1.01577997, "epoch": 0.655564239764324, "flos": 16139045698560.0, "grad_norm": 1.8898299904941038, "language_loss": 0.75610709, "learning_rate": 1.120520418639618e-06, "loss": 0.77757162, "num_input_tokens_seen": 117317720, "step": 5452, "time_per_iteration": 2.7033801078796387 }, { "auxiliary_loss_clip": 0.01159936, "auxiliary_loss_mlp": 0.01028286, "balance_loss_clip": 1.04898763, "balance_loss_mlp": 1.02123833, "epoch": 0.655684482654963, "flos": 29570354496000.0, "grad_norm": 2.0669862359186904, "language_loss": 0.83199477, "learning_rate": 1.119820871280903e-06, "loss": 0.85387707, "num_input_tokens_seen": 117338795, "step": 5453, "time_per_iteration": 2.7102773189544678 }, { "auxiliary_loss_clip": 0.01156932, "auxiliary_loss_mlp": 0.01028254, "balance_loss_clip": 1.04686296, "balance_loss_mlp": 1.02087843, "epoch": 0.6558047255456021, "flos": 29789409588480.0, "grad_norm": 1.9247218060188158, "language_loss": 0.73231924, "learning_rate": 1.1191214574446614e-06, "loss": 0.75417113, "num_input_tokens_seen": 117359040, "step": 5454, "time_per_iteration": 2.6666131019592285 }, { "auxiliary_loss_clip": 0.01137471, "auxiliary_loss_mlp": 0.01028082, "balance_loss_clip": 1.04276526, "balance_loss_mlp": 1.02043438, "epoch": 0.6559249684362413, "flos": 29059166090880.0, "grad_norm": 1.4907246106809866, "language_loss": 0.80146497, "learning_rate": 1.118422177236995e-06, "loss": 0.82312047, "num_input_tokens_seen": 117380865, "step": 5455, "time_per_iteration": 2.7533791065216064 }, { "auxiliary_loss_clip": 0.01144411, "auxiliary_loss_mlp": 0.01029918, "balance_loss_clip": 1.04467463, "balance_loss_mlp": 1.02224672, "epoch": 0.6560452113268803, "flos": 20225464369920.0, "grad_norm": 2.2121812929439137, "language_loss": 0.85582626, "learning_rate": 1.1177230307639835e-06, "loss": 0.87756956, "num_input_tokens_seen": 117398405, "step": 5456, "time_per_iteration": 2.679054021835327 }, { "auxiliary_loss_clip": 0.0112437, "auxiliary_loss_mlp": 0.01029358, "balance_loss_clip": 1.04262471, "balance_loss_mlp": 1.02138281, "epoch": 0.6561654542175194, "flos": 25045538330880.0, "grad_norm": 1.6528735227835665, "language_loss": 0.79051501, "learning_rate": 1.1170240181316865e-06, "loss": 0.81205225, "num_input_tokens_seen": 117419850, "step": 5457, "time_per_iteration": 2.8566107749938965 }, { "auxiliary_loss_clip": 0.01123401, "auxiliary_loss_mlp": 0.01026631, "balance_loss_clip": 1.04012227, "balance_loss_mlp": 1.01878715, "epoch": 0.6562856971081584, "flos": 22856711258880.0, "grad_norm": 2.1949371381820093, "language_loss": 0.79118311, "learning_rate": 1.1163251394461442e-06, "loss": 0.81268346, "num_input_tokens_seen": 117438330, "step": 5458, "time_per_iteration": 2.6920738220214844 }, { "auxiliary_loss_clip": 0.01156233, "auxiliary_loss_mlp": 0.01025707, "balance_loss_clip": 1.04736137, "balance_loss_mlp": 1.01796389, "epoch": 0.6564059399987976, "flos": 18872565586560.0, "grad_norm": 1.9913528696587777, "language_loss": 0.82567966, "learning_rate": 1.1156263948133746e-06, "loss": 0.84749907, "num_input_tokens_seen": 117454985, "step": 5459, "time_per_iteration": 2.632329225540161 }, { "auxiliary_loss_clip": 0.01106778, "auxiliary_loss_mlp": 0.00762426, "balance_loss_clip": 1.0408628, "balance_loss_mlp": 1.00059128, "epoch": 0.6565261828894366, "flos": 25484187219840.0, "grad_norm": 1.78046461095855, "language_loss": 0.77948117, "learning_rate": 1.1149277843393787e-06, "loss": 0.79817313, "num_input_tokens_seen": 117476145, "step": 5460, "time_per_iteration": 2.750566244125366 }, { "auxiliary_loss_clip": 0.01096152, "auxiliary_loss_mlp": 0.00763103, "balance_loss_clip": 1.03509784, "balance_loss_mlp": 1.0006268, "epoch": 0.6566464257800757, "flos": 19683500987520.0, "grad_norm": 2.592483128713835, "language_loss": 0.63285899, "learning_rate": 1.1142293081301342e-06, "loss": 0.65145159, "num_input_tokens_seen": 117494025, "step": 5461, "time_per_iteration": 2.8487167358398438 }, { "auxiliary_loss_clip": 0.01140657, "auxiliary_loss_mlp": 0.0102043, "balance_loss_clip": 1.04515111, "balance_loss_mlp": 1.01409757, "epoch": 0.6567666686707149, "flos": 23514127931520.0, "grad_norm": 1.5406863366007868, "language_loss": 0.67606211, "learning_rate": 1.1135309662915995e-06, "loss": 0.69767296, "num_input_tokens_seen": 117514190, "step": 5462, "time_per_iteration": 2.682324171066284 }, { "auxiliary_loss_clip": 0.01118302, "auxiliary_loss_mlp": 0.01020956, "balance_loss_clip": 1.03891563, "balance_loss_mlp": 1.0138036, "epoch": 0.6568869115613539, "flos": 32781342896640.0, "grad_norm": 2.137361897536097, "language_loss": 0.60419106, "learning_rate": 1.112832758929712e-06, "loss": 0.62558365, "num_input_tokens_seen": 117536800, "step": 5463, "time_per_iteration": 2.9119882583618164 }, { "auxiliary_loss_clip": 0.01155892, "auxiliary_loss_mlp": 0.01026229, "balance_loss_clip": 1.04612386, "balance_loss_mlp": 1.0185281, "epoch": 0.657007154451993, "flos": 18442428220800.0, "grad_norm": 1.7851865878587936, "language_loss": 0.74959987, "learning_rate": 1.11213468615039e-06, "loss": 0.77142107, "num_input_tokens_seen": 117556230, "step": 5464, "time_per_iteration": 2.647881031036377 }, { "auxiliary_loss_clip": 0.01098081, "auxiliary_loss_mlp": 0.01026833, "balance_loss_clip": 1.03952932, "balance_loss_mlp": 1.01974642, "epoch": 0.6571273973426321, "flos": 25156717902720.0, "grad_norm": 1.6952970761632329, "language_loss": 0.75329107, "learning_rate": 1.1114367480595292e-06, "loss": 0.77454019, "num_input_tokens_seen": 117577310, "step": 5465, "time_per_iteration": 2.7637722492218018 }, { "auxiliary_loss_clip": 0.01101648, "auxiliary_loss_mlp": 0.01029454, "balance_loss_clip": 1.04095185, "balance_loss_mlp": 1.02160454, "epoch": 0.6572476402332712, "flos": 17529830352000.0, "grad_norm": 2.2240586006309724, "language_loss": 0.81227696, "learning_rate": 1.1107389447630086e-06, "loss": 0.833588, "num_input_tokens_seen": 117596010, "step": 5466, "time_per_iteration": 2.7606360912323 }, { "auxiliary_loss_clip": 0.01138469, "auxiliary_loss_mlp": 0.00762392, "balance_loss_clip": 1.04174888, "balance_loss_mlp": 1.00056565, "epoch": 0.6573678831239103, "flos": 17014260487680.0, "grad_norm": 2.1024014960869915, "language_loss": 0.78359342, "learning_rate": 1.1100412763666818e-06, "loss": 0.80260205, "num_input_tokens_seen": 117611270, "step": 5467, "time_per_iteration": 3.6081461906433105 }, { "auxiliary_loss_clip": 0.011447, "auxiliary_loss_mlp": 0.01029111, "balance_loss_clip": 1.04576194, "balance_loss_mlp": 1.02168417, "epoch": 0.6574881260145494, "flos": 23910078528000.0, "grad_norm": 1.500033296809999, "language_loss": 0.80108434, "learning_rate": 1.1093437429763865e-06, "loss": 0.82282245, "num_input_tokens_seen": 117631535, "step": 5468, "time_per_iteration": 2.907742500305176 }, { "auxiliary_loss_clip": 0.0115705, "auxiliary_loss_mlp": 0.0102122, "balance_loss_clip": 1.04665971, "balance_loss_mlp": 1.013623, "epoch": 0.6576083689051885, "flos": 11218458504960.0, "grad_norm": 1.985508983726818, "language_loss": 0.73289478, "learning_rate": 1.1086463446979361e-06, "loss": 0.75467747, "num_input_tokens_seen": 117649885, "step": 5469, "time_per_iteration": 3.612353801727295 }, { "auxiliary_loss_clip": 0.01161031, "auxiliary_loss_mlp": 0.01025187, "balance_loss_clip": 1.04951799, "balance_loss_mlp": 1.01738501, "epoch": 0.6577286117958275, "flos": 22455553190400.0, "grad_norm": 1.8283972849280057, "language_loss": 0.77487117, "learning_rate": 1.1079490816371277e-06, "loss": 0.79673332, "num_input_tokens_seen": 117669650, "step": 5470, "time_per_iteration": 3.594592571258545 }, { "auxiliary_loss_clip": 0.01158167, "auxiliary_loss_mlp": 0.00762435, "balance_loss_clip": 1.04612708, "balance_loss_mlp": 1.0005641, "epoch": 0.6578488546864667, "flos": 21872184405120.0, "grad_norm": 1.9841601941796894, "language_loss": 0.7479279, "learning_rate": 1.1072519538997352e-06, "loss": 0.76713395, "num_input_tokens_seen": 117688790, "step": 5471, "time_per_iteration": 2.711440324783325 }, { "auxiliary_loss_clip": 0.01145636, "auxiliary_loss_mlp": 0.01021532, "balance_loss_clip": 1.04342043, "balance_loss_mlp": 1.01401544, "epoch": 0.6579690975771058, "flos": 23543753673600.0, "grad_norm": 2.8462165881849573, "language_loss": 0.82188445, "learning_rate": 1.1065549615915095e-06, "loss": 0.84355617, "num_input_tokens_seen": 117708620, "step": 5472, "time_per_iteration": 2.760585069656372 }, { "auxiliary_loss_clip": 0.01160081, "auxiliary_loss_mlp": 0.01028413, "balance_loss_clip": 1.0508858, "balance_loss_mlp": 1.02101862, "epoch": 0.6580893404677448, "flos": 32743995730560.0, "grad_norm": 2.6766810270575276, "language_loss": 0.78588343, "learning_rate": 1.105858104818187e-06, "loss": 0.80776846, "num_input_tokens_seen": 117729775, "step": 5473, "time_per_iteration": 2.7282819747924805 }, { "auxiliary_loss_clip": 0.01161712, "auxiliary_loss_mlp": 0.01030131, "balance_loss_clip": 1.04681206, "balance_loss_mlp": 1.02187562, "epoch": 0.658209583358384, "flos": 15888138220800.0, "grad_norm": 2.5388165388070534, "language_loss": 0.74683529, "learning_rate": 1.105161383685478e-06, "loss": 0.76875371, "num_input_tokens_seen": 117746160, "step": 5474, "time_per_iteration": 3.528578758239746 }, { "auxiliary_loss_clip": 0.01040328, "auxiliary_loss_mlp": 0.01004542, "balance_loss_clip": 1.01474738, "balance_loss_mlp": 1.00326681, "epoch": 0.658329826249023, "flos": 62695902447360.0, "grad_norm": 0.7225109128915901, "language_loss": 0.56300116, "learning_rate": 1.1044647982990771e-06, "loss": 0.58344984, "num_input_tokens_seen": 117808045, "step": 5475, "time_per_iteration": 3.2037813663482666 }, { "auxiliary_loss_clip": 0.01144841, "auxiliary_loss_mlp": 0.01026915, "balance_loss_clip": 1.04602408, "balance_loss_mlp": 1.01889825, "epoch": 0.6584500691396621, "flos": 31722624501120.0, "grad_norm": 2.4479948901781903, "language_loss": 0.64614105, "learning_rate": 1.1037683487646536e-06, "loss": 0.6678586, "num_input_tokens_seen": 117828330, "step": 5476, "time_per_iteration": 2.7692580223083496 }, { "auxiliary_loss_clip": 0.01141904, "auxiliary_loss_mlp": 0.00762235, "balance_loss_clip": 1.04712749, "balance_loss_mlp": 1.00052738, "epoch": 0.6585703120303013, "flos": 18406086635520.0, "grad_norm": 2.126267399094209, "language_loss": 0.77087557, "learning_rate": 1.1030720351878583e-06, "loss": 0.78991699, "num_input_tokens_seen": 117846450, "step": 5477, "time_per_iteration": 2.664565324783325 }, { "auxiliary_loss_clip": 0.01056041, "auxiliary_loss_mlp": 0.01004171, "balance_loss_clip": 1.01599836, "balance_loss_mlp": 1.00284219, "epoch": 0.6586905549209403, "flos": 58309880434560.0, "grad_norm": 0.811914981546454, "language_loss": 0.57625175, "learning_rate": 1.102375857674323e-06, "loss": 0.59685385, "num_input_tokens_seen": 117908365, "step": 5478, "time_per_iteration": 3.239750862121582 }, { "auxiliary_loss_clip": 0.0114321, "auxiliary_loss_mlp": 0.01023213, "balance_loss_clip": 1.04405212, "balance_loss_mlp": 1.01624179, "epoch": 0.6588107978115794, "flos": 22782627457920.0, "grad_norm": 1.781733151578761, "language_loss": 0.90316474, "learning_rate": 1.1016798163296561e-06, "loss": 0.92482889, "num_input_tokens_seen": 117927565, "step": 5479, "time_per_iteration": 2.743032455444336 }, { "auxiliary_loss_clip": 0.01159236, "auxiliary_loss_mlp": 0.01024034, "balance_loss_clip": 1.04555297, "balance_loss_mlp": 1.01663709, "epoch": 0.6589310407022185, "flos": 20667525050880.0, "grad_norm": 1.9220757351270759, "language_loss": 0.66131699, "learning_rate": 1.1009839112594471e-06, "loss": 0.68314964, "num_input_tokens_seen": 117945590, "step": 5480, "time_per_iteration": 2.5985333919525146 }, { "auxiliary_loss_clip": 0.01161761, "auxiliary_loss_mlp": 0.01021811, "balance_loss_clip": 1.04692984, "balance_loss_mlp": 1.01464379, "epoch": 0.6590512835928576, "flos": 25630595055360.0, "grad_norm": 2.07095634426907, "language_loss": 0.71949542, "learning_rate": 1.1002881425692638e-06, "loss": 0.74133122, "num_input_tokens_seen": 117966020, "step": 5481, "time_per_iteration": 2.7071070671081543 }, { "auxiliary_loss_clip": 0.01152304, "auxiliary_loss_mlp": 0.01026735, "balance_loss_clip": 1.04462111, "balance_loss_mlp": 1.01943922, "epoch": 0.6591715264834966, "flos": 23726108044800.0, "grad_norm": 1.848814859607197, "language_loss": 0.75193357, "learning_rate": 1.0995925103646532e-06, "loss": 0.77372402, "num_input_tokens_seen": 117984620, "step": 5482, "time_per_iteration": 2.6249442100524902 }, { "auxiliary_loss_clip": 0.01125295, "auxiliary_loss_mlp": 0.01023902, "balance_loss_clip": 1.0446291, "balance_loss_mlp": 1.01704466, "epoch": 0.6592917693741358, "flos": 35773850822400.0, "grad_norm": 1.6201716615860045, "language_loss": 0.67212021, "learning_rate": 1.0988970147511437e-06, "loss": 0.69361216, "num_input_tokens_seen": 118006500, "step": 5483, "time_per_iteration": 2.976287603378296 }, { "auxiliary_loss_clip": 0.01144515, "auxiliary_loss_mlp": 0.01023693, "balance_loss_clip": 1.04769921, "balance_loss_mlp": 1.01637316, "epoch": 0.6594120122647749, "flos": 21396834794880.0, "grad_norm": 2.121134265993353, "language_loss": 0.80464113, "learning_rate": 1.0982016558342405e-06, "loss": 0.82632327, "num_input_tokens_seen": 118025470, "step": 5484, "time_per_iteration": 2.6980535984039307 }, { "auxiliary_loss_clip": 0.01174448, "auxiliary_loss_mlp": 0.01022196, "balance_loss_clip": 1.05054033, "balance_loss_mlp": 1.01434946, "epoch": 0.6595322551554139, "flos": 19351829779200.0, "grad_norm": 2.2219558952327687, "language_loss": 0.71317214, "learning_rate": 1.0975064337194291e-06, "loss": 0.73513854, "num_input_tokens_seen": 118043515, "step": 5485, "time_per_iteration": 2.656280994415283 }, { "auxiliary_loss_clip": 0.01122892, "auxiliary_loss_mlp": 0.01024897, "balance_loss_clip": 1.04209244, "balance_loss_mlp": 1.01781559, "epoch": 0.6596524980460531, "flos": 16837113588480.0, "grad_norm": 1.576516265155251, "language_loss": 0.70254576, "learning_rate": 1.0968113485121743e-06, "loss": 0.7240237, "num_input_tokens_seen": 118063105, "step": 5486, "time_per_iteration": 2.7931225299835205 }, { "auxiliary_loss_clip": 0.01160024, "auxiliary_loss_mlp": 0.00763121, "balance_loss_clip": 1.04546154, "balance_loss_mlp": 1.00047016, "epoch": 0.6597727409366921, "flos": 21798567480960.0, "grad_norm": 2.432777151933775, "language_loss": 0.79819953, "learning_rate": 1.0961164003179185e-06, "loss": 0.81743097, "num_input_tokens_seen": 118081615, "step": 5487, "time_per_iteration": 2.6206037998199463 }, { "auxiliary_loss_clip": 0.01129307, "auxiliary_loss_mlp": 0.01028684, "balance_loss_clip": 1.04364777, "balance_loss_mlp": 1.02076876, "epoch": 0.6598929838273312, "flos": 23730704985600.0, "grad_norm": 1.8526709003714423, "language_loss": 0.84030318, "learning_rate": 1.0954215892420884e-06, "loss": 0.86188304, "num_input_tokens_seen": 118102315, "step": 5488, "time_per_iteration": 2.714679718017578 }, { "auxiliary_loss_clip": 0.01136802, "auxiliary_loss_mlp": 0.01026906, "balance_loss_clip": 1.04749644, "balance_loss_mlp": 1.01908588, "epoch": 0.6600132267179702, "flos": 19974520978560.0, "grad_norm": 1.591750087384913, "language_loss": 0.70661318, "learning_rate": 1.094726915390082e-06, "loss": 0.72825027, "num_input_tokens_seen": 118120650, "step": 5489, "time_per_iteration": 2.6814777851104736 }, { "auxiliary_loss_clip": 0.01161526, "auxiliary_loss_mlp": 0.01029809, "balance_loss_clip": 1.04956603, "balance_loss_mlp": 1.02231634, "epoch": 0.6601334696086094, "flos": 22342649765760.0, "grad_norm": 2.188090733610378, "language_loss": 0.6966483, "learning_rate": 1.0940323788672836e-06, "loss": 0.71856165, "num_input_tokens_seen": 118139825, "step": 5490, "time_per_iteration": 2.6518373489379883 }, { "auxiliary_loss_clip": 0.01155279, "auxiliary_loss_mlp": 0.01026078, "balance_loss_clip": 1.04761505, "balance_loss_mlp": 1.01876736, "epoch": 0.6602537124992485, "flos": 25703098657920.0, "grad_norm": 1.7186666304886211, "language_loss": 0.73776901, "learning_rate": 1.0933379797790522e-06, "loss": 0.75958258, "num_input_tokens_seen": 118159240, "step": 5491, "time_per_iteration": 2.6417956352233887 }, { "auxiliary_loss_clip": 0.01174063, "auxiliary_loss_mlp": 0.01033365, "balance_loss_clip": 1.05069685, "balance_loss_mlp": 1.02617669, "epoch": 0.6603739553898875, "flos": 25848572739840.0, "grad_norm": 2.4435977981069645, "language_loss": 0.71681023, "learning_rate": 1.0926437182307293e-06, "loss": 0.73888451, "num_input_tokens_seen": 118178050, "step": 5492, "time_per_iteration": 2.71757173538208 }, { "auxiliary_loss_clip": 0.01148584, "auxiliary_loss_mlp": 0.0102771, "balance_loss_clip": 1.0444417, "balance_loss_mlp": 1.02022111, "epoch": 0.6604941982805267, "flos": 24570296461440.0, "grad_norm": 2.0339032195782836, "language_loss": 0.78528816, "learning_rate": 1.0919495943276338e-06, "loss": 0.80705106, "num_input_tokens_seen": 118199070, "step": 5493, "time_per_iteration": 3.571887969970703 }, { "auxiliary_loss_clip": 0.01131805, "auxiliary_loss_mlp": 0.01025348, "balance_loss_clip": 1.04094172, "balance_loss_mlp": 1.01678252, "epoch": 0.6606144411711657, "flos": 13261775581440.0, "grad_norm": 2.7133279835832464, "language_loss": 0.76680374, "learning_rate": 1.0912556081750611e-06, "loss": 0.78837526, "num_input_tokens_seen": 118217000, "step": 5494, "time_per_iteration": 2.747368574142456 }, { "auxiliary_loss_clip": 0.01139339, "auxiliary_loss_mlp": 0.01026373, "balance_loss_clip": 1.04573631, "balance_loss_mlp": 1.01910686, "epoch": 0.6607346840618048, "flos": 25155281358720.0, "grad_norm": 2.065491172661617, "language_loss": 0.76419115, "learning_rate": 1.0905617598782909e-06, "loss": 0.78584832, "num_input_tokens_seen": 118237205, "step": 5495, "time_per_iteration": 3.659820318222046 }, { "auxiliary_loss_clip": 0.01109052, "auxiliary_loss_mlp": 0.01029537, "balance_loss_clip": 1.04073954, "balance_loss_mlp": 1.02217257, "epoch": 0.660854926952444, "flos": 17638029095040.0, "grad_norm": 1.9292010475395294, "language_loss": 0.81465316, "learning_rate": 1.0898680495425775e-06, "loss": 0.83603907, "num_input_tokens_seen": 118255495, "step": 5496, "time_per_iteration": 3.6012468338012695 }, { "auxiliary_loss_clip": 0.01148603, "auxiliary_loss_mlp": 0.01028611, "balance_loss_clip": 1.04728031, "balance_loss_mlp": 1.0205406, "epoch": 0.660975169843083, "flos": 16836000266880.0, "grad_norm": 1.6510132486357605, "language_loss": 0.80179226, "learning_rate": 1.0891744772731594e-06, "loss": 0.82356435, "num_input_tokens_seen": 118273310, "step": 5497, "time_per_iteration": 2.724155902862549 }, { "auxiliary_loss_clip": 0.01160206, "auxiliary_loss_mlp": 0.01025262, "balance_loss_clip": 1.04783177, "balance_loss_mlp": 1.01803732, "epoch": 0.6610954127337221, "flos": 26870410846080.0, "grad_norm": 1.598582658853339, "language_loss": 0.66152996, "learning_rate": 1.088481043175248e-06, "loss": 0.68338466, "num_input_tokens_seen": 118293880, "step": 5498, "time_per_iteration": 2.690173625946045 }, { "auxiliary_loss_clip": 0.01134624, "auxiliary_loss_mlp": 0.01032053, "balance_loss_clip": 1.04189885, "balance_loss_mlp": 1.02458167, "epoch": 0.6612156556243612, "flos": 26465697331200.0, "grad_norm": 1.7988312282958177, "language_loss": 0.75674796, "learning_rate": 1.0877877473540368e-06, "loss": 0.77841473, "num_input_tokens_seen": 118314465, "step": 5499, "time_per_iteration": 3.664267063140869 }, { "auxiliary_loss_clip": 0.01174087, "auxiliary_loss_mlp": 0.01028747, "balance_loss_clip": 1.04904902, "balance_loss_mlp": 1.02175224, "epoch": 0.6613358985150003, "flos": 19791915212160.0, "grad_norm": 2.102514570742598, "language_loss": 0.7224583, "learning_rate": 1.0870945899147002e-06, "loss": 0.74448669, "num_input_tokens_seen": 118331110, "step": 5500, "time_per_iteration": 2.6241533756256104 }, { "auxiliary_loss_clip": 0.01154793, "auxiliary_loss_mlp": 0.01027506, "balance_loss_clip": 1.04684925, "balance_loss_mlp": 1.02025795, "epoch": 0.6614561414056394, "flos": 26831627136000.0, "grad_norm": 1.8155396236889962, "language_loss": 0.76010931, "learning_rate": 1.0864015709623879e-06, "loss": 0.78193223, "num_input_tokens_seen": 118351980, "step": 5501, "time_per_iteration": 2.6770269870758057 }, { "auxiliary_loss_clip": 0.01161782, "auxiliary_loss_mlp": 0.01031645, "balance_loss_clip": 1.0465275, "balance_loss_mlp": 1.02427208, "epoch": 0.6615763842962785, "flos": 22894597128960.0, "grad_norm": 2.552213806209215, "language_loss": 0.80229872, "learning_rate": 1.0857086906022313e-06, "loss": 0.82423306, "num_input_tokens_seen": 118370315, "step": 5502, "time_per_iteration": 2.662813663482666 }, { "auxiliary_loss_clip": 0.0109075, "auxiliary_loss_mlp": 0.01027485, "balance_loss_clip": 1.04247904, "balance_loss_mlp": 1.02021933, "epoch": 0.6616966271869176, "flos": 24790321221120.0, "grad_norm": 2.0650104342377698, "language_loss": 0.73240244, "learning_rate": 1.0850159489393388e-06, "loss": 0.75358474, "num_input_tokens_seen": 118389575, "step": 5503, "time_per_iteration": 2.834150791168213 }, { "auxiliary_loss_clip": 0.01121909, "auxiliary_loss_mlp": 0.01033111, "balance_loss_clip": 1.03857088, "balance_loss_mlp": 1.02554107, "epoch": 0.6618168700775566, "flos": 17202109639680.0, "grad_norm": 1.8106058426215674, "language_loss": 0.82583642, "learning_rate": 1.0843233460787992e-06, "loss": 0.8473866, "num_input_tokens_seen": 118406790, "step": 5504, "time_per_iteration": 2.795630693435669 }, { "auxiliary_loss_clip": 0.01119394, "auxiliary_loss_mlp": 0.01025791, "balance_loss_clip": 1.04410267, "balance_loss_mlp": 1.01866293, "epoch": 0.6619371129681958, "flos": 25447091448960.0, "grad_norm": 1.8394838767047852, "language_loss": 0.78213453, "learning_rate": 1.0836308821256805e-06, "loss": 0.80358636, "num_input_tokens_seen": 118427590, "step": 5505, "time_per_iteration": 2.77901029586792 }, { "auxiliary_loss_clip": 0.01154781, "auxiliary_loss_mlp": 0.01023426, "balance_loss_clip": 1.04603505, "balance_loss_mlp": 1.01646411, "epoch": 0.6620573558588349, "flos": 18040444139520.0, "grad_norm": 2.077687820804473, "language_loss": 0.77653837, "learning_rate": 1.0829385571850282e-06, "loss": 0.79832047, "num_input_tokens_seen": 118444570, "step": 5506, "time_per_iteration": 2.66082501411438 }, { "auxiliary_loss_clip": 0.01178372, "auxiliary_loss_mlp": 0.01025744, "balance_loss_clip": 1.04963076, "balance_loss_mlp": 1.01711333, "epoch": 0.6621775987494739, "flos": 17785586165760.0, "grad_norm": 2.3946796795695886, "language_loss": 0.83441937, "learning_rate": 1.0822463713618679e-06, "loss": 0.85646051, "num_input_tokens_seen": 118461425, "step": 5507, "time_per_iteration": 2.6162307262420654 }, { "auxiliary_loss_clip": 0.01132775, "auxiliary_loss_mlp": 0.01027862, "balance_loss_clip": 1.04476309, "balance_loss_mlp": 1.0202328, "epoch": 0.6622978416401131, "flos": 17492590926720.0, "grad_norm": 2.0632533790232346, "language_loss": 0.84682941, "learning_rate": 1.0815543247612034e-06, "loss": 0.86843574, "num_input_tokens_seen": 118478495, "step": 5508, "time_per_iteration": 2.679636001586914 }, { "auxiliary_loss_clip": 0.01141056, "auxiliary_loss_mlp": 0.01025997, "balance_loss_clip": 1.04006851, "balance_loss_mlp": 1.01830816, "epoch": 0.6624180845307521, "flos": 21648352803840.0, "grad_norm": 2.2324963777055684, "language_loss": 0.83122355, "learning_rate": 1.0808624174880168e-06, "loss": 0.85289407, "num_input_tokens_seen": 118499145, "step": 5509, "time_per_iteration": 2.708400011062622 }, { "auxiliary_loss_clip": 0.01168854, "auxiliary_loss_mlp": 0.01027746, "balance_loss_clip": 1.0480504, "balance_loss_mlp": 1.02022934, "epoch": 0.6625383274213912, "flos": 23805902108160.0, "grad_norm": 1.6469696707950738, "language_loss": 0.80131209, "learning_rate": 1.080170649647272e-06, "loss": 0.82327807, "num_input_tokens_seen": 118518950, "step": 5510, "time_per_iteration": 2.6337897777557373 }, { "auxiliary_loss_clip": 0.01171859, "auxiliary_loss_mlp": 0.01023753, "balance_loss_clip": 1.0484221, "balance_loss_mlp": 1.01577163, "epoch": 0.6626585703120303, "flos": 33262941473280.0, "grad_norm": 1.6770824545589964, "language_loss": 0.67115468, "learning_rate": 1.0794790213439068e-06, "loss": 0.69311076, "num_input_tokens_seen": 118545850, "step": 5511, "time_per_iteration": 2.7604665756225586 }, { "auxiliary_loss_clip": 0.01114377, "auxiliary_loss_mlp": 0.01027744, "balance_loss_clip": 1.04134619, "balance_loss_mlp": 1.02014494, "epoch": 0.6627788132026694, "flos": 22085780630400.0, "grad_norm": 2.034582447812792, "language_loss": 0.78409696, "learning_rate": 1.078787532682843e-06, "loss": 0.80551815, "num_input_tokens_seen": 118563325, "step": 5512, "time_per_iteration": 2.817513942718506 }, { "auxiliary_loss_clip": 0.01157195, "auxiliary_loss_mlp": 0.0102428, "balance_loss_clip": 1.04712284, "balance_loss_mlp": 1.01704705, "epoch": 0.6628990560933085, "flos": 36173608260480.0, "grad_norm": 2.299569890728429, "language_loss": 0.7573086, "learning_rate": 1.0780961837689773e-06, "loss": 0.77912343, "num_input_tokens_seen": 118582835, "step": 5513, "time_per_iteration": 2.8635032176971436 }, { "auxiliary_loss_clip": 0.01136543, "auxiliary_loss_mlp": 0.01029725, "balance_loss_clip": 1.04424274, "balance_loss_mlp": 1.0220207, "epoch": 0.6630192989839476, "flos": 18513567106560.0, "grad_norm": 1.6382171626285609, "language_loss": 0.70061421, "learning_rate": 1.0774049747071883e-06, "loss": 0.72227687, "num_input_tokens_seen": 118600715, "step": 5514, "time_per_iteration": 2.7097952365875244 }, { "auxiliary_loss_clip": 0.01114872, "auxiliary_loss_mlp": 0.01028289, "balance_loss_clip": 1.04515576, "balance_loss_mlp": 1.02088642, "epoch": 0.6631395418745867, "flos": 35809510049280.0, "grad_norm": 2.9863664959167218, "language_loss": 0.68248427, "learning_rate": 1.076713905602332e-06, "loss": 0.70391583, "num_input_tokens_seen": 118621290, "step": 5515, "time_per_iteration": 2.9045188426971436 }, { "auxiliary_loss_clip": 0.01162276, "auxiliary_loss_mlp": 0.01028723, "balance_loss_clip": 1.04989243, "balance_loss_mlp": 1.02160668, "epoch": 0.6632597847652257, "flos": 20047742853120.0, "grad_norm": 1.71671176800019, "language_loss": 0.81306565, "learning_rate": 1.07602297655924e-06, "loss": 0.8349756, "num_input_tokens_seen": 118639610, "step": 5516, "time_per_iteration": 2.7744195461273193 }, { "auxiliary_loss_clip": 0.01175741, "auxiliary_loss_mlp": 0.01028792, "balance_loss_clip": 1.05223, "balance_loss_mlp": 1.02153802, "epoch": 0.6633800276558649, "flos": 21214480423680.0, "grad_norm": 1.7762758474308264, "language_loss": 0.81181604, "learning_rate": 1.0753321876827292e-06, "loss": 0.83386135, "num_input_tokens_seen": 118658895, "step": 5517, "time_per_iteration": 2.631793975830078 }, { "auxiliary_loss_clip": 0.01171993, "auxiliary_loss_mlp": 0.01024337, "balance_loss_clip": 1.04673946, "balance_loss_mlp": 1.01675582, "epoch": 0.663500270546504, "flos": 23987753688960.0, "grad_norm": 3.3397039241567814, "language_loss": 0.74144554, "learning_rate": 1.0746415390775893e-06, "loss": 0.76340878, "num_input_tokens_seen": 118677025, "step": 5518, "time_per_iteration": 2.617201805114746 }, { "auxiliary_loss_clip": 0.01171345, "auxiliary_loss_mlp": 0.01023989, "balance_loss_clip": 1.04976737, "balance_loss_mlp": 1.01671147, "epoch": 0.663620513437143, "flos": 17932389050880.0, "grad_norm": 1.9381324307562664, "language_loss": 0.76520085, "learning_rate": 1.0739510308485939e-06, "loss": 0.7871542, "num_input_tokens_seen": 118694240, "step": 5519, "time_per_iteration": 4.396620512008667 }, { "auxiliary_loss_clip": 0.01045816, "auxiliary_loss_mlp": 0.01002047, "balance_loss_clip": 1.014691, "balance_loss_mlp": 1.00061631, "epoch": 0.6637407563277821, "flos": 57840241086720.0, "grad_norm": 0.8153966597613861, "language_loss": 0.62413418, "learning_rate": 1.07326066310049e-06, "loss": 0.64461279, "num_input_tokens_seen": 118758365, "step": 5520, "time_per_iteration": 3.387009859085083 }, { "auxiliary_loss_clip": 0.01125779, "auxiliary_loss_mlp": 0.0102966, "balance_loss_clip": 1.04103184, "balance_loss_mlp": 1.0216732, "epoch": 0.6638609992184212, "flos": 27306007079040.0, "grad_norm": 1.9136908741476681, "language_loss": 0.7922405, "learning_rate": 1.0725704359380059e-06, "loss": 0.81379485, "num_input_tokens_seen": 118778220, "step": 5521, "time_per_iteration": 2.766756296157837 }, { "auxiliary_loss_clip": 0.01170267, "auxiliary_loss_mlp": 0.01024776, "balance_loss_clip": 1.0468179, "balance_loss_mlp": 1.01737332, "epoch": 0.6639812421090603, "flos": 18624854419200.0, "grad_norm": 1.761748808697812, "language_loss": 0.7212283, "learning_rate": 1.0718803494658497e-06, "loss": 0.74317873, "num_input_tokens_seen": 118797110, "step": 5522, "time_per_iteration": 3.6498806476593018 }, { "auxiliary_loss_clip": 0.01069739, "auxiliary_loss_mlp": 0.01031858, "balance_loss_clip": 1.03992057, "balance_loss_mlp": 1.02424049, "epoch": 0.6641014849996993, "flos": 15924479806080.0, "grad_norm": 2.2119213158620648, "language_loss": 0.83493382, "learning_rate": 1.071190403788707e-06, "loss": 0.85594982, "num_input_tokens_seen": 118812415, "step": 5523, "time_per_iteration": 3.697870969772339 }, { "auxiliary_loss_clip": 0.01137092, "auxiliary_loss_mlp": 0.01030202, "balance_loss_clip": 1.04632068, "balance_loss_mlp": 1.02227414, "epoch": 0.6642217278903385, "flos": 26505486622080.0, "grad_norm": 2.9087334353983776, "language_loss": 0.75687087, "learning_rate": 1.0705005990112415e-06, "loss": 0.77854389, "num_input_tokens_seen": 118832195, "step": 5524, "time_per_iteration": 2.728175401687622 }, { "auxiliary_loss_clip": 0.01104168, "auxiliary_loss_mlp": 0.01022789, "balance_loss_clip": 1.04182625, "balance_loss_mlp": 1.01499271, "epoch": 0.6643419707809776, "flos": 15377308951680.0, "grad_norm": 2.3802315807045975, "language_loss": 0.74571395, "learning_rate": 1.0698109352380957e-06, "loss": 0.76698351, "num_input_tokens_seen": 118849795, "step": 5525, "time_per_iteration": 2.696197032928467 }, { "auxiliary_loss_clip": 0.01171618, "auxiliary_loss_mlp": 0.01025456, "balance_loss_clip": 1.0481056, "balance_loss_mlp": 1.01827109, "epoch": 0.6644622136716166, "flos": 25117610970240.0, "grad_norm": 2.7207388164748583, "language_loss": 0.78470778, "learning_rate": 1.0691214125738909e-06, "loss": 0.80667853, "num_input_tokens_seen": 118870000, "step": 5526, "time_per_iteration": 3.573052406311035 }, { "auxiliary_loss_clip": 0.0107353, "auxiliary_loss_mlp": 0.01001616, "balance_loss_clip": 1.01535451, "balance_loss_mlp": 1.00034082, "epoch": 0.6645824565622558, "flos": 66201717680640.0, "grad_norm": 0.7827417595321566, "language_loss": 0.57508975, "learning_rate": 1.0684320311232287e-06, "loss": 0.59584117, "num_input_tokens_seen": 118932905, "step": 5527, "time_per_iteration": 3.255934238433838 }, { "auxiliary_loss_clip": 0.01138365, "auxiliary_loss_mlp": 0.01022295, "balance_loss_clip": 1.04319286, "balance_loss_mlp": 1.01442766, "epoch": 0.6647026994528948, "flos": 25082131311360.0, "grad_norm": 1.864349214330365, "language_loss": 0.8138752, "learning_rate": 1.0677427909906865e-06, "loss": 0.83548182, "num_input_tokens_seen": 118953355, "step": 5528, "time_per_iteration": 2.7268123626708984 }, { "auxiliary_loss_clip": 0.01176423, "auxiliary_loss_mlp": 0.01024992, "balance_loss_clip": 1.04935682, "balance_loss_mlp": 1.01758873, "epoch": 0.6648229423435339, "flos": 18222187979520.0, "grad_norm": 1.896701223612167, "language_loss": 0.72520018, "learning_rate": 1.0670536922808216e-06, "loss": 0.74721432, "num_input_tokens_seen": 118973480, "step": 5529, "time_per_iteration": 2.5958104133605957 }, { "auxiliary_loss_clip": 0.01143697, "auxiliary_loss_mlp": 0.01027434, "balance_loss_clip": 1.04495621, "balance_loss_mlp": 1.01985252, "epoch": 0.6649431852341731, "flos": 18296882311680.0, "grad_norm": 2.4139559885122197, "language_loss": 0.72043157, "learning_rate": 1.06636473509817e-06, "loss": 0.7421428, "num_input_tokens_seen": 118989860, "step": 5530, "time_per_iteration": 2.6316230297088623 }, { "auxiliary_loss_clip": 0.01139103, "auxiliary_loss_mlp": 0.0076261, "balance_loss_clip": 1.0433414, "balance_loss_mlp": 1.00062394, "epoch": 0.6650634281248121, "flos": 17019575700480.0, "grad_norm": 1.8292826404799298, "language_loss": 0.80721831, "learning_rate": 1.0656759195472447e-06, "loss": 0.82623541, "num_input_tokens_seen": 119007150, "step": 5531, "time_per_iteration": 2.6730198860168457 }, { "auxiliary_loss_clip": 0.0104849, "auxiliary_loss_mlp": 0.01003192, "balance_loss_clip": 1.01320267, "balance_loss_mlp": 1.00189841, "epoch": 0.6651836710154512, "flos": 69294810666240.0, "grad_norm": 0.7991494358614435, "language_loss": 0.59760058, "learning_rate": 1.0649872457325414e-06, "loss": 0.61811739, "num_input_tokens_seen": 119068435, "step": 5532, "time_per_iteration": 3.1604621410369873 }, { "auxiliary_loss_clip": 0.01064672, "auxiliary_loss_mlp": 0.01001645, "balance_loss_clip": 1.01547956, "balance_loss_mlp": 1.00035751, "epoch": 0.6653039139060903, "flos": 66883444882560.0, "grad_norm": 0.8494168811139466, "language_loss": 0.55120599, "learning_rate": 1.0642987137585278e-06, "loss": 0.57186913, "num_input_tokens_seen": 119127960, "step": 5533, "time_per_iteration": 3.1835052967071533 }, { "auxiliary_loss_clip": 0.0114265, "auxiliary_loss_mlp": 0.01029678, "balance_loss_clip": 1.04473555, "balance_loss_mlp": 1.02182221, "epoch": 0.6654241567967294, "flos": 21470056669440.0, "grad_norm": 2.090779350075203, "language_loss": 0.82868552, "learning_rate": 1.0636103237296561e-06, "loss": 0.85040885, "num_input_tokens_seen": 119146885, "step": 5534, "time_per_iteration": 2.699167490005493 }, { "auxiliary_loss_clip": 0.0115853, "auxiliary_loss_mlp": 0.01032075, "balance_loss_clip": 1.04997134, "balance_loss_mlp": 1.02448773, "epoch": 0.6655443996873684, "flos": 25119514391040.0, "grad_norm": 1.9465740379906644, "language_loss": 0.84064072, "learning_rate": 1.062922075750353e-06, "loss": 0.86254674, "num_input_tokens_seen": 119166900, "step": 5535, "time_per_iteration": 2.6890530586242676 }, { "auxiliary_loss_clip": 0.01131999, "auxiliary_loss_mlp": 0.01025489, "balance_loss_clip": 1.04315042, "balance_loss_mlp": 1.01850605, "epoch": 0.6656646425780076, "flos": 17457326749440.0, "grad_norm": 2.5029201574239424, "language_loss": 0.7180481, "learning_rate": 1.0622339699250267e-06, "loss": 0.73962295, "num_input_tokens_seen": 119184820, "step": 5536, "time_per_iteration": 2.7330074310302734 }, { "auxiliary_loss_clip": 0.01128637, "auxiliary_loss_mlp": 0.01020572, "balance_loss_clip": 1.04257226, "balance_loss_mlp": 1.01330018, "epoch": 0.6657848854686467, "flos": 23434190213760.0, "grad_norm": 2.8347360900146574, "language_loss": 0.79378927, "learning_rate": 1.0615460063580624e-06, "loss": 0.81528133, "num_input_tokens_seen": 119203295, "step": 5537, "time_per_iteration": 2.7683849334716797 }, { "auxiliary_loss_clip": 0.01144919, "auxiliary_loss_mlp": 0.01025352, "balance_loss_clip": 1.04476249, "balance_loss_mlp": 1.01809549, "epoch": 0.6659051283592857, "flos": 11509909459200.0, "grad_norm": 1.9400198760588567, "language_loss": 0.73412549, "learning_rate": 1.060858185153821e-06, "loss": 0.75582814, "num_input_tokens_seen": 119221395, "step": 5538, "time_per_iteration": 2.6868796348571777 }, { "auxiliary_loss_clip": 0.01148858, "auxiliary_loss_mlp": 0.01028213, "balance_loss_clip": 1.04588974, "balance_loss_mlp": 1.01997614, "epoch": 0.6660253712499249, "flos": 20594554571520.0, "grad_norm": 2.1910162045180717, "language_loss": 0.76172626, "learning_rate": 1.0601705064166474e-06, "loss": 0.78349698, "num_input_tokens_seen": 119239790, "step": 5539, "time_per_iteration": 2.6174168586730957 }, { "auxiliary_loss_clip": 0.01139076, "auxiliary_loss_mlp": 0.01024619, "balance_loss_clip": 1.04600489, "balance_loss_mlp": 1.0171268, "epoch": 0.666145614140564, "flos": 21251504367360.0, "grad_norm": 2.008897351834559, "language_loss": 0.73215079, "learning_rate": 1.0594829702508596e-06, "loss": 0.75378776, "num_input_tokens_seen": 119257505, "step": 5540, "time_per_iteration": 2.739698886871338 }, { "auxiliary_loss_clip": 0.01132421, "auxiliary_loss_mlp": 0.01022136, "balance_loss_clip": 1.04368496, "balance_loss_mlp": 1.01498985, "epoch": 0.666265857031203, "flos": 33726188200320.0, "grad_norm": 1.7255117937733588, "language_loss": 0.5519594, "learning_rate": 1.0587955767607592e-06, "loss": 0.57350492, "num_input_tokens_seen": 119279365, "step": 5541, "time_per_iteration": 2.832576036453247 }, { "auxiliary_loss_clip": 0.01172462, "auxiliary_loss_mlp": 0.01030938, "balance_loss_clip": 1.04773057, "balance_loss_mlp": 1.02323711, "epoch": 0.6663860999218422, "flos": 17456644391040.0, "grad_norm": 2.1127281204264547, "language_loss": 0.77447498, "learning_rate": 1.0581083260506206e-06, "loss": 0.79650897, "num_input_tokens_seen": 119296150, "step": 5542, "time_per_iteration": 2.6830027103424072 }, { "auxiliary_loss_clip": 0.01142127, "auxiliary_loss_mlp": 0.01023609, "balance_loss_clip": 1.0442369, "balance_loss_mlp": 1.01627159, "epoch": 0.6665063428124812, "flos": 17676740977920.0, "grad_norm": 2.3251671552687005, "language_loss": 0.75991321, "learning_rate": 1.0574212182246993e-06, "loss": 0.78157055, "num_input_tokens_seen": 119314845, "step": 5543, "time_per_iteration": 2.8738317489624023 }, { "auxiliary_loss_clip": 0.01149262, "auxiliary_loss_mlp": 0.01027854, "balance_loss_clip": 1.04489005, "balance_loss_mlp": 1.01980126, "epoch": 0.6666265857031203, "flos": 27673265687040.0, "grad_norm": 2.399282588198003, "language_loss": 0.76197147, "learning_rate": 1.0567342533872303e-06, "loss": 0.78374267, "num_input_tokens_seen": 119334875, "step": 5544, "time_per_iteration": 3.6982035636901855 }, { "auxiliary_loss_clip": 0.01145428, "auxiliary_loss_mlp": 0.01028874, "balance_loss_clip": 1.04705906, "balance_loss_mlp": 1.02158165, "epoch": 0.6667468285937594, "flos": 25046831220480.0, "grad_norm": 1.998371669579996, "language_loss": 0.8102715, "learning_rate": 1.0560474316424255e-06, "loss": 0.83201456, "num_input_tokens_seen": 119354635, "step": 5545, "time_per_iteration": 2.778339385986328 }, { "auxiliary_loss_clip": 0.01144634, "auxiliary_loss_mlp": 0.01024495, "balance_loss_clip": 1.04348445, "balance_loss_mlp": 1.01648974, "epoch": 0.6668670714843985, "flos": 22780472641920.0, "grad_norm": 2.2428927124532914, "language_loss": 0.73888534, "learning_rate": 1.0553607530944746e-06, "loss": 0.76057661, "num_input_tokens_seen": 119372690, "step": 5546, "time_per_iteration": 2.662250280380249 }, { "auxiliary_loss_clip": 0.01132408, "auxiliary_loss_mlp": 0.01028113, "balance_loss_clip": 1.0431782, "balance_loss_mlp": 1.02036488, "epoch": 0.6669873143750376, "flos": 22163886754560.0, "grad_norm": 1.9205402199937027, "language_loss": 0.89528465, "learning_rate": 1.0546742178475463e-06, "loss": 0.91688991, "num_input_tokens_seen": 119391685, "step": 5547, "time_per_iteration": 3.6857218742370605 }, { "auxiliary_loss_clip": 0.01123155, "auxiliary_loss_mlp": 0.01028225, "balance_loss_clip": 1.04443324, "balance_loss_mlp": 1.02121532, "epoch": 0.6671075572656767, "flos": 20514832335360.0, "grad_norm": 2.06214991182939, "language_loss": 0.86415339, "learning_rate": 1.0539878260057868e-06, "loss": 0.8856672, "num_input_tokens_seen": 119410725, "step": 5548, "time_per_iteration": 3.725162982940674 }, { "auxiliary_loss_clip": 0.01164596, "auxiliary_loss_mlp": 0.01029181, "balance_loss_clip": 1.05139935, "balance_loss_mlp": 1.02051425, "epoch": 0.6672278001563158, "flos": 17931203902080.0, "grad_norm": 2.642008394412806, "language_loss": 0.68730009, "learning_rate": 1.0533015776733226e-06, "loss": 0.70923787, "num_input_tokens_seen": 119426875, "step": 5549, "time_per_iteration": 2.6075572967529297 }, { "auxiliary_loss_clip": 0.01143848, "auxiliary_loss_mlp": 0.01025145, "balance_loss_clip": 1.04873848, "balance_loss_mlp": 1.01732492, "epoch": 0.6673480430469548, "flos": 22342146975360.0, "grad_norm": 2.2557921307871323, "language_loss": 0.79123092, "learning_rate": 1.0526154729542566e-06, "loss": 0.81292081, "num_input_tokens_seen": 119446935, "step": 5550, "time_per_iteration": 2.848194122314453 }, { "auxiliary_loss_clip": 0.01132672, "auxiliary_loss_mlp": 0.01032247, "balance_loss_clip": 1.04733503, "balance_loss_mlp": 1.02440262, "epoch": 0.6674682859375939, "flos": 20703830722560.0, "grad_norm": 2.4625949993221026, "language_loss": 0.79789776, "learning_rate": 1.0519295119526699e-06, "loss": 0.81954694, "num_input_tokens_seen": 119463240, "step": 5551, "time_per_iteration": 2.684818744659424 }, { "auxiliary_loss_clip": 0.01148798, "auxiliary_loss_mlp": 0.01028073, "balance_loss_clip": 1.04641473, "balance_loss_mlp": 1.02062261, "epoch": 0.667588528828233, "flos": 26206673379840.0, "grad_norm": 1.6745406205892186, "language_loss": 0.8344866, "learning_rate": 1.0512436947726227e-06, "loss": 0.85625529, "num_input_tokens_seen": 119484655, "step": 5552, "time_per_iteration": 3.5992443561553955 }, { "auxiliary_loss_clip": 0.01131935, "auxiliary_loss_mlp": 0.01025893, "balance_loss_clip": 1.04267776, "balance_loss_mlp": 1.01820445, "epoch": 0.6677087717188721, "flos": 23071025756160.0, "grad_norm": 2.132148561135527, "language_loss": 0.65313941, "learning_rate": 1.0505580215181517e-06, "loss": 0.67471772, "num_input_tokens_seen": 119502895, "step": 5553, "time_per_iteration": 2.689302444458008 }, { "auxiliary_loss_clip": 0.01035998, "auxiliary_loss_mlp": 0.01003108, "balance_loss_clip": 1.01910472, "balance_loss_mlp": 1.00191009, "epoch": 0.6678290146095112, "flos": 70941315219840.0, "grad_norm": 0.7810518136881069, "language_loss": 0.56627226, "learning_rate": 1.0498724922932753e-06, "loss": 0.58666337, "num_input_tokens_seen": 119561010, "step": 5554, "time_per_iteration": 3.231783628463745 }, { "auxiliary_loss_clip": 0.01180063, "auxiliary_loss_mlp": 0.0103022, "balance_loss_clip": 1.05142224, "balance_loss_mlp": 1.02167869, "epoch": 0.6679492575001503, "flos": 18661088263680.0, "grad_norm": 2.4046113593420815, "language_loss": 0.86040312, "learning_rate": 1.0491871072019851e-06, "loss": 0.88250589, "num_input_tokens_seen": 119578900, "step": 5555, "time_per_iteration": 2.610520601272583 }, { "auxiliary_loss_clip": 0.01134965, "auxiliary_loss_mlp": 0.01027188, "balance_loss_clip": 1.04158652, "balance_loss_mlp": 1.01936173, "epoch": 0.6680695003907894, "flos": 29711985822720.0, "grad_norm": 1.6425556443736054, "language_loss": 0.6395846, "learning_rate": 1.0485018663482555e-06, "loss": 0.66120613, "num_input_tokens_seen": 119598920, "step": 5556, "time_per_iteration": 2.7486085891723633 }, { "auxiliary_loss_clip": 0.0115557, "auxiliary_loss_mlp": 0.01028927, "balance_loss_clip": 1.04736733, "balance_loss_mlp": 1.02050447, "epoch": 0.6681897432814284, "flos": 28218964083840.0, "grad_norm": 2.8387395700614073, "language_loss": 0.70111781, "learning_rate": 1.0478167698360354e-06, "loss": 0.7229628, "num_input_tokens_seen": 119618220, "step": 5557, "time_per_iteration": 2.7754080295562744 }, { "auxiliary_loss_clip": 0.01150803, "auxiliary_loss_mlp": 0.01029571, "balance_loss_clip": 1.04415524, "balance_loss_mlp": 1.02157211, "epoch": 0.6683099861720676, "flos": 25046543911680.0, "grad_norm": 2.006459595991925, "language_loss": 0.70432043, "learning_rate": 1.0471318177692556e-06, "loss": 0.72612417, "num_input_tokens_seen": 119638520, "step": 5558, "time_per_iteration": 2.7546021938323975 }, { "auxiliary_loss_clip": 0.01118769, "auxiliary_loss_mlp": 0.01028653, "balance_loss_clip": 1.04133642, "balance_loss_mlp": 1.02080655, "epoch": 0.6684302290627067, "flos": 22996977868800.0, "grad_norm": 2.3825292521234744, "language_loss": 0.75916195, "learning_rate": 1.046447010251821e-06, "loss": 0.78063619, "num_input_tokens_seen": 119655850, "step": 5559, "time_per_iteration": 2.6862878799438477 }, { "auxiliary_loss_clip": 0.01145705, "auxiliary_loss_mlp": 0.01025244, "balance_loss_clip": 1.04787004, "balance_loss_mlp": 1.01828527, "epoch": 0.6685504719533457, "flos": 26573824247040.0, "grad_norm": 1.6968736109727582, "language_loss": 0.75716984, "learning_rate": 1.0457623473876157e-06, "loss": 0.77887928, "num_input_tokens_seen": 119675355, "step": 5560, "time_per_iteration": 2.732908010482788 }, { "auxiliary_loss_clip": 0.01172653, "auxiliary_loss_mlp": 0.01029955, "balance_loss_clip": 1.04925632, "balance_loss_mlp": 1.0226655, "epoch": 0.6686707148439849, "flos": 28986087870720.0, "grad_norm": 2.4125437328347967, "language_loss": 0.70816654, "learning_rate": 1.0450778292805046e-06, "loss": 0.73019266, "num_input_tokens_seen": 119695340, "step": 5561, "time_per_iteration": 2.6331405639648438 }, { "auxiliary_loss_clip": 0.01159394, "auxiliary_loss_mlp": 0.01028929, "balance_loss_clip": 1.04498947, "balance_loss_mlp": 1.02133799, "epoch": 0.6687909577346239, "flos": 23623152687360.0, "grad_norm": 2.046894057621575, "language_loss": 0.78544241, "learning_rate": 1.0443934560343267e-06, "loss": 0.80732566, "num_input_tokens_seen": 119716750, "step": 5562, "time_per_iteration": 2.6977884769439697 }, { "auxiliary_loss_clip": 0.01118841, "auxiliary_loss_mlp": 0.01024806, "balance_loss_clip": 1.04221749, "balance_loss_mlp": 1.0173676, "epoch": 0.668911200625263, "flos": 23148593176320.0, "grad_norm": 1.9017394458728372, "language_loss": 0.77942491, "learning_rate": 1.0437092277529034e-06, "loss": 0.8008613, "num_input_tokens_seen": 119736005, "step": 5563, "time_per_iteration": 2.7570395469665527 }, { "auxiliary_loss_clip": 0.01137751, "auxiliary_loss_mlp": 0.01033949, "balance_loss_clip": 1.04308224, "balance_loss_mlp": 1.02600968, "epoch": 0.6690314435159022, "flos": 18551919853440.0, "grad_norm": 3.961189391444136, "language_loss": 0.7369653, "learning_rate": 1.0430251445400292e-06, "loss": 0.75868225, "num_input_tokens_seen": 119754050, "step": 5564, "time_per_iteration": 2.6391401290893555 }, { "auxiliary_loss_clip": 0.01077736, "auxiliary_loss_mlp": 0.01026292, "balance_loss_clip": 1.04191935, "balance_loss_mlp": 1.01837027, "epoch": 0.6691516864065412, "flos": 31759540704000.0, "grad_norm": 2.111815697224342, "language_loss": 0.62629253, "learning_rate": 1.0423412064994787e-06, "loss": 0.64733279, "num_input_tokens_seen": 119774820, "step": 5565, "time_per_iteration": 3.149540662765503 }, { "auxiliary_loss_clip": 0.01132235, "auxiliary_loss_mlp": 0.01026322, "balance_loss_clip": 1.04329014, "balance_loss_mlp": 1.01967049, "epoch": 0.6692719292971803, "flos": 34933864296960.0, "grad_norm": 1.9897151254567262, "language_loss": 0.73970068, "learning_rate": 1.0416574137350064e-06, "loss": 0.76128626, "num_input_tokens_seen": 119795525, "step": 5566, "time_per_iteration": 3.1727914810180664 }, { "auxiliary_loss_clip": 0.01151254, "auxiliary_loss_mlp": 0.0102348, "balance_loss_clip": 1.0451107, "balance_loss_mlp": 1.01611841, "epoch": 0.6693921721878194, "flos": 20449188230400.0, "grad_norm": 2.1480724538604474, "language_loss": 0.80817735, "learning_rate": 1.0409737663503428e-06, "loss": 0.8299247, "num_input_tokens_seen": 119813905, "step": 5567, "time_per_iteration": 2.671480178833008 }, { "auxiliary_loss_clip": 0.01156048, "auxiliary_loss_mlp": 0.01025116, "balance_loss_clip": 1.04544997, "balance_loss_mlp": 1.01783574, "epoch": 0.6695124150784585, "flos": 16614538963200.0, "grad_norm": 1.83301664102826, "language_loss": 0.82930779, "learning_rate": 1.040290264449196e-06, "loss": 0.85111946, "num_input_tokens_seen": 119832010, "step": 5568, "time_per_iteration": 2.7063639163970947 }, { "auxiliary_loss_clip": 0.01154908, "auxiliary_loss_mlp": 0.01026937, "balance_loss_clip": 1.04830527, "balance_loss_mlp": 1.01965022, "epoch": 0.6696326579690975, "flos": 26652145852800.0, "grad_norm": 2.049665930997182, "language_loss": 0.64095235, "learning_rate": 1.0396069081352532e-06, "loss": 0.66277075, "num_input_tokens_seen": 119851165, "step": 5569, "time_per_iteration": 2.6890246868133545 }, { "auxiliary_loss_clip": 0.01072215, "auxiliary_loss_mlp": 0.01001855, "balance_loss_clip": 1.01441264, "balance_loss_mlp": 1.00057948, "epoch": 0.6697529008597367, "flos": 66964603662720.0, "grad_norm": 0.77194488483529, "language_loss": 0.55996835, "learning_rate": 1.0389236975121782e-06, "loss": 0.5807091, "num_input_tokens_seen": 119906015, "step": 5570, "time_per_iteration": 4.203742027282715 }, { "auxiliary_loss_clip": 0.01174761, "auxiliary_loss_mlp": 0.01025409, "balance_loss_clip": 1.04887116, "balance_loss_mlp": 1.01751173, "epoch": 0.6698731437503758, "flos": 20886939279360.0, "grad_norm": 2.2111947541401884, "language_loss": 0.71847427, "learning_rate": 1.0382406326836147e-06, "loss": 0.74047595, "num_input_tokens_seen": 119925160, "step": 5571, "time_per_iteration": 2.9389092922210693 }, { "auxiliary_loss_clip": 0.01165885, "auxiliary_loss_mlp": 0.01025774, "balance_loss_clip": 1.04897213, "balance_loss_mlp": 1.01793575, "epoch": 0.6699933866410148, "flos": 20409470766720.0, "grad_norm": 1.974523727208466, "language_loss": 0.75804532, "learning_rate": 1.0375577137531828e-06, "loss": 0.77996194, "num_input_tokens_seen": 119943720, "step": 5572, "time_per_iteration": 2.732252597808838 }, { "auxiliary_loss_clip": 0.01144717, "auxiliary_loss_mlp": 0.01028952, "balance_loss_clip": 1.04474759, "balance_loss_mlp": 1.02117705, "epoch": 0.670113629531654, "flos": 29023075900800.0, "grad_norm": 1.5336609638190344, "language_loss": 0.72237623, "learning_rate": 1.0368749408244802e-06, "loss": 0.74411297, "num_input_tokens_seen": 119966640, "step": 5573, "time_per_iteration": 3.6708884239196777 }, { "auxiliary_loss_clip": 0.01152566, "auxiliary_loss_mlp": 0.01021279, "balance_loss_clip": 1.04675555, "balance_loss_mlp": 1.01441002, "epoch": 0.670233872422293, "flos": 19791699730560.0, "grad_norm": 2.6529943433730665, "language_loss": 0.78822947, "learning_rate": 1.0361923140010836e-06, "loss": 0.80996799, "num_input_tokens_seen": 119985125, "step": 5574, "time_per_iteration": 3.6088156700134277 }, { "auxiliary_loss_clip": 0.01165965, "auxiliary_loss_mlp": 0.01026371, "balance_loss_clip": 1.04881418, "balance_loss_mlp": 1.01885831, "epoch": 0.6703541153129321, "flos": 24243689070720.0, "grad_norm": 2.0973974730599747, "language_loss": 0.63831913, "learning_rate": 1.0355098333865455e-06, "loss": 0.66024256, "num_input_tokens_seen": 120004355, "step": 5575, "time_per_iteration": 2.633697509765625 }, { "auxiliary_loss_clip": 0.01157807, "auxiliary_loss_mlp": 0.01024079, "balance_loss_clip": 1.05144298, "balance_loss_mlp": 1.01684046, "epoch": 0.6704743582035713, "flos": 26688523351680.0, "grad_norm": 1.6683597898468823, "language_loss": 0.69136381, "learning_rate": 1.0348274990844006e-06, "loss": 0.71318263, "num_input_tokens_seen": 120027115, "step": 5576, "time_per_iteration": 2.6844990253448486 }, { "auxiliary_loss_clip": 0.01159068, "auxiliary_loss_mlp": 0.01024004, "balance_loss_clip": 1.04876018, "balance_loss_mlp": 1.01665473, "epoch": 0.6705946010942103, "flos": 23514379326720.0, "grad_norm": 1.701636998764028, "language_loss": 0.72359014, "learning_rate": 1.034145311198155e-06, "loss": 0.74542087, "num_input_tokens_seen": 120047130, "step": 5577, "time_per_iteration": 2.6996164321899414 }, { "auxiliary_loss_clip": 0.01171117, "auxiliary_loss_mlp": 0.010253, "balance_loss_clip": 1.04890621, "balance_loss_mlp": 1.01829624, "epoch": 0.6707148439848494, "flos": 24061011477120.0, "grad_norm": 1.8395568624541752, "language_loss": 0.639714, "learning_rate": 1.0334632698312989e-06, "loss": 0.66167814, "num_input_tokens_seen": 120067925, "step": 5578, "time_per_iteration": 3.5222008228302 }, { "auxiliary_loss_clip": 0.01138315, "auxiliary_loss_mlp": 0.0102515, "balance_loss_clip": 1.04449928, "balance_loss_mlp": 1.01754737, "epoch": 0.6708350868754885, "flos": 22528667324160.0, "grad_norm": 1.9985962775434556, "language_loss": 0.75043499, "learning_rate": 1.032781375087295e-06, "loss": 0.77206963, "num_input_tokens_seen": 120087825, "step": 5579, "time_per_iteration": 2.7018957138061523 }, { "auxiliary_loss_clip": 0.01146169, "auxiliary_loss_mlp": 0.01025597, "balance_loss_clip": 1.04732573, "balance_loss_mlp": 1.01862001, "epoch": 0.6709553297661276, "flos": 25227749047680.0, "grad_norm": 1.3840362967114743, "language_loss": 0.6737597, "learning_rate": 1.0320996270695891e-06, "loss": 0.69547737, "num_input_tokens_seen": 120108895, "step": 5580, "time_per_iteration": 2.7406699657440186 }, { "auxiliary_loss_clip": 0.01129256, "auxiliary_loss_mlp": 0.01029928, "balance_loss_clip": 1.04245448, "balance_loss_mlp": 1.02161884, "epoch": 0.6710755726567667, "flos": 20448757267200.0, "grad_norm": 1.7452227913957243, "language_loss": 0.73362339, "learning_rate": 1.0314180258815998e-06, "loss": 0.75521517, "num_input_tokens_seen": 120127535, "step": 5581, "time_per_iteration": 2.6912853717803955 }, { "auxiliary_loss_clip": 0.01123523, "auxiliary_loss_mlp": 0.01027911, "balance_loss_clip": 1.04335976, "balance_loss_mlp": 1.02085686, "epoch": 0.6711958155474057, "flos": 25995411538560.0, "grad_norm": 1.8354039945466019, "language_loss": 0.74463153, "learning_rate": 1.0307365716267247e-06, "loss": 0.76614583, "num_input_tokens_seen": 120147980, "step": 5582, "time_per_iteration": 2.79166579246521 }, { "auxiliary_loss_clip": 0.01158256, "auxiliary_loss_mlp": 0.01024412, "balance_loss_clip": 1.04879987, "balance_loss_mlp": 1.01626682, "epoch": 0.6713160584380449, "flos": 19937712516480.0, "grad_norm": 5.65532134602039, "language_loss": 0.78064108, "learning_rate": 1.0300552644083423e-06, "loss": 0.8024677, "num_input_tokens_seen": 120166905, "step": 5583, "time_per_iteration": 2.647261619567871 }, { "auxiliary_loss_clip": 0.01135029, "auxiliary_loss_mlp": 0.01027266, "balance_loss_clip": 1.04789531, "balance_loss_mlp": 1.0187304, "epoch": 0.6714363013286839, "flos": 18223373128320.0, "grad_norm": 2.5008472623433744, "language_loss": 0.72625077, "learning_rate": 1.0293741043298036e-06, "loss": 0.74787378, "num_input_tokens_seen": 120185255, "step": 5584, "time_per_iteration": 2.7223284244537354 }, { "auxiliary_loss_clip": 0.01135638, "auxiliary_loss_mlp": 0.01030941, "balance_loss_clip": 1.04866767, "balance_loss_mlp": 1.02281666, "epoch": 0.671556544219323, "flos": 25812374808960.0, "grad_norm": 4.638312229081315, "language_loss": 0.71384537, "learning_rate": 1.0286930914944436e-06, "loss": 0.73551118, "num_input_tokens_seen": 120205070, "step": 5585, "time_per_iteration": 2.7481961250305176 }, { "auxiliary_loss_clip": 0.01171863, "auxiliary_loss_mlp": 0.01022665, "balance_loss_clip": 1.04697692, "balance_loss_mlp": 1.01511884, "epoch": 0.6716767871099621, "flos": 15850431918720.0, "grad_norm": 2.6387821665956985, "language_loss": 0.77316111, "learning_rate": 1.0280122260055684e-06, "loss": 0.79510641, "num_input_tokens_seen": 120220780, "step": 5586, "time_per_iteration": 2.5618083477020264 }, { "auxiliary_loss_clip": 0.01174133, "auxiliary_loss_mlp": 0.0102469, "balance_loss_clip": 1.04990554, "balance_loss_mlp": 1.01681042, "epoch": 0.6717970300006012, "flos": 19756112330880.0, "grad_norm": 1.9781844718713213, "language_loss": 0.82349598, "learning_rate": 1.0273315079664652e-06, "loss": 0.84548414, "num_input_tokens_seen": 120238735, "step": 5587, "time_per_iteration": 2.561619520187378 }, { "auxiliary_loss_clip": 0.01160297, "auxiliary_loss_mlp": 0.01028199, "balance_loss_clip": 1.04704726, "balance_loss_mlp": 1.02090645, "epoch": 0.6719172728912403, "flos": 25485049146240.0, "grad_norm": 2.480801711658586, "language_loss": 0.74358392, "learning_rate": 1.0266509374803992e-06, "loss": 0.7654689, "num_input_tokens_seen": 120259895, "step": 5588, "time_per_iteration": 2.7013163566589355 }, { "auxiliary_loss_clip": 0.01173365, "auxiliary_loss_mlp": 0.00763063, "balance_loss_clip": 1.04824317, "balance_loss_mlp": 1.00061631, "epoch": 0.6720375157818794, "flos": 15880344969600.0, "grad_norm": 2.295028816792916, "language_loss": 0.84097105, "learning_rate": 1.0259705146506123e-06, "loss": 0.86033535, "num_input_tokens_seen": 120274790, "step": 5589, "time_per_iteration": 2.5371172428131104 }, { "auxiliary_loss_clip": 0.01159391, "auxiliary_loss_mlp": 0.01027818, "balance_loss_clip": 1.0460279, "balance_loss_mlp": 1.01998281, "epoch": 0.6721577586725185, "flos": 32010843231360.0, "grad_norm": 2.644940252836303, "language_loss": 0.77787066, "learning_rate": 1.025290239580324e-06, "loss": 0.79974282, "num_input_tokens_seen": 120295460, "step": 5590, "time_per_iteration": 2.732830286026001 }, { "auxiliary_loss_clip": 0.01115731, "auxiliary_loss_mlp": 0.01022062, "balance_loss_clip": 1.04243672, "balance_loss_mlp": 1.01473343, "epoch": 0.6722780015631575, "flos": 20737873837440.0, "grad_norm": 1.7007563597323496, "language_loss": 0.75632882, "learning_rate": 1.0246101123727313e-06, "loss": 0.77770668, "num_input_tokens_seen": 120314440, "step": 5591, "time_per_iteration": 2.8272366523742676 }, { "auxiliary_loss_clip": 0.01159759, "auxiliary_loss_mlp": 0.01024891, "balance_loss_clip": 1.04777563, "balance_loss_mlp": 1.01782763, "epoch": 0.6723982444537967, "flos": 16909617191040.0, "grad_norm": 1.9507592644277296, "language_loss": 0.78894389, "learning_rate": 1.0239301331310085e-06, "loss": 0.81079042, "num_input_tokens_seen": 120332060, "step": 5592, "time_per_iteration": 2.6467511653900146 }, { "auxiliary_loss_clip": 0.01156266, "auxiliary_loss_mlp": 0.01026384, "balance_loss_clip": 1.04741931, "balance_loss_mlp": 1.01888609, "epoch": 0.6725184873444358, "flos": 20667812359680.0, "grad_norm": 2.2218773924535764, "language_loss": 0.88432622, "learning_rate": 1.0232503019583088e-06, "loss": 0.90615273, "num_input_tokens_seen": 120351670, "step": 5593, "time_per_iteration": 2.653676748275757 }, { "auxiliary_loss_clip": 0.01155032, "auxiliary_loss_mlp": 0.01029669, "balance_loss_clip": 1.04781914, "balance_loss_mlp": 1.02135766, "epoch": 0.6726387302350748, "flos": 23727616416000.0, "grad_norm": 1.8048514000805427, "language_loss": 0.69855452, "learning_rate": 1.0225706189577619e-06, "loss": 0.72040153, "num_input_tokens_seen": 120370195, "step": 5594, "time_per_iteration": 2.6803197860717773 }, { "auxiliary_loss_clip": 0.0116055, "auxiliary_loss_mlp": 0.01024622, "balance_loss_clip": 1.04869354, "balance_loss_mlp": 1.01669478, "epoch": 0.672758973125714, "flos": 15188274650880.0, "grad_norm": 2.0774472263956296, "language_loss": 0.74735409, "learning_rate": 1.021891084232475e-06, "loss": 0.76920581, "num_input_tokens_seen": 120388130, "step": 5595, "time_per_iteration": 2.640676736831665 }, { "auxiliary_loss_clip": 0.01159559, "auxiliary_loss_mlp": 0.01025277, "balance_loss_clip": 1.04673696, "balance_loss_mlp": 1.01788592, "epoch": 0.672879216016353, "flos": 18077252601600.0, "grad_norm": 2.5382746610715645, "language_loss": 0.80094409, "learning_rate": 1.0212116978855325e-06, "loss": 0.82279247, "num_input_tokens_seen": 120406145, "step": 5596, "time_per_iteration": 3.6259071826934814 }, { "auxiliary_loss_clip": 0.01127309, "auxiliary_loss_mlp": 0.01027907, "balance_loss_clip": 1.04250932, "balance_loss_mlp": 1.02058721, "epoch": 0.6729994589069921, "flos": 23476349802240.0, "grad_norm": 1.6732178573438625, "language_loss": 0.78808749, "learning_rate": 1.020532460019997e-06, "loss": 0.80963969, "num_input_tokens_seen": 120425395, "step": 5597, "time_per_iteration": 2.7591142654418945 }, { "auxiliary_loss_clip": 0.01091124, "auxiliary_loss_mlp": 0.01030202, "balance_loss_clip": 1.04088092, "balance_loss_mlp": 1.02293622, "epoch": 0.6731197017976313, "flos": 26322018929280.0, "grad_norm": 1.9225533321404857, "language_loss": 0.70734155, "learning_rate": 1.0198533707389096e-06, "loss": 0.72855479, "num_input_tokens_seen": 120446270, "step": 5598, "time_per_iteration": 2.962085723876953 }, { "auxiliary_loss_clip": 0.01156736, "auxiliary_loss_mlp": 0.00762741, "balance_loss_clip": 1.04813623, "balance_loss_mlp": 1.00054765, "epoch": 0.6732399446882703, "flos": 21616428591360.0, "grad_norm": 1.979576538143776, "language_loss": 0.73137981, "learning_rate": 1.0191744301452853e-06, "loss": 0.75057459, "num_input_tokens_seen": 120465570, "step": 5599, "time_per_iteration": 3.135270118713379 }, { "auxiliary_loss_clip": 0.01171699, "auxiliary_loss_mlp": 0.01027092, "balance_loss_clip": 1.04660749, "balance_loss_mlp": 1.01965916, "epoch": 0.6733601875789094, "flos": 25880173729920.0, "grad_norm": 1.7644782552543798, "language_loss": 0.704988, "learning_rate": 1.0184956383421208e-06, "loss": 0.72697592, "num_input_tokens_seen": 120484220, "step": 5600, "time_per_iteration": 4.532362461090088 }, { "auxiliary_loss_clip": 0.01163875, "auxiliary_loss_mlp": 0.01030475, "balance_loss_clip": 1.05010533, "balance_loss_mlp": 1.0225774, "epoch": 0.6734804304695485, "flos": 22929573997440.0, "grad_norm": 2.980124969205153, "language_loss": 0.65551782, "learning_rate": 1.017816995432387e-06, "loss": 0.67746127, "num_input_tokens_seen": 120503320, "step": 5601, "time_per_iteration": 2.6792614459991455 }, { "auxiliary_loss_clip": 0.0114318, "auxiliary_loss_mlp": 0.01025237, "balance_loss_clip": 1.04496193, "balance_loss_mlp": 1.0177747, "epoch": 0.6736006733601876, "flos": 18697968552960.0, "grad_norm": 2.3164402357509775, "language_loss": 0.74617624, "learning_rate": 1.0171385015190353e-06, "loss": 0.76786035, "num_input_tokens_seen": 120523180, "step": 5602, "time_per_iteration": 2.7164807319641113 }, { "auxiliary_loss_clip": 0.01140382, "auxiliary_loss_mlp": 0.00761838, "balance_loss_clip": 1.04821968, "balance_loss_mlp": 1.00047779, "epoch": 0.6737209162508266, "flos": 19427745173760.0, "grad_norm": 2.0824687294503494, "language_loss": 0.73097634, "learning_rate": 1.0164601567049908e-06, "loss": 0.74999857, "num_input_tokens_seen": 120541710, "step": 5603, "time_per_iteration": 2.657412052154541 }, { "auxiliary_loss_clip": 0.01143356, "auxiliary_loss_mlp": 0.01035625, "balance_loss_clip": 1.04636931, "balance_loss_mlp": 1.02740562, "epoch": 0.6738411591414658, "flos": 20158060498560.0, "grad_norm": 1.7380537021400595, "language_loss": 0.80120832, "learning_rate": 1.015781961093158e-06, "loss": 0.82299817, "num_input_tokens_seen": 120561030, "step": 5604, "time_per_iteration": 3.6905157566070557 }, { "auxiliary_loss_clip": 0.01146615, "auxiliary_loss_mlp": 0.01026144, "balance_loss_clip": 1.04210949, "balance_loss_mlp": 1.0189141, "epoch": 0.6739614020321049, "flos": 21653847584640.0, "grad_norm": 1.6547121668972335, "language_loss": 0.77087247, "learning_rate": 1.0151039147864197e-06, "loss": 0.7926001, "num_input_tokens_seen": 120581005, "step": 5605, "time_per_iteration": 2.735487222671509 }, { "auxiliary_loss_clip": 0.01086711, "auxiliary_loss_mlp": 0.01021433, "balance_loss_clip": 1.04620957, "balance_loss_mlp": 1.0133028, "epoch": 0.6740816449227439, "flos": 19171702051200.0, "grad_norm": 1.9949174297970291, "language_loss": 0.66060144, "learning_rate": 1.0144260178876336e-06, "loss": 0.68168288, "num_input_tokens_seen": 120600350, "step": 5606, "time_per_iteration": 2.8177146911621094 }, { "auxiliary_loss_clip": 0.01151305, "auxiliary_loss_mlp": 0.01028392, "balance_loss_clip": 1.04727447, "balance_loss_mlp": 1.02103949, "epoch": 0.6742018878133831, "flos": 21097015971840.0, "grad_norm": 2.4496737697392774, "language_loss": 0.66957605, "learning_rate": 1.0137482704996388e-06, "loss": 0.69137299, "num_input_tokens_seen": 120614700, "step": 5607, "time_per_iteration": 2.681610345840454 }, { "auxiliary_loss_clip": 0.01130912, "auxiliary_loss_mlp": 0.01025113, "balance_loss_clip": 1.04421699, "balance_loss_mlp": 1.01700675, "epoch": 0.6743221307040221, "flos": 23549966726400.0, "grad_norm": 2.164569296270932, "language_loss": 0.78940463, "learning_rate": 1.0130706727252461e-06, "loss": 0.81096482, "num_input_tokens_seen": 120631755, "step": 5608, "time_per_iteration": 2.689283847808838 }, { "auxiliary_loss_clip": 0.01135769, "auxiliary_loss_mlp": 0.01030542, "balance_loss_clip": 1.04579091, "balance_loss_mlp": 1.02293038, "epoch": 0.6744423735946612, "flos": 16249542912000.0, "grad_norm": 2.5762382794721397, "language_loss": 0.68218058, "learning_rate": 1.0123932246672468e-06, "loss": 0.70384371, "num_input_tokens_seen": 120645900, "step": 5609, "time_per_iteration": 2.722545623779297 }, { "auxiliary_loss_clip": 0.01026009, "auxiliary_loss_mlp": 0.00754235, "balance_loss_clip": 1.01242709, "balance_loss_mlp": 1.00050151, "epoch": 0.6745626164853004, "flos": 57843257829120.0, "grad_norm": 0.8028472716722818, "language_loss": 0.55793214, "learning_rate": 1.0117159264284114e-06, "loss": 0.57573462, "num_input_tokens_seen": 120709070, "step": 5610, "time_per_iteration": 3.229762554168701 }, { "auxiliary_loss_clip": 0.01146399, "auxiliary_loss_mlp": 0.01027652, "balance_loss_clip": 1.04584312, "balance_loss_mlp": 1.01959038, "epoch": 0.6746828593759394, "flos": 20485027025280.0, "grad_norm": 1.6199596471313982, "language_loss": 0.77036834, "learning_rate": 1.0110387781114837e-06, "loss": 0.79210889, "num_input_tokens_seen": 120727685, "step": 5611, "time_per_iteration": 2.740227460861206 }, { "auxiliary_loss_clip": 0.01171686, "auxiliary_loss_mlp": 0.01032665, "balance_loss_clip": 1.04830587, "balance_loss_mlp": 1.02478504, "epoch": 0.6748031022665785, "flos": 19208223204480.0, "grad_norm": 1.9751819457893176, "language_loss": 0.77098811, "learning_rate": 1.0103617798191872e-06, "loss": 0.79303169, "num_input_tokens_seen": 120747160, "step": 5612, "time_per_iteration": 2.667494058609009 }, { "auxiliary_loss_clip": 0.01141049, "auxiliary_loss_mlp": 0.01027562, "balance_loss_clip": 1.04592025, "balance_loss_mlp": 1.01973629, "epoch": 0.6749233451572175, "flos": 15195026407680.0, "grad_norm": 2.4898386321441786, "language_loss": 0.83021438, "learning_rate": 1.0096849316542217e-06, "loss": 0.85190046, "num_input_tokens_seen": 120763710, "step": 5613, "time_per_iteration": 2.6229987144470215 }, { "auxiliary_loss_clip": 0.01073739, "auxiliary_loss_mlp": 0.01025765, "balance_loss_clip": 1.03619325, "balance_loss_mlp": 1.0179987, "epoch": 0.6750435880478567, "flos": 26499489050880.0, "grad_norm": 5.548352424766525, "language_loss": 0.74533129, "learning_rate": 1.0090082337192643e-06, "loss": 0.76632631, "num_input_tokens_seen": 120783355, "step": 5614, "time_per_iteration": 2.8566932678222656 }, { "auxiliary_loss_clip": 0.01094433, "auxiliary_loss_mlp": 0.01020686, "balance_loss_clip": 1.03553474, "balance_loss_mlp": 1.01347947, "epoch": 0.6751638309384957, "flos": 23404313076480.0, "grad_norm": 1.9250115902088827, "language_loss": 0.78154719, "learning_rate": 1.0083316861169705e-06, "loss": 0.80269837, "num_input_tokens_seen": 120802090, "step": 5615, "time_per_iteration": 2.7658333778381348 }, { "auxiliary_loss_clip": 0.01135365, "auxiliary_loss_mlp": 0.01028758, "balance_loss_clip": 1.04158676, "balance_loss_mlp": 1.02096164, "epoch": 0.6752840738291348, "flos": 23441408847360.0, "grad_norm": 2.055620239732859, "language_loss": 0.71661985, "learning_rate": 1.0076552889499713e-06, "loss": 0.7382611, "num_input_tokens_seen": 120822855, "step": 5616, "time_per_iteration": 2.7836246490478516 }, { "auxiliary_loss_clip": 0.01158776, "auxiliary_loss_mlp": 0.01029412, "balance_loss_clip": 1.05043197, "balance_loss_mlp": 1.02183604, "epoch": 0.675404316719774, "flos": 30335826257280.0, "grad_norm": 2.0190496489226657, "language_loss": 0.73920786, "learning_rate": 1.006979042320876e-06, "loss": 0.7610898, "num_input_tokens_seen": 120843070, "step": 5617, "time_per_iteration": 2.713481903076172 }, { "auxiliary_loss_clip": 0.01136754, "auxiliary_loss_mlp": 0.0102295, "balance_loss_clip": 1.04049695, "balance_loss_mlp": 1.01558542, "epoch": 0.675524559610413, "flos": 23622613983360.0, "grad_norm": 2.328651265977191, "language_loss": 0.63237411, "learning_rate": 1.0063029463322702e-06, "loss": 0.6539712, "num_input_tokens_seen": 120863345, "step": 5618, "time_per_iteration": 2.6995837688446045 }, { "auxiliary_loss_clip": 0.01110937, "auxiliary_loss_mlp": 0.00762991, "balance_loss_clip": 1.0410403, "balance_loss_mlp": 1.00058484, "epoch": 0.6756448025010521, "flos": 21248631279360.0, "grad_norm": 2.3740472012629104, "language_loss": 0.75816512, "learning_rate": 1.0056270010867164e-06, "loss": 0.77690446, "num_input_tokens_seen": 120880915, "step": 5619, "time_per_iteration": 2.740149736404419 }, { "auxiliary_loss_clip": 0.0114697, "auxiliary_loss_mlp": 0.01024427, "balance_loss_clip": 1.04335666, "balance_loss_mlp": 1.01630926, "epoch": 0.6757650453916912, "flos": 21646521210240.0, "grad_norm": 2.6011823669195886, "language_loss": 0.78155828, "learning_rate": 1.004951206686758e-06, "loss": 0.80327225, "num_input_tokens_seen": 120899190, "step": 5620, "time_per_iteration": 2.673902750015259 }, { "auxiliary_loss_clip": 0.01155951, "auxiliary_loss_mlp": 0.01032167, "balance_loss_clip": 1.04764855, "balance_loss_mlp": 1.0249306, "epoch": 0.6758852882823303, "flos": 21795658479360.0, "grad_norm": 2.49675271649647, "language_loss": 0.71811426, "learning_rate": 1.0042755632349087e-06, "loss": 0.73999548, "num_input_tokens_seen": 120916080, "step": 5621, "time_per_iteration": 2.653812885284424 }, { "auxiliary_loss_clip": 0.01131986, "auxiliary_loss_mlp": 0.01030446, "balance_loss_clip": 1.04598165, "balance_loss_mlp": 1.02251828, "epoch": 0.6760055311729694, "flos": 27088783580160.0, "grad_norm": 2.2330127973169156, "language_loss": 0.62687755, "learning_rate": 1.0036000708336653e-06, "loss": 0.64850187, "num_input_tokens_seen": 120935210, "step": 5622, "time_per_iteration": 2.710482597351074 }, { "auxiliary_loss_clip": 0.01148303, "auxiliary_loss_mlp": 0.01025053, "balance_loss_clip": 1.04722953, "balance_loss_mlp": 1.01777864, "epoch": 0.6761257740636085, "flos": 17999792922240.0, "grad_norm": 3.943259077005317, "language_loss": 0.79268008, "learning_rate": 1.0029247295854984e-06, "loss": 0.81441367, "num_input_tokens_seen": 120951830, "step": 5623, "time_per_iteration": 3.613487482070923 }, { "auxiliary_loss_clip": 0.0113431, "auxiliary_loss_mlp": 0.01027593, "balance_loss_clip": 1.04503393, "balance_loss_mlp": 1.02085233, "epoch": 0.6762460169542476, "flos": 15121912273920.0, "grad_norm": 1.830363545364406, "language_loss": 0.71552223, "learning_rate": 1.0022495395928588e-06, "loss": 0.73714125, "num_input_tokens_seen": 120970310, "step": 5624, "time_per_iteration": 2.7659568786621094 }, { "auxiliary_loss_clip": 0.01072208, "auxiliary_loss_mlp": 0.01001471, "balance_loss_clip": 1.01518321, "balance_loss_mlp": 1.00026691, "epoch": 0.6763662598448866, "flos": 67886970030720.0, "grad_norm": 0.836156658565272, "language_loss": 0.62328517, "learning_rate": 1.0015745009581697e-06, "loss": 0.64402199, "num_input_tokens_seen": 121031915, "step": 5625, "time_per_iteration": 4.278688669204712 }, { "auxiliary_loss_clip": 0.01158074, "auxiliary_loss_mlp": 0.01025334, "balance_loss_clip": 1.04876542, "balance_loss_mlp": 1.01793969, "epoch": 0.6764865027355258, "flos": 20631829910400.0, "grad_norm": 2.802326638138946, "language_loss": 0.66845995, "learning_rate": 1.0008996137838343e-06, "loss": 0.69029403, "num_input_tokens_seen": 121050890, "step": 5626, "time_per_iteration": 3.5700061321258545 }, { "auxiliary_loss_clip": 0.01176387, "auxiliary_loss_mlp": 0.01025554, "balance_loss_clip": 1.04930329, "balance_loss_mlp": 1.01737905, "epoch": 0.6766067456261649, "flos": 21215809226880.0, "grad_norm": 1.9251392710950224, "language_loss": 0.79554623, "learning_rate": 1.000224878172234e-06, "loss": 0.81756568, "num_input_tokens_seen": 121070015, "step": 5627, "time_per_iteration": 2.637594699859619 }, { "auxiliary_loss_clip": 0.01160703, "auxiliary_loss_mlp": 0.0102381, "balance_loss_clip": 1.04735982, "balance_loss_mlp": 1.01661563, "epoch": 0.6767269885168039, "flos": 19938251220480.0, "grad_norm": 1.9654144533019624, "language_loss": 0.72909075, "learning_rate": 9.99550294225724e-07, "loss": 0.75093585, "num_input_tokens_seen": 121089170, "step": 5628, "time_per_iteration": 2.62931227684021 }, { "auxiliary_loss_clip": 0.01117666, "auxiliary_loss_mlp": 0.01025496, "balance_loss_clip": 1.04049051, "balance_loss_mlp": 1.01773608, "epoch": 0.6768472314074431, "flos": 20814076540800.0, "grad_norm": 2.137186621327736, "language_loss": 0.7285136, "learning_rate": 9.988758620466402e-07, "loss": 0.74994516, "num_input_tokens_seen": 121108040, "step": 5629, "time_per_iteration": 2.7638378143310547 }, { "auxiliary_loss_clip": 0.01110145, "auxiliary_loss_mlp": 0.01026942, "balance_loss_clip": 1.04242206, "balance_loss_mlp": 1.02050757, "epoch": 0.6769674742980821, "flos": 23186012169600.0, "grad_norm": 1.5324629585452674, "language_loss": 0.76249516, "learning_rate": 9.982015817372917e-07, "loss": 0.78386599, "num_input_tokens_seen": 121128480, "step": 5630, "time_per_iteration": 3.647925615310669 }, { "auxiliary_loss_clip": 0.01115857, "auxiliary_loss_mlp": 0.01026612, "balance_loss_clip": 1.04196262, "balance_loss_mlp": 1.01862526, "epoch": 0.6770877171887212, "flos": 24242934885120.0, "grad_norm": 2.25630976152184, "language_loss": 0.82313728, "learning_rate": 9.975274533999657e-07, "loss": 0.84456193, "num_input_tokens_seen": 121148010, "step": 5631, "time_per_iteration": 2.7790493965148926 }, { "auxiliary_loss_clip": 0.01175293, "auxiliary_loss_mlp": 0.01021043, "balance_loss_clip": 1.04929769, "balance_loss_mlp": 1.01350284, "epoch": 0.6772079600793603, "flos": 18141567903360.0, "grad_norm": 2.5255062704283384, "language_loss": 0.83988953, "learning_rate": 9.96853477136929e-07, "loss": 0.86185288, "num_input_tokens_seen": 121162755, "step": 5632, "time_per_iteration": 2.5044758319854736 }, { "auxiliary_loss_clip": 0.01121418, "auxiliary_loss_mlp": 0.01027861, "balance_loss_clip": 1.04045367, "balance_loss_mlp": 1.02011228, "epoch": 0.6773282029699994, "flos": 22452069571200.0, "grad_norm": 2.597752941017765, "language_loss": 0.75254214, "learning_rate": 9.96179653050422e-07, "loss": 0.77403498, "num_input_tokens_seen": 121182915, "step": 5633, "time_per_iteration": 2.7011470794677734 }, { "auxiliary_loss_clip": 0.01122076, "auxiliary_loss_mlp": 0.01023913, "balance_loss_clip": 1.04330277, "balance_loss_mlp": 1.01656973, "epoch": 0.6774484458606385, "flos": 18693730748160.0, "grad_norm": 2.069292776332559, "language_loss": 0.74108005, "learning_rate": 9.955059812426635e-07, "loss": 0.76253998, "num_input_tokens_seen": 121200445, "step": 5634, "time_per_iteration": 2.7119178771972656 }, { "auxiliary_loss_clip": 0.01175202, "auxiliary_loss_mlp": 0.01021383, "balance_loss_clip": 1.05220079, "balance_loss_mlp": 1.01373613, "epoch": 0.6775686887512776, "flos": 25994046821760.0, "grad_norm": 2.1806452243813754, "language_loss": 0.83088183, "learning_rate": 9.948324618158493e-07, "loss": 0.8528477, "num_input_tokens_seen": 121220785, "step": 5635, "time_per_iteration": 2.6580445766448975 }, { "auxiliary_loss_clip": 0.01159701, "auxiliary_loss_mlp": 0.01021827, "balance_loss_clip": 1.04570699, "balance_loss_mlp": 1.01421213, "epoch": 0.6776889316419167, "flos": 13587987922560.0, "grad_norm": 2.5433321248273475, "language_loss": 0.77866387, "learning_rate": 9.941590948721502e-07, "loss": 0.80047917, "num_input_tokens_seen": 121237985, "step": 5636, "time_per_iteration": 2.7068536281585693 }, { "auxiliary_loss_clip": 0.01139215, "auxiliary_loss_mlp": 0.01024649, "balance_loss_clip": 1.04508567, "balance_loss_mlp": 1.01782393, "epoch": 0.6778091745325557, "flos": 27601121220480.0, "grad_norm": 3.1535602705567243, "language_loss": 0.76572204, "learning_rate": 9.934858805137188e-07, "loss": 0.78736067, "num_input_tokens_seen": 121258635, "step": 5637, "time_per_iteration": 2.701462507247925 }, { "auxiliary_loss_clip": 0.01155338, "auxiliary_loss_mlp": 0.01024991, "balance_loss_clip": 1.04881716, "balance_loss_mlp": 1.01788664, "epoch": 0.6779294174231949, "flos": 18734058743040.0, "grad_norm": 1.6146172095275984, "language_loss": 0.80485415, "learning_rate": 9.92812818842677e-07, "loss": 0.82665741, "num_input_tokens_seen": 121277810, "step": 5638, "time_per_iteration": 2.6393420696258545 }, { "auxiliary_loss_clip": 0.01154541, "auxiliary_loss_mlp": 0.01023544, "balance_loss_clip": 1.04615104, "balance_loss_mlp": 1.01603127, "epoch": 0.678049660313834, "flos": 45873797765760.0, "grad_norm": 1.6808301606635099, "language_loss": 0.64113212, "learning_rate": 9.921399099611306e-07, "loss": 0.66291308, "num_input_tokens_seen": 121298975, "step": 5639, "time_per_iteration": 2.8180606365203857 }, { "auxiliary_loss_clip": 0.01146285, "auxiliary_loss_mlp": 0.0102666, "balance_loss_clip": 1.04446542, "balance_loss_mlp": 1.01953411, "epoch": 0.678169903204473, "flos": 19974556892160.0, "grad_norm": 1.5023220297590467, "language_loss": 0.6898362, "learning_rate": 9.914671539711588e-07, "loss": 0.71156561, "num_input_tokens_seen": 121318495, "step": 5640, "time_per_iteration": 2.6690948009490967 }, { "auxiliary_loss_clip": 0.01076271, "auxiliary_loss_mlp": 0.00763117, "balance_loss_clip": 1.04072905, "balance_loss_mlp": 1.0005827, "epoch": 0.6782901460951122, "flos": 21395613732480.0, "grad_norm": 2.516338416050967, "language_loss": 0.78301537, "learning_rate": 9.90794550974817e-07, "loss": 0.80140924, "num_input_tokens_seen": 121338890, "step": 5641, "time_per_iteration": 2.991572618484497 }, { "auxiliary_loss_clip": 0.01125203, "auxiliary_loss_mlp": 0.01028683, "balance_loss_clip": 1.04332697, "balance_loss_mlp": 1.02106583, "epoch": 0.6784103889857512, "flos": 21434002392960.0, "grad_norm": 2.1587111522706843, "language_loss": 0.81142557, "learning_rate": 9.901221010741407e-07, "loss": 0.83296442, "num_input_tokens_seen": 121358210, "step": 5642, "time_per_iteration": 3.0120208263397217 }, { "auxiliary_loss_clip": 0.01159589, "auxiliary_loss_mlp": 0.01026701, "balance_loss_clip": 1.04600012, "balance_loss_mlp": 1.01890516, "epoch": 0.6785306318763903, "flos": 32671923091200.0, "grad_norm": 1.8589368385098235, "language_loss": 0.74617702, "learning_rate": 9.894498043711375e-07, "loss": 0.76803994, "num_input_tokens_seen": 121379955, "step": 5643, "time_per_iteration": 2.710144281387329 }, { "auxiliary_loss_clip": 0.01139678, "auxiliary_loss_mlp": 0.01024011, "balance_loss_clip": 1.04272366, "balance_loss_mlp": 1.01705503, "epoch": 0.6786508747670293, "flos": 25632139340160.0, "grad_norm": 2.181876029897192, "language_loss": 0.69281673, "learning_rate": 9.887776609677962e-07, "loss": 0.71445364, "num_input_tokens_seen": 121401325, "step": 5644, "time_per_iteration": 2.70442271232605 }, { "auxiliary_loss_clip": 0.01117848, "auxiliary_loss_mlp": 0.01024846, "balance_loss_clip": 1.03898442, "balance_loss_mlp": 1.01788163, "epoch": 0.6787711176576685, "flos": 19171881619200.0, "grad_norm": 1.8204059257647656, "language_loss": 0.72461414, "learning_rate": 9.88105670966079e-07, "loss": 0.74604106, "num_input_tokens_seen": 121419785, "step": 5645, "time_per_iteration": 2.6737377643585205 }, { "auxiliary_loss_clip": 0.01103784, "auxiliary_loss_mlp": 0.01029451, "balance_loss_clip": 1.04238033, "balance_loss_mlp": 1.02170277, "epoch": 0.6788913605483076, "flos": 13985159581440.0, "grad_norm": 1.865303635923441, "language_loss": 0.78592896, "learning_rate": 9.874338344679283e-07, "loss": 0.80726129, "num_input_tokens_seen": 121435630, "step": 5646, "time_per_iteration": 2.763004779815674 }, { "auxiliary_loss_clip": 0.01168932, "auxiliary_loss_mlp": 0.01021636, "balance_loss_clip": 1.04797268, "balance_loss_mlp": 1.01463842, "epoch": 0.6790116034389466, "flos": 22017586659840.0, "grad_norm": 1.9825905582205374, "language_loss": 0.73949498, "learning_rate": 9.86762151575259e-07, "loss": 0.7614007, "num_input_tokens_seen": 121455625, "step": 5647, "time_per_iteration": 2.5971169471740723 }, { "auxiliary_loss_clip": 0.01117395, "auxiliary_loss_mlp": 0.00761542, "balance_loss_clip": 1.04415154, "balance_loss_mlp": 1.00056803, "epoch": 0.6791318463295858, "flos": 20922454851840.0, "grad_norm": 1.5489370430254157, "language_loss": 0.80035377, "learning_rate": 9.860906223899651e-07, "loss": 0.81914318, "num_input_tokens_seen": 121475020, "step": 5648, "time_per_iteration": 4.093108415603638 }, { "auxiliary_loss_clip": 0.01149502, "auxiliary_loss_mlp": 0.01023873, "balance_loss_clip": 1.04635417, "balance_loss_mlp": 1.01621127, "epoch": 0.6792520892202248, "flos": 28512749422080.0, "grad_norm": 1.7122207010124544, "language_loss": 0.75649381, "learning_rate": 9.854192470139184e-07, "loss": 0.77822757, "num_input_tokens_seen": 121496500, "step": 5649, "time_per_iteration": 2.7800652980804443 }, { "auxiliary_loss_clip": 0.01142862, "auxiliary_loss_mlp": 0.01029395, "balance_loss_clip": 1.04773569, "balance_loss_mlp": 1.02199864, "epoch": 0.6793723321108639, "flos": 20011904058240.0, "grad_norm": 2.274358264515876, "language_loss": 0.71743375, "learning_rate": 9.847480255489645e-07, "loss": 0.73915637, "num_input_tokens_seen": 121515525, "step": 5650, "time_per_iteration": 2.6828715801239014 }, { "auxiliary_loss_clip": 0.01149634, "auxiliary_loss_mlp": 0.0102404, "balance_loss_clip": 1.04641342, "balance_loss_mlp": 1.01610708, "epoch": 0.6794925750015031, "flos": 26649488246400.0, "grad_norm": 2.001881194683384, "language_loss": 0.69215226, "learning_rate": 9.840769580969295e-07, "loss": 0.713889, "num_input_tokens_seen": 121535965, "step": 5651, "time_per_iteration": 3.757110357284546 }, { "auxiliary_loss_clip": 0.01150815, "auxiliary_loss_mlp": 0.01026317, "balance_loss_clip": 1.04547119, "balance_loss_mlp": 1.01937556, "epoch": 0.6796128178921421, "flos": 21580374314880.0, "grad_norm": 1.7948272544713886, "language_loss": 0.79974496, "learning_rate": 9.834060447596114e-07, "loss": 0.82151628, "num_input_tokens_seen": 121555235, "step": 5652, "time_per_iteration": 3.6410605907440186 }, { "auxiliary_loss_clip": 0.01158784, "auxiliary_loss_mlp": 0.01026511, "balance_loss_clip": 1.04612184, "balance_loss_mlp": 1.01904249, "epoch": 0.6797330607827812, "flos": 22492002516480.0, "grad_norm": 1.936913262209491, "language_loss": 0.78392363, "learning_rate": 9.827352856387868e-07, "loss": 0.8057766, "num_input_tokens_seen": 121574945, "step": 5653, "time_per_iteration": 2.703279972076416 }, { "auxiliary_loss_clip": 0.0102152, "auxiliary_loss_mlp": 0.01001352, "balance_loss_clip": 1.01080024, "balance_loss_mlp": 1.00034463, "epoch": 0.6798533036734203, "flos": 66306648286080.0, "grad_norm": 0.776518237138747, "language_loss": 0.64263749, "learning_rate": 9.820646808362118e-07, "loss": 0.66286623, "num_input_tokens_seen": 121641200, "step": 5654, "time_per_iteration": 3.5675435066223145 }, { "auxiliary_loss_clip": 0.01142564, "auxiliary_loss_mlp": 0.01025125, "balance_loss_clip": 1.04781771, "balance_loss_mlp": 1.01753092, "epoch": 0.6799735465640594, "flos": 16180163792640.0, "grad_norm": 3.0512343888240934, "language_loss": 0.72713155, "learning_rate": 9.813942304536154e-07, "loss": 0.74880838, "num_input_tokens_seen": 121659170, "step": 5655, "time_per_iteration": 2.9382100105285645 }, { "auxiliary_loss_clip": 0.01145245, "auxiliary_loss_mlp": 0.01025823, "balance_loss_clip": 1.04557276, "balance_loss_mlp": 1.01865304, "epoch": 0.6800937894546984, "flos": 22125749489280.0, "grad_norm": 1.9382594220587697, "language_loss": 0.63420528, "learning_rate": 9.807239345927043e-07, "loss": 0.65591598, "num_input_tokens_seen": 121679180, "step": 5656, "time_per_iteration": 3.642383337020874 }, { "auxiliary_loss_clip": 0.01143145, "auxiliary_loss_mlp": 0.01023079, "balance_loss_clip": 1.04080808, "balance_loss_mlp": 1.0159359, "epoch": 0.6802140323453376, "flos": 31612953300480.0, "grad_norm": 2.3544822754541346, "language_loss": 0.71936029, "learning_rate": 9.80053793355162e-07, "loss": 0.74102253, "num_input_tokens_seen": 121697875, "step": 5657, "time_per_iteration": 2.70095157623291 }, { "auxiliary_loss_clip": 0.01113406, "auxiliary_loss_mlp": 0.01028638, "balance_loss_clip": 1.04283965, "balance_loss_mlp": 1.02161658, "epoch": 0.6803342752359767, "flos": 17712938908800.0, "grad_norm": 1.9147920958907063, "language_loss": 0.75011462, "learning_rate": 9.793838068426472e-07, "loss": 0.77153504, "num_input_tokens_seen": 121715570, "step": 5658, "time_per_iteration": 2.699640989303589 }, { "auxiliary_loss_clip": 0.01172915, "auxiliary_loss_mlp": 0.0102626, "balance_loss_clip": 1.05005503, "balance_loss_mlp": 1.01858628, "epoch": 0.6804545181266157, "flos": 11326800902400.0, "grad_norm": 2.6163517279601365, "language_loss": 0.61370015, "learning_rate": 9.78713975156799e-07, "loss": 0.63569188, "num_input_tokens_seen": 121731435, "step": 5659, "time_per_iteration": 2.5309886932373047 }, { "auxiliary_loss_clip": 0.01129764, "auxiliary_loss_mlp": 0.01035833, "balance_loss_clip": 1.04552627, "balance_loss_mlp": 1.02764344, "epoch": 0.6805747610172549, "flos": 29350976181120.0, "grad_norm": 2.1139411281544898, "language_loss": 0.71942472, "learning_rate": 9.780442983992273e-07, "loss": 0.74108064, "num_input_tokens_seen": 121749950, "step": 5660, "time_per_iteration": 2.7488083839416504 }, { "auxiliary_loss_clip": 0.01136992, "auxiliary_loss_mlp": 0.01025463, "balance_loss_clip": 1.04471433, "balance_loss_mlp": 1.01806593, "epoch": 0.680695003907894, "flos": 37631868612480.0, "grad_norm": 1.6756182778319337, "language_loss": 0.71810937, "learning_rate": 9.773747766715238e-07, "loss": 0.73973393, "num_input_tokens_seen": 121770770, "step": 5661, "time_per_iteration": 2.76985239982605 }, { "auxiliary_loss_clip": 0.01145286, "auxiliary_loss_mlp": 0.0102618, "balance_loss_clip": 1.04491341, "balance_loss_mlp": 1.01906002, "epoch": 0.680815246798533, "flos": 22127365601280.0, "grad_norm": 1.8661195166632119, "language_loss": 0.80098188, "learning_rate": 9.767054100752536e-07, "loss": 0.82269651, "num_input_tokens_seen": 121790720, "step": 5662, "time_per_iteration": 2.683082103729248 }, { "auxiliary_loss_clip": 0.01135598, "auxiliary_loss_mlp": 0.0102786, "balance_loss_clip": 1.04683471, "balance_loss_mlp": 1.01979518, "epoch": 0.6809354896891722, "flos": 17201822330880.0, "grad_norm": 2.6649592074581823, "language_loss": 0.82033217, "learning_rate": 9.760361987119584e-07, "loss": 0.84196675, "num_input_tokens_seen": 121808455, "step": 5663, "time_per_iteration": 2.6260786056518555 }, { "auxiliary_loss_clip": 0.01141779, "auxiliary_loss_mlp": 0.01028435, "balance_loss_clip": 1.04434037, "balance_loss_mlp": 1.02111018, "epoch": 0.6810557325798112, "flos": 12458166554880.0, "grad_norm": 1.9163305750440445, "language_loss": 0.67881477, "learning_rate": 9.753671426831592e-07, "loss": 0.70051694, "num_input_tokens_seen": 121824470, "step": 5664, "time_per_iteration": 2.6702826023101807 }, { "auxiliary_loss_clip": 0.01150164, "auxiliary_loss_mlp": 0.01022969, "balance_loss_clip": 1.04367602, "balance_loss_mlp": 1.01538467, "epoch": 0.6811759754704503, "flos": 22156165330560.0, "grad_norm": 1.7746016606696051, "language_loss": 0.79496086, "learning_rate": 9.746982420903483e-07, "loss": 0.81669223, "num_input_tokens_seen": 121842665, "step": 5665, "time_per_iteration": 2.6420562267303467 }, { "auxiliary_loss_clip": 0.01153332, "auxiliary_loss_mlp": 0.01022843, "balance_loss_clip": 1.04755592, "balance_loss_mlp": 1.01566625, "epoch": 0.6812962183610894, "flos": 17525377065600.0, "grad_norm": 1.5641216031806788, "language_loss": 0.74531984, "learning_rate": 9.740294970349993e-07, "loss": 0.76708156, "num_input_tokens_seen": 121859080, "step": 5666, "time_per_iteration": 2.596804141998291 }, { "auxiliary_loss_clip": 0.01051439, "auxiliary_loss_mlp": 0.01001088, "balance_loss_clip": 1.01521552, "balance_loss_mlp": 0.99990219, "epoch": 0.6814164612517285, "flos": 60274480855680.0, "grad_norm": 0.8791396548599791, "language_loss": 0.609375, "learning_rate": 9.733609076185594e-07, "loss": 0.62990028, "num_input_tokens_seen": 121915485, "step": 5667, "time_per_iteration": 3.191171407699585 }, { "auxiliary_loss_clip": 0.01158668, "auxiliary_loss_mlp": 0.01029567, "balance_loss_clip": 1.04920232, "balance_loss_mlp": 1.02225983, "epoch": 0.6815367041423676, "flos": 19317750750720.0, "grad_norm": 2.3178167145229156, "language_loss": 0.83698928, "learning_rate": 9.72692473942455e-07, "loss": 0.85887158, "num_input_tokens_seen": 121932710, "step": 5668, "time_per_iteration": 2.5936012268066406 }, { "auxiliary_loss_clip": 0.01118334, "auxiliary_loss_mlp": 0.01031653, "balance_loss_clip": 1.0444665, "balance_loss_mlp": 1.02386284, "epoch": 0.6816569470330067, "flos": 22161696024960.0, "grad_norm": 1.8809093624210158, "language_loss": 0.77643377, "learning_rate": 9.720241961080849e-07, "loss": 0.79793364, "num_input_tokens_seen": 121952025, "step": 5669, "time_per_iteration": 2.7268805503845215 }, { "auxiliary_loss_clip": 0.01170842, "auxiliary_loss_mlp": 0.01023632, "balance_loss_clip": 1.04630017, "balance_loss_mlp": 1.01635385, "epoch": 0.6817771899236458, "flos": 41463501137280.0, "grad_norm": 2.3465616045051694, "language_loss": 0.73154581, "learning_rate": 9.713560742168259e-07, "loss": 0.75349057, "num_input_tokens_seen": 121974650, "step": 5670, "time_per_iteration": 2.7638742923736572 }, { "auxiliary_loss_clip": 0.01127728, "auxiliary_loss_mlp": 0.01028179, "balance_loss_clip": 1.04249024, "balance_loss_mlp": 1.02112162, "epoch": 0.6818974328142848, "flos": 21106138026240.0, "grad_norm": 2.1085115005182966, "language_loss": 0.71552289, "learning_rate": 9.706881083700333e-07, "loss": 0.73708194, "num_input_tokens_seen": 121994335, "step": 5671, "time_per_iteration": 2.7430710792541504 }, { "auxiliary_loss_clip": 0.01105009, "auxiliary_loss_mlp": 0.01029421, "balance_loss_clip": 1.04692984, "balance_loss_mlp": 1.02201843, "epoch": 0.682017675704924, "flos": 20441897769600.0, "grad_norm": 2.0297460941932193, "language_loss": 0.82766551, "learning_rate": 9.700202986690357e-07, "loss": 0.84900981, "num_input_tokens_seen": 122012635, "step": 5672, "time_per_iteration": 2.736157178878784 }, { "auxiliary_loss_clip": 0.0115523, "auxiliary_loss_mlp": 0.00763187, "balance_loss_clip": 1.04613256, "balance_loss_mlp": 1.00050354, "epoch": 0.682137918595563, "flos": 20044438801920.0, "grad_norm": 2.012110426060409, "language_loss": 0.66537356, "learning_rate": 9.693526452151413e-07, "loss": 0.68455768, "num_input_tokens_seen": 122031685, "step": 5673, "time_per_iteration": 2.618607759475708 }, { "auxiliary_loss_clip": 0.01134199, "auxiliary_loss_mlp": 0.01029013, "balance_loss_clip": 1.04316175, "balance_loss_mlp": 1.02069521, "epoch": 0.6822581614862021, "flos": 31684559063040.0, "grad_norm": 1.72202880667935, "language_loss": 0.75748175, "learning_rate": 9.686851481096305e-07, "loss": 0.77911389, "num_input_tokens_seen": 122052995, "step": 5674, "time_per_iteration": 2.7543230056762695 }, { "auxiliary_loss_clip": 0.01097837, "auxiliary_loss_mlp": 0.01025086, "balance_loss_clip": 1.04049587, "balance_loss_mlp": 1.01799941, "epoch": 0.6823784043768413, "flos": 23477570864640.0, "grad_norm": 3.241500907025541, "language_loss": 0.71582299, "learning_rate": 9.68017807453762e-07, "loss": 0.7370522, "num_input_tokens_seen": 122071740, "step": 5675, "time_per_iteration": 3.686222791671753 }, { "auxiliary_loss_clip": 0.01145806, "auxiliary_loss_mlp": 0.00761925, "balance_loss_clip": 1.04733658, "balance_loss_mlp": 1.00053525, "epoch": 0.6824986472674803, "flos": 14137134024960.0, "grad_norm": 1.7721310378460424, "language_loss": 0.73100466, "learning_rate": 9.673506233487721e-07, "loss": 0.7500819, "num_input_tokens_seen": 122089705, "step": 5676, "time_per_iteration": 2.654435396194458 }, { "auxiliary_loss_clip": 0.01139828, "auxiliary_loss_mlp": 0.00761364, "balance_loss_clip": 1.04323721, "balance_loss_mlp": 1.00043535, "epoch": 0.6826188901581194, "flos": 21504997624320.0, "grad_norm": 1.7545153933056559, "language_loss": 0.86041021, "learning_rate": 9.666835958958717e-07, "loss": 0.87942207, "num_input_tokens_seen": 122109025, "step": 5677, "time_per_iteration": 3.641615867614746 }, { "auxiliary_loss_clip": 0.01172598, "auxiliary_loss_mlp": 0.01028933, "balance_loss_clip": 1.04999781, "balance_loss_mlp": 1.02171826, "epoch": 0.6827391330487584, "flos": 20810126044800.0, "grad_norm": 2.0538615358795123, "language_loss": 0.80001456, "learning_rate": 9.660167251962484e-07, "loss": 0.82202989, "num_input_tokens_seen": 122127385, "step": 5678, "time_per_iteration": 3.876284122467041 }, { "auxiliary_loss_clip": 0.0113001, "auxiliary_loss_mlp": 0.01026598, "balance_loss_clip": 1.04181337, "balance_loss_mlp": 1.01905179, "epoch": 0.6828593759393976, "flos": 21688788539520.0, "grad_norm": 3.4666103398676515, "language_loss": 0.77829558, "learning_rate": 9.653500113510654e-07, "loss": 0.79986167, "num_input_tokens_seen": 122146500, "step": 5679, "time_per_iteration": 2.8046910762786865 }, { "auxiliary_loss_clip": 0.01138774, "auxiliary_loss_mlp": 0.01026323, "balance_loss_clip": 1.04438424, "balance_loss_mlp": 1.01851463, "epoch": 0.6829796188300367, "flos": 25337707557120.0, "grad_norm": 2.4728975050452493, "language_loss": 0.66926688, "learning_rate": 9.646834544614627e-07, "loss": 0.69091785, "num_input_tokens_seen": 122167000, "step": 5680, "time_per_iteration": 2.743065118789673 }, { "auxiliary_loss_clip": 0.01137556, "auxiliary_loss_mlp": 0.01026141, "balance_loss_clip": 1.04600501, "balance_loss_mlp": 1.01911712, "epoch": 0.6830998617206757, "flos": 20704800389760.0, "grad_norm": 1.8890178556877153, "language_loss": 0.7643345, "learning_rate": 9.64017054628558e-07, "loss": 0.78597152, "num_input_tokens_seen": 122185825, "step": 5681, "time_per_iteration": 2.7170660495758057 }, { "auxiliary_loss_clip": 0.01116844, "auxiliary_loss_mlp": 0.01022845, "balance_loss_clip": 1.0408299, "balance_loss_mlp": 1.01560903, "epoch": 0.6832201046113149, "flos": 21726638496000.0, "grad_norm": 1.6219080112646582, "language_loss": 0.79226035, "learning_rate": 9.63350811953441e-07, "loss": 0.81365728, "num_input_tokens_seen": 122206200, "step": 5682, "time_per_iteration": 3.703184127807617 }, { "auxiliary_loss_clip": 0.01128588, "auxiliary_loss_mlp": 0.01022768, "balance_loss_clip": 1.04280543, "balance_loss_mlp": 1.01560998, "epoch": 0.6833403475019539, "flos": 19536554448000.0, "grad_norm": 2.0095693061307154, "language_loss": 0.7041325, "learning_rate": 9.626847265371826e-07, "loss": 0.72564602, "num_input_tokens_seen": 122225520, "step": 5683, "time_per_iteration": 2.7108614444732666 }, { "auxiliary_loss_clip": 0.01135439, "auxiliary_loss_mlp": 0.01029047, "balance_loss_clip": 1.04371369, "balance_loss_mlp": 1.0219841, "epoch": 0.683460590392593, "flos": 19352153001600.0, "grad_norm": 2.22697690436209, "language_loss": 0.78513336, "learning_rate": 9.620187984808262e-07, "loss": 0.80677819, "num_input_tokens_seen": 122244320, "step": 5684, "time_per_iteration": 2.6867051124572754 }, { "auxiliary_loss_clip": 0.01141947, "auxiliary_loss_mlp": 0.00762254, "balance_loss_clip": 1.04582334, "balance_loss_mlp": 1.00049114, "epoch": 0.6835808332832322, "flos": 23288500650240.0, "grad_norm": 1.5923121859707137, "language_loss": 0.85760677, "learning_rate": 9.613530278853919e-07, "loss": 0.87664872, "num_input_tokens_seen": 122264295, "step": 5685, "time_per_iteration": 2.6723363399505615 }, { "auxiliary_loss_clip": 0.01155158, "auxiliary_loss_mlp": 0.01025441, "balance_loss_clip": 1.04608166, "balance_loss_mlp": 1.01794815, "epoch": 0.6837010761738712, "flos": 21653416621440.0, "grad_norm": 2.128330854449357, "language_loss": 0.74369961, "learning_rate": 9.60687414851879e-07, "loss": 0.76550555, "num_input_tokens_seen": 122285300, "step": 5686, "time_per_iteration": 2.721127510070801 }, { "auxiliary_loss_clip": 0.01146618, "auxiliary_loss_mlp": 0.01020524, "balance_loss_clip": 1.04677749, "balance_loss_mlp": 1.01340723, "epoch": 0.6838213190645103, "flos": 17566387418880.0, "grad_norm": 2.4030622748177386, "language_loss": 0.77631301, "learning_rate": 9.600219594812575e-07, "loss": 0.79798442, "num_input_tokens_seen": 122303240, "step": 5687, "time_per_iteration": 2.612215757369995 }, { "auxiliary_loss_clip": 0.01167789, "auxiliary_loss_mlp": 0.01021945, "balance_loss_clip": 1.04639864, "balance_loss_mlp": 1.01520669, "epoch": 0.6839415619551494, "flos": 23112538899840.0, "grad_norm": 1.6659532567714697, "language_loss": 0.73219025, "learning_rate": 9.593566618744786e-07, "loss": 0.75408763, "num_input_tokens_seen": 122323390, "step": 5688, "time_per_iteration": 2.697328567504883 }, { "auxiliary_loss_clip": 0.0117033, "auxiliary_loss_mlp": 0.01026277, "balance_loss_clip": 1.04767549, "balance_loss_mlp": 1.0192827, "epoch": 0.6840618048457885, "flos": 22127868391680.0, "grad_norm": 1.8070888608928692, "language_loss": 0.74025744, "learning_rate": 9.58691522132466e-07, "loss": 0.76222354, "num_input_tokens_seen": 122342200, "step": 5689, "time_per_iteration": 2.731128692626953 }, { "auxiliary_loss_clip": 0.01147689, "auxiliary_loss_mlp": 0.01028782, "balance_loss_clip": 1.04698014, "balance_loss_mlp": 1.02074695, "epoch": 0.6841820477364275, "flos": 22015898720640.0, "grad_norm": 1.8983394835647847, "language_loss": 0.84542161, "learning_rate": 9.58026540356123e-07, "loss": 0.86718631, "num_input_tokens_seen": 122360465, "step": 5690, "time_per_iteration": 2.6994621753692627 }, { "auxiliary_loss_clip": 0.01159485, "auxiliary_loss_mlp": 0.01030826, "balance_loss_clip": 1.0470922, "balance_loss_mlp": 1.0232501, "epoch": 0.6843022906270667, "flos": 24900531125760.0, "grad_norm": 1.7522910108898768, "language_loss": 0.86624551, "learning_rate": 9.573617166463246e-07, "loss": 0.88814867, "num_input_tokens_seen": 122381680, "step": 5691, "time_per_iteration": 2.6156387329101562 }, { "auxiliary_loss_clip": 0.01143867, "auxiliary_loss_mlp": 0.01022727, "balance_loss_clip": 1.04390967, "balance_loss_mlp": 1.015589, "epoch": 0.6844225335177058, "flos": 19969924037760.0, "grad_norm": 1.9802643085676674, "language_loss": 0.60204363, "learning_rate": 9.56697051103924e-07, "loss": 0.62370956, "num_input_tokens_seen": 122399120, "step": 5692, "time_per_iteration": 2.709254741668701 }, { "auxiliary_loss_clip": 0.01140398, "auxiliary_loss_mlp": 0.01023931, "balance_loss_clip": 1.04520822, "balance_loss_mlp": 1.0170083, "epoch": 0.6845427764083448, "flos": 25883334126720.0, "grad_norm": 2.6402464139957273, "language_loss": 0.81170344, "learning_rate": 9.560325438297522e-07, "loss": 0.83334672, "num_input_tokens_seen": 122417430, "step": 5693, "time_per_iteration": 2.727132797241211 }, { "auxiliary_loss_clip": 0.01145328, "auxiliary_loss_mlp": 0.01029598, "balance_loss_clip": 1.04922044, "balance_loss_mlp": 1.02237093, "epoch": 0.684663019298984, "flos": 18880143356160.0, "grad_norm": 1.793128764344129, "language_loss": 0.8682822, "learning_rate": 9.553681949246127e-07, "loss": 0.89003146, "num_input_tokens_seen": 122435055, "step": 5694, "time_per_iteration": 2.6896986961364746 }, { "auxiliary_loss_clip": 0.01133121, "auxiliary_loss_mlp": 0.01032732, "balance_loss_clip": 1.04412532, "balance_loss_mlp": 1.02498317, "epoch": 0.684783262189623, "flos": 54193725302400.0, "grad_norm": 3.1243937621601368, "language_loss": 0.7535789, "learning_rate": 9.547040044892886e-07, "loss": 0.77523744, "num_input_tokens_seen": 122462570, "step": 5695, "time_per_iteration": 2.9682672023773193 }, { "auxiliary_loss_clip": 0.01064069, "auxiliary_loss_mlp": 0.01001487, "balance_loss_clip": 1.01525855, "balance_loss_mlp": 1.00038993, "epoch": 0.6849035050802621, "flos": 63970264143360.0, "grad_norm": 0.889399092057962, "language_loss": 0.60026091, "learning_rate": 9.540399726245354e-07, "loss": 0.62091649, "num_input_tokens_seen": 122519275, "step": 5696, "time_per_iteration": 3.134632110595703 }, { "auxiliary_loss_clip": 0.01138132, "auxiliary_loss_mlp": 0.01033022, "balance_loss_clip": 1.04255843, "balance_loss_mlp": 1.02486205, "epoch": 0.6850237479709013, "flos": 25224121774080.0, "grad_norm": 1.715351670355798, "language_loss": 0.69027495, "learning_rate": 9.533760994310859e-07, "loss": 0.71198654, "num_input_tokens_seen": 122539675, "step": 5697, "time_per_iteration": 2.7083899974823 }, { "auxiliary_loss_clip": 0.01173102, "auxiliary_loss_mlp": 0.01027902, "balance_loss_clip": 1.0499891, "balance_loss_mlp": 1.02046001, "epoch": 0.6851439908615403, "flos": 19354128249600.0, "grad_norm": 2.0593683469824144, "language_loss": 0.75367975, "learning_rate": 9.527123850096508e-07, "loss": 0.77568984, "num_input_tokens_seen": 122558035, "step": 5698, "time_per_iteration": 2.596379280090332 }, { "auxiliary_loss_clip": 0.01160713, "auxiliary_loss_mlp": 0.01025849, "balance_loss_clip": 1.04713178, "balance_loss_mlp": 1.01818347, "epoch": 0.6852642337521794, "flos": 23182133500800.0, "grad_norm": 1.9454362585602496, "language_loss": 0.72072613, "learning_rate": 9.520488294609142e-07, "loss": 0.74259174, "num_input_tokens_seen": 122576815, "step": 5699, "time_per_iteration": 2.6695423126220703 }, { "auxiliary_loss_clip": 0.01028831, "auxiliary_loss_mlp": 0.01000353, "balance_loss_clip": 1.01564789, "balance_loss_mlp": 0.99925613, "epoch": 0.6853844766428185, "flos": 62647206583680.0, "grad_norm": 0.7460187283212975, "language_loss": 0.53850186, "learning_rate": 9.513854328855368e-07, "loss": 0.55879366, "num_input_tokens_seen": 122634690, "step": 5700, "time_per_iteration": 4.187113285064697 }, { "auxiliary_loss_clip": 0.01166535, "auxiliary_loss_mlp": 0.01024922, "balance_loss_clip": 1.04606843, "balance_loss_mlp": 1.01811206, "epoch": 0.6855047195334576, "flos": 23437242869760.0, "grad_norm": 2.240416034551167, "language_loss": 0.81059515, "learning_rate": 9.507221953841558e-07, "loss": 0.8325097, "num_input_tokens_seen": 122652320, "step": 5701, "time_per_iteration": 2.6337695121765137 }, { "auxiliary_loss_clip": 0.01158838, "auxiliary_loss_mlp": 0.0102879, "balance_loss_clip": 1.04937375, "balance_loss_mlp": 1.02112198, "epoch": 0.6856249624240967, "flos": 20664831530880.0, "grad_norm": 1.5458304651233254, "language_loss": 0.77718824, "learning_rate": 9.500591170573824e-07, "loss": 0.79906452, "num_input_tokens_seen": 122672340, "step": 5702, "time_per_iteration": 2.6468875408172607 }, { "auxiliary_loss_clip": 0.01110074, "auxiliary_loss_mlp": 0.01031455, "balance_loss_clip": 1.04255259, "balance_loss_mlp": 1.02405798, "epoch": 0.6857452053147358, "flos": 17087302794240.0, "grad_norm": 1.7882005014539861, "language_loss": 0.74121797, "learning_rate": 9.493961980058078e-07, "loss": 0.76263326, "num_input_tokens_seen": 122689935, "step": 5703, "time_per_iteration": 3.720088481903076 }, { "auxiliary_loss_clip": 0.01084907, "auxiliary_loss_mlp": 0.01029389, "balance_loss_clip": 1.03781486, "balance_loss_mlp": 1.02223933, "epoch": 0.6858654482053749, "flos": 30847266057600.0, "grad_norm": 2.006910432939039, "language_loss": 0.67408401, "learning_rate": 9.48733438329993e-07, "loss": 0.69522697, "num_input_tokens_seen": 122710200, "step": 5704, "time_per_iteration": 3.7168586254119873 }, { "auxiliary_loss_clip": 0.01165883, "auxiliary_loss_mlp": 0.00761941, "balance_loss_clip": 1.04607677, "balance_loss_mlp": 1.00047445, "epoch": 0.6859856910960139, "flos": 28877314510080.0, "grad_norm": 1.8309177183552732, "language_loss": 0.74290049, "learning_rate": 9.480708381304807e-07, "loss": 0.76217866, "num_input_tokens_seen": 122731495, "step": 5705, "time_per_iteration": 2.716273546218872 }, { "auxiliary_loss_clip": 0.01111941, "auxiliary_loss_mlp": 0.01028979, "balance_loss_clip": 1.04432106, "balance_loss_mlp": 1.02158821, "epoch": 0.6861059339866531, "flos": 19354523299200.0, "grad_norm": 2.080948890723887, "language_loss": 0.83523762, "learning_rate": 9.474083975077858e-07, "loss": 0.85664678, "num_input_tokens_seen": 122748620, "step": 5706, "time_per_iteration": 2.758047103881836 }, { "auxiliary_loss_clip": 0.01148495, "auxiliary_loss_mlp": 0.01027597, "balance_loss_clip": 1.04325771, "balance_loss_mlp": 1.01974118, "epoch": 0.6862261768772921, "flos": 22199976944640.0, "grad_norm": 2.4798279225019964, "language_loss": 0.79818344, "learning_rate": 9.467461165623994e-07, "loss": 0.81994438, "num_input_tokens_seen": 122767670, "step": 5707, "time_per_iteration": 2.684525966644287 }, { "auxiliary_loss_clip": 0.01157608, "auxiliary_loss_mlp": 0.01022942, "balance_loss_clip": 1.04471564, "balance_loss_mlp": 1.01603365, "epoch": 0.6863464197679312, "flos": 26285677344000.0, "grad_norm": 1.9734357954421704, "language_loss": 0.79620779, "learning_rate": 9.46083995394791e-07, "loss": 0.81801331, "num_input_tokens_seen": 122785480, "step": 5708, "time_per_iteration": 3.5906710624694824 }, { "auxiliary_loss_clip": 0.01155023, "auxiliary_loss_mlp": 0.00762429, "balance_loss_clip": 1.04584002, "balance_loss_mlp": 1.00050402, "epoch": 0.6864666626585703, "flos": 37815228564480.0, "grad_norm": 1.861802094353502, "language_loss": 0.63544214, "learning_rate": 9.454220341054012e-07, "loss": 0.65461659, "num_input_tokens_seen": 122810265, "step": 5709, "time_per_iteration": 2.801804304122925 }, { "auxiliary_loss_clip": 0.0112688, "auxiliary_loss_mlp": 0.01020225, "balance_loss_clip": 1.04417765, "balance_loss_mlp": 1.01297688, "epoch": 0.6865869055492094, "flos": 19391152193280.0, "grad_norm": 2.1575769234007667, "language_loss": 0.808079, "learning_rate": 9.447602327946512e-07, "loss": 0.82955009, "num_input_tokens_seen": 122828905, "step": 5710, "time_per_iteration": 2.7218356132507324 }, { "auxiliary_loss_clip": 0.01139465, "auxiliary_loss_mlp": 0.01029572, "balance_loss_clip": 1.04249001, "balance_loss_mlp": 1.02236855, "epoch": 0.6867071484398485, "flos": 20375966355840.0, "grad_norm": 2.0236449765862985, "language_loss": 0.76173794, "learning_rate": 9.440985915629338e-07, "loss": 0.78342831, "num_input_tokens_seen": 122846235, "step": 5711, "time_per_iteration": 2.75539231300354 }, { "auxiliary_loss_clip": 0.01170466, "auxiliary_loss_mlp": 0.01029888, "balance_loss_clip": 1.05113208, "balance_loss_mlp": 1.02255082, "epoch": 0.6868273913304875, "flos": 15889143801600.0, "grad_norm": 1.984058464985516, "language_loss": 0.72873223, "learning_rate": 9.434371105106223e-07, "loss": 0.75073576, "num_input_tokens_seen": 122863835, "step": 5712, "time_per_iteration": 2.582150936126709 }, { "auxiliary_loss_clip": 0.01125513, "auxiliary_loss_mlp": 0.0102871, "balance_loss_clip": 1.04236531, "balance_loss_mlp": 1.02134025, "epoch": 0.6869476342211267, "flos": 24462492768000.0, "grad_norm": 1.8414592482800163, "language_loss": 0.70862424, "learning_rate": 9.427757897380602e-07, "loss": 0.73016655, "num_input_tokens_seen": 122883235, "step": 5713, "time_per_iteration": 2.703075647354126 }, { "auxiliary_loss_clip": 0.01124297, "auxiliary_loss_mlp": 0.01028468, "balance_loss_clip": 1.04399896, "balance_loss_mlp": 1.02109444, "epoch": 0.6870678771117658, "flos": 18442571875200.0, "grad_norm": 2.324107640267756, "language_loss": 0.85204375, "learning_rate": 9.421146293455695e-07, "loss": 0.8735714, "num_input_tokens_seen": 122898975, "step": 5714, "time_per_iteration": 2.6754708290100098 }, { "auxiliary_loss_clip": 0.0113927, "auxiliary_loss_mlp": 0.01026785, "balance_loss_clip": 1.04270577, "balance_loss_mlp": 1.01930773, "epoch": 0.6871881200024048, "flos": 22200371994240.0, "grad_norm": 2.2832674638598793, "language_loss": 0.68466806, "learning_rate": 9.414536294334489e-07, "loss": 0.70632863, "num_input_tokens_seen": 122918995, "step": 5715, "time_per_iteration": 2.6638376712799072 }, { "auxiliary_loss_clip": 0.01142655, "auxiliary_loss_mlp": 0.01021432, "balance_loss_clip": 1.04214966, "balance_loss_mlp": 1.0142529, "epoch": 0.687308362893044, "flos": 22127724737280.0, "grad_norm": 3.056115292540584, "language_loss": 0.6950435, "learning_rate": 9.407927901019708e-07, "loss": 0.7166844, "num_input_tokens_seen": 122938125, "step": 5716, "time_per_iteration": 2.6696979999542236 }, { "auxiliary_loss_clip": 0.01156576, "auxiliary_loss_mlp": 0.0101916, "balance_loss_clip": 1.04554141, "balance_loss_mlp": 1.01202512, "epoch": 0.687428605783683, "flos": 25040546340480.0, "grad_norm": 1.9491692070173496, "language_loss": 0.7668159, "learning_rate": 9.401321114513854e-07, "loss": 0.78857321, "num_input_tokens_seen": 122957020, "step": 5717, "time_per_iteration": 2.7206945419311523 }, { "auxiliary_loss_clip": 0.01170404, "auxiliary_loss_mlp": 0.01022601, "balance_loss_clip": 1.04830825, "balance_loss_mlp": 1.01526093, "epoch": 0.6875488486743221, "flos": 23770063313280.0, "grad_norm": 1.5735856314233618, "language_loss": 0.75703877, "learning_rate": 9.394715935819155e-07, "loss": 0.77896881, "num_input_tokens_seen": 122977410, "step": 5718, "time_per_iteration": 2.694150924682617 }, { "auxiliary_loss_clip": 0.01158226, "auxiliary_loss_mlp": 0.01029792, "balance_loss_clip": 1.04577506, "balance_loss_mlp": 1.02249658, "epoch": 0.6876690915649613, "flos": 25516937445120.0, "grad_norm": 2.208763144324675, "language_loss": 0.6308552, "learning_rate": 9.388112365937608e-07, "loss": 0.65273535, "num_input_tokens_seen": 122996875, "step": 5719, "time_per_iteration": 2.650710344314575 }, { "auxiliary_loss_clip": 0.01127517, "auxiliary_loss_mlp": 0.01029474, "balance_loss_clip": 1.04376447, "balance_loss_mlp": 1.02203536, "epoch": 0.6877893344556003, "flos": 19427996568960.0, "grad_norm": 2.102158939882556, "language_loss": 0.82977903, "learning_rate": 9.381510405870985e-07, "loss": 0.851349, "num_input_tokens_seen": 123015890, "step": 5720, "time_per_iteration": 2.6253960132598877 }, { "auxiliary_loss_clip": 0.01152266, "auxiliary_loss_mlp": 0.01026504, "balance_loss_clip": 1.04386973, "balance_loss_mlp": 1.01880336, "epoch": 0.6879095773462394, "flos": 18661303745280.0, "grad_norm": 2.210083621518969, "language_loss": 0.77206844, "learning_rate": 9.374910056620791e-07, "loss": 0.79385608, "num_input_tokens_seen": 123034955, "step": 5721, "time_per_iteration": 2.596195936203003 }, { "auxiliary_loss_clip": 0.0115964, "auxiliary_loss_mlp": 0.01026307, "balance_loss_clip": 1.04889154, "balance_loss_mlp": 1.01908612, "epoch": 0.6880298202368785, "flos": 20883132437760.0, "grad_norm": 1.668424449285381, "language_loss": 0.81079853, "learning_rate": 9.368311319188293e-07, "loss": 0.83265799, "num_input_tokens_seen": 123052770, "step": 5722, "time_per_iteration": 2.622199773788452 }, { "auxiliary_loss_clip": 0.01125939, "auxiliary_loss_mlp": 0.01029932, "balance_loss_clip": 1.04082859, "balance_loss_mlp": 1.02249956, "epoch": 0.6881500631275176, "flos": 30153292318080.0, "grad_norm": 1.7376272257595549, "language_loss": 0.79443508, "learning_rate": 9.361714194574515e-07, "loss": 0.81599379, "num_input_tokens_seen": 123075105, "step": 5723, "time_per_iteration": 2.830580234527588 }, { "auxiliary_loss_clip": 0.01071431, "auxiliary_loss_mlp": 0.01000864, "balance_loss_clip": 1.01435232, "balance_loss_mlp": 0.99970782, "epoch": 0.6882703060181566, "flos": 66181537215360.0, "grad_norm": 0.7325801815379657, "language_loss": 0.58247733, "learning_rate": 9.355118683780228e-07, "loss": 0.60320026, "num_input_tokens_seen": 123145175, "step": 5724, "time_per_iteration": 3.356541156768799 }, { "auxiliary_loss_clip": 0.01167453, "auxiliary_loss_mlp": 0.01025868, "balance_loss_clip": 1.04545915, "balance_loss_mlp": 1.01847148, "epoch": 0.6883905489087958, "flos": 18214646123520.0, "grad_norm": 2.069491869916693, "language_loss": 0.79184371, "learning_rate": 9.348524787805987e-07, "loss": 0.81377697, "num_input_tokens_seen": 123160365, "step": 5725, "time_per_iteration": 2.595020055770874 }, { "auxiliary_loss_clip": 0.01127641, "auxiliary_loss_mlp": 0.01024662, "balance_loss_clip": 1.03955984, "balance_loss_mlp": 1.01712847, "epoch": 0.6885107917994349, "flos": 14056262553600.0, "grad_norm": 2.756202906673922, "language_loss": 0.85753345, "learning_rate": 9.341932507652053e-07, "loss": 0.87905645, "num_input_tokens_seen": 123174855, "step": 5726, "time_per_iteration": 3.6474852561950684 }, { "auxiliary_loss_clip": 0.01138934, "auxiliary_loss_mlp": 0.01028919, "balance_loss_clip": 1.04128242, "balance_loss_mlp": 1.02103889, "epoch": 0.6886310346900739, "flos": 28690722334080.0, "grad_norm": 1.9642043354668144, "language_loss": 0.78523219, "learning_rate": 9.335341844318489e-07, "loss": 0.80691063, "num_input_tokens_seen": 123194995, "step": 5727, "time_per_iteration": 2.6935832500457764 }, { "auxiliary_loss_clip": 0.01139595, "auxiliary_loss_mlp": 0.01025846, "balance_loss_clip": 1.04453015, "balance_loss_mlp": 1.01855016, "epoch": 0.6887512775807131, "flos": 24535319592960.0, "grad_norm": 1.768022677084799, "language_loss": 0.73250121, "learning_rate": 9.328752798805091e-07, "loss": 0.75415564, "num_input_tokens_seen": 123213465, "step": 5728, "time_per_iteration": 3.6679186820983887 }, { "auxiliary_loss_clip": 0.01155508, "auxiliary_loss_mlp": 0.01027645, "balance_loss_clip": 1.04750586, "balance_loss_mlp": 1.01994956, "epoch": 0.6888715204713521, "flos": 22414363269120.0, "grad_norm": 2.086781438727497, "language_loss": 0.76339877, "learning_rate": 9.322165372111399e-07, "loss": 0.78523028, "num_input_tokens_seen": 123231610, "step": 5729, "time_per_iteration": 2.6472225189208984 }, { "auxiliary_loss_clip": 0.01120042, "auxiliary_loss_mlp": 0.010234, "balance_loss_clip": 1.04254651, "balance_loss_mlp": 1.01642346, "epoch": 0.6889917633619912, "flos": 22054323294720.0, "grad_norm": 2.0533270763093614, "language_loss": 0.75606412, "learning_rate": 9.315579565236747e-07, "loss": 0.7774986, "num_input_tokens_seen": 123250715, "step": 5730, "time_per_iteration": 3.6397438049316406 }, { "auxiliary_loss_clip": 0.01138428, "auxiliary_loss_mlp": 0.01026454, "balance_loss_clip": 1.04581404, "balance_loss_mlp": 1.0193876, "epoch": 0.6891120062526304, "flos": 23949724164480.0, "grad_norm": 1.7299420356839093, "language_loss": 0.74038839, "learning_rate": 9.308995379180162e-07, "loss": 0.76203722, "num_input_tokens_seen": 123270270, "step": 5731, "time_per_iteration": 2.7352676391601562 }, { "auxiliary_loss_clip": 0.01061763, "auxiliary_loss_mlp": 0.01000649, "balance_loss_clip": 1.01373482, "balance_loss_mlp": 0.99948663, "epoch": 0.6892322491432694, "flos": 64117354337280.0, "grad_norm": 0.7886640810767812, "language_loss": 0.59470785, "learning_rate": 9.302412814940488e-07, "loss": 0.61533189, "num_input_tokens_seen": 123333045, "step": 5732, "time_per_iteration": 3.2564289569854736 }, { "auxiliary_loss_clip": 0.01138893, "auxiliary_loss_mlp": 0.01030098, "balance_loss_clip": 1.04236758, "balance_loss_mlp": 1.02175283, "epoch": 0.6893524920339085, "flos": 23002436736000.0, "grad_norm": 4.099707294444931, "language_loss": 0.7166239, "learning_rate": 9.295831873516276e-07, "loss": 0.73831379, "num_input_tokens_seen": 123352320, "step": 5733, "time_per_iteration": 2.7194089889526367 }, { "auxiliary_loss_clip": 0.01168617, "auxiliary_loss_mlp": 0.01024131, "balance_loss_clip": 1.04778671, "balance_loss_mlp": 1.0172646, "epoch": 0.6894727349245476, "flos": 21396260177280.0, "grad_norm": 1.711387316726599, "language_loss": 0.76024401, "learning_rate": 9.289252555905873e-07, "loss": 0.78217149, "num_input_tokens_seen": 123372400, "step": 5734, "time_per_iteration": 3.679340362548828 }, { "auxiliary_loss_clip": 0.01158781, "auxiliary_loss_mlp": 0.01025338, "balance_loss_clip": 1.04971039, "balance_loss_mlp": 1.0177052, "epoch": 0.6895929778151867, "flos": 19865316654720.0, "grad_norm": 1.9719576260980314, "language_loss": 0.75710112, "learning_rate": 9.282674863107334e-07, "loss": 0.77894235, "num_input_tokens_seen": 123390215, "step": 5735, "time_per_iteration": 2.7184972763061523 }, { "auxiliary_loss_clip": 0.0115388, "auxiliary_loss_mlp": 0.0103066, "balance_loss_clip": 1.04709649, "balance_loss_mlp": 1.02316809, "epoch": 0.6897132207058257, "flos": 18179166464640.0, "grad_norm": 2.3308694341364418, "language_loss": 0.75719738, "learning_rate": 9.276098796118488e-07, "loss": 0.77904284, "num_input_tokens_seen": 123406700, "step": 5736, "time_per_iteration": 2.645996570587158 }, { "auxiliary_loss_clip": 0.01138836, "auxiliary_loss_mlp": 0.01027557, "balance_loss_clip": 1.044559, "balance_loss_mlp": 1.020908, "epoch": 0.6898334635964649, "flos": 32561641359360.0, "grad_norm": 1.9094504487960677, "language_loss": 0.66270196, "learning_rate": 9.269524355936938e-07, "loss": 0.68436587, "num_input_tokens_seen": 123429880, "step": 5737, "time_per_iteration": 2.8442065715789795 }, { "auxiliary_loss_clip": 0.01133201, "auxiliary_loss_mlp": 0.01024225, "balance_loss_clip": 1.04010749, "balance_loss_mlp": 1.01682222, "epoch": 0.689953706487104, "flos": 22819004956800.0, "grad_norm": 1.736307379372792, "language_loss": 0.85124457, "learning_rate": 9.262951543560002e-07, "loss": 0.87281883, "num_input_tokens_seen": 123449105, "step": 5738, "time_per_iteration": 2.743579626083374 }, { "auxiliary_loss_clip": 0.01140351, "auxiliary_loss_mlp": 0.01026328, "balance_loss_clip": 1.047979, "balance_loss_mlp": 1.01838219, "epoch": 0.690073949377743, "flos": 18515362786560.0, "grad_norm": 2.237044017953183, "language_loss": 0.8631804, "learning_rate": 9.256380359984795e-07, "loss": 0.88484716, "num_input_tokens_seen": 123466215, "step": 5739, "time_per_iteration": 2.6478874683380127 }, { "auxiliary_loss_clip": 0.01118548, "auxiliary_loss_mlp": 0.01023523, "balance_loss_clip": 1.03781796, "balance_loss_mlp": 1.01566696, "epoch": 0.6901941922683821, "flos": 34857194716800.0, "grad_norm": 2.0304094891933424, "language_loss": 0.75003648, "learning_rate": 9.249810806208139e-07, "loss": 0.7714572, "num_input_tokens_seen": 123485480, "step": 5740, "time_per_iteration": 2.858214855194092 }, { "auxiliary_loss_clip": 0.01108194, "auxiliary_loss_mlp": 0.00762072, "balance_loss_clip": 1.03632069, "balance_loss_mlp": 1.00045776, "epoch": 0.6903144351590212, "flos": 16253672976000.0, "grad_norm": 1.933036296954168, "language_loss": 0.80328119, "learning_rate": 9.243242883226627e-07, "loss": 0.82198387, "num_input_tokens_seen": 123504575, "step": 5741, "time_per_iteration": 2.7187411785125732 }, { "auxiliary_loss_clip": 0.01157477, "auxiliary_loss_mlp": 0.01027096, "balance_loss_clip": 1.04255795, "balance_loss_mlp": 1.01975858, "epoch": 0.6904346780496603, "flos": 28035137255040.0, "grad_norm": 1.8161343923149773, "language_loss": 0.69554293, "learning_rate": 9.236676592036628e-07, "loss": 0.71738863, "num_input_tokens_seen": 123524250, "step": 5742, "time_per_iteration": 2.6792664527893066 }, { "auxiliary_loss_clip": 0.01137212, "auxiliary_loss_mlp": 0.01027778, "balance_loss_clip": 1.04634643, "balance_loss_mlp": 1.02075338, "epoch": 0.6905549209402994, "flos": 23624266008960.0, "grad_norm": 1.8333666241987792, "language_loss": 0.73552585, "learning_rate": 9.230111933634228e-07, "loss": 0.75717568, "num_input_tokens_seen": 123545845, "step": 5743, "time_per_iteration": 2.6660268306732178 }, { "auxiliary_loss_clip": 0.01157747, "auxiliary_loss_mlp": 0.0103212, "balance_loss_clip": 1.04768682, "balance_loss_mlp": 1.02478838, "epoch": 0.6906751638309385, "flos": 23114945111040.0, "grad_norm": 1.483923879240007, "language_loss": 0.80768657, "learning_rate": 9.223548909015288e-07, "loss": 0.82958525, "num_input_tokens_seen": 123567535, "step": 5744, "time_per_iteration": 2.672898530960083 }, { "auxiliary_loss_clip": 0.0110788, "auxiliary_loss_mlp": 0.0102668, "balance_loss_clip": 1.04170871, "balance_loss_mlp": 1.01949763, "epoch": 0.6907954067215776, "flos": 27305468375040.0, "grad_norm": 1.8052687363118793, "language_loss": 0.72052419, "learning_rate": 9.216987519175407e-07, "loss": 0.74186981, "num_input_tokens_seen": 123587710, "step": 5745, "time_per_iteration": 2.776141405105591 }, { "auxiliary_loss_clip": 0.01152627, "auxiliary_loss_mlp": 0.01025222, "balance_loss_clip": 1.04731107, "balance_loss_mlp": 1.01841819, "epoch": 0.6909156496122166, "flos": 21689399070720.0, "grad_norm": 1.5954833753972755, "language_loss": 0.68410063, "learning_rate": 9.210427765109942e-07, "loss": 0.70587909, "num_input_tokens_seen": 123607385, "step": 5746, "time_per_iteration": 2.602827787399292 }, { "auxiliary_loss_clip": 0.011437, "auxiliary_loss_mlp": 0.01025801, "balance_loss_clip": 1.04393303, "balance_loss_mlp": 1.01824307, "epoch": 0.6910358925028558, "flos": 22561453463040.0, "grad_norm": 2.0213046251384363, "language_loss": 0.81125015, "learning_rate": 9.20386964781402e-07, "loss": 0.83294517, "num_input_tokens_seen": 123625405, "step": 5747, "time_per_iteration": 2.67291522026062 }, { "auxiliary_loss_clip": 0.01136116, "auxiliary_loss_mlp": 0.01024276, "balance_loss_clip": 1.04261875, "balance_loss_mlp": 1.01647687, "epoch": 0.6911561353934949, "flos": 22054107813120.0, "grad_norm": 2.229739579522858, "language_loss": 0.84490085, "learning_rate": 9.197313168282472e-07, "loss": 0.86650479, "num_input_tokens_seen": 123642850, "step": 5748, "time_per_iteration": 2.6242897510528564 }, { "auxiliary_loss_clip": 0.01150622, "auxiliary_loss_mlp": 0.0102487, "balance_loss_clip": 1.04338622, "balance_loss_mlp": 1.01778936, "epoch": 0.6912763782841339, "flos": 24206557386240.0, "grad_norm": 2.060458821778296, "language_loss": 0.72081953, "learning_rate": 9.190758327509935e-07, "loss": 0.74257445, "num_input_tokens_seen": 123661595, "step": 5749, "time_per_iteration": 2.631334066390991 }, { "auxiliary_loss_clip": 0.01031855, "auxiliary_loss_mlp": 0.00754296, "balance_loss_clip": 1.01249385, "balance_loss_mlp": 1.00018489, "epoch": 0.6913966211747731, "flos": 52329641091840.0, "grad_norm": 0.9288124407860343, "language_loss": 0.64404178, "learning_rate": 9.184205126490767e-07, "loss": 0.66190326, "num_input_tokens_seen": 123710490, "step": 5750, "time_per_iteration": 3.3088181018829346 }, { "auxiliary_loss_clip": 0.01041478, "auxiliary_loss_mlp": 0.00754132, "balance_loss_clip": 1.01475942, "balance_loss_mlp": 1.00024378, "epoch": 0.6915168640654121, "flos": 66741274851840.0, "grad_norm": 1.0944426119398258, "language_loss": 0.59633255, "learning_rate": 9.177653566219075e-07, "loss": 0.61428869, "num_input_tokens_seen": 123765215, "step": 5751, "time_per_iteration": 3.6083736419677734 }, { "auxiliary_loss_clip": 0.01129467, "auxiliary_loss_mlp": 0.01024081, "balance_loss_clip": 1.04137802, "balance_loss_mlp": 1.01704466, "epoch": 0.6916371069560512, "flos": 18296523175680.0, "grad_norm": 2.2201062898960777, "language_loss": 0.76399153, "learning_rate": 9.171103647688744e-07, "loss": 0.78552705, "num_input_tokens_seen": 123783955, "step": 5752, "time_per_iteration": 3.9796807765960693 }, { "auxiliary_loss_clip": 0.01078729, "auxiliary_loss_mlp": 0.0102313, "balance_loss_clip": 1.03839374, "balance_loss_mlp": 1.01623702, "epoch": 0.6917573498466904, "flos": 19645794685440.0, "grad_norm": 1.7999977643501113, "language_loss": 0.69300592, "learning_rate": 9.164555371893367e-07, "loss": 0.71402454, "num_input_tokens_seen": 123803885, "step": 5753, "time_per_iteration": 2.8831613063812256 }, { "auxiliary_loss_clip": 0.01155076, "auxiliary_loss_mlp": 0.00762123, "balance_loss_clip": 1.04694223, "balance_loss_mlp": 1.00048757, "epoch": 0.6918775927373294, "flos": 14210319985920.0, "grad_norm": 1.9860925236047187, "language_loss": 0.75482219, "learning_rate": 9.158008739826333e-07, "loss": 0.77399415, "num_input_tokens_seen": 123821485, "step": 5754, "time_per_iteration": 3.6422476768493652 }, { "auxiliary_loss_clip": 0.01139921, "auxiliary_loss_mlp": 0.0102638, "balance_loss_clip": 1.04579949, "balance_loss_mlp": 1.01906049, "epoch": 0.6919978356279685, "flos": 23985455218560.0, "grad_norm": 1.7817828261706212, "language_loss": 0.86741352, "learning_rate": 9.151463752480744e-07, "loss": 0.88907647, "num_input_tokens_seen": 123840215, "step": 5755, "time_per_iteration": 2.7939369678497314 }, { "auxiliary_loss_clip": 0.01116787, "auxiliary_loss_mlp": 0.01026402, "balance_loss_clip": 1.04035878, "balance_loss_mlp": 1.01938081, "epoch": 0.6921180785186076, "flos": 23622937205760.0, "grad_norm": 1.544130993260468, "language_loss": 0.80264813, "learning_rate": 9.144920410849493e-07, "loss": 0.82407999, "num_input_tokens_seen": 123861450, "step": 5756, "time_per_iteration": 3.6896073818206787 }, { "auxiliary_loss_clip": 0.01149659, "auxiliary_loss_mlp": 0.01027824, "balance_loss_clip": 1.04700303, "balance_loss_mlp": 1.02077281, "epoch": 0.6922383214092467, "flos": 21142623265920.0, "grad_norm": 1.6137093588188487, "language_loss": 0.80839121, "learning_rate": 9.138378715925176e-07, "loss": 0.8301661, "num_input_tokens_seen": 123880545, "step": 5757, "time_per_iteration": 2.644050359725952 }, { "auxiliary_loss_clip": 0.01131867, "auxiliary_loss_mlp": 0.01029659, "balance_loss_clip": 1.04092968, "balance_loss_mlp": 1.02232206, "epoch": 0.6923585642998857, "flos": 21470667200640.0, "grad_norm": 1.9387615437558192, "language_loss": 0.8100943, "learning_rate": 9.131838668700167e-07, "loss": 0.83170962, "num_input_tokens_seen": 123900615, "step": 5758, "time_per_iteration": 2.6702864170074463 }, { "auxiliary_loss_clip": 0.01124588, "auxiliary_loss_mlp": 0.0102463, "balance_loss_clip": 1.03982115, "balance_loss_mlp": 1.01696539, "epoch": 0.6924788071905249, "flos": 21105204272640.0, "grad_norm": 1.745645993692574, "language_loss": 0.86715937, "learning_rate": 9.125300270166598e-07, "loss": 0.88865149, "num_input_tokens_seen": 123921220, "step": 5759, "time_per_iteration": 2.7313644886016846 }, { "auxiliary_loss_clip": 0.01133044, "auxiliary_loss_mlp": 0.01022591, "balance_loss_clip": 1.04117107, "balance_loss_mlp": 1.01561975, "epoch": 0.692599050081164, "flos": 26250018117120.0, "grad_norm": 1.7032117997739717, "language_loss": 0.85530782, "learning_rate": 9.118763521316324e-07, "loss": 0.87686414, "num_input_tokens_seen": 123941795, "step": 5760, "time_per_iteration": 3.6700282096862793 }, { "auxiliary_loss_clip": 0.01167903, "auxiliary_loss_mlp": 0.00762375, "balance_loss_clip": 1.04564333, "balance_loss_mlp": 1.00046134, "epoch": 0.692719292971803, "flos": 20885215426560.0, "grad_norm": 1.869500513751974, "language_loss": 0.76051044, "learning_rate": 9.112228423140987e-07, "loss": 0.77981323, "num_input_tokens_seen": 123960715, "step": 5761, "time_per_iteration": 2.613426923751831 }, { "auxiliary_loss_clip": 0.01145784, "auxiliary_loss_mlp": 0.01023306, "balance_loss_clip": 1.04542041, "balance_loss_mlp": 1.01609945, "epoch": 0.6928395358624422, "flos": 25921938268800.0, "grad_norm": 2.3677708123754058, "language_loss": 0.86667734, "learning_rate": 9.105694976631932e-07, "loss": 0.88836825, "num_input_tokens_seen": 123978625, "step": 5762, "time_per_iteration": 2.712115526199341 }, { "auxiliary_loss_clip": 0.01157614, "auxiliary_loss_mlp": 0.01024677, "balance_loss_clip": 1.04777443, "balance_loss_mlp": 1.01739359, "epoch": 0.6929597787530812, "flos": 23586559706880.0, "grad_norm": 2.246975243028744, "language_loss": 0.72583926, "learning_rate": 9.099163182780283e-07, "loss": 0.74766213, "num_input_tokens_seen": 123996780, "step": 5763, "time_per_iteration": 2.6322591304779053 }, { "auxiliary_loss_clip": 0.01139989, "auxiliary_loss_mlp": 0.01023201, "balance_loss_clip": 1.0457052, "balance_loss_mlp": 1.01574111, "epoch": 0.6930800216437203, "flos": 18255656476800.0, "grad_norm": 2.7670021771331315, "language_loss": 0.48827931, "learning_rate": 9.092633042576916e-07, "loss": 0.50991124, "num_input_tokens_seen": 124014045, "step": 5764, "time_per_iteration": 2.6630594730377197 }, { "auxiliary_loss_clip": 0.01136025, "auxiliary_loss_mlp": 0.01026825, "balance_loss_clip": 1.0439868, "balance_loss_mlp": 1.01964223, "epoch": 0.6932002645343595, "flos": 29168621809920.0, "grad_norm": 1.8565396110218157, "language_loss": 0.56068003, "learning_rate": 9.086104557012446e-07, "loss": 0.58230853, "num_input_tokens_seen": 124034615, "step": 5765, "time_per_iteration": 2.8044261932373047 }, { "auxiliary_loss_clip": 0.01145874, "auxiliary_loss_mlp": 0.01019842, "balance_loss_clip": 1.0435878, "balance_loss_mlp": 1.01321054, "epoch": 0.6933205074249985, "flos": 23842746483840.0, "grad_norm": 3.149643354891842, "language_loss": 0.65533042, "learning_rate": 9.079577727077239e-07, "loss": 0.67698753, "num_input_tokens_seen": 124053445, "step": 5766, "time_per_iteration": 2.6773533821105957 }, { "auxiliary_loss_clip": 0.01157277, "auxiliary_loss_mlp": 0.01023529, "balance_loss_clip": 1.04676759, "balance_loss_mlp": 1.01597762, "epoch": 0.6934407503156376, "flos": 24166696268160.0, "grad_norm": 2.100180396246994, "language_loss": 0.71877599, "learning_rate": 9.073052553761404e-07, "loss": 0.74058414, "num_input_tokens_seen": 124072810, "step": 5767, "time_per_iteration": 2.747452974319458 }, { "auxiliary_loss_clip": 0.0111231, "auxiliary_loss_mlp": 0.01025, "balance_loss_clip": 1.03995883, "balance_loss_mlp": 1.01746011, "epoch": 0.6935609932062767, "flos": 20631327120000.0, "grad_norm": 1.6571405595456843, "language_loss": 0.78115249, "learning_rate": 9.066529038054805e-07, "loss": 0.80252564, "num_input_tokens_seen": 124092875, "step": 5768, "time_per_iteration": 2.7450742721557617 }, { "auxiliary_loss_clip": 0.01137234, "auxiliary_loss_mlp": 0.01029295, "balance_loss_clip": 1.04200625, "balance_loss_mlp": 1.02241623, "epoch": 0.6936812360969158, "flos": 18254184019200.0, "grad_norm": 2.105996261721915, "language_loss": 0.74100769, "learning_rate": 9.060007180947071e-07, "loss": 0.76267296, "num_input_tokens_seen": 124110930, "step": 5769, "time_per_iteration": 2.688722848892212 }, { "auxiliary_loss_clip": 0.01114959, "auxiliary_loss_mlp": 0.01027632, "balance_loss_clip": 1.04027319, "balance_loss_mlp": 1.01987469, "epoch": 0.6938014789875548, "flos": 31317336368640.0, "grad_norm": 1.9201081908493263, "language_loss": 0.73071945, "learning_rate": 9.053486983427534e-07, "loss": 0.75214535, "num_input_tokens_seen": 124132180, "step": 5770, "time_per_iteration": 2.7647948265075684 }, { "auxiliary_loss_clip": 0.01142621, "auxiliary_loss_mlp": 0.01022127, "balance_loss_clip": 1.04125881, "balance_loss_mlp": 1.01487267, "epoch": 0.6939217218781939, "flos": 17528429721600.0, "grad_norm": 1.931262579011117, "language_loss": 0.70828652, "learning_rate": 9.046968446485326e-07, "loss": 0.72993404, "num_input_tokens_seen": 124150585, "step": 5771, "time_per_iteration": 2.642146348953247 }, { "auxiliary_loss_clip": 0.01159254, "auxiliary_loss_mlp": 0.01021222, "balance_loss_clip": 1.04791248, "balance_loss_mlp": 1.01333618, "epoch": 0.6940419647688331, "flos": 18551776199040.0, "grad_norm": 2.185519904401988, "language_loss": 0.70548266, "learning_rate": 9.040451571109295e-07, "loss": 0.72728747, "num_input_tokens_seen": 124166205, "step": 5772, "time_per_iteration": 2.5596513748168945 }, { "auxiliary_loss_clip": 0.01047245, "auxiliary_loss_mlp": 0.01001773, "balance_loss_clip": 1.02418566, "balance_loss_mlp": 1.00041986, "epoch": 0.6941622076594721, "flos": 66926286829440.0, "grad_norm": 0.8314282134279664, "language_loss": 0.60438156, "learning_rate": 9.033936358288042e-07, "loss": 0.62487173, "num_input_tokens_seen": 124219940, "step": 5773, "time_per_iteration": 3.1799252033233643 }, { "auxiliary_loss_clip": 0.01169343, "auxiliary_loss_mlp": 0.01022792, "balance_loss_clip": 1.04666185, "balance_loss_mlp": 1.015154, "epoch": 0.6942824505501112, "flos": 26578062051840.0, "grad_norm": 1.7343229219088774, "language_loss": 0.82318759, "learning_rate": 9.027422809009937e-07, "loss": 0.84510899, "num_input_tokens_seen": 124239885, "step": 5774, "time_per_iteration": 2.570260524749756 }, { "auxiliary_loss_clip": 0.01156408, "auxiliary_loss_mlp": 0.01026984, "balance_loss_clip": 1.04407632, "balance_loss_mlp": 1.01976943, "epoch": 0.6944026934407503, "flos": 21248308056960.0, "grad_norm": 1.8258830462070064, "language_loss": 0.83420086, "learning_rate": 9.020910924263054e-07, "loss": 0.85603476, "num_input_tokens_seen": 124258410, "step": 5775, "time_per_iteration": 2.5962181091308594 }, { "auxiliary_loss_clip": 0.01046997, "auxiliary_loss_mlp": 0.01000831, "balance_loss_clip": 1.02507305, "balance_loss_mlp": 0.99948424, "epoch": 0.6945229363313894, "flos": 70677191537280.0, "grad_norm": 0.8151225802804196, "language_loss": 0.58105022, "learning_rate": 9.014400705035261e-07, "loss": 0.60152853, "num_input_tokens_seen": 124315315, "step": 5776, "time_per_iteration": 3.3166887760162354 }, { "auxiliary_loss_clip": 0.01167967, "auxiliary_loss_mlp": 0.01026035, "balance_loss_clip": 1.04807281, "balance_loss_mlp": 1.01904893, "epoch": 0.6946431792220285, "flos": 18952934267520.0, "grad_norm": 2.1288616960230735, "language_loss": 0.76860809, "learning_rate": 9.00789215231414e-07, "loss": 0.79054809, "num_input_tokens_seen": 124333710, "step": 5777, "time_per_iteration": 3.4972164630889893 }, { "auxiliary_loss_clip": 0.01123309, "auxiliary_loss_mlp": 0.00763108, "balance_loss_clip": 1.03725123, "balance_loss_mlp": 1.00049448, "epoch": 0.6947634221126676, "flos": 20338834671360.0, "grad_norm": 1.7127678941792257, "language_loss": 0.8183679, "learning_rate": 9.001385267087056e-07, "loss": 0.83723205, "num_input_tokens_seen": 124352855, "step": 5778, "time_per_iteration": 2.7845985889434814 }, { "auxiliary_loss_clip": 0.01158932, "auxiliary_loss_mlp": 0.01023716, "balance_loss_clip": 1.04672277, "balance_loss_mlp": 1.01674795, "epoch": 0.6948836650033067, "flos": 21833723917440.0, "grad_norm": 1.5206827550709692, "language_loss": 0.7045756, "learning_rate": 8.994880050341072e-07, "loss": 0.72640204, "num_input_tokens_seen": 124372960, "step": 5779, "time_per_iteration": 2.554785966873169 }, { "auxiliary_loss_clip": 0.01136509, "auxiliary_loss_mlp": 0.01030478, "balance_loss_clip": 1.04558706, "balance_loss_mlp": 1.02275348, "epoch": 0.6950039078939457, "flos": 23657519024640.0, "grad_norm": 1.7078173427065748, "language_loss": 0.77634847, "learning_rate": 8.988376503063026e-07, "loss": 0.7980184, "num_input_tokens_seen": 124394220, "step": 5780, "time_per_iteration": 2.724730968475342 }, { "auxiliary_loss_clip": 0.01119174, "auxiliary_loss_mlp": 0.01028581, "balance_loss_clip": 1.04084873, "balance_loss_mlp": 1.02108824, "epoch": 0.6951241507845849, "flos": 21792462168960.0, "grad_norm": 1.8405415639220446, "language_loss": 0.81867099, "learning_rate": 8.981874626239521e-07, "loss": 0.84014851, "num_input_tokens_seen": 124412795, "step": 5781, "time_per_iteration": 3.7039763927459717 }, { "auxiliary_loss_clip": 0.01158805, "auxiliary_loss_mlp": 0.01032138, "balance_loss_clip": 1.04908299, "balance_loss_mlp": 1.02483654, "epoch": 0.695244393675224, "flos": 14647568244480.0, "grad_norm": 2.101260795474958, "language_loss": 0.87924731, "learning_rate": 8.975374420856872e-07, "loss": 0.90115666, "num_input_tokens_seen": 124429690, "step": 5782, "time_per_iteration": 3.5646626949310303 }, { "auxiliary_loss_clip": 0.01117191, "auxiliary_loss_mlp": 0.01025405, "balance_loss_clip": 1.03887415, "balance_loss_mlp": 1.01829386, "epoch": 0.695364636565863, "flos": 16873203778560.0, "grad_norm": 2.299918211702129, "language_loss": 0.72771406, "learning_rate": 8.968875887901157e-07, "loss": 0.74914008, "num_input_tokens_seen": 124447070, "step": 5783, "time_per_iteration": 2.748270034790039 }, { "auxiliary_loss_clip": 0.01140294, "auxiliary_loss_mlp": 0.010281, "balance_loss_clip": 1.04169798, "balance_loss_mlp": 1.02072954, "epoch": 0.6954848794565022, "flos": 19354523299200.0, "grad_norm": 2.619582447003379, "language_loss": 0.63283062, "learning_rate": 8.9623790283582e-07, "loss": 0.65451455, "num_input_tokens_seen": 124464950, "step": 5784, "time_per_iteration": 2.6429190635681152 }, { "auxiliary_loss_clip": 0.01127531, "auxiliary_loss_mlp": 0.01026895, "balance_loss_clip": 1.04311848, "balance_loss_mlp": 1.01901495, "epoch": 0.6956051223471412, "flos": 18990209606400.0, "grad_norm": 2.2516763161422837, "language_loss": 0.76560092, "learning_rate": 8.955883843213561e-07, "loss": 0.78714514, "num_input_tokens_seen": 124483965, "step": 5785, "time_per_iteration": 2.7013463973999023 }, { "auxiliary_loss_clip": 0.01161296, "auxiliary_loss_mlp": 0.01030421, "balance_loss_clip": 1.04470611, "balance_loss_mlp": 1.02289271, "epoch": 0.6957253652377803, "flos": 16107229226880.0, "grad_norm": 1.8124991524136504, "language_loss": 0.868613, "learning_rate": 8.949390333452569e-07, "loss": 0.89053017, "num_input_tokens_seen": 124501910, "step": 5786, "time_per_iteration": 3.457934617996216 }, { "auxiliary_loss_clip": 0.01170254, "auxiliary_loss_mlp": 0.01025783, "balance_loss_clip": 1.05022728, "balance_loss_mlp": 1.01818347, "epoch": 0.6958456081284194, "flos": 29388646569600.0, "grad_norm": 2.584747929030377, "language_loss": 0.67882669, "learning_rate": 8.942898500060279e-07, "loss": 0.70078707, "num_input_tokens_seen": 124521625, "step": 5787, "time_per_iteration": 2.7019872665405273 }, { "auxiliary_loss_clip": 0.01123182, "auxiliary_loss_mlp": 0.01028444, "balance_loss_clip": 1.04434597, "balance_loss_mlp": 1.02144313, "epoch": 0.6959658510190585, "flos": 25154850395520.0, "grad_norm": 2.3997787921380493, "language_loss": 0.71892726, "learning_rate": 8.936408344021493e-07, "loss": 0.74044353, "num_input_tokens_seen": 124538540, "step": 5788, "time_per_iteration": 2.7720470428466797 }, { "auxiliary_loss_clip": 0.0115104, "auxiliary_loss_mlp": 0.01025494, "balance_loss_clip": 1.047876, "balance_loss_mlp": 1.01819885, "epoch": 0.6960860939096976, "flos": 42814388759040.0, "grad_norm": 2.2014817142966763, "language_loss": 0.70903707, "learning_rate": 8.929919866320765e-07, "loss": 0.73080242, "num_input_tokens_seen": 124559355, "step": 5789, "time_per_iteration": 2.882000207901001 }, { "auxiliary_loss_clip": 0.01134134, "auxiliary_loss_mlp": 0.00762251, "balance_loss_clip": 1.04371631, "balance_loss_mlp": 1.00045705, "epoch": 0.6962063368003367, "flos": 17566566986880.0, "grad_norm": 1.9550747144180705, "language_loss": 0.8174687, "learning_rate": 8.923433067942385e-07, "loss": 0.83643258, "num_input_tokens_seen": 124577920, "step": 5790, "time_per_iteration": 2.7166712284088135 }, { "auxiliary_loss_clip": 0.01136567, "auxiliary_loss_mlp": 0.01026331, "balance_loss_clip": 1.04537797, "balance_loss_mlp": 1.01868367, "epoch": 0.6963265796909758, "flos": 21251648021760.0, "grad_norm": 2.8045280358435987, "language_loss": 0.68638271, "learning_rate": 8.916947949870417e-07, "loss": 0.70801169, "num_input_tokens_seen": 124597585, "step": 5791, "time_per_iteration": 2.6313536167144775 }, { "auxiliary_loss_clip": 0.01061905, "auxiliary_loss_mlp": 0.01001687, "balance_loss_clip": 1.01443815, "balance_loss_mlp": 1.00060236, "epoch": 0.6964468225816148, "flos": 68828295801600.0, "grad_norm": 0.7499801687828961, "language_loss": 0.58118498, "learning_rate": 8.910464513088615e-07, "loss": 0.60182095, "num_input_tokens_seen": 124661625, "step": 5792, "time_per_iteration": 3.332893133163452 }, { "auxiliary_loss_clip": 0.01133585, "auxiliary_loss_mlp": 0.01022286, "balance_loss_clip": 1.04268146, "balance_loss_mlp": 1.01488614, "epoch": 0.696567065472254, "flos": 18950887192320.0, "grad_norm": 1.8145240945004306, "language_loss": 0.78503406, "learning_rate": 8.903982758580542e-07, "loss": 0.80659282, "num_input_tokens_seen": 124680565, "step": 5793, "time_per_iteration": 2.6937830448150635 }, { "auxiliary_loss_clip": 0.01138484, "auxiliary_loss_mlp": 0.01022968, "balance_loss_clip": 1.04422808, "balance_loss_mlp": 1.01520801, "epoch": 0.696687308362893, "flos": 22856675345280.0, "grad_norm": 1.9727758615387083, "language_loss": 0.80280197, "learning_rate": 8.897502687329457e-07, "loss": 0.82441652, "num_input_tokens_seen": 124700365, "step": 5794, "time_per_iteration": 2.7059104442596436 }, { "auxiliary_loss_clip": 0.01124038, "auxiliary_loss_mlp": 0.01024094, "balance_loss_clip": 1.04166651, "balance_loss_mlp": 1.01661992, "epoch": 0.6968075512535321, "flos": 24972926987520.0, "grad_norm": 2.2491711429016648, "language_loss": 0.80008692, "learning_rate": 8.891024300318382e-07, "loss": 0.82156825, "num_input_tokens_seen": 124718935, "step": 5795, "time_per_iteration": 2.734971284866333 }, { "auxiliary_loss_clip": 0.01117987, "auxiliary_loss_mlp": 0.01031553, "balance_loss_clip": 1.04061222, "balance_loss_mlp": 1.02495193, "epoch": 0.6969277941441713, "flos": 21030438113280.0, "grad_norm": 2.15729182418763, "language_loss": 0.75928724, "learning_rate": 8.884547598530103e-07, "loss": 0.78078264, "num_input_tokens_seen": 124739505, "step": 5796, "time_per_iteration": 2.6992619037628174 }, { "auxiliary_loss_clip": 0.01075505, "auxiliary_loss_mlp": 0.01032569, "balance_loss_clip": 1.03667617, "balance_loss_mlp": 1.02508891, "epoch": 0.6970480370348103, "flos": 21579404647680.0, "grad_norm": 1.7364400788061227, "language_loss": 0.75333351, "learning_rate": 8.8780725829471e-07, "loss": 0.77441424, "num_input_tokens_seen": 124757410, "step": 5797, "time_per_iteration": 2.818135976791382 }, { "auxiliary_loss_clip": 0.01172984, "auxiliary_loss_mlp": 0.01026536, "balance_loss_clip": 1.04952741, "balance_loss_mlp": 1.01958656, "epoch": 0.6971682799254494, "flos": 22419175691520.0, "grad_norm": 2.1667887720376906, "language_loss": 0.78315806, "learning_rate": 8.87159925455165e-07, "loss": 0.80515325, "num_input_tokens_seen": 124777240, "step": 5798, "time_per_iteration": 2.653728723526001 }, { "auxiliary_loss_clip": 0.011214, "auxiliary_loss_mlp": 0.01031667, "balance_loss_clip": 1.04019141, "balance_loss_mlp": 1.02456248, "epoch": 0.6972885228160886, "flos": 20005834659840.0, "grad_norm": 2.104885622348916, "language_loss": 0.73080021, "learning_rate": 8.865127614325738e-07, "loss": 0.75233084, "num_input_tokens_seen": 124795670, "step": 5799, "time_per_iteration": 2.666292190551758 }, { "auxiliary_loss_clip": 0.01135207, "auxiliary_loss_mlp": 0.01027601, "balance_loss_clip": 1.04159451, "balance_loss_mlp": 1.01913667, "epoch": 0.6974087657067276, "flos": 37853437656960.0, "grad_norm": 1.793853051994157, "language_loss": 0.66844225, "learning_rate": 8.85865766325113e-07, "loss": 0.69007039, "num_input_tokens_seen": 124819600, "step": 5800, "time_per_iteration": 2.763054847717285 }, { "auxiliary_loss_clip": 0.01139781, "auxiliary_loss_mlp": 0.01028612, "balance_loss_clip": 1.04437852, "balance_loss_mlp": 1.02048814, "epoch": 0.6975290085973667, "flos": 29489267543040.0, "grad_norm": 3.073350199315057, "language_loss": 0.71708202, "learning_rate": 8.852189402309287e-07, "loss": 0.73876595, "num_input_tokens_seen": 124838785, "step": 5801, "time_per_iteration": 2.704833745956421 }, { "auxiliary_loss_clip": 0.01155709, "auxiliary_loss_mlp": 0.01022941, "balance_loss_clip": 1.04628098, "balance_loss_mlp": 1.01528764, "epoch": 0.6976492514880057, "flos": 12895630295040.0, "grad_norm": 2.147682492184672, "language_loss": 0.74284005, "learning_rate": 8.845722832481441e-07, "loss": 0.76462662, "num_input_tokens_seen": 124854215, "step": 5802, "time_per_iteration": 2.5720767974853516 }, { "auxiliary_loss_clip": 0.01155238, "auxiliary_loss_mlp": 0.01031177, "balance_loss_clip": 1.04612362, "balance_loss_mlp": 1.02344608, "epoch": 0.6977694943786449, "flos": 24352929308160.0, "grad_norm": 2.031152574377187, "language_loss": 0.77410322, "learning_rate": 8.83925795474858e-07, "loss": 0.79596734, "num_input_tokens_seen": 124874340, "step": 5803, "time_per_iteration": 2.6433310508728027 }, { "auxiliary_loss_clip": 0.01126396, "auxiliary_loss_mlp": 0.01027358, "balance_loss_clip": 1.04525471, "balance_loss_mlp": 1.01951146, "epoch": 0.6978897372692839, "flos": 29898470257920.0, "grad_norm": 2.3711461055461593, "language_loss": 0.59320617, "learning_rate": 8.832794770091414e-07, "loss": 0.61474371, "num_input_tokens_seen": 124895175, "step": 5804, "time_per_iteration": 3.6935064792633057 }, { "auxiliary_loss_clip": 0.01145509, "auxiliary_loss_mlp": 0.01023336, "balance_loss_clip": 1.04327941, "balance_loss_mlp": 1.0159924, "epoch": 0.698009980159923, "flos": 21761579450880.0, "grad_norm": 2.148732620379686, "language_loss": 0.82424408, "learning_rate": 8.826333279490401e-07, "loss": 0.84593254, "num_input_tokens_seen": 124915810, "step": 5805, "time_per_iteration": 2.6968674659729004 }, { "auxiliary_loss_clip": 0.01144751, "auxiliary_loss_mlp": 0.01024037, "balance_loss_clip": 1.04707539, "balance_loss_mlp": 1.01693559, "epoch": 0.6981302230505622, "flos": 19857164267520.0, "grad_norm": 2.4500563451727366, "language_loss": 0.68242919, "learning_rate": 8.819873483925748e-07, "loss": 0.70411706, "num_input_tokens_seen": 124932930, "step": 5806, "time_per_iteration": 3.61437726020813 }, { "auxiliary_loss_clip": 0.01133331, "auxiliary_loss_mlp": 0.00762286, "balance_loss_clip": 1.04535079, "balance_loss_mlp": 1.00045228, "epoch": 0.6982504659412012, "flos": 22198648141440.0, "grad_norm": 2.204926847643738, "language_loss": 0.74118114, "learning_rate": 8.81341538437739e-07, "loss": 0.76013732, "num_input_tokens_seen": 124951220, "step": 5807, "time_per_iteration": 2.7330801486968994 }, { "auxiliary_loss_clip": 0.01145729, "auxiliary_loss_mlp": 0.01029638, "balance_loss_clip": 1.04303432, "balance_loss_mlp": 1.02218151, "epoch": 0.6983707088318403, "flos": 35588479708800.0, "grad_norm": 2.095883204222148, "language_loss": 0.67945111, "learning_rate": 8.80695898182503e-07, "loss": 0.70120478, "num_input_tokens_seen": 124972200, "step": 5808, "time_per_iteration": 3.657522439956665 }, { "auxiliary_loss_clip": 0.01059648, "auxiliary_loss_mlp": 0.01003513, "balance_loss_clip": 1.01885343, "balance_loss_mlp": 1.00214815, "epoch": 0.6984909517224794, "flos": 65440052760960.0, "grad_norm": 0.8842228311734813, "language_loss": 0.65112889, "learning_rate": 8.800504277248093e-07, "loss": 0.67176056, "num_input_tokens_seen": 125036950, "step": 5809, "time_per_iteration": 3.259469747543335 }, { "auxiliary_loss_clip": 0.01125676, "auxiliary_loss_mlp": 0.00762294, "balance_loss_clip": 1.04773569, "balance_loss_mlp": 1.00051689, "epoch": 0.6986111946131185, "flos": 18546927863040.0, "grad_norm": 1.6075793972597947, "language_loss": 0.75009966, "learning_rate": 8.794051271625753e-07, "loss": 0.76897937, "num_input_tokens_seen": 125054585, "step": 5810, "time_per_iteration": 2.6702182292938232 }, { "auxiliary_loss_clip": 0.0114153, "auxiliary_loss_mlp": 0.01026639, "balance_loss_clip": 1.04408193, "balance_loss_mlp": 1.0198319, "epoch": 0.6987314375037575, "flos": 23039173370880.0, "grad_norm": 1.645219443420088, "language_loss": 0.83282971, "learning_rate": 8.787599965936925e-07, "loss": 0.85451138, "num_input_tokens_seen": 125075515, "step": 5811, "time_per_iteration": 2.6780834197998047 }, { "auxiliary_loss_clip": 0.01119823, "auxiliary_loss_mlp": 0.0102696, "balance_loss_clip": 1.04290998, "balance_loss_mlp": 1.01970637, "epoch": 0.6988516803943967, "flos": 38400393029760.0, "grad_norm": 1.6484467305671195, "language_loss": 0.71975595, "learning_rate": 8.781150361160261e-07, "loss": 0.74122369, "num_input_tokens_seen": 125097425, "step": 5812, "time_per_iteration": 3.7405972480773926 }, { "auxiliary_loss_clip": 0.01131614, "auxiliary_loss_mlp": 0.01028753, "balance_loss_clip": 1.04408669, "balance_loss_mlp": 1.02078998, "epoch": 0.6989719232850358, "flos": 24096993926400.0, "grad_norm": 1.6260899064992103, "language_loss": 0.73579001, "learning_rate": 8.774702458274181e-07, "loss": 0.75739366, "num_input_tokens_seen": 125117830, "step": 5813, "time_per_iteration": 2.682807683944702 }, { "auxiliary_loss_clip": 0.01154348, "auxiliary_loss_mlp": 0.01026396, "balance_loss_clip": 1.04568839, "balance_loss_mlp": 1.01941347, "epoch": 0.6990921661756748, "flos": 14866838818560.0, "grad_norm": 2.7274135448381904, "language_loss": 0.70809257, "learning_rate": 8.768256258256799e-07, "loss": 0.72989994, "num_input_tokens_seen": 125134455, "step": 5814, "time_per_iteration": 2.652221202850342 }, { "auxiliary_loss_clip": 0.01161643, "auxiliary_loss_mlp": 0.01026496, "balance_loss_clip": 1.04809618, "balance_loss_mlp": 1.01898623, "epoch": 0.699212409066314, "flos": 20193719725440.0, "grad_norm": 1.8023459818206529, "language_loss": 0.74065483, "learning_rate": 8.76181176208602e-07, "loss": 0.76253617, "num_input_tokens_seen": 125152555, "step": 5815, "time_per_iteration": 2.5946483612060547 }, { "auxiliary_loss_clip": 0.01101589, "auxiliary_loss_mlp": 0.01028263, "balance_loss_clip": 1.03863621, "balance_loss_mlp": 1.02058625, "epoch": 0.699332651956953, "flos": 19427888828160.0, "grad_norm": 1.7875823160052262, "language_loss": 0.73591673, "learning_rate": 8.755368970739461e-07, "loss": 0.75721526, "num_input_tokens_seen": 125171915, "step": 5816, "time_per_iteration": 2.7624330520629883 }, { "auxiliary_loss_clip": 0.01132999, "auxiliary_loss_mlp": 0.01025391, "balance_loss_clip": 1.04224825, "balance_loss_mlp": 1.01757669, "epoch": 0.6994528948475921, "flos": 16143714466560.0, "grad_norm": 2.3064717999899624, "language_loss": 0.61137187, "learning_rate": 8.748927885194479e-07, "loss": 0.63295579, "num_input_tokens_seen": 125190220, "step": 5817, "time_per_iteration": 2.706987142562866 }, { "auxiliary_loss_clip": 0.01033998, "auxiliary_loss_mlp": 0.01003455, "balance_loss_clip": 1.017295, "balance_loss_mlp": 1.00226295, "epoch": 0.6995731377382313, "flos": 64952420699520.0, "grad_norm": 0.8273315282411723, "language_loss": 0.57402635, "learning_rate": 8.742488506428209e-07, "loss": 0.59440088, "num_input_tokens_seen": 125249310, "step": 5818, "time_per_iteration": 3.253621816635132 }, { "auxiliary_loss_clip": 0.01143751, "auxiliary_loss_mlp": 0.00762243, "balance_loss_clip": 1.04386973, "balance_loss_mlp": 1.00041771, "epoch": 0.6996933806288703, "flos": 24900136076160.0, "grad_norm": 2.0264930989992496, "language_loss": 0.78336942, "learning_rate": 8.736050835417466e-07, "loss": 0.80242932, "num_input_tokens_seen": 125269350, "step": 5819, "time_per_iteration": 2.794365882873535 }, { "auxiliary_loss_clip": 0.01159429, "auxiliary_loss_mlp": 0.01025237, "balance_loss_clip": 1.04593635, "balance_loss_mlp": 1.01777172, "epoch": 0.6998136235195094, "flos": 20777806782720.0, "grad_norm": 2.0566667762465807, "language_loss": 0.61141658, "learning_rate": 8.729614873138862e-07, "loss": 0.63326323, "num_input_tokens_seen": 125286985, "step": 5820, "time_per_iteration": 2.759895086288452 }, { "auxiliary_loss_clip": 0.0111997, "auxiliary_loss_mlp": 0.01027525, "balance_loss_clip": 1.04433441, "balance_loss_mlp": 1.01971698, "epoch": 0.6999338664101485, "flos": 23733470332800.0, "grad_norm": 1.9537224207431503, "language_loss": 0.77839947, "learning_rate": 8.723180620568716e-07, "loss": 0.79987442, "num_input_tokens_seen": 125306240, "step": 5821, "time_per_iteration": 2.769481658935547 }, { "auxiliary_loss_clip": 0.01145161, "auxiliary_loss_mlp": 0.01027505, "balance_loss_clip": 1.04412174, "balance_loss_mlp": 1.02004838, "epoch": 0.7000541093007876, "flos": 19864598382720.0, "grad_norm": 1.9832436344437712, "language_loss": 0.85252964, "learning_rate": 8.716748078683116e-07, "loss": 0.87425625, "num_input_tokens_seen": 125323015, "step": 5822, "time_per_iteration": 2.6799557209014893 }, { "auxiliary_loss_clip": 0.01079107, "auxiliary_loss_mlp": 0.01030416, "balance_loss_clip": 1.03883004, "balance_loss_mlp": 1.02248263, "epoch": 0.7001743521914267, "flos": 29679056029440.0, "grad_norm": 2.0555862421934017, "language_loss": 0.6878438, "learning_rate": 8.710317248457855e-07, "loss": 0.70893896, "num_input_tokens_seen": 125342630, "step": 5823, "time_per_iteration": 2.8831534385681152 }, { "auxiliary_loss_clip": 0.01137922, "auxiliary_loss_mlp": 0.01024302, "balance_loss_clip": 1.04607344, "balance_loss_mlp": 1.01666617, "epoch": 0.7002945950820658, "flos": 27489762080640.0, "grad_norm": 1.918262758777171, "language_loss": 0.7247088, "learning_rate": 8.703888130868482e-07, "loss": 0.74633098, "num_input_tokens_seen": 125364480, "step": 5824, "time_per_iteration": 2.8230650424957275 }, { "auxiliary_loss_clip": 0.01128457, "auxiliary_loss_mlp": 0.0102731, "balance_loss_clip": 1.04390609, "balance_loss_mlp": 1.01982605, "epoch": 0.7004148379727049, "flos": 22158463800960.0, "grad_norm": 3.0668078060857367, "language_loss": 0.82067657, "learning_rate": 8.697460726890307e-07, "loss": 0.84223425, "num_input_tokens_seen": 125381625, "step": 5825, "time_per_iteration": 2.7540905475616455 }, { "auxiliary_loss_clip": 0.01125604, "auxiliary_loss_mlp": 0.00762455, "balance_loss_clip": 1.03998923, "balance_loss_mlp": 1.000525, "epoch": 0.7005350808633439, "flos": 19423758764160.0, "grad_norm": 2.0669687758409263, "language_loss": 0.90771532, "learning_rate": 8.691035037498354e-07, "loss": 0.92659587, "num_input_tokens_seen": 125397615, "step": 5826, "time_per_iteration": 2.7043471336364746 }, { "auxiliary_loss_clip": 0.01135502, "auxiliary_loss_mlp": 0.01023972, "balance_loss_clip": 1.04112029, "balance_loss_mlp": 1.01689982, "epoch": 0.7006553237539831, "flos": 23476708938240.0, "grad_norm": 1.6305655583176488, "language_loss": 0.72444487, "learning_rate": 8.684611063667391e-07, "loss": 0.74603957, "num_input_tokens_seen": 125418080, "step": 5827, "time_per_iteration": 2.680734395980835 }, { "auxiliary_loss_clip": 0.0115397, "auxiliary_loss_mlp": 0.01026331, "balance_loss_clip": 1.04471183, "balance_loss_mlp": 1.01904762, "epoch": 0.7007755666446221, "flos": 31212872640000.0, "grad_norm": 2.5689016829073945, "language_loss": 0.76891118, "learning_rate": 8.678188806371935e-07, "loss": 0.7907142, "num_input_tokens_seen": 125440115, "step": 5828, "time_per_iteration": 2.721036911010742 }, { "auxiliary_loss_clip": 0.01153677, "auxiliary_loss_mlp": 0.01022113, "balance_loss_clip": 1.04471982, "balance_loss_mlp": 1.01578295, "epoch": 0.7008958095352612, "flos": 18149899858560.0, "grad_norm": 1.7028636290233696, "language_loss": 0.85258543, "learning_rate": 8.671768266586228e-07, "loss": 0.87434328, "num_input_tokens_seen": 125458240, "step": 5829, "time_per_iteration": 3.572343587875366 }, { "auxiliary_loss_clip": 0.01122814, "auxiliary_loss_mlp": 0.01025639, "balance_loss_clip": 1.04252052, "balance_loss_mlp": 1.01788199, "epoch": 0.7010160524259004, "flos": 27452307173760.0, "grad_norm": 4.915593312020079, "language_loss": 0.7816819, "learning_rate": 8.665349445284275e-07, "loss": 0.80316639, "num_input_tokens_seen": 125477980, "step": 5830, "time_per_iteration": 2.7273168563842773 }, { "auxiliary_loss_clip": 0.01124698, "auxiliary_loss_mlp": 0.01023495, "balance_loss_clip": 1.04285645, "balance_loss_mlp": 1.01634812, "epoch": 0.7011362953165394, "flos": 23842064125440.0, "grad_norm": 1.4805349447164358, "language_loss": 0.80934608, "learning_rate": 8.658932343439799e-07, "loss": 0.83082795, "num_input_tokens_seen": 125497765, "step": 5831, "time_per_iteration": 2.785181999206543 }, { "auxiliary_loss_clip": 0.01170172, "auxiliary_loss_mlp": 0.0102438, "balance_loss_clip": 1.04803717, "balance_loss_mlp": 1.01657152, "epoch": 0.7012565382071785, "flos": 24823430582400.0, "grad_norm": 2.2613592535809577, "language_loss": 0.77758253, "learning_rate": 8.65251696202627e-07, "loss": 0.79952812, "num_input_tokens_seen": 125514145, "step": 5832, "time_per_iteration": 3.566073179244995 }, { "auxiliary_loss_clip": 0.01130014, "auxiliary_loss_mlp": 0.01023245, "balance_loss_clip": 1.04446745, "balance_loss_mlp": 1.01593781, "epoch": 0.7013767810978175, "flos": 21397445326080.0, "grad_norm": 2.131440642551645, "language_loss": 0.87507391, "learning_rate": 8.646103302016896e-07, "loss": 0.89660656, "num_input_tokens_seen": 125533115, "step": 5833, "time_per_iteration": 3.627340316772461 }, { "auxiliary_loss_clip": 0.01123225, "auxiliary_loss_mlp": 0.01025969, "balance_loss_clip": 1.04126978, "balance_loss_mlp": 1.01868534, "epoch": 0.7014970239884567, "flos": 16687150306560.0, "grad_norm": 1.7278180318166494, "language_loss": 0.88248926, "learning_rate": 8.639691364384614e-07, "loss": 0.90398121, "num_input_tokens_seen": 125550740, "step": 5834, "time_per_iteration": 2.683443784713745 }, { "auxiliary_loss_clip": 0.01144281, "auxiliary_loss_mlp": 0.01024027, "balance_loss_clip": 1.04627287, "balance_loss_mlp": 1.01663637, "epoch": 0.7016172668790958, "flos": 12568268718720.0, "grad_norm": 2.014697372318599, "language_loss": 0.7288866, "learning_rate": 8.633281150102136e-07, "loss": 0.7505697, "num_input_tokens_seen": 125567590, "step": 5835, "time_per_iteration": 2.693934917449951 }, { "auxiliary_loss_clip": 0.01142431, "auxiliary_loss_mlp": 0.01028516, "balance_loss_clip": 1.04527497, "balance_loss_mlp": 1.02094293, "epoch": 0.7017375097697348, "flos": 17452729808640.0, "grad_norm": 2.2699714556555444, "language_loss": 0.6794824, "learning_rate": 8.626872660141855e-07, "loss": 0.70119178, "num_input_tokens_seen": 125585500, "step": 5836, "time_per_iteration": 2.721418857574463 }, { "auxiliary_loss_clip": 0.01117231, "auxiliary_loss_mlp": 0.01024436, "balance_loss_clip": 1.04483414, "balance_loss_mlp": 1.01737857, "epoch": 0.701857752660374, "flos": 18513028402560.0, "grad_norm": 1.6542522655245253, "language_loss": 0.74649477, "learning_rate": 8.620465895475957e-07, "loss": 0.76791137, "num_input_tokens_seen": 125603720, "step": 5837, "time_per_iteration": 3.7478787899017334 }, { "auxiliary_loss_clip": 0.01109476, "auxiliary_loss_mlp": 0.01026408, "balance_loss_clip": 1.04361677, "balance_loss_mlp": 1.01927304, "epoch": 0.701977995551013, "flos": 24425971614720.0, "grad_norm": 1.5190096587592972, "language_loss": 0.75205833, "learning_rate": 8.614060857076333e-07, "loss": 0.77341712, "num_input_tokens_seen": 125624390, "step": 5838, "time_per_iteration": 2.7823619842529297 }, { "auxiliary_loss_clip": 0.01138322, "auxiliary_loss_mlp": 0.01029501, "balance_loss_clip": 1.04507732, "balance_loss_mlp": 1.02212763, "epoch": 0.7020982384416521, "flos": 23002759958400.0, "grad_norm": 1.767495094189689, "language_loss": 0.74609023, "learning_rate": 8.60765754591462e-07, "loss": 0.76776844, "num_input_tokens_seen": 125644085, "step": 5839, "time_per_iteration": 2.6580851078033447 }, { "auxiliary_loss_clip": 0.01168747, "auxiliary_loss_mlp": 0.01029235, "balance_loss_clip": 1.04737425, "balance_loss_mlp": 1.02177274, "epoch": 0.7022184813322913, "flos": 20449080489600.0, "grad_norm": 1.9687074430062752, "language_loss": 0.72819912, "learning_rate": 8.601255962962211e-07, "loss": 0.75017893, "num_input_tokens_seen": 125663095, "step": 5840, "time_per_iteration": 2.6684701442718506 }, { "auxiliary_loss_clip": 0.01163309, "auxiliary_loss_mlp": 0.01028036, "balance_loss_clip": 1.04874396, "balance_loss_mlp": 1.01892257, "epoch": 0.7023387242229303, "flos": 19790514581760.0, "grad_norm": 2.5812301742349635, "language_loss": 0.7231406, "learning_rate": 8.594856109190194e-07, "loss": 0.74505407, "num_input_tokens_seen": 125680125, "step": 5841, "time_per_iteration": 2.6279587745666504 }, { "auxiliary_loss_clip": 0.01170666, "auxiliary_loss_mlp": 0.01024908, "balance_loss_clip": 1.04720187, "balance_loss_mlp": 1.01757622, "epoch": 0.7024589671135694, "flos": 33259278286080.0, "grad_norm": 1.7449815744445807, "language_loss": 0.69390082, "learning_rate": 8.588457985569446e-07, "loss": 0.71585649, "num_input_tokens_seen": 125703035, "step": 5842, "time_per_iteration": 2.800624132156372 }, { "auxiliary_loss_clip": 0.01172195, "auxiliary_loss_mlp": 0.01032321, "balance_loss_clip": 1.04844499, "balance_loss_mlp": 1.02478993, "epoch": 0.7025792100042085, "flos": 19098982967040.0, "grad_norm": 2.0849110612225608, "language_loss": 0.7188251, "learning_rate": 8.582061593070542e-07, "loss": 0.74087024, "num_input_tokens_seen": 125723765, "step": 5843, "time_per_iteration": 2.6414637565612793 }, { "auxiliary_loss_clip": 0.01171935, "auxiliary_loss_mlp": 0.00762542, "balance_loss_clip": 1.04913831, "balance_loss_mlp": 1.00046539, "epoch": 0.7026994528948476, "flos": 18952611045120.0, "grad_norm": 2.412183104473722, "language_loss": 0.77347505, "learning_rate": 8.57566693266383e-07, "loss": 0.79281974, "num_input_tokens_seen": 125741455, "step": 5844, "time_per_iteration": 2.5886712074279785 }, { "auxiliary_loss_clip": 0.01147597, "auxiliary_loss_mlp": 0.00762934, "balance_loss_clip": 1.0435127, "balance_loss_mlp": 1.0004884, "epoch": 0.7028196957854866, "flos": 19536662188800.0, "grad_norm": 1.9245577172126884, "language_loss": 0.69245851, "learning_rate": 8.569274005319354e-07, "loss": 0.71156383, "num_input_tokens_seen": 125759855, "step": 5845, "time_per_iteration": 2.6440470218658447 }, { "auxiliary_loss_clip": 0.01152886, "auxiliary_loss_mlp": 0.01027834, "balance_loss_clip": 1.04707456, "balance_loss_mlp": 1.02061629, "epoch": 0.7029399386761258, "flos": 20845318394880.0, "grad_norm": 1.750962480784786, "language_loss": 0.79891837, "learning_rate": 8.562882812006913e-07, "loss": 0.82072562, "num_input_tokens_seen": 125777345, "step": 5846, "time_per_iteration": 2.6221730709075928 }, { "auxiliary_loss_clip": 0.01169619, "auxiliary_loss_mlp": 0.01023082, "balance_loss_clip": 1.04833746, "balance_loss_mlp": 1.0158664, "epoch": 0.7030601815667649, "flos": 22055005653120.0, "grad_norm": 1.645634564858331, "language_loss": 0.77131832, "learning_rate": 8.556493353696066e-07, "loss": 0.79324532, "num_input_tokens_seen": 125796345, "step": 5847, "time_per_iteration": 2.5985822677612305 }, { "auxiliary_loss_clip": 0.01161449, "auxiliary_loss_mlp": 0.0076296, "balance_loss_clip": 1.04849291, "balance_loss_mlp": 1.00050879, "epoch": 0.7031804244574039, "flos": 27198742089600.0, "grad_norm": 6.164350309087768, "language_loss": 0.68148714, "learning_rate": 8.550105631356077e-07, "loss": 0.70073122, "num_input_tokens_seen": 125816070, "step": 5848, "time_per_iteration": 2.688530206680298 }, { "auxiliary_loss_clip": 0.01124416, "auxiliary_loss_mlp": 0.0102287, "balance_loss_clip": 1.04174078, "balance_loss_mlp": 1.01488328, "epoch": 0.7033006673480431, "flos": 22379853277440.0, "grad_norm": 1.9301445314710028, "language_loss": 0.77105927, "learning_rate": 8.543719645955961e-07, "loss": 0.79253209, "num_input_tokens_seen": 125834400, "step": 5849, "time_per_iteration": 2.7539658546447754 }, { "auxiliary_loss_clip": 0.01141308, "auxiliary_loss_mlp": 0.01022211, "balance_loss_clip": 1.04370952, "balance_loss_mlp": 1.01492453, "epoch": 0.7034209102386821, "flos": 24715986024960.0, "grad_norm": 1.6525937127209396, "language_loss": 0.7470383, "learning_rate": 8.537335398464467e-07, "loss": 0.76867348, "num_input_tokens_seen": 125854720, "step": 5850, "time_per_iteration": 2.7098097801208496 }, { "auxiliary_loss_clip": 0.01141042, "auxiliary_loss_mlp": 0.01026274, "balance_loss_clip": 1.04165411, "balance_loss_mlp": 1.01792979, "epoch": 0.7035411531293212, "flos": 22556174163840.0, "grad_norm": 4.396330511606085, "language_loss": 0.85512853, "learning_rate": 8.53095288985007e-07, "loss": 0.87680161, "num_input_tokens_seen": 125868455, "step": 5851, "time_per_iteration": 2.7180612087249756 }, { "auxiliary_loss_clip": 0.01165986, "auxiliary_loss_mlp": 0.01022896, "balance_loss_clip": 1.04614854, "balance_loss_mlp": 1.01604724, "epoch": 0.7036613960199604, "flos": 22674967418880.0, "grad_norm": 1.7456832123705772, "language_loss": 0.82543087, "learning_rate": 8.524572121081009e-07, "loss": 0.84731972, "num_input_tokens_seen": 125888555, "step": 5852, "time_per_iteration": 2.6090939044952393 }, { "auxiliary_loss_clip": 0.01158943, "auxiliary_loss_mlp": 0.01026124, "balance_loss_clip": 1.04509211, "balance_loss_mlp": 1.01829195, "epoch": 0.7037816389105994, "flos": 22492146170880.0, "grad_norm": 2.013494390773777, "language_loss": 0.62418544, "learning_rate": 8.518193093125232e-07, "loss": 0.64603609, "num_input_tokens_seen": 125907610, "step": 5853, "time_per_iteration": 2.684634208679199 }, { "auxiliary_loss_clip": 0.01144724, "auxiliary_loss_mlp": 0.01027009, "balance_loss_clip": 1.04435921, "balance_loss_mlp": 1.01996326, "epoch": 0.7039018818012385, "flos": 27087490690560.0, "grad_norm": 1.7444981213270008, "language_loss": 0.81068814, "learning_rate": 8.511815806950436e-07, "loss": 0.83240545, "num_input_tokens_seen": 125928640, "step": 5854, "time_per_iteration": 2.7879602909088135 }, { "auxiliary_loss_clip": 0.01150203, "auxiliary_loss_mlp": 0.01021506, "balance_loss_clip": 1.04173636, "balance_loss_mlp": 1.01486897, "epoch": 0.7040221246918776, "flos": 17749819198080.0, "grad_norm": 1.6934481351204795, "language_loss": 0.78014159, "learning_rate": 8.505440263524044e-07, "loss": 0.80185866, "num_input_tokens_seen": 125947485, "step": 5855, "time_per_iteration": 3.5706064701080322 }, { "auxiliary_loss_clip": 0.01159561, "auxiliary_loss_mlp": 0.01027849, "balance_loss_clip": 1.04597998, "balance_loss_mlp": 1.01992774, "epoch": 0.7041423675825167, "flos": 16279851012480.0, "grad_norm": 2.653077124902142, "language_loss": 0.88090879, "learning_rate": 8.49906646381322e-07, "loss": 0.90278292, "num_input_tokens_seen": 125960320, "step": 5856, "time_per_iteration": 2.6576919555664062 }, { "auxiliary_loss_clip": 0.01129434, "auxiliary_loss_mlp": 0.01024553, "balance_loss_clip": 1.04429448, "balance_loss_mlp": 1.01781225, "epoch": 0.7042626104731557, "flos": 25483181639040.0, "grad_norm": 1.7837947977575717, "language_loss": 0.7218529, "learning_rate": 8.492694408784884e-07, "loss": 0.74339271, "num_input_tokens_seen": 125980575, "step": 5857, "time_per_iteration": 2.720989465713501 }, { "auxiliary_loss_clip": 0.01160323, "auxiliary_loss_mlp": 0.01023834, "balance_loss_clip": 1.04695225, "balance_loss_mlp": 1.01656532, "epoch": 0.7043828533637949, "flos": 17857622891520.0, "grad_norm": 2.292731365337256, "language_loss": 0.62001485, "learning_rate": 8.486324099405642e-07, "loss": 0.64185643, "num_input_tokens_seen": 125997420, "step": 5858, "time_per_iteration": 3.5986831188201904 }, { "auxiliary_loss_clip": 0.0115294, "auxiliary_loss_mlp": 0.01025373, "balance_loss_clip": 1.04388487, "balance_loss_mlp": 1.01864934, "epoch": 0.704503096254434, "flos": 29494259533440.0, "grad_norm": 1.7486482528287213, "language_loss": 0.74658942, "learning_rate": 8.479955536641887e-07, "loss": 0.76837254, "num_input_tokens_seen": 126018915, "step": 5859, "time_per_iteration": 2.665661573410034 }, { "auxiliary_loss_clip": 0.01133301, "auxiliary_loss_mlp": 0.01026881, "balance_loss_clip": 1.040308, "balance_loss_mlp": 1.01982427, "epoch": 0.704623339145073, "flos": 30920739327360.0, "grad_norm": 2.8514414345908627, "language_loss": 0.66059577, "learning_rate": 8.473588721459716e-07, "loss": 0.68219757, "num_input_tokens_seen": 126038825, "step": 5860, "time_per_iteration": 3.6199865341186523 }, { "auxiliary_loss_clip": 0.01160107, "auxiliary_loss_mlp": 0.01030342, "balance_loss_clip": 1.05123973, "balance_loss_mlp": 1.0226028, "epoch": 0.7047435820357122, "flos": 23914747296000.0, "grad_norm": 2.2429486087456247, "language_loss": 0.7075417, "learning_rate": 8.467223654824967e-07, "loss": 0.72944617, "num_input_tokens_seen": 126058280, "step": 5861, "time_per_iteration": 2.6557364463806152 }, { "auxiliary_loss_clip": 0.01151159, "auxiliary_loss_mlp": 0.01023765, "balance_loss_clip": 1.04485238, "balance_loss_mlp": 1.01655626, "epoch": 0.7048638249263512, "flos": 46494010926720.0, "grad_norm": 1.8243827265668355, "language_loss": 0.62648475, "learning_rate": 8.460860337703233e-07, "loss": 0.64823401, "num_input_tokens_seen": 126078885, "step": 5862, "time_per_iteration": 2.810192584991455 }, { "auxiliary_loss_clip": 0.01114493, "auxiliary_loss_mlp": 0.01024667, "balance_loss_clip": 1.04057539, "balance_loss_mlp": 1.01715064, "epoch": 0.7049840678169903, "flos": 21689219502720.0, "grad_norm": 1.8414970453165826, "language_loss": 0.70628595, "learning_rate": 8.454498771059797e-07, "loss": 0.72767752, "num_input_tokens_seen": 126098260, "step": 5863, "time_per_iteration": 3.5392227172851562 }, { "auxiliary_loss_clip": 0.01105133, "auxiliary_loss_mlp": 0.010249, "balance_loss_clip": 1.04005861, "balance_loss_mlp": 1.01737177, "epoch": 0.7051043107076294, "flos": 18405081054720.0, "grad_norm": 2.0514093426084763, "language_loss": 0.83088732, "learning_rate": 8.448138955859725e-07, "loss": 0.85218763, "num_input_tokens_seen": 126114845, "step": 5864, "time_per_iteration": 2.888139009475708 }, { "auxiliary_loss_clip": 0.01142535, "auxiliary_loss_mlp": 0.01030837, "balance_loss_clip": 1.04514623, "balance_loss_mlp": 1.02293313, "epoch": 0.7052245535982685, "flos": 19319043640320.0, "grad_norm": 1.780424236066068, "language_loss": 0.90012884, "learning_rate": 8.44178089306778e-07, "loss": 0.9218626, "num_input_tokens_seen": 126132780, "step": 5865, "time_per_iteration": 2.635498523712158 }, { "auxiliary_loss_clip": 0.01170885, "auxiliary_loss_mlp": 0.01027926, "balance_loss_clip": 1.04911113, "balance_loss_mlp": 1.02055311, "epoch": 0.7053447964889076, "flos": 19062138591360.0, "grad_norm": 1.883842355512516, "language_loss": 0.77147472, "learning_rate": 8.4354245836485e-07, "loss": 0.79346281, "num_input_tokens_seen": 126151225, "step": 5866, "time_per_iteration": 2.5778565406799316 }, { "auxiliary_loss_clip": 0.01128326, "auxiliary_loss_mlp": 0.01026636, "balance_loss_clip": 1.04216361, "balance_loss_mlp": 1.01914394, "epoch": 0.7054650393795466, "flos": 27379228953600.0, "grad_norm": 1.5852951430768631, "language_loss": 0.72805226, "learning_rate": 8.429070028566108e-07, "loss": 0.74960184, "num_input_tokens_seen": 126172535, "step": 5867, "time_per_iteration": 2.744642496109009 }, { "auxiliary_loss_clip": 0.01153808, "auxiliary_loss_mlp": 0.01026631, "balance_loss_clip": 1.04501021, "balance_loss_mlp": 1.01926708, "epoch": 0.7055852822701858, "flos": 16102201322880.0, "grad_norm": 3.1888303874861523, "language_loss": 0.7478019, "learning_rate": 8.422717228784586e-07, "loss": 0.76960635, "num_input_tokens_seen": 126189410, "step": 5868, "time_per_iteration": 2.6816253662109375 }, { "auxiliary_loss_clip": 0.01115717, "auxiliary_loss_mlp": 0.01030291, "balance_loss_clip": 1.04673147, "balance_loss_mlp": 1.02254224, "epoch": 0.7057055251608249, "flos": 11692299744000.0, "grad_norm": 1.847081147716386, "language_loss": 0.69143838, "learning_rate": 8.416366185267663e-07, "loss": 0.71289849, "num_input_tokens_seen": 126206910, "step": 5869, "time_per_iteration": 2.6830389499664307 }, { "auxiliary_loss_clip": 0.01154564, "auxiliary_loss_mlp": 0.01026024, "balance_loss_clip": 1.04328108, "balance_loss_mlp": 1.01870191, "epoch": 0.7058257680514639, "flos": 22711560399360.0, "grad_norm": 1.630554820429009, "language_loss": 0.77729976, "learning_rate": 8.410016898978778e-07, "loss": 0.79910564, "num_input_tokens_seen": 126224385, "step": 5870, "time_per_iteration": 2.6357765197753906 }, { "auxiliary_loss_clip": 0.01110684, "auxiliary_loss_mlp": 0.01026743, "balance_loss_clip": 1.04265964, "balance_loss_mlp": 1.01990581, "epoch": 0.7059460109421031, "flos": 17529543043200.0, "grad_norm": 1.838974209451696, "language_loss": 0.79101813, "learning_rate": 8.403669370881115e-07, "loss": 0.81239235, "num_input_tokens_seen": 126243120, "step": 5871, "time_per_iteration": 2.837751865386963 }, { "auxiliary_loss_clip": 0.01170428, "auxiliary_loss_mlp": 0.01022, "balance_loss_clip": 1.04834175, "balance_loss_mlp": 1.01480556, "epoch": 0.7060662538327421, "flos": 23544687427200.0, "grad_norm": 1.6223582976913804, "language_loss": 0.78661394, "learning_rate": 8.397323601937587e-07, "loss": 0.8085382, "num_input_tokens_seen": 126263020, "step": 5872, "time_per_iteration": 2.6784138679504395 }, { "auxiliary_loss_clip": 0.01123155, "auxiliary_loss_mlp": 0.0102482, "balance_loss_clip": 1.04221499, "balance_loss_mlp": 1.01781607, "epoch": 0.7061864967233812, "flos": 30260736875520.0, "grad_norm": 1.8830647329523706, "language_loss": 0.77254868, "learning_rate": 8.390979593110838e-07, "loss": 0.7940284, "num_input_tokens_seen": 126285150, "step": 5873, "time_per_iteration": 2.7504963874816895 }, { "auxiliary_loss_clip": 0.0114814, "auxiliary_loss_mlp": 0.0102484, "balance_loss_clip": 1.04814827, "balance_loss_mlp": 1.01747894, "epoch": 0.7063067396140204, "flos": 20701460424960.0, "grad_norm": 1.7717292050157025, "language_loss": 0.81932139, "learning_rate": 8.384637345363262e-07, "loss": 0.84105116, "num_input_tokens_seen": 126304340, "step": 5874, "time_per_iteration": 2.678549289703369 }, { "auxiliary_loss_clip": 0.0113391, "auxiliary_loss_mlp": 0.01022322, "balance_loss_clip": 1.04022741, "balance_loss_mlp": 1.01479959, "epoch": 0.7064269825046594, "flos": 32266168081920.0, "grad_norm": 1.788069109942293, "language_loss": 0.76440012, "learning_rate": 8.378296859656964e-07, "loss": 0.7859624, "num_input_tokens_seen": 126325495, "step": 5875, "time_per_iteration": 2.8381645679473877 }, { "auxiliary_loss_clip": 0.01142851, "auxiliary_loss_mlp": 0.01021242, "balance_loss_clip": 1.04486203, "balance_loss_mlp": 1.01406264, "epoch": 0.7065472253952985, "flos": 30227124723840.0, "grad_norm": 6.416503624459208, "language_loss": 0.68454492, "learning_rate": 8.371958136953792e-07, "loss": 0.70618582, "num_input_tokens_seen": 126345525, "step": 5876, "time_per_iteration": 2.7895352840423584 }, { "auxiliary_loss_clip": 0.01132372, "auxiliary_loss_mlp": 0.01025826, "balance_loss_clip": 1.04298449, "balance_loss_mlp": 1.01791048, "epoch": 0.7066674682859376, "flos": 16216720859520.0, "grad_norm": 2.3377377694281836, "language_loss": 0.66504216, "learning_rate": 8.365621178215326e-07, "loss": 0.68662411, "num_input_tokens_seen": 126361995, "step": 5877, "time_per_iteration": 2.676149368286133 }, { "auxiliary_loss_clip": 0.01150474, "auxiliary_loss_mlp": 0.01026495, "balance_loss_clip": 1.0455296, "balance_loss_mlp": 1.01901507, "epoch": 0.7067877111765767, "flos": 14830461319680.0, "grad_norm": 2.410498897204674, "language_loss": 0.74916649, "learning_rate": 8.359285984402871e-07, "loss": 0.77093613, "num_input_tokens_seen": 126379260, "step": 5878, "time_per_iteration": 2.6173806190490723 }, { "auxiliary_loss_clip": 0.01140349, "auxiliary_loss_mlp": 0.01022377, "balance_loss_clip": 1.04710162, "balance_loss_mlp": 1.01585007, "epoch": 0.7069079540672157, "flos": 25440196037760.0, "grad_norm": 1.8166273628178795, "language_loss": 0.73939347, "learning_rate": 8.352952556477489e-07, "loss": 0.76102066, "num_input_tokens_seen": 126397170, "step": 5879, "time_per_iteration": 2.7111949920654297 }, { "auxiliary_loss_clip": 0.01155551, "auxiliary_loss_mlp": 0.01023513, "balance_loss_clip": 1.04740596, "balance_loss_mlp": 1.01584828, "epoch": 0.7070281969578549, "flos": 24607751368320.0, "grad_norm": 1.7274105825957071, "language_loss": 0.76615769, "learning_rate": 8.34662089539993e-07, "loss": 0.78794837, "num_input_tokens_seen": 126416680, "step": 5880, "time_per_iteration": 2.671912431716919 }, { "auxiliary_loss_clip": 0.0116726, "auxiliary_loss_mlp": 0.01029458, "balance_loss_clip": 1.0473212, "balance_loss_mlp": 1.02271068, "epoch": 0.707148439848494, "flos": 26724469887360.0, "grad_norm": 1.901765313120718, "language_loss": 0.79131633, "learning_rate": 8.340291002130722e-07, "loss": 0.8132835, "num_input_tokens_seen": 126435870, "step": 5881, "time_per_iteration": 3.5224192142486572 }, { "auxiliary_loss_clip": 0.01174361, "auxiliary_loss_mlp": 0.0102695, "balance_loss_clip": 1.04982519, "balance_loss_mlp": 1.01905274, "epoch": 0.707268682739133, "flos": 15085750256640.0, "grad_norm": 2.5281513926367496, "language_loss": 0.79943204, "learning_rate": 8.3339628776301e-07, "loss": 0.82144511, "num_input_tokens_seen": 126454010, "step": 5882, "time_per_iteration": 2.604020357131958 }, { "auxiliary_loss_clip": 0.01167813, "auxiliary_loss_mlp": 0.010232, "balance_loss_clip": 1.04697537, "balance_loss_mlp": 1.016083, "epoch": 0.7073889256297722, "flos": 34313148345600.0, "grad_norm": 2.002215409114501, "language_loss": 0.56875318, "learning_rate": 8.327636522858033e-07, "loss": 0.59066325, "num_input_tokens_seen": 126473615, "step": 5883, "time_per_iteration": 2.711421012878418 }, { "auxiliary_loss_clip": 0.01116433, "auxiliary_loss_mlp": 0.0102401, "balance_loss_clip": 1.04488218, "balance_loss_mlp": 1.01615429, "epoch": 0.7075091685204112, "flos": 20083940784000.0, "grad_norm": 2.8733439692292078, "language_loss": 0.76916456, "learning_rate": 8.321311938774225e-07, "loss": 0.79056895, "num_input_tokens_seen": 126492705, "step": 5884, "time_per_iteration": 3.744283676147461 }, { "auxiliary_loss_clip": 0.01169831, "auxiliary_loss_mlp": 0.01026067, "balance_loss_clip": 1.04607952, "balance_loss_mlp": 1.0192064, "epoch": 0.7076294114110503, "flos": 20777124424320.0, "grad_norm": 2.23620261437103, "language_loss": 0.79201716, "learning_rate": 8.314989126338104e-07, "loss": 0.81397617, "num_input_tokens_seen": 126512715, "step": 5885, "time_per_iteration": 2.61051607131958 }, { "auxiliary_loss_clip": 0.01157908, "auxiliary_loss_mlp": 0.01024618, "balance_loss_clip": 1.04575491, "balance_loss_mlp": 1.01774216, "epoch": 0.7077496543016895, "flos": 17967689141760.0, "grad_norm": 1.6573300311817207, "language_loss": 0.84372574, "learning_rate": 8.308668086508847e-07, "loss": 0.86555099, "num_input_tokens_seen": 126530795, "step": 5886, "time_per_iteration": 3.5022826194763184 }, { "auxiliary_loss_clip": 0.01129151, "auxiliary_loss_mlp": 0.01020909, "balance_loss_clip": 1.04029107, "balance_loss_mlp": 1.0135597, "epoch": 0.7078698971923285, "flos": 45478098564480.0, "grad_norm": 1.8759900335847886, "language_loss": 0.73636878, "learning_rate": 8.302348820245342e-07, "loss": 0.75786942, "num_input_tokens_seen": 126553360, "step": 5887, "time_per_iteration": 2.885112762451172 }, { "auxiliary_loss_clip": 0.01128825, "auxiliary_loss_mlp": 0.01025446, "balance_loss_clip": 1.04377675, "balance_loss_mlp": 1.01729167, "epoch": 0.7079901400829676, "flos": 26943704547840.0, "grad_norm": 3.1953111897607425, "language_loss": 0.69725049, "learning_rate": 8.296031328506232e-07, "loss": 0.71879321, "num_input_tokens_seen": 126573110, "step": 5888, "time_per_iteration": 2.7507333755493164 }, { "auxiliary_loss_clip": 0.01144247, "auxiliary_loss_mlp": 0.01022047, "balance_loss_clip": 1.04594433, "balance_loss_mlp": 1.01459336, "epoch": 0.7081103829736067, "flos": 24423206267520.0, "grad_norm": 2.0967192093696205, "language_loss": 0.75842762, "learning_rate": 8.289715612249857e-07, "loss": 0.78009057, "num_input_tokens_seen": 126593725, "step": 5889, "time_per_iteration": 3.623176097869873 }, { "auxiliary_loss_clip": 0.01139824, "auxiliary_loss_mlp": 0.01033783, "balance_loss_clip": 1.04530489, "balance_loss_mlp": 1.02636218, "epoch": 0.7082306258642458, "flos": 18543300589440.0, "grad_norm": 2.5743534432996515, "language_loss": 0.77849144, "learning_rate": 8.283401672434305e-07, "loss": 0.80022752, "num_input_tokens_seen": 126608950, "step": 5890, "time_per_iteration": 2.613532543182373 }, { "auxiliary_loss_clip": 0.01140755, "auxiliary_loss_mlp": 0.01023956, "balance_loss_clip": 1.04707265, "balance_loss_mlp": 1.01660085, "epoch": 0.7083508687548848, "flos": 23477534951040.0, "grad_norm": 2.0606732773709595, "language_loss": 0.70235968, "learning_rate": 8.277089510017412e-07, "loss": 0.72400677, "num_input_tokens_seen": 126629755, "step": 5891, "time_per_iteration": 2.7131686210632324 }, { "auxiliary_loss_clip": 0.01137579, "auxiliary_loss_mlp": 0.0102925, "balance_loss_clip": 1.04487658, "balance_loss_mlp": 1.02234232, "epoch": 0.708471111645524, "flos": 22419463000320.0, "grad_norm": 1.839939401594107, "language_loss": 0.82301378, "learning_rate": 8.270779125956719e-07, "loss": 0.84468204, "num_input_tokens_seen": 126650135, "step": 5892, "time_per_iteration": 2.6619436740875244 }, { "auxiliary_loss_clip": 0.011066, "auxiliary_loss_mlp": 0.01023856, "balance_loss_clip": 1.0414902, "balance_loss_mlp": 1.01694822, "epoch": 0.7085913545361631, "flos": 20922885815040.0, "grad_norm": 2.284891804788153, "language_loss": 0.8008424, "learning_rate": 8.264470521209505e-07, "loss": 0.82214701, "num_input_tokens_seen": 126668500, "step": 5893, "time_per_iteration": 2.7178423404693604 }, { "auxiliary_loss_clip": 0.01145722, "auxiliary_loss_mlp": 0.01029266, "balance_loss_clip": 1.04220271, "balance_loss_mlp": 1.02188158, "epoch": 0.7087115974268021, "flos": 15012384727680.0, "grad_norm": 2.5784025350627253, "language_loss": 0.76536942, "learning_rate": 8.258163696732785e-07, "loss": 0.78711927, "num_input_tokens_seen": 126686090, "step": 5894, "time_per_iteration": 2.6174371242523193 }, { "auxiliary_loss_clip": 0.01149106, "auxiliary_loss_mlp": 0.01026819, "balance_loss_clip": 1.04422677, "balance_loss_mlp": 1.01995504, "epoch": 0.7088318403174413, "flos": 21539040739200.0, "grad_norm": 1.9449283666471007, "language_loss": 0.76940322, "learning_rate": 8.251858653483288e-07, "loss": 0.79116249, "num_input_tokens_seen": 126704255, "step": 5895, "time_per_iteration": 2.702460289001465 }, { "auxiliary_loss_clip": 0.01157895, "auxiliary_loss_mlp": 0.0102679, "balance_loss_clip": 1.04820848, "balance_loss_mlp": 1.01955128, "epoch": 0.7089520832080803, "flos": 15516785462400.0, "grad_norm": 2.0950787089405587, "language_loss": 0.85921597, "learning_rate": 8.245555392417501e-07, "loss": 0.88106275, "num_input_tokens_seen": 126718910, "step": 5896, "time_per_iteration": 2.6082305908203125 }, { "auxiliary_loss_clip": 0.01096675, "auxiliary_loss_mlp": 0.01027654, "balance_loss_clip": 1.03717232, "balance_loss_mlp": 1.02061129, "epoch": 0.7090723260987194, "flos": 20412667077120.0, "grad_norm": 1.898690114962306, "language_loss": 0.78892267, "learning_rate": 8.239253914491613e-07, "loss": 0.810166, "num_input_tokens_seen": 126737235, "step": 5897, "time_per_iteration": 2.7123208045959473 }, { "auxiliary_loss_clip": 0.01123127, "auxiliary_loss_mlp": 0.01025608, "balance_loss_clip": 1.04294729, "balance_loss_mlp": 1.01774585, "epoch": 0.7091925689893585, "flos": 25668337271040.0, "grad_norm": 1.7255764156074929, "language_loss": 0.74975342, "learning_rate": 8.232954220661556e-07, "loss": 0.77124071, "num_input_tokens_seen": 126759970, "step": 5898, "time_per_iteration": 2.8093488216400146 }, { "auxiliary_loss_clip": 0.01170355, "auxiliary_loss_mlp": 0.01025523, "balance_loss_clip": 1.05083692, "balance_loss_mlp": 1.01870155, "epoch": 0.7093128118799976, "flos": 24206629213440.0, "grad_norm": 2.9105456312600353, "language_loss": 0.70237136, "learning_rate": 8.226656311882989e-07, "loss": 0.72433013, "num_input_tokens_seen": 126779280, "step": 5899, "time_per_iteration": 2.584559679031372 }, { "auxiliary_loss_clip": 0.01153614, "auxiliary_loss_mlp": 0.01025209, "balance_loss_clip": 1.04812288, "balance_loss_mlp": 1.01822639, "epoch": 0.7094330547706367, "flos": 16646786398080.0, "grad_norm": 2.202027468615871, "language_loss": 0.77109796, "learning_rate": 8.22036018911129e-07, "loss": 0.7928862, "num_input_tokens_seen": 126797310, "step": 5900, "time_per_iteration": 2.6311819553375244 }, { "auxiliary_loss_clip": 0.01173507, "auxiliary_loss_mlp": 0.01028562, "balance_loss_clip": 1.04822671, "balance_loss_mlp": 1.02049768, "epoch": 0.7095532976612757, "flos": 16283370545280.0, "grad_norm": 2.6782141234475025, "language_loss": 0.80760813, "learning_rate": 8.214065853301599e-07, "loss": 0.82962883, "num_input_tokens_seen": 126812840, "step": 5901, "time_per_iteration": 2.542280912399292 }, { "auxiliary_loss_clip": 0.01061038, "auxiliary_loss_mlp": 0.01002907, "balance_loss_clip": 1.01320195, "balance_loss_mlp": 1.00169706, "epoch": 0.7096735405519149, "flos": 70722080559360.0, "grad_norm": 0.8082763908702347, "language_loss": 0.58198375, "learning_rate": 8.207773305408734e-07, "loss": 0.60262328, "num_input_tokens_seen": 126880060, "step": 5902, "time_per_iteration": 3.3593337535858154 }, { "auxiliary_loss_clip": 0.01123063, "auxiliary_loss_mlp": 0.01026826, "balance_loss_clip": 1.04185033, "balance_loss_mlp": 1.01918447, "epoch": 0.709793783442554, "flos": 23621500661760.0, "grad_norm": 2.5054197816682438, "language_loss": 0.80036533, "learning_rate": 8.201482546387288e-07, "loss": 0.82186425, "num_input_tokens_seen": 126899535, "step": 5903, "time_per_iteration": 2.7340495586395264 }, { "auxiliary_loss_clip": 0.01154949, "auxiliary_loss_mlp": 0.01025754, "balance_loss_clip": 1.04708004, "balance_loss_mlp": 1.01866138, "epoch": 0.709914026333193, "flos": 25993472204160.0, "grad_norm": 1.6933203642417367, "language_loss": 0.91702288, "learning_rate": 8.195193577191553e-07, "loss": 0.9388299, "num_input_tokens_seen": 126921365, "step": 5904, "time_per_iteration": 2.6866002082824707 }, { "auxiliary_loss_clip": 0.01147093, "auxiliary_loss_mlp": 0.00762469, "balance_loss_clip": 1.04371846, "balance_loss_mlp": 1.00045228, "epoch": 0.7100342692238322, "flos": 24861531934080.0, "grad_norm": 1.6314562037770444, "language_loss": 0.84326196, "learning_rate": 8.188906398775579e-07, "loss": 0.86235756, "num_input_tokens_seen": 126941910, "step": 5905, "time_per_iteration": 2.761972188949585 }, { "auxiliary_loss_clip": 0.01169285, "auxiliary_loss_mlp": 0.0076245, "balance_loss_clip": 1.04590249, "balance_loss_mlp": 1.00046694, "epoch": 0.7101545121144712, "flos": 24932203943040.0, "grad_norm": 1.710471975434766, "language_loss": 0.68513906, "learning_rate": 8.18262101209311e-07, "loss": 0.70445639, "num_input_tokens_seen": 126961120, "step": 5906, "time_per_iteration": 2.630396842956543 }, { "auxiliary_loss_clip": 0.01157425, "auxiliary_loss_mlp": 0.01029922, "balance_loss_clip": 1.0448904, "balance_loss_mlp": 1.02279902, "epoch": 0.7102747550051103, "flos": 23768842250880.0, "grad_norm": 1.8503137651901902, "language_loss": 0.70115912, "learning_rate": 8.176337418097626e-07, "loss": 0.72303259, "num_input_tokens_seen": 126981590, "step": 5907, "time_per_iteration": 3.6312475204467773 }, { "auxiliary_loss_clip": 0.01156656, "auxiliary_loss_mlp": 0.00761924, "balance_loss_clip": 1.0475893, "balance_loss_mlp": 1.00051224, "epoch": 0.7103949978957494, "flos": 15303907509120.0, "grad_norm": 2.757161381079919, "language_loss": 0.79969442, "learning_rate": 8.170055617742364e-07, "loss": 0.8188802, "num_input_tokens_seen": 126998870, "step": 5908, "time_per_iteration": 2.679163694381714 }, { "auxiliary_loss_clip": 0.01136317, "auxiliary_loss_mlp": 0.01025491, "balance_loss_clip": 1.04399431, "balance_loss_mlp": 1.01798713, "epoch": 0.7105152407863885, "flos": 22638805401600.0, "grad_norm": 2.103260247446368, "language_loss": 0.7078436, "learning_rate": 8.163775611980252e-07, "loss": 0.72946167, "num_input_tokens_seen": 127017980, "step": 5909, "time_per_iteration": 2.7439260482788086 }, { "auxiliary_loss_clip": 0.01139283, "auxiliary_loss_mlp": 0.01022215, "balance_loss_clip": 1.04500115, "balance_loss_mlp": 1.01541734, "epoch": 0.7106354836770276, "flos": 17238594879360.0, "grad_norm": 1.8070948103970246, "language_loss": 0.78807652, "learning_rate": 8.157497401763982e-07, "loss": 0.80969149, "num_input_tokens_seen": 127035645, "step": 5910, "time_per_iteration": 3.60172438621521 }, { "auxiliary_loss_clip": 0.01154529, "auxiliary_loss_mlp": 0.01022287, "balance_loss_clip": 1.04695928, "balance_loss_mlp": 1.01539624, "epoch": 0.7107557265676667, "flos": 20193647898240.0, "grad_norm": 1.7164480080624704, "language_loss": 0.77956212, "learning_rate": 8.151220988045935e-07, "loss": 0.80133027, "num_input_tokens_seen": 127054900, "step": 5911, "time_per_iteration": 2.6132876873016357 }, { "auxiliary_loss_clip": 0.01153708, "auxiliary_loss_mlp": 0.01025681, "balance_loss_clip": 1.04624891, "balance_loss_mlp": 1.01847458, "epoch": 0.7108759694583058, "flos": 21507080613120.0, "grad_norm": 1.7352923060080183, "language_loss": 0.82478666, "learning_rate": 8.144946371778234e-07, "loss": 0.84658056, "num_input_tokens_seen": 127075010, "step": 5912, "time_per_iteration": 3.590484380722046 }, { "auxiliary_loss_clip": 0.01141794, "auxiliary_loss_mlp": 0.00762773, "balance_loss_clip": 1.04644072, "balance_loss_mlp": 1.00052118, "epoch": 0.7109962123489448, "flos": 24061909317120.0, "grad_norm": 2.047124359657653, "language_loss": 0.78447074, "learning_rate": 8.138673553912751e-07, "loss": 0.80351645, "num_input_tokens_seen": 127095570, "step": 5913, "time_per_iteration": 2.673729658126831 }, { "auxiliary_loss_clip": 0.01112955, "auxiliary_loss_mlp": 0.01027695, "balance_loss_clip": 1.04174948, "balance_loss_mlp": 1.02010727, "epoch": 0.711116455239584, "flos": 30480474326400.0, "grad_norm": 2.851097068313711, "language_loss": 0.57223094, "learning_rate": 8.132402535401059e-07, "loss": 0.59363741, "num_input_tokens_seen": 127116825, "step": 5914, "time_per_iteration": 2.793269157409668 }, { "auxiliary_loss_clip": 0.01152043, "auxiliary_loss_mlp": 0.01023294, "balance_loss_clip": 1.04684234, "balance_loss_mlp": 1.01674974, "epoch": 0.711236698130223, "flos": 25045610158080.0, "grad_norm": 1.6829144585458093, "language_loss": 0.74032915, "learning_rate": 8.126133317194465e-07, "loss": 0.76208258, "num_input_tokens_seen": 127137015, "step": 5915, "time_per_iteration": 3.5337934494018555 }, { "auxiliary_loss_clip": 0.01108594, "auxiliary_loss_mlp": 0.01027977, "balance_loss_clip": 1.03973198, "balance_loss_mlp": 1.0197103, "epoch": 0.7113569410208621, "flos": 24206701040640.0, "grad_norm": 2.8184517218380045, "language_loss": 0.74649179, "learning_rate": 8.11986590024401e-07, "loss": 0.76785749, "num_input_tokens_seen": 127156755, "step": 5916, "time_per_iteration": 2.6757571697235107 }, { "auxiliary_loss_clip": 0.01144506, "auxiliary_loss_mlp": 0.01027204, "balance_loss_clip": 1.04645419, "balance_loss_mlp": 1.01927018, "epoch": 0.7114771839115013, "flos": 35439306526080.0, "grad_norm": 1.7827660910154974, "language_loss": 0.68887722, "learning_rate": 8.113600285500442e-07, "loss": 0.71059442, "num_input_tokens_seen": 127176965, "step": 5917, "time_per_iteration": 2.7090091705322266 }, { "auxiliary_loss_clip": 0.01169557, "auxiliary_loss_mlp": 0.01024143, "balance_loss_clip": 1.04748249, "balance_loss_mlp": 1.0172441, "epoch": 0.7115974268021403, "flos": 21099458096640.0, "grad_norm": 2.0218899686022174, "language_loss": 0.74298537, "learning_rate": 8.107336473914268e-07, "loss": 0.76492238, "num_input_tokens_seen": 127195595, "step": 5918, "time_per_iteration": 2.459580659866333 }, { "auxiliary_loss_clip": 0.01047059, "auxiliary_loss_mlp": 0.01001011, "balance_loss_clip": 1.01185524, "balance_loss_mlp": 0.99990267, "epoch": 0.7117176696927794, "flos": 56752866616320.0, "grad_norm": 0.7679232797553799, "language_loss": 0.55732477, "learning_rate": 8.101074466435694e-07, "loss": 0.57780546, "num_input_tokens_seen": 127255070, "step": 5919, "time_per_iteration": 3.1003365516662598 }, { "auxiliary_loss_clip": 0.01152486, "auxiliary_loss_mlp": 0.01022812, "balance_loss_clip": 1.04656494, "balance_loss_mlp": 1.01515889, "epoch": 0.7118379125834186, "flos": 15925269905280.0, "grad_norm": 1.9849101782874348, "language_loss": 0.68176591, "learning_rate": 8.094814264014662e-07, "loss": 0.70351887, "num_input_tokens_seen": 127273825, "step": 5920, "time_per_iteration": 2.6907846927642822 }, { "auxiliary_loss_clip": 0.01171273, "auxiliary_loss_mlp": 0.01022419, "balance_loss_clip": 1.04736602, "balance_loss_mlp": 1.01513565, "epoch": 0.7119581554740576, "flos": 20193360589440.0, "grad_norm": 3.2857617322561294, "language_loss": 0.81235427, "learning_rate": 8.088555867600844e-07, "loss": 0.83429122, "num_input_tokens_seen": 127289990, "step": 5921, "time_per_iteration": 2.5715837478637695 }, { "auxiliary_loss_clip": 0.01122988, "auxiliary_loss_mlp": 0.0102521, "balance_loss_clip": 1.04130816, "balance_loss_mlp": 1.01872778, "epoch": 0.7120783983646967, "flos": 34715383822080.0, "grad_norm": 1.8127407553111445, "language_loss": 0.60598958, "learning_rate": 8.08229927814362e-07, "loss": 0.62747151, "num_input_tokens_seen": 127312880, "step": 5922, "time_per_iteration": 2.7809062004089355 }, { "auxiliary_loss_clip": 0.0112293, "auxiliary_loss_mlp": 0.01022849, "balance_loss_clip": 1.04056847, "balance_loss_mlp": 1.01609588, "epoch": 0.7121986412553358, "flos": 26359114700160.0, "grad_norm": 1.8650590095781927, "language_loss": 0.64797169, "learning_rate": 8.076044496592134e-07, "loss": 0.66942954, "num_input_tokens_seen": 127334730, "step": 5923, "time_per_iteration": 2.7348692417144775 }, { "auxiliary_loss_clip": 0.01139529, "auxiliary_loss_mlp": 0.01021883, "balance_loss_clip": 1.04551697, "balance_loss_mlp": 1.01435792, "epoch": 0.7123188841459749, "flos": 11145344371200.0, "grad_norm": 2.524536439488622, "language_loss": 0.78285807, "learning_rate": 8.069791523895204e-07, "loss": 0.80447221, "num_input_tokens_seen": 127351180, "step": 5924, "time_per_iteration": 2.5905065536499023 }, { "auxiliary_loss_clip": 0.01114704, "auxiliary_loss_mlp": 0.01023471, "balance_loss_clip": 1.03866816, "balance_loss_mlp": 1.01600814, "epoch": 0.7124391270366139, "flos": 20811670329600.0, "grad_norm": 6.841786142866126, "language_loss": 0.77432728, "learning_rate": 8.063540361001422e-07, "loss": 0.79570901, "num_input_tokens_seen": 127369750, "step": 5925, "time_per_iteration": 2.6896307468414307 }, { "auxiliary_loss_clip": 0.01121089, "auxiliary_loss_mlp": 0.01032219, "balance_loss_clip": 1.04248571, "balance_loss_mlp": 1.02471137, "epoch": 0.7125593699272531, "flos": 17603734584960.0, "grad_norm": 2.465766419307217, "language_loss": 0.7939074, "learning_rate": 8.057291008859069e-07, "loss": 0.81544048, "num_input_tokens_seen": 127387910, "step": 5926, "time_per_iteration": 2.664895534515381 }, { "auxiliary_loss_clip": 0.01151792, "auxiliary_loss_mlp": 0.01027993, "balance_loss_clip": 1.04357004, "balance_loss_mlp": 1.02069759, "epoch": 0.7126796128178922, "flos": 28654057526400.0, "grad_norm": 1.8188092895304209, "language_loss": 0.682046, "learning_rate": 8.051043468416187e-07, "loss": 0.70384383, "num_input_tokens_seen": 127409160, "step": 5927, "time_per_iteration": 2.7000789642333984 }, { "auxiliary_loss_clip": 0.01166078, "auxiliary_loss_mlp": 0.01024723, "balance_loss_clip": 1.04744303, "balance_loss_mlp": 1.0174036, "epoch": 0.7127998557085312, "flos": 16034438315520.0, "grad_norm": 1.8396817355264647, "language_loss": 0.8211208, "learning_rate": 8.044797740620506e-07, "loss": 0.84302884, "num_input_tokens_seen": 127427765, "step": 5928, "time_per_iteration": 2.573573589324951 }, { "auxiliary_loss_clip": 0.01107988, "auxiliary_loss_mlp": 0.01025481, "balance_loss_clip": 1.04360652, "balance_loss_mlp": 1.01835251, "epoch": 0.7129200985991703, "flos": 23403271582080.0, "grad_norm": 2.1656322590607457, "language_loss": 0.7888298, "learning_rate": 8.038553826419494e-07, "loss": 0.81016451, "num_input_tokens_seen": 127446475, "step": 5929, "time_per_iteration": 2.7168281078338623 }, { "auxiliary_loss_clip": 0.01165737, "auxiliary_loss_mlp": 0.01023815, "balance_loss_clip": 1.0450201, "balance_loss_mlp": 1.01649308, "epoch": 0.7130403414898094, "flos": 21397445326080.0, "grad_norm": 1.6187897262602156, "language_loss": 0.80862772, "learning_rate": 8.032311726760364e-07, "loss": 0.83052325, "num_input_tokens_seen": 127467695, "step": 5930, "time_per_iteration": 2.6366443634033203 }, { "auxiliary_loss_clip": 0.01119721, "auxiliary_loss_mlp": 0.01023537, "balance_loss_clip": 1.04200113, "balance_loss_mlp": 1.01580071, "epoch": 0.7131605843804485, "flos": 74739045306240.0, "grad_norm": 1.7620532007737026, "language_loss": 0.68559384, "learning_rate": 8.026071442590022e-07, "loss": 0.70702642, "num_input_tokens_seen": 127494590, "step": 5931, "time_per_iteration": 3.1424460411071777 }, { "auxiliary_loss_clip": 0.01156318, "auxiliary_loss_mlp": 0.01022984, "balance_loss_clip": 1.04968476, "balance_loss_mlp": 1.01577759, "epoch": 0.7132808272710875, "flos": 18368739469440.0, "grad_norm": 2.174486020010103, "language_loss": 0.80873203, "learning_rate": 8.019832974855134e-07, "loss": 0.83052504, "num_input_tokens_seen": 127512550, "step": 5932, "time_per_iteration": 2.598475933074951 }, { "auxiliary_loss_clip": 0.01127881, "auxiliary_loss_mlp": 0.01029972, "balance_loss_clip": 1.04424143, "balance_loss_mlp": 1.02288795, "epoch": 0.7134010701617267, "flos": 23253380127360.0, "grad_norm": 2.459369121151865, "language_loss": 0.8229093, "learning_rate": 8.013596324502052e-07, "loss": 0.84448785, "num_input_tokens_seen": 127531015, "step": 5933, "time_per_iteration": 3.603365659713745 }, { "auxiliary_loss_clip": 0.01149981, "auxiliary_loss_mlp": 0.0102105, "balance_loss_clip": 1.04728651, "balance_loss_mlp": 1.01408529, "epoch": 0.7135213130523658, "flos": 23653137565440.0, "grad_norm": 1.8042318350612188, "language_loss": 0.79032642, "learning_rate": 8.007361492476872e-07, "loss": 0.81203675, "num_input_tokens_seen": 127550340, "step": 5934, "time_per_iteration": 2.6694788932800293 }, { "auxiliary_loss_clip": 0.0113213, "auxiliary_loss_mlp": 0.01028817, "balance_loss_clip": 1.04175377, "balance_loss_mlp": 1.02148545, "epoch": 0.7136415559430048, "flos": 24790644443520.0, "grad_norm": 4.442838448246524, "language_loss": 0.793989, "learning_rate": 8.001128479725426e-07, "loss": 0.81559849, "num_input_tokens_seen": 127572245, "step": 5935, "time_per_iteration": 2.7471015453338623 }, { "auxiliary_loss_clip": 0.01106274, "auxiliary_loss_mlp": 0.01028584, "balance_loss_clip": 1.03985548, "balance_loss_mlp": 1.02197993, "epoch": 0.713761798833644, "flos": 18296954138880.0, "grad_norm": 1.9841931128431745, "language_loss": 0.81385779, "learning_rate": 7.994897287193248e-07, "loss": 0.83520633, "num_input_tokens_seen": 127591625, "step": 5936, "time_per_iteration": 3.6976704597473145 }, { "auxiliary_loss_clip": 0.01157485, "auxiliary_loss_mlp": 0.01024478, "balance_loss_clip": 1.04483795, "balance_loss_mlp": 1.01725376, "epoch": 0.713882041724283, "flos": 15558262692480.0, "grad_norm": 2.409587358933706, "language_loss": 0.83759689, "learning_rate": 7.988667915825605e-07, "loss": 0.85941648, "num_input_tokens_seen": 127608690, "step": 5937, "time_per_iteration": 2.660447835922241 }, { "auxiliary_loss_clip": 0.01144788, "auxiliary_loss_mlp": 0.01029405, "balance_loss_clip": 1.04724205, "balance_loss_mlp": 1.02207708, "epoch": 0.7140022846149221, "flos": 24061011477120.0, "grad_norm": 2.14795705158136, "language_loss": 0.75779063, "learning_rate": 7.982440366567491e-07, "loss": 0.77953255, "num_input_tokens_seen": 127627180, "step": 5938, "time_per_iteration": 3.5851340293884277 }, { "auxiliary_loss_clip": 0.01146946, "auxiliary_loss_mlp": 0.01023199, "balance_loss_clip": 1.043504, "balance_loss_mlp": 1.01619244, "epoch": 0.7141225275055613, "flos": 27891710248320.0, "grad_norm": 1.7926098099859056, "language_loss": 0.75308728, "learning_rate": 7.97621464036361e-07, "loss": 0.77478874, "num_input_tokens_seen": 127648940, "step": 5939, "time_per_iteration": 2.6826679706573486 }, { "auxiliary_loss_clip": 0.011573, "auxiliary_loss_mlp": 0.01022263, "balance_loss_clip": 1.04564404, "balance_loss_mlp": 1.01489615, "epoch": 0.7142427703962003, "flos": 19682603147520.0, "grad_norm": 1.6951551781152054, "language_loss": 0.67913485, "learning_rate": 7.969990738158417e-07, "loss": 0.70093048, "num_input_tokens_seen": 127667350, "step": 5940, "time_per_iteration": 3.506030321121216 }, { "auxiliary_loss_clip": 0.01158231, "auxiliary_loss_mlp": 0.01032495, "balance_loss_clip": 1.04922748, "balance_loss_mlp": 1.02481198, "epoch": 0.7143630132868394, "flos": 21032377447680.0, "grad_norm": 1.9661892906724783, "language_loss": 0.85181099, "learning_rate": 7.963768660896062e-07, "loss": 0.87371826, "num_input_tokens_seen": 127685760, "step": 5941, "time_per_iteration": 2.630401134490967 }, { "auxiliary_loss_clip": 0.01157426, "auxiliary_loss_mlp": 0.01024367, "balance_loss_clip": 1.04584062, "balance_loss_mlp": 1.01707125, "epoch": 0.7144832561774785, "flos": 24129923719680.0, "grad_norm": 6.840594556584783, "language_loss": 0.82687521, "learning_rate": 7.957548409520432e-07, "loss": 0.84869313, "num_input_tokens_seen": 127704985, "step": 5942, "time_per_iteration": 2.680835008621216 }, { "auxiliary_loss_clip": 0.01123067, "auxiliary_loss_mlp": 0.0102747, "balance_loss_clip": 1.04060268, "balance_loss_mlp": 1.02066875, "epoch": 0.7146034990681176, "flos": 16325817442560.0, "grad_norm": 1.9461005764905417, "language_loss": 0.84059095, "learning_rate": 7.951329984975135e-07, "loss": 0.86209631, "num_input_tokens_seen": 127721925, "step": 5943, "time_per_iteration": 2.6393728256225586 }, { "auxiliary_loss_clip": 0.01042877, "auxiliary_loss_mlp": 0.01002024, "balance_loss_clip": 1.01289892, "balance_loss_mlp": 1.00084364, "epoch": 0.7147237419587567, "flos": 69627164232960.0, "grad_norm": 0.7124811579652193, "language_loss": 0.54289871, "learning_rate": 7.94511338820349e-07, "loss": 0.56334776, "num_input_tokens_seen": 127784230, "step": 5944, "time_per_iteration": 3.2774412631988525 }, { "auxiliary_loss_clip": 0.0114109, "auxiliary_loss_mlp": 0.00762722, "balance_loss_clip": 1.04525328, "balance_loss_mlp": 1.00051308, "epoch": 0.7148439848493958, "flos": 22266806198400.0, "grad_norm": 1.8904318763387613, "language_loss": 0.78501368, "learning_rate": 7.938898620148575e-07, "loss": 0.80405176, "num_input_tokens_seen": 127801990, "step": 5945, "time_per_iteration": 2.7025394439697266 }, { "auxiliary_loss_clip": 0.01140603, "auxiliary_loss_mlp": 0.01020908, "balance_loss_clip": 1.0446744, "balance_loss_mlp": 1.01346302, "epoch": 0.7149642277400349, "flos": 17931383470080.0, "grad_norm": 2.130988484450793, "language_loss": 0.70861125, "learning_rate": 7.932685681753135e-07, "loss": 0.73022634, "num_input_tokens_seen": 127819270, "step": 5946, "time_per_iteration": 2.6213414669036865 }, { "auxiliary_loss_clip": 0.01166013, "auxiliary_loss_mlp": 0.01023166, "balance_loss_clip": 1.04735911, "balance_loss_mlp": 1.01633286, "epoch": 0.7150844706306739, "flos": 31681937370240.0, "grad_norm": 1.933412929800732, "language_loss": 0.62316501, "learning_rate": 7.92647457395969e-07, "loss": 0.64505678, "num_input_tokens_seen": 127841095, "step": 5947, "time_per_iteration": 2.6395583152770996 }, { "auxiliary_loss_clip": 0.01102934, "auxiliary_loss_mlp": 0.01029663, "balance_loss_clip": 1.03632379, "balance_loss_mlp": 1.02180099, "epoch": 0.7152047135213131, "flos": 10926217451520.0, "grad_norm": 2.3880072057433552, "language_loss": 0.73875451, "learning_rate": 7.920265297710444e-07, "loss": 0.76008046, "num_input_tokens_seen": 127858485, "step": 5948, "time_per_iteration": 2.6950740814208984 }, { "auxiliary_loss_clip": 0.01157048, "auxiliary_loss_mlp": 0.01020167, "balance_loss_clip": 1.04737473, "balance_loss_mlp": 1.01358962, "epoch": 0.7153249564119522, "flos": 20995640812800.0, "grad_norm": 1.8280224720273721, "language_loss": 0.73403251, "learning_rate": 7.914057853947363e-07, "loss": 0.75580466, "num_input_tokens_seen": 127877665, "step": 5949, "time_per_iteration": 2.614824056625366 }, { "auxiliary_loss_clip": 0.01121788, "auxiliary_loss_mlp": 0.01025135, "balance_loss_clip": 1.040748, "balance_loss_mlp": 1.01800048, "epoch": 0.7154451993025912, "flos": 24243114453120.0, "grad_norm": 1.8863226953057652, "language_loss": 0.62808287, "learning_rate": 7.907852243612089e-07, "loss": 0.64955211, "num_input_tokens_seen": 127898070, "step": 5950, "time_per_iteration": 2.749410629272461 }, { "auxiliary_loss_clip": 0.0113834, "auxiliary_loss_mlp": 0.01027133, "balance_loss_clip": 1.04387915, "balance_loss_mlp": 1.01988506, "epoch": 0.7155654421932304, "flos": 23330947547520.0, "grad_norm": 1.8682166625379448, "language_loss": 0.72102207, "learning_rate": 7.901648467646009e-07, "loss": 0.74267679, "num_input_tokens_seen": 127917010, "step": 5951, "time_per_iteration": 2.702850103378296 }, { "auxiliary_loss_clip": 0.01170848, "auxiliary_loss_mlp": 0.01022821, "balance_loss_clip": 1.0481391, "balance_loss_mlp": 1.01568353, "epoch": 0.7156856850838694, "flos": 22711883621760.0, "grad_norm": 1.5563958634665287, "language_loss": 0.72497296, "learning_rate": 7.895446526990244e-07, "loss": 0.74690974, "num_input_tokens_seen": 127937025, "step": 5952, "time_per_iteration": 2.5904288291931152 }, { "auxiliary_loss_clip": 0.01120202, "auxiliary_loss_mlp": 0.01035995, "balance_loss_clip": 1.04080248, "balance_loss_mlp": 1.02877665, "epoch": 0.7158059279745085, "flos": 19865424395520.0, "grad_norm": 1.6972202369004763, "language_loss": 0.75927198, "learning_rate": 7.889246422585609e-07, "loss": 0.78083396, "num_input_tokens_seen": 127956410, "step": 5953, "time_per_iteration": 2.7835564613342285 }, { "auxiliary_loss_clip": 0.01167948, "auxiliary_loss_mlp": 0.01026997, "balance_loss_clip": 1.04694104, "balance_loss_mlp": 1.01972568, "epoch": 0.7159261708651476, "flos": 24134772055680.0, "grad_norm": 2.0041077298020995, "language_loss": 0.73456115, "learning_rate": 7.883048155372675e-07, "loss": 0.75651062, "num_input_tokens_seen": 127974925, "step": 5954, "time_per_iteration": 2.6042799949645996 }, { "auxiliary_loss_clip": 0.01145187, "auxiliary_loss_mlp": 0.01022115, "balance_loss_clip": 1.04430723, "balance_loss_mlp": 1.01536453, "epoch": 0.7160464137557867, "flos": 16983198201600.0, "grad_norm": 2.1614894877926605, "language_loss": 0.71660602, "learning_rate": 7.876851726291698e-07, "loss": 0.73827899, "num_input_tokens_seen": 127993225, "step": 5955, "time_per_iteration": 2.639533042907715 }, { "auxiliary_loss_clip": 0.01129228, "auxiliary_loss_mlp": 0.01029034, "balance_loss_clip": 1.03995728, "balance_loss_mlp": 1.02219403, "epoch": 0.7161666566464258, "flos": 25228251838080.0, "grad_norm": 1.9059783187946302, "language_loss": 0.78292072, "learning_rate": 7.870657136282666e-07, "loss": 0.80450332, "num_input_tokens_seen": 128012085, "step": 5956, "time_per_iteration": 2.703115701675415 }, { "auxiliary_loss_clip": 0.01149507, "auxiliary_loss_mlp": 0.01024269, "balance_loss_clip": 1.04472268, "balance_loss_mlp": 1.01684833, "epoch": 0.7162868995370649, "flos": 26468390851200.0, "grad_norm": 1.6729137686104236, "language_loss": 0.82032263, "learning_rate": 7.86446438628531e-07, "loss": 0.84206039, "num_input_tokens_seen": 128033155, "step": 5957, "time_per_iteration": 2.6468446254730225 }, { "auxiliary_loss_clip": 0.01069234, "auxiliary_loss_mlp": 0.01001364, "balance_loss_clip": 1.0134325, "balance_loss_mlp": 1.00027299, "epoch": 0.716407142427704, "flos": 69998912040960.0, "grad_norm": 0.7641018382065691, "language_loss": 0.56902033, "learning_rate": 7.858273477239059e-07, "loss": 0.58972633, "num_input_tokens_seen": 128101575, "step": 5958, "time_per_iteration": 3.215191602706909 }, { "auxiliary_loss_clip": 0.01100212, "auxiliary_loss_mlp": 0.01023783, "balance_loss_clip": 1.03949308, "balance_loss_mlp": 1.01655865, "epoch": 0.716527385318343, "flos": 20740459616640.0, "grad_norm": 1.9060084375664466, "language_loss": 0.71255577, "learning_rate": 7.852084410083067e-07, "loss": 0.73379564, "num_input_tokens_seen": 128120395, "step": 5959, "time_per_iteration": 3.6476917266845703 }, { "auxiliary_loss_clip": 0.01134645, "auxiliary_loss_mlp": 0.01025604, "balance_loss_clip": 1.04458988, "balance_loss_mlp": 1.01947355, "epoch": 0.7166476282089821, "flos": 25371966153600.0, "grad_norm": 1.6330414503290143, "language_loss": 0.63672316, "learning_rate": 7.84589718575621e-07, "loss": 0.65832567, "num_input_tokens_seen": 128140840, "step": 5960, "time_per_iteration": 2.732532262802124 }, { "auxiliary_loss_clip": 0.01140272, "auxiliary_loss_mlp": 0.01021925, "balance_loss_clip": 1.03991461, "balance_loss_mlp": 1.0148083, "epoch": 0.7167678710996213, "flos": 24133730561280.0, "grad_norm": 1.9200712352608669, "language_loss": 0.68690026, "learning_rate": 7.83971180519708e-07, "loss": 0.7085222, "num_input_tokens_seen": 128159695, "step": 5961, "time_per_iteration": 2.6491539478302 }, { "auxiliary_loss_clip": 0.01171959, "auxiliary_loss_mlp": 0.0102436, "balance_loss_clip": 1.04853952, "balance_loss_mlp": 1.01650441, "epoch": 0.7168881139902603, "flos": 30226586019840.0, "grad_norm": 2.7216692236929183, "language_loss": 0.76240253, "learning_rate": 7.833528269344008e-07, "loss": 0.78436577, "num_input_tokens_seen": 128179600, "step": 5962, "time_per_iteration": 3.6375930309295654 }, { "auxiliary_loss_clip": 0.01122943, "auxiliary_loss_mlp": 0.01033116, "balance_loss_clip": 1.0431627, "balance_loss_mlp": 1.02523637, "epoch": 0.7170083568808994, "flos": 14606414236800.0, "grad_norm": 2.884749995217395, "language_loss": 0.77378017, "learning_rate": 7.827346579135023e-07, "loss": 0.79534072, "num_input_tokens_seen": 128196940, "step": 5963, "time_per_iteration": 2.7571957111358643 }, { "auxiliary_loss_clip": 0.01136467, "auxiliary_loss_mlp": 0.01025166, "balance_loss_clip": 1.04239571, "balance_loss_mlp": 1.01776636, "epoch": 0.7171285997715385, "flos": 23331091201920.0, "grad_norm": 2.231754411048932, "language_loss": 0.83027744, "learning_rate": 7.821166735507885e-07, "loss": 0.85189378, "num_input_tokens_seen": 128215970, "step": 5964, "time_per_iteration": 3.611799478530884 }, { "auxiliary_loss_clip": 0.01167917, "auxiliary_loss_mlp": 0.01026795, "balance_loss_clip": 1.0479157, "balance_loss_mlp": 1.01935649, "epoch": 0.7172488426621776, "flos": 16543543731840.0, "grad_norm": 1.6049486425254005, "language_loss": 0.68293786, "learning_rate": 7.81498873940007e-07, "loss": 0.70488495, "num_input_tokens_seen": 128233185, "step": 5965, "time_per_iteration": 2.648043632507324 }, { "auxiliary_loss_clip": 0.01160138, "auxiliary_loss_mlp": 0.01021966, "balance_loss_clip": 1.04485679, "balance_loss_mlp": 1.0142622, "epoch": 0.7173690855528166, "flos": 26541612725760.0, "grad_norm": 2.229137068378478, "language_loss": 0.7772367, "learning_rate": 7.808812591748768e-07, "loss": 0.79905772, "num_input_tokens_seen": 128253565, "step": 5966, "time_per_iteration": 3.5203135013580322 }, { "auxiliary_loss_clip": 0.01122912, "auxiliary_loss_mlp": 0.01025146, "balance_loss_clip": 1.04213619, "balance_loss_mlp": 1.01720667, "epoch": 0.7174893284434558, "flos": 22784099915520.0, "grad_norm": 2.4930044474866864, "language_loss": 0.65307075, "learning_rate": 7.802638293490915e-07, "loss": 0.67455125, "num_input_tokens_seen": 128273210, "step": 5967, "time_per_iteration": 2.6806979179382324 }, { "auxiliary_loss_clip": 0.01144379, "auxiliary_loss_mlp": 0.01024641, "balance_loss_clip": 1.0452112, "balance_loss_mlp": 1.01801848, "epoch": 0.7176095713340949, "flos": 23293564467840.0, "grad_norm": 1.614967961281152, "language_loss": 0.7705856, "learning_rate": 7.796465845563123e-07, "loss": 0.79227579, "num_input_tokens_seen": 128292085, "step": 5968, "time_per_iteration": 2.6985793113708496 }, { "auxiliary_loss_clip": 0.01136139, "auxiliary_loss_mlp": 0.00762136, "balance_loss_clip": 1.04323161, "balance_loss_mlp": 1.00048971, "epoch": 0.7177298142247339, "flos": 25591631777280.0, "grad_norm": 3.128429018381726, "language_loss": 0.79634362, "learning_rate": 7.790295248901766e-07, "loss": 0.81532633, "num_input_tokens_seen": 128313215, "step": 5969, "time_per_iteration": 2.6772968769073486 }, { "auxiliary_loss_clip": 0.01154983, "auxiliary_loss_mlp": 0.01023092, "balance_loss_clip": 1.04515159, "balance_loss_mlp": 1.0153079, "epoch": 0.7178500571153731, "flos": 31652778504960.0, "grad_norm": 1.8376642967099126, "language_loss": 0.62145644, "learning_rate": 7.784126504442902e-07, "loss": 0.64323717, "num_input_tokens_seen": 128336445, "step": 5970, "time_per_iteration": 2.7576663494110107 }, { "auxiliary_loss_clip": 0.0111631, "auxiliary_loss_mlp": 0.01021658, "balance_loss_clip": 1.04042506, "balance_loss_mlp": 1.01477635, "epoch": 0.7179703000060121, "flos": 19427242383360.0, "grad_norm": 1.3786158693642658, "language_loss": 0.67538601, "learning_rate": 7.777959613122351e-07, "loss": 0.69676572, "num_input_tokens_seen": 128356270, "step": 5971, "time_per_iteration": 2.6522915363311768 }, { "auxiliary_loss_clip": 0.01136941, "auxiliary_loss_mlp": 0.01022972, "balance_loss_clip": 1.04647112, "balance_loss_mlp": 1.01570654, "epoch": 0.7180905428966512, "flos": 28839249072000.0, "grad_norm": 1.6306388324022445, "language_loss": 0.77911288, "learning_rate": 7.771794575875604e-07, "loss": 0.80071199, "num_input_tokens_seen": 128378140, "step": 5972, "time_per_iteration": 2.6966331005096436 }, { "auxiliary_loss_clip": 0.01153122, "auxiliary_loss_mlp": 0.01027631, "balance_loss_clip": 1.04605675, "balance_loss_mlp": 1.02009678, "epoch": 0.7182107857872904, "flos": 20047563285120.0, "grad_norm": 2.395006155782625, "language_loss": 0.77460933, "learning_rate": 7.765631393637888e-07, "loss": 0.79641682, "num_input_tokens_seen": 128396335, "step": 5973, "time_per_iteration": 2.661557197570801 }, { "auxiliary_loss_clip": 0.01147998, "auxiliary_loss_mlp": 0.0102218, "balance_loss_clip": 1.04140627, "balance_loss_mlp": 1.01478052, "epoch": 0.7183310286779294, "flos": 22747686503040.0, "grad_norm": 3.005066889552014, "language_loss": 0.48119491, "learning_rate": 7.75947006734417e-07, "loss": 0.50289673, "num_input_tokens_seen": 128414115, "step": 5974, "time_per_iteration": 2.746615171432495 }, { "auxiliary_loss_clip": 0.01169315, "auxiliary_loss_mlp": 0.01027779, "balance_loss_clip": 1.04731607, "balance_loss_mlp": 1.020015, "epoch": 0.7184512715685685, "flos": 17158262112000.0, "grad_norm": 2.0763838475784233, "language_loss": 0.8271364, "learning_rate": 7.753310597929101e-07, "loss": 0.84910727, "num_input_tokens_seen": 128430755, "step": 5975, "time_per_iteration": 2.6496829986572266 }, { "auxiliary_loss_clip": 0.01068966, "auxiliary_loss_mlp": 0.01002303, "balance_loss_clip": 1.01317215, "balance_loss_mlp": 1.00120008, "epoch": 0.7185715144592076, "flos": 65509611448320.0, "grad_norm": 0.7620349630687402, "language_loss": 0.55517817, "learning_rate": 7.747152986327095e-07, "loss": 0.57589078, "num_input_tokens_seen": 128491300, "step": 5976, "time_per_iteration": 3.1072442531585693 }, { "auxiliary_loss_clip": 0.01117358, "auxiliary_loss_mlp": 0.01022807, "balance_loss_clip": 1.0421828, "balance_loss_mlp": 1.01572585, "epoch": 0.7186917573498467, "flos": 16180522928640.0, "grad_norm": 5.933660246758854, "language_loss": 0.6797325, "learning_rate": 7.740997233472228e-07, "loss": 0.70113415, "num_input_tokens_seen": 128508920, "step": 5977, "time_per_iteration": 2.7217118740081787 }, { "auxiliary_loss_clip": 0.01139399, "auxiliary_loss_mlp": 0.01028809, "balance_loss_clip": 1.04246533, "balance_loss_mlp": 1.0219903, "epoch": 0.7188120002404857, "flos": 29242274647680.0, "grad_norm": 2.1643604171114887, "language_loss": 0.70706111, "learning_rate": 7.734843340298329e-07, "loss": 0.72874314, "num_input_tokens_seen": 128528745, "step": 5978, "time_per_iteration": 2.7117276191711426 }, { "auxiliary_loss_clip": 0.01144831, "auxiliary_loss_mlp": 0.01027248, "balance_loss_clip": 1.04254472, "balance_loss_mlp": 1.01989901, "epoch": 0.7189322431311249, "flos": 33401161008000.0, "grad_norm": 2.150431130949243, "language_loss": 0.75061655, "learning_rate": 7.72869130773895e-07, "loss": 0.77233732, "num_input_tokens_seen": 128549345, "step": 5979, "time_per_iteration": 2.737966537475586 }, { "auxiliary_loss_clip": 0.01060179, "auxiliary_loss_mlp": 0.01003283, "balance_loss_clip": 1.01319802, "balance_loss_mlp": 1.00218678, "epoch": 0.719052486021764, "flos": 61351263792000.0, "grad_norm": 0.7859458574163565, "language_loss": 0.59318113, "learning_rate": 7.722541136727343e-07, "loss": 0.61381572, "num_input_tokens_seen": 128605360, "step": 5980, "time_per_iteration": 3.03460431098938 }, { "auxiliary_loss_clip": 0.01153985, "auxiliary_loss_mlp": 0.01025186, "balance_loss_clip": 1.04569125, "balance_loss_mlp": 1.01803398, "epoch": 0.719172728912403, "flos": 15596795007360.0, "grad_norm": 1.949938011943916, "language_loss": 0.80462837, "learning_rate": 7.716392828196483e-07, "loss": 0.82642007, "num_input_tokens_seen": 128623160, "step": 5981, "time_per_iteration": 2.6276376247406006 }, { "auxiliary_loss_clip": 0.01154966, "auxiliary_loss_mlp": 0.01026633, "balance_loss_clip": 1.04589462, "balance_loss_mlp": 1.01897979, "epoch": 0.7192929718030422, "flos": 15553162961280.0, "grad_norm": 2.629618249900072, "language_loss": 0.77091718, "learning_rate": 7.710246383079064e-07, "loss": 0.79273319, "num_input_tokens_seen": 128638545, "step": 5982, "time_per_iteration": 2.550541877746582 }, { "auxiliary_loss_clip": 0.01139318, "auxiliary_loss_mlp": 0.01026251, "balance_loss_clip": 1.0399313, "balance_loss_mlp": 1.0192709, "epoch": 0.7194132146936812, "flos": 21862487733120.0, "grad_norm": 2.360604047941329, "language_loss": 0.91996747, "learning_rate": 7.704101802307492e-07, "loss": 0.94162315, "num_input_tokens_seen": 128650845, "step": 5983, "time_per_iteration": 2.6126937866210938 }, { "auxiliary_loss_clip": 0.01119285, "auxiliary_loss_mlp": 0.01023099, "balance_loss_clip": 1.04180646, "balance_loss_mlp": 1.01589525, "epoch": 0.7195334575843203, "flos": 27338900958720.0, "grad_norm": 2.0484589717305823, "language_loss": 0.86824656, "learning_rate": 7.697959086813912e-07, "loss": 0.88967043, "num_input_tokens_seen": 128667010, "step": 5984, "time_per_iteration": 2.715475559234619 }, { "auxiliary_loss_clip": 0.01118543, "auxiliary_loss_mlp": 0.01028881, "balance_loss_clip": 1.03972197, "balance_loss_mlp": 1.02132273, "epoch": 0.7196537004749595, "flos": 18770615809920.0, "grad_norm": 1.6055033597454507, "language_loss": 0.80264378, "learning_rate": 7.691818237530145e-07, "loss": 0.82411796, "num_input_tokens_seen": 128685870, "step": 5985, "time_per_iteration": 3.571729898452759 }, { "auxiliary_loss_clip": 0.01125495, "auxiliary_loss_mlp": 0.01025537, "balance_loss_clip": 1.04267645, "balance_loss_mlp": 1.01821208, "epoch": 0.7197739433655985, "flos": 24531009960960.0, "grad_norm": 2.33406383515282, "language_loss": 0.77671707, "learning_rate": 7.685679255387774e-07, "loss": 0.79822737, "num_input_tokens_seen": 128704185, "step": 5986, "time_per_iteration": 2.8646109104156494 }, { "auxiliary_loss_clip": 0.01139178, "auxiliary_loss_mlp": 0.01024973, "balance_loss_clip": 1.04455137, "balance_loss_mlp": 1.0180831, "epoch": 0.7198941862562376, "flos": 18040587793920.0, "grad_norm": 2.1132427464102292, "language_loss": 0.77299112, "learning_rate": 7.679542141318065e-07, "loss": 0.79463261, "num_input_tokens_seen": 128721290, "step": 5987, "time_per_iteration": 2.621979236602783 }, { "auxiliary_loss_clip": 0.01126683, "auxiliary_loss_mlp": 0.01030077, "balance_loss_clip": 1.04073429, "balance_loss_mlp": 1.02268577, "epoch": 0.7200144291468767, "flos": 29022393542400.0, "grad_norm": 1.7292269968013068, "language_loss": 0.75879514, "learning_rate": 7.673406896252013e-07, "loss": 0.78036273, "num_input_tokens_seen": 128742665, "step": 5988, "time_per_iteration": 3.7614095211029053 }, { "auxiliary_loss_clip": 0.01121265, "auxiliary_loss_mlp": 0.01027625, "balance_loss_clip": 1.03819084, "balance_loss_mlp": 1.02014494, "epoch": 0.7201346720375158, "flos": 25374264624000.0, "grad_norm": 1.5961703625695083, "language_loss": 0.78503239, "learning_rate": 7.667273521120347e-07, "loss": 0.8065213, "num_input_tokens_seen": 128762225, "step": 5989, "time_per_iteration": 3.6551685333251953 }, { "auxiliary_loss_clip": 0.01124122, "auxiliary_loss_mlp": 0.01027139, "balance_loss_clip": 1.03928769, "balance_loss_mlp": 1.02028739, "epoch": 0.7202549149281549, "flos": 14355614499840.0, "grad_norm": 2.941981551291609, "language_loss": 0.79646784, "learning_rate": 7.661142016853468e-07, "loss": 0.81798047, "num_input_tokens_seen": 128779585, "step": 5990, "time_per_iteration": 2.676557779312134 }, { "auxiliary_loss_clip": 0.01110355, "auxiliary_loss_mlp": 0.01023767, "balance_loss_clip": 1.04020119, "balance_loss_mlp": 1.01605415, "epoch": 0.7203751578187939, "flos": 23001682550400.0, "grad_norm": 1.6814273215552205, "language_loss": 0.75202954, "learning_rate": 7.655012384381543e-07, "loss": 0.77337074, "num_input_tokens_seen": 128799070, "step": 5991, "time_per_iteration": 2.7704620361328125 }, { "auxiliary_loss_clip": 0.01139558, "auxiliary_loss_mlp": 0.01027916, "balance_loss_clip": 1.04746127, "balance_loss_mlp": 1.02062047, "epoch": 0.7204954007094331, "flos": 23692424065920.0, "grad_norm": 1.7507496862778682, "language_loss": 0.81863075, "learning_rate": 7.648884624634415e-07, "loss": 0.84030545, "num_input_tokens_seen": 128817620, "step": 5992, "time_per_iteration": 3.6613101959228516 }, { "auxiliary_loss_clip": 0.01155953, "auxiliary_loss_mlp": 0.01024278, "balance_loss_clip": 1.04830217, "balance_loss_mlp": 1.01714325, "epoch": 0.7206156436000721, "flos": 16253026531200.0, "grad_norm": 2.3642409651405725, "language_loss": 0.88711119, "learning_rate": 7.642758738541683e-07, "loss": 0.90891349, "num_input_tokens_seen": 128834200, "step": 5993, "time_per_iteration": 2.5870108604431152 }, { "auxiliary_loss_clip": 0.01059155, "auxiliary_loss_mlp": 0.01003118, "balance_loss_clip": 1.01307678, "balance_loss_mlp": 1.00200355, "epoch": 0.7207358864907112, "flos": 54377806504320.0, "grad_norm": 0.88760861742911, "language_loss": 0.6075542, "learning_rate": 7.636634727032621e-07, "loss": 0.62817693, "num_input_tokens_seen": 128891305, "step": 5994, "time_per_iteration": 3.098778009414673 }, { "auxiliary_loss_clip": 0.01125478, "auxiliary_loss_mlp": 0.01028337, "balance_loss_clip": 1.03822815, "balance_loss_mlp": 1.02041817, "epoch": 0.7208561293813504, "flos": 19135540033920.0, "grad_norm": 1.8366225370559612, "language_loss": 0.78431016, "learning_rate": 7.630512591036231e-07, "loss": 0.8058483, "num_input_tokens_seen": 128910615, "step": 5995, "time_per_iteration": 2.6821060180664062 }, { "auxiliary_loss_clip": 0.0115794, "auxiliary_loss_mlp": 0.01027515, "balance_loss_clip": 1.04851389, "balance_loss_mlp": 1.02022219, "epoch": 0.7209763722719894, "flos": 17748526308480.0, "grad_norm": 2.128604026119056, "language_loss": 0.6495558, "learning_rate": 7.624392331481255e-07, "loss": 0.67141032, "num_input_tokens_seen": 128928270, "step": 5996, "time_per_iteration": 2.692699670791626 }, { "auxiliary_loss_clip": 0.0105718, "auxiliary_loss_mlp": 0.01002805, "balance_loss_clip": 1.01301646, "balance_loss_mlp": 1.00168443, "epoch": 0.7210966151626285, "flos": 66819488716800.0, "grad_norm": 0.7379893830840482, "language_loss": 0.51799291, "learning_rate": 7.618273949296115e-07, "loss": 0.53859276, "num_input_tokens_seen": 128987780, "step": 5997, "time_per_iteration": 3.124922275543213 }, { "auxiliary_loss_clip": 0.01132661, "auxiliary_loss_mlp": 0.01032407, "balance_loss_clip": 1.04056931, "balance_loss_mlp": 1.02466774, "epoch": 0.7212168580532676, "flos": 21141869080320.0, "grad_norm": 1.9962954915761884, "language_loss": 0.68708301, "learning_rate": 7.612157445408987e-07, "loss": 0.70873368, "num_input_tokens_seen": 129005590, "step": 5998, "time_per_iteration": 2.689570665359497 }, { "auxiliary_loss_clip": 0.01145737, "auxiliary_loss_mlp": 0.01024191, "balance_loss_clip": 1.04835844, "balance_loss_mlp": 1.01675189, "epoch": 0.7213371009439067, "flos": 22345738335360.0, "grad_norm": 2.09908757744875, "language_loss": 0.74243224, "learning_rate": 7.606042820747716e-07, "loss": 0.76413143, "num_input_tokens_seen": 129021995, "step": 5999, "time_per_iteration": 2.696467638015747 }, { "auxiliary_loss_clip": 0.01149592, "auxiliary_loss_mlp": 0.01024061, "balance_loss_clip": 1.0489608, "balance_loss_mlp": 1.01709664, "epoch": 0.7214573438345457, "flos": 18515901490560.0, "grad_norm": 1.8101811894730397, "language_loss": 0.85186398, "learning_rate": 7.599930076239889e-07, "loss": 0.87360048, "num_input_tokens_seen": 129039280, "step": 6000, "time_per_iteration": 2.61403226852417 }, { "auxiliary_loss_clip": 0.01119007, "auxiliary_loss_mlp": 0.00762218, "balance_loss_clip": 1.04427481, "balance_loss_mlp": 1.00042057, "epoch": 0.7215775867251849, "flos": 35736108606720.0, "grad_norm": 2.2754610261186605, "language_loss": 0.70793509, "learning_rate": 7.593819212812818e-07, "loss": 0.72674739, "num_input_tokens_seen": 129060860, "step": 6001, "time_per_iteration": 2.886016845703125 }, { "auxiliary_loss_clip": 0.01152659, "auxiliary_loss_mlp": 0.01025235, "balance_loss_clip": 1.0462153, "balance_loss_mlp": 1.01822603, "epoch": 0.721697829615824, "flos": 20372410909440.0, "grad_norm": 1.921434475019056, "language_loss": 0.71434057, "learning_rate": 7.587710231393508e-07, "loss": 0.73611951, "num_input_tokens_seen": 129079215, "step": 6002, "time_per_iteration": 2.6018102169036865 }, { "auxiliary_loss_clip": 0.01072769, "auxiliary_loss_mlp": 0.01027814, "balance_loss_clip": 1.03419578, "balance_loss_mlp": 1.0205121, "epoch": 0.721818072506463, "flos": 20229809915520.0, "grad_norm": 1.9565145724889768, "language_loss": 0.83627021, "learning_rate": 7.581603132908685e-07, "loss": 0.85727608, "num_input_tokens_seen": 129097185, "step": 6003, "time_per_iteration": 2.805802822113037 }, { "auxiliary_loss_clip": 0.01119991, "auxiliary_loss_mlp": 0.01025964, "balance_loss_clip": 1.04248905, "balance_loss_mlp": 1.01846004, "epoch": 0.7219383153971022, "flos": 18186887888640.0, "grad_norm": 1.9264464349986414, "language_loss": 0.78781664, "learning_rate": 7.575497918284795e-07, "loss": 0.80927622, "num_input_tokens_seen": 129114730, "step": 6004, "time_per_iteration": 2.6486427783966064 }, { "auxiliary_loss_clip": 0.01171138, "auxiliary_loss_mlp": 0.01038083, "balance_loss_clip": 1.04656744, "balance_loss_mlp": 1.03038168, "epoch": 0.7220585582877412, "flos": 17342124854400.0, "grad_norm": 1.9777521904200235, "language_loss": 0.74841464, "learning_rate": 7.569394588447984e-07, "loss": 0.77050686, "num_input_tokens_seen": 129131745, "step": 6005, "time_per_iteration": 2.611745595932007 }, { "auxiliary_loss_clip": 0.01146459, "auxiliary_loss_mlp": 0.01020795, "balance_loss_clip": 1.04309511, "balance_loss_mlp": 1.01388347, "epoch": 0.7221788011783803, "flos": 16976338704000.0, "grad_norm": 5.127573321283999, "language_loss": 0.7802, "learning_rate": 7.563293144324146e-07, "loss": 0.80187249, "num_input_tokens_seen": 129147295, "step": 6006, "time_per_iteration": 2.614257335662842 }, { "auxiliary_loss_clip": 0.01167719, "auxiliary_loss_mlp": 0.0102589, "balance_loss_clip": 1.04902935, "balance_loss_mlp": 1.01943755, "epoch": 0.7222990440690195, "flos": 26286359702400.0, "grad_norm": 2.0033094540193073, "language_loss": 0.80186415, "learning_rate": 7.557193586838834e-07, "loss": 0.82380021, "num_input_tokens_seen": 129162660, "step": 6007, "time_per_iteration": 2.6874029636383057 }, { "auxiliary_loss_clip": 0.01140828, "auxiliary_loss_mlp": 0.01025999, "balance_loss_clip": 1.04188979, "balance_loss_mlp": 1.01919782, "epoch": 0.7224192869596585, "flos": 17601687509760.0, "grad_norm": 2.2259188666445033, "language_loss": 0.71017319, "learning_rate": 7.551095916917371e-07, "loss": 0.73184144, "num_input_tokens_seen": 129179990, "step": 6008, "time_per_iteration": 2.6946029663085938 }, { "auxiliary_loss_clip": 0.01138253, "auxiliary_loss_mlp": 0.01037242, "balance_loss_clip": 1.04296994, "balance_loss_mlp": 1.02892089, "epoch": 0.7225395298502976, "flos": 12932331016320.0, "grad_norm": 2.4980442300563808, "language_loss": 0.66816783, "learning_rate": 7.545000135484758e-07, "loss": 0.68992281, "num_input_tokens_seen": 129197425, "step": 6009, "time_per_iteration": 2.7120201587677 }, { "auxiliary_loss_clip": 0.01169552, "auxiliary_loss_mlp": 0.00762618, "balance_loss_clip": 1.04785275, "balance_loss_mlp": 1.00051141, "epoch": 0.7226597727409367, "flos": 29643899592960.0, "grad_norm": 1.88189438697995, "language_loss": 0.62682831, "learning_rate": 7.538906243465714e-07, "loss": 0.64614999, "num_input_tokens_seen": 129217560, "step": 6010, "time_per_iteration": 2.6536030769348145 }, { "auxiliary_loss_clip": 0.01170332, "auxiliary_loss_mlp": 0.01029676, "balance_loss_clip": 1.04746675, "balance_loss_mlp": 1.02223754, "epoch": 0.7227800156315758, "flos": 13771635183360.0, "grad_norm": 2.0807136818669845, "language_loss": 0.78703099, "learning_rate": 7.5328142417847e-07, "loss": 0.80903101, "num_input_tokens_seen": 129234325, "step": 6011, "time_per_iteration": 3.54526686668396 }, { "auxiliary_loss_clip": 0.01150614, "auxiliary_loss_mlp": 0.01024945, "balance_loss_clip": 1.0427314, "balance_loss_mlp": 1.01826382, "epoch": 0.7229002585222148, "flos": 20301882554880.0, "grad_norm": 1.7612563004156065, "language_loss": 0.69359243, "learning_rate": 7.526724131365838e-07, "loss": 0.71534801, "num_input_tokens_seen": 129255280, "step": 6012, "time_per_iteration": 2.636235237121582 }, { "auxiliary_loss_clip": 0.01142525, "auxiliary_loss_mlp": 0.01026281, "balance_loss_clip": 1.04771602, "balance_loss_mlp": 1.01917934, "epoch": 0.723020501412854, "flos": 16581250033920.0, "grad_norm": 1.8679819088804706, "language_loss": 0.70845002, "learning_rate": 7.520635913133017e-07, "loss": 0.73013806, "num_input_tokens_seen": 129273910, "step": 6013, "time_per_iteration": 2.629301071166992 }, { "auxiliary_loss_clip": 0.01160074, "auxiliary_loss_mlp": 0.01029979, "balance_loss_clip": 1.0458169, "balance_loss_mlp": 1.02210498, "epoch": 0.7231407443034931, "flos": 28548300908160.0, "grad_norm": 1.7134138836915547, "language_loss": 0.82348061, "learning_rate": 7.514549588009798e-07, "loss": 0.84538114, "num_input_tokens_seen": 129294785, "step": 6014, "time_per_iteration": 3.6462883949279785 }, { "auxiliary_loss_clip": 0.01143421, "auxiliary_loss_mlp": 0.01028461, "balance_loss_clip": 1.04566479, "balance_loss_mlp": 1.02111149, "epoch": 0.7232609871941321, "flos": 30008536508160.0, "grad_norm": 2.879571924697623, "language_loss": 0.70631444, "learning_rate": 7.508465156919492e-07, "loss": 0.7280333, "num_input_tokens_seen": 129318295, "step": 6015, "time_per_iteration": 2.7295796871185303 }, { "auxiliary_loss_clip": 0.01140205, "auxiliary_loss_mlp": 0.01023636, "balance_loss_clip": 1.04287624, "balance_loss_mlp": 1.01607239, "epoch": 0.7233812300847713, "flos": 16654005031680.0, "grad_norm": 2.7950851854591066, "language_loss": 0.6126312, "learning_rate": 7.502382620785083e-07, "loss": 0.63426971, "num_input_tokens_seen": 129334845, "step": 6016, "time_per_iteration": 3.4593629837036133 }, { "auxiliary_loss_clip": 0.01034885, "auxiliary_loss_mlp": 0.01002343, "balance_loss_clip": 1.01612592, "balance_loss_mlp": 1.00126982, "epoch": 0.7235014729754103, "flos": 67258784050560.0, "grad_norm": 0.8042331994800976, "language_loss": 0.62557673, "learning_rate": 7.496301980529289e-07, "loss": 0.64594901, "num_input_tokens_seen": 129398055, "step": 6017, "time_per_iteration": 3.342642307281494 }, { "auxiliary_loss_clip": 0.01169143, "auxiliary_loss_mlp": 0.01029261, "balance_loss_clip": 1.04766893, "balance_loss_mlp": 1.02265358, "epoch": 0.7236217158660494, "flos": 26943237671040.0, "grad_norm": 2.0255911370787887, "language_loss": 0.74674225, "learning_rate": 7.490223237074547e-07, "loss": 0.76872623, "num_input_tokens_seen": 129417765, "step": 6018, "time_per_iteration": 3.6600136756896973 }, { "auxiliary_loss_clip": 0.01127142, "auxiliary_loss_mlp": 0.01024706, "balance_loss_clip": 1.04300821, "balance_loss_mlp": 1.01763678, "epoch": 0.7237419587566886, "flos": 29423372042880.0, "grad_norm": 2.063045654665973, "language_loss": 0.65972131, "learning_rate": 7.484146391342989e-07, "loss": 0.68123984, "num_input_tokens_seen": 129437560, "step": 6019, "time_per_iteration": 2.792160749435425 }, { "auxiliary_loss_clip": 0.01132549, "auxiliary_loss_mlp": 0.01032046, "balance_loss_clip": 1.04165912, "balance_loss_mlp": 1.0248816, "epoch": 0.7238622016473276, "flos": 17821496787840.0, "grad_norm": 2.4288829873466016, "language_loss": 0.56292099, "learning_rate": 7.478071444256484e-07, "loss": 0.58456689, "num_input_tokens_seen": 129455320, "step": 6020, "time_per_iteration": 2.7437868118286133 }, { "auxiliary_loss_clip": 0.01137956, "auxiliary_loss_mlp": 0.01025351, "balance_loss_clip": 1.04479313, "balance_loss_mlp": 1.01754868, "epoch": 0.7239824445379667, "flos": 25739117020800.0, "grad_norm": 2.6669095041473376, "language_loss": 0.79238588, "learning_rate": 7.471998396736579e-07, "loss": 0.81401896, "num_input_tokens_seen": 129475700, "step": 6021, "time_per_iteration": 2.7686469554901123 }, { "auxiliary_loss_clip": 0.01126182, "auxiliary_loss_mlp": 0.01027909, "balance_loss_clip": 1.04309702, "balance_loss_mlp": 1.02080393, "epoch": 0.7241026874286057, "flos": 23148916398720.0, "grad_norm": 1.710602996967692, "language_loss": 0.75883532, "learning_rate": 7.465927249704549e-07, "loss": 0.78037626, "num_input_tokens_seen": 129493585, "step": 6022, "time_per_iteration": 2.721277952194214 }, { "auxiliary_loss_clip": 0.01151056, "auxiliary_loss_mlp": 0.0102044, "balance_loss_clip": 1.04369354, "balance_loss_mlp": 1.01328743, "epoch": 0.7242229303192449, "flos": 20266905686400.0, "grad_norm": 1.8652088212433782, "language_loss": 0.77326041, "learning_rate": 7.459858004081398e-07, "loss": 0.7949754, "num_input_tokens_seen": 129511555, "step": 6023, "time_per_iteration": 2.623974084854126 }, { "auxiliary_loss_clip": 0.0102785, "auxiliary_loss_mlp": 0.01002039, "balance_loss_clip": 1.01117229, "balance_loss_mlp": 1.00091255, "epoch": 0.724343173209884, "flos": 62311659684480.0, "grad_norm": 0.6622173124468023, "language_loss": 0.58018768, "learning_rate": 7.453790660787815e-07, "loss": 0.60048652, "num_input_tokens_seen": 129579650, "step": 6024, "time_per_iteration": 3.3654680252075195 }, { "auxiliary_loss_clip": 0.01145507, "auxiliary_loss_mlp": 0.01027942, "balance_loss_clip": 1.04642725, "balance_loss_mlp": 1.01996636, "epoch": 0.724463416100523, "flos": 35006403813120.0, "grad_norm": 2.6191525802008027, "language_loss": 0.63469702, "learning_rate": 7.447725220744214e-07, "loss": 0.65643156, "num_input_tokens_seen": 129601895, "step": 6025, "time_per_iteration": 2.885371446609497 }, { "auxiliary_loss_clip": 0.01168177, "auxiliary_loss_mlp": 0.01027003, "balance_loss_clip": 1.04523504, "balance_loss_mlp": 1.01948071, "epoch": 0.7245836589911622, "flos": 21871968923520.0, "grad_norm": 2.2900594143764916, "language_loss": 0.77175581, "learning_rate": 7.441661684870717e-07, "loss": 0.79370761, "num_input_tokens_seen": 129622150, "step": 6026, "time_per_iteration": 2.6330676078796387 }, { "auxiliary_loss_clip": 0.01168215, "auxiliary_loss_mlp": 0.01023662, "balance_loss_clip": 1.04620934, "balance_loss_mlp": 1.01617575, "epoch": 0.7247039018818012, "flos": 23006494972800.0, "grad_norm": 2.0148088341013555, "language_loss": 0.82252127, "learning_rate": 7.435600054087152e-07, "loss": 0.8444401, "num_input_tokens_seen": 129644315, "step": 6027, "time_per_iteration": 2.6369972229003906 }, { "auxiliary_loss_clip": 0.01171351, "auxiliary_loss_mlp": 0.01027919, "balance_loss_clip": 1.04793859, "balance_loss_mlp": 1.02015853, "epoch": 0.7248241447724403, "flos": 31722588587520.0, "grad_norm": 1.9326139594386633, "language_loss": 0.74602067, "learning_rate": 7.42954032931308e-07, "loss": 0.76801336, "num_input_tokens_seen": 129665355, "step": 6028, "time_per_iteration": 2.6036477088928223 }, { "auxiliary_loss_clip": 0.01143585, "auxiliary_loss_mlp": 0.01023361, "balance_loss_clip": 1.04557502, "balance_loss_mlp": 1.01659262, "epoch": 0.7249443876630794, "flos": 34896984007680.0, "grad_norm": 1.8247142438225092, "language_loss": 0.74691665, "learning_rate": 7.423482511467733e-07, "loss": 0.76858616, "num_input_tokens_seen": 129686125, "step": 6029, "time_per_iteration": 2.7858493328094482 }, { "auxiliary_loss_clip": 0.01087207, "auxiliary_loss_mlp": 0.01027572, "balance_loss_clip": 1.03850937, "balance_loss_mlp": 1.02010369, "epoch": 0.7250646305537185, "flos": 26359294268160.0, "grad_norm": 2.939089179775525, "language_loss": 0.64896369, "learning_rate": 7.417426601470099e-07, "loss": 0.67011142, "num_input_tokens_seen": 129706485, "step": 6030, "time_per_iteration": 2.846094846725464 }, { "auxiliary_loss_clip": 0.01158455, "auxiliary_loss_mlp": 0.01025203, "balance_loss_clip": 1.04695034, "balance_loss_mlp": 1.01698935, "epoch": 0.7251848734443576, "flos": 30081614728320.0, "grad_norm": 2.364294980806612, "language_loss": 0.78911912, "learning_rate": 7.411372600238841e-07, "loss": 0.81095576, "num_input_tokens_seen": 129727100, "step": 6031, "time_per_iteration": 2.6934359073638916 }, { "auxiliary_loss_clip": 0.01166927, "auxiliary_loss_mlp": 0.0102453, "balance_loss_clip": 1.04487956, "balance_loss_mlp": 1.01766908, "epoch": 0.7253051163349967, "flos": 17785262943360.0, "grad_norm": 1.9534667195942927, "language_loss": 0.74072063, "learning_rate": 7.405320508692346e-07, "loss": 0.76263523, "num_input_tokens_seen": 129745840, "step": 6032, "time_per_iteration": 2.565553665161133 }, { "auxiliary_loss_clip": 0.01164753, "auxiliary_loss_mlp": 0.01019831, "balance_loss_clip": 1.04707718, "balance_loss_mlp": 1.01326585, "epoch": 0.7254253592256358, "flos": 12641346938880.0, "grad_norm": 1.883765977510127, "language_loss": 0.75518143, "learning_rate": 7.399270327748727e-07, "loss": 0.77702725, "num_input_tokens_seen": 129763500, "step": 6033, "time_per_iteration": 2.5950703620910645 }, { "auxiliary_loss_clip": 0.0112698, "auxiliary_loss_mlp": 0.00762197, "balance_loss_clip": 1.04131293, "balance_loss_mlp": 1.00050497, "epoch": 0.7255456021162748, "flos": 27199208966400.0, "grad_norm": 1.6951026405374912, "language_loss": 0.73996353, "learning_rate": 7.39322205832577e-07, "loss": 0.75885528, "num_input_tokens_seen": 129784390, "step": 6034, "time_per_iteration": 2.7417588233947754 }, { "auxiliary_loss_clip": 0.01135778, "auxiliary_loss_mlp": 0.01031809, "balance_loss_clip": 1.0432688, "balance_loss_mlp": 1.02460909, "epoch": 0.725665845006914, "flos": 21288205088640.0, "grad_norm": 2.1729259436346937, "language_loss": 0.80984604, "learning_rate": 7.387175701341009e-07, "loss": 0.83152193, "num_input_tokens_seen": 129803060, "step": 6035, "time_per_iteration": 2.682269811630249 }, { "auxiliary_loss_clip": 0.01155293, "auxiliary_loss_mlp": 0.0102648, "balance_loss_clip": 1.04534841, "balance_loss_mlp": 1.01899385, "epoch": 0.7257860878975531, "flos": 16033684129920.0, "grad_norm": 2.2499513702812965, "language_loss": 0.7225036, "learning_rate": 7.381131257711659e-07, "loss": 0.74432135, "num_input_tokens_seen": 129820165, "step": 6036, "time_per_iteration": 2.5395188331604004 }, { "auxiliary_loss_clip": 0.01137122, "auxiliary_loss_mlp": 0.01025946, "balance_loss_clip": 1.04631877, "balance_loss_mlp": 1.01896274, "epoch": 0.7259063307881921, "flos": 12129943052160.0, "grad_norm": 1.9243247290005863, "language_loss": 0.8354001, "learning_rate": 7.375088728354677e-07, "loss": 0.85703075, "num_input_tokens_seen": 129835195, "step": 6037, "time_per_iteration": 2.6259469985961914 }, { "auxiliary_loss_clip": 0.01130574, "auxiliary_loss_mlp": 0.01027683, "balance_loss_clip": 1.04218817, "balance_loss_mlp": 1.01977324, "epoch": 0.7260265736788313, "flos": 30443845432320.0, "grad_norm": 1.611608127342657, "language_loss": 0.67345154, "learning_rate": 7.369048114186691e-07, "loss": 0.69503409, "num_input_tokens_seen": 129856240, "step": 6038, "time_per_iteration": 3.6943089962005615 }, { "auxiliary_loss_clip": 0.0113442, "auxiliary_loss_mlp": 0.00761784, "balance_loss_clip": 1.04366779, "balance_loss_mlp": 1.00048518, "epoch": 0.7261468165694703, "flos": 21142264129920.0, "grad_norm": 1.7252588338763846, "language_loss": 0.83240414, "learning_rate": 7.363009416124055e-07, "loss": 0.85136622, "num_input_tokens_seen": 129875565, "step": 6039, "time_per_iteration": 2.644244909286499 }, { "auxiliary_loss_clip": 0.01129696, "auxiliary_loss_mlp": 0.01021987, "balance_loss_clip": 1.04413962, "balance_loss_mlp": 1.01435137, "epoch": 0.7262670594601094, "flos": 22306308180480.0, "grad_norm": 2.505559585286645, "language_loss": 0.62613881, "learning_rate": 7.356972635082852e-07, "loss": 0.64765573, "num_input_tokens_seen": 129894420, "step": 6040, "time_per_iteration": 3.661090850830078 }, { "auxiliary_loss_clip": 0.01110882, "auxiliary_loss_mlp": 0.01030952, "balance_loss_clip": 1.04597783, "balance_loss_mlp": 1.02310228, "epoch": 0.7263873023507486, "flos": 25335049950720.0, "grad_norm": 1.9506326564264742, "language_loss": 0.75420082, "learning_rate": 7.35093777197884e-07, "loss": 0.77561915, "num_input_tokens_seen": 129914490, "step": 6041, "time_per_iteration": 2.7841336727142334 }, { "auxiliary_loss_clip": 0.01140613, "auxiliary_loss_mlp": 0.010281, "balance_loss_clip": 1.04636586, "balance_loss_mlp": 1.02081966, "epoch": 0.7265075452413876, "flos": 23878621192320.0, "grad_norm": 2.535210887871698, "language_loss": 0.85829937, "learning_rate": 7.344904827727525e-07, "loss": 0.87998652, "num_input_tokens_seen": 129931670, "step": 6042, "time_per_iteration": 3.5182671546936035 }, { "auxiliary_loss_clip": 0.0112412, "auxiliary_loss_mlp": 0.0102983, "balance_loss_clip": 1.04004908, "balance_loss_mlp": 1.02277851, "epoch": 0.7266277881320267, "flos": 28724549967360.0, "grad_norm": 3.9676776459536964, "language_loss": 0.73717988, "learning_rate": 7.338873803244076e-07, "loss": 0.75871938, "num_input_tokens_seen": 129946905, "step": 6043, "time_per_iteration": 3.7689168453216553 }, { "auxiliary_loss_clip": 0.01137069, "auxiliary_loss_mlp": 0.01025962, "balance_loss_clip": 1.04402995, "balance_loss_mlp": 1.01877069, "epoch": 0.7267480310226658, "flos": 24863507182080.0, "grad_norm": 1.855519757133444, "language_loss": 0.80739188, "learning_rate": 7.332844699443401e-07, "loss": 0.82902217, "num_input_tokens_seen": 129965505, "step": 6044, "time_per_iteration": 2.7695343494415283 }, { "auxiliary_loss_clip": 0.01104064, "auxiliary_loss_mlp": 0.01020023, "balance_loss_clip": 1.04072523, "balance_loss_mlp": 1.0127933, "epoch": 0.7268682739133049, "flos": 27198490694400.0, "grad_norm": 1.8265371910283956, "language_loss": 0.75400984, "learning_rate": 7.326817517240121e-07, "loss": 0.77525067, "num_input_tokens_seen": 129987210, "step": 6045, "time_per_iteration": 2.774348020553589 }, { "auxiliary_loss_clip": 0.01153826, "auxiliary_loss_mlp": 0.00762187, "balance_loss_clip": 1.0442549, "balance_loss_mlp": 1.0003624, "epoch": 0.7269885168039439, "flos": 33508138688640.0, "grad_norm": 1.6360963827040123, "language_loss": 0.8330602, "learning_rate": 7.320792257548545e-07, "loss": 0.85222042, "num_input_tokens_seen": 130008385, "step": 6046, "time_per_iteration": 2.843472957611084 }, { "auxiliary_loss_clip": 0.01145398, "auxiliary_loss_mlp": 0.01024618, "balance_loss_clip": 1.04477799, "balance_loss_mlp": 1.01686335, "epoch": 0.7271087596945831, "flos": 24313750548480.0, "grad_norm": 2.6496434780557263, "language_loss": 0.76142198, "learning_rate": 7.314768921282704e-07, "loss": 0.78312212, "num_input_tokens_seen": 130029040, "step": 6047, "time_per_iteration": 2.704103708267212 }, { "auxiliary_loss_clip": 0.01155891, "auxiliary_loss_mlp": 0.01029164, "balance_loss_clip": 1.04494977, "balance_loss_mlp": 1.0222199, "epoch": 0.7272290025852222, "flos": 23805147922560.0, "grad_norm": 2.548517646583692, "language_loss": 0.71948904, "learning_rate": 7.30874750935633e-07, "loss": 0.74133956, "num_input_tokens_seen": 130048725, "step": 6048, "time_per_iteration": 2.7366344928741455 }, { "auxiliary_loss_clip": 0.0112576, "auxiliary_loss_mlp": 0.01026212, "balance_loss_clip": 1.04628301, "balance_loss_mlp": 1.01914907, "epoch": 0.7273492454758612, "flos": 16720367408640.0, "grad_norm": 1.8976280166605914, "language_loss": 0.79147887, "learning_rate": 7.30272802268286e-07, "loss": 0.81299853, "num_input_tokens_seen": 130065720, "step": 6049, "time_per_iteration": 2.766047716140747 }, { "auxiliary_loss_clip": 0.01074721, "auxiliary_loss_mlp": 0.01021175, "balance_loss_clip": 1.03736532, "balance_loss_mlp": 1.01480305, "epoch": 0.7274694883665004, "flos": 28031330413440.0, "grad_norm": 1.8463501990940934, "language_loss": 0.76124406, "learning_rate": 7.29671046217547e-07, "loss": 0.78220302, "num_input_tokens_seen": 130084830, "step": 6050, "time_per_iteration": 2.8344318866729736 }, { "auxiliary_loss_clip": 0.01126552, "auxiliary_loss_mlp": 0.01026808, "balance_loss_clip": 1.0425483, "balance_loss_mlp": 1.01989079, "epoch": 0.7275897312571394, "flos": 30372706546560.0, "grad_norm": 1.8469447149965865, "language_loss": 0.82000464, "learning_rate": 7.290694828746988e-07, "loss": 0.84153825, "num_input_tokens_seen": 130104495, "step": 6051, "time_per_iteration": 2.8092734813690186 }, { "auxiliary_loss_clip": 0.01131375, "auxiliary_loss_mlp": 0.01020357, "balance_loss_clip": 1.04233015, "balance_loss_mlp": 1.01362443, "epoch": 0.7277099741477785, "flos": 19204775498880.0, "grad_norm": 3.2988396453663342, "language_loss": 0.8555494, "learning_rate": 7.284681123310004e-07, "loss": 0.87706673, "num_input_tokens_seen": 130123210, "step": 6052, "time_per_iteration": 2.805621385574341 }, { "auxiliary_loss_clip": 0.0115578, "auxiliary_loss_mlp": 0.01027655, "balance_loss_clip": 1.04701483, "balance_loss_mlp": 1.02013302, "epoch": 0.7278302170384175, "flos": 20667884186880.0, "grad_norm": 2.0558144532474434, "language_loss": 0.79413271, "learning_rate": 7.27866934677678e-07, "loss": 0.81596702, "num_input_tokens_seen": 130142880, "step": 6053, "time_per_iteration": 2.6840944290161133 }, { "auxiliary_loss_clip": 0.01111307, "auxiliary_loss_mlp": 0.01030418, "balance_loss_clip": 1.04242575, "balance_loss_mlp": 1.02375078, "epoch": 0.7279504599290567, "flos": 19093200877440.0, "grad_norm": 2.6842628763117418, "language_loss": 0.78485179, "learning_rate": 7.272659500059297e-07, "loss": 0.80626905, "num_input_tokens_seen": 130160220, "step": 6054, "time_per_iteration": 2.719508171081543 }, { "auxiliary_loss_clip": 0.01150432, "auxiliary_loss_mlp": 0.01030212, "balance_loss_clip": 1.04569113, "balance_loss_mlp": 1.02257097, "epoch": 0.7280707028196958, "flos": 19062174504960.0, "grad_norm": 2.2135263576301636, "language_loss": 0.80418754, "learning_rate": 7.266651584069264e-07, "loss": 0.82599396, "num_input_tokens_seen": 130177885, "step": 6055, "time_per_iteration": 2.7250959873199463 }, { "auxiliary_loss_clip": 0.0116098, "auxiliary_loss_mlp": 0.01024945, "balance_loss_clip": 1.04947817, "balance_loss_mlp": 1.01773334, "epoch": 0.7281909457103348, "flos": 37196308293120.0, "grad_norm": 2.426394453965791, "language_loss": 0.56653541, "learning_rate": 7.260645599718045e-07, "loss": 0.5883947, "num_input_tokens_seen": 130204240, "step": 6056, "time_per_iteration": 2.746609687805176 }, { "auxiliary_loss_clip": 0.01141004, "auxiliary_loss_mlp": 0.01029972, "balance_loss_clip": 1.04423809, "balance_loss_mlp": 1.02188075, "epoch": 0.728311188600974, "flos": 20667094087680.0, "grad_norm": 2.4093169748622194, "language_loss": 0.67530572, "learning_rate": 7.254641547916767e-07, "loss": 0.69701546, "num_input_tokens_seen": 130221735, "step": 6057, "time_per_iteration": 2.6372549533843994 }, { "auxiliary_loss_clip": 0.01169553, "auxiliary_loss_mlp": 0.01028681, "balance_loss_clip": 1.04964685, "balance_loss_mlp": 1.02082479, "epoch": 0.728431431491613, "flos": 28840685616000.0, "grad_norm": 1.7697727457209231, "language_loss": 0.69516104, "learning_rate": 7.248639429576226e-07, "loss": 0.71714342, "num_input_tokens_seen": 130241190, "step": 6058, "time_per_iteration": 2.66440749168396 }, { "auxiliary_loss_clip": 0.01158098, "auxiliary_loss_mlp": 0.01025235, "balance_loss_clip": 1.04665709, "balance_loss_mlp": 1.01795781, "epoch": 0.7285516743822521, "flos": 25991856092160.0, "grad_norm": 1.6396838993723266, "language_loss": 0.72115958, "learning_rate": 7.242639245606959e-07, "loss": 0.74299294, "num_input_tokens_seen": 130260980, "step": 6059, "time_per_iteration": 2.7847440242767334 }, { "auxiliary_loss_clip": 0.01147444, "auxiliary_loss_mlp": 0.01027012, "balance_loss_clip": 1.04423583, "balance_loss_mlp": 1.01925135, "epoch": 0.7286719172728913, "flos": 16399721675520.0, "grad_norm": 1.8483861491469487, "language_loss": 0.82856584, "learning_rate": 7.236640996919168e-07, "loss": 0.85031039, "num_input_tokens_seen": 130280025, "step": 6060, "time_per_iteration": 2.6948609352111816 }, { "auxiliary_loss_clip": 0.01155868, "auxiliary_loss_mlp": 0.01024727, "balance_loss_clip": 1.04640698, "balance_loss_mlp": 1.0177207, "epoch": 0.7287921601635303, "flos": 22018161277440.0, "grad_norm": 1.6669281791124535, "language_loss": 0.70620751, "learning_rate": 7.230644684422782e-07, "loss": 0.72801346, "num_input_tokens_seen": 130300255, "step": 6061, "time_per_iteration": 2.6385855674743652 }, { "auxiliary_loss_clip": 0.01122337, "auxiliary_loss_mlp": 0.01026803, "balance_loss_clip": 1.04070354, "balance_loss_mlp": 1.01956117, "epoch": 0.7289124030541694, "flos": 24600927784320.0, "grad_norm": 1.8373045031556106, "language_loss": 0.8153165, "learning_rate": 7.224650309027451e-07, "loss": 0.83680785, "num_input_tokens_seen": 130320005, "step": 6062, "time_per_iteration": 2.724644422531128 }, { "auxiliary_loss_clip": 0.01158826, "auxiliary_loss_mlp": 0.01024347, "balance_loss_clip": 1.04792356, "balance_loss_mlp": 1.01711059, "epoch": 0.7290326459448085, "flos": 21393638484480.0, "grad_norm": 1.7879442554754248, "language_loss": 0.6885621, "learning_rate": 7.218657871642506e-07, "loss": 0.71039379, "num_input_tokens_seen": 130338810, "step": 6063, "time_per_iteration": 2.7416863441467285 }, { "auxiliary_loss_clip": 0.01172451, "auxiliary_loss_mlp": 0.01027487, "balance_loss_clip": 1.04922533, "balance_loss_mlp": 1.02042651, "epoch": 0.7291528888354476, "flos": 18587686821120.0, "grad_norm": 2.206293166427509, "language_loss": 0.62533784, "learning_rate": 7.212667373177012e-07, "loss": 0.6473372, "num_input_tokens_seen": 130353805, "step": 6064, "time_per_iteration": 3.5416882038116455 }, { "auxiliary_loss_clip": 0.01125117, "auxiliary_loss_mlp": 0.01022004, "balance_loss_clip": 1.04112911, "balance_loss_mlp": 1.01473212, "epoch": 0.7292731317260867, "flos": 18951066760320.0, "grad_norm": 1.8400974973285498, "language_loss": 0.75520265, "learning_rate": 7.206678814539704e-07, "loss": 0.77667385, "num_input_tokens_seen": 130372105, "step": 6065, "time_per_iteration": 2.688159227371216 }, { "auxiliary_loss_clip": 0.01120521, "auxiliary_loss_mlp": 0.01024613, "balance_loss_clip": 1.04160213, "balance_loss_mlp": 1.01788056, "epoch": 0.7293933746167258, "flos": 21067569797760.0, "grad_norm": 1.7042634643452053, "language_loss": 0.72566283, "learning_rate": 7.20069219663904e-07, "loss": 0.74711418, "num_input_tokens_seen": 130391990, "step": 6066, "time_per_iteration": 3.738140821456909 }, { "auxiliary_loss_clip": 0.01156822, "auxiliary_loss_mlp": 0.01022982, "balance_loss_clip": 1.04450428, "balance_loss_mlp": 1.01588881, "epoch": 0.7295136175073649, "flos": 22453326547200.0, "grad_norm": 3.4958486570518583, "language_loss": 0.79648411, "learning_rate": 7.1947075203832e-07, "loss": 0.81828213, "num_input_tokens_seen": 130411970, "step": 6067, "time_per_iteration": 3.5604584217071533 }, { "auxiliary_loss_clip": 0.01067042, "auxiliary_loss_mlp": 0.0100148, "balance_loss_clip": 1.01161027, "balance_loss_mlp": 1.00038326, "epoch": 0.7296338603980039, "flos": 56125506648960.0, "grad_norm": 0.8717250012307006, "language_loss": 0.6016326, "learning_rate": 7.188724786680049e-07, "loss": 0.62231779, "num_input_tokens_seen": 130472440, "step": 6068, "time_per_iteration": 3.1580066680908203 }, { "auxiliary_loss_clip": 0.01140313, "auxiliary_loss_mlp": 0.01024481, "balance_loss_clip": 1.04286063, "balance_loss_mlp": 1.01746583, "epoch": 0.7297541032886431, "flos": 25228287751680.0, "grad_norm": 1.5251486149729556, "language_loss": 0.75822389, "learning_rate": 7.182743996437162e-07, "loss": 0.77987182, "num_input_tokens_seen": 130491975, "step": 6069, "time_per_iteration": 3.5650408267974854 }, { "auxiliary_loss_clip": 0.01133764, "auxiliary_loss_mlp": 0.01026829, "balance_loss_clip": 1.04201341, "balance_loss_mlp": 1.01912236, "epoch": 0.7298743461792822, "flos": 26467600752000.0, "grad_norm": 1.803512162646333, "language_loss": 0.68999541, "learning_rate": 7.176765150561819e-07, "loss": 0.71160138, "num_input_tokens_seen": 130510580, "step": 6070, "time_per_iteration": 2.705667018890381 }, { "auxiliary_loss_clip": 0.01167364, "auxiliary_loss_mlp": 0.01030735, "balance_loss_clip": 1.04464531, "balance_loss_mlp": 1.02363563, "epoch": 0.7299945890699212, "flos": 19569053278080.0, "grad_norm": 2.161320006402385, "language_loss": 0.79831499, "learning_rate": 7.170788249961002e-07, "loss": 0.82029593, "num_input_tokens_seen": 130529090, "step": 6071, "time_per_iteration": 2.548200845718384 }, { "auxiliary_loss_clip": 0.0116492, "auxiliary_loss_mlp": 0.01023026, "balance_loss_clip": 1.04524159, "balance_loss_mlp": 1.01583171, "epoch": 0.7301148319605604, "flos": 22928963466240.0, "grad_norm": 1.7972045737814695, "language_loss": 0.88163012, "learning_rate": 7.164813295541418e-07, "loss": 0.90350962, "num_input_tokens_seen": 130548655, "step": 6072, "time_per_iteration": 2.6509146690368652 }, { "auxiliary_loss_clip": 0.01142458, "auxiliary_loss_mlp": 0.01026115, "balance_loss_clip": 1.04353952, "balance_loss_mlp": 1.01864684, "epoch": 0.7302350748511994, "flos": 25369703596800.0, "grad_norm": 1.8014078959940987, "language_loss": 0.70234555, "learning_rate": 7.15884028820944e-07, "loss": 0.72403133, "num_input_tokens_seen": 130567710, "step": 6073, "time_per_iteration": 2.6406314373016357 }, { "auxiliary_loss_clip": 0.01119481, "auxiliary_loss_mlp": 0.01020014, "balance_loss_clip": 1.03955066, "balance_loss_mlp": 1.01275373, "epoch": 0.7303553177418385, "flos": 27819170732160.0, "grad_norm": 2.0886198458986738, "language_loss": 0.60453653, "learning_rate": 7.152869228871185e-07, "loss": 0.6259315, "num_input_tokens_seen": 130590195, "step": 6074, "time_per_iteration": 2.7987170219421387 }, { "auxiliary_loss_clip": 0.01137001, "auxiliary_loss_mlp": 0.0102862, "balance_loss_clip": 1.04436815, "balance_loss_mlp": 1.02083564, "epoch": 0.7304755606324776, "flos": 24426510318720.0, "grad_norm": 1.8193269530027092, "language_loss": 0.71880651, "learning_rate": 7.146900118432457e-07, "loss": 0.74046278, "num_input_tokens_seen": 130609940, "step": 6075, "time_per_iteration": 2.7236416339874268 }, { "auxiliary_loss_clip": 0.01078551, "auxiliary_loss_mlp": 0.01024851, "balance_loss_clip": 1.03255081, "balance_loss_mlp": 1.01818752, "epoch": 0.7305958035231167, "flos": 23840483927040.0, "grad_norm": 1.6679542320729357, "language_loss": 0.85823202, "learning_rate": 7.140932957798753e-07, "loss": 0.87926602, "num_input_tokens_seen": 130628380, "step": 6076, "time_per_iteration": 2.923943281173706 }, { "auxiliary_loss_clip": 0.01145297, "auxiliary_loss_mlp": 0.01026866, "balance_loss_clip": 1.04400325, "balance_loss_mlp": 1.0196687, "epoch": 0.7307160464137558, "flos": 16726939597440.0, "grad_norm": 2.1147092281533304, "language_loss": 0.70737302, "learning_rate": 7.134967747875309e-07, "loss": 0.72909456, "num_input_tokens_seen": 130646590, "step": 6077, "time_per_iteration": 2.645317792892456 }, { "auxiliary_loss_clip": 0.01146929, "auxiliary_loss_mlp": 0.01024915, "balance_loss_clip": 1.04217148, "balance_loss_mlp": 1.01772034, "epoch": 0.7308362893043949, "flos": 21798280172160.0, "grad_norm": 2.2725630580416114, "language_loss": 0.81947792, "learning_rate": 7.129004489567014e-07, "loss": 0.8411963, "num_input_tokens_seen": 130664070, "step": 6078, "time_per_iteration": 2.632467746734619 }, { "auxiliary_loss_clip": 0.01129207, "auxiliary_loss_mlp": 0.01025145, "balance_loss_clip": 1.04286528, "balance_loss_mlp": 1.01793289, "epoch": 0.730956532195034, "flos": 10707377840640.0, "grad_norm": 2.2325612113101987, "language_loss": 0.77751648, "learning_rate": 7.123043183778512e-07, "loss": 0.79906005, "num_input_tokens_seen": 130681400, "step": 6079, "time_per_iteration": 2.6426584720611572 }, { "auxiliary_loss_clip": 0.01131183, "auxiliary_loss_mlp": 0.01031881, "balance_loss_clip": 1.04377627, "balance_loss_mlp": 1.02489567, "epoch": 0.731076775085673, "flos": 19791987039360.0, "grad_norm": 1.5467680036266793, "language_loss": 0.64803576, "learning_rate": 7.117083831414114e-07, "loss": 0.66966635, "num_input_tokens_seen": 130700675, "step": 6080, "time_per_iteration": 2.7225708961486816 }, { "auxiliary_loss_clip": 0.01166604, "auxiliary_loss_mlp": 0.01020294, "balance_loss_clip": 1.04679906, "balance_loss_mlp": 1.01345456, "epoch": 0.7311970179763122, "flos": 20447033414400.0, "grad_norm": 1.8519438125482652, "language_loss": 0.69611144, "learning_rate": 7.11112643337787e-07, "loss": 0.71798038, "num_input_tokens_seen": 130719720, "step": 6081, "time_per_iteration": 2.587704658508301 }, { "auxiliary_loss_clip": 0.01142237, "auxiliary_loss_mlp": 0.01022853, "balance_loss_clip": 1.04646039, "balance_loss_mlp": 1.01534843, "epoch": 0.7313172608669513, "flos": 18513818501760.0, "grad_norm": 2.516022164197006, "language_loss": 0.76848078, "learning_rate": 7.10517099057349e-07, "loss": 0.79013169, "num_input_tokens_seen": 130736670, "step": 6082, "time_per_iteration": 2.6389245986938477 }, { "auxiliary_loss_clip": 0.01143274, "auxiliary_loss_mlp": 0.01024411, "balance_loss_clip": 1.04400098, "balance_loss_mlp": 1.01701379, "epoch": 0.7314375037575903, "flos": 16180738410240.0, "grad_norm": 2.431267638794644, "language_loss": 0.61869383, "learning_rate": 7.099217503904411e-07, "loss": 0.64037061, "num_input_tokens_seen": 130754525, "step": 6083, "time_per_iteration": 2.689507246017456 }, { "auxiliary_loss_clip": 0.01142663, "auxiliary_loss_mlp": 0.01026319, "balance_loss_clip": 1.04364085, "balance_loss_mlp": 1.01925635, "epoch": 0.7315577466482295, "flos": 17967940536960.0, "grad_norm": 1.848987066115641, "language_loss": 0.89435959, "learning_rate": 7.093265974273788e-07, "loss": 0.91604936, "num_input_tokens_seen": 130772420, "step": 6084, "time_per_iteration": 2.6483230590820312 }, { "auxiliary_loss_clip": 0.01153098, "auxiliary_loss_mlp": 0.01023974, "balance_loss_clip": 1.04372191, "balance_loss_mlp": 1.01718807, "epoch": 0.7316779895388685, "flos": 18405440190720.0, "grad_norm": 2.324260853356819, "language_loss": 0.72290403, "learning_rate": 7.087316402584447e-07, "loss": 0.7446748, "num_input_tokens_seen": 130791245, "step": 6085, "time_per_iteration": 2.657064199447632 }, { "auxiliary_loss_clip": 0.01167611, "auxiliary_loss_mlp": 0.01024828, "balance_loss_clip": 1.04676402, "balance_loss_mlp": 1.01789606, "epoch": 0.7317982324295076, "flos": 17928294900480.0, "grad_norm": 1.8059641123729397, "language_loss": 0.86411631, "learning_rate": 7.081368789738953e-07, "loss": 0.88604081, "num_input_tokens_seen": 130808445, "step": 6086, "time_per_iteration": 2.6807308197021484 }, { "auxiliary_loss_clip": 0.01133245, "auxiliary_loss_mlp": 0.01023615, "balance_loss_clip": 1.03931236, "balance_loss_mlp": 1.01625669, "epoch": 0.7319184753201466, "flos": 27229840289280.0, "grad_norm": 1.9421357534881216, "language_loss": 0.77874804, "learning_rate": 7.075423136639537e-07, "loss": 0.80031669, "num_input_tokens_seen": 130827700, "step": 6087, "time_per_iteration": 2.7404978275299072 }, { "auxiliary_loss_clip": 0.01122055, "auxiliary_loss_mlp": 0.01028526, "balance_loss_clip": 1.04129291, "balance_loss_mlp": 1.02099752, "epoch": 0.7320387182107858, "flos": 37448544574080.0, "grad_norm": 1.7687022053844337, "language_loss": 0.74847019, "learning_rate": 7.069479444188149e-07, "loss": 0.76997596, "num_input_tokens_seen": 130848290, "step": 6088, "time_per_iteration": 2.8200392723083496 }, { "auxiliary_loss_clip": 0.01131981, "auxiliary_loss_mlp": 0.01026569, "balance_loss_clip": 1.04380238, "balance_loss_mlp": 1.01938391, "epoch": 0.7321589611014249, "flos": 17859023521920.0, "grad_norm": 2.514695109210698, "language_loss": 0.82167232, "learning_rate": 7.063537713286453e-07, "loss": 0.84325778, "num_input_tokens_seen": 130865970, "step": 6089, "time_per_iteration": 3.588059663772583 }, { "auxiliary_loss_clip": 0.01143785, "auxiliary_loss_mlp": 0.01027205, "balance_loss_clip": 1.04302943, "balance_loss_mlp": 1.01976013, "epoch": 0.7322792039920639, "flos": 26100593539200.0, "grad_norm": 2.0386376291392785, "language_loss": 0.80817878, "learning_rate": 7.057597944835803e-07, "loss": 0.82988864, "num_input_tokens_seen": 130885245, "step": 6090, "time_per_iteration": 2.7465696334838867 }, { "auxiliary_loss_clip": 0.01130876, "auxiliary_loss_mlp": 0.01024406, "balance_loss_clip": 1.04108882, "balance_loss_mlp": 1.01755738, "epoch": 0.7323994468827031, "flos": 25369093065600.0, "grad_norm": 1.6603237953045553, "language_loss": 0.74667007, "learning_rate": 7.051660139737253e-07, "loss": 0.76822287, "num_input_tokens_seen": 130903465, "step": 6091, "time_per_iteration": 2.721478223800659 }, { "auxiliary_loss_clip": 0.01152329, "auxiliary_loss_mlp": 0.00761875, "balance_loss_clip": 1.0470053, "balance_loss_mlp": 1.00037515, "epoch": 0.7325196897733421, "flos": 26907075653760.0, "grad_norm": 2.599001020473849, "language_loss": 0.76815605, "learning_rate": 7.045724298891565e-07, "loss": 0.78729808, "num_input_tokens_seen": 130922935, "step": 6092, "time_per_iteration": 3.6391196250915527 }, { "auxiliary_loss_clip": 0.01154427, "auxiliary_loss_mlp": 0.01023141, "balance_loss_clip": 1.04767847, "balance_loss_mlp": 1.016137, "epoch": 0.7326399326639812, "flos": 25775781828480.0, "grad_norm": 1.8900254938226724, "language_loss": 0.69346154, "learning_rate": 7.039790423199192e-07, "loss": 0.7152372, "num_input_tokens_seen": 130942575, "step": 6093, "time_per_iteration": 3.596498966217041 }, { "auxiliary_loss_clip": 0.01143828, "auxiliary_loss_mlp": 0.01019487, "balance_loss_clip": 1.04539418, "balance_loss_mlp": 1.0119884, "epoch": 0.7327601755546204, "flos": 21032269706880.0, "grad_norm": 1.956844456188359, "language_loss": 0.77277523, "learning_rate": 7.033858513560322e-07, "loss": 0.79440832, "num_input_tokens_seen": 130958870, "step": 6094, "time_per_iteration": 2.6508889198303223 }, { "auxiliary_loss_clip": 0.01156238, "auxiliary_loss_mlp": 0.01026914, "balance_loss_clip": 1.04832983, "balance_loss_mlp": 1.01970243, "epoch": 0.7328804184452594, "flos": 16289224462080.0, "grad_norm": 2.195745681618227, "language_loss": 0.75718939, "learning_rate": 7.027928570874794e-07, "loss": 0.77902091, "num_input_tokens_seen": 130977060, "step": 6095, "time_per_iteration": 3.5407028198242188 }, { "auxiliary_loss_clip": 0.0116588, "auxiliary_loss_mlp": 0.01027161, "balance_loss_clip": 1.04567385, "balance_loss_mlp": 1.02011538, "epoch": 0.7330006613358985, "flos": 17858233422720.0, "grad_norm": 2.1271134368896627, "language_loss": 0.85205698, "learning_rate": 7.022000596042194e-07, "loss": 0.87398738, "num_input_tokens_seen": 130994160, "step": 6096, "time_per_iteration": 2.5869038105010986 }, { "auxiliary_loss_clip": 0.01127139, "auxiliary_loss_mlp": 0.01023343, "balance_loss_clip": 1.04021788, "balance_loss_mlp": 1.01607978, "epoch": 0.7331209042265376, "flos": 22492074343680.0, "grad_norm": 2.112414518673481, "language_loss": 0.81813431, "learning_rate": 7.016074589961784e-07, "loss": 0.83963913, "num_input_tokens_seen": 131012725, "step": 6097, "time_per_iteration": 2.7254457473754883 }, { "auxiliary_loss_clip": 0.01136594, "auxiliary_loss_mlp": 0.01022544, "balance_loss_clip": 1.04444003, "balance_loss_mlp": 1.01540303, "epoch": 0.7332411471171767, "flos": 33072757937280.0, "grad_norm": 2.0078630190332856, "language_loss": 0.67061466, "learning_rate": 7.01015055353253e-07, "loss": 0.69220603, "num_input_tokens_seen": 131035150, "step": 6098, "time_per_iteration": 2.735308885574341 }, { "auxiliary_loss_clip": 0.01102566, "auxiliary_loss_mlp": 0.01026944, "balance_loss_clip": 1.04050553, "balance_loss_mlp": 1.01927865, "epoch": 0.7333613900078157, "flos": 22743017735040.0, "grad_norm": 1.6646637309088514, "language_loss": 0.7814225, "learning_rate": 7.004228487653123e-07, "loss": 0.80271757, "num_input_tokens_seen": 131055955, "step": 6099, "time_per_iteration": 2.786825656890869 }, { "auxiliary_loss_clip": 0.01121683, "auxiliary_loss_mlp": 0.01023959, "balance_loss_clip": 1.03855133, "balance_loss_mlp": 1.01706314, "epoch": 0.7334816328984549, "flos": 22346133384960.0, "grad_norm": 1.7017383862174718, "language_loss": 0.78292018, "learning_rate": 6.998308393221906e-07, "loss": 0.8043766, "num_input_tokens_seen": 131074360, "step": 6100, "time_per_iteration": 2.7142446041107178 }, { "auxiliary_loss_clip": 0.01131895, "auxiliary_loss_mlp": 0.01025532, "balance_loss_clip": 1.04542506, "balance_loss_mlp": 1.01916361, "epoch": 0.733601875789094, "flos": 20736149984640.0, "grad_norm": 2.63138820289893, "language_loss": 0.71046239, "learning_rate": 6.992390271136977e-07, "loss": 0.73203665, "num_input_tokens_seen": 131090070, "step": 6101, "time_per_iteration": 2.6897268295288086 }, { "auxiliary_loss_clip": 0.01146556, "auxiliary_loss_mlp": 0.01027683, "balance_loss_clip": 1.04303432, "balance_loss_mlp": 1.02063465, "epoch": 0.733722118679733, "flos": 22564362464640.0, "grad_norm": 1.9277950259977643, "language_loss": 0.85557026, "learning_rate": 6.986474122296094e-07, "loss": 0.87731266, "num_input_tokens_seen": 131109185, "step": 6102, "time_per_iteration": 2.7104127407073975 }, { "auxiliary_loss_clip": 0.01171092, "auxiliary_loss_mlp": 0.01025821, "balance_loss_clip": 1.04846251, "balance_loss_mlp": 1.01814389, "epoch": 0.7338423615703722, "flos": 20084192179200.0, "grad_norm": 3.697914800952412, "language_loss": 0.72823578, "learning_rate": 6.980559947596751e-07, "loss": 0.75020492, "num_input_tokens_seen": 131127725, "step": 6103, "time_per_iteration": 2.6069588661193848 }, { "auxiliary_loss_clip": 0.01110651, "auxiliary_loss_mlp": 0.01026824, "balance_loss_clip": 1.03949296, "balance_loss_mlp": 1.0194118, "epoch": 0.7339626044610112, "flos": 21687675217920.0, "grad_norm": 1.9903755556231217, "language_loss": 0.7576701, "learning_rate": 6.974647747936109e-07, "loss": 0.77904487, "num_input_tokens_seen": 131146110, "step": 6104, "time_per_iteration": 2.825244665145874 }, { "auxiliary_loss_clip": 0.01167891, "auxiliary_loss_mlp": 0.00761817, "balance_loss_clip": 1.04689729, "balance_loss_mlp": 1.00042892, "epoch": 0.7340828473516503, "flos": 15268248282240.0, "grad_norm": 2.0342524475475745, "language_loss": 0.82833529, "learning_rate": 6.968737524211039e-07, "loss": 0.84763241, "num_input_tokens_seen": 131162920, "step": 6105, "time_per_iteration": 2.5975215435028076 }, { "auxiliary_loss_clip": 0.01153968, "auxiliary_loss_mlp": 0.01023195, "balance_loss_clip": 1.04725409, "balance_loss_mlp": 1.0161078, "epoch": 0.7342030902422895, "flos": 22930112701440.0, "grad_norm": 2.234961974479447, "language_loss": 0.80044168, "learning_rate": 6.962829277318132e-07, "loss": 0.82221329, "num_input_tokens_seen": 131182515, "step": 6106, "time_per_iteration": 2.681307315826416 }, { "auxiliary_loss_clip": 0.01157299, "auxiliary_loss_mlp": 0.01024428, "balance_loss_clip": 1.04708707, "balance_loss_mlp": 1.01787448, "epoch": 0.7343233331329285, "flos": 25847890381440.0, "grad_norm": 7.203489426051874, "language_loss": 0.83189243, "learning_rate": 6.956923008153652e-07, "loss": 0.8537097, "num_input_tokens_seen": 131202280, "step": 6107, "time_per_iteration": 2.6482224464416504 }, { "auxiliary_loss_clip": 0.01151931, "auxiliary_loss_mlp": 0.01026013, "balance_loss_clip": 1.04225278, "balance_loss_mlp": 1.01909256, "epoch": 0.7344435760235676, "flos": 18478985287680.0, "grad_norm": 2.284717790901659, "language_loss": 0.84300148, "learning_rate": 6.951018717613593e-07, "loss": 0.8647809, "num_input_tokens_seen": 131221295, "step": 6108, "time_per_iteration": 2.683530807495117 }, { "auxiliary_loss_clip": 0.01153055, "auxiliary_loss_mlp": 0.01025558, "balance_loss_clip": 1.04590845, "balance_loss_mlp": 1.01861393, "epoch": 0.7345638189142067, "flos": 17640040256640.0, "grad_norm": 2.074661338240404, "language_loss": 0.78401726, "learning_rate": 6.945116406593614e-07, "loss": 0.80580342, "num_input_tokens_seen": 131240150, "step": 6109, "time_per_iteration": 2.661816358566284 }, { "auxiliary_loss_clip": 0.01114456, "auxiliary_loss_mlp": 0.01025119, "balance_loss_clip": 1.04267454, "balance_loss_mlp": 1.01791906, "epoch": 0.7346840618048458, "flos": 20260225756800.0, "grad_norm": 2.0592450051425, "language_loss": 0.74569356, "learning_rate": 6.939216075989089e-07, "loss": 0.76708937, "num_input_tokens_seen": 131258080, "step": 6110, "time_per_iteration": 2.7212815284729004 }, { "auxiliary_loss_clip": 0.0113694, "auxiliary_loss_mlp": 0.01026289, "balance_loss_clip": 1.04203832, "balance_loss_mlp": 1.0191927, "epoch": 0.7348043046954849, "flos": 29023183641600.0, "grad_norm": 2.0460165824631478, "language_loss": 0.65902704, "learning_rate": 6.933317726695109e-07, "loss": 0.68065929, "num_input_tokens_seen": 131279310, "step": 6111, "time_per_iteration": 2.6604928970336914 }, { "auxiliary_loss_clip": 0.01124483, "auxiliary_loss_mlp": 0.01021088, "balance_loss_clip": 1.04504037, "balance_loss_mlp": 1.01418018, "epoch": 0.734924547586124, "flos": 17931203902080.0, "grad_norm": 2.6281996606430384, "language_loss": 0.80100441, "learning_rate": 6.92742135960644e-07, "loss": 0.82246006, "num_input_tokens_seen": 131297010, "step": 6112, "time_per_iteration": 2.7247776985168457 }, { "auxiliary_loss_clip": 0.0105774, "auxiliary_loss_mlp": 0.01000864, "balance_loss_clip": 1.0117979, "balance_loss_mlp": 0.9996835, "epoch": 0.7350447904767631, "flos": 63588319850880.0, "grad_norm": 0.8248769957439724, "language_loss": 0.55651057, "learning_rate": 6.921526975617556e-07, "loss": 0.57709658, "num_input_tokens_seen": 131356470, "step": 6113, "time_per_iteration": 3.2584221363067627 }, { "auxiliary_loss_clip": 0.0114409, "auxiliary_loss_mlp": 0.01024351, "balance_loss_clip": 1.04375446, "balance_loss_mlp": 1.0168556, "epoch": 0.7351650333674021, "flos": 21580015178880.0, "grad_norm": 1.9153751081803623, "language_loss": 0.75655913, "learning_rate": 6.915634575622631e-07, "loss": 0.77824354, "num_input_tokens_seen": 131374985, "step": 6114, "time_per_iteration": 2.685204267501831 }, { "auxiliary_loss_clip": 0.01164807, "auxiliary_loss_mlp": 0.01022969, "balance_loss_clip": 1.04503083, "balance_loss_mlp": 1.01580477, "epoch": 0.7352852762580413, "flos": 18186349184640.0, "grad_norm": 1.9002517603334987, "language_loss": 0.7124114, "learning_rate": 6.909744160515532e-07, "loss": 0.73428917, "num_input_tokens_seen": 131393125, "step": 6115, "time_per_iteration": 3.6154119968414307 }, { "auxiliary_loss_clip": 0.01138339, "auxiliary_loss_mlp": 0.01023337, "balance_loss_clip": 1.04455948, "balance_loss_mlp": 1.01588619, "epoch": 0.7354055191486804, "flos": 38910073063680.0, "grad_norm": 2.0563367034157762, "language_loss": 0.69645005, "learning_rate": 6.903855731189849e-07, "loss": 0.71806681, "num_input_tokens_seen": 131415760, "step": 6116, "time_per_iteration": 2.8522636890411377 }, { "auxiliary_loss_clip": 0.01145538, "auxiliary_loss_mlp": 0.01030377, "balance_loss_clip": 1.04394782, "balance_loss_mlp": 1.02267587, "epoch": 0.7355257620393194, "flos": 16289978647680.0, "grad_norm": 2.0658872808129614, "language_loss": 0.8164078, "learning_rate": 6.897969288538825e-07, "loss": 0.83816689, "num_input_tokens_seen": 131433705, "step": 6117, "time_per_iteration": 2.6889140605926514 }, { "auxiliary_loss_clip": 0.01134037, "auxiliary_loss_mlp": 0.0102378, "balance_loss_clip": 1.04170787, "balance_loss_mlp": 1.01679707, "epoch": 0.7356460049299585, "flos": 18114240631680.0, "grad_norm": 1.8389390514381192, "language_loss": 0.81356007, "learning_rate": 6.892084833455452e-07, "loss": 0.83513826, "num_input_tokens_seen": 131453275, "step": 6118, "time_per_iteration": 3.685389757156372 }, { "auxiliary_loss_clip": 0.01153269, "auxiliary_loss_mlp": 0.01020717, "balance_loss_clip": 1.04668689, "balance_loss_mlp": 1.01398134, "epoch": 0.7357662478205976, "flos": 21325193118720.0, "grad_norm": 1.4415584071879186, "language_loss": 0.83714473, "learning_rate": 6.886202366832384e-07, "loss": 0.85888469, "num_input_tokens_seen": 131474960, "step": 6119, "time_per_iteration": 3.6610970497131348 }, { "auxiliary_loss_clip": 0.0111033, "auxiliary_loss_mlp": 0.01024022, "balance_loss_clip": 1.04309869, "balance_loss_mlp": 1.01679778, "epoch": 0.7358864907112367, "flos": 14246841139200.0, "grad_norm": 2.238615281896033, "language_loss": 0.7363233, "learning_rate": 6.880321889561987e-07, "loss": 0.75766677, "num_input_tokens_seen": 131492935, "step": 6120, "time_per_iteration": 2.7224111557006836 }, { "auxiliary_loss_clip": 0.01119619, "auxiliary_loss_mlp": 0.0102503, "balance_loss_clip": 1.04149139, "balance_loss_mlp": 1.01675725, "epoch": 0.7360067336018757, "flos": 22309684058880.0, "grad_norm": 2.866509689302074, "language_loss": 0.64800429, "learning_rate": 6.874443402536338e-07, "loss": 0.66945088, "num_input_tokens_seen": 131512025, "step": 6121, "time_per_iteration": 3.6904549598693848 }, { "auxiliary_loss_clip": 0.0114401, "auxiliary_loss_mlp": 0.01023259, "balance_loss_clip": 1.04593706, "balance_loss_mlp": 1.01614857, "epoch": 0.7361269764925149, "flos": 25554607833600.0, "grad_norm": 2.1278581255609277, "language_loss": 0.80672872, "learning_rate": 6.868566906647177e-07, "loss": 0.82840145, "num_input_tokens_seen": 131532975, "step": 6122, "time_per_iteration": 2.666492462158203 }, { "auxiliary_loss_clip": 0.01156282, "auxiliary_loss_mlp": 0.01027164, "balance_loss_clip": 1.04644883, "balance_loss_mlp": 1.019508, "epoch": 0.736247219383154, "flos": 20376505059840.0, "grad_norm": 1.9326593252309239, "language_loss": 0.83392537, "learning_rate": 6.862692402785984e-07, "loss": 0.8557598, "num_input_tokens_seen": 131553225, "step": 6123, "time_per_iteration": 2.744079351425171 }, { "auxiliary_loss_clip": 0.01037955, "auxiliary_loss_mlp": 0.01000677, "balance_loss_clip": 1.01759815, "balance_loss_mlp": 0.99937785, "epoch": 0.736367462273793, "flos": 70339525735680.0, "grad_norm": 0.6814638255791329, "language_loss": 0.4961538, "learning_rate": 6.856819891843899e-07, "loss": 0.51654017, "num_input_tokens_seen": 131617930, "step": 6124, "time_per_iteration": 3.357239246368408 }, { "auxiliary_loss_clip": 0.01099121, "auxiliary_loss_mlp": 0.01023124, "balance_loss_clip": 1.04019141, "balance_loss_mlp": 1.01605499, "epoch": 0.7364877051644322, "flos": 22412711243520.0, "grad_norm": 1.9801678944034977, "language_loss": 0.72386044, "learning_rate": 6.8509493747118e-07, "loss": 0.74508286, "num_input_tokens_seen": 131636740, "step": 6125, "time_per_iteration": 2.7425270080566406 }, { "auxiliary_loss_clip": 0.01168817, "auxiliary_loss_mlp": 0.01029566, "balance_loss_clip": 1.04805648, "balance_loss_mlp": 1.02288449, "epoch": 0.7366079480550712, "flos": 12130266274560.0, "grad_norm": 3.502325688495394, "language_loss": 0.88581395, "learning_rate": 6.845080852280221e-07, "loss": 0.90779781, "num_input_tokens_seen": 131653810, "step": 6126, "time_per_iteration": 2.5839459896087646 }, { "auxiliary_loss_clip": 0.01126044, "auxiliary_loss_mlp": 0.01022169, "balance_loss_clip": 1.04225349, "balance_loss_mlp": 1.015347, "epoch": 0.7367281909457103, "flos": 15049336844160.0, "grad_norm": 1.8123386553167553, "language_loss": 0.74197537, "learning_rate": 6.839214325439409e-07, "loss": 0.76345754, "num_input_tokens_seen": 131671505, "step": 6127, "time_per_iteration": 2.666963815689087 }, { "auxiliary_loss_clip": 0.01133677, "auxiliary_loss_mlp": 0.010312, "balance_loss_clip": 1.04400754, "balance_loss_mlp": 1.02447414, "epoch": 0.7368484338363495, "flos": 23510752053120.0, "grad_norm": 1.7097160009710766, "language_loss": 0.71880198, "learning_rate": 6.833349795079327e-07, "loss": 0.74045074, "num_input_tokens_seen": 131690615, "step": 6128, "time_per_iteration": 2.70161771774292 }, { "auxiliary_loss_clip": 0.01127505, "auxiliary_loss_mlp": 0.01023456, "balance_loss_clip": 1.04527569, "balance_loss_mlp": 1.01648521, "epoch": 0.7369686767269885, "flos": 27417833095680.0, "grad_norm": 1.808217919827775, "language_loss": 0.68385935, "learning_rate": 6.827487262089613e-07, "loss": 0.70536894, "num_input_tokens_seen": 131711120, "step": 6129, "time_per_iteration": 2.8333590030670166 }, { "auxiliary_loss_clip": 0.01040251, "auxiliary_loss_mlp": 0.01000826, "balance_loss_clip": 1.01017439, "balance_loss_mlp": 0.99964631, "epoch": 0.7370889196176276, "flos": 70293343824000.0, "grad_norm": 0.892938912722892, "language_loss": 0.5684284, "learning_rate": 6.821626727359606e-07, "loss": 0.58883917, "num_input_tokens_seen": 131776680, "step": 6130, "time_per_iteration": 3.3624086380004883 }, { "auxiliary_loss_clip": 0.0113823, "auxiliary_loss_mlp": 0.01028494, "balance_loss_clip": 1.04475737, "balance_loss_mlp": 1.02020252, "epoch": 0.7372091625082667, "flos": 18040839189120.0, "grad_norm": 3.1919237366180155, "language_loss": 0.77428907, "learning_rate": 6.815768191778348e-07, "loss": 0.79595637, "num_input_tokens_seen": 131794760, "step": 6131, "time_per_iteration": 2.718435525894165 }, { "auxiliary_loss_clip": 0.01149373, "auxiliary_loss_mlp": 0.01023869, "balance_loss_clip": 1.04390895, "balance_loss_mlp": 1.01605487, "epoch": 0.7373294053989058, "flos": 33726331854720.0, "grad_norm": 1.759335844518035, "language_loss": 0.72766936, "learning_rate": 6.809911656234569e-07, "loss": 0.74940181, "num_input_tokens_seen": 131816735, "step": 6132, "time_per_iteration": 2.781639814376831 }, { "auxiliary_loss_clip": 0.01126753, "auxiliary_loss_mlp": 0.01026103, "balance_loss_clip": 1.04064047, "balance_loss_mlp": 1.01896262, "epoch": 0.7374496482895448, "flos": 21506326427520.0, "grad_norm": 2.0289294203118997, "language_loss": 0.78189903, "learning_rate": 6.804057121616707e-07, "loss": 0.80342758, "num_input_tokens_seen": 131834940, "step": 6133, "time_per_iteration": 2.733464002609253 }, { "auxiliary_loss_clip": 0.01156362, "auxiliary_loss_mlp": 0.01023743, "balance_loss_clip": 1.04602313, "balance_loss_mlp": 1.01636338, "epoch": 0.737569891180184, "flos": 24936908624640.0, "grad_norm": 1.8225271731410826, "language_loss": 0.71994406, "learning_rate": 6.798204588812888e-07, "loss": 0.74174511, "num_input_tokens_seen": 131854355, "step": 6134, "time_per_iteration": 2.732590436935425 }, { "auxiliary_loss_clip": 0.01088261, "auxiliary_loss_mlp": 0.00762605, "balance_loss_clip": 1.03734326, "balance_loss_mlp": 1.00032258, "epoch": 0.7376901340708231, "flos": 20664544222080.0, "grad_norm": 1.9399441165931683, "language_loss": 0.7533108, "learning_rate": 6.792354058710937e-07, "loss": 0.77181947, "num_input_tokens_seen": 131871825, "step": 6135, "time_per_iteration": 2.833524465560913 }, { "auxiliary_loss_clip": 0.01162145, "auxiliary_loss_mlp": 0.010203, "balance_loss_clip": 1.04581976, "balance_loss_mlp": 1.01377332, "epoch": 0.7378103769614621, "flos": 23805794367360.0, "grad_norm": 2.1575796254350936, "language_loss": 0.65367758, "learning_rate": 6.786505532198374e-07, "loss": 0.67550206, "num_input_tokens_seen": 131890770, "step": 6136, "time_per_iteration": 2.6107802391052246 }, { "auxiliary_loss_clip": 0.01170643, "auxiliary_loss_mlp": 0.01024665, "balance_loss_clip": 1.04824233, "balance_loss_mlp": 1.0179565, "epoch": 0.7379306198521013, "flos": 22237216369920.0, "grad_norm": 1.6630128771375878, "language_loss": 0.85195726, "learning_rate": 6.780659010162411e-07, "loss": 0.87391031, "num_input_tokens_seen": 131909720, "step": 6137, "time_per_iteration": 2.6605801582336426 }, { "auxiliary_loss_clip": 0.01128664, "auxiliary_loss_mlp": 0.01021449, "balance_loss_clip": 1.04304051, "balance_loss_mlp": 1.01457059, "epoch": 0.7380508627427403, "flos": 14903108576640.0, "grad_norm": 2.221249823598007, "language_loss": 0.83147383, "learning_rate": 6.774814493489975e-07, "loss": 0.85297489, "num_input_tokens_seen": 131927395, "step": 6138, "time_per_iteration": 2.6516075134277344 }, { "auxiliary_loss_clip": 0.01147902, "auxiliary_loss_mlp": 0.01021874, "balance_loss_clip": 1.04400373, "balance_loss_mlp": 1.0152756, "epoch": 0.7381711056333794, "flos": 21685843624320.0, "grad_norm": 2.6914548967087124, "language_loss": 0.66469336, "learning_rate": 6.768971983067655e-07, "loss": 0.68639106, "num_input_tokens_seen": 131947725, "step": 6139, "time_per_iteration": 2.652316093444824 }, { "auxiliary_loss_clip": 0.01066724, "auxiliary_loss_mlp": 0.0100128, "balance_loss_clip": 1.01143932, "balance_loss_mlp": 1.00017691, "epoch": 0.7382913485240186, "flos": 52404263596800.0, "grad_norm": 1.0358542440511391, "language_loss": 0.67792279, "learning_rate": 6.763131479781772e-07, "loss": 0.69860286, "num_input_tokens_seen": 131997485, "step": 6140, "time_per_iteration": 3.0506324768066406 }, { "auxiliary_loss_clip": 0.01134148, "auxiliary_loss_mlp": 0.01022972, "balance_loss_clip": 1.04591441, "balance_loss_mlp": 1.01643085, "epoch": 0.7384115914146576, "flos": 21798818876160.0, "grad_norm": 1.8192883155409, "language_loss": 0.7598536, "learning_rate": 6.757292984518316e-07, "loss": 0.78142476, "num_input_tokens_seen": 132016885, "step": 6141, "time_per_iteration": 3.6502699851989746 }, { "auxiliary_loss_clip": 0.01057816, "auxiliary_loss_mlp": 0.01002055, "balance_loss_clip": 1.0117054, "balance_loss_mlp": 1.00093412, "epoch": 0.7385318343052967, "flos": 61494331662720.0, "grad_norm": 0.742588597527093, "language_loss": 0.56435442, "learning_rate": 6.751456498162981e-07, "loss": 0.58495313, "num_input_tokens_seen": 132075920, "step": 6142, "time_per_iteration": 3.1735730171203613 }, { "auxiliary_loss_clip": 0.0115061, "auxiliary_loss_mlp": 0.01020828, "balance_loss_clip": 1.04159772, "balance_loss_mlp": 1.01447964, "epoch": 0.7386520771959358, "flos": 17013757697280.0, "grad_norm": 1.8059934994640525, "language_loss": 0.84910655, "learning_rate": 6.745622021601174e-07, "loss": 0.870821, "num_input_tokens_seen": 132092945, "step": 6143, "time_per_iteration": 2.579279661178589 }, { "auxiliary_loss_clip": 0.01127825, "auxiliary_loss_mlp": 0.01025174, "balance_loss_clip": 1.04210639, "balance_loss_mlp": 1.01752687, "epoch": 0.7387723200865749, "flos": 18770759464320.0, "grad_norm": 3.1796999162567205, "language_loss": 0.69921428, "learning_rate": 6.739789555717954e-07, "loss": 0.72074425, "num_input_tokens_seen": 132109920, "step": 6144, "time_per_iteration": 3.619631767272949 }, { "auxiliary_loss_clip": 0.01164342, "auxiliary_loss_mlp": 0.0102766, "balance_loss_clip": 1.04446328, "balance_loss_mlp": 1.02112484, "epoch": 0.738892562977214, "flos": 22525542840960.0, "grad_norm": 1.9400843925302267, "language_loss": 0.77214098, "learning_rate": 6.733959101398124e-07, "loss": 0.79406095, "num_input_tokens_seen": 132128050, "step": 6145, "time_per_iteration": 3.5423765182495117 }, { "auxiliary_loss_clip": 0.01136346, "auxiliary_loss_mlp": 0.01028842, "balance_loss_clip": 1.04183698, "balance_loss_mlp": 1.02232146, "epoch": 0.7390128058678531, "flos": 21501478091520.0, "grad_norm": 1.6293756708988685, "language_loss": 0.81573701, "learning_rate": 6.728130659526143e-07, "loss": 0.83738887, "num_input_tokens_seen": 132145860, "step": 6146, "time_per_iteration": 2.675356388092041 }, { "auxiliary_loss_clip": 0.01143133, "auxiliary_loss_mlp": 0.01031307, "balance_loss_clip": 1.04584646, "balance_loss_mlp": 1.02400541, "epoch": 0.7391330487584922, "flos": 25776176878080.0, "grad_norm": 3.7715962278292854, "language_loss": 0.70765883, "learning_rate": 6.7223042309862e-07, "loss": 0.72940326, "num_input_tokens_seen": 132166060, "step": 6147, "time_per_iteration": 3.622819423675537 }, { "auxiliary_loss_clip": 0.01149088, "auxiliary_loss_mlp": 0.01027724, "balance_loss_clip": 1.04287314, "balance_loss_mlp": 1.02035069, "epoch": 0.7392532916491312, "flos": 28366736636160.0, "grad_norm": 2.2515622299296325, "language_loss": 0.73953688, "learning_rate": 6.716479816662144e-07, "loss": 0.76130497, "num_input_tokens_seen": 132187790, "step": 6148, "time_per_iteration": 2.7269699573516846 }, { "auxiliary_loss_clip": 0.01142956, "auxiliary_loss_mlp": 0.01026883, "balance_loss_clip": 1.04342246, "balance_loss_mlp": 1.01971292, "epoch": 0.7393735345397703, "flos": 23585877348480.0, "grad_norm": 1.9986583679071979, "language_loss": 0.73205453, "learning_rate": 6.710657417437531e-07, "loss": 0.75375295, "num_input_tokens_seen": 132207495, "step": 6149, "time_per_iteration": 2.673571825027466 }, { "auxiliary_loss_clip": 0.01137523, "auxiliary_loss_mlp": 0.01024475, "balance_loss_clip": 1.04323149, "balance_loss_mlp": 1.01763797, "epoch": 0.7394937774304094, "flos": 19974772373760.0, "grad_norm": 2.077001166597141, "language_loss": 0.80304366, "learning_rate": 6.704837034195628e-07, "loss": 0.82466364, "num_input_tokens_seen": 132225960, "step": 6150, "time_per_iteration": 2.639875888824463 }, { "auxiliary_loss_clip": 0.0114937, "auxiliary_loss_mlp": 0.0102832, "balance_loss_clip": 1.04698658, "balance_loss_mlp": 1.02125061, "epoch": 0.7396140203210485, "flos": 23478037741440.0, "grad_norm": 1.6936409704951199, "language_loss": 0.85031468, "learning_rate": 6.699018667819376e-07, "loss": 0.87209153, "num_input_tokens_seen": 132245360, "step": 6151, "time_per_iteration": 2.757138252258301 }, { "auxiliary_loss_clip": 0.0115036, "auxiliary_loss_mlp": 0.01029054, "balance_loss_clip": 1.04483044, "balance_loss_mlp": 1.02143013, "epoch": 0.7397342632116876, "flos": 25555433846400.0, "grad_norm": 1.542186676860542, "language_loss": 0.72961366, "learning_rate": 6.693202319191415e-07, "loss": 0.7514078, "num_input_tokens_seen": 132267095, "step": 6152, "time_per_iteration": 2.702171802520752 }, { "auxiliary_loss_clip": 0.01167266, "auxiliary_loss_mlp": 0.01031521, "balance_loss_clip": 1.0494566, "balance_loss_mlp": 1.02404332, "epoch": 0.7398545061023267, "flos": 24755021130240.0, "grad_norm": 13.260195463512833, "language_loss": 0.74896753, "learning_rate": 6.687387989194084e-07, "loss": 0.77095544, "num_input_tokens_seen": 132286610, "step": 6153, "time_per_iteration": 2.613478660583496 }, { "auxiliary_loss_clip": 0.01131357, "auxiliary_loss_mlp": 0.01024296, "balance_loss_clip": 1.04274118, "balance_loss_mlp": 1.01739645, "epoch": 0.7399747489929658, "flos": 16508602776960.0, "grad_norm": 1.8932424676161752, "language_loss": 0.79156268, "learning_rate": 6.681575678709404e-07, "loss": 0.81311917, "num_input_tokens_seen": 132305300, "step": 6154, "time_per_iteration": 2.6608335971832275 }, { "auxiliary_loss_clip": 0.01153166, "auxiliary_loss_mlp": 0.01028663, "balance_loss_clip": 1.04603541, "balance_loss_mlp": 1.021752, "epoch": 0.7400949918836048, "flos": 24097065753600.0, "grad_norm": 2.011080587499535, "language_loss": 0.70579457, "learning_rate": 6.67576538861911e-07, "loss": 0.72761285, "num_input_tokens_seen": 132323875, "step": 6155, "time_per_iteration": 2.58485746383667 }, { "auxiliary_loss_clip": 0.01137159, "auxiliary_loss_mlp": 0.01022853, "balance_loss_clip": 1.04439569, "balance_loss_mlp": 1.01618052, "epoch": 0.740215234774244, "flos": 21802517976960.0, "grad_norm": 1.5327827975595518, "language_loss": 0.82253277, "learning_rate": 6.669957119804612e-07, "loss": 0.8441329, "num_input_tokens_seen": 132345510, "step": 6156, "time_per_iteration": 2.7540125846862793 }, { "auxiliary_loss_clip": 0.01147084, "auxiliary_loss_mlp": 0.01023176, "balance_loss_clip": 1.04396665, "balance_loss_mlp": 1.01556492, "epoch": 0.7403354776648831, "flos": 18733196816640.0, "grad_norm": 3.31545718488185, "language_loss": 0.73210192, "learning_rate": 6.66415087314702e-07, "loss": 0.75380456, "num_input_tokens_seen": 132360465, "step": 6157, "time_per_iteration": 2.6089510917663574 }, { "auxiliary_loss_clip": 0.01139749, "auxiliary_loss_mlp": 0.01021445, "balance_loss_clip": 1.04238176, "balance_loss_mlp": 1.01436055, "epoch": 0.7404557205555221, "flos": 16909581277440.0, "grad_norm": 2.6886413052750595, "language_loss": 0.73544782, "learning_rate": 6.65834664952714e-07, "loss": 0.75705969, "num_input_tokens_seen": 132377915, "step": 6158, "time_per_iteration": 2.67838978767395 }, { "auxiliary_loss_clip": 0.01126677, "auxiliary_loss_mlp": 0.01021295, "balance_loss_clip": 1.0417192, "balance_loss_mlp": 1.01444316, "epoch": 0.7405759634461613, "flos": 21214408596480.0, "grad_norm": 1.755785839360571, "language_loss": 0.75983554, "learning_rate": 6.652544449825457e-07, "loss": 0.78131521, "num_input_tokens_seen": 132398170, "step": 6159, "time_per_iteration": 2.699551820755005 }, { "auxiliary_loss_clip": 0.01147358, "auxiliary_loss_mlp": 0.01026041, "balance_loss_clip": 1.04426515, "balance_loss_mlp": 1.0186677, "epoch": 0.7406962063368003, "flos": 20480106862080.0, "grad_norm": 1.6452843978623748, "language_loss": 0.76552039, "learning_rate": 6.646744274922182e-07, "loss": 0.78725439, "num_input_tokens_seen": 132416615, "step": 6160, "time_per_iteration": 2.7571797370910645 }, { "auxiliary_loss_clip": 0.01141219, "auxiliary_loss_mlp": 0.01023363, "balance_loss_clip": 1.04379106, "balance_loss_mlp": 1.01647294, "epoch": 0.7408164492274394, "flos": 19791915212160.0, "grad_norm": 4.016063246530093, "language_loss": 0.75948834, "learning_rate": 6.640946125697171e-07, "loss": 0.78113413, "num_input_tokens_seen": 132434145, "step": 6161, "time_per_iteration": 2.672748327255249 }, { "auxiliary_loss_clip": 0.01154503, "auxiliary_loss_mlp": 0.0102632, "balance_loss_clip": 1.04459548, "balance_loss_mlp": 1.01885772, "epoch": 0.7409366921180786, "flos": 29204855654400.0, "grad_norm": 2.464715969395424, "language_loss": 0.76026523, "learning_rate": 6.635150003030017e-07, "loss": 0.7820735, "num_input_tokens_seen": 132452670, "step": 6162, "time_per_iteration": 2.781038999557495 }, { "auxiliary_loss_clip": 0.01110683, "auxiliary_loss_mlp": 0.01023184, "balance_loss_clip": 1.03815889, "balance_loss_mlp": 1.01624632, "epoch": 0.7410569350087176, "flos": 22930004960640.0, "grad_norm": 2.1607997882612326, "language_loss": 0.86309177, "learning_rate": 6.629355907799981e-07, "loss": 0.88443041, "num_input_tokens_seen": 132472475, "step": 6163, "time_per_iteration": 2.791494846343994 }, { "auxiliary_loss_clip": 0.01154148, "auxiliary_loss_mlp": 0.0102452, "balance_loss_clip": 1.04369307, "balance_loss_mlp": 1.01690292, "epoch": 0.7411771778993567, "flos": 30440397726720.0, "grad_norm": 1.5983854892380709, "language_loss": 0.68873477, "learning_rate": 6.623563840886015e-07, "loss": 0.71052146, "num_input_tokens_seen": 132493400, "step": 6164, "time_per_iteration": 2.7225863933563232 }, { "auxiliary_loss_clip": 0.01148115, "auxiliary_loss_mlp": 0.01027753, "balance_loss_clip": 1.04225874, "balance_loss_mlp": 1.02035594, "epoch": 0.7412974207899958, "flos": 20522050968960.0, "grad_norm": 1.590874097813442, "language_loss": 0.69388294, "learning_rate": 6.617773803166795e-07, "loss": 0.71564162, "num_input_tokens_seen": 132511725, "step": 6165, "time_per_iteration": 2.6535584926605225 }, { "auxiliary_loss_clip": 0.01144024, "auxiliary_loss_mlp": 0.00762333, "balance_loss_clip": 1.04511309, "balance_loss_mlp": 1.00046992, "epoch": 0.7414176636806349, "flos": 22090700793600.0, "grad_norm": 2.260000285038872, "language_loss": 0.81651258, "learning_rate": 6.611985795520634e-07, "loss": 0.83557612, "num_input_tokens_seen": 132530270, "step": 6166, "time_per_iteration": 2.705854654312134 }, { "auxiliary_loss_clip": 0.01135122, "auxiliary_loss_mlp": 0.0103128, "balance_loss_clip": 1.0463618, "balance_loss_mlp": 1.02372825, "epoch": 0.7415379065712739, "flos": 25155245445120.0, "grad_norm": 1.9417569411543147, "language_loss": 0.77616417, "learning_rate": 6.606199818825588e-07, "loss": 0.7978282, "num_input_tokens_seen": 132550725, "step": 6167, "time_per_iteration": 3.665947675704956 }, { "auxiliary_loss_clip": 0.01143141, "auxiliary_loss_mlp": 0.01023684, "balance_loss_clip": 1.04160309, "balance_loss_mlp": 1.01637018, "epoch": 0.7416581494619131, "flos": 16871731320960.0, "grad_norm": 1.9621960352584757, "language_loss": 0.81717956, "learning_rate": 6.600415873959377e-07, "loss": 0.83884776, "num_input_tokens_seen": 132568600, "step": 6168, "time_per_iteration": 2.641655206680298 }, { "auxiliary_loss_clip": 0.01094446, "auxiliary_loss_mlp": 0.00761831, "balance_loss_clip": 1.03798568, "balance_loss_mlp": 1.00029337, "epoch": 0.7417783923525522, "flos": 28438881102720.0, "grad_norm": 1.917392602003218, "language_loss": 0.64600259, "learning_rate": 6.594633961799437e-07, "loss": 0.66456532, "num_input_tokens_seen": 132587640, "step": 6169, "time_per_iteration": 2.8498239517211914 }, { "auxiliary_loss_clip": 0.01132784, "auxiliary_loss_mlp": 0.01029098, "balance_loss_clip": 1.04280353, "balance_loss_mlp": 1.02211237, "epoch": 0.7418986352431912, "flos": 20084299920000.0, "grad_norm": 7.335748716475869, "language_loss": 0.81735098, "learning_rate": 6.588854083222857e-07, "loss": 0.83896983, "num_input_tokens_seen": 132607075, "step": 6170, "time_per_iteration": 3.6786270141601562 }, { "auxiliary_loss_clip": 0.01146575, "auxiliary_loss_mlp": 0.01023755, "balance_loss_clip": 1.04670024, "balance_loss_mlp": 1.01657581, "epoch": 0.7420188781338304, "flos": 18259571059200.0, "grad_norm": 2.1381310175807644, "language_loss": 0.80877745, "learning_rate": 6.583076239106444e-07, "loss": 0.83048069, "num_input_tokens_seen": 132625580, "step": 6171, "time_per_iteration": 3.5799245834350586 }, { "auxiliary_loss_clip": 0.01145718, "auxiliary_loss_mlp": 0.01025826, "balance_loss_clip": 1.04415154, "balance_loss_mlp": 1.01801765, "epoch": 0.7421391210244694, "flos": 13771994319360.0, "grad_norm": 2.6444057254324447, "language_loss": 0.75089389, "learning_rate": 6.577300430326707e-07, "loss": 0.77260941, "num_input_tokens_seen": 132640525, "step": 6172, "time_per_iteration": 2.663411855697632 }, { "auxiliary_loss_clip": 0.01122971, "auxiliary_loss_mlp": 0.01023425, "balance_loss_clip": 1.04321146, "balance_loss_mlp": 1.01642704, "epoch": 0.7422593639151085, "flos": 15961683317760.0, "grad_norm": 2.5550883715118684, "language_loss": 0.7262525, "learning_rate": 6.571526657759821e-07, "loss": 0.74771643, "num_input_tokens_seen": 132656265, "step": 6173, "time_per_iteration": 3.7747671604156494 }, { "auxiliary_loss_clip": 0.01145114, "auxiliary_loss_mlp": 0.01021839, "balance_loss_clip": 1.04170871, "balance_loss_mlp": 1.01474071, "epoch": 0.7423796068057477, "flos": 30114400867200.0, "grad_norm": 1.5757989332268019, "language_loss": 0.70717388, "learning_rate": 6.565754922281663e-07, "loss": 0.72884345, "num_input_tokens_seen": 132678510, "step": 6174, "time_per_iteration": 2.7009828090667725 }, { "auxiliary_loss_clip": 0.01137283, "auxiliary_loss_mlp": 0.01024701, "balance_loss_clip": 1.04186022, "balance_loss_mlp": 1.0175184, "epoch": 0.7424998496963867, "flos": 20521907314560.0, "grad_norm": 1.9657415361527921, "language_loss": 0.7870422, "learning_rate": 6.559985224767801e-07, "loss": 0.808662, "num_input_tokens_seen": 132696385, "step": 6175, "time_per_iteration": 2.680285930633545 }, { "auxiliary_loss_clip": 0.01133103, "auxiliary_loss_mlp": 0.01024691, "balance_loss_clip": 1.04471266, "balance_loss_mlp": 1.01727569, "epoch": 0.7426200925870258, "flos": 21871573873920.0, "grad_norm": 2.4692111678006805, "language_loss": 0.75406039, "learning_rate": 6.55421756609349e-07, "loss": 0.77563834, "num_input_tokens_seen": 132714640, "step": 6176, "time_per_iteration": 2.643416404724121 }, { "auxiliary_loss_clip": 0.01150099, "auxiliary_loss_mlp": 0.01026566, "balance_loss_clip": 1.04653215, "balance_loss_mlp": 1.01871896, "epoch": 0.7427403354776649, "flos": 26432049265920.0, "grad_norm": 1.816109598237587, "language_loss": 0.79422861, "learning_rate": 6.54845194713369e-07, "loss": 0.81599522, "num_input_tokens_seen": 132735590, "step": 6177, "time_per_iteration": 2.6848556995391846 }, { "auxiliary_loss_clip": 0.01151869, "auxiliary_loss_mlp": 0.01026927, "balance_loss_clip": 1.04709256, "balance_loss_mlp": 1.01951814, "epoch": 0.742860578368304, "flos": 19898390102400.0, "grad_norm": 2.00371013866887, "language_loss": 0.80104649, "learning_rate": 6.542688368763034e-07, "loss": 0.82283449, "num_input_tokens_seen": 132753995, "step": 6178, "time_per_iteration": 2.590877056121826 }, { "auxiliary_loss_clip": 0.01153056, "auxiliary_loss_mlp": 0.01024574, "balance_loss_clip": 1.04698443, "balance_loss_mlp": 1.01750505, "epoch": 0.742980821258943, "flos": 24827201510400.0, "grad_norm": 1.7240918772624372, "language_loss": 0.76909059, "learning_rate": 6.536926831855854e-07, "loss": 0.79086685, "num_input_tokens_seen": 132773160, "step": 6179, "time_per_iteration": 2.672665596008301 }, { "auxiliary_loss_clip": 0.01134528, "auxiliary_loss_mlp": 0.0102586, "balance_loss_clip": 1.04347181, "balance_loss_mlp": 1.01896727, "epoch": 0.7431010641495821, "flos": 25228646887680.0, "grad_norm": 2.1188472283900994, "language_loss": 0.73579532, "learning_rate": 6.531167337286165e-07, "loss": 0.7573992, "num_input_tokens_seen": 132793180, "step": 6180, "time_per_iteration": 2.6938347816467285 }, { "auxiliary_loss_clip": 0.01136597, "auxiliary_loss_mlp": 0.01024819, "balance_loss_clip": 1.04511428, "balance_loss_mlp": 1.01795566, "epoch": 0.7432213070402213, "flos": 21762369550080.0, "grad_norm": 1.4347856741176404, "language_loss": 0.80086946, "learning_rate": 6.52540988592768e-07, "loss": 0.82248366, "num_input_tokens_seen": 132814200, "step": 6181, "time_per_iteration": 2.6921794414520264 }, { "auxiliary_loss_clip": 0.01142173, "auxiliary_loss_mlp": 0.0102712, "balance_loss_clip": 1.04395092, "balance_loss_mlp": 1.02014589, "epoch": 0.7433415499308603, "flos": 14793832425600.0, "grad_norm": 2.0101811982103635, "language_loss": 0.8376357, "learning_rate": 6.519654478653814e-07, "loss": 0.85932863, "num_input_tokens_seen": 132832565, "step": 6182, "time_per_iteration": 2.718783140182495 }, { "auxiliary_loss_clip": 0.01050848, "auxiliary_loss_mlp": 0.01000422, "balance_loss_clip": 1.01354313, "balance_loss_mlp": 0.99921781, "epoch": 0.7434617928214994, "flos": 67155577297920.0, "grad_norm": 0.7453184303340871, "language_loss": 0.56029809, "learning_rate": 6.51390111633763e-07, "loss": 0.58081079, "num_input_tokens_seen": 132897840, "step": 6183, "time_per_iteration": 3.365290880203247 }, { "auxiliary_loss_clip": 0.0109761, "auxiliary_loss_mlp": 0.01023665, "balance_loss_clip": 1.0396148, "balance_loss_mlp": 1.01669407, "epoch": 0.7435820357121385, "flos": 27377576928000.0, "grad_norm": 1.6831489474510961, "language_loss": 0.76521581, "learning_rate": 6.508149799851932e-07, "loss": 0.78642857, "num_input_tokens_seen": 132919505, "step": 6184, "time_per_iteration": 2.9354984760284424 }, { "auxiliary_loss_clip": 0.01133051, "auxiliary_loss_mlp": 0.01022251, "balance_loss_clip": 1.04284203, "balance_loss_mlp": 1.01578045, "epoch": 0.7437022786027776, "flos": 23987645948160.0, "grad_norm": 1.9986587032979253, "language_loss": 0.60991544, "learning_rate": 6.502400530069183e-07, "loss": 0.63146842, "num_input_tokens_seen": 132939390, "step": 6185, "time_per_iteration": 2.727757215499878 }, { "auxiliary_loss_clip": 0.01124675, "auxiliary_loss_mlp": 0.01030557, "balance_loss_clip": 1.04331207, "balance_loss_mlp": 1.02276659, "epoch": 0.7438225214934167, "flos": 21866761451520.0, "grad_norm": 2.02037388477216, "language_loss": 0.68976486, "learning_rate": 6.496653307861535e-07, "loss": 0.71131718, "num_input_tokens_seen": 132960060, "step": 6186, "time_per_iteration": 2.74678111076355 }, { "auxiliary_loss_clip": 0.01158623, "auxiliary_loss_mlp": 0.01029739, "balance_loss_clip": 1.04666233, "balance_loss_mlp": 1.02244091, "epoch": 0.7439427643840558, "flos": 20230097224320.0, "grad_norm": 1.9108927214583882, "language_loss": 0.66179669, "learning_rate": 6.490908134100857e-07, "loss": 0.6836803, "num_input_tokens_seen": 132978525, "step": 6187, "time_per_iteration": 2.7154691219329834 }, { "auxiliary_loss_clip": 0.0115762, "auxiliary_loss_mlp": 0.01028259, "balance_loss_clip": 1.04561043, "balance_loss_mlp": 1.02096963, "epoch": 0.7440630072746949, "flos": 20849915335680.0, "grad_norm": 2.263680346218758, "language_loss": 0.69878304, "learning_rate": 6.48516500965866e-07, "loss": 0.72064185, "num_input_tokens_seen": 132998460, "step": 6188, "time_per_iteration": 2.7056281566619873 }, { "auxiliary_loss_clip": 0.01156016, "auxiliary_loss_mlp": 0.01022572, "balance_loss_clip": 1.04372478, "balance_loss_mlp": 1.01570833, "epoch": 0.7441832501653339, "flos": 26503762769280.0, "grad_norm": 1.6387093689579877, "language_loss": 0.81713057, "learning_rate": 6.479423935406192e-07, "loss": 0.83891642, "num_input_tokens_seen": 133018445, "step": 6189, "time_per_iteration": 2.710979461669922 }, { "auxiliary_loss_clip": 0.01039946, "auxiliary_loss_mlp": 0.0100231, "balance_loss_clip": 1.01147199, "balance_loss_mlp": 1.00123084, "epoch": 0.7443034930559731, "flos": 68602848088320.0, "grad_norm": 0.8131682532173082, "language_loss": 0.62020528, "learning_rate": 6.473684912214357e-07, "loss": 0.64062786, "num_input_tokens_seen": 133082005, "step": 6190, "time_per_iteration": 3.3992176055908203 }, { "auxiliary_loss_clip": 0.01153844, "auxiliary_loss_mlp": 0.01020349, "balance_loss_clip": 1.04664409, "balance_loss_mlp": 1.01240098, "epoch": 0.7444237359466122, "flos": 18654982951680.0, "grad_norm": 2.0435947838739406, "language_loss": 0.69623053, "learning_rate": 6.467947940953778e-07, "loss": 0.71797246, "num_input_tokens_seen": 133100530, "step": 6191, "time_per_iteration": 2.652797222137451 }, { "auxiliary_loss_clip": 0.01137913, "auxiliary_loss_mlp": 0.0102193, "balance_loss_clip": 1.04418826, "balance_loss_mlp": 1.01517045, "epoch": 0.7445439788372512, "flos": 22817604326400.0, "grad_norm": 1.8150350836994886, "language_loss": 0.72858584, "learning_rate": 6.462213022494732e-07, "loss": 0.75018424, "num_input_tokens_seen": 133119775, "step": 6192, "time_per_iteration": 2.6620213985443115 }, { "auxiliary_loss_clip": 0.01058806, "auxiliary_loss_mlp": 0.01001157, "balance_loss_clip": 1.01320601, "balance_loss_mlp": 1.00005388, "epoch": 0.7446642217278904, "flos": 67045690615680.0, "grad_norm": 0.7703784984523696, "language_loss": 0.609743, "learning_rate": 6.456480157707201e-07, "loss": 0.6303426, "num_input_tokens_seen": 133184550, "step": 6193, "time_per_iteration": 4.086431264877319 }, { "auxiliary_loss_clip": 0.01119351, "auxiliary_loss_mlp": 0.01030476, "balance_loss_clip": 1.04086328, "balance_loss_mlp": 1.02318311, "epoch": 0.7447844646185294, "flos": 17417465631360.0, "grad_norm": 2.242495338294364, "language_loss": 0.84979081, "learning_rate": 6.450749347460866e-07, "loss": 0.87128907, "num_input_tokens_seen": 133201525, "step": 6194, "time_per_iteration": 2.724835157394409 }, { "auxiliary_loss_clip": 0.01166589, "auxiliary_loss_mlp": 0.01026241, "balance_loss_clip": 1.0457859, "balance_loss_mlp": 1.01919556, "epoch": 0.7449047075091685, "flos": 26615876094720.0, "grad_norm": 1.8996971260714226, "language_loss": 0.79105735, "learning_rate": 6.445020592625083e-07, "loss": 0.81298566, "num_input_tokens_seen": 133222175, "step": 6195, "time_per_iteration": 2.7371091842651367 }, { "auxiliary_loss_clip": 0.01164583, "auxiliary_loss_mlp": 0.01024413, "balance_loss_clip": 1.04396796, "balance_loss_mlp": 1.01793718, "epoch": 0.7450249503998077, "flos": 14170458867840.0, "grad_norm": 1.9934078505012374, "language_loss": 0.80482644, "learning_rate": 6.4392938940689e-07, "loss": 0.82671642, "num_input_tokens_seen": 133237590, "step": 6196, "time_per_iteration": 3.5921947956085205 }, { "auxiliary_loss_clip": 0.01108994, "auxiliary_loss_mlp": 0.00762413, "balance_loss_clip": 1.04156744, "balance_loss_mlp": 1.00039983, "epoch": 0.7451451932904467, "flos": 19606687752960.0, "grad_norm": 2.119713770102674, "language_loss": 0.71533179, "learning_rate": 6.433569252661049e-07, "loss": 0.73404586, "num_input_tokens_seen": 133255590, "step": 6197, "time_per_iteration": 3.598384141921997 }, { "auxiliary_loss_clip": 0.01116992, "auxiliary_loss_mlp": 0.01025382, "balance_loss_clip": 1.04030859, "balance_loss_mlp": 1.01888561, "epoch": 0.7452654361810858, "flos": 12495405980160.0, "grad_norm": 2.6214943572619287, "language_loss": 0.71287155, "learning_rate": 6.427846669269952e-07, "loss": 0.73429537, "num_input_tokens_seen": 133273210, "step": 6198, "time_per_iteration": 2.7548880577087402 }, { "auxiliary_loss_clip": 0.01172093, "auxiliary_loss_mlp": 0.01023313, "balance_loss_clip": 1.05139315, "balance_loss_mlp": 1.01636612, "epoch": 0.7453856790717249, "flos": 22127329687680.0, "grad_norm": 2.066059859363878, "language_loss": 0.82059526, "learning_rate": 6.422126144763729e-07, "loss": 0.84254932, "num_input_tokens_seen": 133292600, "step": 6199, "time_per_iteration": 3.5965919494628906 }, { "auxiliary_loss_clip": 0.0112144, "auxiliary_loss_mlp": 0.00762448, "balance_loss_clip": 1.03797698, "balance_loss_mlp": 1.00034308, "epoch": 0.745505921962364, "flos": 20010682995840.0, "grad_norm": 2.177534380578065, "language_loss": 0.76929462, "learning_rate": 6.416407680010174e-07, "loss": 0.7881335, "num_input_tokens_seen": 133306960, "step": 6200, "time_per_iteration": 2.6976382732391357 }, { "auxiliary_loss_clip": 0.01129615, "auxiliary_loss_mlp": 0.01024936, "balance_loss_clip": 1.04622293, "balance_loss_mlp": 1.01782823, "epoch": 0.745626164853003, "flos": 24677884673280.0, "grad_norm": 1.9918685500171576, "language_loss": 0.81095117, "learning_rate": 6.410691275876774e-07, "loss": 0.83249676, "num_input_tokens_seen": 133326380, "step": 6201, "time_per_iteration": 2.7952194213867188 }, { "auxiliary_loss_clip": 0.01148004, "auxiliary_loss_mlp": 0.01029834, "balance_loss_clip": 1.04552293, "balance_loss_mlp": 1.02254128, "epoch": 0.7457464077436422, "flos": 14538830797440.0, "grad_norm": 2.320940626682077, "language_loss": 0.76632571, "learning_rate": 6.404976933230704e-07, "loss": 0.78810412, "num_input_tokens_seen": 133342900, "step": 6202, "time_per_iteration": 2.6755716800689697 }, { "auxiliary_loss_clip": 0.01145553, "auxiliary_loss_mlp": 0.01024914, "balance_loss_clip": 1.04513848, "balance_loss_mlp": 1.01724267, "epoch": 0.7458666506342813, "flos": 34021194600960.0, "grad_norm": 2.0306120178633646, "language_loss": 0.72755754, "learning_rate": 6.399264652938813e-07, "loss": 0.74926221, "num_input_tokens_seen": 133363805, "step": 6203, "time_per_iteration": 2.762225866317749 }, { "auxiliary_loss_clip": 0.01136895, "auxiliary_loss_mlp": 0.0102437, "balance_loss_clip": 1.04284859, "balance_loss_mlp": 1.01728022, "epoch": 0.7459868935249203, "flos": 24279025075200.0, "grad_norm": 1.8858521106181385, "language_loss": 0.74555045, "learning_rate": 6.393554435867679e-07, "loss": 0.76716316, "num_input_tokens_seen": 133384655, "step": 6204, "time_per_iteration": 2.725489377975464 }, { "auxiliary_loss_clip": 0.01120467, "auxiliary_loss_mlp": 0.01031119, "balance_loss_clip": 1.04069018, "balance_loss_mlp": 1.02393639, "epoch": 0.7461071364155595, "flos": 21908777385600.0, "grad_norm": 4.475044887978384, "language_loss": 0.83342665, "learning_rate": 6.387846282883502e-07, "loss": 0.8549425, "num_input_tokens_seen": 133401185, "step": 6205, "time_per_iteration": 2.6904854774475098 }, { "auxiliary_loss_clip": 0.0116907, "auxiliary_loss_mlp": 0.01024745, "balance_loss_clip": 1.04903984, "balance_loss_mlp": 1.01740813, "epoch": 0.7462273793061985, "flos": 22889712879360.0, "grad_norm": 2.207906022797722, "language_loss": 0.76870573, "learning_rate": 6.38214019485223e-07, "loss": 0.79064387, "num_input_tokens_seen": 133420010, "step": 6206, "time_per_iteration": 2.6240029335021973 }, { "auxiliary_loss_clip": 0.01092009, "auxiliary_loss_mlp": 0.01022763, "balance_loss_clip": 1.03897583, "balance_loss_mlp": 1.01531529, "epoch": 0.7463476221968376, "flos": 19968451580160.0, "grad_norm": 2.1413635296819353, "language_loss": 0.72002679, "learning_rate": 6.376436172639461e-07, "loss": 0.74117458, "num_input_tokens_seen": 133437855, "step": 6207, "time_per_iteration": 2.762192726135254 }, { "auxiliary_loss_clip": 0.01086217, "auxiliary_loss_mlp": 0.0102523, "balance_loss_clip": 1.03874004, "balance_loss_mlp": 1.0176245, "epoch": 0.7464678650874768, "flos": 16836610798080.0, "grad_norm": 7.172271799418539, "language_loss": 0.65237814, "learning_rate": 6.370734217110487e-07, "loss": 0.67349261, "num_input_tokens_seen": 133456600, "step": 6208, "time_per_iteration": 2.785538911819458 }, { "auxiliary_loss_clip": 0.01146005, "auxiliary_loss_mlp": 0.01023859, "balance_loss_clip": 1.04878271, "balance_loss_mlp": 1.01612854, "epoch": 0.7465881079781158, "flos": 48100869843840.0, "grad_norm": 1.424048303769729, "language_loss": 0.64227974, "learning_rate": 6.36503432913031e-07, "loss": 0.66397834, "num_input_tokens_seen": 133479745, "step": 6209, "time_per_iteration": 2.8787760734558105 }, { "auxiliary_loss_clip": 0.01152254, "auxiliary_loss_mlp": 0.01021465, "balance_loss_clip": 1.04579377, "balance_loss_mlp": 1.01402009, "epoch": 0.7467083508687549, "flos": 19677359761920.0, "grad_norm": 2.0549736748124747, "language_loss": 0.69023716, "learning_rate": 6.359336509563569e-07, "loss": 0.71197432, "num_input_tokens_seen": 133495765, "step": 6210, "time_per_iteration": 2.601499557495117 }, { "auxiliary_loss_clip": 0.01115892, "auxiliary_loss_mlp": 0.01023254, "balance_loss_clip": 1.04204023, "balance_loss_mlp": 1.01638198, "epoch": 0.7468285937593939, "flos": 17895436934400.0, "grad_norm": 1.9197369274249187, "language_loss": 0.80489951, "learning_rate": 6.353640759274641e-07, "loss": 0.82629097, "num_input_tokens_seen": 133514655, "step": 6211, "time_per_iteration": 2.685817003250122 }, { "auxiliary_loss_clip": 0.01152078, "auxiliary_loss_mlp": 0.01022784, "balance_loss_clip": 1.04339683, "balance_loss_mlp": 1.01545, "epoch": 0.7469488366500331, "flos": 23141446369920.0, "grad_norm": 3.022521627437731, "language_loss": 0.74735475, "learning_rate": 6.347947079127556e-07, "loss": 0.76910329, "num_input_tokens_seen": 133532555, "step": 6212, "time_per_iteration": 2.6430695056915283 }, { "auxiliary_loss_clip": 0.01132323, "auxiliary_loss_mlp": 0.0102735, "balance_loss_clip": 1.04197931, "balance_loss_mlp": 1.02001524, "epoch": 0.7470690795406721, "flos": 16690849407360.0, "grad_norm": 2.2250280184706046, "language_loss": 0.76897228, "learning_rate": 6.342255469986053e-07, "loss": 0.79056901, "num_input_tokens_seen": 133551300, "step": 6213, "time_per_iteration": 2.6677963733673096 }, { "auxiliary_loss_clip": 0.0116835, "auxiliary_loss_mlp": 0.01024922, "balance_loss_clip": 1.04816616, "balance_loss_mlp": 1.0183351, "epoch": 0.7471893224313112, "flos": 25192700352000.0, "grad_norm": 1.9595972353033755, "language_loss": 0.76286519, "learning_rate": 6.336565932713533e-07, "loss": 0.78479791, "num_input_tokens_seen": 133570725, "step": 6214, "time_per_iteration": 2.7073559761047363 }, { "auxiliary_loss_clip": 0.01136412, "auxiliary_loss_mlp": 0.01024519, "balance_loss_clip": 1.04531765, "balance_loss_mlp": 1.01697028, "epoch": 0.7473095653219504, "flos": 22526225199360.0, "grad_norm": 1.9266797020248434, "language_loss": 0.77770662, "learning_rate": 6.330878468173088e-07, "loss": 0.79931593, "num_input_tokens_seen": 133590790, "step": 6215, "time_per_iteration": 2.7077155113220215 }, { "auxiliary_loss_clip": 0.0114552, "auxiliary_loss_mlp": 0.01026175, "balance_loss_clip": 1.0430181, "balance_loss_mlp": 1.01931417, "epoch": 0.7474298082125894, "flos": 18113989236480.0, "grad_norm": 1.7342427734497174, "language_loss": 0.73191398, "learning_rate": 6.32519307722752e-07, "loss": 0.75363088, "num_input_tokens_seen": 133608685, "step": 6216, "time_per_iteration": 2.5879409313201904 }, { "auxiliary_loss_clip": 0.01036042, "auxiliary_loss_mlp": 0.01002538, "balance_loss_clip": 1.01693916, "balance_loss_mlp": 1.00128603, "epoch": 0.7475500511032285, "flos": 62086535193600.0, "grad_norm": 0.8114796853298277, "language_loss": 0.54947698, "learning_rate": 6.31950976073929e-07, "loss": 0.56986284, "num_input_tokens_seen": 133662775, "step": 6217, "time_per_iteration": 3.254901647567749 }, { "auxiliary_loss_clip": 0.0110598, "auxiliary_loss_mlp": 0.01022972, "balance_loss_clip": 1.04088235, "balance_loss_mlp": 1.01577783, "epoch": 0.7476702939938676, "flos": 17785586165760.0, "grad_norm": 2.313469277267881, "language_loss": 0.80730742, "learning_rate": 6.31382851957055e-07, "loss": 0.82859683, "num_input_tokens_seen": 133679595, "step": 6218, "time_per_iteration": 2.74605131149292 }, { "auxiliary_loss_clip": 0.01123067, "auxiliary_loss_mlp": 0.00761962, "balance_loss_clip": 1.04384816, "balance_loss_mlp": 1.0003432, "epoch": 0.7477905368845067, "flos": 27927944092800.0, "grad_norm": 2.029662682670831, "language_loss": 0.7143659, "learning_rate": 6.308149354583143e-07, "loss": 0.73321617, "num_input_tokens_seen": 133699000, "step": 6219, "time_per_iteration": 3.669180393218994 }, { "auxiliary_loss_clip": 0.01159456, "auxiliary_loss_mlp": 0.01024154, "balance_loss_clip": 1.0477128, "balance_loss_mlp": 1.01677191, "epoch": 0.7479107797751458, "flos": 26870374932480.0, "grad_norm": 6.392390538490855, "language_loss": 0.81786942, "learning_rate": 6.302472266638586e-07, "loss": 0.83970553, "num_input_tokens_seen": 133719540, "step": 6220, "time_per_iteration": 2.685568332672119 }, { "auxiliary_loss_clip": 0.01172781, "auxiliary_loss_mlp": 0.01027566, "balance_loss_clip": 1.04796565, "balance_loss_mlp": 1.01853609, "epoch": 0.7480310226657849, "flos": 33943375785600.0, "grad_norm": 2.8540530534478967, "language_loss": 0.69759542, "learning_rate": 6.296797256598101e-07, "loss": 0.71959889, "num_input_tokens_seen": 133741020, "step": 6221, "time_per_iteration": 2.700759172439575 }, { "auxiliary_loss_clip": 0.0111621, "auxiliary_loss_mlp": 0.01023856, "balance_loss_clip": 1.0404377, "balance_loss_mlp": 1.01720679, "epoch": 0.748151265556424, "flos": 24826555065600.0, "grad_norm": 1.7188702061654149, "language_loss": 0.81237924, "learning_rate": 6.291124325322576e-07, "loss": 0.83377987, "num_input_tokens_seen": 133761145, "step": 6222, "time_per_iteration": 3.7109224796295166 }, { "auxiliary_loss_clip": 0.01145707, "auxiliary_loss_mlp": 0.01025544, "balance_loss_clip": 1.04561114, "balance_loss_mlp": 1.01832545, "epoch": 0.748271508447063, "flos": 38399351535360.0, "grad_norm": 1.6779396215167273, "language_loss": 0.62579292, "learning_rate": 6.285453473672595e-07, "loss": 0.6475054, "num_input_tokens_seen": 133783715, "step": 6223, "time_per_iteration": 3.6455330848693848 }, { "auxiliary_loss_clip": 0.01163661, "auxiliary_loss_mlp": 0.01018872, "balance_loss_clip": 1.04399359, "balance_loss_mlp": 1.01211619, "epoch": 0.7483917513377022, "flos": 21541842000000.0, "grad_norm": 1.8781218910827473, "language_loss": 0.75506663, "learning_rate": 6.279784702508415e-07, "loss": 0.77689195, "num_input_tokens_seen": 133804465, "step": 6224, "time_per_iteration": 2.61055588722229 }, { "auxiliary_loss_clip": 0.01041667, "auxiliary_loss_mlp": 0.01001958, "balance_loss_clip": 1.01223099, "balance_loss_mlp": 1.00086176, "epoch": 0.7485119942283412, "flos": 62314532772480.0, "grad_norm": 0.7871278899150429, "language_loss": 0.58513474, "learning_rate": 6.274118012689979e-07, "loss": 0.60557103, "num_input_tokens_seen": 133866365, "step": 6225, "time_per_iteration": 4.301759958267212 }, { "auxiliary_loss_clip": 0.01132988, "auxiliary_loss_mlp": 0.0102583, "balance_loss_clip": 1.04318023, "balance_loss_mlp": 1.01863313, "epoch": 0.7486322371189803, "flos": 29937613104000.0, "grad_norm": 1.5201661080899442, "language_loss": 0.68416023, "learning_rate": 6.268453405076943e-07, "loss": 0.70574844, "num_input_tokens_seen": 133888760, "step": 6226, "time_per_iteration": 2.747171401977539 }, { "auxiliary_loss_clip": 0.01141902, "auxiliary_loss_mlp": 0.01023239, "balance_loss_clip": 1.04579926, "balance_loss_mlp": 1.01594329, "epoch": 0.7487524800096195, "flos": 18949414734720.0, "grad_norm": 1.9065342277314996, "language_loss": 0.82434368, "learning_rate": 6.262790880528592e-07, "loss": 0.84599507, "num_input_tokens_seen": 133906380, "step": 6227, "time_per_iteration": 2.6401023864746094 }, { "auxiliary_loss_clip": 0.01135495, "auxiliary_loss_mlp": 0.01023107, "balance_loss_clip": 1.04067087, "balance_loss_mlp": 1.01534665, "epoch": 0.7488727229002585, "flos": 18697393935360.0, "grad_norm": 2.769678878544884, "language_loss": 0.8004936, "learning_rate": 6.257130439903951e-07, "loss": 0.82207966, "num_input_tokens_seen": 133922875, "step": 6228, "time_per_iteration": 2.797952890396118 }, { "auxiliary_loss_clip": 0.01171787, "auxiliary_loss_mlp": 0.01029138, "balance_loss_clip": 1.05088305, "balance_loss_mlp": 1.02200902, "epoch": 0.7489929657908976, "flos": 23623368168960.0, "grad_norm": 2.3649690285552265, "language_loss": 0.81214434, "learning_rate": 6.251472084061695e-07, "loss": 0.83415353, "num_input_tokens_seen": 133941795, "step": 6229, "time_per_iteration": 2.654466152191162 }, { "auxiliary_loss_clip": 0.01152695, "auxiliary_loss_mlp": 0.01029301, "balance_loss_clip": 1.04611039, "balance_loss_mlp": 1.0226047, "epoch": 0.7491132086815367, "flos": 20551533056640.0, "grad_norm": 2.1104288353780696, "language_loss": 0.89003074, "learning_rate": 6.245815813860191e-07, "loss": 0.91185069, "num_input_tokens_seen": 133957305, "step": 6230, "time_per_iteration": 2.674736976623535 }, { "auxiliary_loss_clip": 0.01168995, "auxiliary_loss_mlp": 0.01025352, "balance_loss_clip": 1.04644489, "balance_loss_mlp": 1.01758599, "epoch": 0.7492334515721758, "flos": 23003011353600.0, "grad_norm": 1.966029225582525, "language_loss": 0.70264196, "learning_rate": 6.240161630157495e-07, "loss": 0.72458547, "num_input_tokens_seen": 133976660, "step": 6231, "time_per_iteration": 2.6121299266815186 }, { "auxiliary_loss_clip": 0.01173144, "auxiliary_loss_mlp": 0.01024055, "balance_loss_clip": 1.04872179, "balance_loss_mlp": 1.01674449, "epoch": 0.7493536944628149, "flos": 16398823835520.0, "grad_norm": 2.8033757958578116, "language_loss": 0.70824087, "learning_rate": 6.23450953381133e-07, "loss": 0.73021287, "num_input_tokens_seen": 133994750, "step": 6232, "time_per_iteration": 2.5583012104034424 }, { "auxiliary_loss_clip": 0.01131681, "auxiliary_loss_mlp": 0.0102197, "balance_loss_clip": 1.04079473, "balance_loss_mlp": 1.01529408, "epoch": 0.749473937353454, "flos": 15338561155200.0, "grad_norm": 2.071902799841961, "language_loss": 0.68056297, "learning_rate": 6.228859525679131e-07, "loss": 0.70209944, "num_input_tokens_seen": 134009165, "step": 6233, "time_per_iteration": 2.71573805809021 }, { "auxiliary_loss_clip": 0.01154081, "auxiliary_loss_mlp": 0.01022456, "balance_loss_clip": 1.0457387, "balance_loss_mlp": 1.01558995, "epoch": 0.7495941802440931, "flos": 18951138587520.0, "grad_norm": 2.0610921477578454, "language_loss": 0.80419755, "learning_rate": 6.223211606617986e-07, "loss": 0.8259629, "num_input_tokens_seen": 134027585, "step": 6234, "time_per_iteration": 2.616936683654785 }, { "auxiliary_loss_clip": 0.01148718, "auxiliary_loss_mlp": 0.01026941, "balance_loss_clip": 1.04645324, "balance_loss_mlp": 1.02069139, "epoch": 0.7497144231347321, "flos": 22492469393280.0, "grad_norm": 1.8414731788455145, "language_loss": 0.84365821, "learning_rate": 6.217565777484701e-07, "loss": 0.8654148, "num_input_tokens_seen": 134046680, "step": 6235, "time_per_iteration": 2.6252260208129883 }, { "auxiliary_loss_clip": 0.01140598, "auxiliary_loss_mlp": 0.00761927, "balance_loss_clip": 1.04623854, "balance_loss_mlp": 1.00031388, "epoch": 0.7498346660253713, "flos": 24243509502720.0, "grad_norm": 2.4115485461250405, "language_loss": 0.80196118, "learning_rate": 6.211922039135722e-07, "loss": 0.82098639, "num_input_tokens_seen": 134066825, "step": 6236, "time_per_iteration": 2.7281253337860107 }, { "auxiliary_loss_clip": 0.01167969, "auxiliary_loss_mlp": 0.01027233, "balance_loss_clip": 1.04765785, "balance_loss_mlp": 1.02029192, "epoch": 0.7499549089160104, "flos": 24387080163840.0, "grad_norm": 1.898353851000896, "language_loss": 0.81011218, "learning_rate": 6.206280392427201e-07, "loss": 0.83206415, "num_input_tokens_seen": 134086410, "step": 6237, "time_per_iteration": 2.658667802810669 }, { "auxiliary_loss_clip": 0.01146407, "auxiliary_loss_mlp": 0.01024703, "balance_loss_clip": 1.0429287, "balance_loss_mlp": 1.01772928, "epoch": 0.7500751518066494, "flos": 34057320704640.0, "grad_norm": 1.7419043120328517, "language_loss": 0.73659861, "learning_rate": 6.200640838214983e-07, "loss": 0.75830972, "num_input_tokens_seen": 134109185, "step": 6238, "time_per_iteration": 2.7898871898651123 }, { "auxiliary_loss_clip": 0.0116755, "auxiliary_loss_mlp": 0.0102161, "balance_loss_clip": 1.04710031, "balance_loss_mlp": 1.01506889, "epoch": 0.7501953946972886, "flos": 18843586289280.0, "grad_norm": 2.100078406878541, "language_loss": 0.67188585, "learning_rate": 6.195003377354578e-07, "loss": 0.6937775, "num_input_tokens_seen": 134128455, "step": 6239, "time_per_iteration": 2.606393337249756 }, { "auxiliary_loss_clip": 0.01148993, "auxiliary_loss_mlp": 0.01026965, "balance_loss_clip": 1.04317188, "balance_loss_mlp": 1.01985395, "epoch": 0.7503156375879276, "flos": 20257675891200.0, "grad_norm": 2.740858987216959, "language_loss": 0.73855174, "learning_rate": 6.189368010701183e-07, "loss": 0.76031125, "num_input_tokens_seen": 134145515, "step": 6240, "time_per_iteration": 2.740865468978882 }, { "auxiliary_loss_clip": 0.01158915, "auxiliary_loss_mlp": 0.01023662, "balance_loss_clip": 1.04536927, "balance_loss_mlp": 1.01658702, "epoch": 0.7504358804785667, "flos": 13480040574720.0, "grad_norm": 1.8732397830881025, "language_loss": 0.76746249, "learning_rate": 6.183734739109683e-07, "loss": 0.78928822, "num_input_tokens_seen": 134163335, "step": 6241, "time_per_iteration": 2.6277778148651123 }, { "auxiliary_loss_clip": 0.01162419, "auxiliary_loss_mlp": 0.01027703, "balance_loss_clip": 1.04798865, "balance_loss_mlp": 1.02012706, "epoch": 0.7505561233692057, "flos": 29461042431360.0, "grad_norm": 2.387386378919897, "language_loss": 0.68524528, "learning_rate": 6.178103563434629e-07, "loss": 0.70714647, "num_input_tokens_seen": 134182335, "step": 6242, "time_per_iteration": 2.6819751262664795 }, { "auxiliary_loss_clip": 0.0116893, "auxiliary_loss_mlp": 0.01025698, "balance_loss_clip": 1.04800785, "balance_loss_mlp": 1.01837599, "epoch": 0.7506763662598449, "flos": 20302457172480.0, "grad_norm": 1.5757705144198286, "language_loss": 0.83963561, "learning_rate": 6.172474484530283e-07, "loss": 0.86158192, "num_input_tokens_seen": 134201070, "step": 6243, "time_per_iteration": 2.655743360519409 }, { "auxiliary_loss_clip": 0.01127742, "auxiliary_loss_mlp": 0.01024089, "balance_loss_clip": 1.0398035, "balance_loss_mlp": 1.01691318, "epoch": 0.750796609150484, "flos": 37230961939200.0, "grad_norm": 1.8384526624224773, "language_loss": 0.76193881, "learning_rate": 6.166847503250563e-07, "loss": 0.78345716, "num_input_tokens_seen": 134223310, "step": 6244, "time_per_iteration": 2.829166889190674 }, { "auxiliary_loss_clip": 0.01138492, "auxiliary_loss_mlp": 0.01022396, "balance_loss_clip": 1.04188788, "balance_loss_mlp": 1.01467109, "epoch": 0.750916852041123, "flos": 19609417186560.0, "grad_norm": 4.3962857688074815, "language_loss": 0.79270506, "learning_rate": 6.161222620449078e-07, "loss": 0.81431389, "num_input_tokens_seen": 134242085, "step": 6245, "time_per_iteration": 3.6289732456207275 }, { "auxiliary_loss_clip": 0.01127559, "auxiliary_loss_mlp": 0.01023462, "balance_loss_clip": 1.0438149, "balance_loss_mlp": 1.01585054, "epoch": 0.7510370949317622, "flos": 25112690807040.0, "grad_norm": 2.173995947960605, "language_loss": 0.80073798, "learning_rate": 6.155599836979117e-07, "loss": 0.82224822, "num_input_tokens_seen": 134260770, "step": 6246, "time_per_iteration": 2.7568163871765137 }, { "auxiliary_loss_clip": 0.01111068, "auxiliary_loss_mlp": 0.01027821, "balance_loss_clip": 1.0392102, "balance_loss_mlp": 1.02051318, "epoch": 0.7511573378224012, "flos": 19062282245760.0, "grad_norm": 1.9568674146395753, "language_loss": 0.81919175, "learning_rate": 6.149979153693649e-07, "loss": 0.84058058, "num_input_tokens_seen": 134278025, "step": 6247, "time_per_iteration": 2.7254300117492676 }, { "auxiliary_loss_clip": 0.01151893, "auxiliary_loss_mlp": 0.01030623, "balance_loss_clip": 1.0448662, "balance_loss_mlp": 1.0231545, "epoch": 0.7512775807130403, "flos": 19937676602880.0, "grad_norm": 1.902443383708817, "language_loss": 0.76787221, "learning_rate": 6.144360571445343e-07, "loss": 0.78969735, "num_input_tokens_seen": 134297170, "step": 6248, "time_per_iteration": 3.6428720951080322 }, { "auxiliary_loss_clip": 0.0115165, "auxiliary_loss_mlp": 0.01027596, "balance_loss_clip": 1.04630089, "balance_loss_mlp": 1.02063775, "epoch": 0.7513978236036795, "flos": 20739920912640.0, "grad_norm": 1.6579267789732461, "language_loss": 0.80161631, "learning_rate": 6.138744091086509e-07, "loss": 0.82340872, "num_input_tokens_seen": 134316755, "step": 6249, "time_per_iteration": 3.6026813983917236 }, { "auxiliary_loss_clip": 0.01130555, "auxiliary_loss_mlp": 0.01025326, "balance_loss_clip": 1.04616833, "balance_loss_mlp": 1.01805711, "epoch": 0.7515180664943185, "flos": 27563163523200.0, "grad_norm": 2.2298137738863137, "language_loss": 0.73009145, "learning_rate": 6.133129713469183e-07, "loss": 0.75165021, "num_input_tokens_seen": 134335960, "step": 6250, "time_per_iteration": 2.8123538494110107 }, { "auxiliary_loss_clip": 0.0113161, "auxiliary_loss_mlp": 0.01021473, "balance_loss_clip": 1.03925323, "balance_loss_mlp": 1.01411736, "epoch": 0.7516383093849576, "flos": 33803181002880.0, "grad_norm": 1.9319404483968476, "language_loss": 0.64297009, "learning_rate": 6.127517439445053e-07, "loss": 0.66450095, "num_input_tokens_seen": 134356805, "step": 6251, "time_per_iteration": 3.787219762802124 }, { "auxiliary_loss_clip": 0.01106763, "auxiliary_loss_mlp": 0.01025151, "balance_loss_clip": 1.04335487, "balance_loss_mlp": 1.01821876, "epoch": 0.7517585522755967, "flos": 29746172592000.0, "grad_norm": 2.714070573129891, "language_loss": 0.81470859, "learning_rate": 6.121907269865498e-07, "loss": 0.83602774, "num_input_tokens_seen": 134376295, "step": 6252, "time_per_iteration": 2.7680513858795166 }, { "auxiliary_loss_clip": 0.01031149, "auxiliary_loss_mlp": 0.01002061, "balance_loss_clip": 1.01180577, "balance_loss_mlp": 1.0010004, "epoch": 0.7518787951662358, "flos": 69807974319360.0, "grad_norm": 0.9216200366596135, "language_loss": 0.67246389, "learning_rate": 6.116299205581577e-07, "loss": 0.69279599, "num_input_tokens_seen": 134431125, "step": 6253, "time_per_iteration": 3.250988721847534 }, { "auxiliary_loss_clip": 0.0117543, "auxiliary_loss_mlp": 0.01023179, "balance_loss_clip": 1.05035937, "balance_loss_mlp": 1.01551056, "epoch": 0.7519990380568748, "flos": 34203225749760.0, "grad_norm": 1.8874540851359076, "language_loss": 0.68144083, "learning_rate": 6.110693247444018e-07, "loss": 0.70342696, "num_input_tokens_seen": 134452960, "step": 6254, "time_per_iteration": 2.6999900341033936 }, { "auxiliary_loss_clip": 0.01116706, "auxiliary_loss_mlp": 0.01026811, "balance_loss_clip": 1.04227161, "balance_loss_mlp": 1.01934004, "epoch": 0.752119280947514, "flos": 21725704742400.0, "grad_norm": 1.7365892652781167, "language_loss": 0.82121229, "learning_rate": 6.105089396303258e-07, "loss": 0.84264749, "num_input_tokens_seen": 134471350, "step": 6255, "time_per_iteration": 2.7478363513946533 }, { "auxiliary_loss_clip": 0.01140972, "auxiliary_loss_mlp": 0.01025814, "balance_loss_clip": 1.04336381, "balance_loss_mlp": 1.01799703, "epoch": 0.7522395238381531, "flos": 32742774668160.0, "grad_norm": 1.7950138831081461, "language_loss": 0.75520027, "learning_rate": 6.099487653009383e-07, "loss": 0.7768681, "num_input_tokens_seen": 134490695, "step": 6256, "time_per_iteration": 2.756467580795288 }, { "auxiliary_loss_clip": 0.01150528, "auxiliary_loss_mlp": 0.01027792, "balance_loss_clip": 1.04397273, "balance_loss_mlp": 1.02157807, "epoch": 0.7523597667287921, "flos": 23476026579840.0, "grad_norm": 1.9650594538959594, "language_loss": 0.82834804, "learning_rate": 6.093888018412192e-07, "loss": 0.85013115, "num_input_tokens_seen": 134506885, "step": 6257, "time_per_iteration": 2.6924142837524414 }, { "auxiliary_loss_clip": 0.01059174, "auxiliary_loss_mlp": 0.01000818, "balance_loss_clip": 1.01287508, "balance_loss_mlp": 0.99971503, "epoch": 0.7524800096194313, "flos": 67346730501120.0, "grad_norm": 0.7212498088554035, "language_loss": 0.54658824, "learning_rate": 6.088290493361125e-07, "loss": 0.56718814, "num_input_tokens_seen": 134571770, "step": 6258, "time_per_iteration": 3.4147026538848877 }, { "auxiliary_loss_clip": 0.01102411, "auxiliary_loss_mlp": 0.01027819, "balance_loss_clip": 1.038854, "balance_loss_mlp": 1.02052057, "epoch": 0.7526002525100703, "flos": 13006055681280.0, "grad_norm": 1.92976448761389, "language_loss": 0.7166273, "learning_rate": 6.082695078705322e-07, "loss": 0.73792958, "num_input_tokens_seen": 134589250, "step": 6259, "time_per_iteration": 2.7262258529663086 }, { "auxiliary_loss_clip": 0.0114762, "auxiliary_loss_mlp": 0.01026099, "balance_loss_clip": 1.04550135, "balance_loss_mlp": 1.01839244, "epoch": 0.7527204954007094, "flos": 21397229844480.0, "grad_norm": 2.3586720533326906, "language_loss": 0.68780529, "learning_rate": 6.077101775293618e-07, "loss": 0.70954251, "num_input_tokens_seen": 134608075, "step": 6260, "time_per_iteration": 2.6716935634613037 }, { "auxiliary_loss_clip": 0.01152837, "auxiliary_loss_mlp": 0.01027175, "balance_loss_clip": 1.04348385, "balance_loss_mlp": 1.01972485, "epoch": 0.7528407382913486, "flos": 18947188091520.0, "grad_norm": 3.2127736831010156, "language_loss": 0.82659423, "learning_rate": 6.071510583974504e-07, "loss": 0.84839433, "num_input_tokens_seen": 134623260, "step": 6261, "time_per_iteration": 2.6255314350128174 }, { "auxiliary_loss_clip": 0.01169446, "auxiliary_loss_mlp": 0.01025248, "balance_loss_clip": 1.04838395, "balance_loss_mlp": 1.01792598, "epoch": 0.7529609811819876, "flos": 15231798956160.0, "grad_norm": 2.4914882366134874, "language_loss": 0.72004449, "learning_rate": 6.065921505596161e-07, "loss": 0.7419914, "num_input_tokens_seen": 134641540, "step": 6262, "time_per_iteration": 2.6495933532714844 }, { "auxiliary_loss_clip": 0.01125055, "auxiliary_loss_mlp": 0.01023078, "balance_loss_clip": 1.04328561, "balance_loss_mlp": 1.0162828, "epoch": 0.7530812240726267, "flos": 19354487385600.0, "grad_norm": 1.619160954196703, "language_loss": 0.7707786, "learning_rate": 6.060334541006445e-07, "loss": 0.79225993, "num_input_tokens_seen": 134660035, "step": 6263, "time_per_iteration": 2.7348968982696533 }, { "auxiliary_loss_clip": 0.01125662, "auxiliary_loss_mlp": 0.01023989, "balance_loss_clip": 1.03913116, "balance_loss_mlp": 1.01731038, "epoch": 0.7532014669632658, "flos": 27748247328000.0, "grad_norm": 1.68107783548805, "language_loss": 0.69220817, "learning_rate": 6.05474969105289e-07, "loss": 0.71370465, "num_input_tokens_seen": 134683025, "step": 6264, "time_per_iteration": 2.850309133529663 }, { "auxiliary_loss_clip": 0.01155215, "auxiliary_loss_mlp": 0.01035135, "balance_loss_clip": 1.04662347, "balance_loss_mlp": 1.02681148, "epoch": 0.7533217098539049, "flos": 14137421333760.0, "grad_norm": 2.1339999725996104, "language_loss": 0.73980951, "learning_rate": 6.049166956582725e-07, "loss": 0.76171303, "num_input_tokens_seen": 134701290, "step": 6265, "time_per_iteration": 2.6398746967315674 }, { "auxiliary_loss_clip": 0.01146748, "auxiliary_loss_mlp": 0.01020411, "balance_loss_clip": 1.04198337, "balance_loss_mlp": 1.01370001, "epoch": 0.753441952744544, "flos": 26429068437120.0, "grad_norm": 2.2045831395176227, "language_loss": 0.87417853, "learning_rate": 6.043586338442841e-07, "loss": 0.89585012, "num_input_tokens_seen": 134720345, "step": 6266, "time_per_iteration": 2.644554853439331 }, { "auxiliary_loss_clip": 0.01165176, "auxiliary_loss_mlp": 0.01023187, "balance_loss_clip": 1.04734397, "balance_loss_mlp": 1.016783, "epoch": 0.7535621956351831, "flos": 23878621192320.0, "grad_norm": 2.0890091406381948, "language_loss": 0.73176897, "learning_rate": 6.038007837479815e-07, "loss": 0.75365257, "num_input_tokens_seen": 134741450, "step": 6267, "time_per_iteration": 2.680832862854004 }, { "auxiliary_loss_clip": 0.01150759, "auxiliary_loss_mlp": 0.01020997, "balance_loss_clip": 1.0453527, "balance_loss_mlp": 1.01413631, "epoch": 0.7536824385258222, "flos": 21795873960960.0, "grad_norm": 1.9098141274986997, "language_loss": 0.64090174, "learning_rate": 6.032431454539897e-07, "loss": 0.66261935, "num_input_tokens_seen": 134760295, "step": 6268, "time_per_iteration": 2.623098850250244 }, { "auxiliary_loss_clip": 0.01125534, "auxiliary_loss_mlp": 0.01026599, "balance_loss_clip": 1.04108059, "balance_loss_mlp": 1.01968229, "epoch": 0.7538026814164612, "flos": 28911644933760.0, "grad_norm": 2.1044252606337093, "language_loss": 0.81484723, "learning_rate": 6.026857190469014e-07, "loss": 0.83636862, "num_input_tokens_seen": 134782050, "step": 6269, "time_per_iteration": 2.7838385105133057 }, { "auxiliary_loss_clip": 0.01139588, "auxiliary_loss_mlp": 0.01022751, "balance_loss_clip": 1.04302239, "balance_loss_mlp": 1.01606953, "epoch": 0.7539229243071004, "flos": 21104701482240.0, "grad_norm": 1.845781272394144, "language_loss": 0.73953533, "learning_rate": 6.0212850461128e-07, "loss": 0.76115876, "num_input_tokens_seen": 134801170, "step": 6270, "time_per_iteration": 2.6962227821350098 }, { "auxiliary_loss_clip": 0.0114211, "auxiliary_loss_mlp": 0.01024368, "balance_loss_clip": 1.04209018, "balance_loss_mlp": 1.0170846, "epoch": 0.7540431671977395, "flos": 15158469340800.0, "grad_norm": 1.991840436484, "language_loss": 0.74586666, "learning_rate": 6.015715022316516e-07, "loss": 0.76753151, "num_input_tokens_seen": 134819150, "step": 6271, "time_per_iteration": 3.6456661224365234 }, { "auxiliary_loss_clip": 0.01113716, "auxiliary_loss_mlp": 0.01025175, "balance_loss_clip": 1.04074001, "balance_loss_mlp": 1.01766753, "epoch": 0.7541634100883785, "flos": 18770579896320.0, "grad_norm": 2.4982370853941607, "language_loss": 0.77915907, "learning_rate": 6.010147119925154e-07, "loss": 0.80054802, "num_input_tokens_seen": 134836905, "step": 6272, "time_per_iteration": 2.77127742767334 }, { "auxiliary_loss_clip": 0.01116688, "auxiliary_loss_mlp": 0.01029193, "balance_loss_clip": 1.04189539, "balance_loss_mlp": 1.0219779, "epoch": 0.7542836529790176, "flos": 20594770053120.0, "grad_norm": 2.0037858236753183, "language_loss": 0.66322893, "learning_rate": 6.004581339783348e-07, "loss": 0.68468773, "num_input_tokens_seen": 134855225, "step": 6273, "time_per_iteration": 2.779801607131958 }, { "auxiliary_loss_clip": 0.0115582, "auxiliary_loss_mlp": 0.01026099, "balance_loss_clip": 1.04462612, "balance_loss_mlp": 1.01792741, "epoch": 0.7544038958696567, "flos": 19095104298240.0, "grad_norm": 3.2822716948708957, "language_loss": 0.68447018, "learning_rate": 5.999017682735425e-07, "loss": 0.70628935, "num_input_tokens_seen": 134871615, "step": 6274, "time_per_iteration": 3.6571638584136963 }, { "auxiliary_loss_clip": 0.01104622, "auxiliary_loss_mlp": 0.01026139, "balance_loss_clip": 1.03943324, "balance_loss_mlp": 1.01911449, "epoch": 0.7545241387602958, "flos": 31723306859520.0, "grad_norm": 2.0742244832419043, "language_loss": 0.66205305, "learning_rate": 5.993456149625387e-07, "loss": 0.6833607, "num_input_tokens_seen": 134892765, "step": 6275, "time_per_iteration": 3.8331286907196045 }, { "auxiliary_loss_clip": 0.01115563, "auxiliary_loss_mlp": 0.01025186, "balance_loss_clip": 1.04071331, "balance_loss_mlp": 1.01777995, "epoch": 0.7546443816509348, "flos": 20296495514880.0, "grad_norm": 1.8478160775785186, "language_loss": 0.82517052, "learning_rate": 5.987896741296909e-07, "loss": 0.846578, "num_input_tokens_seen": 134910505, "step": 6276, "time_per_iteration": 2.6575846672058105 }, { "auxiliary_loss_clip": 0.01140702, "auxiliary_loss_mlp": 0.01027951, "balance_loss_clip": 1.04748929, "balance_loss_mlp": 1.02131987, "epoch": 0.754764624541574, "flos": 23696159080320.0, "grad_norm": 2.292307033888658, "language_loss": 0.78502434, "learning_rate": 5.982339458593361e-07, "loss": 0.80671084, "num_input_tokens_seen": 134930445, "step": 6277, "time_per_iteration": 3.625814914703369 }, { "auxiliary_loss_clip": 0.01149517, "auxiliary_loss_mlp": 0.00761355, "balance_loss_clip": 1.04548478, "balance_loss_mlp": 1.00030673, "epoch": 0.7548848674322131, "flos": 25337204766720.0, "grad_norm": 1.6476814673120361, "language_loss": 0.84002179, "learning_rate": 5.976784302357767e-07, "loss": 0.8591305, "num_input_tokens_seen": 134951010, "step": 6278, "time_per_iteration": 2.6776795387268066 }, { "auxiliary_loss_clip": 0.01157508, "auxiliary_loss_mlp": 0.0102141, "balance_loss_clip": 1.04604244, "balance_loss_mlp": 1.01454067, "epoch": 0.7550051103228521, "flos": 19573147428480.0, "grad_norm": 2.4891009882633632, "language_loss": 0.73635876, "learning_rate": 5.971231273432855e-07, "loss": 0.75814795, "num_input_tokens_seen": 134970495, "step": 6279, "time_per_iteration": 2.5919971466064453 }, { "auxiliary_loss_clip": 0.01058302, "auxiliary_loss_mlp": 0.01000411, "balance_loss_clip": 1.01246738, "balance_loss_mlp": 0.99931473, "epoch": 0.7551253532134913, "flos": 64150068648960.0, "grad_norm": 0.8209992599150006, "language_loss": 0.54554403, "learning_rate": 5.965680372661e-07, "loss": 0.56613111, "num_input_tokens_seen": 135028060, "step": 6280, "time_per_iteration": 3.155048131942749 }, { "auxiliary_loss_clip": 0.01140195, "auxiliary_loss_mlp": 0.01022911, "balance_loss_clip": 1.0452441, "balance_loss_mlp": 1.01650357, "epoch": 0.7552455961041303, "flos": 26067986968320.0, "grad_norm": 1.8259104257775596, "language_loss": 0.56418502, "learning_rate": 5.960131600884266e-07, "loss": 0.58581614, "num_input_tokens_seen": 135047330, "step": 6281, "time_per_iteration": 2.71712327003479 }, { "auxiliary_loss_clip": 0.01127012, "auxiliary_loss_mlp": 0.01019824, "balance_loss_clip": 1.04306006, "balance_loss_mlp": 1.0128355, "epoch": 0.7553658389947694, "flos": 24498223822080.0, "grad_norm": 1.7241970089663998, "language_loss": 0.75961745, "learning_rate": 5.954584958944413e-07, "loss": 0.78108585, "num_input_tokens_seen": 135065995, "step": 6282, "time_per_iteration": 2.787997245788574 }, { "auxiliary_loss_clip": 0.01125872, "auxiliary_loss_mlp": 0.00761967, "balance_loss_clip": 1.04066503, "balance_loss_mlp": 1.0003531, "epoch": 0.7554860818854086, "flos": 21799465320960.0, "grad_norm": 2.410344293148737, "language_loss": 0.81771076, "learning_rate": 5.949040447682854e-07, "loss": 0.83658916, "num_input_tokens_seen": 135085820, "step": 6283, "time_per_iteration": 2.814040184020996 }, { "auxiliary_loss_clip": 0.01144912, "auxiliary_loss_mlp": 0.01024653, "balance_loss_clip": 1.04377341, "balance_loss_mlp": 1.0174017, "epoch": 0.7556063247760476, "flos": 16362123114240.0, "grad_norm": 2.1649497122356043, "language_loss": 0.677809, "learning_rate": 5.943498067940686e-07, "loss": 0.69950467, "num_input_tokens_seen": 135102845, "step": 6284, "time_per_iteration": 2.807706832885742 }, { "auxiliary_loss_clip": 0.01133512, "auxiliary_loss_mlp": 0.01023528, "balance_loss_clip": 1.0471431, "balance_loss_mlp": 1.01707244, "epoch": 0.7557265676666867, "flos": 27235155502080.0, "grad_norm": 1.8223366815993103, "language_loss": 0.81475723, "learning_rate": 5.937957820558686e-07, "loss": 0.83632755, "num_input_tokens_seen": 135122190, "step": 6285, "time_per_iteration": 2.815964937210083 }, { "auxiliary_loss_clip": 0.01049897, "auxiliary_loss_mlp": 0.01001324, "balance_loss_clip": 1.01238608, "balance_loss_mlp": 1.00016212, "epoch": 0.7558468105573258, "flos": 62189131415040.0, "grad_norm": 0.8460875014979042, "language_loss": 0.65260708, "learning_rate": 5.932419706377296e-07, "loss": 0.67311931, "num_input_tokens_seen": 135180495, "step": 6286, "time_per_iteration": 3.3413124084472656 }, { "auxiliary_loss_clip": 0.01121812, "auxiliary_loss_mlp": 0.01021534, "balance_loss_clip": 1.04454362, "balance_loss_mlp": 1.01358259, "epoch": 0.7559670534479649, "flos": 33249078823680.0, "grad_norm": 1.9123176685865677, "language_loss": 0.74025619, "learning_rate": 5.92688372623666e-07, "loss": 0.76168966, "num_input_tokens_seen": 135199200, "step": 6287, "time_per_iteration": 2.8124232292175293 }, { "auxiliary_loss_clip": 0.01151264, "auxiliary_loss_mlp": 0.01021414, "balance_loss_clip": 1.04313421, "balance_loss_mlp": 1.01372528, "epoch": 0.7560872963386039, "flos": 14064379027200.0, "grad_norm": 2.013960057381503, "language_loss": 0.74263859, "learning_rate": 5.921349880976574e-07, "loss": 0.76436538, "num_input_tokens_seen": 135217035, "step": 6288, "time_per_iteration": 2.920696496963501 }, { "auxiliary_loss_clip": 0.01141504, "auxiliary_loss_mlp": 0.00762606, "balance_loss_clip": 1.04253244, "balance_loss_mlp": 1.00036418, "epoch": 0.7562075392292431, "flos": 20412307941120.0, "grad_norm": 1.9475340084046477, "language_loss": 0.81498098, "learning_rate": 5.915818171436515e-07, "loss": 0.8340221, "num_input_tokens_seen": 135236370, "step": 6289, "time_per_iteration": 2.689699649810791 }, { "auxiliary_loss_clip": 0.01138041, "auxiliary_loss_mlp": 0.01025891, "balance_loss_clip": 1.04131842, "balance_loss_mlp": 1.01816893, "epoch": 0.7563277821198822, "flos": 20376792368640.0, "grad_norm": 1.9624529616483757, "language_loss": 0.74726409, "learning_rate": 5.910288598455642e-07, "loss": 0.76890337, "num_input_tokens_seen": 135255720, "step": 6290, "time_per_iteration": 2.732272148132324 }, { "auxiliary_loss_clip": 0.01159495, "auxiliary_loss_mlp": 0.01026252, "balance_loss_clip": 1.04491186, "balance_loss_mlp": 1.01832485, "epoch": 0.7564480250105212, "flos": 18588261438720.0, "grad_norm": 2.1740456671534782, "language_loss": 0.74038965, "learning_rate": 5.90476116287278e-07, "loss": 0.76224715, "num_input_tokens_seen": 135273320, "step": 6291, "time_per_iteration": 2.669858932495117 }, { "auxiliary_loss_clip": 0.01141575, "auxiliary_loss_mlp": 0.01026765, "balance_loss_clip": 1.04640484, "balance_loss_mlp": 1.01952279, "epoch": 0.7565682679011604, "flos": 21215521918080.0, "grad_norm": 1.98625073258502, "language_loss": 0.67494714, "learning_rate": 5.899235865526456e-07, "loss": 0.69663048, "num_input_tokens_seen": 135292615, "step": 6292, "time_per_iteration": 2.654615879058838 }, { "auxiliary_loss_clip": 0.01117018, "auxiliary_loss_mlp": 0.01023383, "balance_loss_clip": 1.0410049, "balance_loss_mlp": 1.01685333, "epoch": 0.7566885107917994, "flos": 20449008662400.0, "grad_norm": 1.673209367793022, "language_loss": 0.82049394, "learning_rate": 5.893712707254825e-07, "loss": 0.84189796, "num_input_tokens_seen": 135310075, "step": 6293, "time_per_iteration": 2.7207071781158447 }, { "auxiliary_loss_clip": 0.01105763, "auxiliary_loss_mlp": 0.01022079, "balance_loss_clip": 1.0379169, "balance_loss_mlp": 1.01462269, "epoch": 0.7568087536824385, "flos": 19025832919680.0, "grad_norm": 2.6471451883831163, "language_loss": 0.66429102, "learning_rate": 5.888191688895769e-07, "loss": 0.68556941, "num_input_tokens_seen": 135327335, "step": 6294, "time_per_iteration": 2.8102660179138184 }, { "auxiliary_loss_clip": 0.01166598, "auxiliary_loss_mlp": 0.01023965, "balance_loss_clip": 1.04454422, "balance_loss_mlp": 1.01659775, "epoch": 0.7569289965730777, "flos": 15225442248960.0, "grad_norm": 2.4121090557416864, "language_loss": 0.62389433, "learning_rate": 5.882672811286813e-07, "loss": 0.64579993, "num_input_tokens_seen": 135343615, "step": 6295, "time_per_iteration": 2.718003988265991 }, { "auxiliary_loss_clip": 0.0116788, "auxiliary_loss_mlp": 0.01028573, "balance_loss_clip": 1.04474258, "balance_loss_mlp": 1.0209527, "epoch": 0.7570492394637167, "flos": 20769367086720.0, "grad_norm": 2.084705568977647, "language_loss": 0.6964274, "learning_rate": 5.877156075265166e-07, "loss": 0.71839195, "num_input_tokens_seen": 135359880, "step": 6296, "time_per_iteration": 2.7622368335723877 }, { "auxiliary_loss_clip": 0.01136152, "auxiliary_loss_mlp": 0.01028691, "balance_loss_clip": 1.04038179, "balance_loss_mlp": 1.02154446, "epoch": 0.7571694823543558, "flos": 15664091137920.0, "grad_norm": 3.2434354007642128, "language_loss": 0.70006472, "learning_rate": 5.871641481667715e-07, "loss": 0.72171319, "num_input_tokens_seen": 135374325, "step": 6297, "time_per_iteration": 3.6909573078155518 }, { "auxiliary_loss_clip": 0.0111357, "auxiliary_loss_mlp": 0.01029748, "balance_loss_clip": 1.04157495, "balance_loss_mlp": 1.02236915, "epoch": 0.7572897252449949, "flos": 25409241492480.0, "grad_norm": 1.9724741967299242, "language_loss": 0.84393597, "learning_rate": 5.866129031331011e-07, "loss": 0.8653692, "num_input_tokens_seen": 135393980, "step": 6298, "time_per_iteration": 2.8981802463531494 }, { "auxiliary_loss_clip": 0.01139921, "auxiliary_loss_mlp": 0.01028847, "balance_loss_clip": 1.0420382, "balance_loss_mlp": 1.0217452, "epoch": 0.757409968135634, "flos": 24279348297600.0, "grad_norm": 1.958729723651007, "language_loss": 0.83320141, "learning_rate": 5.8606187250913e-07, "loss": 0.85488909, "num_input_tokens_seen": 135412030, "step": 6299, "time_per_iteration": 2.7568917274475098 }, { "auxiliary_loss_clip": 0.01152813, "auxiliary_loss_mlp": 0.00761785, "balance_loss_clip": 1.04767704, "balance_loss_mlp": 1.00040877, "epoch": 0.757530211026273, "flos": 24133766474880.0, "grad_norm": 1.9047325448432209, "language_loss": 0.84423959, "learning_rate": 5.855110563784482e-07, "loss": 0.86338556, "num_input_tokens_seen": 135430565, "step": 6300, "time_per_iteration": 3.72444748878479 }, { "auxiliary_loss_clip": 0.01146225, "auxiliary_loss_mlp": 0.00762271, "balance_loss_clip": 1.04313529, "balance_loss_mlp": 1.0003593, "epoch": 0.7576504539169122, "flos": 23951807153280.0, "grad_norm": 1.6676511778433556, "language_loss": 0.64084601, "learning_rate": 5.849604548246156e-07, "loss": 0.65993094, "num_input_tokens_seen": 135451675, "step": 6301, "time_per_iteration": 3.6140666007995605 }, { "auxiliary_loss_clip": 0.01146713, "auxiliary_loss_mlp": 0.00762015, "balance_loss_clip": 1.04754448, "balance_loss_mlp": 1.00039482, "epoch": 0.7577706968075513, "flos": 21251360712960.0, "grad_norm": 4.049176499081369, "language_loss": 0.80364913, "learning_rate": 5.844100679311565e-07, "loss": 0.82273638, "num_input_tokens_seen": 135470635, "step": 6302, "time_per_iteration": 2.712498188018799 }, { "auxiliary_loss_clip": 0.01139141, "auxiliary_loss_mlp": 0.01025571, "balance_loss_clip": 1.0440588, "balance_loss_mlp": 1.01811481, "epoch": 0.7578909396981903, "flos": 18296595002880.0, "grad_norm": 2.613512436020556, "language_loss": 0.76440573, "learning_rate": 5.838598957815637e-07, "loss": 0.78605282, "num_input_tokens_seen": 135487865, "step": 6303, "time_per_iteration": 3.628082752227783 }, { "auxiliary_loss_clip": 0.01133443, "auxiliary_loss_mlp": 0.01029205, "balance_loss_clip": 1.04291451, "balance_loss_mlp": 1.02134883, "epoch": 0.7580111825888295, "flos": 25373869574400.0, "grad_norm": 2.109339797373425, "language_loss": 0.853652, "learning_rate": 5.833099384592996e-07, "loss": 0.87527847, "num_input_tokens_seen": 135508440, "step": 6304, "time_per_iteration": 2.781425952911377 }, { "auxiliary_loss_clip": 0.01136721, "auxiliary_loss_mlp": 0.01021591, "balance_loss_clip": 1.04525411, "balance_loss_mlp": 1.01417625, "epoch": 0.7581314254794685, "flos": 23768662682880.0, "grad_norm": 2.103217233452294, "language_loss": 0.71307147, "learning_rate": 5.827601960477913e-07, "loss": 0.73465455, "num_input_tokens_seen": 135526365, "step": 6305, "time_per_iteration": 2.834982395172119 }, { "auxiliary_loss_clip": 0.01148564, "auxiliary_loss_mlp": 0.01026133, "balance_loss_clip": 1.04179239, "balance_loss_mlp": 1.01910853, "epoch": 0.7582516683701076, "flos": 22054610603520.0, "grad_norm": 1.9277816443594353, "language_loss": 0.70513153, "learning_rate": 5.822106686304344e-07, "loss": 0.72687846, "num_input_tokens_seen": 135545655, "step": 6306, "time_per_iteration": 2.737581253051758 }, { "auxiliary_loss_clip": 0.01129601, "auxiliary_loss_mlp": 0.01026962, "balance_loss_clip": 1.04175305, "balance_loss_mlp": 1.01944017, "epoch": 0.7583719112607467, "flos": 31649725848960.0, "grad_norm": 1.9914277316950908, "language_loss": 0.57841611, "learning_rate": 5.816613562905919e-07, "loss": 0.59998178, "num_input_tokens_seen": 135566840, "step": 6307, "time_per_iteration": 2.7760770320892334 }, { "auxiliary_loss_clip": 0.01118079, "auxiliary_loss_mlp": 0.01021959, "balance_loss_clip": 1.04292512, "balance_loss_mlp": 1.0152688, "epoch": 0.7584921541513858, "flos": 33068376478080.0, "grad_norm": 1.8379004856668884, "language_loss": 0.70063239, "learning_rate": 5.811122591115933e-07, "loss": 0.72203279, "num_input_tokens_seen": 135587825, "step": 6308, "time_per_iteration": 2.811708927154541 }, { "auxiliary_loss_clip": 0.01122638, "auxiliary_loss_mlp": 0.01028988, "balance_loss_clip": 1.04425991, "balance_loss_mlp": 1.02158487, "epoch": 0.7586123970420249, "flos": 23326350606720.0, "grad_norm": 2.6134725234031464, "language_loss": 0.71178102, "learning_rate": 5.805633771767376e-07, "loss": 0.73329729, "num_input_tokens_seen": 135605220, "step": 6309, "time_per_iteration": 2.726496934890747 }, { "auxiliary_loss_clip": 0.01138096, "auxiliary_loss_mlp": 0.01028745, "balance_loss_clip": 1.04478073, "balance_loss_mlp": 1.02207184, "epoch": 0.7587326399326639, "flos": 18334229477760.0, "grad_norm": 1.7321532030601894, "language_loss": 0.77890545, "learning_rate": 5.800147105692888e-07, "loss": 0.80057389, "num_input_tokens_seen": 135624795, "step": 6310, "time_per_iteration": 2.730590343475342 }, { "auxiliary_loss_clip": 0.01152011, "auxiliary_loss_mlp": 0.01021953, "balance_loss_clip": 1.04198146, "balance_loss_mlp": 1.01490748, "epoch": 0.7588528828233031, "flos": 17275080119040.0, "grad_norm": 1.7672889420574105, "language_loss": 0.78823024, "learning_rate": 5.794662593724795e-07, "loss": 0.8099699, "num_input_tokens_seen": 135643800, "step": 6311, "time_per_iteration": 2.7096822261810303 }, { "auxiliary_loss_clip": 0.01171258, "auxiliary_loss_mlp": 0.01025534, "balance_loss_clip": 1.0502758, "balance_loss_mlp": 1.01806855, "epoch": 0.7589731257139422, "flos": 17713621267200.0, "grad_norm": 1.8929846373044263, "language_loss": 0.75111616, "learning_rate": 5.789180236695091e-07, "loss": 0.77308416, "num_input_tokens_seen": 135660655, "step": 6312, "time_per_iteration": 2.646646738052368 }, { "auxiliary_loss_clip": 0.01147637, "auxiliary_loss_mlp": 0.0102353, "balance_loss_clip": 1.04461527, "balance_loss_mlp": 1.01679218, "epoch": 0.7590933686045812, "flos": 15961072786560.0, "grad_norm": 2.3258460225069677, "language_loss": 0.85047269, "learning_rate": 5.78370003543544e-07, "loss": 0.8721844, "num_input_tokens_seen": 135679410, "step": 6313, "time_per_iteration": 2.649960517883301 }, { "auxiliary_loss_clip": 0.01153324, "auxiliary_loss_mlp": 0.00762347, "balance_loss_clip": 1.04476666, "balance_loss_mlp": 1.00034904, "epoch": 0.7592136114952204, "flos": 21068072588160.0, "grad_norm": 1.9855329163534234, "language_loss": 0.83486533, "learning_rate": 5.778221990777203e-07, "loss": 0.85402203, "num_input_tokens_seen": 135697150, "step": 6314, "time_per_iteration": 2.594696521759033 }, { "auxiliary_loss_clip": 0.01141951, "auxiliary_loss_mlp": 0.01025396, "balance_loss_clip": 1.04573607, "balance_loss_mlp": 1.01848507, "epoch": 0.7593338543858594, "flos": 25297666871040.0, "grad_norm": 2.63222025993635, "language_loss": 0.82578337, "learning_rate": 5.772746103551372e-07, "loss": 0.84745681, "num_input_tokens_seen": 135712545, "step": 6315, "time_per_iteration": 2.6482956409454346 }, { "auxiliary_loss_clip": 0.01135959, "auxiliary_loss_mlp": 0.01026729, "balance_loss_clip": 1.04506969, "balance_loss_mlp": 1.01991963, "epoch": 0.7594540972764985, "flos": 31832367528960.0, "grad_norm": 1.934637948013311, "language_loss": 0.71399981, "learning_rate": 5.767272374588648e-07, "loss": 0.7356267, "num_input_tokens_seen": 135733950, "step": 6316, "time_per_iteration": 2.7368102073669434 }, { "auxiliary_loss_clip": 0.01151554, "auxiliary_loss_mlp": 0.01028886, "balance_loss_clip": 1.04541874, "balance_loss_mlp": 1.02230525, "epoch": 0.7595743401671377, "flos": 37597250880000.0, "grad_norm": 1.701811029863731, "language_loss": 0.77918267, "learning_rate": 5.76180080471939e-07, "loss": 0.80098701, "num_input_tokens_seen": 135757120, "step": 6317, "time_per_iteration": 2.830780029296875 }, { "auxiliary_loss_clip": 0.01170569, "auxiliary_loss_mlp": 0.010253, "balance_loss_clip": 1.04730892, "balance_loss_mlp": 1.0175221, "epoch": 0.7596945830577767, "flos": 18287724343680.0, "grad_norm": 2.1494147126794445, "language_loss": 0.72327065, "learning_rate": 5.756331394773631e-07, "loss": 0.74522936, "num_input_tokens_seen": 135773335, "step": 6318, "time_per_iteration": 2.5440547466278076 }, { "auxiliary_loss_clip": 0.010949, "auxiliary_loss_mlp": 0.00762553, "balance_loss_clip": 1.03854334, "balance_loss_mlp": 1.00035882, "epoch": 0.7598148259484158, "flos": 22233122219520.0, "grad_norm": 2.0509616240386, "language_loss": 0.76128769, "learning_rate": 5.750864145581071e-07, "loss": 0.77986217, "num_input_tokens_seen": 135792555, "step": 6319, "time_per_iteration": 2.7838962078094482 }, { "auxiliary_loss_clip": 0.01167056, "auxiliary_loss_mlp": 0.01024775, "balance_loss_clip": 1.04672647, "balance_loss_mlp": 1.01815319, "epoch": 0.7599350688390549, "flos": 27161718145920.0, "grad_norm": 1.8486041513525464, "language_loss": 0.86402762, "learning_rate": 5.745399057971085e-07, "loss": 0.88594592, "num_input_tokens_seen": 135813690, "step": 6320, "time_per_iteration": 2.6997833251953125 }, { "auxiliary_loss_clip": 0.01158223, "auxiliary_loss_mlp": 0.01025118, "balance_loss_clip": 1.04598546, "balance_loss_mlp": 1.01755428, "epoch": 0.760055311729694, "flos": 15560704817280.0, "grad_norm": 2.3002389889355617, "language_loss": 0.75059491, "learning_rate": 5.739936132772738e-07, "loss": 0.77242839, "num_input_tokens_seen": 135832255, "step": 6321, "time_per_iteration": 2.6230013370513916 }, { "auxiliary_loss_clip": 0.01165725, "auxiliary_loss_mlp": 0.01025354, "balance_loss_clip": 1.04598498, "balance_loss_mlp": 1.01857364, "epoch": 0.760175554620333, "flos": 25155496840320.0, "grad_norm": 2.207653447871622, "language_loss": 0.74417233, "learning_rate": 5.734475370814733e-07, "loss": 0.76608312, "num_input_tokens_seen": 135851935, "step": 6322, "time_per_iteration": 3.543412446975708 }, { "auxiliary_loss_clip": 0.01156963, "auxiliary_loss_mlp": 0.01023815, "balance_loss_clip": 1.04455805, "balance_loss_mlp": 1.01653087, "epoch": 0.7602957975109722, "flos": 24353791234560.0, "grad_norm": 1.577349970172988, "language_loss": 0.78493333, "learning_rate": 5.729016772925483e-07, "loss": 0.80674112, "num_input_tokens_seen": 135873510, "step": 6323, "time_per_iteration": 2.6469149589538574 }, { "auxiliary_loss_clip": 0.01105845, "auxiliary_loss_mlp": 0.01021996, "balance_loss_clip": 1.04204798, "balance_loss_mlp": 1.01518941, "epoch": 0.7604160404016113, "flos": 25192664438400.0, "grad_norm": 1.8601796321705393, "language_loss": 0.7075597, "learning_rate": 5.723560339933038e-07, "loss": 0.72883815, "num_input_tokens_seen": 135893845, "step": 6324, "time_per_iteration": 2.8287336826324463 }, { "auxiliary_loss_clip": 0.01150857, "auxiliary_loss_mlp": 0.00761968, "balance_loss_clip": 1.04407251, "balance_loss_mlp": 1.00038838, "epoch": 0.7605362832922503, "flos": 29861841363840.0, "grad_norm": 1.9548997033109565, "language_loss": 0.65133667, "learning_rate": 5.71810607266513e-07, "loss": 0.67046487, "num_input_tokens_seen": 135912430, "step": 6325, "time_per_iteration": 2.7100303173065186 }, { "auxiliary_loss_clip": 0.01151256, "auxiliary_loss_mlp": 0.01033697, "balance_loss_clip": 1.04244256, "balance_loss_mlp": 1.02669084, "epoch": 0.7606565261828895, "flos": 13917935278080.0, "grad_norm": 1.8599563653281472, "language_loss": 0.60572016, "learning_rate": 5.712653971949184e-07, "loss": 0.62756968, "num_input_tokens_seen": 135930550, "step": 6326, "time_per_iteration": 3.9500253200531006 }, { "auxiliary_loss_clip": 0.01146184, "auxiliary_loss_mlp": 0.01024847, "balance_loss_clip": 1.04258132, "balance_loss_mlp": 1.01780438, "epoch": 0.7607767690735285, "flos": 18551273408640.0, "grad_norm": 2.5646036818967715, "language_loss": 0.75374019, "learning_rate": 5.707204038612268e-07, "loss": 0.77545047, "num_input_tokens_seen": 135947980, "step": 6327, "time_per_iteration": 3.6326229572296143 }, { "auxiliary_loss_clip": 0.01147881, "auxiliary_loss_mlp": 0.01030697, "balance_loss_clip": 1.0504446, "balance_loss_mlp": 1.02288282, "epoch": 0.7608970119641676, "flos": 20922993555840.0, "grad_norm": 2.288692000046473, "language_loss": 0.74147248, "learning_rate": 5.701756273481138e-07, "loss": 0.76325828, "num_input_tokens_seen": 135965400, "step": 6328, "time_per_iteration": 2.7432796955108643 }, { "auxiliary_loss_clip": 0.01143011, "auxiliary_loss_mlp": 0.01026098, "balance_loss_clip": 1.04347086, "balance_loss_mlp": 1.01945186, "epoch": 0.7610172548548068, "flos": 23807302738560.0, "grad_norm": 1.506493781097288, "language_loss": 0.74011517, "learning_rate": 5.696310677382212e-07, "loss": 0.76180625, "num_input_tokens_seen": 135986795, "step": 6329, "time_per_iteration": 2.7720091342926025 }, { "auxiliary_loss_clip": 0.01039133, "auxiliary_loss_mlp": 0.01000719, "balance_loss_clip": 1.0167532, "balance_loss_mlp": 0.99961627, "epoch": 0.7611374977454458, "flos": 66496580426880.0, "grad_norm": 0.8785653125016024, "language_loss": 0.6181308, "learning_rate": 5.690867251141576e-07, "loss": 0.6385293, "num_input_tokens_seen": 136053450, "step": 6330, "time_per_iteration": 4.412827014923096 }, { "auxiliary_loss_clip": 0.01159875, "auxiliary_loss_mlp": 0.01021172, "balance_loss_clip": 1.04485536, "balance_loss_mlp": 1.01412678, "epoch": 0.7612577406360849, "flos": 15633136592640.0, "grad_norm": 2.5654260077837896, "language_loss": 0.91710877, "learning_rate": 5.685425995585013e-07, "loss": 0.93891919, "num_input_tokens_seen": 136071375, "step": 6331, "time_per_iteration": 2.6193904876708984 }, { "auxiliary_loss_clip": 0.01050158, "auxiliary_loss_mlp": 0.01001462, "balance_loss_clip": 1.01288557, "balance_loss_mlp": 1.0003531, "epoch": 0.761377983526724, "flos": 60526253237760.0, "grad_norm": 0.7488647414752886, "language_loss": 0.59012389, "learning_rate": 5.679986911537935e-07, "loss": 0.61064005, "num_input_tokens_seen": 136138905, "step": 6332, "time_per_iteration": 3.3278043270111084 }, { "auxiliary_loss_clip": 0.01098419, "auxiliary_loss_mlp": 0.01024618, "balance_loss_clip": 1.04054308, "balance_loss_mlp": 1.01779056, "epoch": 0.7614982264173631, "flos": 35772522019200.0, "grad_norm": 1.8952726120378358, "language_loss": 0.66765219, "learning_rate": 5.674549999825462e-07, "loss": 0.68888259, "num_input_tokens_seen": 136161720, "step": 6333, "time_per_iteration": 2.86212420463562 }, { "auxiliary_loss_clip": 0.01058914, "auxiliary_loss_mlp": 0.0100197, "balance_loss_clip": 1.01274467, "balance_loss_mlp": 1.00092077, "epoch": 0.7616184693080021, "flos": 67925502345600.0, "grad_norm": 0.9139416715771943, "language_loss": 0.71372342, "learning_rate": 5.669115261272363e-07, "loss": 0.7343322, "num_input_tokens_seen": 136222040, "step": 6334, "time_per_iteration": 3.1977081298828125 }, { "auxiliary_loss_clip": 0.01156395, "auxiliary_loss_mlp": 0.01024446, "balance_loss_clip": 1.04763567, "balance_loss_mlp": 1.01739812, "epoch": 0.7617387121986413, "flos": 20521979141760.0, "grad_norm": 3.538803022576379, "language_loss": 0.72793818, "learning_rate": 5.663682696703081e-07, "loss": 0.74974668, "num_input_tokens_seen": 136240305, "step": 6335, "time_per_iteration": 2.6603240966796875 }, { "auxiliary_loss_clip": 0.01164982, "auxiliary_loss_mlp": 0.0102478, "balance_loss_clip": 1.04656601, "balance_loss_mlp": 1.01854563, "epoch": 0.7618589550892804, "flos": 18624495283200.0, "grad_norm": 1.8975780685722188, "language_loss": 0.81633151, "learning_rate": 5.658252306941746e-07, "loss": 0.83822912, "num_input_tokens_seen": 136259625, "step": 6336, "time_per_iteration": 2.5953853130340576 }, { "auxiliary_loss_clip": 0.01112332, "auxiliary_loss_mlp": 0.0103196, "balance_loss_clip": 1.04148638, "balance_loss_mlp": 1.0238899, "epoch": 0.7619791979799194, "flos": 17453735389440.0, "grad_norm": 2.0464550096887226, "language_loss": 0.75247151, "learning_rate": 5.65282409281212e-07, "loss": 0.77391446, "num_input_tokens_seen": 136277090, "step": 6337, "time_per_iteration": 2.761497974395752 }, { "auxiliary_loss_clip": 0.0113674, "auxiliary_loss_mlp": 0.01030616, "balance_loss_clip": 1.04385972, "balance_loss_mlp": 1.02317739, "epoch": 0.7620994408705585, "flos": 14137421333760.0, "grad_norm": 2.1791527272542366, "language_loss": 0.70025623, "learning_rate": 5.64739805513768e-07, "loss": 0.72192979, "num_input_tokens_seen": 136294635, "step": 6338, "time_per_iteration": 2.8429782390594482 }, { "auxiliary_loss_clip": 0.01052953, "auxiliary_loss_mlp": 0.0075377, "balance_loss_clip": 1.01101446, "balance_loss_mlp": 1.00028217, "epoch": 0.7622196837611976, "flos": 70708792527360.0, "grad_norm": 0.7873552627367454, "language_loss": 0.55733567, "learning_rate": 5.641974194741541e-07, "loss": 0.57540292, "num_input_tokens_seen": 136350320, "step": 6339, "time_per_iteration": 3.2008166313171387 }, { "auxiliary_loss_clip": 0.01043834, "auxiliary_loss_mlp": 0.01002054, "balance_loss_clip": 1.02042353, "balance_loss_mlp": 1.00082636, "epoch": 0.7623399266518367, "flos": 60684150447360.0, "grad_norm": 0.7793429766241025, "language_loss": 0.63742387, "learning_rate": 5.636552512446502e-07, "loss": 0.65788275, "num_input_tokens_seen": 136411375, "step": 6340, "time_per_iteration": 3.185504674911499 }, { "auxiliary_loss_clip": 0.01148551, "auxiliary_loss_mlp": 0.01024083, "balance_loss_clip": 1.04396617, "balance_loss_mlp": 1.0171361, "epoch": 0.7624601695424758, "flos": 26468893641600.0, "grad_norm": 1.74669787990895, "language_loss": 0.77981937, "learning_rate": 5.631133009075027e-07, "loss": 0.80154568, "num_input_tokens_seen": 136430560, "step": 6341, "time_per_iteration": 2.67034649848938 }, { "auxiliary_loss_clip": 0.0115496, "auxiliary_loss_mlp": 0.00761511, "balance_loss_clip": 1.04593194, "balance_loss_mlp": 1.00027597, "epoch": 0.7625804124331149, "flos": 19135755515520.0, "grad_norm": 2.50728885068984, "language_loss": 0.68546939, "learning_rate": 5.625715685449242e-07, "loss": 0.70463401, "num_input_tokens_seen": 136448665, "step": 6342, "time_per_iteration": 2.6798665523529053 }, { "auxiliary_loss_clip": 0.01121216, "auxiliary_loss_mlp": 0.01025769, "balance_loss_clip": 1.04516602, "balance_loss_mlp": 1.0189501, "epoch": 0.762700655323754, "flos": 26213101914240.0, "grad_norm": 1.5596743939029112, "language_loss": 0.71842062, "learning_rate": 5.620300542390966e-07, "loss": 0.73989046, "num_input_tokens_seen": 136469710, "step": 6343, "time_per_iteration": 2.7036850452423096 }, { "auxiliary_loss_clip": 0.01133492, "auxiliary_loss_mlp": 0.01022673, "balance_loss_clip": 1.0401938, "balance_loss_mlp": 1.01611972, "epoch": 0.762820898214393, "flos": 22382582711040.0, "grad_norm": 1.902244675232021, "language_loss": 0.8502233, "learning_rate": 5.614887580721659e-07, "loss": 0.87178493, "num_input_tokens_seen": 136489855, "step": 6344, "time_per_iteration": 2.682849168777466 }, { "auxiliary_loss_clip": 0.01119389, "auxiliary_loss_mlp": 0.01024952, "balance_loss_clip": 1.04425931, "balance_loss_mlp": 1.0180912, "epoch": 0.7629411411050322, "flos": 15700504550400.0, "grad_norm": 2.0102255678684124, "language_loss": 0.73986721, "learning_rate": 5.609476801262481e-07, "loss": 0.76131058, "num_input_tokens_seen": 136504715, "step": 6345, "time_per_iteration": 2.6353251934051514 }, { "auxiliary_loss_clip": 0.01122146, "auxiliary_loss_mlp": 0.01027645, "balance_loss_clip": 1.04372835, "balance_loss_mlp": 1.02015829, "epoch": 0.7630613839956712, "flos": 13770342293760.0, "grad_norm": 2.311413130055111, "language_loss": 0.64524722, "learning_rate": 5.604068204834223e-07, "loss": 0.66674513, "num_input_tokens_seen": 136521610, "step": 6346, "time_per_iteration": 2.73693585395813 }, { "auxiliary_loss_clip": 0.01105011, "auxiliary_loss_mlp": 0.00762409, "balance_loss_clip": 1.03952456, "balance_loss_mlp": 1.00034845, "epoch": 0.7631816268863103, "flos": 14569569861120.0, "grad_norm": 1.9812480603412777, "language_loss": 0.76561207, "learning_rate": 5.598661792257367e-07, "loss": 0.78428626, "num_input_tokens_seen": 136538655, "step": 6347, "time_per_iteration": 2.685518741607666 }, { "auxiliary_loss_clip": 0.01150189, "auxiliary_loss_mlp": 0.01024747, "balance_loss_clip": 1.04259109, "balance_loss_mlp": 1.01796365, "epoch": 0.7633018697769495, "flos": 19062210418560.0, "grad_norm": 2.5827807757956402, "language_loss": 0.76025748, "learning_rate": 5.593257564352071e-07, "loss": 0.78200686, "num_input_tokens_seen": 136557095, "step": 6348, "time_per_iteration": 3.5536322593688965 }, { "auxiliary_loss_clip": 0.01148201, "auxiliary_loss_mlp": 0.01022563, "balance_loss_clip": 1.04392862, "balance_loss_mlp": 1.01606655, "epoch": 0.7634221126675885, "flos": 22052958577920.0, "grad_norm": 1.5142091001149238, "language_loss": 0.75567925, "learning_rate": 5.58785552193815e-07, "loss": 0.7773869, "num_input_tokens_seen": 136577340, "step": 6349, "time_per_iteration": 2.6998536586761475 }, { "auxiliary_loss_clip": 0.01166919, "auxiliary_loss_mlp": 0.01021793, "balance_loss_clip": 1.04712105, "balance_loss_mlp": 1.01462841, "epoch": 0.7635423555582276, "flos": 29382720825600.0, "grad_norm": 1.82661615421463, "language_loss": 0.75546229, "learning_rate": 5.582455665835086e-07, "loss": 0.77734947, "num_input_tokens_seen": 136597635, "step": 6350, "time_per_iteration": 2.6439085006713867 }, { "auxiliary_loss_clip": 0.01150047, "auxiliary_loss_mlp": 0.01027394, "balance_loss_clip": 1.04400098, "balance_loss_mlp": 1.01936531, "epoch": 0.7636625984488667, "flos": 17784903807360.0, "grad_norm": 2.6605995533690088, "language_loss": 0.7228651, "learning_rate": 5.577057996862036e-07, "loss": 0.74463952, "num_input_tokens_seen": 136615260, "step": 6351, "time_per_iteration": 2.6813716888427734 }, { "auxiliary_loss_clip": 0.011642, "auxiliary_loss_mlp": 0.01022926, "balance_loss_clip": 1.04624605, "balance_loss_mlp": 1.01634896, "epoch": 0.7637828413395058, "flos": 23734583654400.0, "grad_norm": 1.6352495113543222, "language_loss": 0.76404732, "learning_rate": 5.571662515837814e-07, "loss": 0.78591859, "num_input_tokens_seen": 136637220, "step": 6352, "time_per_iteration": 3.5616705417633057 }, { "auxiliary_loss_clip": 0.01139656, "auxiliary_loss_mlp": 0.0102717, "balance_loss_clip": 1.04367375, "balance_loss_mlp": 1.02039027, "epoch": 0.7639030842301449, "flos": 36283279461120.0, "grad_norm": 2.0516868437954585, "language_loss": 0.83547992, "learning_rate": 5.566269223580926e-07, "loss": 0.85714817, "num_input_tokens_seen": 136658930, "step": 6353, "time_per_iteration": 3.6651198863983154 }, { "auxiliary_loss_clip": 0.0115741, "auxiliary_loss_mlp": 0.01027687, "balance_loss_clip": 1.04811454, "balance_loss_mlp": 1.02102351, "epoch": 0.764023327120784, "flos": 28878104609280.0, "grad_norm": 1.7261056785343323, "language_loss": 0.75433356, "learning_rate": 5.560878120909511e-07, "loss": 0.77618456, "num_input_tokens_seen": 136681530, "step": 6354, "time_per_iteration": 2.7422101497650146 }, { "auxiliary_loss_clip": 0.01058752, "auxiliary_loss_mlp": 0.0100234, "balance_loss_clip": 1.01327538, "balance_loss_mlp": 1.00124955, "epoch": 0.7641435700114231, "flos": 64789711067520.0, "grad_norm": 0.8571558431857784, "language_loss": 0.5851692, "learning_rate": 5.55548920864141e-07, "loss": 0.60578012, "num_input_tokens_seen": 136742185, "step": 6355, "time_per_iteration": 4.180971145629883 }, { "auxiliary_loss_clip": 0.0115465, "auxiliary_loss_mlp": 0.01024788, "balance_loss_clip": 1.04810858, "balance_loss_mlp": 1.01815677, "epoch": 0.7642638129020621, "flos": 16835784785280.0, "grad_norm": 1.588978036743412, "language_loss": 0.77831405, "learning_rate": 5.550102487594113e-07, "loss": 0.80010843, "num_input_tokens_seen": 136760855, "step": 6356, "time_per_iteration": 2.6153922080993652 }, { "auxiliary_loss_clip": 0.01111972, "auxiliary_loss_mlp": 0.00762198, "balance_loss_clip": 1.03862047, "balance_loss_mlp": 1.00045729, "epoch": 0.7643840557927013, "flos": 30408940391040.0, "grad_norm": 1.6256586618319135, "language_loss": 0.71459651, "learning_rate": 5.54471795858477e-07, "loss": 0.73333824, "num_input_tokens_seen": 136780925, "step": 6357, "time_per_iteration": 2.7946784496307373 }, { "auxiliary_loss_clip": 0.01124082, "auxiliary_loss_mlp": 0.01020972, "balance_loss_clip": 1.03943038, "balance_loss_mlp": 1.01452017, "epoch": 0.7645042986833404, "flos": 16983234115200.0, "grad_norm": 1.8972945962205445, "language_loss": 0.82961899, "learning_rate": 5.539335622430235e-07, "loss": 0.85106957, "num_input_tokens_seen": 136799545, "step": 6358, "time_per_iteration": 2.648392677307129 }, { "auxiliary_loss_clip": 0.01145322, "auxiliary_loss_mlp": 0.01022217, "balance_loss_clip": 1.04190731, "balance_loss_mlp": 1.01554406, "epoch": 0.7646245415739794, "flos": 17311493531520.0, "grad_norm": 1.9391787754919432, "language_loss": 0.74411118, "learning_rate": 5.533955479946975e-07, "loss": 0.76578653, "num_input_tokens_seen": 136818325, "step": 6359, "time_per_iteration": 2.6658008098602295 }, { "auxiliary_loss_clip": 0.01036751, "auxiliary_loss_mlp": 0.00753627, "balance_loss_clip": 1.02209842, "balance_loss_mlp": 1.00019169, "epoch": 0.7647447844646186, "flos": 70402332666240.0, "grad_norm": 0.8588667615629321, "language_loss": 0.6581229, "learning_rate": 5.528577531951173e-07, "loss": 0.6760267, "num_input_tokens_seen": 136878730, "step": 6360, "time_per_iteration": 3.2270116806030273 }, { "auxiliary_loss_clip": 0.0113993, "auxiliary_loss_mlp": 0.01022884, "balance_loss_clip": 1.04232228, "balance_loss_mlp": 1.01636052, "epoch": 0.7648650273552576, "flos": 17675914965120.0, "grad_norm": 1.9049725639900656, "language_loss": 0.73698354, "learning_rate": 5.523201779258653e-07, "loss": 0.75861168, "num_input_tokens_seen": 136897705, "step": 6361, "time_per_iteration": 2.606013536453247 }, { "auxiliary_loss_clip": 0.01165237, "auxiliary_loss_mlp": 0.01025246, "balance_loss_clip": 1.04492331, "balance_loss_mlp": 1.0177958, "epoch": 0.7649852702458967, "flos": 22162019247360.0, "grad_norm": 1.703063274518249, "language_loss": 0.84262884, "learning_rate": 5.517828222684912e-07, "loss": 0.86453366, "num_input_tokens_seen": 136918360, "step": 6362, "time_per_iteration": 2.6017954349517822 }, { "auxiliary_loss_clip": 0.01043643, "auxiliary_loss_mlp": 0.01000853, "balance_loss_clip": 1.01116824, "balance_loss_mlp": 0.99974996, "epoch": 0.7651055131365359, "flos": 69848338227840.0, "grad_norm": 0.7729796986747877, "language_loss": 0.59037185, "learning_rate": 5.512456863045117e-07, "loss": 0.61081672, "num_input_tokens_seen": 136979050, "step": 6363, "time_per_iteration": 3.2389886379241943 }, { "auxiliary_loss_clip": 0.01168057, "auxiliary_loss_mlp": 0.01024908, "balance_loss_clip": 1.0462383, "balance_loss_mlp": 1.01824093, "epoch": 0.7652257560271749, "flos": 19464014931840.0, "grad_norm": 1.7538903395187544, "language_loss": 0.74465001, "learning_rate": 5.507087701154089e-07, "loss": 0.76657969, "num_input_tokens_seen": 136998970, "step": 6364, "time_per_iteration": 2.6605122089385986 }, { "auxiliary_loss_clip": 0.01114568, "auxiliary_loss_mlp": 0.01027344, "balance_loss_clip": 1.04169679, "balance_loss_mlp": 1.02022398, "epoch": 0.765345998917814, "flos": 15961108700160.0, "grad_norm": 1.9669878462908983, "language_loss": 0.75689614, "learning_rate": 5.50172073782634e-07, "loss": 0.77831519, "num_input_tokens_seen": 137016950, "step": 6365, "time_per_iteration": 2.7000157833099365 }, { "auxiliary_loss_clip": 0.01124328, "auxiliary_loss_mlp": 0.01021195, "balance_loss_clip": 1.04524255, "balance_loss_mlp": 1.0147934, "epoch": 0.7654662418084531, "flos": 23659853408640.0, "grad_norm": 1.715404366456008, "language_loss": 0.87721127, "learning_rate": 5.496355973876023e-07, "loss": 0.8986665, "num_input_tokens_seen": 137036205, "step": 6366, "time_per_iteration": 2.8411295413970947 }, { "auxiliary_loss_clip": 0.01123729, "auxiliary_loss_mlp": 0.0076232, "balance_loss_clip": 1.04118633, "balance_loss_mlp": 1.00037766, "epoch": 0.7655864846990922, "flos": 41463608878080.0, "grad_norm": 1.7124703536233545, "language_loss": 0.70787239, "learning_rate": 5.490993410116984e-07, "loss": 0.72673285, "num_input_tokens_seen": 137059195, "step": 6367, "time_per_iteration": 2.9521539211273193 }, { "auxiliary_loss_clip": 0.01117395, "auxiliary_loss_mlp": 0.01027402, "balance_loss_clip": 1.04118943, "balance_loss_mlp": 1.01968265, "epoch": 0.7657067275897312, "flos": 43142684088960.0, "grad_norm": 1.8016545720064288, "language_loss": 0.69636142, "learning_rate": 5.485633047362704e-07, "loss": 0.71780938, "num_input_tokens_seen": 137081200, "step": 6368, "time_per_iteration": 2.962754726409912 }, { "auxiliary_loss_clip": 0.01172731, "auxiliary_loss_mlp": 0.01030218, "balance_loss_clip": 1.04939687, "balance_loss_mlp": 1.02227592, "epoch": 0.7658269704803703, "flos": 17311780840320.0, "grad_norm": 2.1352557196145625, "language_loss": 0.78828049, "learning_rate": 5.480274886426341e-07, "loss": 0.81030989, "num_input_tokens_seen": 137097840, "step": 6369, "time_per_iteration": 2.552774429321289 }, { "auxiliary_loss_clip": 0.01146926, "auxiliary_loss_mlp": 0.01025921, "balance_loss_clip": 1.04545712, "balance_loss_mlp": 1.01937652, "epoch": 0.7659472133710095, "flos": 12568160977920.0, "grad_norm": 2.620189900900933, "language_loss": 0.77579921, "learning_rate": 5.474918928120744e-07, "loss": 0.79752767, "num_input_tokens_seen": 137114335, "step": 6370, "time_per_iteration": 2.650662660598755 }, { "auxiliary_loss_clip": 0.01150304, "auxiliary_loss_mlp": 0.01031305, "balance_loss_clip": 1.04539502, "balance_loss_mlp": 1.02510309, "epoch": 0.7660674562616485, "flos": 22707430335360.0, "grad_norm": 1.6394062540774934, "language_loss": 0.87166619, "learning_rate": 5.469565173258392e-07, "loss": 0.89348227, "num_input_tokens_seen": 137132850, "step": 6371, "time_per_iteration": 2.665968179702759 }, { "auxiliary_loss_clip": 0.01170207, "auxiliary_loss_mlp": 0.01022013, "balance_loss_clip": 1.04656506, "balance_loss_mlp": 1.0144136, "epoch": 0.7661876991522876, "flos": 17056455989760.0, "grad_norm": 1.6926994880095982, "language_loss": 0.63532573, "learning_rate": 5.464213622651454e-07, "loss": 0.6572479, "num_input_tokens_seen": 137150665, "step": 6372, "time_per_iteration": 2.6460282802581787 }, { "auxiliary_loss_clip": 0.01131477, "auxiliary_loss_mlp": 0.01024372, "balance_loss_clip": 1.04250145, "balance_loss_mlp": 1.01703799, "epoch": 0.7663079420429267, "flos": 20084228092800.0, "grad_norm": 1.59063295053305, "language_loss": 0.84140569, "learning_rate": 5.458864277111753e-07, "loss": 0.86296421, "num_input_tokens_seen": 137168500, "step": 6373, "time_per_iteration": 2.699895143508911 }, { "auxiliary_loss_clip": 0.01133515, "auxiliary_loss_mlp": 0.00761558, "balance_loss_clip": 1.0424211, "balance_loss_mlp": 1.00029063, "epoch": 0.7664281849335658, "flos": 12677473042560.0, "grad_norm": 2.531034092465509, "language_loss": 0.69403011, "learning_rate": 5.453517137450769e-07, "loss": 0.71298087, "num_input_tokens_seen": 137185075, "step": 6374, "time_per_iteration": 2.7018518447875977 }, { "auxiliary_loss_clip": 0.0115284, "auxiliary_loss_mlp": 0.01026562, "balance_loss_clip": 1.04661727, "balance_loss_mlp": 1.01948714, "epoch": 0.7665484278242048, "flos": 22345271458560.0, "grad_norm": 2.129378241758401, "language_loss": 0.76058298, "learning_rate": 5.448172204479684e-07, "loss": 0.782377, "num_input_tokens_seen": 137204355, "step": 6375, "time_per_iteration": 3.6241307258605957 }, { "auxiliary_loss_clip": 0.01165316, "auxiliary_loss_mlp": 0.01026001, "balance_loss_clip": 1.04522657, "balance_loss_mlp": 1.0193727, "epoch": 0.766668670714844, "flos": 23617909301760.0, "grad_norm": 1.6742809047910137, "language_loss": 0.74979305, "learning_rate": 5.442829479009294e-07, "loss": 0.77170622, "num_input_tokens_seen": 137223135, "step": 6376, "time_per_iteration": 2.64015531539917 }, { "auxiliary_loss_clip": 0.01160103, "auxiliary_loss_mlp": 0.01025826, "balance_loss_clip": 1.04567993, "balance_loss_mlp": 1.01853693, "epoch": 0.7667889136054831, "flos": 19427134642560.0, "grad_norm": 2.2569588863758177, "language_loss": 0.71472394, "learning_rate": 5.437488961850103e-07, "loss": 0.73658323, "num_input_tokens_seen": 137242935, "step": 6377, "time_per_iteration": 2.7714951038360596 }, { "auxiliary_loss_clip": 0.01106276, "auxiliary_loss_mlp": 0.01020832, "balance_loss_clip": 1.03959322, "balance_loss_mlp": 1.01406407, "epoch": 0.7669091564961221, "flos": 26866352609280.0, "grad_norm": 1.8524503567415964, "language_loss": 0.75385594, "learning_rate": 5.432150653812258e-07, "loss": 0.77512705, "num_input_tokens_seen": 137262970, "step": 6378, "time_per_iteration": 3.745340347290039 }, { "auxiliary_loss_clip": 0.01151036, "auxiliary_loss_mlp": 0.01027103, "balance_loss_clip": 1.0469079, "balance_loss_mlp": 1.02034402, "epoch": 0.7670293993867613, "flos": 12385303816320.0, "grad_norm": 2.1143224707120716, "language_loss": 0.82675147, "learning_rate": 5.42681455570557e-07, "loss": 0.8485328, "num_input_tokens_seen": 137279500, "step": 6379, "time_per_iteration": 3.5243401527404785 }, { "auxiliary_loss_clip": 0.01163705, "auxiliary_loss_mlp": 0.01025123, "balance_loss_clip": 1.04543662, "balance_loss_mlp": 1.01820874, "epoch": 0.7671496422774003, "flos": 21762944167680.0, "grad_norm": 1.6869785052038828, "language_loss": 0.64815784, "learning_rate": 5.42148066833954e-07, "loss": 0.67004609, "num_input_tokens_seen": 137298745, "step": 6380, "time_per_iteration": 2.6245720386505127 }, { "auxiliary_loss_clip": 0.0116734, "auxiliary_loss_mlp": 0.01021444, "balance_loss_clip": 1.04773784, "balance_loss_mlp": 1.01420236, "epoch": 0.7672698851680394, "flos": 21069221823360.0, "grad_norm": 1.9024696777833305, "language_loss": 0.75404704, "learning_rate": 5.416148992523289e-07, "loss": 0.77593493, "num_input_tokens_seen": 137317320, "step": 6381, "time_per_iteration": 3.683361291885376 }, { "auxiliary_loss_clip": 0.01083537, "auxiliary_loss_mlp": 0.01024279, "balance_loss_clip": 1.03886509, "balance_loss_mlp": 1.0171324, "epoch": 0.7673901280586786, "flos": 16976697840000.0, "grad_norm": 1.7609533473183545, "language_loss": 0.78604978, "learning_rate": 5.410819529065644e-07, "loss": 0.80712795, "num_input_tokens_seen": 137335275, "step": 6382, "time_per_iteration": 2.828155279159546 }, { "auxiliary_loss_clip": 0.01109308, "auxiliary_loss_mlp": 0.0102408, "balance_loss_clip": 1.03997087, "balance_loss_mlp": 1.01762819, "epoch": 0.7675103709493176, "flos": 29242669697280.0, "grad_norm": 2.3102895425205507, "language_loss": 0.65431333, "learning_rate": 5.405492278775079e-07, "loss": 0.67564726, "num_input_tokens_seen": 137355055, "step": 6383, "time_per_iteration": 2.824099540710449 }, { "auxiliary_loss_clip": 0.01139017, "auxiliary_loss_mlp": 0.01025331, "balance_loss_clip": 1.04169178, "balance_loss_mlp": 1.01833665, "epoch": 0.7676306138399567, "flos": 29023004073600.0, "grad_norm": 2.171812742644068, "language_loss": 0.79636002, "learning_rate": 5.400167242459732e-07, "loss": 0.81800354, "num_input_tokens_seen": 137374015, "step": 6384, "time_per_iteration": 2.774479389190674 }, { "auxiliary_loss_clip": 0.01150202, "auxiliary_loss_mlp": 0.01026366, "balance_loss_clip": 1.04473054, "balance_loss_mlp": 1.01952672, "epoch": 0.7677508567305958, "flos": 22565116650240.0, "grad_norm": 2.424215647708366, "language_loss": 0.80994678, "learning_rate": 5.394844420927405e-07, "loss": 0.83171248, "num_input_tokens_seen": 137393625, "step": 6385, "time_per_iteration": 2.7876338958740234 }, { "auxiliary_loss_clip": 0.01166727, "auxiliary_loss_mlp": 0.01025575, "balance_loss_clip": 1.04730129, "balance_loss_mlp": 1.01834762, "epoch": 0.7678710996212349, "flos": 25411432222080.0, "grad_norm": 2.3954923024685573, "language_loss": 0.73298776, "learning_rate": 5.389523814985562e-07, "loss": 0.75491077, "num_input_tokens_seen": 137413045, "step": 6386, "time_per_iteration": 2.6423001289367676 }, { "auxiliary_loss_clip": 0.01108319, "auxiliary_loss_mlp": 0.01022558, "balance_loss_clip": 1.03986537, "balance_loss_mlp": 1.01507735, "epoch": 0.767991342511874, "flos": 26756825063040.0, "grad_norm": 1.7597953076190629, "language_loss": 0.76496708, "learning_rate": 5.384205425441344e-07, "loss": 0.78627586, "num_input_tokens_seen": 137433955, "step": 6387, "time_per_iteration": 2.81948184967041 }, { "auxiliary_loss_clip": 0.01137589, "auxiliary_loss_mlp": 0.01024249, "balance_loss_clip": 1.04108226, "balance_loss_mlp": 1.01706672, "epoch": 0.7681115854025131, "flos": 26359509749760.0, "grad_norm": 1.6924046247964757, "language_loss": 0.84348118, "learning_rate": 5.378889253101537e-07, "loss": 0.86509955, "num_input_tokens_seen": 137454510, "step": 6388, "time_per_iteration": 2.7051732540130615 }, { "auxiliary_loss_clip": 0.01149997, "auxiliary_loss_mlp": 0.01022863, "balance_loss_clip": 1.04168677, "balance_loss_mlp": 1.01625931, "epoch": 0.7682318282931522, "flos": 23257043314560.0, "grad_norm": 1.6145039469297509, "language_loss": 0.80656451, "learning_rate": 5.373575298772617e-07, "loss": 0.82829309, "num_input_tokens_seen": 137473630, "step": 6389, "time_per_iteration": 2.653120994567871 }, { "auxiliary_loss_clip": 0.0105859, "auxiliary_loss_mlp": 0.0100172, "balance_loss_clip": 1.01212275, "balance_loss_mlp": 1.00065947, "epoch": 0.7683520711837912, "flos": 70072457137920.0, "grad_norm": 0.7613138994082183, "language_loss": 0.61344755, "learning_rate": 5.368263563260689e-07, "loss": 0.63405073, "num_input_tokens_seen": 137538765, "step": 6390, "time_per_iteration": 3.2916054725646973 }, { "auxiliary_loss_clip": 0.01150602, "auxiliary_loss_mlp": 0.01022717, "balance_loss_clip": 1.04272795, "balance_loss_mlp": 1.01535583, "epoch": 0.7684723140744304, "flos": 18624890332800.0, "grad_norm": 1.5440633130100656, "language_loss": 0.64154589, "learning_rate": 5.362954047371537e-07, "loss": 0.66327906, "num_input_tokens_seen": 137557875, "step": 6391, "time_per_iteration": 2.680232524871826 }, { "auxiliary_loss_clip": 0.01125509, "auxiliary_loss_mlp": 0.01027774, "balance_loss_clip": 1.0466187, "balance_loss_mlp": 1.02024603, "epoch": 0.7685925569650695, "flos": 27452989532160.0, "grad_norm": 1.8580157691117833, "language_loss": 0.72026002, "learning_rate": 5.357646751910627e-07, "loss": 0.74179286, "num_input_tokens_seen": 137579055, "step": 6392, "time_per_iteration": 2.8222649097442627 }, { "auxiliary_loss_clip": 0.01135123, "auxiliary_loss_mlp": 0.01030658, "balance_loss_clip": 1.04329586, "balance_loss_mlp": 1.02305496, "epoch": 0.7687127998557085, "flos": 24535714642560.0, "grad_norm": 2.3322299227369254, "language_loss": 0.80078393, "learning_rate": 5.352341677683061e-07, "loss": 0.8224417, "num_input_tokens_seen": 137600355, "step": 6393, "time_per_iteration": 2.7269113063812256 }, { "auxiliary_loss_clip": 0.01132089, "auxiliary_loss_mlp": 0.01023129, "balance_loss_clip": 1.04273057, "balance_loss_mlp": 1.0166446, "epoch": 0.7688330427463477, "flos": 25155963717120.0, "grad_norm": 2.003035981792588, "language_loss": 0.79613602, "learning_rate": 5.347038825493617e-07, "loss": 0.81768823, "num_input_tokens_seen": 137621885, "step": 6394, "time_per_iteration": 2.796241044998169 }, { "auxiliary_loss_clip": 0.01133463, "auxiliary_loss_mlp": 0.01026789, "balance_loss_clip": 1.04353929, "balance_loss_mlp": 1.02032757, "epoch": 0.7689532856369867, "flos": 21211284113280.0, "grad_norm": 2.122989533800318, "language_loss": 0.6884256, "learning_rate": 5.341738196146732e-07, "loss": 0.71002817, "num_input_tokens_seen": 137640230, "step": 6395, "time_per_iteration": 2.7164947986602783 }, { "auxiliary_loss_clip": 0.01146092, "auxiliary_loss_mlp": 0.01025766, "balance_loss_clip": 1.04076064, "balance_loss_mlp": 1.01834536, "epoch": 0.7690735285276258, "flos": 25119083427840.0, "grad_norm": 2.261664772624106, "language_loss": 0.73397601, "learning_rate": 5.336439790446503e-07, "loss": 0.75569463, "num_input_tokens_seen": 137659330, "step": 6396, "time_per_iteration": 2.7563045024871826 }, { "auxiliary_loss_clip": 0.01121477, "auxiliary_loss_mlp": 0.01025091, "balance_loss_clip": 1.04062676, "balance_loss_mlp": 1.01821864, "epoch": 0.769193771418265, "flos": 54744020640000.0, "grad_norm": 2.479228581580639, "language_loss": 0.62897038, "learning_rate": 5.331143609196711e-07, "loss": 0.65043604, "num_input_tokens_seen": 137683145, "step": 6397, "time_per_iteration": 2.9721250534057617 }, { "auxiliary_loss_clip": 0.01151597, "auxiliary_loss_mlp": 0.01024885, "balance_loss_clip": 1.04656434, "balance_loss_mlp": 1.01786923, "epoch": 0.769314014308904, "flos": 37341890115840.0, "grad_norm": 1.7868253641031386, "language_loss": 0.76941401, "learning_rate": 5.325849653200758e-07, "loss": 0.79117888, "num_input_tokens_seen": 137707095, "step": 6398, "time_per_iteration": 2.771365165710449 }, { "auxiliary_loss_clip": 0.0116727, "auxiliary_loss_mlp": 0.01029693, "balance_loss_clip": 1.04718995, "balance_loss_mlp": 1.02280867, "epoch": 0.7694342571995431, "flos": 20631686256000.0, "grad_norm": 3.3019738555633853, "language_loss": 0.76067495, "learning_rate": 5.32055792326175e-07, "loss": 0.78264457, "num_input_tokens_seen": 137725520, "step": 6399, "time_per_iteration": 2.649177074432373 }, { "auxiliary_loss_clip": 0.01141187, "auxiliary_loss_mlp": 0.01027716, "balance_loss_clip": 1.04605222, "balance_loss_mlp": 1.02076578, "epoch": 0.7695545000901821, "flos": 24207706621440.0, "grad_norm": 1.902866483394228, "language_loss": 0.72548765, "learning_rate": 5.315268420182437e-07, "loss": 0.74717665, "num_input_tokens_seen": 137744195, "step": 6400, "time_per_iteration": 2.660510301589966 }, { "auxiliary_loss_clip": 0.0113101, "auxiliary_loss_mlp": 0.0076173, "balance_loss_clip": 1.04269159, "balance_loss_mlp": 1.00035703, "epoch": 0.7696747429808213, "flos": 28001273708160.0, "grad_norm": 1.681992570682121, "language_loss": 0.76208502, "learning_rate": 5.309981144765221e-07, "loss": 0.78101242, "num_input_tokens_seen": 137764340, "step": 6401, "time_per_iteration": 3.6609530448913574 }, { "auxiliary_loss_clip": 0.0111497, "auxiliary_loss_mlp": 0.01025044, "balance_loss_clip": 1.03996408, "balance_loss_mlp": 1.01762879, "epoch": 0.7697949858714603, "flos": 11509550323200.0, "grad_norm": 2.6460224070408924, "language_loss": 0.75701189, "learning_rate": 5.304696097812196e-07, "loss": 0.77841198, "num_input_tokens_seen": 137780940, "step": 6402, "time_per_iteration": 2.708095073699951 }, { "auxiliary_loss_clip": 0.01136198, "auxiliary_loss_mlp": 0.01032156, "balance_loss_clip": 1.04284167, "balance_loss_mlp": 1.02412748, "epoch": 0.7699152287620994, "flos": 26688271956480.0, "grad_norm": 2.63760172135628, "language_loss": 0.61183774, "learning_rate": 5.299413280125078e-07, "loss": 0.63352126, "num_input_tokens_seen": 137799250, "step": 6403, "time_per_iteration": 2.8732190132141113 }, { "auxiliary_loss_clip": 0.01139578, "auxiliary_loss_mlp": 0.01021154, "balance_loss_clip": 1.04421771, "balance_loss_mlp": 1.01372993, "epoch": 0.7700354716527386, "flos": 16544944362240.0, "grad_norm": 2.296630453131005, "language_loss": 0.7309109, "learning_rate": 5.294132692505284e-07, "loss": 0.7525183, "num_input_tokens_seen": 137817660, "step": 6404, "time_per_iteration": 3.6902973651885986 }, { "auxiliary_loss_clip": 0.011034, "auxiliary_loss_mlp": 0.01026355, "balance_loss_clip": 1.04032183, "balance_loss_mlp": 1.01910365, "epoch": 0.7701557145433776, "flos": 19242733196160.0, "grad_norm": 1.8800870442130917, "language_loss": 0.7916317, "learning_rate": 5.288854335753861e-07, "loss": 0.81292927, "num_input_tokens_seen": 137835920, "step": 6405, "time_per_iteration": 3.5423619747161865 }, { "auxiliary_loss_clip": 0.01155107, "auxiliary_loss_mlp": 0.01025623, "balance_loss_clip": 1.04573441, "balance_loss_mlp": 1.01925707, "epoch": 0.7702759574340167, "flos": 31685744211840.0, "grad_norm": 1.8445855877154294, "language_loss": 0.75646949, "learning_rate": 5.283578210671551e-07, "loss": 0.7782768, "num_input_tokens_seen": 137858160, "step": 6406, "time_per_iteration": 2.739253520965576 }, { "auxiliary_loss_clip": 0.01143392, "auxiliary_loss_mlp": 0.01026818, "balance_loss_clip": 1.04487705, "balance_loss_mlp": 1.01998687, "epoch": 0.7703962003246558, "flos": 16800089644800.0, "grad_norm": 1.9500985859353615, "language_loss": 0.76723647, "learning_rate": 5.278304318058719e-07, "loss": 0.78893858, "num_input_tokens_seen": 137876015, "step": 6407, "time_per_iteration": 3.7025091648101807 }, { "auxiliary_loss_clip": 0.01094194, "auxiliary_loss_mlp": 0.01026279, "balance_loss_clip": 1.03667212, "balance_loss_mlp": 1.01932287, "epoch": 0.7705164432152949, "flos": 35736072693120.0, "grad_norm": 1.6896424429654362, "language_loss": 0.78950554, "learning_rate": 5.273032658715411e-07, "loss": 0.81071025, "num_input_tokens_seen": 137898825, "step": 6408, "time_per_iteration": 2.891700029373169 }, { "auxiliary_loss_clip": 0.01104712, "auxiliary_loss_mlp": 0.01022416, "balance_loss_clip": 1.03783691, "balance_loss_mlp": 1.01549911, "epoch": 0.7706366861059339, "flos": 23365960329600.0, "grad_norm": 1.7595666080232313, "language_loss": 0.76452291, "learning_rate": 5.267763233441347e-07, "loss": 0.78579414, "num_input_tokens_seen": 137919455, "step": 6409, "time_per_iteration": 2.836327314376831 }, { "auxiliary_loss_clip": 0.01156346, "auxiliary_loss_mlp": 0.01021902, "balance_loss_clip": 1.04629374, "balance_loss_mlp": 1.01404619, "epoch": 0.7707569289965731, "flos": 22929897219840.0, "grad_norm": 3.1028401347576526, "language_loss": 0.69501936, "learning_rate": 5.26249604303588e-07, "loss": 0.71680188, "num_input_tokens_seen": 137937960, "step": 6410, "time_per_iteration": 2.6974172592163086 }, { "auxiliary_loss_clip": 0.01168912, "auxiliary_loss_mlp": 0.01030375, "balance_loss_clip": 1.04803884, "balance_loss_mlp": 1.02307963, "epoch": 0.7708771718872122, "flos": 17420661941760.0, "grad_norm": 3.58273021846403, "language_loss": 0.78129029, "learning_rate": 5.257231088298057e-07, "loss": 0.8032831, "num_input_tokens_seen": 137956370, "step": 6411, "time_per_iteration": 2.6282572746276855 }, { "auxiliary_loss_clip": 0.01031061, "auxiliary_loss_mlp": 0.0100179, "balance_loss_clip": 1.01102996, "balance_loss_mlp": 1.00065792, "epoch": 0.7709974147778512, "flos": 72241316248320.0, "grad_norm": 0.7910575183030759, "language_loss": 0.53945106, "learning_rate": 5.25196837002655e-07, "loss": 0.55977958, "num_input_tokens_seen": 138016080, "step": 6412, "time_per_iteration": 3.3573949337005615 }, { "auxiliary_loss_clip": 0.01135268, "auxiliary_loss_mlp": 0.01027217, "balance_loss_clip": 1.0434736, "balance_loss_mlp": 1.01931906, "epoch": 0.7711176576684904, "flos": 39859694876160.0, "grad_norm": 2.27898180571654, "language_loss": 0.6877116, "learning_rate": 5.24670788901971e-07, "loss": 0.7093364, "num_input_tokens_seen": 138039170, "step": 6413, "time_per_iteration": 2.9168944358825684 }, { "auxiliary_loss_clip": 0.01139074, "auxiliary_loss_mlp": 0.01031199, "balance_loss_clip": 1.04611027, "balance_loss_mlp": 1.0227803, "epoch": 0.7712379005591294, "flos": 36976391274240.0, "grad_norm": 2.2494557467210674, "language_loss": 0.6817652, "learning_rate": 5.241449646075557e-07, "loss": 0.70346797, "num_input_tokens_seen": 138062395, "step": 6414, "time_per_iteration": 2.8658511638641357 }, { "auxiliary_loss_clip": 0.01160076, "auxiliary_loss_mlp": 0.01025015, "balance_loss_clip": 1.0453136, "balance_loss_mlp": 1.01789558, "epoch": 0.7713581434497685, "flos": 22776773541120.0, "grad_norm": 2.664660940046069, "language_loss": 0.72140026, "learning_rate": 5.236193641991762e-07, "loss": 0.7432512, "num_input_tokens_seen": 138080325, "step": 6415, "time_per_iteration": 2.63266921043396 }, { "auxiliary_loss_clip": 0.01137436, "auxiliary_loss_mlp": 0.01019783, "balance_loss_clip": 1.04445648, "balance_loss_mlp": 1.01245213, "epoch": 0.7714783863404077, "flos": 24097460803200.0, "grad_norm": 2.0731486619305763, "language_loss": 0.70005715, "learning_rate": 5.23093987756565e-07, "loss": 0.72162926, "num_input_tokens_seen": 138099020, "step": 6416, "time_per_iteration": 2.727823495864868 }, { "auxiliary_loss_clip": 0.01132973, "auxiliary_loss_mlp": 0.01026554, "balance_loss_clip": 1.04063606, "balance_loss_mlp": 1.01826334, "epoch": 0.7715986292310467, "flos": 21063655215360.0, "grad_norm": 3.7112545916164765, "language_loss": 0.75302947, "learning_rate": 5.225688353594217e-07, "loss": 0.77462476, "num_input_tokens_seen": 138118650, "step": 6417, "time_per_iteration": 2.7487869262695312 }, { "auxiliary_loss_clip": 0.01147236, "auxiliary_loss_mlp": 0.00762056, "balance_loss_clip": 1.04795706, "balance_loss_mlp": 1.00033951, "epoch": 0.7717188721216858, "flos": 20594877793920.0, "grad_norm": 2.199670325998729, "language_loss": 0.77259552, "learning_rate": 5.220439070874108e-07, "loss": 0.79168844, "num_input_tokens_seen": 138137890, "step": 6418, "time_per_iteration": 2.7488796710968018 }, { "auxiliary_loss_clip": 0.01153026, "auxiliary_loss_mlp": 0.01026234, "balance_loss_clip": 1.0466218, "balance_loss_mlp": 1.01869369, "epoch": 0.7718391150123249, "flos": 26250951870720.0, "grad_norm": 1.7166794809938501, "language_loss": 0.71128881, "learning_rate": 5.215192030201652e-07, "loss": 0.73308146, "num_input_tokens_seen": 138158880, "step": 6419, "time_per_iteration": 2.719557762145996 }, { "auxiliary_loss_clip": 0.01109772, "auxiliary_loss_mlp": 0.01032763, "balance_loss_clip": 1.0366323, "balance_loss_mlp": 1.02619743, "epoch": 0.771959357902964, "flos": 22049762267520.0, "grad_norm": 1.772813829255, "language_loss": 0.86211777, "learning_rate": 5.209947232372798e-07, "loss": 0.88354313, "num_input_tokens_seen": 138176370, "step": 6420, "time_per_iteration": 2.753448724746704 }, { "auxiliary_loss_clip": 0.01155267, "auxiliary_loss_mlp": 0.00762554, "balance_loss_clip": 1.04376078, "balance_loss_mlp": 1.00036001, "epoch": 0.772079600793603, "flos": 30446000248320.0, "grad_norm": 2.5040273237501736, "language_loss": 0.81016463, "learning_rate": 5.204704678183196e-07, "loss": 0.82934284, "num_input_tokens_seen": 138195105, "step": 6421, "time_per_iteration": 2.7588725090026855 }, { "auxiliary_loss_clip": 0.0116796, "auxiliary_loss_mlp": 0.01025801, "balance_loss_clip": 1.04771173, "balance_loss_mlp": 1.01866031, "epoch": 0.7721998436842422, "flos": 12969857750400.0, "grad_norm": 1.9976498468927195, "language_loss": 0.85269761, "learning_rate": 5.19946436842813e-07, "loss": 0.87463522, "num_input_tokens_seen": 138212235, "step": 6422, "time_per_iteration": 2.7765920162200928 }, { "auxiliary_loss_clip": 0.01124486, "auxiliary_loss_mlp": 0.01020184, "balance_loss_clip": 1.04324126, "balance_loss_mlp": 1.01340711, "epoch": 0.7723200865748813, "flos": 32635509678720.0, "grad_norm": 1.729676874727167, "language_loss": 0.68092155, "learning_rate": 5.194226303902546e-07, "loss": 0.70236826, "num_input_tokens_seen": 138231970, "step": 6423, "time_per_iteration": 2.8819327354431152 }, { "auxiliary_loss_clip": 0.01135946, "auxiliary_loss_mlp": 0.01028582, "balance_loss_clip": 1.04212928, "balance_loss_mlp": 1.02133954, "epoch": 0.7724403294655203, "flos": 21105707063040.0, "grad_norm": 2.070220723427855, "language_loss": 0.70741105, "learning_rate": 5.188990485401072e-07, "loss": 0.72905636, "num_input_tokens_seen": 138251175, "step": 6424, "time_per_iteration": 2.7188920974731445 }, { "auxiliary_loss_clip": 0.01154784, "auxiliary_loss_mlp": 0.01026371, "balance_loss_clip": 1.04670763, "balance_loss_mlp": 1.01873565, "epoch": 0.7725605723561595, "flos": 22090736707200.0, "grad_norm": 1.875834491467513, "language_loss": 0.85838175, "learning_rate": 5.183756913717954e-07, "loss": 0.88019335, "num_input_tokens_seen": 138270950, "step": 6425, "time_per_iteration": 2.8070931434631348 }, { "auxiliary_loss_clip": 0.01135489, "auxiliary_loss_mlp": 0.0102373, "balance_loss_clip": 1.04433084, "balance_loss_mlp": 1.01660156, "epoch": 0.7726808152467985, "flos": 34495610457600.0, "grad_norm": 1.6632357324463254, "language_loss": 0.73441005, "learning_rate": 5.178525589647136e-07, "loss": 0.75600219, "num_input_tokens_seen": 138292590, "step": 6426, "time_per_iteration": 2.770979881286621 }, { "auxiliary_loss_clip": 0.01144802, "auxiliary_loss_mlp": 0.01026127, "balance_loss_clip": 1.04504752, "balance_loss_mlp": 1.01901674, "epoch": 0.7728010581374376, "flos": 22306344094080.0, "grad_norm": 1.7742075947940026, "language_loss": 0.78883708, "learning_rate": 5.173296513982197e-07, "loss": 0.8105464, "num_input_tokens_seen": 138311115, "step": 6427, "time_per_iteration": 3.671598196029663 }, { "auxiliary_loss_clip": 0.01136836, "auxiliary_loss_mlp": 0.01026063, "balance_loss_clip": 1.04641128, "balance_loss_mlp": 1.01867771, "epoch": 0.7729213010280768, "flos": 27126453968640.0, "grad_norm": 1.9817430158183396, "language_loss": 0.6527372, "learning_rate": 5.168069687516398e-07, "loss": 0.67436624, "num_input_tokens_seen": 138330885, "step": 6428, "time_per_iteration": 2.7427639961242676 }, { "auxiliary_loss_clip": 0.01139192, "auxiliary_loss_mlp": 0.01027139, "balance_loss_clip": 1.04598558, "balance_loss_mlp": 1.01996863, "epoch": 0.7730415439187158, "flos": 18150223080960.0, "grad_norm": 2.0519430786425334, "language_loss": 0.71908963, "learning_rate": 5.16284511104263e-07, "loss": 0.74075294, "num_input_tokens_seen": 138350020, "step": 6429, "time_per_iteration": 3.5955750942230225 }, { "auxiliary_loss_clip": 0.011372, "auxiliary_loss_mlp": 0.01024811, "balance_loss_clip": 1.04296303, "balance_loss_mlp": 1.01756358, "epoch": 0.7731617868093549, "flos": 11947480940160.0, "grad_norm": 2.44053418087118, "language_loss": 0.81044734, "learning_rate": 5.157622785353457e-07, "loss": 0.83206737, "num_input_tokens_seen": 138368135, "step": 6430, "time_per_iteration": 3.499960422515869 }, { "auxiliary_loss_clip": 0.01059354, "auxiliary_loss_mlp": 0.01001041, "balance_loss_clip": 1.01350415, "balance_loss_mlp": 0.99990886, "epoch": 0.7732820296999939, "flos": 64201027069440.0, "grad_norm": 0.6459628709422021, "language_loss": 0.60392249, "learning_rate": 5.152402711241113e-07, "loss": 0.62452644, "num_input_tokens_seen": 138436040, "step": 6431, "time_per_iteration": 3.3474714756011963 }, { "auxiliary_loss_clip": 0.01121319, "auxiliary_loss_mlp": 0.01024232, "balance_loss_clip": 1.04037702, "balance_loss_mlp": 1.01734161, "epoch": 0.7734022725906331, "flos": 25302191984640.0, "grad_norm": 1.7783610017447533, "language_loss": 0.82917446, "learning_rate": 5.147184889497465e-07, "loss": 0.85062993, "num_input_tokens_seen": 138455510, "step": 6432, "time_per_iteration": 2.7860913276672363 }, { "auxiliary_loss_clip": 0.01115844, "auxiliary_loss_mlp": 0.01029796, "balance_loss_clip": 1.04077435, "balance_loss_mlp": 1.02166593, "epoch": 0.7735225154812722, "flos": 17347440067200.0, "grad_norm": 4.056209079891061, "language_loss": 0.80328608, "learning_rate": 5.141969320914072e-07, "loss": 0.82474244, "num_input_tokens_seen": 138473015, "step": 6433, "time_per_iteration": 3.8188586235046387 }, { "auxiliary_loss_clip": 0.01169918, "auxiliary_loss_mlp": 0.01030235, "balance_loss_clip": 1.04704857, "balance_loss_mlp": 1.02224803, "epoch": 0.7736427583719112, "flos": 32630086725120.0, "grad_norm": 22.157868158695997, "language_loss": 0.62548733, "learning_rate": 5.136756006282113e-07, "loss": 0.64748883, "num_input_tokens_seen": 138491680, "step": 6434, "time_per_iteration": 2.678572177886963 }, { "auxiliary_loss_clip": 0.01169601, "auxiliary_loss_mlp": 0.01025116, "balance_loss_clip": 1.0476625, "balance_loss_mlp": 1.01793087, "epoch": 0.7737630012625504, "flos": 19860073269120.0, "grad_norm": 2.1620797324315184, "language_loss": 0.84767663, "learning_rate": 5.131544946392446e-07, "loss": 0.86962378, "num_input_tokens_seen": 138506960, "step": 6435, "time_per_iteration": 2.6757941246032715 }, { "auxiliary_loss_clip": 0.01136752, "auxiliary_loss_mlp": 0.0102883, "balance_loss_clip": 1.04515851, "balance_loss_mlp": 1.02134633, "epoch": 0.7738832441531894, "flos": 36022639397760.0, "grad_norm": 2.3707517759969563, "language_loss": 0.63956642, "learning_rate": 5.126336142035592e-07, "loss": 0.66122222, "num_input_tokens_seen": 138526995, "step": 6436, "time_per_iteration": 2.858335256576538 }, { "auxiliary_loss_clip": 0.01138007, "auxiliary_loss_mlp": 0.01025221, "balance_loss_clip": 1.04168022, "balance_loss_mlp": 1.01741242, "epoch": 0.7740034870438285, "flos": 13405274415360.0, "grad_norm": 2.404318442976146, "language_loss": 0.71885681, "learning_rate": 5.121129594001721e-07, "loss": 0.74048907, "num_input_tokens_seen": 138541260, "step": 6437, "time_per_iteration": 2.679598331451416 }, { "auxiliary_loss_clip": 0.01151714, "auxiliary_loss_mlp": 0.01023939, "balance_loss_clip": 1.04493713, "balance_loss_mlp": 1.01681638, "epoch": 0.7741237299344677, "flos": 22086714384000.0, "grad_norm": 1.562178010772168, "language_loss": 0.81174451, "learning_rate": 5.115925303080661e-07, "loss": 0.8335011, "num_input_tokens_seen": 138560970, "step": 6438, "time_per_iteration": 2.6316263675689697 }, { "auxiliary_loss_clip": 0.01138886, "auxiliary_loss_mlp": 0.01026225, "balance_loss_clip": 1.0429461, "balance_loss_mlp": 1.01920366, "epoch": 0.7742439728251067, "flos": 19864777950720.0, "grad_norm": 2.204659357807789, "language_loss": 0.79076701, "learning_rate": 5.110723270061899e-07, "loss": 0.8124181, "num_input_tokens_seen": 138577460, "step": 6439, "time_per_iteration": 2.6606359481811523 }, { "auxiliary_loss_clip": 0.01164415, "auxiliary_loss_mlp": 0.01024201, "balance_loss_clip": 1.04537988, "balance_loss_mlp": 1.01727772, "epoch": 0.7743642157157458, "flos": 16690167048960.0, "grad_norm": 1.9023463630591777, "language_loss": 0.79318559, "learning_rate": 5.105523495734572e-07, "loss": 0.8150717, "num_input_tokens_seen": 138594860, "step": 6440, "time_per_iteration": 2.6125144958496094 }, { "auxiliary_loss_clip": 0.01163638, "auxiliary_loss_mlp": 0.01026098, "balance_loss_clip": 1.04316127, "balance_loss_mlp": 1.01902902, "epoch": 0.7744844586063849, "flos": 20304360593280.0, "grad_norm": 1.6854903294054728, "language_loss": 0.74844944, "learning_rate": 5.100325980887499e-07, "loss": 0.77034676, "num_input_tokens_seen": 138614785, "step": 6441, "time_per_iteration": 2.6861441135406494 }, { "auxiliary_loss_clip": 0.01148197, "auxiliary_loss_mlp": 0.01026856, "balance_loss_clip": 1.04539585, "balance_loss_mlp": 1.01976919, "epoch": 0.774604701497024, "flos": 22966705681920.0, "grad_norm": 1.8463180804138655, "language_loss": 0.83195031, "learning_rate": 5.095130726309116e-07, "loss": 0.85370088, "num_input_tokens_seen": 138634960, "step": 6442, "time_per_iteration": 2.7292332649230957 }, { "auxiliary_loss_clip": 0.0106823, "auxiliary_loss_mlp": 0.01001697, "balance_loss_clip": 1.01348519, "balance_loss_mlp": 1.0005765, "epoch": 0.774724944387663, "flos": 60288523073280.0, "grad_norm": 0.7941688999338065, "language_loss": 0.59057462, "learning_rate": 5.089937732787559e-07, "loss": 0.61127388, "num_input_tokens_seen": 138699520, "step": 6443, "time_per_iteration": 3.2514429092407227 }, { "auxiliary_loss_clip": 0.01126128, "auxiliary_loss_mlp": 0.01025154, "balance_loss_clip": 1.04298973, "balance_loss_mlp": 1.01760232, "epoch": 0.7748451872783022, "flos": 26761026954240.0, "grad_norm": 3.9225416558798964, "language_loss": 0.66530561, "learning_rate": 5.084747001110592e-07, "loss": 0.68681842, "num_input_tokens_seen": 138719145, "step": 6444, "time_per_iteration": 2.8492958545684814 }, { "auxiliary_loss_clip": 0.0114794, "auxiliary_loss_mlp": 0.00761955, "balance_loss_clip": 1.04622149, "balance_loss_mlp": 1.00038099, "epoch": 0.7749654301689413, "flos": 30338627518080.0, "grad_norm": 1.822902785445455, "language_loss": 0.69928116, "learning_rate": 5.07955853206564e-07, "loss": 0.71838009, "num_input_tokens_seen": 138743850, "step": 6445, "time_per_iteration": 2.746682643890381 }, { "auxiliary_loss_clip": 0.0115542, "auxiliary_loss_mlp": 0.01026645, "balance_loss_clip": 1.04482055, "balance_loss_mlp": 1.01948404, "epoch": 0.7750856730595803, "flos": 43179851687040.0, "grad_norm": 1.6007121356050662, "language_loss": 0.70974147, "learning_rate": 5.074372326439807e-07, "loss": 0.73156214, "num_input_tokens_seen": 138766860, "step": 6446, "time_per_iteration": 2.8201136589050293 }, { "auxiliary_loss_clip": 0.01124826, "auxiliary_loss_mlp": 0.01021973, "balance_loss_clip": 1.04125595, "balance_loss_mlp": 1.01509202, "epoch": 0.7752059159502195, "flos": 17640040256640.0, "grad_norm": 2.618848042312786, "language_loss": 0.73940384, "learning_rate": 5.069188385019814e-07, "loss": 0.76087183, "num_input_tokens_seen": 138784560, "step": 6447, "time_per_iteration": 2.6787567138671875 }, { "auxiliary_loss_clip": 0.01114355, "auxiliary_loss_mlp": 0.01026693, "balance_loss_clip": 1.0391345, "balance_loss_mlp": 1.0196414, "epoch": 0.7753261588408585, "flos": 12677688524160.0, "grad_norm": 3.0016443703734415, "language_loss": 0.61175913, "learning_rate": 5.064006708592077e-07, "loss": 0.63316953, "num_input_tokens_seen": 138800805, "step": 6448, "time_per_iteration": 2.6894946098327637 }, { "auxiliary_loss_clip": 0.01131387, "auxiliary_loss_mlp": 0.01026369, "balance_loss_clip": 1.04410458, "balance_loss_mlp": 1.01965165, "epoch": 0.7754464017314976, "flos": 16690741666560.0, "grad_norm": 2.478813443954334, "language_loss": 0.75778174, "learning_rate": 5.058827297942641e-07, "loss": 0.77935928, "num_input_tokens_seen": 138815910, "step": 6449, "time_per_iteration": 2.6561739444732666 }, { "auxiliary_loss_clip": 0.01145461, "auxiliary_loss_mlp": 0.01024964, "balance_loss_clip": 1.04478037, "balance_loss_mlp": 1.01799667, "epoch": 0.7755666446221368, "flos": 19718944732800.0, "grad_norm": 1.8846018491981815, "language_loss": 0.75187117, "learning_rate": 5.053650153857237e-07, "loss": 0.77357543, "num_input_tokens_seen": 138834920, "step": 6450, "time_per_iteration": 2.659416675567627 }, { "auxiliary_loss_clip": 0.011523, "auxiliary_loss_mlp": 0.01021156, "balance_loss_clip": 1.04715598, "balance_loss_mlp": 1.0139823, "epoch": 0.7756868875127758, "flos": 18693623007360.0, "grad_norm": 1.5774534707436472, "language_loss": 0.6981768, "learning_rate": 5.048475277121214e-07, "loss": 0.71991134, "num_input_tokens_seen": 138852135, "step": 6451, "time_per_iteration": 2.7153401374816895 }, { "auxiliary_loss_clip": 0.01155226, "auxiliary_loss_mlp": 0.01020876, "balance_loss_clip": 1.04583013, "balance_loss_mlp": 1.01400697, "epoch": 0.7758071304034149, "flos": 28404191543040.0, "grad_norm": 1.8933194641412132, "language_loss": 0.76703584, "learning_rate": 5.043302668519598e-07, "loss": 0.7887969, "num_input_tokens_seen": 138871470, "step": 6452, "time_per_iteration": 2.6996614933013916 }, { "auxiliary_loss_clip": 0.01155932, "auxiliary_loss_mlp": 0.01024086, "balance_loss_clip": 1.04545593, "balance_loss_mlp": 1.01740408, "epoch": 0.775927373294054, "flos": 20595344670720.0, "grad_norm": 2.048114495790253, "language_loss": 0.72242427, "learning_rate": 5.038132328837079e-07, "loss": 0.74422443, "num_input_tokens_seen": 138889860, "step": 6453, "time_per_iteration": 3.617781400680542 }, { "auxiliary_loss_clip": 0.01154341, "auxiliary_loss_mlp": 0.01025515, "balance_loss_clip": 1.04534531, "balance_loss_mlp": 1.0185951, "epoch": 0.7760476161846931, "flos": 22526368853760.0, "grad_norm": 2.1897326045199863, "language_loss": 0.7391476, "learning_rate": 5.032964258857993e-07, "loss": 0.76094621, "num_input_tokens_seen": 138909955, "step": 6454, "time_per_iteration": 2.610060453414917 }, { "auxiliary_loss_clip": 0.01148891, "auxiliary_loss_mlp": 0.01024288, "balance_loss_clip": 1.03994656, "balance_loss_mlp": 1.0168792, "epoch": 0.7761678590753321, "flos": 48651488403840.0, "grad_norm": 1.4498786617429609, "language_loss": 0.68275189, "learning_rate": 5.027798459366329e-07, "loss": 0.70448375, "num_input_tokens_seen": 138935320, "step": 6455, "time_per_iteration": 3.817877769470215 }, { "auxiliary_loss_clip": 0.01155112, "auxiliary_loss_mlp": 0.01025822, "balance_loss_clip": 1.04394746, "balance_loss_mlp": 1.01855659, "epoch": 0.7762881019659713, "flos": 26177047637760.0, "grad_norm": 1.4302560581559935, "language_loss": 0.63766849, "learning_rate": 5.02263493114573e-07, "loss": 0.65947783, "num_input_tokens_seen": 138957115, "step": 6456, "time_per_iteration": 2.5985450744628906 }, { "auxiliary_loss_clip": 0.0116429, "auxiliary_loss_mlp": 0.01024018, "balance_loss_clip": 1.04462838, "balance_loss_mlp": 1.01693726, "epoch": 0.7764083448566104, "flos": 20588341518720.0, "grad_norm": 2.2602438638820526, "language_loss": 0.76780146, "learning_rate": 5.017473674979502e-07, "loss": 0.78968453, "num_input_tokens_seen": 138973140, "step": 6457, "time_per_iteration": 3.5946390628814697 }, { "auxiliary_loss_clip": 0.01031843, "auxiliary_loss_mlp": 0.01002603, "balance_loss_clip": 1.01597202, "balance_loss_mlp": 1.00156021, "epoch": 0.7765285877472494, "flos": 67293078560640.0, "grad_norm": 0.752214283597663, "language_loss": 0.5831582, "learning_rate": 5.01231469165061e-07, "loss": 0.60350269, "num_input_tokens_seen": 139028965, "step": 6458, "time_per_iteration": 3.1834661960601807 }, { "auxiliary_loss_clip": 0.01059607, "auxiliary_loss_mlp": 0.0100093, "balance_loss_clip": 1.01384521, "balance_loss_mlp": 0.99983895, "epoch": 0.7766488306378886, "flos": 61344476121600.0, "grad_norm": 0.822561254599888, "language_loss": 0.56862414, "learning_rate": 5.007157981941663e-07, "loss": 0.58922958, "num_input_tokens_seen": 139094325, "step": 6459, "time_per_iteration": 4.1516313552856445 }, { "auxiliary_loss_clip": 0.01047194, "auxiliary_loss_mlp": 0.01002471, "balance_loss_clip": 1.01288843, "balance_loss_mlp": 1.00127339, "epoch": 0.7767690735285276, "flos": 62946199393920.0, "grad_norm": 0.8755056117427497, "language_loss": 0.67480373, "learning_rate": 5.002003546634928e-07, "loss": 0.69530046, "num_input_tokens_seen": 139150425, "step": 6460, "time_per_iteration": 3.181318998336792 }, { "auxiliary_loss_clip": 0.01108705, "auxiliary_loss_mlp": 0.01024395, "balance_loss_clip": 1.04324031, "balance_loss_mlp": 1.01789248, "epoch": 0.7768893164191667, "flos": 20886400575360.0, "grad_norm": 1.7949676875286518, "language_loss": 0.76194465, "learning_rate": 4.996851386512331e-07, "loss": 0.78327572, "num_input_tokens_seen": 139169130, "step": 6461, "time_per_iteration": 2.714665412902832 }, { "auxiliary_loss_clip": 0.01137453, "auxiliary_loss_mlp": 0.01029596, "balance_loss_clip": 1.04218972, "balance_loss_mlp": 1.02151418, "epoch": 0.7770095593098058, "flos": 20704584908160.0, "grad_norm": 1.7281912233807137, "language_loss": 0.83168024, "learning_rate": 4.991701502355444e-07, "loss": 0.85335076, "num_input_tokens_seen": 139189595, "step": 6462, "time_per_iteration": 2.6480584144592285 }, { "auxiliary_loss_clip": 0.01155295, "auxiliary_loss_mlp": 0.01025194, "balance_loss_clip": 1.04463816, "balance_loss_mlp": 1.01868773, "epoch": 0.7771298022004449, "flos": 24717709877760.0, "grad_norm": 1.5997942560185956, "language_loss": 0.76085317, "learning_rate": 4.986553894945518e-07, "loss": 0.7826581, "num_input_tokens_seen": 139210805, "step": 6463, "time_per_iteration": 2.6461575031280518 }, { "auxiliary_loss_clip": 0.01111241, "auxiliary_loss_mlp": 0.01024649, "balance_loss_clip": 1.03996766, "balance_loss_mlp": 1.01824188, "epoch": 0.777250045091084, "flos": 25009232659200.0, "grad_norm": 2.0799850288031463, "language_loss": 0.86169779, "learning_rate": 4.981408565063416e-07, "loss": 0.88305676, "num_input_tokens_seen": 139230750, "step": 6464, "time_per_iteration": 2.814223527908325 }, { "auxiliary_loss_clip": 0.01168085, "auxiliary_loss_mlp": 0.01022604, "balance_loss_clip": 1.04642463, "balance_loss_mlp": 1.01550794, "epoch": 0.777370287981723, "flos": 20119887319680.0, "grad_norm": 2.0715943766377056, "language_loss": 0.75847101, "learning_rate": 4.976265513489701e-07, "loss": 0.78037786, "num_input_tokens_seen": 139250720, "step": 6465, "time_per_iteration": 2.6299853324890137 }, { "auxiliary_loss_clip": 0.01150618, "auxiliary_loss_mlp": 0.01019793, "balance_loss_clip": 1.04261351, "balance_loss_mlp": 1.01306105, "epoch": 0.7774905308723622, "flos": 21718809331200.0, "grad_norm": 1.6822820347852592, "language_loss": 0.80462503, "learning_rate": 4.971124741004562e-07, "loss": 0.82632917, "num_input_tokens_seen": 139269720, "step": 6466, "time_per_iteration": 2.635477066040039 }, { "auxiliary_loss_clip": 0.01152104, "auxiliary_loss_mlp": 0.01020257, "balance_loss_clip": 1.04461551, "balance_loss_mlp": 1.01299095, "epoch": 0.7776107737630013, "flos": 16034115093120.0, "grad_norm": 1.796220729026924, "language_loss": 0.76739693, "learning_rate": 4.965986248387846e-07, "loss": 0.78912055, "num_input_tokens_seen": 139288035, "step": 6467, "time_per_iteration": 2.665466547012329 }, { "auxiliary_loss_clip": 0.01140994, "auxiliary_loss_mlp": 0.01024096, "balance_loss_clip": 1.04372537, "balance_loss_mlp": 1.01738214, "epoch": 0.7777310166536403, "flos": 24790895838720.0, "grad_norm": 1.6545632798177887, "language_loss": 0.7725265, "learning_rate": 4.960850036419073e-07, "loss": 0.79417741, "num_input_tokens_seen": 139307135, "step": 6468, "time_per_iteration": 2.6672067642211914 }, { "auxiliary_loss_clip": 0.01132018, "auxiliary_loss_mlp": 0.01025707, "balance_loss_clip": 1.04175663, "balance_loss_mlp": 1.0186379, "epoch": 0.7778512595442795, "flos": 17272530253440.0, "grad_norm": 1.7540391274234641, "language_loss": 0.78676045, "learning_rate": 4.955716105877378e-07, "loss": 0.80833769, "num_input_tokens_seen": 139325905, "step": 6469, "time_per_iteration": 2.7395763397216797 }, { "auxiliary_loss_clip": 0.01154851, "auxiliary_loss_mlp": 0.0076235, "balance_loss_clip": 1.04376721, "balance_loss_mlp": 1.00029695, "epoch": 0.7779715024349185, "flos": 17748418567680.0, "grad_norm": 1.8017653196162637, "language_loss": 0.83470154, "learning_rate": 4.950584457541598e-07, "loss": 0.85387355, "num_input_tokens_seen": 139344370, "step": 6470, "time_per_iteration": 2.6993916034698486 }, { "auxiliary_loss_clip": 0.01155753, "auxiliary_loss_mlp": 0.01025722, "balance_loss_clip": 1.04485857, "balance_loss_mlp": 1.01876044, "epoch": 0.7780917453255576, "flos": 24316875031680.0, "grad_norm": 1.4160150360587893, "language_loss": 0.82013917, "learning_rate": 4.945455092190183e-07, "loss": 0.84195393, "num_input_tokens_seen": 139365625, "step": 6471, "time_per_iteration": 2.6903605461120605 }, { "auxiliary_loss_clip": 0.01067996, "auxiliary_loss_mlp": 0.01001653, "balance_loss_clip": 1.01324511, "balance_loss_mlp": 1.00057971, "epoch": 0.7782119882161967, "flos": 56364601530240.0, "grad_norm": 0.6856111128833383, "language_loss": 0.55956912, "learning_rate": 4.940328010601271e-07, "loss": 0.58026558, "num_input_tokens_seen": 139430540, "step": 6472, "time_per_iteration": 3.1728127002716064 }, { "auxiliary_loss_clip": 0.01148671, "auxiliary_loss_mlp": 0.01029824, "balance_loss_clip": 1.04724932, "balance_loss_mlp": 1.02228153, "epoch": 0.7783322311068358, "flos": 46789986994560.0, "grad_norm": 1.7836299176374093, "language_loss": 0.76575935, "learning_rate": 4.935203213552621e-07, "loss": 0.78754431, "num_input_tokens_seen": 139454280, "step": 6473, "time_per_iteration": 2.8815653324127197 }, { "auxiliary_loss_clip": 0.01139833, "auxiliary_loss_mlp": 0.01027316, "balance_loss_clip": 1.04394925, "balance_loss_mlp": 1.02044106, "epoch": 0.7784524739974749, "flos": 19057864872960.0, "grad_norm": 2.2381279709341255, "language_loss": 0.6688993, "learning_rate": 4.930080701821662e-07, "loss": 0.69057077, "num_input_tokens_seen": 139471745, "step": 6474, "time_per_iteration": 2.6783008575439453 }, { "auxiliary_loss_clip": 0.01140352, "auxiliary_loss_mlp": 0.01023055, "balance_loss_clip": 1.04386652, "balance_loss_mlp": 1.01621211, "epoch": 0.778572716888114, "flos": 24791111320320.0, "grad_norm": 6.263478103286799, "language_loss": 0.77012241, "learning_rate": 4.92496047618548e-07, "loss": 0.79175651, "num_input_tokens_seen": 139491505, "step": 6475, "time_per_iteration": 2.72338604927063 }, { "auxiliary_loss_clip": 0.01155148, "auxiliary_loss_mlp": 0.01023402, "balance_loss_clip": 1.04690385, "balance_loss_mlp": 1.01625812, "epoch": 0.7786929597787531, "flos": 20078086867200.0, "grad_norm": 1.9834186106298857, "language_loss": 0.78220427, "learning_rate": 4.919842537420811e-07, "loss": 0.80398977, "num_input_tokens_seen": 139508620, "step": 6476, "time_per_iteration": 2.637674570083618 }, { "auxiliary_loss_clip": 0.01139471, "auxiliary_loss_mlp": 0.0102583, "balance_loss_clip": 1.04591179, "balance_loss_mlp": 1.01898694, "epoch": 0.7788132026693921, "flos": 21872220318720.0, "grad_norm": 1.6878529302025571, "language_loss": 0.79502815, "learning_rate": 4.91472688630404e-07, "loss": 0.81668115, "num_input_tokens_seen": 139529360, "step": 6477, "time_per_iteration": 2.6739304065704346 }, { "auxiliary_loss_clip": 0.01163541, "auxiliary_loss_mlp": 0.01024472, "balance_loss_clip": 1.04494286, "balance_loss_mlp": 1.01751947, "epoch": 0.7789334455600313, "flos": 11181937351680.0, "grad_norm": 1.8480577570858563, "language_loss": 0.74438775, "learning_rate": 4.909613523611202e-07, "loss": 0.7662679, "num_input_tokens_seen": 139546240, "step": 6478, "time_per_iteration": 2.6428725719451904 }, { "auxiliary_loss_clip": 0.01107386, "auxiliary_loss_mlp": 0.00762858, "balance_loss_clip": 1.03770041, "balance_loss_mlp": 1.00032306, "epoch": 0.7790536884506704, "flos": 28695427015680.0, "grad_norm": 1.7562949178445104, "language_loss": 0.74715012, "learning_rate": 4.904502450117991e-07, "loss": 0.76585257, "num_input_tokens_seen": 139567200, "step": 6479, "time_per_iteration": 3.8457181453704834 }, { "auxiliary_loss_clip": 0.01135774, "auxiliary_loss_mlp": 0.01028014, "balance_loss_clip": 1.04673386, "balance_loss_mlp": 1.02095652, "epoch": 0.7791739313413094, "flos": 11072302064640.0, "grad_norm": 2.455798611845857, "language_loss": 0.7288183, "learning_rate": 4.899393666599762e-07, "loss": 0.75045609, "num_input_tokens_seen": 139583775, "step": 6480, "time_per_iteration": 2.7839150428771973 }, { "auxiliary_loss_clip": 0.0116489, "auxiliary_loss_mlp": 0.01030578, "balance_loss_clip": 1.04376745, "balance_loss_mlp": 1.02408743, "epoch": 0.7792941742319486, "flos": 14679276975360.0, "grad_norm": 2.2856088138509976, "language_loss": 0.72678024, "learning_rate": 4.894287173831506e-07, "loss": 0.74873495, "num_input_tokens_seen": 139599735, "step": 6481, "time_per_iteration": 3.635589599609375 }, { "auxiliary_loss_clip": 0.01140352, "auxiliary_loss_mlp": 0.01029414, "balance_loss_clip": 1.04149628, "balance_loss_mlp": 1.02192187, "epoch": 0.7794144171225876, "flos": 23258874908160.0, "grad_norm": 1.9367567173263704, "language_loss": 0.84623909, "learning_rate": 4.889182972587877e-07, "loss": 0.86793679, "num_input_tokens_seen": 139619030, "step": 6482, "time_per_iteration": 2.7233073711395264 }, { "auxiliary_loss_clip": 0.01134007, "auxiliary_loss_mlp": 0.01025289, "balance_loss_clip": 1.04406214, "balance_loss_mlp": 1.01844668, "epoch": 0.7795346600132267, "flos": 21507080613120.0, "grad_norm": 1.9498208743933556, "language_loss": 0.66406345, "learning_rate": 4.884081063643177e-07, "loss": 0.68565637, "num_input_tokens_seen": 139637690, "step": 6483, "time_per_iteration": 3.57716965675354 }, { "auxiliary_loss_clip": 0.01038897, "auxiliary_loss_mlp": 0.01002983, "balance_loss_clip": 1.01105428, "balance_loss_mlp": 1.00189805, "epoch": 0.7796549029038659, "flos": 70052273694720.0, "grad_norm": 0.8726391741822886, "language_loss": 0.52546024, "learning_rate": 4.878981447771353e-07, "loss": 0.54587901, "num_input_tokens_seen": 139692070, "step": 6484, "time_per_iteration": 3.2176403999328613 }, { "auxiliary_loss_clip": 0.01118769, "auxiliary_loss_mlp": 0.01025413, "balance_loss_clip": 1.04198575, "balance_loss_mlp": 1.01792932, "epoch": 0.7797751457945049, "flos": 23989405714560.0, "grad_norm": 1.618157020421602, "language_loss": 0.73182797, "learning_rate": 4.873884125746035e-07, "loss": 0.75326979, "num_input_tokens_seen": 139713745, "step": 6485, "time_per_iteration": 3.6669607162475586 }, { "auxiliary_loss_clip": 0.01133003, "auxiliary_loss_mlp": 0.01025056, "balance_loss_clip": 1.0414927, "balance_loss_mlp": 1.0176053, "epoch": 0.779895388685144, "flos": 22674751937280.0, "grad_norm": 2.3527004457314376, "language_loss": 0.72136891, "learning_rate": 4.868789098340456e-07, "loss": 0.74294949, "num_input_tokens_seen": 139731650, "step": 6486, "time_per_iteration": 2.82139253616333 }, { "auxiliary_loss_clip": 0.01126032, "auxiliary_loss_mlp": 0.01023583, "balance_loss_clip": 1.04270244, "balance_loss_mlp": 1.0164957, "epoch": 0.7800156315757831, "flos": 23768698596480.0, "grad_norm": 2.394968700562239, "language_loss": 0.72802091, "learning_rate": 4.863696366327543e-07, "loss": 0.74951708, "num_input_tokens_seen": 139750820, "step": 6487, "time_per_iteration": 2.697099447250366 }, { "auxiliary_loss_clip": 0.01152109, "auxiliary_loss_mlp": 0.01020556, "balance_loss_clip": 1.04224277, "balance_loss_mlp": 1.01325476, "epoch": 0.7801358744664222, "flos": 26429714881920.0, "grad_norm": 1.6723277964858903, "language_loss": 0.77978295, "learning_rate": 4.85860593047986e-07, "loss": 0.80150962, "num_input_tokens_seen": 139770885, "step": 6488, "time_per_iteration": 2.652677059173584 }, { "auxiliary_loss_clip": 0.01116286, "auxiliary_loss_mlp": 0.01025155, "balance_loss_clip": 1.03719664, "balance_loss_mlp": 1.01813996, "epoch": 0.7802561173570612, "flos": 26322162583680.0, "grad_norm": 1.5805377911966227, "language_loss": 0.74991924, "learning_rate": 4.853517791569613e-07, "loss": 0.77133363, "num_input_tokens_seen": 139793065, "step": 6489, "time_per_iteration": 2.7648510932922363 }, { "auxiliary_loss_clip": 0.01141426, "auxiliary_loss_mlp": 0.00762949, "balance_loss_clip": 1.04247558, "balance_loss_mlp": 1.00031161, "epoch": 0.7803763602477004, "flos": 40333751596800.0, "grad_norm": 1.775369385709843, "language_loss": 0.66525555, "learning_rate": 4.848431950368684e-07, "loss": 0.68429923, "num_input_tokens_seen": 139815625, "step": 6490, "time_per_iteration": 2.832223653793335 }, { "auxiliary_loss_clip": 0.01067782, "auxiliary_loss_mlp": 0.00753583, "balance_loss_clip": 1.01309848, "balance_loss_mlp": 1.00021553, "epoch": 0.7804966031383395, "flos": 67001448038400.0, "grad_norm": 0.7416839678124272, "language_loss": 0.55685437, "learning_rate": 4.843348407648569e-07, "loss": 0.575068, "num_input_tokens_seen": 139876905, "step": 6491, "time_per_iteration": 3.132840633392334 }, { "auxiliary_loss_clip": 0.01154032, "auxiliary_loss_mlp": 0.01024296, "balance_loss_clip": 1.04165101, "balance_loss_mlp": 1.01688743, "epoch": 0.7806168460289785, "flos": 17740733057280.0, "grad_norm": 2.3123721522641643, "language_loss": 0.82773179, "learning_rate": 4.838267164180457e-07, "loss": 0.84951508, "num_input_tokens_seen": 139892575, "step": 6492, "time_per_iteration": 2.595829963684082 }, { "auxiliary_loss_clip": 0.0116886, "auxiliary_loss_mlp": 0.01026919, "balance_loss_clip": 1.0465076, "balance_loss_mlp": 1.01954639, "epoch": 0.7807370889196176, "flos": 23946240545280.0, "grad_norm": 1.979848602662472, "language_loss": 0.83643359, "learning_rate": 4.833188220735156e-07, "loss": 0.8583914, "num_input_tokens_seen": 139912245, "step": 6493, "time_per_iteration": 2.6299400329589844 }, { "auxiliary_loss_clip": 0.01148356, "auxiliary_loss_mlp": 0.01025825, "balance_loss_clip": 1.04276729, "balance_loss_mlp": 1.01899457, "epoch": 0.7808573318102567, "flos": 18989024457600.0, "grad_norm": 2.2131090536186164, "language_loss": 0.7434932, "learning_rate": 4.828111578083152e-07, "loss": 0.76523507, "num_input_tokens_seen": 139929150, "step": 6494, "time_per_iteration": 2.635835886001587 }, { "auxiliary_loss_clip": 0.01138668, "auxiliary_loss_mlp": 0.0102479, "balance_loss_clip": 1.04576337, "balance_loss_mlp": 1.01814365, "epoch": 0.7809775747008958, "flos": 23980750536960.0, "grad_norm": 2.005280747149861, "language_loss": 0.80942309, "learning_rate": 4.823037236994556e-07, "loss": 0.83105767, "num_input_tokens_seen": 139947315, "step": 6495, "time_per_iteration": 2.706821918487549 }, { "auxiliary_loss_clip": 0.01058834, "auxiliary_loss_mlp": 0.0100208, "balance_loss_clip": 1.01273489, "balance_loss_mlp": 1.00101936, "epoch": 0.7810978175915348, "flos": 68535875180160.0, "grad_norm": 0.7164227959020683, "language_loss": 0.56315351, "learning_rate": 4.817965198239136e-07, "loss": 0.58376265, "num_input_tokens_seen": 140013775, "step": 6496, "time_per_iteration": 3.2237048149108887 }, { "auxiliary_loss_clip": 0.01122918, "auxiliary_loss_mlp": 0.01027089, "balance_loss_clip": 1.04022777, "balance_loss_mlp": 1.02027369, "epoch": 0.781218060482174, "flos": 19642131498240.0, "grad_norm": 2.004018700319703, "language_loss": 0.74465728, "learning_rate": 4.812895462586331e-07, "loss": 0.76615733, "num_input_tokens_seen": 140031600, "step": 6497, "time_per_iteration": 2.754545211791992 }, { "auxiliary_loss_clip": 0.01128106, "auxiliary_loss_mlp": 0.01025823, "balance_loss_clip": 1.04246426, "balance_loss_mlp": 1.01907921, "epoch": 0.7813383033728131, "flos": 25627865621760.0, "grad_norm": 2.161221674182993, "language_loss": 0.82125139, "learning_rate": 4.807828030805207e-07, "loss": 0.84279072, "num_input_tokens_seen": 140050590, "step": 6498, "time_per_iteration": 2.741144895553589 }, { "auxiliary_loss_clip": 0.0115266, "auxiliary_loss_mlp": 0.01026925, "balance_loss_clip": 1.04734945, "balance_loss_mlp": 1.01968932, "epoch": 0.7814585462634521, "flos": 20485924865280.0, "grad_norm": 2.7148601169117326, "language_loss": 0.67761749, "learning_rate": 4.802762903664495e-07, "loss": 0.69941336, "num_input_tokens_seen": 140069770, "step": 6499, "time_per_iteration": 2.628065586090088 }, { "auxiliary_loss_clip": 0.01141958, "auxiliary_loss_mlp": 0.01024596, "balance_loss_clip": 1.04427123, "balance_loss_mlp": 1.0172056, "epoch": 0.7815787891540913, "flos": 22304297018880.0, "grad_norm": 2.4614267420154814, "language_loss": 0.73741066, "learning_rate": 4.797700081932565e-07, "loss": 0.75907624, "num_input_tokens_seen": 140087635, "step": 6500, "time_per_iteration": 2.685960292816162 }, { "auxiliary_loss_clip": 0.0109231, "auxiliary_loss_mlp": 0.01029253, "balance_loss_clip": 1.03771329, "balance_loss_mlp": 1.02210057, "epoch": 0.7816990320447303, "flos": 22600668136320.0, "grad_norm": 2.7368271045160637, "language_loss": 0.8148374, "learning_rate": 4.792639566377442e-07, "loss": 0.83605301, "num_input_tokens_seen": 140105045, "step": 6501, "time_per_iteration": 2.7704479694366455 }, { "auxiliary_loss_clip": 0.01144911, "auxiliary_loss_mlp": 0.01025459, "balance_loss_clip": 1.0402112, "balance_loss_mlp": 1.01784742, "epoch": 0.7818192749353694, "flos": 24935974871040.0, "grad_norm": 1.6336279600859456, "language_loss": 0.77348495, "learning_rate": 4.78758135776681e-07, "loss": 0.79518861, "num_input_tokens_seen": 140124900, "step": 6502, "time_per_iteration": 2.713123321533203 }, { "auxiliary_loss_clip": 0.01143787, "auxiliary_loss_mlp": 0.01026481, "balance_loss_clip": 1.04750299, "balance_loss_mlp": 1.01901817, "epoch": 0.7819395178260086, "flos": 23733039369600.0, "grad_norm": 2.137510511301502, "language_loss": 0.78671944, "learning_rate": 4.782525456867989e-07, "loss": 0.80842209, "num_input_tokens_seen": 140143755, "step": 6503, "time_per_iteration": 2.7096970081329346 }, { "auxiliary_loss_clip": 0.01128704, "auxiliary_loss_mlp": 0.01025174, "balance_loss_clip": 1.04381442, "balance_loss_mlp": 1.01729465, "epoch": 0.7820597607166476, "flos": 23221671396480.0, "grad_norm": 1.87347471338828, "language_loss": 0.83443689, "learning_rate": 4.777471864447959e-07, "loss": 0.85597569, "num_input_tokens_seen": 140164495, "step": 6504, "time_per_iteration": 2.7644662857055664 }, { "auxiliary_loss_clip": 0.01141083, "auxiliary_loss_mlp": 0.01026369, "balance_loss_clip": 1.04327416, "balance_loss_mlp": 1.01905584, "epoch": 0.7821800036072867, "flos": 22309540404480.0, "grad_norm": 2.0412351970810523, "language_loss": 0.80360079, "learning_rate": 4.772420581273344e-07, "loss": 0.8252753, "num_input_tokens_seen": 140181980, "step": 6505, "time_per_iteration": 3.649690866470337 }, { "auxiliary_loss_clip": 0.01149532, "auxiliary_loss_mlp": 0.01022908, "balance_loss_clip": 1.0452559, "balance_loss_mlp": 1.01576114, "epoch": 0.7823002464979258, "flos": 21544176384000.0, "grad_norm": 2.02620607906541, "language_loss": 0.7657876, "learning_rate": 4.7673716081104134e-07, "loss": 0.78751194, "num_input_tokens_seen": 140202155, "step": 6506, "time_per_iteration": 2.739870309829712 }, { "auxiliary_loss_clip": 0.01151386, "auxiliary_loss_mlp": 0.01024134, "balance_loss_clip": 1.04672098, "balance_loss_mlp": 1.01754165, "epoch": 0.7824204893885649, "flos": 24535642815360.0, "grad_norm": 1.7339916883145055, "language_loss": 0.84749007, "learning_rate": 4.762324945725109e-07, "loss": 0.86924529, "num_input_tokens_seen": 140221600, "step": 6507, "time_per_iteration": 3.708195447921753 }, { "auxiliary_loss_clip": 0.01136269, "auxiliary_loss_mlp": 0.01028393, "balance_loss_clip": 1.04586339, "balance_loss_mlp": 1.02120435, "epoch": 0.782540732279204, "flos": 27415211402880.0, "grad_norm": 1.6707353165861734, "language_loss": 0.75711691, "learning_rate": 4.7572805948829844e-07, "loss": 0.77876353, "num_input_tokens_seen": 140241860, "step": 6508, "time_per_iteration": 3.6458725929260254 }, { "auxiliary_loss_clip": 0.01115593, "auxiliary_loss_mlp": 0.01022723, "balance_loss_clip": 1.04082417, "balance_loss_mlp": 1.01644051, "epoch": 0.7826609751698431, "flos": 24353216616960.0, "grad_norm": 1.7284796602822707, "language_loss": 0.70950717, "learning_rate": 4.7522385563492795e-07, "loss": 0.73089033, "num_input_tokens_seen": 140262160, "step": 6509, "time_per_iteration": 2.7927610874176025 }, { "auxiliary_loss_clip": 0.01129312, "auxiliary_loss_mlp": 0.01027456, "balance_loss_clip": 1.04428709, "balance_loss_mlp": 1.02054191, "epoch": 0.7827812180604822, "flos": 23988543788160.0, "grad_norm": 2.131050588258103, "language_loss": 0.70212454, "learning_rate": 4.747198830888863e-07, "loss": 0.72369218, "num_input_tokens_seen": 140282030, "step": 6510, "time_per_iteration": 2.6927080154418945 }, { "auxiliary_loss_clip": 0.01134704, "auxiliary_loss_mlp": 0.01032845, "balance_loss_clip": 1.04248953, "balance_loss_mlp": 1.02527809, "epoch": 0.7829014609511212, "flos": 27454318335360.0, "grad_norm": 2.4557858302003663, "language_loss": 0.68832856, "learning_rate": 4.742161419266251e-07, "loss": 0.71000403, "num_input_tokens_seen": 140301190, "step": 6511, "time_per_iteration": 3.5797083377838135 }, { "auxiliary_loss_clip": 0.01156082, "auxiliary_loss_mlp": 0.01025296, "balance_loss_clip": 1.04555655, "balance_loss_mlp": 1.01780701, "epoch": 0.7830217038417604, "flos": 29204532432000.0, "grad_norm": 2.362803190038171, "language_loss": 0.65238452, "learning_rate": 4.7371263222456304e-07, "loss": 0.67419827, "num_input_tokens_seen": 140318510, "step": 6512, "time_per_iteration": 2.850525379180908 }, { "auxiliary_loss_clip": 0.01052182, "auxiliary_loss_mlp": 0.01002249, "balance_loss_clip": 1.01057458, "balance_loss_mlp": 1.00121212, "epoch": 0.7831419467323995, "flos": 60950895822720.0, "grad_norm": 0.8035402461808099, "language_loss": 0.61378908, "learning_rate": 4.7320935405908004e-07, "loss": 0.63433337, "num_input_tokens_seen": 140379380, "step": 6513, "time_per_iteration": 3.212474822998047 }, { "auxiliary_loss_clip": 0.01169045, "auxiliary_loss_mlp": 0.01022777, "balance_loss_clip": 1.04531562, "balance_loss_mlp": 1.01534402, "epoch": 0.7832621896230385, "flos": 19682531320320.0, "grad_norm": 2.646457061383335, "language_loss": 0.84070718, "learning_rate": 4.7270630750652475e-07, "loss": 0.86262548, "num_input_tokens_seen": 140395335, "step": 6514, "time_per_iteration": 2.5818753242492676 }, { "auxiliary_loss_clip": 0.01148263, "auxiliary_loss_mlp": 0.01024398, "balance_loss_clip": 1.04314542, "balance_loss_mlp": 1.01729023, "epoch": 0.7833824325136777, "flos": 25009232659200.0, "grad_norm": 1.7583176131906804, "language_loss": 0.80288374, "learning_rate": 4.7220349264320746e-07, "loss": 0.82461035, "num_input_tokens_seen": 140414420, "step": 6515, "time_per_iteration": 2.736367702484131 }, { "auxiliary_loss_clip": 0.01056253, "auxiliary_loss_mlp": 0.01002319, "balance_loss_clip": 1.01291633, "balance_loss_mlp": 1.00118685, "epoch": 0.7835026754043167, "flos": 68800142517120.0, "grad_norm": 0.7381798755897361, "language_loss": 0.54850918, "learning_rate": 4.71700909545407e-07, "loss": 0.5690949, "num_input_tokens_seen": 140477365, "step": 6516, "time_per_iteration": 3.216501235961914 }, { "auxiliary_loss_clip": 0.01155581, "auxiliary_loss_mlp": 0.01023199, "balance_loss_clip": 1.04490209, "balance_loss_mlp": 1.01616311, "epoch": 0.7836229182949558, "flos": 19864598382720.0, "grad_norm": 1.7435618883171538, "language_loss": 0.76963866, "learning_rate": 4.711985582893627e-07, "loss": 0.79142648, "num_input_tokens_seen": 140495885, "step": 6517, "time_per_iteration": 2.604694128036499 }, { "auxiliary_loss_clip": 0.01113355, "auxiliary_loss_mlp": 0.01022746, "balance_loss_clip": 1.03974986, "balance_loss_mlp": 1.01538157, "epoch": 0.783743161185595, "flos": 22965843755520.0, "grad_norm": 1.702977391361563, "language_loss": 0.71579391, "learning_rate": 4.706964389512811e-07, "loss": 0.73715496, "num_input_tokens_seen": 140515920, "step": 6518, "time_per_iteration": 2.7204086780548096 }, { "auxiliary_loss_clip": 0.01168441, "auxiliary_loss_mlp": 0.01029594, "balance_loss_clip": 1.04993188, "balance_loss_mlp": 1.02176809, "epoch": 0.783863404076234, "flos": 12458489777280.0, "grad_norm": 1.929812012400059, "language_loss": 0.87469423, "learning_rate": 4.701945516073345e-07, "loss": 0.89667463, "num_input_tokens_seen": 140533395, "step": 6519, "time_per_iteration": 2.597367525100708 }, { "auxiliary_loss_clip": 0.01121524, "auxiliary_loss_mlp": 0.01024113, "balance_loss_clip": 1.04178286, "balance_loss_mlp": 1.01675785, "epoch": 0.7839836469668731, "flos": 24243940465920.0, "grad_norm": 1.937925012022678, "language_loss": 0.75177819, "learning_rate": 4.696928963336577e-07, "loss": 0.77323449, "num_input_tokens_seen": 140552825, "step": 6520, "time_per_iteration": 2.6953039169311523 }, { "auxiliary_loss_clip": 0.01052418, "auxiliary_loss_mlp": 0.01002183, "balance_loss_clip": 1.01071143, "balance_loss_mlp": 1.00112247, "epoch": 0.7841038898575122, "flos": 62121978938880.0, "grad_norm": 0.8514675553265723, "language_loss": 0.61002588, "learning_rate": 4.6919147320635224e-07, "loss": 0.63057184, "num_input_tokens_seen": 140615535, "step": 6521, "time_per_iteration": 3.2307302951812744 }, { "auxiliary_loss_clip": 0.01153539, "auxiliary_loss_mlp": 0.01030485, "balance_loss_clip": 1.04268312, "balance_loss_mlp": 1.02346635, "epoch": 0.7842241327481513, "flos": 20193899293440.0, "grad_norm": 2.3458921208731462, "language_loss": 0.72891557, "learning_rate": 4.6869028230148286e-07, "loss": 0.75075579, "num_input_tokens_seen": 140633330, "step": 6522, "time_per_iteration": 2.634263038635254 }, { "auxiliary_loss_clip": 0.01117245, "auxiliary_loss_mlp": 0.01025528, "balance_loss_clip": 1.03807116, "balance_loss_mlp": 1.01718295, "epoch": 0.7843443756387903, "flos": 28074531496320.0, "grad_norm": 2.188934808958748, "language_loss": 0.59796172, "learning_rate": 4.6818932369507957e-07, "loss": 0.61938941, "num_input_tokens_seen": 140652830, "step": 6523, "time_per_iteration": 2.81190824508667 }, { "auxiliary_loss_clip": 0.01154284, "auxiliary_loss_mlp": 0.01027161, "balance_loss_clip": 1.04728448, "balance_loss_mlp": 1.02010727, "epoch": 0.7844646185294295, "flos": 21323397438720.0, "grad_norm": 2.2289330118390343, "language_loss": 0.89438963, "learning_rate": 4.676885974631386e-07, "loss": 0.91620398, "num_input_tokens_seen": 140671190, "step": 6524, "time_per_iteration": 2.6719560623168945 }, { "auxiliary_loss_clip": 0.01150863, "auxiliary_loss_mlp": 0.01028565, "balance_loss_clip": 1.04421163, "balance_loss_mlp": 1.02145684, "epoch": 0.7845848614200686, "flos": 23656585271040.0, "grad_norm": 1.9566310791697055, "language_loss": 0.81413531, "learning_rate": 4.67188103681619e-07, "loss": 0.83592963, "num_input_tokens_seen": 140690975, "step": 6525, "time_per_iteration": 2.6624062061309814 }, { "auxiliary_loss_clip": 0.01148986, "auxiliary_loss_mlp": 0.00762025, "balance_loss_clip": 1.04680371, "balance_loss_mlp": 1.00033152, "epoch": 0.7847051043107076, "flos": 23402194174080.0, "grad_norm": 2.05880972361832, "language_loss": 0.69340259, "learning_rate": 4.666878424264453e-07, "loss": 0.71251267, "num_input_tokens_seen": 140710930, "step": 6526, "time_per_iteration": 2.6948983669281006 }, { "auxiliary_loss_clip": 0.01130694, "auxiliary_loss_mlp": 0.01022389, "balance_loss_clip": 1.04281449, "balance_loss_mlp": 1.01589537, "epoch": 0.7848253472013467, "flos": 19022277473280.0, "grad_norm": 1.5703116274649527, "language_loss": 0.73804486, "learning_rate": 4.661878137735069e-07, "loss": 0.75957566, "num_input_tokens_seen": 140729120, "step": 6527, "time_per_iteration": 2.6417665481567383 }, { "auxiliary_loss_clip": 0.011386, "auxiliary_loss_mlp": 0.01024925, "balance_loss_clip": 1.044608, "balance_loss_mlp": 1.01849961, "epoch": 0.7849455900919858, "flos": 21179180332800.0, "grad_norm": 1.8874763645142707, "language_loss": 0.75004268, "learning_rate": 4.656880177986571e-07, "loss": 0.77167785, "num_input_tokens_seen": 140747665, "step": 6528, "time_per_iteration": 2.7247180938720703 }, { "auxiliary_loss_clip": 0.01140943, "auxiliary_loss_mlp": 0.01031669, "balance_loss_clip": 1.04186249, "balance_loss_mlp": 1.02439106, "epoch": 0.7850658329826249, "flos": 19536482620800.0, "grad_norm": 1.8036284286668014, "language_loss": 0.8152957, "learning_rate": 4.6518845457771607e-07, "loss": 0.83702177, "num_input_tokens_seen": 140766525, "step": 6529, "time_per_iteration": 2.621004819869995 }, { "auxiliary_loss_clip": 0.01146764, "auxiliary_loss_mlp": 0.00761985, "balance_loss_clip": 1.04415882, "balance_loss_mlp": 1.00033748, "epoch": 0.7851860758732639, "flos": 12495334152960.0, "grad_norm": 1.8650112955659495, "language_loss": 0.79178798, "learning_rate": 4.646891241864652e-07, "loss": 0.81087542, "num_input_tokens_seen": 140785090, "step": 6530, "time_per_iteration": 2.6424753665924072 }, { "auxiliary_loss_clip": 0.0114799, "auxiliary_loss_mlp": 0.01029023, "balance_loss_clip": 1.04147577, "balance_loss_mlp": 1.02162933, "epoch": 0.7853063187639031, "flos": 22960959505920.0, "grad_norm": 2.2584725423911847, "language_loss": 0.73293316, "learning_rate": 4.6419002670065397e-07, "loss": 0.75470328, "num_input_tokens_seen": 140804670, "step": 6531, "time_per_iteration": 3.6408517360687256 }, { "auxiliary_loss_clip": 0.01128906, "auxiliary_loss_mlp": 0.01021468, "balance_loss_clip": 1.04347396, "balance_loss_mlp": 1.01436865, "epoch": 0.7854265616545422, "flos": 17347260499200.0, "grad_norm": 2.0470471342488112, "language_loss": 0.87063831, "learning_rate": 4.6369116219599445e-07, "loss": 0.89214206, "num_input_tokens_seen": 140820655, "step": 6532, "time_per_iteration": 2.634772777557373 }, { "auxiliary_loss_clip": 0.01123772, "auxiliary_loss_mlp": 0.01025043, "balance_loss_clip": 1.04215527, "balance_loss_mlp": 1.01783085, "epoch": 0.7855468045451812, "flos": 23838293197440.0, "grad_norm": 1.8346912960306412, "language_loss": 0.7917468, "learning_rate": 4.631925307481637e-07, "loss": 0.81323493, "num_input_tokens_seen": 140840470, "step": 6533, "time_per_iteration": 3.7489569187164307 }, { "auxiliary_loss_clip": 0.01137624, "auxiliary_loss_mlp": 0.0102293, "balance_loss_clip": 1.04461479, "balance_loss_mlp": 1.01648378, "epoch": 0.7856670474358204, "flos": 25666792986240.0, "grad_norm": 2.351012759212642, "language_loss": 0.7571286, "learning_rate": 4.6269413243280533e-07, "loss": 0.77873409, "num_input_tokens_seen": 140859890, "step": 6534, "time_per_iteration": 3.6823599338531494 }, { "auxiliary_loss_clip": 0.01142354, "auxiliary_loss_mlp": 0.01025216, "balance_loss_clip": 1.04485023, "balance_loss_mlp": 1.01776004, "epoch": 0.7857872903264594, "flos": 18144656472960.0, "grad_norm": 2.643578500758141, "language_loss": 0.74123603, "learning_rate": 4.621959673255236e-07, "loss": 0.76291174, "num_input_tokens_seen": 140876190, "step": 6535, "time_per_iteration": 2.740847110748291 }, { "auxiliary_loss_clip": 0.01107053, "auxiliary_loss_mlp": 0.01023079, "balance_loss_clip": 1.03989673, "balance_loss_mlp": 1.015944, "epoch": 0.7859075332170985, "flos": 14386138081920.0, "grad_norm": 1.9743967525569641, "language_loss": 0.90623361, "learning_rate": 4.6169803550189135e-07, "loss": 0.92753488, "num_input_tokens_seen": 140891885, "step": 6536, "time_per_iteration": 2.7829766273498535 }, { "auxiliary_loss_clip": 0.01103936, "auxiliary_loss_mlp": 0.01022835, "balance_loss_clip": 1.04109764, "balance_loss_mlp": 1.01564121, "epoch": 0.7860277761077377, "flos": 19864059678720.0, "grad_norm": 2.7165200151715707, "language_loss": 0.77474707, "learning_rate": 4.6120033703744355e-07, "loss": 0.79601473, "num_input_tokens_seen": 140910780, "step": 6537, "time_per_iteration": 3.644303560256958 }, { "auxiliary_loss_clip": 0.01128731, "auxiliary_loss_mlp": 0.01026819, "balance_loss_clip": 1.04113722, "balance_loss_mlp": 1.01999998, "epoch": 0.7861480189983767, "flos": 26396174557440.0, "grad_norm": 2.1626602532023886, "language_loss": 0.78468531, "learning_rate": 4.607028720076822e-07, "loss": 0.8062408, "num_input_tokens_seen": 140927460, "step": 6538, "time_per_iteration": 2.719326972961426 }, { "auxiliary_loss_clip": 0.01151668, "auxiliary_loss_mlp": 0.01022403, "balance_loss_clip": 1.04510629, "balance_loss_mlp": 1.01529825, "epoch": 0.7862682618890158, "flos": 24236578177920.0, "grad_norm": 2.009243439209818, "language_loss": 0.73821133, "learning_rate": 4.6020564048807074e-07, "loss": 0.75995207, "num_input_tokens_seen": 140945135, "step": 6539, "time_per_iteration": 2.649303913116455 }, { "auxiliary_loss_clip": 0.0115164, "auxiliary_loss_mlp": 0.01023582, "balance_loss_clip": 1.04424453, "balance_loss_mlp": 1.01654232, "epoch": 0.7863885047796549, "flos": 47551508259840.0, "grad_norm": 2.0476943998704877, "language_loss": 0.72336972, "learning_rate": 4.5970864255403883e-07, "loss": 0.74512196, "num_input_tokens_seen": 140966660, "step": 6540, "time_per_iteration": 2.8656511306762695 }, { "auxiliary_loss_clip": 0.01142652, "auxiliary_loss_mlp": 0.01022248, "balance_loss_clip": 1.04286623, "balance_loss_mlp": 1.01573014, "epoch": 0.786508747670294, "flos": 24389234979840.0, "grad_norm": 2.138915679509711, "language_loss": 0.82185066, "learning_rate": 4.59211878280982e-07, "loss": 0.84349966, "num_input_tokens_seen": 140986175, "step": 6541, "time_per_iteration": 2.7298736572265625 }, { "auxiliary_loss_clip": 0.01141152, "auxiliary_loss_mlp": 0.010269, "balance_loss_clip": 1.04435539, "balance_loss_mlp": 1.01957464, "epoch": 0.786628990560933, "flos": 18041234238720.0, "grad_norm": 2.3896103460293086, "language_loss": 0.70372742, "learning_rate": 4.587153477442578e-07, "loss": 0.7254079, "num_input_tokens_seen": 141002490, "step": 6542, "time_per_iteration": 2.6405084133148193 }, { "auxiliary_loss_clip": 0.01170431, "auxiliary_loss_mlp": 0.01036391, "balance_loss_clip": 1.04768038, "balance_loss_mlp": 1.02875245, "epoch": 0.7867492334515722, "flos": 25848860048640.0, "grad_norm": 2.1652174479655764, "language_loss": 0.81551778, "learning_rate": 4.582190510191899e-07, "loss": 0.83758593, "num_input_tokens_seen": 141021150, "step": 6543, "time_per_iteration": 2.7101471424102783 }, { "auxiliary_loss_clip": 0.01121926, "auxiliary_loss_mlp": 0.01027424, "balance_loss_clip": 1.04262877, "balance_loss_mlp": 1.02078438, "epoch": 0.7868694763422113, "flos": 16580819070720.0, "grad_norm": 2.0619018079848894, "language_loss": 0.87624788, "learning_rate": 4.5772298818106625e-07, "loss": 0.89774138, "num_input_tokens_seen": 141036940, "step": 6544, "time_per_iteration": 2.655843496322632 }, { "auxiliary_loss_clip": 0.0113278, "auxiliary_loss_mlp": 0.01028974, "balance_loss_clip": 1.04543138, "balance_loss_mlp": 1.02148795, "epoch": 0.7869897192328503, "flos": 29386276272000.0, "grad_norm": 4.663540698872355, "language_loss": 0.7158742, "learning_rate": 4.572271593051384e-07, "loss": 0.73749173, "num_input_tokens_seen": 141054295, "step": 6545, "time_per_iteration": 2.718353033065796 }, { "auxiliary_loss_clip": 0.0110284, "auxiliary_loss_mlp": 0.01029868, "balance_loss_clip": 1.04103041, "balance_loss_mlp": 1.02253985, "epoch": 0.7871099621234895, "flos": 17128923678720.0, "grad_norm": 1.5556034695510152, "language_loss": 0.78277373, "learning_rate": 4.567315644666245e-07, "loss": 0.80410075, "num_input_tokens_seen": 141073090, "step": 6546, "time_per_iteration": 2.7910475730895996 }, { "auxiliary_loss_clip": 0.01117721, "auxiliary_loss_mlp": 0.01031127, "balance_loss_clip": 1.04316914, "balance_loss_mlp": 1.02477646, "epoch": 0.7872302050141285, "flos": 23440187784960.0, "grad_norm": 1.8621165870092966, "language_loss": 0.84666979, "learning_rate": 4.5623620374070507e-07, "loss": 0.86815828, "num_input_tokens_seen": 141092405, "step": 6547, "time_per_iteration": 2.7623181343078613 }, { "auxiliary_loss_clip": 0.0103324, "auxiliary_loss_mlp": 0.01002075, "balance_loss_clip": 1.01081991, "balance_loss_mlp": 1.00098419, "epoch": 0.7873504479047676, "flos": 65959752689280.0, "grad_norm": 0.9866431168711856, "language_loss": 0.58348221, "learning_rate": 4.557410772025263e-07, "loss": 0.60383534, "num_input_tokens_seen": 141154355, "step": 6548, "time_per_iteration": 3.3675191402435303 }, { "auxiliary_loss_clip": 0.01135339, "auxiliary_loss_mlp": 0.01025255, "balance_loss_clip": 1.04200256, "balance_loss_mlp": 1.01813495, "epoch": 0.7874706907954068, "flos": 23258336204160.0, "grad_norm": 2.066006259179901, "language_loss": 0.66360658, "learning_rate": 4.5524618492719803e-07, "loss": 0.68521249, "num_input_tokens_seen": 141173575, "step": 6549, "time_per_iteration": 2.6847474575042725 }, { "auxiliary_loss_clip": 0.01151731, "auxiliary_loss_mlp": 0.01022695, "balance_loss_clip": 1.04375494, "balance_loss_mlp": 1.01572442, "epoch": 0.7875909336860458, "flos": 28767786963840.0, "grad_norm": 1.5294075871146862, "language_loss": 0.79010028, "learning_rate": 4.54751526989795e-07, "loss": 0.81184453, "num_input_tokens_seen": 141195415, "step": 6550, "time_per_iteration": 2.824073314666748 }, { "auxiliary_loss_clip": 0.01156235, "auxiliary_loss_mlp": 0.01022516, "balance_loss_clip": 1.04496276, "balance_loss_mlp": 1.01473737, "epoch": 0.7877111765766849, "flos": 18697286194560.0, "grad_norm": 1.9889939881377474, "language_loss": 0.7950635, "learning_rate": 4.5425710346535775e-07, "loss": 0.81685096, "num_input_tokens_seen": 141213360, "step": 6551, "time_per_iteration": 2.6355676651000977 }, { "auxiliary_loss_clip": 0.01156237, "auxiliary_loss_mlp": 0.01026441, "balance_loss_clip": 1.04492104, "balance_loss_mlp": 1.0196346, "epoch": 0.787831419467324, "flos": 27592968833280.0, "grad_norm": 1.9672284385624146, "language_loss": 0.81780326, "learning_rate": 4.537629144288877e-07, "loss": 0.83963013, "num_input_tokens_seen": 141230815, "step": 6552, "time_per_iteration": 2.662985324859619 }, { "auxiliary_loss_clip": 0.01117042, "auxiliary_loss_mlp": 0.01025404, "balance_loss_clip": 1.03891873, "balance_loss_mlp": 1.01873112, "epoch": 0.7879516623579631, "flos": 18150187167360.0, "grad_norm": 1.8666838725698773, "language_loss": 0.7503131, "learning_rate": 4.5326895995535477e-07, "loss": 0.77173758, "num_input_tokens_seen": 141249715, "step": 6553, "time_per_iteration": 2.7002344131469727 }, { "auxiliary_loss_clip": 0.01150124, "auxiliary_loss_mlp": 0.01023505, "balance_loss_clip": 1.04413927, "balance_loss_mlp": 1.01643038, "epoch": 0.7880719052486022, "flos": 20339193807360.0, "grad_norm": 2.1727990837310536, "language_loss": 0.84388721, "learning_rate": 4.527752401196907e-07, "loss": 0.86562359, "num_input_tokens_seen": 141267730, "step": 6554, "time_per_iteration": 2.6406917572021484 }, { "auxiliary_loss_clip": 0.01132671, "auxiliary_loss_mlp": 0.01023317, "balance_loss_clip": 1.04241252, "balance_loss_mlp": 1.01650119, "epoch": 0.7881921481392413, "flos": 21653237053440.0, "grad_norm": 1.8793056401779376, "language_loss": 0.66952729, "learning_rate": 4.5228175499679254e-07, "loss": 0.69108713, "num_input_tokens_seen": 141287315, "step": 6555, "time_per_iteration": 2.619337797164917 }, { "auxiliary_loss_clip": 0.01056679, "auxiliary_loss_mlp": 0.01001159, "balance_loss_clip": 1.01170564, "balance_loss_mlp": 1.00004995, "epoch": 0.7883123910298804, "flos": 68565860058240.0, "grad_norm": 0.8149922310657656, "language_loss": 0.54506767, "learning_rate": 4.5178850466152174e-07, "loss": 0.56564605, "num_input_tokens_seen": 141346145, "step": 6556, "time_per_iteration": 3.3280327320098877 }, { "auxiliary_loss_clip": 0.01133662, "auxiliary_loss_mlp": 0.01029994, "balance_loss_clip": 1.04099798, "balance_loss_mlp": 1.02297235, "epoch": 0.7884326339205194, "flos": 19318217627520.0, "grad_norm": 1.817638445008885, "language_loss": 0.81955147, "learning_rate": 4.512954891887031e-07, "loss": 0.84118807, "num_input_tokens_seen": 141364445, "step": 6557, "time_per_iteration": 3.548931360244751 }, { "auxiliary_loss_clip": 0.01131631, "auxiliary_loss_mlp": 0.01029984, "balance_loss_clip": 1.04338217, "balance_loss_mlp": 1.02232528, "epoch": 0.7885528768111585, "flos": 17784903807360.0, "grad_norm": 2.447038106571842, "language_loss": 0.83515429, "learning_rate": 4.5080270865312806e-07, "loss": 0.8567704, "num_input_tokens_seen": 141381640, "step": 6558, "time_per_iteration": 3.563401222229004 }, { "auxiliary_loss_clip": 0.0115118, "auxiliary_loss_mlp": 0.01034699, "balance_loss_clip": 1.04364157, "balance_loss_mlp": 1.02793384, "epoch": 0.7886731197017977, "flos": 18807639753600.0, "grad_norm": 1.8815182859597837, "language_loss": 0.71232963, "learning_rate": 4.5031016312954985e-07, "loss": 0.73418844, "num_input_tokens_seen": 141399955, "step": 6559, "time_per_iteration": 2.6551735401153564 }, { "auxiliary_loss_clip": 0.01162075, "auxiliary_loss_mlp": 0.01025009, "balance_loss_clip": 1.04860187, "balance_loss_mlp": 1.01706338, "epoch": 0.7887933625924367, "flos": 33365358126720.0, "grad_norm": 2.286705896198435, "language_loss": 0.74358279, "learning_rate": 4.498178526926886e-07, "loss": 0.76545364, "num_input_tokens_seen": 141420820, "step": 6560, "time_per_iteration": 3.534046173095703 }, { "auxiliary_loss_clip": 0.01164854, "auxiliary_loss_mlp": 0.01034973, "balance_loss_clip": 1.04638696, "balance_loss_mlp": 1.02810609, "epoch": 0.7889136054830758, "flos": 17019360218880.0, "grad_norm": 2.1882539855749945, "language_loss": 0.72542822, "learning_rate": 4.4932577741722635e-07, "loss": 0.74742651, "num_input_tokens_seen": 141439350, "step": 6561, "time_per_iteration": 2.611114978790283 }, { "auxiliary_loss_clip": 0.01136127, "auxiliary_loss_mlp": 0.01021298, "balance_loss_clip": 1.04255438, "balance_loss_mlp": 1.01417542, "epoch": 0.7890338483737149, "flos": 29424629018880.0, "grad_norm": 1.6073754913268334, "language_loss": 0.74488145, "learning_rate": 4.4883393737780985e-07, "loss": 0.76645565, "num_input_tokens_seen": 141460300, "step": 6562, "time_per_iteration": 3.726135015487671 }, { "auxiliary_loss_clip": 0.01146444, "auxiliary_loss_mlp": 0.01026647, "balance_loss_clip": 1.04271138, "balance_loss_mlp": 1.01967072, "epoch": 0.789154091264354, "flos": 19971576063360.0, "grad_norm": 2.3646061175033113, "language_loss": 0.78332841, "learning_rate": 4.4834233264905254e-07, "loss": 0.80505931, "num_input_tokens_seen": 141477315, "step": 6563, "time_per_iteration": 2.656501293182373 }, { "auxiliary_loss_clip": 0.01118198, "auxiliary_loss_mlp": 0.01027295, "balance_loss_clip": 1.0412817, "balance_loss_mlp": 1.01908731, "epoch": 0.789274334154993, "flos": 14537825216640.0, "grad_norm": 2.761064206611322, "language_loss": 0.72229016, "learning_rate": 4.478509633055294e-07, "loss": 0.74374509, "num_input_tokens_seen": 141495025, "step": 6564, "time_per_iteration": 2.691890239715576 }, { "auxiliary_loss_clip": 0.01141516, "auxiliary_loss_mlp": 0.01032135, "balance_loss_clip": 1.04269898, "balance_loss_mlp": 1.02426434, "epoch": 0.7893945770456322, "flos": 21827403123840.0, "grad_norm": 3.7766899694245737, "language_loss": 0.79584867, "learning_rate": 4.473598294217813e-07, "loss": 0.81758523, "num_input_tokens_seen": 141510450, "step": 6565, "time_per_iteration": 2.7199668884277344 }, { "auxiliary_loss_clip": 0.0115106, "auxiliary_loss_mlp": 0.01019835, "balance_loss_clip": 1.04574788, "balance_loss_mlp": 1.01328707, "epoch": 0.7895148199362713, "flos": 20740639184640.0, "grad_norm": 2.0829669840529634, "language_loss": 0.71817136, "learning_rate": 4.468689310723124e-07, "loss": 0.73988026, "num_input_tokens_seen": 141528265, "step": 6566, "time_per_iteration": 2.5981650352478027 }, { "auxiliary_loss_clip": 0.0112801, "auxiliary_loss_mlp": 0.01027744, "balance_loss_clip": 1.04102254, "balance_loss_mlp": 1.02102959, "epoch": 0.7896350628269103, "flos": 16690669839360.0, "grad_norm": 1.8233368040251934, "language_loss": 0.78604758, "learning_rate": 4.463782683315913e-07, "loss": 0.80760515, "num_input_tokens_seen": 141547270, "step": 6567, "time_per_iteration": 2.736029863357544 }, { "auxiliary_loss_clip": 0.01165053, "auxiliary_loss_mlp": 0.01022007, "balance_loss_clip": 1.04595351, "balance_loss_mlp": 1.01503932, "epoch": 0.7897553057175495, "flos": 22638374438400.0, "grad_norm": 1.6596794948853546, "language_loss": 0.73316944, "learning_rate": 4.458878412740523e-07, "loss": 0.75504005, "num_input_tokens_seen": 141566050, "step": 6568, "time_per_iteration": 2.5992326736450195 }, { "auxiliary_loss_clip": 0.01151287, "auxiliary_loss_mlp": 0.01026369, "balance_loss_clip": 1.04574108, "balance_loss_mlp": 1.01936805, "epoch": 0.7898755486081885, "flos": 14537573821440.0, "grad_norm": 2.5278020281825526, "language_loss": 0.77608633, "learning_rate": 4.453976499740919e-07, "loss": 0.79786289, "num_input_tokens_seen": 141583695, "step": 6569, "time_per_iteration": 2.7377381324768066 }, { "auxiliary_loss_clip": 0.01151164, "auxiliary_loss_mlp": 0.01024438, "balance_loss_clip": 1.04572093, "balance_loss_mlp": 1.0171659, "epoch": 0.7899957914988276, "flos": 17238487138560.0, "grad_norm": 1.9272554374408004, "language_loss": 0.778036, "learning_rate": 4.4490769450607215e-07, "loss": 0.79979205, "num_input_tokens_seen": 141601320, "step": 6570, "time_per_iteration": 2.5805087089538574 }, { "auxiliary_loss_clip": 0.01120397, "auxiliary_loss_mlp": 0.01032536, "balance_loss_clip": 1.03802979, "balance_loss_mlp": 1.02529407, "epoch": 0.7901160343894668, "flos": 41279351086080.0, "grad_norm": 2.1183495770997682, "language_loss": 0.72867256, "learning_rate": 4.4441797494431845e-07, "loss": 0.75020194, "num_input_tokens_seen": 141623125, "step": 6571, "time_per_iteration": 2.885620355606079 }, { "auxiliary_loss_clip": 0.01149752, "auxiliary_loss_mlp": 0.01025505, "balance_loss_clip": 1.04517829, "balance_loss_mlp": 1.01845074, "epoch": 0.7902362772801058, "flos": 16837005847680.0, "grad_norm": 2.2991869277648824, "language_loss": 0.77602798, "learning_rate": 4.439284913631207e-07, "loss": 0.79778051, "num_input_tokens_seen": 141640335, "step": 6572, "time_per_iteration": 2.612126350402832 }, { "auxiliary_loss_clip": 0.01131071, "auxiliary_loss_mlp": 0.01029151, "balance_loss_clip": 1.04621196, "balance_loss_mlp": 1.0221951, "epoch": 0.7903565201707449, "flos": 27125987091840.0, "grad_norm": 2.1405212629013697, "language_loss": 0.83670473, "learning_rate": 4.434392438367347e-07, "loss": 0.858307, "num_input_tokens_seen": 141659760, "step": 6573, "time_per_iteration": 2.744536876678467 }, { "auxiliary_loss_clip": 0.01154702, "auxiliary_loss_mlp": 0.01020204, "balance_loss_clip": 1.04215395, "balance_loss_mlp": 1.01346552, "epoch": 0.790476763061384, "flos": 31025167142400.0, "grad_norm": 1.8112933648399423, "language_loss": 0.74432015, "learning_rate": 4.4295023243937677e-07, "loss": 0.76606917, "num_input_tokens_seen": 141679965, "step": 6574, "time_per_iteration": 2.67392635345459 }, { "auxiliary_loss_clip": 0.01154005, "auxiliary_loss_mlp": 0.01033672, "balance_loss_clip": 1.04677653, "balance_loss_mlp": 1.02540469, "epoch": 0.7905970059520231, "flos": 22089084681600.0, "grad_norm": 1.6691020590721684, "language_loss": 0.80659276, "learning_rate": 4.4246145724523123e-07, "loss": 0.82846951, "num_input_tokens_seen": 141697710, "step": 6575, "time_per_iteration": 2.6139297485351562 }, { "auxiliary_loss_clip": 0.01127273, "auxiliary_loss_mlp": 0.0102455, "balance_loss_clip": 1.04504657, "balance_loss_mlp": 1.01792252, "epoch": 0.7907172488426621, "flos": 20558141159040.0, "grad_norm": 2.231793773301612, "language_loss": 0.77643675, "learning_rate": 4.41972918328444e-07, "loss": 0.79795498, "num_input_tokens_seen": 141715145, "step": 6576, "time_per_iteration": 2.6796176433563232 }, { "auxiliary_loss_clip": 0.01151101, "auxiliary_loss_mlp": 0.01028562, "balance_loss_clip": 1.04685283, "balance_loss_mlp": 1.0212307, "epoch": 0.7908374917333013, "flos": 30081542901120.0, "grad_norm": 2.0559465054088375, "language_loss": 0.77554029, "learning_rate": 4.4148461576312646e-07, "loss": 0.79733694, "num_input_tokens_seen": 141734810, "step": 6577, "time_per_iteration": 2.7399675846099854 }, { "auxiliary_loss_clip": 0.011548, "auxiliary_loss_mlp": 0.01024901, "balance_loss_clip": 1.04763997, "balance_loss_mlp": 1.01827884, "epoch": 0.7909577346239404, "flos": 20996359084800.0, "grad_norm": 1.4259078603368212, "language_loss": 0.7441473, "learning_rate": 4.4099654962335343e-07, "loss": 0.76594436, "num_input_tokens_seen": 141755260, "step": 6578, "time_per_iteration": 2.6793501377105713 }, { "auxiliary_loss_clip": 0.01144472, "auxiliary_loss_mlp": 0.0102662, "balance_loss_clip": 1.04413962, "balance_loss_mlp": 1.0184598, "epoch": 0.7910779775145794, "flos": 26247935128320.0, "grad_norm": 1.8932801837412665, "language_loss": 0.75036913, "learning_rate": 4.405087199831636e-07, "loss": 0.77208, "num_input_tokens_seen": 141775500, "step": 6579, "time_per_iteration": 2.662370443344116 }, { "auxiliary_loss_clip": 0.01141344, "auxiliary_loss_mlp": 0.00761808, "balance_loss_clip": 1.04423678, "balance_loss_mlp": 1.00031042, "epoch": 0.7911982204052186, "flos": 22564434291840.0, "grad_norm": 2.4907727341526873, "language_loss": 0.67167008, "learning_rate": 4.400211269165619e-07, "loss": 0.6907016, "num_input_tokens_seen": 141791955, "step": 6580, "time_per_iteration": 2.6002676486968994 }, { "auxiliary_loss_clip": 0.01167511, "auxiliary_loss_mlp": 0.01027333, "balance_loss_clip": 1.04833972, "balance_loss_mlp": 1.02069855, "epoch": 0.7913184632958576, "flos": 23112538899840.0, "grad_norm": 1.471968634431362, "language_loss": 0.76648951, "learning_rate": 4.3953377049751416e-07, "loss": 0.7884379, "num_input_tokens_seen": 141812380, "step": 6581, "time_per_iteration": 2.4867498874664307 }, { "auxiliary_loss_clip": 0.0114613, "auxiliary_loss_mlp": 0.01024431, "balance_loss_clip": 1.04663146, "balance_loss_mlp": 1.01778245, "epoch": 0.7914387061864967, "flos": 12311758719360.0, "grad_norm": 2.835134876846789, "language_loss": 0.77872956, "learning_rate": 4.390466507999537e-07, "loss": 0.80043519, "num_input_tokens_seen": 141828130, "step": 6582, "time_per_iteration": 2.559220314025879 }, { "auxiliary_loss_clip": 0.01122812, "auxiliary_loss_mlp": 0.01024, "balance_loss_clip": 1.04213488, "balance_loss_mlp": 1.0170207, "epoch": 0.7915589490771359, "flos": 17603267708160.0, "grad_norm": 2.0871129673683924, "language_loss": 0.75623381, "learning_rate": 4.385597678977748e-07, "loss": 0.77770191, "num_input_tokens_seen": 141846965, "step": 6583, "time_per_iteration": 3.3919739723205566 }, { "auxiliary_loss_clip": 0.01136033, "auxiliary_loss_mlp": 0.01023604, "balance_loss_clip": 1.04259777, "balance_loss_mlp": 1.01632023, "epoch": 0.7916791919677749, "flos": 25591272641280.0, "grad_norm": 1.688755111005333, "language_loss": 0.75415182, "learning_rate": 4.3807312186483726e-07, "loss": 0.77574825, "num_input_tokens_seen": 141867685, "step": 6584, "time_per_iteration": 3.6593778133392334 }, { "auxiliary_loss_clip": 0.01151325, "auxiliary_loss_mlp": 0.01025702, "balance_loss_clip": 1.04845881, "balance_loss_mlp": 1.01820409, "epoch": 0.791799434858414, "flos": 18844340474880.0, "grad_norm": 2.067439016947969, "language_loss": 0.78546363, "learning_rate": 4.375867127749655e-07, "loss": 0.80723393, "num_input_tokens_seen": 141885960, "step": 6585, "time_per_iteration": 2.5773589611053467 }, { "auxiliary_loss_clip": 0.01124761, "auxiliary_loss_mlp": 0.01030391, "balance_loss_clip": 1.0449053, "balance_loss_mlp": 1.02321208, "epoch": 0.7919196777490531, "flos": 25812015672960.0, "grad_norm": 2.6259510177194083, "language_loss": 0.67132556, "learning_rate": 4.3710054070194744e-07, "loss": 0.69287705, "num_input_tokens_seen": 141905655, "step": 6586, "time_per_iteration": 3.5546681880950928 }, { "auxiliary_loss_clip": 0.01166712, "auxiliary_loss_mlp": 0.00762717, "balance_loss_clip": 1.04552519, "balance_loss_mlp": 1.00033295, "epoch": 0.7920399206396922, "flos": 11947624594560.0, "grad_norm": 3.010499486649441, "language_loss": 0.67067397, "learning_rate": 4.3661460571953455e-07, "loss": 0.68996823, "num_input_tokens_seen": 141922390, "step": 6587, "time_per_iteration": 2.565969944000244 }, { "auxiliary_loss_clip": 0.01155014, "auxiliary_loss_mlp": 0.01022379, "balance_loss_clip": 1.04440284, "balance_loss_mlp": 1.0151366, "epoch": 0.7921601635303313, "flos": 21579907438080.0, "grad_norm": 1.6232356796912386, "language_loss": 0.68545932, "learning_rate": 4.36128907901443e-07, "loss": 0.70723325, "num_input_tokens_seen": 141941985, "step": 6588, "time_per_iteration": 3.5271337032318115 }, { "auxiliary_loss_clip": 0.01129008, "auxiliary_loss_mlp": 0.0102648, "balance_loss_clip": 1.04249501, "balance_loss_mlp": 1.01930332, "epoch": 0.7922804064209703, "flos": 18113989236480.0, "grad_norm": 2.268056303636022, "language_loss": 0.72763604, "learning_rate": 4.356434473213519e-07, "loss": 0.74919093, "num_input_tokens_seen": 141959435, "step": 6589, "time_per_iteration": 2.6921334266662598 }, { "auxiliary_loss_clip": 0.01138042, "auxiliary_loss_mlp": 0.01025676, "balance_loss_clip": 1.04596138, "balance_loss_mlp": 1.01907218, "epoch": 0.7924006493116095, "flos": 21652806090240.0, "grad_norm": 1.6742639432437334, "language_loss": 0.79861051, "learning_rate": 4.351582240529068e-07, "loss": 0.82024765, "num_input_tokens_seen": 141980265, "step": 6590, "time_per_iteration": 2.700547456741333 }, { "auxiliary_loss_clip": 0.01048599, "auxiliary_loss_mlp": 0.01003617, "balance_loss_clip": 1.01125717, "balance_loss_mlp": 1.0025084, "epoch": 0.7925208922022485, "flos": 64242755694720.0, "grad_norm": 0.6803916454915758, "language_loss": 0.58201158, "learning_rate": 4.346732381697149e-07, "loss": 0.6025337, "num_input_tokens_seen": 142044395, "step": 6591, "time_per_iteration": 3.2797157764434814 }, { "auxiliary_loss_clip": 0.01132943, "auxiliary_loss_mlp": 0.01027058, "balance_loss_clip": 1.04304218, "balance_loss_mlp": 1.02052498, "epoch": 0.7926411350928876, "flos": 16941541403520.0, "grad_norm": 3.899620780568765, "language_loss": 0.80946338, "learning_rate": 4.3418848974534825e-07, "loss": 0.83106339, "num_input_tokens_seen": 142061335, "step": 6592, "time_per_iteration": 2.714526414871216 }, { "auxiliary_loss_clip": 0.0113174, "auxiliary_loss_mlp": 0.01028676, "balance_loss_clip": 1.0441947, "balance_loss_mlp": 1.02150536, "epoch": 0.7927613779835267, "flos": 34459987144320.0, "grad_norm": 1.7027060133792158, "language_loss": 0.68533641, "learning_rate": 4.3370397885334276e-07, "loss": 0.70694065, "num_input_tokens_seen": 142081965, "step": 6593, "time_per_iteration": 2.817122459411621 }, { "auxiliary_loss_clip": 0.01146755, "auxiliary_loss_mlp": 0.01024909, "balance_loss_clip": 1.0451901, "balance_loss_mlp": 1.01772666, "epoch": 0.7928816208741658, "flos": 18951174501120.0, "grad_norm": 1.842052047482128, "language_loss": 0.75661194, "learning_rate": 4.3321970556719777e-07, "loss": 0.77832854, "num_input_tokens_seen": 142100260, "step": 6594, "time_per_iteration": 2.6958248615264893 }, { "auxiliary_loss_clip": 0.01168985, "auxiliary_loss_mlp": 0.01029868, "balance_loss_clip": 1.04831183, "balance_loss_mlp": 1.02226877, "epoch": 0.7930018637648049, "flos": 18623022825600.0, "grad_norm": 2.4484711774387358, "language_loss": 0.72057086, "learning_rate": 4.3273566996037856e-07, "loss": 0.74255937, "num_input_tokens_seen": 142116955, "step": 6595, "time_per_iteration": 2.6819543838500977 }, { "auxiliary_loss_clip": 0.01137205, "auxiliary_loss_mlp": 0.0102242, "balance_loss_clip": 1.04269814, "balance_loss_mlp": 1.01595235, "epoch": 0.793122106655444, "flos": 24530650824960.0, "grad_norm": 1.941070734157617, "language_loss": 0.80763304, "learning_rate": 4.322518721063113e-07, "loss": 0.82922924, "num_input_tokens_seen": 142135505, "step": 6596, "time_per_iteration": 2.6704723834991455 }, { "auxiliary_loss_clip": 0.01153819, "auxiliary_loss_mlp": 0.0102467, "balance_loss_clip": 1.0477159, "balance_loss_mlp": 1.01732111, "epoch": 0.7932423495460831, "flos": 34421203434240.0, "grad_norm": 1.8224453360823383, "language_loss": 0.70263267, "learning_rate": 4.3176831207838906e-07, "loss": 0.72441757, "num_input_tokens_seen": 142158915, "step": 6597, "time_per_iteration": 2.738457679748535 }, { "auxiliary_loss_clip": 0.01152293, "auxiliary_loss_mlp": 0.0102466, "balance_loss_clip": 1.04740357, "balance_loss_mlp": 1.01736426, "epoch": 0.7933625924367221, "flos": 26980333441920.0, "grad_norm": 2.3053334392508975, "language_loss": 0.74474257, "learning_rate": 4.3128498994996685e-07, "loss": 0.76651216, "num_input_tokens_seen": 142178390, "step": 6598, "time_per_iteration": 2.6537108421325684 }, { "auxiliary_loss_clip": 0.01158147, "auxiliary_loss_mlp": 0.01025388, "balance_loss_clip": 1.04699552, "balance_loss_mlp": 1.01781261, "epoch": 0.7934828353273613, "flos": 29568630643200.0, "grad_norm": 2.304702368534531, "language_loss": 0.71498388, "learning_rate": 4.308019057943646e-07, "loss": 0.73681927, "num_input_tokens_seen": 142200115, "step": 6599, "time_per_iteration": 2.681518077850342 }, { "auxiliary_loss_clip": 0.01117741, "auxiliary_loss_mlp": 0.0102126, "balance_loss_clip": 1.04100549, "balance_loss_mlp": 1.01419103, "epoch": 0.7936030782180004, "flos": 28615381557120.0, "grad_norm": 1.8786489081817974, "language_loss": 0.74474412, "learning_rate": 4.3031905968486535e-07, "loss": 0.76613414, "num_input_tokens_seen": 142220945, "step": 6600, "time_per_iteration": 2.747075319290161 }, { "auxiliary_loss_clip": 0.01109768, "auxiliary_loss_mlp": 0.0102267, "balance_loss_clip": 1.04509687, "balance_loss_mlp": 1.01518393, "epoch": 0.7937233211086394, "flos": 16392574869120.0, "grad_norm": 2.087317650598343, "language_loss": 0.69005632, "learning_rate": 4.298364516947162e-07, "loss": 0.71138072, "num_input_tokens_seen": 142238175, "step": 6601, "time_per_iteration": 2.6921565532684326 }, { "auxiliary_loss_clip": 0.01107157, "auxiliary_loss_mlp": 0.01022351, "balance_loss_clip": 1.03985536, "balance_loss_mlp": 1.015082, "epoch": 0.7938435639992786, "flos": 22013420682240.0, "grad_norm": 2.232783682800349, "language_loss": 0.65724939, "learning_rate": 4.293540818971295e-07, "loss": 0.67854446, "num_input_tokens_seen": 142255980, "step": 6602, "time_per_iteration": 2.707388162612915 }, { "auxiliary_loss_clip": 0.0115636, "auxiliary_loss_mlp": 0.01028712, "balance_loss_clip": 1.04408228, "balance_loss_mlp": 1.02179515, "epoch": 0.7939638068899176, "flos": 22197032029440.0, "grad_norm": 2.2792970929418073, "language_loss": 0.76566613, "learning_rate": 4.2887195036527934e-07, "loss": 0.78751689, "num_input_tokens_seen": 142274785, "step": 6603, "time_per_iteration": 2.6386613845825195 }, { "auxiliary_loss_clip": 0.01144115, "auxiliary_loss_mlp": 0.01026335, "balance_loss_clip": 1.04141831, "balance_loss_mlp": 1.01875901, "epoch": 0.7940840497805567, "flos": 17745186343680.0, "grad_norm": 2.6881563889119175, "language_loss": 0.73623884, "learning_rate": 4.28390057172306e-07, "loss": 0.75794339, "num_input_tokens_seen": 142291290, "step": 6604, "time_per_iteration": 2.595839262008667 }, { "auxiliary_loss_clip": 0.01119241, "auxiliary_loss_mlp": 0.01025596, "balance_loss_clip": 1.03985262, "balance_loss_mlp": 1.01801074, "epoch": 0.7942042926711959, "flos": 23805435231360.0, "grad_norm": 2.1837075443046152, "language_loss": 0.71968406, "learning_rate": 4.279084023913111e-07, "loss": 0.74113244, "num_input_tokens_seen": 142309165, "step": 6605, "time_per_iteration": 2.7436530590057373 }, { "auxiliary_loss_clip": 0.01152123, "auxiliary_loss_mlp": 0.01028948, "balance_loss_clip": 1.04638004, "balance_loss_mlp": 1.02161026, "epoch": 0.7943245355618349, "flos": 19244959839360.0, "grad_norm": 1.8848455650486993, "language_loss": 0.69322759, "learning_rate": 4.2742698609536096e-07, "loss": 0.7150383, "num_input_tokens_seen": 142327475, "step": 6606, "time_per_iteration": 2.637544870376587 }, { "auxiliary_loss_clip": 0.01144007, "auxiliary_loss_mlp": 0.01028253, "balance_loss_clip": 1.0466063, "balance_loss_mlp": 1.02082586, "epoch": 0.794444778452474, "flos": 25007616547200.0, "grad_norm": 1.7130174245728278, "language_loss": 0.78467387, "learning_rate": 4.2694580835748706e-07, "loss": 0.80639642, "num_input_tokens_seen": 142347335, "step": 6607, "time_per_iteration": 2.7070252895355225 }, { "auxiliary_loss_clip": 0.0113929, "auxiliary_loss_mlp": 0.01026575, "balance_loss_clip": 1.04224896, "balance_loss_mlp": 1.01998019, "epoch": 0.7945650213431131, "flos": 23221491828480.0, "grad_norm": 2.099407377652163, "language_loss": 0.7390883, "learning_rate": 4.264648692506836e-07, "loss": 0.76074696, "num_input_tokens_seen": 142366125, "step": 6608, "time_per_iteration": 3.6462104320526123 }, { "auxiliary_loss_clip": 0.0113366, "auxiliary_loss_mlp": 0.01020289, "balance_loss_clip": 1.04245579, "balance_loss_mlp": 1.0125258, "epoch": 0.7946852642337522, "flos": 26062887237120.0, "grad_norm": 1.8052307590889696, "language_loss": 0.72130656, "learning_rate": 4.2598416884790824e-07, "loss": 0.74284613, "num_input_tokens_seen": 142385175, "step": 6609, "time_per_iteration": 2.6625540256500244 }, { "auxiliary_loss_clip": 0.01147592, "auxiliary_loss_mlp": 0.01025646, "balance_loss_clip": 1.04356539, "balance_loss_mlp": 1.01858544, "epoch": 0.7948055071243912, "flos": 23769704177280.0, "grad_norm": 2.0954804433810468, "language_loss": 0.80744004, "learning_rate": 4.255037072220828e-07, "loss": 0.82917237, "num_input_tokens_seen": 142406545, "step": 6610, "time_per_iteration": 3.6090331077575684 }, { "auxiliary_loss_clip": 0.01163737, "auxiliary_loss_mlp": 0.01020264, "balance_loss_clip": 1.04579067, "balance_loss_mlp": 1.01369858, "epoch": 0.7949257500150304, "flos": 21980814111360.0, "grad_norm": 2.2444320613222146, "language_loss": 0.71758533, "learning_rate": 4.2502348444609293e-07, "loss": 0.73942542, "num_input_tokens_seen": 142426165, "step": 6611, "time_per_iteration": 2.5966131687164307 }, { "auxiliary_loss_clip": 0.01104111, "auxiliary_loss_mlp": 0.0102738, "balance_loss_clip": 1.03645778, "balance_loss_mlp": 1.01999247, "epoch": 0.7950459929056695, "flos": 25774129802880.0, "grad_norm": 1.7992212715604992, "language_loss": 0.69166189, "learning_rate": 4.2454350059278844e-07, "loss": 0.71297681, "num_input_tokens_seen": 142447225, "step": 6612, "time_per_iteration": 3.7075371742248535 }, { "auxiliary_loss_clip": 0.01129987, "auxiliary_loss_mlp": 0.01030656, "balance_loss_clip": 1.03784871, "balance_loss_mlp": 1.02360177, "epoch": 0.7951662357963085, "flos": 22158068751360.0, "grad_norm": 1.8395695269170795, "language_loss": 0.84035015, "learning_rate": 4.240637557349824e-07, "loss": 0.8619566, "num_input_tokens_seen": 142464440, "step": 6613, "time_per_iteration": 2.6705801486968994 }, { "auxiliary_loss_clip": 0.01128205, "auxiliary_loss_mlp": 0.01029596, "balance_loss_clip": 1.04359269, "balance_loss_mlp": 1.02234757, "epoch": 0.7952864786869477, "flos": 24641938137600.0, "grad_norm": 4.372283167560864, "language_loss": 0.66712534, "learning_rate": 4.235842499454516e-07, "loss": 0.68870342, "num_input_tokens_seen": 142484355, "step": 6614, "time_per_iteration": 3.544501781463623 }, { "auxiliary_loss_clip": 0.01137659, "auxiliary_loss_mlp": 0.01031768, "balance_loss_clip": 1.04305339, "balance_loss_mlp": 1.02443326, "epoch": 0.7954067215775867, "flos": 21830922656640.0, "grad_norm": 1.702732078796595, "language_loss": 0.83062238, "learning_rate": 4.2310498329693687e-07, "loss": 0.85231662, "num_input_tokens_seen": 142505255, "step": 6615, "time_per_iteration": 2.7796730995178223 }, { "auxiliary_loss_clip": 0.01153501, "auxiliary_loss_mlp": 0.01028395, "balance_loss_clip": 1.04555619, "balance_loss_mlp": 1.02092612, "epoch": 0.7955269644682258, "flos": 24060652341120.0, "grad_norm": 1.9520837655629037, "language_loss": 0.81253648, "learning_rate": 4.2262595586214164e-07, "loss": 0.83435541, "num_input_tokens_seen": 142526350, "step": 6616, "time_per_iteration": 2.7931745052337646 }, { "auxiliary_loss_clip": 0.01156637, "auxiliary_loss_mlp": 0.01024413, "balance_loss_clip": 1.04625058, "balance_loss_mlp": 1.01654482, "epoch": 0.795647207358865, "flos": 25010741030400.0, "grad_norm": 1.7078938912867316, "language_loss": 0.76930273, "learning_rate": 4.221471677137358e-07, "loss": 0.7911132, "num_input_tokens_seen": 142547165, "step": 6617, "time_per_iteration": 2.717799663543701 }, { "auxiliary_loss_clip": 0.01127945, "auxiliary_loss_mlp": 0.01027936, "balance_loss_clip": 1.04291022, "balance_loss_mlp": 1.02163339, "epoch": 0.795767450249504, "flos": 14648358343680.0, "grad_norm": 1.5765809689404129, "language_loss": 0.70008832, "learning_rate": 4.216686189243492e-07, "loss": 0.72164714, "num_input_tokens_seen": 142565955, "step": 6618, "time_per_iteration": 2.7128725051879883 }, { "auxiliary_loss_clip": 0.01121928, "auxiliary_loss_mlp": 0.0102681, "balance_loss_clip": 1.04343247, "balance_loss_mlp": 1.01901376, "epoch": 0.7958876931401431, "flos": 18547897530240.0, "grad_norm": 1.6533442996143157, "language_loss": 0.72681165, "learning_rate": 4.211903095665785e-07, "loss": 0.748299, "num_input_tokens_seen": 142585340, "step": 6619, "time_per_iteration": 2.72342586517334 }, { "auxiliary_loss_clip": 0.01147253, "auxiliary_loss_mlp": 0.01023265, "balance_loss_clip": 1.04300714, "balance_loss_mlp": 1.01664567, "epoch": 0.7960079360307821, "flos": 21543960902400.0, "grad_norm": 1.933912079442407, "language_loss": 0.75134587, "learning_rate": 4.2071223971298277e-07, "loss": 0.77305108, "num_input_tokens_seen": 142602525, "step": 6620, "time_per_iteration": 2.660771131515503 }, { "auxiliary_loss_clip": 0.01153009, "auxiliary_loss_mlp": 0.01022202, "balance_loss_clip": 1.04366338, "balance_loss_mlp": 1.01469207, "epoch": 0.7961281789214213, "flos": 25481745095040.0, "grad_norm": 2.42667822648509, "language_loss": 0.61178529, "learning_rate": 4.2023440943608433e-07, "loss": 0.63353741, "num_input_tokens_seen": 142622490, "step": 6621, "time_per_iteration": 2.6651463508605957 }, { "auxiliary_loss_clip": 0.0115114, "auxiliary_loss_mlp": 0.01024614, "balance_loss_clip": 1.0433557, "balance_loss_mlp": 1.01798654, "epoch": 0.7962484218120603, "flos": 21944436612480.0, "grad_norm": 1.6892435322744372, "language_loss": 0.78271627, "learning_rate": 4.1975681880837023e-07, "loss": 0.80447382, "num_input_tokens_seen": 142642495, "step": 6622, "time_per_iteration": 2.701033592224121 }, { "auxiliary_loss_clip": 0.01118987, "auxiliary_loss_mlp": 0.01025705, "balance_loss_clip": 1.03900933, "balance_loss_mlp": 1.01862729, "epoch": 0.7963686647026994, "flos": 18876264687360.0, "grad_norm": 1.7421357881806323, "language_loss": 0.82522047, "learning_rate": 4.192794679022895e-07, "loss": 0.84666735, "num_input_tokens_seen": 142660820, "step": 6623, "time_per_iteration": 2.8209164142608643 }, { "auxiliary_loss_clip": 0.01151061, "auxiliary_loss_mlp": 0.0102591, "balance_loss_clip": 1.04136157, "balance_loss_mlp": 1.01936889, "epoch": 0.7964889075933386, "flos": 29716582763520.0, "grad_norm": 1.7459344701964405, "language_loss": 0.72072279, "learning_rate": 4.1880235679025743e-07, "loss": 0.74249244, "num_input_tokens_seen": 142680915, "step": 6624, "time_per_iteration": 2.7281858921051025 }, { "auxiliary_loss_clip": 0.01096769, "auxiliary_loss_mlp": 0.01026236, "balance_loss_clip": 1.03850508, "balance_loss_mlp": 1.01868987, "epoch": 0.7966091504839776, "flos": 29491458272640.0, "grad_norm": 1.8098663370232195, "language_loss": 0.6398896, "learning_rate": 4.1832548554464986e-07, "loss": 0.6611197, "num_input_tokens_seen": 142699210, "step": 6625, "time_per_iteration": 2.7737364768981934 }, { "auxiliary_loss_clip": 0.01050562, "auxiliary_loss_mlp": 0.01003174, "balance_loss_clip": 1.01047742, "balance_loss_mlp": 1.00208306, "epoch": 0.7967293933746167, "flos": 67288697101440.0, "grad_norm": 0.7389845512539909, "language_loss": 0.58728892, "learning_rate": 4.178488542378098e-07, "loss": 0.60782623, "num_input_tokens_seen": 142756790, "step": 6626, "time_per_iteration": 3.131082057952881 }, { "auxiliary_loss_clip": 0.01170185, "auxiliary_loss_mlp": 0.01025291, "balance_loss_clip": 1.04675663, "balance_loss_mlp": 1.01732206, "epoch": 0.7968496362652558, "flos": 25554679660800.0, "grad_norm": 2.1396027904525288, "language_loss": 0.89241338, "learning_rate": 4.173724629420401e-07, "loss": 0.91436815, "num_input_tokens_seen": 142778150, "step": 6627, "time_per_iteration": 2.68062686920166 }, { "auxiliary_loss_clip": 0.0114327, "auxiliary_loss_mlp": 0.0102512, "balance_loss_clip": 1.04390192, "balance_loss_mlp": 1.01717436, "epoch": 0.7969698791558949, "flos": 14501088581760.0, "grad_norm": 2.7968331721948316, "language_loss": 0.6829145, "learning_rate": 4.168963117296087e-07, "loss": 0.70459837, "num_input_tokens_seen": 142795485, "step": 6628, "time_per_iteration": 2.6608705520629883 }, { "auxiliary_loss_clip": 0.01167098, "auxiliary_loss_mlp": 0.01020548, "balance_loss_clip": 1.04657626, "balance_loss_mlp": 1.01330888, "epoch": 0.797090122046534, "flos": 22127545169280.0, "grad_norm": 2.1799370837378804, "language_loss": 0.75819182, "learning_rate": 4.1642040067274876e-07, "loss": 0.78006828, "num_input_tokens_seen": 142815155, "step": 6629, "time_per_iteration": 2.6247172355651855 }, { "auxiliary_loss_clip": 0.01143317, "auxiliary_loss_mlp": 0.01022652, "balance_loss_clip": 1.04322028, "balance_loss_mlp": 1.01581573, "epoch": 0.7972103649371731, "flos": 19897671830400.0, "grad_norm": 2.5477718161510037, "language_loss": 0.72408128, "learning_rate": 4.1594472984365493e-07, "loss": 0.74574101, "num_input_tokens_seen": 142833840, "step": 6630, "time_per_iteration": 2.6708824634552 }, { "auxiliary_loss_clip": 0.01148682, "auxiliary_loss_mlp": 0.01023632, "balance_loss_clip": 1.04490709, "balance_loss_mlp": 1.01634264, "epoch": 0.7973306078278122, "flos": 36058621847040.0, "grad_norm": 1.8757672118327924, "language_loss": 0.7760886, "learning_rate": 4.154692993144862e-07, "loss": 0.79781175, "num_input_tokens_seen": 142853610, "step": 6631, "time_per_iteration": 2.7654147148132324 }, { "auxiliary_loss_clip": 0.0116578, "auxiliary_loss_mlp": 0.00761995, "balance_loss_clip": 1.04559827, "balance_loss_mlp": 1.00034165, "epoch": 0.7974508507184512, "flos": 21360600950400.0, "grad_norm": 2.218169043351546, "language_loss": 0.71765107, "learning_rate": 4.1499410915736476e-07, "loss": 0.73692882, "num_input_tokens_seen": 142872540, "step": 6632, "time_per_iteration": 2.586106538772583 }, { "auxiliary_loss_clip": 0.01056106, "auxiliary_loss_mlp": 0.01001677, "balance_loss_clip": 1.01096582, "balance_loss_mlp": 1.00061655, "epoch": 0.7975710936090904, "flos": 68253115317120.0, "grad_norm": 0.772278231300321, "language_loss": 0.64280772, "learning_rate": 4.145191594443762e-07, "loss": 0.66338563, "num_input_tokens_seen": 142936895, "step": 6633, "time_per_iteration": 3.3651533126831055 }, { "auxiliary_loss_clip": 0.0112014, "auxiliary_loss_mlp": 0.01023464, "balance_loss_clip": 1.0427376, "balance_loss_mlp": 1.01614463, "epoch": 0.7976913364997295, "flos": 22492433479680.0, "grad_norm": 1.9221053805459567, "language_loss": 0.70264852, "learning_rate": 4.140444502475713e-07, "loss": 0.7240845, "num_input_tokens_seen": 142956445, "step": 6634, "time_per_iteration": 3.744769811630249 }, { "auxiliary_loss_clip": 0.01148017, "auxiliary_loss_mlp": 0.01027908, "balance_loss_clip": 1.04295862, "balance_loss_mlp": 1.02057409, "epoch": 0.7978115793903685, "flos": 15263220378240.0, "grad_norm": 2.204993520311742, "language_loss": 0.69948012, "learning_rate": 4.1356998163896216e-07, "loss": 0.72123933, "num_input_tokens_seen": 142973495, "step": 6635, "time_per_iteration": 2.584517002105713 }, { "auxiliary_loss_clip": 0.01130287, "auxiliary_loss_mlp": 0.01024034, "balance_loss_clip": 1.04370642, "balance_loss_mlp": 1.01689351, "epoch": 0.7979318222810077, "flos": 19719232041600.0, "grad_norm": 2.2650138702848337, "language_loss": 0.74848908, "learning_rate": 4.130957536905255e-07, "loss": 0.77003229, "num_input_tokens_seen": 142991510, "step": 6636, "time_per_iteration": 3.7424020767211914 }, { "auxiliary_loss_clip": 0.01148713, "auxiliary_loss_mlp": 0.01022475, "balance_loss_clip": 1.04510033, "balance_loss_mlp": 1.01525378, "epoch": 0.7980520651716467, "flos": 15560273854080.0, "grad_norm": 2.1155481574875807, "language_loss": 0.71889347, "learning_rate": 4.1262176647420134e-07, "loss": 0.74060535, "num_input_tokens_seen": 143009675, "step": 6637, "time_per_iteration": 2.6771535873413086 }, { "auxiliary_loss_clip": 0.01143596, "auxiliary_loss_mlp": 0.0102627, "balance_loss_clip": 1.04423714, "balance_loss_mlp": 1.01931989, "epoch": 0.7981723080622858, "flos": 22309432663680.0, "grad_norm": 1.7828988147090705, "language_loss": 0.79890704, "learning_rate": 4.121480200618923e-07, "loss": 0.8206057, "num_input_tokens_seen": 143029330, "step": 6638, "time_per_iteration": 3.58729887008667 }, { "auxiliary_loss_clip": 0.01134274, "auxiliary_loss_mlp": 0.01030838, "balance_loss_clip": 1.04290044, "balance_loss_mlp": 1.02367103, "epoch": 0.798292550952925, "flos": 22929573997440.0, "grad_norm": 1.7560359721772547, "language_loss": 0.80230844, "learning_rate": 4.116745145254674e-07, "loss": 0.82395959, "num_input_tokens_seen": 143048865, "step": 6639, "time_per_iteration": 2.764585256576538 }, { "auxiliary_loss_clip": 0.01038556, "auxiliary_loss_mlp": 0.01000227, "balance_loss_clip": 1.0095582, "balance_loss_mlp": 0.99916619, "epoch": 0.798412793843564, "flos": 64497936890880.0, "grad_norm": 0.765878016361562, "language_loss": 0.58009958, "learning_rate": 4.1120124993675476e-07, "loss": 0.60048747, "num_input_tokens_seen": 143113295, "step": 6640, "time_per_iteration": 4.163068056106567 }, { "auxiliary_loss_clip": 0.01146323, "auxiliary_loss_mlp": 0.01025695, "balance_loss_clip": 1.04511511, "balance_loss_mlp": 1.01853311, "epoch": 0.7985330367342031, "flos": 13586910514560.0, "grad_norm": 1.8750743392976605, "language_loss": 0.62057328, "learning_rate": 4.107282263675498e-07, "loss": 0.64229345, "num_input_tokens_seen": 143130965, "step": 6641, "time_per_iteration": 2.6209092140197754 }, { "auxiliary_loss_clip": 0.01043504, "auxiliary_loss_mlp": 0.00753948, "balance_loss_clip": 1.01348042, "balance_loss_mlp": 1.00032485, "epoch": 0.7986532796248422, "flos": 67698797656320.0, "grad_norm": 0.7719501805341424, "language_loss": 0.52439427, "learning_rate": 4.1025544388960907e-07, "loss": 0.54236877, "num_input_tokens_seen": 143192005, "step": 6642, "time_per_iteration": 3.2523021697998047 }, { "auxiliary_loss_clip": 0.01154395, "auxiliary_loss_mlp": 0.0102572, "balance_loss_clip": 1.04630303, "balance_loss_mlp": 1.01868939, "epoch": 0.7987735225154813, "flos": 22455373622400.0, "grad_norm": 3.764465394838307, "language_loss": 0.71760416, "learning_rate": 4.097829025746538e-07, "loss": 0.73940539, "num_input_tokens_seen": 143213550, "step": 6643, "time_per_iteration": 2.6863629817962646 }, { "auxiliary_loss_clip": 0.01053809, "auxiliary_loss_mlp": 0.01002492, "balance_loss_clip": 1.01017475, "balance_loss_mlp": 1.0013833, "epoch": 0.7988937654061203, "flos": 68864098682880.0, "grad_norm": 0.657498188933818, "language_loss": 0.6100446, "learning_rate": 4.0931060249436757e-07, "loss": 0.6306076, "num_input_tokens_seen": 143277390, "step": 6644, "time_per_iteration": 3.2721004486083984 }, { "auxiliary_loss_clip": 0.01151821, "auxiliary_loss_mlp": 0.010291, "balance_loss_clip": 1.04664159, "balance_loss_mlp": 1.02166152, "epoch": 0.7990140082967595, "flos": 20806893820800.0, "grad_norm": 1.9901529004076401, "language_loss": 0.6951946, "learning_rate": 4.088385437203978e-07, "loss": 0.71700382, "num_input_tokens_seen": 143294400, "step": 6645, "time_per_iteration": 2.5758488178253174 }, { "auxiliary_loss_clip": 0.01166981, "auxiliary_loss_mlp": 0.01026221, "balance_loss_clip": 1.04625666, "balance_loss_mlp": 1.01885378, "epoch": 0.7991342511873986, "flos": 18985289443200.0, "grad_norm": 2.0831947701096554, "language_loss": 0.77237403, "learning_rate": 4.083667263243564e-07, "loss": 0.79430604, "num_input_tokens_seen": 143312745, "step": 6646, "time_per_iteration": 2.6265087127685547 }, { "auxiliary_loss_clip": 0.01147306, "auxiliary_loss_mlp": 0.01023143, "balance_loss_clip": 1.04510188, "balance_loss_mlp": 1.01668739, "epoch": 0.7992544940780376, "flos": 20816805974400.0, "grad_norm": 1.6840189143213304, "language_loss": 0.7187891, "learning_rate": 4.0789515037781653e-07, "loss": 0.74049366, "num_input_tokens_seen": 143333470, "step": 6647, "time_per_iteration": 2.6656789779663086 }, { "auxiliary_loss_clip": 0.01153527, "auxiliary_loss_mlp": 0.01026603, "balance_loss_clip": 1.0429256, "balance_loss_mlp": 1.01872909, "epoch": 0.7993747369686768, "flos": 12640772321280.0, "grad_norm": 1.8958411360164624, "language_loss": 0.82700378, "learning_rate": 4.0742381595231755e-07, "loss": 0.84880507, "num_input_tokens_seen": 143350195, "step": 6648, "time_per_iteration": 2.6367146968841553 }, { "auxiliary_loss_clip": 0.01129939, "auxiliary_loss_mlp": 0.01023216, "balance_loss_clip": 1.04398274, "balance_loss_mlp": 1.01624846, "epoch": 0.7994949798593158, "flos": 20078769225600.0, "grad_norm": 1.9573830764149018, "language_loss": 0.78186178, "learning_rate": 4.06952723119359e-07, "loss": 0.8033933, "num_input_tokens_seen": 143370070, "step": 6649, "time_per_iteration": 2.756636381149292 }, { "auxiliary_loss_clip": 0.01131829, "auxiliary_loss_mlp": 0.01027184, "balance_loss_clip": 1.04419208, "balance_loss_mlp": 1.01968575, "epoch": 0.7996152227499549, "flos": 38654209509120.0, "grad_norm": 1.918896244060421, "language_loss": 0.67443299, "learning_rate": 4.0648187195040504e-07, "loss": 0.69602311, "num_input_tokens_seen": 143392275, "step": 6650, "time_per_iteration": 2.7790627479553223 }, { "auxiliary_loss_clip": 0.01050058, "auxiliary_loss_mlp": 0.010009, "balance_loss_clip": 1.01004803, "balance_loss_mlp": 0.99973184, "epoch": 0.799735465640594, "flos": 70243821947520.0, "grad_norm": 0.8119900651899423, "language_loss": 0.67610514, "learning_rate": 4.060112625168848e-07, "loss": 0.69661474, "num_input_tokens_seen": 143457385, "step": 6651, "time_per_iteration": 3.415618658065796 }, { "auxiliary_loss_clip": 0.01170947, "auxiliary_loss_mlp": 0.01020788, "balance_loss_clip": 1.05035496, "balance_loss_mlp": 1.01345682, "epoch": 0.7998557085312331, "flos": 24240995550720.0, "grad_norm": 2.1154873670917764, "language_loss": 0.74255818, "learning_rate": 4.055408948901886e-07, "loss": 0.76447558, "num_input_tokens_seen": 143478785, "step": 6652, "time_per_iteration": 2.6214535236358643 }, { "auxiliary_loss_clip": 0.01157811, "auxiliary_loss_mlp": 0.01021885, "balance_loss_clip": 1.04539061, "balance_loss_mlp": 1.01413655, "epoch": 0.7999759514218722, "flos": 27564025449600.0, "grad_norm": 1.724964186143899, "language_loss": 0.71275151, "learning_rate": 4.050707691416708e-07, "loss": 0.73454845, "num_input_tokens_seen": 143500095, "step": 6653, "time_per_iteration": 2.7239198684692383 }, { "auxiliary_loss_clip": 0.01050168, "auxiliary_loss_mlp": 0.0100051, "balance_loss_clip": 1.0099349, "balance_loss_mlp": 0.99940729, "epoch": 0.8000961943125112, "flos": 67337428878720.0, "grad_norm": 0.6714850581057558, "language_loss": 0.59780562, "learning_rate": 4.046008853426495e-07, "loss": 0.61831236, "num_input_tokens_seen": 143563410, "step": 6654, "time_per_iteration": 3.318298578262329 }, { "auxiliary_loss_clip": 0.01121327, "auxiliary_loss_mlp": 0.01024978, "balance_loss_clip": 1.04183161, "balance_loss_mlp": 1.01764083, "epoch": 0.8002164372031504, "flos": 28733815676160.0, "grad_norm": 1.9017568002732907, "language_loss": 0.6245991, "learning_rate": 4.0413124356440464e-07, "loss": 0.64606214, "num_input_tokens_seen": 143587455, "step": 6655, "time_per_iteration": 2.7620980739593506 }, { "auxiliary_loss_clip": 0.01113668, "auxiliary_loss_mlp": 0.01023177, "balance_loss_clip": 1.04051781, "balance_loss_mlp": 1.01546419, "epoch": 0.8003366800937894, "flos": 17639429725440.0, "grad_norm": 2.2930991992347467, "language_loss": 0.82098979, "learning_rate": 4.0366184387818223e-07, "loss": 0.84235823, "num_input_tokens_seen": 143605915, "step": 6656, "time_per_iteration": 2.737867832183838 }, { "auxiliary_loss_clip": 0.01170641, "auxiliary_loss_mlp": 0.0102768, "balance_loss_clip": 1.04659057, "balance_loss_mlp": 1.01997864, "epoch": 0.8004569229844285, "flos": 25995303797760.0, "grad_norm": 1.83130701499229, "language_loss": 0.85740399, "learning_rate": 4.0319268635518797e-07, "loss": 0.8793872, "num_input_tokens_seen": 143626490, "step": 6657, "time_per_iteration": 2.6952006816864014 }, { "auxiliary_loss_clip": 0.01153349, "auxiliary_loss_mlp": 0.01022722, "balance_loss_clip": 1.04441631, "balance_loss_mlp": 1.0156796, "epoch": 0.8005771658750677, "flos": 20812352688000.0, "grad_norm": 1.5267786824322473, "language_loss": 0.74763185, "learning_rate": 4.027237710665943e-07, "loss": 0.76939255, "num_input_tokens_seen": 143644955, "step": 6658, "time_per_iteration": 2.695887327194214 }, { "auxiliary_loss_clip": 0.0112783, "auxiliary_loss_mlp": 0.01026104, "balance_loss_clip": 1.04070568, "balance_loss_mlp": 1.01833117, "epoch": 0.8006974087657067, "flos": 25812626204160.0, "grad_norm": 1.792931699090747, "language_loss": 0.69331968, "learning_rate": 4.022550980835344e-07, "loss": 0.71485913, "num_input_tokens_seen": 143667200, "step": 6659, "time_per_iteration": 2.7515246868133545 }, { "auxiliary_loss_clip": 0.01122326, "auxiliary_loss_mlp": 0.0102029, "balance_loss_clip": 1.03961229, "balance_loss_mlp": 1.01287508, "epoch": 0.8008176516563458, "flos": 17164690646400.0, "grad_norm": 2.9960552429431964, "language_loss": 0.7979877, "learning_rate": 4.017866674771051e-07, "loss": 0.8194139, "num_input_tokens_seen": 143684685, "step": 6660, "time_per_iteration": 3.6636250019073486 }, { "auxiliary_loss_clip": 0.01103583, "auxiliary_loss_mlp": 0.01029242, "balance_loss_clip": 1.0380764, "balance_loss_mlp": 1.02210176, "epoch": 0.8009378945469849, "flos": 24207311571840.0, "grad_norm": 1.8592722494133118, "language_loss": 0.74507761, "learning_rate": 4.013184793183688e-07, "loss": 0.76640588, "num_input_tokens_seen": 143706780, "step": 6661, "time_per_iteration": 3.7798209190368652 }, { "auxiliary_loss_clip": 0.01149743, "auxiliary_loss_mlp": 0.01019786, "balance_loss_clip": 1.04176569, "balance_loss_mlp": 1.01274645, "epoch": 0.801058137437624, "flos": 19787318271360.0, "grad_norm": 2.168932486002685, "language_loss": 0.7247718, "learning_rate": 4.008505336783472e-07, "loss": 0.74646711, "num_input_tokens_seen": 143724505, "step": 6662, "time_per_iteration": 2.7066030502319336 }, { "auxiliary_loss_clip": 0.01142856, "auxiliary_loss_mlp": 0.01020144, "balance_loss_clip": 1.04217696, "balance_loss_mlp": 1.01384974, "epoch": 0.801178380328263, "flos": 18659400324480.0, "grad_norm": 1.9810694453285715, "language_loss": 0.80543631, "learning_rate": 4.003828306280284e-07, "loss": 0.8270663, "num_input_tokens_seen": 143742180, "step": 6663, "time_per_iteration": 2.7109992504119873 }, { "auxiliary_loss_clip": 0.01153562, "auxiliary_loss_mlp": 0.0102653, "balance_loss_clip": 1.04398632, "balance_loss_mlp": 1.01992035, "epoch": 0.8012986232189022, "flos": 15706573948800.0, "grad_norm": 1.8856192083850938, "language_loss": 0.77977014, "learning_rate": 3.999153702383626e-07, "loss": 0.80157101, "num_input_tokens_seen": 143760070, "step": 6664, "time_per_iteration": 3.676591396331787 }, { "auxiliary_loss_clip": 0.01157035, "auxiliary_loss_mlp": 0.01021981, "balance_loss_clip": 1.04482079, "balance_loss_mlp": 1.01474488, "epoch": 0.8014188661095413, "flos": 28584139703040.0, "grad_norm": 2.0374664609452426, "language_loss": 0.73997349, "learning_rate": 3.9944815258026263e-07, "loss": 0.76176357, "num_input_tokens_seen": 143781890, "step": 6665, "time_per_iteration": 3.7532827854156494 }, { "auxiliary_loss_clip": 0.01157038, "auxiliary_loss_mlp": 0.01028747, "balance_loss_clip": 1.04589391, "balance_loss_mlp": 1.02123094, "epoch": 0.8015391090001803, "flos": 29310360877440.0, "grad_norm": 1.9699411345399531, "language_loss": 0.83173484, "learning_rate": 3.989811777246057e-07, "loss": 0.85359263, "num_input_tokens_seen": 143802060, "step": 6666, "time_per_iteration": 2.6741790771484375 }, { "auxiliary_loss_clip": 0.01064715, "auxiliary_loss_mlp": 0.01003182, "balance_loss_clip": 1.01025105, "balance_loss_mlp": 1.00205553, "epoch": 0.8016593518908195, "flos": 70397340675840.0, "grad_norm": 0.8571874662979818, "language_loss": 0.66202956, "learning_rate": 3.985144457422305e-07, "loss": 0.68270856, "num_input_tokens_seen": 143856345, "step": 6667, "time_per_iteration": 3.1302645206451416 }, { "auxiliary_loss_clip": 0.01167299, "auxiliary_loss_mlp": 0.0102592, "balance_loss_clip": 1.04591012, "balance_loss_mlp": 1.01871073, "epoch": 0.8017795947814585, "flos": 26026114688640.0, "grad_norm": 1.8282722862344438, "language_loss": 0.76581407, "learning_rate": 3.9804795670394096e-07, "loss": 0.78774631, "num_input_tokens_seen": 143876470, "step": 6668, "time_per_iteration": 2.5995113849639893 }, { "auxiliary_loss_clip": 0.01127793, "auxiliary_loss_mlp": 0.01021173, "balance_loss_clip": 1.04074919, "balance_loss_mlp": 1.01430368, "epoch": 0.8018998376720976, "flos": 22087181260800.0, "grad_norm": 1.63931249636291, "language_loss": 0.70664251, "learning_rate": 3.975817106805022e-07, "loss": 0.72813219, "num_input_tokens_seen": 143895170, "step": 6669, "time_per_iteration": 2.704141139984131 }, { "auxiliary_loss_clip": 0.01125367, "auxiliary_loss_mlp": 0.01025197, "balance_loss_clip": 1.0431006, "balance_loss_mlp": 1.01786613, "epoch": 0.8020200805627368, "flos": 34568545023360.0, "grad_norm": 3.314096642308104, "language_loss": 0.65041065, "learning_rate": 3.97115707742645e-07, "loss": 0.67191625, "num_input_tokens_seen": 143915845, "step": 6670, "time_per_iteration": 2.8983476161956787 }, { "auxiliary_loss_clip": 0.01142608, "auxiliary_loss_mlp": 0.01022773, "balance_loss_clip": 1.04613709, "balance_loss_mlp": 1.01595688, "epoch": 0.8021403234533758, "flos": 20120354196480.0, "grad_norm": 2.0501038992173344, "language_loss": 0.64725602, "learning_rate": 3.966499479610599e-07, "loss": 0.66890985, "num_input_tokens_seen": 143933940, "step": 6671, "time_per_iteration": 2.8100409507751465 }, { "auxiliary_loss_clip": 0.01122202, "auxiliary_loss_mlp": 0.01025394, "balance_loss_clip": 1.04365039, "balance_loss_mlp": 1.0186466, "epoch": 0.8022605663440149, "flos": 27746200252800.0, "grad_norm": 2.0580553244467317, "language_loss": 0.64835286, "learning_rate": 3.9618443140640225e-07, "loss": 0.66982883, "num_input_tokens_seen": 143952850, "step": 6672, "time_per_iteration": 2.8200061321258545 }, { "auxiliary_loss_clip": 0.01019357, "auxiliary_loss_mlp": 0.00999761, "balance_loss_clip": 1.00868118, "balance_loss_mlp": 0.998658, "epoch": 0.802380809234654, "flos": 60244998768000.0, "grad_norm": 0.9514401170998399, "language_loss": 0.51319474, "learning_rate": 3.957191581492918e-07, "loss": 0.53338593, "num_input_tokens_seen": 144013610, "step": 6673, "time_per_iteration": 3.3444461822509766 }, { "auxiliary_loss_clip": 0.01133597, "auxiliary_loss_mlp": 0.01028432, "balance_loss_clip": 1.04279375, "balance_loss_mlp": 1.02079701, "epoch": 0.8025010521252931, "flos": 15080722352640.0, "grad_norm": 3.042229916381196, "language_loss": 0.71183473, "learning_rate": 3.952541282603097e-07, "loss": 0.73345506, "num_input_tokens_seen": 144028715, "step": 6674, "time_per_iteration": 2.603052854537964 }, { "auxiliary_loss_clip": 0.01148643, "auxiliary_loss_mlp": 0.01023713, "balance_loss_clip": 1.04405761, "balance_loss_mlp": 1.01621485, "epoch": 0.8026212950159322, "flos": 22163527618560.0, "grad_norm": 1.976011786906928, "language_loss": 0.83579731, "learning_rate": 3.9478934181000013e-07, "loss": 0.85752082, "num_input_tokens_seen": 144048740, "step": 6675, "time_per_iteration": 2.678072214126587 }, { "auxiliary_loss_clip": 0.01169617, "auxiliary_loss_mlp": 0.01028351, "balance_loss_clip": 1.04609609, "balance_loss_mlp": 1.02086437, "epoch": 0.8027415379065713, "flos": 17675986792320.0, "grad_norm": 2.6470521414139885, "language_loss": 0.84771991, "learning_rate": 3.943247988688714e-07, "loss": 0.86969954, "num_input_tokens_seen": 144067435, "step": 6676, "time_per_iteration": 2.6244890689849854 }, { "auxiliary_loss_clip": 0.01151195, "auxiliary_loss_mlp": 0.01023232, "balance_loss_clip": 1.04206729, "balance_loss_mlp": 1.01641655, "epoch": 0.8028617807972104, "flos": 21979593048960.0, "grad_norm": 1.621980457380252, "language_loss": 0.72034276, "learning_rate": 3.938604995073933e-07, "loss": 0.74208701, "num_input_tokens_seen": 144085905, "step": 6677, "time_per_iteration": 2.666801929473877 }, { "auxiliary_loss_clip": 0.0114165, "auxiliary_loss_mlp": 0.01026746, "balance_loss_clip": 1.04435122, "balance_loss_mlp": 1.01930141, "epoch": 0.8029820236878494, "flos": 26428457905920.0, "grad_norm": 1.896965134520913, "language_loss": 0.65158582, "learning_rate": 3.9339644379600157e-07, "loss": 0.67326975, "num_input_tokens_seen": 144105735, "step": 6678, "time_per_iteration": 2.738889217376709 }, { "auxiliary_loss_clip": 0.01157797, "auxiliary_loss_mlp": 0.01025104, "balance_loss_clip": 1.04745066, "balance_loss_mlp": 1.01746583, "epoch": 0.8031022665784886, "flos": 17676489582720.0, "grad_norm": 2.318460260470543, "language_loss": 0.7157408, "learning_rate": 3.929326318050907e-07, "loss": 0.73756987, "num_input_tokens_seen": 144123405, "step": 6679, "time_per_iteration": 2.5747742652893066 }, { "auxiliary_loss_clip": 0.0116356, "auxiliary_loss_mlp": 0.01021488, "balance_loss_clip": 1.04461241, "balance_loss_mlp": 1.01428211, "epoch": 0.8032225094691277, "flos": 15450279431040.0, "grad_norm": 2.0258329194147024, "language_loss": 0.78872329, "learning_rate": 3.924690636050225e-07, "loss": 0.81057382, "num_input_tokens_seen": 144140815, "step": 6680, "time_per_iteration": 2.5574538707733154 }, { "auxiliary_loss_clip": 0.01154745, "auxiliary_loss_mlp": 0.01022324, "balance_loss_clip": 1.04568195, "balance_loss_mlp": 1.01467371, "epoch": 0.8033427523597667, "flos": 26179202453760.0, "grad_norm": 2.0826822266057534, "language_loss": 0.72938728, "learning_rate": 3.9200573926611915e-07, "loss": 0.75115794, "num_input_tokens_seen": 144162230, "step": 6681, "time_per_iteration": 2.6499826908111572 }, { "auxiliary_loss_clip": 0.01151886, "auxiliary_loss_mlp": 0.01026539, "balance_loss_clip": 1.0474658, "balance_loss_mlp": 1.01960444, "epoch": 0.8034629952504058, "flos": 21324905809920.0, "grad_norm": 1.8789600024984061, "language_loss": 0.73186529, "learning_rate": 3.9154265885866613e-07, "loss": 0.75364959, "num_input_tokens_seen": 144181540, "step": 6682, "time_per_iteration": 2.6929173469543457 }, { "auxiliary_loss_clip": 0.01153499, "auxiliary_loss_mlp": 0.0102535, "balance_loss_clip": 1.04752469, "balance_loss_mlp": 1.01787543, "epoch": 0.8035832381410449, "flos": 21651585027840.0, "grad_norm": 2.6816917408072922, "language_loss": 0.75016391, "learning_rate": 3.9107982245291394e-07, "loss": 0.77195239, "num_input_tokens_seen": 144199665, "step": 6683, "time_per_iteration": 2.5866661071777344 }, { "auxiliary_loss_clip": 0.0112534, "auxiliary_loss_mlp": 0.01026076, "balance_loss_clip": 1.04474306, "balance_loss_mlp": 1.0186553, "epoch": 0.803703481031684, "flos": 20518818744960.0, "grad_norm": 2.026007095002717, "language_loss": 0.77216923, "learning_rate": 3.9061723011907245e-07, "loss": 0.79368341, "num_input_tokens_seen": 144219020, "step": 6684, "time_per_iteration": 2.7521634101867676 }, { "auxiliary_loss_clip": 0.01138673, "auxiliary_loss_mlp": 0.01030288, "balance_loss_clip": 1.04344726, "balance_loss_mlp": 1.02258158, "epoch": 0.803823723922323, "flos": 22854807838080.0, "grad_norm": 1.677040812408557, "language_loss": 0.79368579, "learning_rate": 3.901548819273179e-07, "loss": 0.81537545, "num_input_tokens_seen": 144239035, "step": 6685, "time_per_iteration": 2.6913387775421143 }, { "auxiliary_loss_clip": 0.01152069, "auxiliary_loss_mlp": 0.01025199, "balance_loss_clip": 1.04500258, "balance_loss_mlp": 1.01807976, "epoch": 0.8039439668129622, "flos": 21362145235200.0, "grad_norm": 1.930173022697641, "language_loss": 0.69341147, "learning_rate": 3.896927779477881e-07, "loss": 0.71518415, "num_input_tokens_seen": 144258295, "step": 6686, "time_per_iteration": 3.53283429145813 }, { "auxiliary_loss_clip": 0.01126043, "auxiliary_loss_mlp": 0.01024285, "balance_loss_clip": 1.04132485, "balance_loss_mlp": 1.01703107, "epoch": 0.8040642097036013, "flos": 23802382575360.0, "grad_norm": 2.040405946396933, "language_loss": 0.67413282, "learning_rate": 3.892309182505833e-07, "loss": 0.69563609, "num_input_tokens_seen": 144276110, "step": 6687, "time_per_iteration": 3.792588233947754 }, { "auxiliary_loss_clip": 0.0116473, "auxiliary_loss_mlp": 0.01025649, "balance_loss_clip": 1.04519033, "balance_loss_mlp": 1.01840734, "epoch": 0.8041844525942403, "flos": 25922046009600.0, "grad_norm": 3.360773166055262, "language_loss": 0.86347687, "learning_rate": 3.887693029057675e-07, "loss": 0.88538063, "num_input_tokens_seen": 144295620, "step": 6688, "time_per_iteration": 2.632033109664917 }, { "auxiliary_loss_clip": 0.01140102, "auxiliary_loss_mlp": 0.01021857, "balance_loss_clip": 1.04388797, "balance_loss_mlp": 1.0154047, "epoch": 0.8043046954848795, "flos": 25191120153600.0, "grad_norm": 1.7126691095787838, "language_loss": 0.81258261, "learning_rate": 3.8830793198336684e-07, "loss": 0.83420217, "num_input_tokens_seen": 144315210, "step": 6689, "time_per_iteration": 2.7543301582336426 }, { "auxiliary_loss_clip": 0.0115853, "auxiliary_loss_mlp": 0.01029591, "balance_loss_clip": 1.04578221, "balance_loss_mlp": 1.02201533, "epoch": 0.8044249383755185, "flos": 41719185123840.0, "grad_norm": 1.8027025949090252, "language_loss": 0.70240974, "learning_rate": 3.878468055533721e-07, "loss": 0.72429097, "num_input_tokens_seen": 144337750, "step": 6690, "time_per_iteration": 3.7838265895843506 }, { "auxiliary_loss_clip": 0.01132499, "auxiliary_loss_mlp": 0.01039048, "balance_loss_clip": 1.04430759, "balance_loss_mlp": 1.0308733, "epoch": 0.8045451812661576, "flos": 20631434860800.0, "grad_norm": 4.042632290894405, "language_loss": 0.84840524, "learning_rate": 3.8738592368573464e-07, "loss": 0.8701207, "num_input_tokens_seen": 144355305, "step": 6691, "time_per_iteration": 2.751680850982666 }, { "auxiliary_loss_clip": 0.01117986, "auxiliary_loss_mlp": 0.01025877, "balance_loss_clip": 1.04215741, "balance_loss_mlp": 1.01868248, "epoch": 0.8046654241567968, "flos": 29711806254720.0, "grad_norm": 2.096202333294587, "language_loss": 0.88072956, "learning_rate": 3.8692528645037137e-07, "loss": 0.90216815, "num_input_tokens_seen": 144374485, "step": 6692, "time_per_iteration": 3.6693036556243896 }, { "auxiliary_loss_clip": 0.01166649, "auxiliary_loss_mlp": 0.01027551, "balance_loss_clip": 1.04721117, "balance_loss_mlp": 1.02024925, "epoch": 0.8047856670474358, "flos": 17671389851520.0, "grad_norm": 2.14913359985059, "language_loss": 0.77669942, "learning_rate": 3.8646489391715907e-07, "loss": 0.79864144, "num_input_tokens_seen": 144388780, "step": 6693, "time_per_iteration": 2.5592777729034424 }, { "auxiliary_loss_clip": 0.01140112, "auxiliary_loss_mlp": 0.01027556, "balance_loss_clip": 1.04379141, "balance_loss_mlp": 1.01989043, "epoch": 0.8049059099380749, "flos": 17120699464320.0, "grad_norm": 2.333836135212007, "language_loss": 0.8795982, "learning_rate": 3.8600474615593903e-07, "loss": 0.90127486, "num_input_tokens_seen": 144403395, "step": 6694, "time_per_iteration": 2.6118319034576416 }, { "auxiliary_loss_clip": 0.0103498, "auxiliary_loss_mlp": 0.01000808, "balance_loss_clip": 1.01028836, "balance_loss_mlp": 0.99972898, "epoch": 0.805026152828714, "flos": 62212903240320.0, "grad_norm": 0.7858853051697917, "language_loss": 0.59614527, "learning_rate": 3.8554484323651605e-07, "loss": 0.61650318, "num_input_tokens_seen": 144465265, "step": 6695, "time_per_iteration": 3.3556320667266846 }, { "auxiliary_loss_clip": 0.01152528, "auxiliary_loss_mlp": 0.00761805, "balance_loss_clip": 1.04712987, "balance_loss_mlp": 1.00026059, "epoch": 0.8051463957193531, "flos": 21688608971520.0, "grad_norm": 2.0519758684973843, "language_loss": 0.79214931, "learning_rate": 3.85085185228657e-07, "loss": 0.81129253, "num_input_tokens_seen": 144484235, "step": 6696, "time_per_iteration": 2.6618385314941406 }, { "auxiliary_loss_clip": 0.01132417, "auxiliary_loss_mlp": 0.01028133, "balance_loss_clip": 1.04301572, "balance_loss_mlp": 1.02011645, "epoch": 0.8052666386099921, "flos": 32051458535040.0, "grad_norm": 2.030838826254038, "language_loss": 0.73146808, "learning_rate": 3.8462577220209114e-07, "loss": 0.75307357, "num_input_tokens_seen": 144504610, "step": 6697, "time_per_iteration": 2.8052375316619873 }, { "auxiliary_loss_clip": 0.01064558, "auxiliary_loss_mlp": 0.01002805, "balance_loss_clip": 1.0101881, "balance_loss_mlp": 1.00176167, "epoch": 0.8053868815006313, "flos": 67157875768320.0, "grad_norm": 0.7044477038079189, "language_loss": 0.58986294, "learning_rate": 3.8416660422651127e-07, "loss": 0.61053658, "num_input_tokens_seen": 144574260, "step": 6698, "time_per_iteration": 3.3441967964172363 }, { "auxiliary_loss_clip": 0.01127679, "auxiliary_loss_mlp": 0.01024281, "balance_loss_clip": 1.04054952, "balance_loss_mlp": 1.01704752, "epoch": 0.8055071243912704, "flos": 23837000307840.0, "grad_norm": 1.9639227705355988, "language_loss": 0.68217838, "learning_rate": 3.837076813715723e-07, "loss": 0.70369798, "num_input_tokens_seen": 144594145, "step": 6699, "time_per_iteration": 2.7055587768554688 }, { "auxiliary_loss_clip": 0.01122887, "auxiliary_loss_mlp": 0.01029587, "balance_loss_clip": 1.04019153, "balance_loss_mlp": 1.0210216, "epoch": 0.8056273672819094, "flos": 21324510760320.0, "grad_norm": 2.5556585898206916, "language_loss": 0.75173509, "learning_rate": 3.832490037068941e-07, "loss": 0.77325982, "num_input_tokens_seen": 144612935, "step": 6700, "time_per_iteration": 2.747830629348755 }, { "auxiliary_loss_clip": 0.01094081, "auxiliary_loss_mlp": 0.0102447, "balance_loss_clip": 1.03850842, "balance_loss_mlp": 1.01738954, "epoch": 0.8057476101725486, "flos": 25768383626880.0, "grad_norm": 1.8667292132826152, "language_loss": 0.75965178, "learning_rate": 3.827905713020554e-07, "loss": 0.7808373, "num_input_tokens_seen": 144630580, "step": 6701, "time_per_iteration": 2.8565828800201416 }, { "auxiliary_loss_clip": 0.01127833, "auxiliary_loss_mlp": 0.01025691, "balance_loss_clip": 1.03966975, "balance_loss_mlp": 1.01713753, "epoch": 0.8058678530631876, "flos": 24535283679360.0, "grad_norm": 2.007475077175213, "language_loss": 0.69141352, "learning_rate": 3.823323842266017e-07, "loss": 0.7129488, "num_input_tokens_seen": 144649975, "step": 6702, "time_per_iteration": 2.710167169570923 }, { "auxiliary_loss_clip": 0.0115359, "auxiliary_loss_mlp": 0.0101959, "balance_loss_clip": 1.04205835, "balance_loss_mlp": 1.01238382, "epoch": 0.8059880959538267, "flos": 24753728240640.0, "grad_norm": 2.424771603400656, "language_loss": 0.73059636, "learning_rate": 3.818744425500393e-07, "loss": 0.75232816, "num_input_tokens_seen": 144667990, "step": 6703, "time_per_iteration": 2.724198579788208 }, { "auxiliary_loss_clip": 0.01116741, "auxiliary_loss_mlp": 0.01025207, "balance_loss_clip": 1.03865385, "balance_loss_mlp": 1.0179472, "epoch": 0.8061083388444659, "flos": 22196349671040.0, "grad_norm": 1.8085836149025707, "language_loss": 0.8053444, "learning_rate": 3.8141674634183675e-07, "loss": 0.82676393, "num_input_tokens_seen": 144687020, "step": 6704, "time_per_iteration": 2.7691078186035156 }, { "auxiliary_loss_clip": 0.01109193, "auxiliary_loss_mlp": 0.01027808, "balance_loss_clip": 1.04166269, "balance_loss_mlp": 1.02137399, "epoch": 0.8062285817351049, "flos": 30044195735040.0, "grad_norm": 1.761189382591428, "language_loss": 0.66088951, "learning_rate": 3.809592956714278e-07, "loss": 0.6822595, "num_input_tokens_seen": 144710255, "step": 6705, "time_per_iteration": 2.795830011367798 }, { "auxiliary_loss_clip": 0.01158905, "auxiliary_loss_mlp": 0.01025814, "balance_loss_clip": 1.04744077, "balance_loss_mlp": 1.01832771, "epoch": 0.806348824625744, "flos": 22782591544320.0, "grad_norm": 8.558763051492198, "language_loss": 0.74781334, "learning_rate": 3.805020906082057e-07, "loss": 0.76966053, "num_input_tokens_seen": 144728830, "step": 6706, "time_per_iteration": 2.667769432067871 }, { "auxiliary_loss_clip": 0.01143106, "auxiliary_loss_mlp": 0.01030836, "balance_loss_clip": 1.04450202, "balance_loss_mlp": 1.02218723, "epoch": 0.8064690675163831, "flos": 23404600385280.0, "grad_norm": 3.6560542782517635, "language_loss": 0.80786294, "learning_rate": 3.8004513122152917e-07, "loss": 0.82960236, "num_input_tokens_seen": 144747140, "step": 6707, "time_per_iteration": 2.6309080123901367 }, { "auxiliary_loss_clip": 0.01129634, "auxiliary_loss_mlp": 0.01028291, "balance_loss_clip": 1.04481447, "balance_loss_mlp": 1.02124834, "epoch": 0.8065893104070222, "flos": 24060903736320.0, "grad_norm": 5.7532097711745385, "language_loss": 0.67128372, "learning_rate": 3.79588417580718e-07, "loss": 0.69286299, "num_input_tokens_seen": 144765250, "step": 6708, "time_per_iteration": 2.7516837120056152 }, { "auxiliary_loss_clip": 0.01157029, "auxiliary_loss_mlp": 0.01024596, "balance_loss_clip": 1.04798555, "balance_loss_mlp": 1.017169, "epoch": 0.8067095532976613, "flos": 22305410340480.0, "grad_norm": 1.9254754871975799, "language_loss": 0.76563334, "learning_rate": 3.791319497550558e-07, "loss": 0.7874496, "num_input_tokens_seen": 144783080, "step": 6709, "time_per_iteration": 2.616316318511963 }, { "auxiliary_loss_clip": 0.01132227, "auxiliary_loss_mlp": 0.00761978, "balance_loss_clip": 1.04366863, "balance_loss_mlp": 1.0002985, "epoch": 0.8068297961883004, "flos": 17129498296320.0, "grad_norm": 4.771874240411894, "language_loss": 0.7093364, "learning_rate": 3.78675727813788e-07, "loss": 0.7282784, "num_input_tokens_seen": 144800645, "step": 6710, "time_per_iteration": 2.6581382751464844 }, { "auxiliary_loss_clip": 0.01138666, "auxiliary_loss_mlp": 0.01025977, "balance_loss_clip": 1.04421604, "balance_loss_mlp": 1.01866639, "epoch": 0.8069500390789395, "flos": 22018843635840.0, "grad_norm": 1.6545849480560237, "language_loss": 0.73449981, "learning_rate": 3.782197518261225e-07, "loss": 0.75614619, "num_input_tokens_seen": 144820085, "step": 6711, "time_per_iteration": 2.6776797771453857 }, { "auxiliary_loss_clip": 0.01144127, "auxiliary_loss_mlp": 0.01025292, "balance_loss_clip": 1.04358244, "balance_loss_mlp": 1.0178802, "epoch": 0.8070702819695785, "flos": 19244241567360.0, "grad_norm": 2.4369533078048327, "language_loss": 0.95764345, "learning_rate": 3.777640218612319e-07, "loss": 0.97933763, "num_input_tokens_seen": 144838070, "step": 6712, "time_per_iteration": 3.5909345149993896 }, { "auxiliary_loss_clip": 0.01147924, "auxiliary_loss_mlp": 0.01022455, "balance_loss_clip": 1.04475319, "balance_loss_mlp": 1.01553154, "epoch": 0.8071905248602176, "flos": 21544320038400.0, "grad_norm": 2.4524030696783274, "language_loss": 0.71657729, "learning_rate": 3.773085379882488e-07, "loss": 0.73828101, "num_input_tokens_seen": 144857125, "step": 6713, "time_per_iteration": 2.6792399883270264 }, { "auxiliary_loss_clip": 0.01151621, "auxiliary_loss_mlp": 0.00762349, "balance_loss_clip": 1.04198134, "balance_loss_mlp": 1.00029445, "epoch": 0.8073107677508568, "flos": 37268309105280.0, "grad_norm": 1.9460315691344214, "language_loss": 0.76351786, "learning_rate": 3.768533002762715e-07, "loss": 0.78265762, "num_input_tokens_seen": 144880660, "step": 6714, "time_per_iteration": 3.8547730445861816 }, { "auxiliary_loss_clip": 0.01137919, "auxiliary_loss_mlp": 0.01028433, "balance_loss_clip": 1.04076064, "balance_loss_mlp": 1.02131629, "epoch": 0.8074310106414958, "flos": 28366269759360.0, "grad_norm": 1.650601631541225, "language_loss": 0.7687183, "learning_rate": 3.763983087943572e-07, "loss": 0.79038185, "num_input_tokens_seen": 144900050, "step": 6715, "time_per_iteration": 2.7230751514434814 }, { "auxiliary_loss_clip": 0.01145864, "auxiliary_loss_mlp": 0.0076205, "balance_loss_clip": 1.04383707, "balance_loss_mlp": 1.00029075, "epoch": 0.8075512535321349, "flos": 24281646768000.0, "grad_norm": 1.6440954139173636, "language_loss": 0.81088781, "learning_rate": 3.759435636115282e-07, "loss": 0.82996696, "num_input_tokens_seen": 144920835, "step": 6716, "time_per_iteration": 3.5402698516845703 }, { "auxiliary_loss_clip": 0.01090542, "auxiliary_loss_mlp": 0.00762236, "balance_loss_clip": 1.03979897, "balance_loss_mlp": 1.00030351, "epoch": 0.807671496422774, "flos": 26030855283840.0, "grad_norm": 2.0619252526920473, "language_loss": 0.73218077, "learning_rate": 3.7548906479676967e-07, "loss": 0.75070846, "num_input_tokens_seen": 144940430, "step": 6717, "time_per_iteration": 3.7854037284851074 }, { "auxiliary_loss_clip": 0.01156252, "auxiliary_loss_mlp": 0.01026961, "balance_loss_clip": 1.04444408, "balance_loss_mlp": 1.01875377, "epoch": 0.8077917393134131, "flos": 23730740899200.0, "grad_norm": 1.987188786642795, "language_loss": 0.7184993, "learning_rate": 3.7503481241902855e-07, "loss": 0.74033141, "num_input_tokens_seen": 144960405, "step": 6718, "time_per_iteration": 2.7212939262390137 }, { "auxiliary_loss_clip": 0.01138397, "auxiliary_loss_mlp": 0.00761992, "balance_loss_clip": 1.04276919, "balance_loss_mlp": 1.00030804, "epoch": 0.8079119822040521, "flos": 18402028398720.0, "grad_norm": 1.726930422083057, "language_loss": 0.80267918, "learning_rate": 3.745808065472145e-07, "loss": 0.82168305, "num_input_tokens_seen": 144977700, "step": 6719, "time_per_iteration": 2.704171895980835 }, { "auxiliary_loss_clip": 0.01149326, "auxiliary_loss_mlp": 0.01023754, "balance_loss_clip": 1.04778886, "balance_loss_mlp": 1.01647639, "epoch": 0.8080322250946913, "flos": 23621787970560.0, "grad_norm": 1.6201188853162733, "language_loss": 0.76063275, "learning_rate": 3.741270472501994e-07, "loss": 0.78236347, "num_input_tokens_seen": 144998340, "step": 6720, "time_per_iteration": 2.609226942062378 }, { "auxiliary_loss_clip": 0.011374, "auxiliary_loss_mlp": 0.01030645, "balance_loss_clip": 1.0452373, "balance_loss_mlp": 1.02391529, "epoch": 0.8081524679853304, "flos": 22820692896000.0, "grad_norm": 2.045703426244612, "language_loss": 0.72659254, "learning_rate": 3.736735345968183e-07, "loss": 0.74827302, "num_input_tokens_seen": 145017950, "step": 6721, "time_per_iteration": 2.631300210952759 }, { "auxiliary_loss_clip": 0.01154941, "auxiliary_loss_mlp": 0.0102374, "balance_loss_clip": 1.04708576, "balance_loss_mlp": 1.01673388, "epoch": 0.8082727108759694, "flos": 17640004343040.0, "grad_norm": 1.6119000616671963, "language_loss": 0.78926432, "learning_rate": 3.7322026865586986e-07, "loss": 0.81105113, "num_input_tokens_seen": 145036985, "step": 6722, "time_per_iteration": 2.6055054664611816 }, { "auxiliary_loss_clip": 0.0115989, "auxiliary_loss_mlp": 0.01027179, "balance_loss_clip": 1.04925394, "balance_loss_mlp": 1.01948977, "epoch": 0.8083929537666086, "flos": 25958172113280.0, "grad_norm": 2.0934348913813676, "language_loss": 0.73665059, "learning_rate": 3.7276724949611206e-07, "loss": 0.7585212, "num_input_tokens_seen": 145057095, "step": 6723, "time_per_iteration": 2.6585187911987305 }, { "auxiliary_loss_clip": 0.01144313, "auxiliary_loss_mlp": 0.01024472, "balance_loss_clip": 1.04570675, "balance_loss_mlp": 1.01662254, "epoch": 0.8085131966572476, "flos": 27089178629760.0, "grad_norm": 2.5045521792052488, "language_loss": 0.75154835, "learning_rate": 3.723144771862694e-07, "loss": 0.77323616, "num_input_tokens_seen": 145077735, "step": 6724, "time_per_iteration": 2.6524457931518555 }, { "auxiliary_loss_clip": 0.01128678, "auxiliary_loss_mlp": 0.01028739, "balance_loss_clip": 1.04312181, "balance_loss_mlp": 1.02108014, "epoch": 0.8086334395478867, "flos": 23988543788160.0, "grad_norm": 9.389125020229514, "language_loss": 0.76955557, "learning_rate": 3.718619517950263e-07, "loss": 0.79112971, "num_input_tokens_seen": 145098330, "step": 6725, "time_per_iteration": 2.7643635272979736 }, { "auxiliary_loss_clip": 0.01167739, "auxiliary_loss_mlp": 0.01026911, "balance_loss_clip": 1.04878473, "balance_loss_mlp": 1.0196507, "epoch": 0.8087536824385259, "flos": 20405879406720.0, "grad_norm": 2.070930702130001, "language_loss": 0.77393061, "learning_rate": 3.714096733910301e-07, "loss": 0.7958771, "num_input_tokens_seen": 145115855, "step": 6726, "time_per_iteration": 2.6047353744506836 }, { "auxiliary_loss_clip": 0.01163768, "auxiliary_loss_mlp": 0.01035056, "balance_loss_clip": 1.04878199, "balance_loss_mlp": 1.02675915, "epoch": 0.8088739253291649, "flos": 25919639798400.0, "grad_norm": 2.539625950193938, "language_loss": 0.69968915, "learning_rate": 3.709576420428926e-07, "loss": 0.72167736, "num_input_tokens_seen": 145136655, "step": 6727, "time_per_iteration": 2.7307746410369873 }, { "auxiliary_loss_clip": 0.01140689, "auxiliary_loss_mlp": 0.01023313, "balance_loss_clip": 1.04243803, "balance_loss_mlp": 1.01609826, "epoch": 0.808994168219804, "flos": 28402072640640.0, "grad_norm": 2.984272264057401, "language_loss": 0.73046154, "learning_rate": 3.7050585781918463e-07, "loss": 0.75210154, "num_input_tokens_seen": 145156955, "step": 6728, "time_per_iteration": 2.790555953979492 }, { "auxiliary_loss_clip": 0.01156501, "auxiliary_loss_mlp": 0.01026021, "balance_loss_clip": 1.04507196, "balance_loss_mlp": 1.01851082, "epoch": 0.8091144111104431, "flos": 17421056991360.0, "grad_norm": 2.272916350716052, "language_loss": 0.6869809, "learning_rate": 3.700543207884428e-07, "loss": 0.70880616, "num_input_tokens_seen": 145173865, "step": 6729, "time_per_iteration": 2.6495230197906494 }, { "auxiliary_loss_clip": 0.01150206, "auxiliary_loss_mlp": 0.01020785, "balance_loss_clip": 1.04368329, "balance_loss_mlp": 1.01371312, "epoch": 0.8092346540010822, "flos": 32153803361280.0, "grad_norm": 1.8686169143089237, "language_loss": 0.70947027, "learning_rate": 3.6960303101916466e-07, "loss": 0.73118019, "num_input_tokens_seen": 145193780, "step": 6730, "time_per_iteration": 2.744354486465454 }, { "auxiliary_loss_clip": 0.01064991, "auxiliary_loss_mlp": 0.00753638, "balance_loss_clip": 1.01078701, "balance_loss_mlp": 1.000247, "epoch": 0.8093548968917212, "flos": 58035093390720.0, "grad_norm": 0.7413906452416251, "language_loss": 0.55534536, "learning_rate": 3.6915198857981047e-07, "loss": 0.57353163, "num_input_tokens_seen": 145258980, "step": 6731, "time_per_iteration": 3.2873332500457764 }, { "auxiliary_loss_clip": 0.01120964, "auxiliary_loss_mlp": 0.0102323, "balance_loss_clip": 1.04180217, "balance_loss_mlp": 1.01587725, "epoch": 0.8094751397823604, "flos": 27381599251200.0, "grad_norm": 1.6238463886999253, "language_loss": 0.67773122, "learning_rate": 3.687011935388027e-07, "loss": 0.69917315, "num_input_tokens_seen": 145281875, "step": 6732, "time_per_iteration": 2.810391426086426 }, { "auxiliary_loss_clip": 0.01151766, "auxiliary_loss_mlp": 0.01020987, "balance_loss_clip": 1.04537225, "balance_loss_mlp": 1.0139327, "epoch": 0.8095953826729995, "flos": 24061083304320.0, "grad_norm": 1.9776199233314524, "language_loss": 0.72888982, "learning_rate": 3.6825064596452646e-07, "loss": 0.75061738, "num_input_tokens_seen": 145302220, "step": 6733, "time_per_iteration": 2.6122918128967285 }, { "auxiliary_loss_clip": 0.01151905, "auxiliary_loss_mlp": 0.01024893, "balance_loss_clip": 1.04301095, "balance_loss_mlp": 1.01825595, "epoch": 0.8097156255636385, "flos": 23951412103680.0, "grad_norm": 1.8562402281421826, "language_loss": 0.70700181, "learning_rate": 3.678003459253305e-07, "loss": 0.72876978, "num_input_tokens_seen": 145323070, "step": 6734, "time_per_iteration": 2.7278125286102295 }, { "auxiliary_loss_clip": 0.01124187, "auxiliary_loss_mlp": 0.01030044, "balance_loss_clip": 1.0424782, "balance_loss_mlp": 1.02271271, "epoch": 0.8098358684542777, "flos": 21799142098560.0, "grad_norm": 2.186608896382513, "language_loss": 0.74018025, "learning_rate": 3.673502934895236e-07, "loss": 0.76172256, "num_input_tokens_seen": 145342575, "step": 6735, "time_per_iteration": 2.6809206008911133 }, { "auxiliary_loss_clip": 0.01064295, "auxiliary_loss_mlp": 0.01000882, "balance_loss_clip": 1.0102421, "balance_loss_mlp": 0.99979132, "epoch": 0.8099561113449167, "flos": 68809515966720.0, "grad_norm": 0.6897237886489156, "language_loss": 0.57953131, "learning_rate": 3.669004887253802e-07, "loss": 0.60018313, "num_input_tokens_seen": 145408865, "step": 6736, "time_per_iteration": 3.3543224334716797 }, { "auxiliary_loss_clip": 0.01142986, "auxiliary_loss_mlp": 0.01023177, "balance_loss_clip": 1.04585862, "balance_loss_mlp": 1.01620924, "epoch": 0.8100763542355558, "flos": 23586056916480.0, "grad_norm": 1.5333678418541534, "language_loss": 0.78947115, "learning_rate": 3.664509317011335e-07, "loss": 0.81113279, "num_input_tokens_seen": 145429200, "step": 6737, "time_per_iteration": 2.699428081512451 }, { "auxiliary_loss_clip": 0.01155156, "auxiliary_loss_mlp": 0.0102535, "balance_loss_clip": 1.04902315, "balance_loss_mlp": 1.01784563, "epoch": 0.810196597126195, "flos": 31650408207360.0, "grad_norm": 1.8326739942157924, "language_loss": 0.73623377, "learning_rate": 3.6600162248498134e-07, "loss": 0.75803888, "num_input_tokens_seen": 145452830, "step": 6738, "time_per_iteration": 3.614037275314331 }, { "auxiliary_loss_clip": 0.01076915, "auxiliary_loss_mlp": 0.01025964, "balance_loss_clip": 1.03411484, "balance_loss_mlp": 1.01913023, "epoch": 0.810316840016834, "flos": 24900459298560.0, "grad_norm": 1.7875238942546223, "language_loss": 0.76167393, "learning_rate": 3.6555256114508426e-07, "loss": 0.78270268, "num_input_tokens_seen": 145472625, "step": 6739, "time_per_iteration": 2.786972761154175 }, { "auxiliary_loss_clip": 0.01138576, "auxiliary_loss_mlp": 0.01034726, "balance_loss_clip": 1.03992498, "balance_loss_mlp": 1.02711725, "epoch": 0.8104370829074731, "flos": 27965003950080.0, "grad_norm": 2.0184615393574994, "language_loss": 0.72946858, "learning_rate": 3.651037477495642e-07, "loss": 0.75120163, "num_input_tokens_seen": 145494075, "step": 6740, "time_per_iteration": 3.701734781265259 }, { "auxiliary_loss_clip": 0.01165754, "auxiliary_loss_mlp": 0.01030626, "balance_loss_clip": 1.04509497, "balance_loss_mlp": 1.02344394, "epoch": 0.8105573257981122, "flos": 24640752988800.0, "grad_norm": 2.2612466036966894, "language_loss": 0.67987198, "learning_rate": 3.6465518236650584e-07, "loss": 0.70183581, "num_input_tokens_seen": 145514220, "step": 6741, "time_per_iteration": 2.730714797973633 }, { "auxiliary_loss_clip": 0.01125416, "auxiliary_loss_mlp": 0.01021193, "balance_loss_clip": 1.04226601, "balance_loss_mlp": 1.01451421, "epoch": 0.8106775686887513, "flos": 26358935132160.0, "grad_norm": 1.9101157888147844, "language_loss": 0.78239787, "learning_rate": 3.642068650639558e-07, "loss": 0.80386394, "num_input_tokens_seen": 145533965, "step": 6742, "time_per_iteration": 3.628936767578125 }, { "auxiliary_loss_clip": 0.01131323, "auxiliary_loss_mlp": 0.01024899, "balance_loss_clip": 1.03879547, "balance_loss_mlp": 1.01782143, "epoch": 0.8107978115793903, "flos": 27271892136960.0, "grad_norm": 2.3403319295773026, "language_loss": 0.64612436, "learning_rate": 3.6375879590992334e-07, "loss": 0.66768658, "num_input_tokens_seen": 145554310, "step": 6743, "time_per_iteration": 3.655266284942627 }, { "auxiliary_loss_clip": 0.01134313, "auxiliary_loss_mlp": 0.01030752, "balance_loss_clip": 1.04222083, "balance_loss_mlp": 1.02359927, "epoch": 0.8109180544700295, "flos": 24934322845440.0, "grad_norm": 2.0379656965539934, "language_loss": 0.80922627, "learning_rate": 3.6331097497238173e-07, "loss": 0.83087695, "num_input_tokens_seen": 145573755, "step": 6744, "time_per_iteration": 2.702237129211426 }, { "auxiliary_loss_clip": 0.01123114, "auxiliary_loss_mlp": 0.01019525, "balance_loss_clip": 1.04154289, "balance_loss_mlp": 1.01274788, "epoch": 0.8110382973606686, "flos": 21105383840640.0, "grad_norm": 1.9017473837866932, "language_loss": 0.79699701, "learning_rate": 3.628634023192627e-07, "loss": 0.81842339, "num_input_tokens_seen": 145594000, "step": 6745, "time_per_iteration": 2.7435286045074463 }, { "auxiliary_loss_clip": 0.01153616, "auxiliary_loss_mlp": 0.01026671, "balance_loss_clip": 1.0446564, "balance_loss_mlp": 1.01913095, "epoch": 0.8111585402513076, "flos": 15414081500160.0, "grad_norm": 2.0425402172120264, "language_loss": 0.7547375, "learning_rate": 3.624160780184644e-07, "loss": 0.77654028, "num_input_tokens_seen": 145611215, "step": 6746, "time_per_iteration": 2.615886688232422 }, { "auxiliary_loss_clip": 0.01132685, "auxiliary_loss_mlp": 0.0102173, "balance_loss_clip": 1.04288495, "balance_loss_mlp": 1.0142436, "epoch": 0.8112787831419467, "flos": 24095736950400.0, "grad_norm": 2.0194326229268253, "language_loss": 0.74667335, "learning_rate": 3.6196900213784496e-07, "loss": 0.76821744, "num_input_tokens_seen": 145630530, "step": 6747, "time_per_iteration": 2.6693856716156006 }, { "auxiliary_loss_clip": 0.01154651, "auxiliary_loss_mlp": 0.01024483, "balance_loss_clip": 1.04503155, "balance_loss_mlp": 1.01764619, "epoch": 0.8113990260325858, "flos": 20483374999680.0, "grad_norm": 1.8730010996656736, "language_loss": 0.86623436, "learning_rate": 3.6152217474522527e-07, "loss": 0.88802576, "num_input_tokens_seen": 145647345, "step": 6748, "time_per_iteration": 2.6099674701690674 }, { "auxiliary_loss_clip": 0.01155692, "auxiliary_loss_mlp": 0.01026449, "balance_loss_clip": 1.04887307, "balance_loss_mlp": 1.01911736, "epoch": 0.8115192689232249, "flos": 24901141656960.0, "grad_norm": 1.8014404876793537, "language_loss": 0.73082066, "learning_rate": 3.6107559590838975e-07, "loss": 0.75264204, "num_input_tokens_seen": 145666330, "step": 6749, "time_per_iteration": 2.6364614963531494 }, { "auxiliary_loss_clip": 0.01092465, "auxiliary_loss_mlp": 0.01026997, "balance_loss_clip": 1.03848636, "balance_loss_mlp": 1.01960278, "epoch": 0.811639511813864, "flos": 24057204635520.0, "grad_norm": 2.6382949896127346, "language_loss": 0.66563833, "learning_rate": 3.606292656950822e-07, "loss": 0.6868329, "num_input_tokens_seen": 145684740, "step": 6750, "time_per_iteration": 2.7730424404144287 }, { "auxiliary_loss_clip": 0.01132285, "auxiliary_loss_mlp": 0.01030064, "balance_loss_clip": 1.04062319, "balance_loss_mlp": 1.02204108, "epoch": 0.8117597547045031, "flos": 23185150243200.0, "grad_norm": 2.092806495273232, "language_loss": 0.86498737, "learning_rate": 3.601831841730121e-07, "loss": 0.88661087, "num_input_tokens_seen": 145702660, "step": 6751, "time_per_iteration": 2.6612370014190674 }, { "auxiliary_loss_clip": 0.01150059, "auxiliary_loss_mlp": 0.01026838, "balance_loss_clip": 1.0446775, "balance_loss_mlp": 1.01977825, "epoch": 0.8118799975951422, "flos": 23040250778880.0, "grad_norm": 2.150153526122874, "language_loss": 0.72546262, "learning_rate": 3.5973735140984916e-07, "loss": 0.7472316, "num_input_tokens_seen": 145722830, "step": 6752, "time_per_iteration": 2.755699396133423 }, { "auxiliary_loss_clip": 0.01106616, "auxiliary_loss_mlp": 0.00761935, "balance_loss_clip": 1.03873062, "balance_loss_mlp": 1.00025082, "epoch": 0.8120002404857812, "flos": 24639962889600.0, "grad_norm": 2.1479293726628335, "language_loss": 0.7953099, "learning_rate": 3.5929176747322607e-07, "loss": 0.81399548, "num_input_tokens_seen": 145741935, "step": 6753, "time_per_iteration": 2.749873161315918 }, { "auxiliary_loss_clip": 0.01047047, "auxiliary_loss_mlp": 0.01000933, "balance_loss_clip": 1.01043868, "balance_loss_mlp": 0.99990767, "epoch": 0.8121204833764204, "flos": 57415742156160.0, "grad_norm": 0.8098266423300995, "language_loss": 0.56176656, "learning_rate": 3.588464324307372e-07, "loss": 0.58224636, "num_input_tokens_seen": 145805560, "step": 6754, "time_per_iteration": 3.342338800430298 }, { "auxiliary_loss_clip": 0.01154236, "auxiliary_loss_mlp": 0.01025152, "balance_loss_clip": 1.04342616, "balance_loss_mlp": 1.01819587, "epoch": 0.8122407262670595, "flos": 19464589549440.0, "grad_norm": 1.765450332735354, "language_loss": 0.75360906, "learning_rate": 3.584013463499391e-07, "loss": 0.7754029, "num_input_tokens_seen": 145824180, "step": 6755, "time_per_iteration": 2.6084117889404297 }, { "auxiliary_loss_clip": 0.01043866, "auxiliary_loss_mlp": 0.01001511, "balance_loss_clip": 1.00996268, "balance_loss_mlp": 1.00038493, "epoch": 0.8123609691576985, "flos": 56425325472000.0, "grad_norm": 0.7339927557820085, "language_loss": 0.64425051, "learning_rate": 3.579565092983521e-07, "loss": 0.66470432, "num_input_tokens_seen": 145885300, "step": 6756, "time_per_iteration": 3.1868698596954346 }, { "auxiliary_loss_clip": 0.011654, "auxiliary_loss_mlp": 0.01025218, "balance_loss_clip": 1.04529715, "balance_loss_mlp": 1.01784503, "epoch": 0.8124812120483377, "flos": 20631973564800.0, "grad_norm": 1.9543579265210251, "language_loss": 0.83786309, "learning_rate": 3.575119213434565e-07, "loss": 0.85976928, "num_input_tokens_seen": 145903815, "step": 6757, "time_per_iteration": 2.7193238735198975 }, { "auxiliary_loss_clip": 0.01150321, "auxiliary_loss_mlp": 0.0102234, "balance_loss_clip": 1.04619575, "balance_loss_mlp": 1.01547945, "epoch": 0.8126014549389767, "flos": 22492397566080.0, "grad_norm": 1.8004129726041063, "language_loss": 0.81609917, "learning_rate": 3.5706758255269765e-07, "loss": 0.83782566, "num_input_tokens_seen": 145922270, "step": 6758, "time_per_iteration": 2.916170358657837 }, { "auxiliary_loss_clip": 0.01142772, "auxiliary_loss_mlp": 0.0103049, "balance_loss_clip": 1.04452956, "balance_loss_mlp": 1.02332234, "epoch": 0.8127216978296158, "flos": 23287961946240.0, "grad_norm": 1.578065908580091, "language_loss": 0.69679046, "learning_rate": 3.566234929934795e-07, "loss": 0.71852303, "num_input_tokens_seen": 145941470, "step": 6759, "time_per_iteration": 2.6941497325897217 }, { "auxiliary_loss_clip": 0.0115065, "auxiliary_loss_mlp": 0.01023341, "balance_loss_clip": 1.04719126, "balance_loss_mlp": 1.01639974, "epoch": 0.812841940720255, "flos": 25154994049920.0, "grad_norm": 1.3984229417732994, "language_loss": 0.71819484, "learning_rate": 3.561796527331706e-07, "loss": 0.73993474, "num_input_tokens_seen": 145963145, "step": 6760, "time_per_iteration": 2.6374270915985107 }, { "auxiliary_loss_clip": 0.01127171, "auxiliary_loss_mlp": 0.01024087, "balance_loss_clip": 1.04285955, "balance_loss_mlp": 1.01705647, "epoch": 0.812962183610894, "flos": 26648446752000.0, "grad_norm": 1.8456232683533451, "language_loss": 0.77666843, "learning_rate": 3.5573606183910163e-07, "loss": 0.798181, "num_input_tokens_seen": 145983150, "step": 6761, "time_per_iteration": 2.765927314758301 }, { "auxiliary_loss_clip": 0.01158696, "auxiliary_loss_mlp": 0.01025123, "balance_loss_clip": 1.04464626, "balance_loss_mlp": 1.01762176, "epoch": 0.8130824265015331, "flos": 24966965329920.0, "grad_norm": 1.663396216963244, "language_loss": 0.78314865, "learning_rate": 3.5529272037856493e-07, "loss": 0.80498683, "num_input_tokens_seen": 146001365, "step": 6762, "time_per_iteration": 2.6404948234558105 }, { "auxiliary_loss_clip": 0.01019782, "auxiliary_loss_mlp": 0.01001993, "balance_loss_clip": 1.01004291, "balance_loss_mlp": 1.0009383, "epoch": 0.8132026693921722, "flos": 67622918175360.0, "grad_norm": 0.7267919999992576, "language_loss": 0.53819984, "learning_rate": 3.548496284188149e-07, "loss": 0.55841756, "num_input_tokens_seen": 146061570, "step": 6763, "time_per_iteration": 3.449946880340576 }, { "auxiliary_loss_clip": 0.01105027, "auxiliary_loss_mlp": 0.01023465, "balance_loss_clip": 1.04289222, "balance_loss_mlp": 1.01691461, "epoch": 0.8133229122828113, "flos": 19495149045120.0, "grad_norm": 1.7052395409267305, "language_loss": 0.7922774, "learning_rate": 3.544067860270681e-07, "loss": 0.81356233, "num_input_tokens_seen": 146079145, "step": 6764, "time_per_iteration": 4.241077899932861 }, { "auxiliary_loss_clip": 0.01128175, "auxiliary_loss_mlp": 0.01029039, "balance_loss_clip": 1.04152966, "balance_loss_mlp": 1.02127242, "epoch": 0.8134431551734503, "flos": 20668135582080.0, "grad_norm": 1.605901410645251, "language_loss": 0.70808089, "learning_rate": 3.539641932705029e-07, "loss": 0.72965306, "num_input_tokens_seen": 146097625, "step": 6765, "time_per_iteration": 2.6738879680633545 }, { "auxiliary_loss_clip": 0.011721, "auxiliary_loss_mlp": 0.01029411, "balance_loss_clip": 1.04835677, "balance_loss_mlp": 1.02160525, "epoch": 0.8135633980640895, "flos": 21507332008320.0, "grad_norm": 2.1668270650832313, "language_loss": 0.77378178, "learning_rate": 3.53521850216262e-07, "loss": 0.79579687, "num_input_tokens_seen": 146117195, "step": 6766, "time_per_iteration": 3.5958991050720215 }, { "auxiliary_loss_clip": 0.01168414, "auxiliary_loss_mlp": 0.01028152, "balance_loss_clip": 1.04705501, "balance_loss_mlp": 1.02080894, "epoch": 0.8136836409547286, "flos": 20554442058240.0, "grad_norm": 1.7462105110917778, "language_loss": 0.76822436, "learning_rate": 3.530797569314461e-07, "loss": 0.79018998, "num_input_tokens_seen": 146136220, "step": 6767, "time_per_iteration": 2.5776124000549316 }, { "auxiliary_loss_clip": 0.01168279, "auxiliary_loss_mlp": 0.01024433, "balance_loss_clip": 1.04812407, "balance_loss_mlp": 1.0173161, "epoch": 0.8138038838453676, "flos": 20299045380480.0, "grad_norm": 5.3353156622999105, "language_loss": 0.77876586, "learning_rate": 3.5263791348312235e-07, "loss": 0.80069292, "num_input_tokens_seen": 146155415, "step": 6768, "time_per_iteration": 2.6032814979553223 }, { "auxiliary_loss_clip": 0.0113586, "auxiliary_loss_mlp": 0.0102281, "balance_loss_clip": 1.04133892, "balance_loss_mlp": 1.01562428, "epoch": 0.8139241267360068, "flos": 29789840551680.0, "grad_norm": 1.8885355330848477, "language_loss": 0.70610976, "learning_rate": 3.521963199383171e-07, "loss": 0.72769654, "num_input_tokens_seen": 146178370, "step": 6769, "time_per_iteration": 3.6701419353485107 }, { "auxiliary_loss_clip": 0.01111533, "auxiliary_loss_mlp": 0.01025869, "balance_loss_clip": 1.04016232, "balance_loss_mlp": 1.01782537, "epoch": 0.8140443696266458, "flos": 19713270384000.0, "grad_norm": 1.9558474050020058, "language_loss": 0.76693845, "learning_rate": 3.517549763640197e-07, "loss": 0.78831244, "num_input_tokens_seen": 146196010, "step": 6770, "time_per_iteration": 2.755021810531616 }, { "auxiliary_loss_clip": 0.01151632, "auxiliary_loss_mlp": 0.00762366, "balance_loss_clip": 1.04766679, "balance_loss_mlp": 1.00030243, "epoch": 0.8141646125172849, "flos": 27160568910720.0, "grad_norm": 1.9021443885281621, "language_loss": 0.71119058, "learning_rate": 3.513138828271829e-07, "loss": 0.73033059, "num_input_tokens_seen": 146215880, "step": 6771, "time_per_iteration": 2.7321043014526367 }, { "auxiliary_loss_clip": 0.0111842, "auxiliary_loss_mlp": 0.01023915, "balance_loss_clip": 1.04097891, "balance_loss_mlp": 1.01740909, "epoch": 0.8142848554079241, "flos": 39673102700160.0, "grad_norm": 1.9270048577847785, "language_loss": 0.69917703, "learning_rate": 3.508730393947179e-07, "loss": 0.72060037, "num_input_tokens_seen": 146239135, "step": 6772, "time_per_iteration": 2.809476852416992 }, { "auxiliary_loss_clip": 0.01123513, "auxiliary_loss_mlp": 0.01021841, "balance_loss_clip": 1.04175425, "balance_loss_mlp": 1.01410413, "epoch": 0.8144050982985631, "flos": 22237288197120.0, "grad_norm": 1.5659830090760427, "language_loss": 0.72389346, "learning_rate": 3.504324461335024e-07, "loss": 0.74534702, "num_input_tokens_seen": 146259245, "step": 6773, "time_per_iteration": 2.7746129035949707 }, { "auxiliary_loss_clip": 0.0110208, "auxiliary_loss_mlp": 0.01026691, "balance_loss_clip": 1.03941131, "balance_loss_mlp": 1.01924086, "epoch": 0.8145253411892022, "flos": 23038239617280.0, "grad_norm": 1.6839746075585653, "language_loss": 0.88235205, "learning_rate": 3.499921031103732e-07, "loss": 0.90363979, "num_input_tokens_seen": 146280015, "step": 6774, "time_per_iteration": 2.800969123840332 }, { "auxiliary_loss_clip": 0.01133613, "auxiliary_loss_mlp": 0.01024395, "balance_loss_clip": 1.04057539, "balance_loss_mlp": 1.01735556, "epoch": 0.8146455840798413, "flos": 24827668387200.0, "grad_norm": 1.6582059915551792, "language_loss": 0.785864, "learning_rate": 3.4955201039212987e-07, "loss": 0.80744404, "num_input_tokens_seen": 146300935, "step": 6775, "time_per_iteration": 2.672597646713257 }, { "auxiliary_loss_clip": 0.01159293, "auxiliary_loss_mlp": 0.01032063, "balance_loss_clip": 1.04643619, "balance_loss_mlp": 1.02517867, "epoch": 0.8147658269704804, "flos": 19974520978560.0, "grad_norm": 2.095969376062296, "language_loss": 0.65290308, "learning_rate": 3.4911216804553465e-07, "loss": 0.67481661, "num_input_tokens_seen": 146319835, "step": 6776, "time_per_iteration": 2.650496244430542 }, { "auxiliary_loss_clip": 0.01137642, "auxiliary_loss_mlp": 0.01026416, "balance_loss_clip": 1.0433073, "balance_loss_mlp": 1.01853967, "epoch": 0.8148860698611194, "flos": 21178031097600.0, "grad_norm": 2.0547707339855794, "language_loss": 0.70269966, "learning_rate": 3.4867257613731017e-07, "loss": 0.72434026, "num_input_tokens_seen": 146339030, "step": 6777, "time_per_iteration": 2.6557295322418213 }, { "auxiliary_loss_clip": 0.01140502, "auxiliary_loss_mlp": 0.01026518, "balance_loss_clip": 1.04281688, "balance_loss_mlp": 1.01922822, "epoch": 0.8150063127517585, "flos": 19606903234560.0, "grad_norm": 1.7835040834888813, "language_loss": 0.855739, "learning_rate": 3.4823323473414343e-07, "loss": 0.87740922, "num_input_tokens_seen": 146358550, "step": 6778, "time_per_iteration": 2.688664197921753 }, { "auxiliary_loss_clip": 0.01134424, "auxiliary_loss_mlp": 0.01029502, "balance_loss_clip": 1.04386652, "balance_loss_mlp": 1.02121675, "epoch": 0.8151265556423977, "flos": 22638374438400.0, "grad_norm": 3.772169122838476, "language_loss": 0.75568068, "learning_rate": 3.477941439026812e-07, "loss": 0.77731991, "num_input_tokens_seen": 146376770, "step": 6779, "time_per_iteration": 2.7082858085632324 }, { "auxiliary_loss_clip": 0.01142026, "auxiliary_loss_mlp": 0.01025861, "balance_loss_clip": 1.04575777, "balance_loss_mlp": 1.0185802, "epoch": 0.8152467985330367, "flos": 17968048277760.0, "grad_norm": 1.677069985734151, "language_loss": 0.73124552, "learning_rate": 3.473553037095349e-07, "loss": 0.75292432, "num_input_tokens_seen": 146395795, "step": 6780, "time_per_iteration": 2.6599388122558594 }, { "auxiliary_loss_clip": 0.01131485, "auxiliary_loss_mlp": 0.01022629, "balance_loss_clip": 1.04079604, "balance_loss_mlp": 1.01486492, "epoch": 0.8153670414236758, "flos": 24969012405120.0, "grad_norm": 1.995856671081086, "language_loss": 0.83050299, "learning_rate": 3.469167142212743e-07, "loss": 0.85204411, "num_input_tokens_seen": 146417640, "step": 6781, "time_per_iteration": 2.7476744651794434 }, { "auxiliary_loss_clip": 0.01153447, "auxiliary_loss_mlp": 0.01031783, "balance_loss_clip": 1.04586673, "balance_loss_mlp": 1.0245738, "epoch": 0.8154872843143149, "flos": 31066069754880.0, "grad_norm": 2.6625612623950574, "language_loss": 0.62642157, "learning_rate": 3.4647837550443337e-07, "loss": 0.64827389, "num_input_tokens_seen": 146436205, "step": 6782, "time_per_iteration": 2.671539783477783 }, { "auxiliary_loss_clip": 0.01127045, "auxiliary_loss_mlp": 0.01023214, "balance_loss_clip": 1.0422107, "balance_loss_mlp": 1.01571035, "epoch": 0.815607527204954, "flos": 19391654983680.0, "grad_norm": 3.219560972561662, "language_loss": 0.74440861, "learning_rate": 3.460402876255086e-07, "loss": 0.76591122, "num_input_tokens_seen": 146453595, "step": 6783, "time_per_iteration": 2.6240627765655518 }, { "auxiliary_loss_clip": 0.01156864, "auxiliary_loss_mlp": 0.01026325, "balance_loss_clip": 1.04634035, "balance_loss_mlp": 1.01908886, "epoch": 0.815727770095593, "flos": 26140418743680.0, "grad_norm": 1.94070167111501, "language_loss": 0.71741283, "learning_rate": 3.456024506509574e-07, "loss": 0.73924476, "num_input_tokens_seen": 146474515, "step": 6784, "time_per_iteration": 2.6897995471954346 }, { "auxiliary_loss_clip": 0.01156674, "auxiliary_loss_mlp": 0.00762445, "balance_loss_clip": 1.04856813, "balance_loss_mlp": 1.00034916, "epoch": 0.8158480129862322, "flos": 25337527989120.0, "grad_norm": 1.7699741280867545, "language_loss": 0.73982215, "learning_rate": 3.4516486464719873e-07, "loss": 0.7590133, "num_input_tokens_seen": 146493905, "step": 6785, "time_per_iteration": 2.7292234897613525 }, { "auxiliary_loss_clip": 0.01105085, "auxiliary_loss_mlp": 0.01020222, "balance_loss_clip": 1.03913569, "balance_loss_mlp": 1.01296258, "epoch": 0.8159682558768713, "flos": 34423645559040.0, "grad_norm": 2.3015031322578423, "language_loss": 0.62340873, "learning_rate": 3.4472752968061445e-07, "loss": 0.64466178, "num_input_tokens_seen": 146518335, "step": 6786, "time_per_iteration": 2.831507682800293 }, { "auxiliary_loss_clip": 0.01151235, "auxiliary_loss_mlp": 0.0102215, "balance_loss_clip": 1.0431838, "balance_loss_mlp": 1.01523638, "epoch": 0.8160884987675103, "flos": 18653223185280.0, "grad_norm": 1.9242106479990093, "language_loss": 0.7405054, "learning_rate": 3.442904458175475e-07, "loss": 0.76223928, "num_input_tokens_seen": 146535655, "step": 6787, "time_per_iteration": 2.6088531017303467 }, { "auxiliary_loss_clip": 0.01149133, "auxiliary_loss_mlp": 0.01021235, "balance_loss_clip": 1.04317582, "balance_loss_mlp": 1.01396942, "epoch": 0.8162087416581495, "flos": 31430527102080.0, "grad_norm": 1.5957272577528205, "language_loss": 0.75919896, "learning_rate": 3.438536131243044e-07, "loss": 0.78090262, "num_input_tokens_seen": 146556815, "step": 6788, "time_per_iteration": 2.6711971759796143 }, { "auxiliary_loss_clip": 0.01140871, "auxiliary_loss_mlp": 0.01026308, "balance_loss_clip": 1.04209173, "balance_loss_mlp": 1.01776111, "epoch": 0.8163289845487885, "flos": 37593910915200.0, "grad_norm": 2.20547633117888, "language_loss": 0.62033814, "learning_rate": 3.434170316671503e-07, "loss": 0.64200997, "num_input_tokens_seen": 146581845, "step": 6789, "time_per_iteration": 3.727544069290161 }, { "auxiliary_loss_clip": 0.01117867, "auxiliary_loss_mlp": 0.01020254, "balance_loss_clip": 1.04487443, "balance_loss_mlp": 1.01314926, "epoch": 0.8164492274394276, "flos": 13953989554560.0, "grad_norm": 2.274752726647801, "language_loss": 0.89709854, "learning_rate": 3.4298070151231583e-07, "loss": 0.91847974, "num_input_tokens_seen": 146597245, "step": 6790, "time_per_iteration": 2.6696135997772217 }, { "auxiliary_loss_clip": 0.01142962, "auxiliary_loss_mlp": 0.01026564, "balance_loss_clip": 1.04367554, "balance_loss_mlp": 1.01980805, "epoch": 0.8165694703300668, "flos": 28986554747520.0, "grad_norm": 1.7878639961843614, "language_loss": 0.60046828, "learning_rate": 3.425446227259916e-07, "loss": 0.62216359, "num_input_tokens_seen": 146618210, "step": 6791, "time_per_iteration": 3.703967809677124 }, { "auxiliary_loss_clip": 0.0114091, "auxiliary_loss_mlp": 0.01024608, "balance_loss_clip": 1.04360437, "balance_loss_mlp": 1.01765847, "epoch": 0.8166897132207058, "flos": 25118365155840.0, "grad_norm": 2.1697990944121477, "language_loss": 0.82503033, "learning_rate": 3.421087953743296e-07, "loss": 0.84668547, "num_input_tokens_seen": 146637975, "step": 6792, "time_per_iteration": 2.7091760635375977 }, { "auxiliary_loss_clip": 0.01152963, "auxiliary_loss_mlp": 0.0102502, "balance_loss_clip": 1.04258275, "balance_loss_mlp": 1.01778126, "epoch": 0.8168099561113449, "flos": 23148593176320.0, "grad_norm": 1.929388794282692, "language_loss": 0.80264533, "learning_rate": 3.416732195234464e-07, "loss": 0.82442522, "num_input_tokens_seen": 146658030, "step": 6793, "time_per_iteration": 2.5943613052368164 }, { "auxiliary_loss_clip": 0.01153775, "auxiliary_loss_mlp": 0.01021909, "balance_loss_clip": 1.04283488, "balance_loss_mlp": 1.01501, "epoch": 0.816930199001984, "flos": 18407666833920.0, "grad_norm": 1.6858651072172797, "language_loss": 0.79413015, "learning_rate": 3.4123789523941613e-07, "loss": 0.81588697, "num_input_tokens_seen": 146677855, "step": 6794, "time_per_iteration": 3.5468177795410156 }, { "auxiliary_loss_clip": 0.01143687, "auxiliary_loss_mlp": 0.01027542, "balance_loss_clip": 1.04106283, "balance_loss_mlp": 1.02021086, "epoch": 0.8170504418926231, "flos": 21251324799360.0, "grad_norm": 1.5397155287058346, "language_loss": 0.63563758, "learning_rate": 3.4080282258827884e-07, "loss": 0.65734982, "num_input_tokens_seen": 146696230, "step": 6795, "time_per_iteration": 3.622075319290161 }, { "auxiliary_loss_clip": 0.01155362, "auxiliary_loss_mlp": 0.01024098, "balance_loss_clip": 1.046386, "balance_loss_mlp": 1.01682341, "epoch": 0.8171706847832622, "flos": 19099234362240.0, "grad_norm": 2.0425453612048052, "language_loss": 0.72574759, "learning_rate": 3.403680016360342e-07, "loss": 0.7475422, "num_input_tokens_seen": 146714835, "step": 6796, "time_per_iteration": 2.6547253131866455 }, { "auxiliary_loss_clip": 0.01148249, "auxiliary_loss_mlp": 0.01034553, "balance_loss_clip": 1.04631543, "balance_loss_mlp": 1.02678323, "epoch": 0.8172909276739013, "flos": 21470128496640.0, "grad_norm": 1.4700048684002351, "language_loss": 0.67752874, "learning_rate": 3.3993343244864403e-07, "loss": 0.69935679, "num_input_tokens_seen": 146734425, "step": 6797, "time_per_iteration": 2.813734292984009 }, { "auxiliary_loss_clip": 0.01151411, "auxiliary_loss_mlp": 0.01023312, "balance_loss_clip": 1.04508317, "balance_loss_mlp": 1.01665378, "epoch": 0.8174111705645404, "flos": 27599792417280.0, "grad_norm": 1.947826010283228, "language_loss": 0.72654766, "learning_rate": 3.394991150920323e-07, "loss": 0.74829489, "num_input_tokens_seen": 146757545, "step": 6798, "time_per_iteration": 2.6812334060668945 }, { "auxiliary_loss_clip": 0.01113109, "auxiliary_loss_mlp": 0.00762909, "balance_loss_clip": 1.04083431, "balance_loss_mlp": 1.00032032, "epoch": 0.8175314134551794, "flos": 14064594508800.0, "grad_norm": 2.037867654210426, "language_loss": 0.74162453, "learning_rate": 3.3906504963208396e-07, "loss": 0.76038468, "num_input_tokens_seen": 146774240, "step": 6799, "time_per_iteration": 2.7910609245300293 }, { "auxiliary_loss_clip": 0.01105151, "auxiliary_loss_mlp": 0.0103017, "balance_loss_clip": 1.04198909, "balance_loss_mlp": 1.02259409, "epoch": 0.8176516563458186, "flos": 22708076780160.0, "grad_norm": 1.7761928562735736, "language_loss": 0.66340107, "learning_rate": 3.3863123613464774e-07, "loss": 0.68475425, "num_input_tokens_seen": 146793140, "step": 6800, "time_per_iteration": 2.7528247833251953 }, { "auxiliary_loss_clip": 0.01139653, "auxiliary_loss_mlp": 0.01022469, "balance_loss_clip": 1.03985929, "balance_loss_mlp": 1.01553738, "epoch": 0.8177718992364577, "flos": 21945406279680.0, "grad_norm": 1.7618752587457034, "language_loss": 0.75051808, "learning_rate": 3.381976746655317e-07, "loss": 0.77213931, "num_input_tokens_seen": 146812895, "step": 6801, "time_per_iteration": 2.7613768577575684 }, { "auxiliary_loss_clip": 0.0110321, "auxiliary_loss_mlp": 0.01021971, "balance_loss_clip": 1.04286706, "balance_loss_mlp": 1.01461291, "epoch": 0.8178921421270967, "flos": 22017443005440.0, "grad_norm": 2.3711849956579765, "language_loss": 0.66908723, "learning_rate": 3.3776436529050756e-07, "loss": 0.69033897, "num_input_tokens_seen": 146832445, "step": 6802, "time_per_iteration": 2.7386040687561035 }, { "auxiliary_loss_clip": 0.01162426, "auxiliary_loss_mlp": 0.01021226, "balance_loss_clip": 1.04387558, "balance_loss_mlp": 1.01431751, "epoch": 0.8180123850177359, "flos": 33183111496320.0, "grad_norm": 1.7884706085642583, "language_loss": 0.72550929, "learning_rate": 3.373313080753073e-07, "loss": 0.74734586, "num_input_tokens_seen": 146856505, "step": 6803, "time_per_iteration": 2.73659610748291 }, { "auxiliary_loss_clip": 0.01146235, "auxiliary_loss_mlp": 0.01024023, "balance_loss_clip": 1.04173827, "balance_loss_mlp": 1.01662064, "epoch": 0.8181326279083749, "flos": 22091167670400.0, "grad_norm": 1.5053419981682647, "language_loss": 0.77437043, "learning_rate": 3.3689850308562527e-07, "loss": 0.79607302, "num_input_tokens_seen": 146876950, "step": 6804, "time_per_iteration": 2.6836154460906982 }, { "auxiliary_loss_clip": 0.01100999, "auxiliary_loss_mlp": 0.01027418, "balance_loss_clip": 1.04209661, "balance_loss_mlp": 1.02075982, "epoch": 0.818252870799014, "flos": 15705747936000.0, "grad_norm": 2.183016564911383, "language_loss": 0.77830648, "learning_rate": 3.364659503871183e-07, "loss": 0.79959059, "num_input_tokens_seen": 146894885, "step": 6805, "time_per_iteration": 2.680767774581909 }, { "auxiliary_loss_clip": 0.0111702, "auxiliary_loss_mlp": 0.01021652, "balance_loss_clip": 1.03714645, "balance_loss_mlp": 1.01509249, "epoch": 0.8183731136896532, "flos": 18770687637120.0, "grad_norm": 1.7656548376046912, "language_loss": 0.8377685, "learning_rate": 3.3603365004540417e-07, "loss": 0.85915518, "num_input_tokens_seen": 146913180, "step": 6806, "time_per_iteration": 2.7702226638793945 }, { "auxiliary_loss_clip": 0.01167225, "auxiliary_loss_mlp": 0.01028531, "balance_loss_clip": 1.04855084, "balance_loss_mlp": 1.02108908, "epoch": 0.8184933565802922, "flos": 26541792293760.0, "grad_norm": 2.137188309210202, "language_loss": 0.77125549, "learning_rate": 3.356016021260624e-07, "loss": 0.79321301, "num_input_tokens_seen": 146933510, "step": 6807, "time_per_iteration": 2.7126476764678955 }, { "auxiliary_loss_clip": 0.01153829, "auxiliary_loss_mlp": 0.01027971, "balance_loss_clip": 1.04523563, "balance_loss_mlp": 1.02080357, "epoch": 0.8186135994709313, "flos": 17530117660800.0, "grad_norm": 3.5725032667554237, "language_loss": 0.6569097, "learning_rate": 3.35169806694634e-07, "loss": 0.67872763, "num_input_tokens_seen": 146951760, "step": 6808, "time_per_iteration": 2.5987396240234375 }, { "auxiliary_loss_clip": 0.01036393, "auxiliary_loss_mlp": 0.01000374, "balance_loss_clip": 1.01467621, "balance_loss_mlp": 0.99934274, "epoch": 0.8187338423615703, "flos": 63480300675840.0, "grad_norm": 0.7158209206496644, "language_loss": 0.60612559, "learning_rate": 3.3473826381662186e-07, "loss": 0.62649333, "num_input_tokens_seen": 147022900, "step": 6809, "time_per_iteration": 3.3959803581237793 }, { "auxiliary_loss_clip": 0.01148081, "auxiliary_loss_mlp": 0.01023279, "balance_loss_clip": 1.04593217, "balance_loss_mlp": 1.0165168, "epoch": 0.8188540852522095, "flos": 17529974006400.0, "grad_norm": 1.8507835486934523, "language_loss": 0.81545275, "learning_rate": 3.3430697355749216e-07, "loss": 0.83716637, "num_input_tokens_seen": 147040590, "step": 6810, "time_per_iteration": 2.628107786178589 }, { "auxiliary_loss_clip": 0.01105418, "auxiliary_loss_mlp": 0.01023156, "balance_loss_clip": 1.03855443, "balance_loss_mlp": 1.01596451, "epoch": 0.8189743281428485, "flos": 14392530702720.0, "grad_norm": 2.129243998376342, "language_loss": 0.75437975, "learning_rate": 3.3387593598266907e-07, "loss": 0.7756654, "num_input_tokens_seen": 147057200, "step": 6811, "time_per_iteration": 2.6898815631866455 }, { "auxiliary_loss_clip": 0.01117077, "auxiliary_loss_mlp": 0.01025909, "balance_loss_clip": 1.03909302, "balance_loss_mlp": 1.0189023, "epoch": 0.8190945710334876, "flos": 25080479285760.0, "grad_norm": 1.8140498751227367, "language_loss": 0.78205192, "learning_rate": 3.3344515115754225e-07, "loss": 0.80348188, "num_input_tokens_seen": 147076180, "step": 6812, "time_per_iteration": 2.7593486309051514 }, { "auxiliary_loss_clip": 0.01127575, "auxiliary_loss_mlp": 0.0102507, "balance_loss_clip": 1.04146886, "balance_loss_mlp": 1.01745248, "epoch": 0.8192148139241268, "flos": 21507152440320.0, "grad_norm": 2.6296371305132604, "language_loss": 0.79686183, "learning_rate": 3.33014619147461e-07, "loss": 0.81838834, "num_input_tokens_seen": 147094205, "step": 6813, "time_per_iteration": 2.728896379470825 }, { "auxiliary_loss_clip": 0.01138478, "auxiliary_loss_mlp": 0.01027684, "balance_loss_clip": 1.04560375, "balance_loss_mlp": 1.02068019, "epoch": 0.8193350568147658, "flos": 23952166289280.0, "grad_norm": 2.333424950073784, "language_loss": 0.7172007, "learning_rate": 3.325843400177362e-07, "loss": 0.73886228, "num_input_tokens_seen": 147115545, "step": 6814, "time_per_iteration": 2.675199270248413 }, { "auxiliary_loss_clip": 0.01156394, "auxiliary_loss_mlp": 0.00762201, "balance_loss_clip": 1.04531801, "balance_loss_mlp": 1.00033975, "epoch": 0.8194552997054049, "flos": 20559469962240.0, "grad_norm": 1.891592898025834, "language_loss": 0.73589027, "learning_rate": 3.32154313833642e-07, "loss": 0.75507617, "num_input_tokens_seen": 147135700, "step": 6815, "time_per_iteration": 3.564971923828125 }, { "auxiliary_loss_clip": 0.01167365, "auxiliary_loss_mlp": 0.01025997, "balance_loss_clip": 1.04571378, "balance_loss_mlp": 1.01864219, "epoch": 0.819575542596044, "flos": 26031753123840.0, "grad_norm": 2.0546941626694246, "language_loss": 0.59619355, "learning_rate": 3.3172454066041164e-07, "loss": 0.61812717, "num_input_tokens_seen": 147155205, "step": 6816, "time_per_iteration": 2.576524257659912 }, { "auxiliary_loss_clip": 0.01093095, "auxiliary_loss_mlp": 0.00761777, "balance_loss_clip": 1.04022646, "balance_loss_mlp": 1.00022829, "epoch": 0.8196957854866831, "flos": 29096944220160.0, "grad_norm": 2.053170800429048, "language_loss": 0.76442891, "learning_rate": 3.3129502056324234e-07, "loss": 0.78297758, "num_input_tokens_seen": 147176570, "step": 6817, "time_per_iteration": 2.888990640640259 }, { "auxiliary_loss_clip": 0.01002923, "auxiliary_loss_mlp": 0.01001357, "balance_loss_clip": 1.00972974, "balance_loss_mlp": 1.0003016, "epoch": 0.8198160283773221, "flos": 69033631898880.0, "grad_norm": 0.7954721573869636, "language_loss": 0.59695387, "learning_rate": 3.3086575360729165e-07, "loss": 0.61699677, "num_input_tokens_seen": 147234105, "step": 6818, "time_per_iteration": 4.50211238861084 }, { "auxiliary_loss_clip": 0.01136302, "auxiliary_loss_mlp": 0.01025642, "balance_loss_clip": 1.04409206, "balance_loss_mlp": 1.01839101, "epoch": 0.8199362712679613, "flos": 16618058496000.0, "grad_norm": 1.926953100479827, "language_loss": 0.71491039, "learning_rate": 3.3043673985767906e-07, "loss": 0.73652983, "num_input_tokens_seen": 147253170, "step": 6819, "time_per_iteration": 3.029890298843384 }, { "auxiliary_loss_clip": 0.0110912, "auxiliary_loss_mlp": 0.01028331, "balance_loss_clip": 1.03571379, "balance_loss_mlp": 1.02119923, "epoch": 0.8200565141586004, "flos": 21757664868480.0, "grad_norm": 1.6440375861230732, "language_loss": 0.77597892, "learning_rate": 3.3000797937948564e-07, "loss": 0.79735345, "num_input_tokens_seen": 147271465, "step": 6820, "time_per_iteration": 3.6129424571990967 }, { "auxiliary_loss_clip": 0.01034633, "auxiliary_loss_mlp": 0.01000839, "balance_loss_clip": 1.0099647, "balance_loss_mlp": 0.99975377, "epoch": 0.8201767570492394, "flos": 69807112392960.0, "grad_norm": 0.9542628581485805, "language_loss": 0.64954996, "learning_rate": 3.295794722377534e-07, "loss": 0.66990465, "num_input_tokens_seen": 147335070, "step": 6821, "time_per_iteration": 4.214130878448486 }, { "auxiliary_loss_clip": 0.01162271, "auxiliary_loss_mlp": 0.01024254, "balance_loss_clip": 1.04453683, "balance_loss_mlp": 1.01754534, "epoch": 0.8202969999398786, "flos": 23111892455040.0, "grad_norm": 1.6409227124065595, "language_loss": 0.8032335, "learning_rate": 3.291512184974876e-07, "loss": 0.82509875, "num_input_tokens_seen": 147355460, "step": 6822, "time_per_iteration": 2.63578200340271 }, { "auxiliary_loss_clip": 0.01135538, "auxiliary_loss_mlp": 0.0102677, "balance_loss_clip": 1.04030132, "balance_loss_mlp": 1.0188899, "epoch": 0.8204172428305176, "flos": 28220616109440.0, "grad_norm": 1.7942194889419898, "language_loss": 0.66882432, "learning_rate": 3.2872321822365346e-07, "loss": 0.69044745, "num_input_tokens_seen": 147375675, "step": 6823, "time_per_iteration": 2.8146891593933105 }, { "auxiliary_loss_clip": 0.01149681, "auxiliary_loss_mlp": 0.01025926, "balance_loss_clip": 1.04458487, "balance_loss_mlp": 1.01904774, "epoch": 0.8205374857211567, "flos": 20887011106560.0, "grad_norm": 1.966202374375574, "language_loss": 0.73363161, "learning_rate": 3.282954714811783e-07, "loss": 0.75538766, "num_input_tokens_seen": 147394580, "step": 6824, "time_per_iteration": 2.7465476989746094 }, { "auxiliary_loss_clip": 0.01124685, "auxiliary_loss_mlp": 0.01028487, "balance_loss_clip": 1.03859639, "balance_loss_mlp": 1.02118492, "epoch": 0.8206577286117959, "flos": 13152140294400.0, "grad_norm": 2.3505178210132227, "language_loss": 0.7091049, "learning_rate": 3.2786797833495093e-07, "loss": 0.7306366, "num_input_tokens_seen": 147409935, "step": 6825, "time_per_iteration": 2.6675689220428467 }, { "auxiliary_loss_clip": 0.01164936, "auxiliary_loss_mlp": 0.01026295, "balance_loss_clip": 1.04634595, "balance_loss_mlp": 1.01978588, "epoch": 0.8207779715024349, "flos": 25265634917760.0, "grad_norm": 1.9069559303895203, "language_loss": 0.72501826, "learning_rate": 3.274407388498213e-07, "loss": 0.7469306, "num_input_tokens_seen": 147428065, "step": 6826, "time_per_iteration": 2.7370588779449463 }, { "auxiliary_loss_clip": 0.01120405, "auxiliary_loss_mlp": 0.01023993, "balance_loss_clip": 1.04068327, "balance_loss_mlp": 1.01701343, "epoch": 0.820898214393074, "flos": 19610243199360.0, "grad_norm": 5.5213273193389005, "language_loss": 0.74619353, "learning_rate": 3.270137530906021e-07, "loss": 0.76763749, "num_input_tokens_seen": 147447300, "step": 6827, "time_per_iteration": 2.757395029067993 }, { "auxiliary_loss_clip": 0.01099938, "auxiliary_loss_mlp": 0.01024548, "balance_loss_clip": 1.04060459, "balance_loss_mlp": 1.01785755, "epoch": 0.8210184572837131, "flos": 15596615439360.0, "grad_norm": 14.48063162950305, "language_loss": 0.83311737, "learning_rate": 3.265870211220665e-07, "loss": 0.85436225, "num_input_tokens_seen": 147465135, "step": 6828, "time_per_iteration": 2.7714548110961914 }, { "auxiliary_loss_clip": 0.0111787, "auxiliary_loss_mlp": 0.01031915, "balance_loss_clip": 1.0414896, "balance_loss_mlp": 1.02448225, "epoch": 0.8211387001743522, "flos": 20813932886400.0, "grad_norm": 1.8250351126514501, "language_loss": 0.81677771, "learning_rate": 3.2616054300894934e-07, "loss": 0.83827555, "num_input_tokens_seen": 147484585, "step": 6829, "time_per_iteration": 2.7387709617614746 }, { "auxiliary_loss_clip": 0.01125869, "auxiliary_loss_mlp": 0.01029995, "balance_loss_clip": 1.04103065, "balance_loss_mlp": 1.02214491, "epoch": 0.8212589430649913, "flos": 27704579368320.0, "grad_norm": 2.455088860433198, "language_loss": 0.84382081, "learning_rate": 3.2573431881594693e-07, "loss": 0.86537945, "num_input_tokens_seen": 147504130, "step": 6830, "time_per_iteration": 2.8171210289001465 }, { "auxiliary_loss_clip": 0.01092465, "auxiliary_loss_mlp": 0.01025556, "balance_loss_clip": 1.03477597, "balance_loss_mlp": 1.01816761, "epoch": 0.8213791859556304, "flos": 22455625017600.0, "grad_norm": 2.530409609510806, "language_loss": 0.65670943, "learning_rate": 3.2530834860771663e-07, "loss": 0.6778897, "num_input_tokens_seen": 147523510, "step": 6831, "time_per_iteration": 2.796443462371826 }, { "auxiliary_loss_clip": 0.01150943, "auxiliary_loss_mlp": 0.01027401, "balance_loss_clip": 1.04264617, "balance_loss_mlp": 1.02017069, "epoch": 0.8214994288462695, "flos": 16654471908480.0, "grad_norm": 2.0680328298978488, "language_loss": 0.74380672, "learning_rate": 3.248826324488794e-07, "loss": 0.76559019, "num_input_tokens_seen": 147540805, "step": 6832, "time_per_iteration": 2.648983955383301 }, { "auxiliary_loss_clip": 0.01166672, "auxiliary_loss_mlp": 0.01025501, "balance_loss_clip": 1.04863071, "balance_loss_mlp": 1.01893258, "epoch": 0.8216196717369085, "flos": 25221787390080.0, "grad_norm": 1.7599528645319884, "language_loss": 0.87514991, "learning_rate": 3.244571704040138e-07, "loss": 0.89707166, "num_input_tokens_seen": 147560965, "step": 6833, "time_per_iteration": 2.643709182739258 }, { "auxiliary_loss_clip": 0.01148139, "auxiliary_loss_mlp": 0.01024755, "balance_loss_clip": 1.04212093, "balance_loss_mlp": 1.01735854, "epoch": 0.8217399146275477, "flos": 25371930240000.0, "grad_norm": 1.899377404175653, "language_loss": 0.73382634, "learning_rate": 3.2403196253766374e-07, "loss": 0.75555527, "num_input_tokens_seen": 147580045, "step": 6834, "time_per_iteration": 2.700230836868286 }, { "auxiliary_loss_clip": 0.01148298, "auxiliary_loss_mlp": 0.01029285, "balance_loss_clip": 1.04324365, "balance_loss_mlp": 1.02066612, "epoch": 0.8218601575181868, "flos": 25629625388160.0, "grad_norm": 2.830981040625386, "language_loss": 0.79149902, "learning_rate": 3.2360700891433254e-07, "loss": 0.8132748, "num_input_tokens_seen": 147599070, "step": 6835, "time_per_iteration": 2.6842093467712402 }, { "auxiliary_loss_clip": 0.01021158, "auxiliary_loss_mlp": 0.01000819, "balance_loss_clip": 1.01012921, "balance_loss_mlp": 0.99976963, "epoch": 0.8219804004088258, "flos": 67660229427840.0, "grad_norm": 0.7930575116686561, "language_loss": 0.57276225, "learning_rate": 3.231823095984847e-07, "loss": 0.59298205, "num_input_tokens_seen": 147653710, "step": 6836, "time_per_iteration": 3.2144362926483154 }, { "auxiliary_loss_clip": 0.01136561, "auxiliary_loss_mlp": 0.01023428, "balance_loss_clip": 1.04313612, "balance_loss_mlp": 1.01697588, "epoch": 0.822100643299465, "flos": 19464266327040.0, "grad_norm": 3.933087390946373, "language_loss": 0.76283741, "learning_rate": 3.2275786465454814e-07, "loss": 0.7844373, "num_input_tokens_seen": 147670360, "step": 6837, "time_per_iteration": 2.6482715606689453 }, { "auxiliary_loss_clip": 0.01122733, "auxiliary_loss_mlp": 0.01022469, "balance_loss_clip": 1.04110289, "balance_loss_mlp": 1.01536751, "epoch": 0.822220886190104, "flos": 24681368292480.0, "grad_norm": 1.8144310590163144, "language_loss": 0.7562269, "learning_rate": 3.2233367414690917e-07, "loss": 0.77767891, "num_input_tokens_seen": 147692550, "step": 6838, "time_per_iteration": 2.768923282623291 }, { "auxiliary_loss_clip": 0.01120903, "auxiliary_loss_mlp": 0.01019917, "balance_loss_clip": 1.04031301, "balance_loss_mlp": 1.01327145, "epoch": 0.8223411290807431, "flos": 27819062991360.0, "grad_norm": 2.192067446991063, "language_loss": 0.84866214, "learning_rate": 3.219097381399183e-07, "loss": 0.87007034, "num_input_tokens_seen": 147709725, "step": 6839, "time_per_iteration": 2.909639358520508 }, { "auxiliary_loss_clip": 0.01144701, "auxiliary_loss_mlp": 0.01026866, "balance_loss_clip": 1.04399872, "balance_loss_mlp": 1.01936769, "epoch": 0.8224613719713821, "flos": 23218546913280.0, "grad_norm": 2.352031751023756, "language_loss": 0.81310326, "learning_rate": 3.2148605669788584e-07, "loss": 0.83481896, "num_input_tokens_seen": 147729615, "step": 6840, "time_per_iteration": 2.7260680198669434 }, { "auxiliary_loss_clip": 0.01140528, "auxiliary_loss_mlp": 0.01024479, "balance_loss_clip": 1.04476714, "balance_loss_mlp": 1.01808071, "epoch": 0.8225816148620213, "flos": 15706250726400.0, "grad_norm": 2.696695956319773, "language_loss": 0.77717996, "learning_rate": 3.2106262988508405e-07, "loss": 0.79883003, "num_input_tokens_seen": 147747665, "step": 6841, "time_per_iteration": 3.554579734802246 }, { "auxiliary_loss_clip": 0.01136592, "auxiliary_loss_mlp": 0.01026157, "balance_loss_clip": 1.04272604, "balance_loss_mlp": 1.01849782, "epoch": 0.8227018577526604, "flos": 18515111391360.0, "grad_norm": 1.824029120400195, "language_loss": 0.74170238, "learning_rate": 3.206394577657465e-07, "loss": 0.76332986, "num_input_tokens_seen": 147765445, "step": 6842, "time_per_iteration": 2.6430625915527344 }, { "auxiliary_loss_clip": 0.01156388, "auxiliary_loss_mlp": 0.01029685, "balance_loss_clip": 1.04479218, "balance_loss_mlp": 1.02215743, "epoch": 0.8228221006432994, "flos": 22236785406720.0, "grad_norm": 2.6196643057513516, "language_loss": 0.72625279, "learning_rate": 3.202165404040675e-07, "loss": 0.74811351, "num_input_tokens_seen": 147783365, "step": 6843, "time_per_iteration": 2.6649913787841797 }, { "auxiliary_loss_clip": 0.01092603, "auxiliary_loss_mlp": 0.01029962, "balance_loss_clip": 1.03896499, "balance_loss_mlp": 1.02204084, "epoch": 0.8229423435339386, "flos": 24097532630400.0, "grad_norm": 1.938151880741357, "language_loss": 0.74711716, "learning_rate": 3.1979387786420396e-07, "loss": 0.76834279, "num_input_tokens_seen": 147803605, "step": 6844, "time_per_iteration": 3.8545730113983154 }, { "auxiliary_loss_clip": 0.01138117, "auxiliary_loss_mlp": 0.01023619, "balance_loss_clip": 1.04012895, "balance_loss_mlp": 1.01667523, "epoch": 0.8230625864245776, "flos": 23878549365120.0, "grad_norm": 2.563518927888582, "language_loss": 0.81982243, "learning_rate": 3.1937147021027346e-07, "loss": 0.84143984, "num_input_tokens_seen": 147822060, "step": 6845, "time_per_iteration": 2.6494531631469727 }, { "auxiliary_loss_clip": 0.01150125, "auxiliary_loss_mlp": 0.01021499, "balance_loss_clip": 1.0456742, "balance_loss_mlp": 1.01466537, "epoch": 0.8231828293152167, "flos": 16581106379520.0, "grad_norm": 2.2357334054310853, "language_loss": 0.7664417, "learning_rate": 3.189493175063547e-07, "loss": 0.78815794, "num_input_tokens_seen": 147839295, "step": 6846, "time_per_iteration": 3.480252265930176 }, { "auxiliary_loss_clip": 0.01140905, "auxiliary_loss_mlp": 0.01026308, "balance_loss_clip": 1.0456351, "balance_loss_mlp": 1.01920295, "epoch": 0.8233030722058559, "flos": 18880071528960.0, "grad_norm": 1.985214438547369, "language_loss": 0.67680234, "learning_rate": 3.1852741981648776e-07, "loss": 0.69847453, "num_input_tokens_seen": 147857945, "step": 6847, "time_per_iteration": 3.59553599357605 }, { "auxiliary_loss_clip": 0.01112154, "auxiliary_loss_mlp": 0.01027301, "balance_loss_clip": 1.04046929, "balance_loss_mlp": 1.01948357, "epoch": 0.8234233150964949, "flos": 28439024757120.0, "grad_norm": 2.354619975278059, "language_loss": 0.69872355, "learning_rate": 3.1810577720467404e-07, "loss": 0.72011817, "num_input_tokens_seen": 147879675, "step": 6848, "time_per_iteration": 2.7346746921539307 }, { "auxiliary_loss_clip": 0.01142453, "auxiliary_loss_mlp": 0.01023929, "balance_loss_clip": 1.04334617, "balance_loss_mlp": 1.01637459, "epoch": 0.823543557987134, "flos": 33765941577600.0, "grad_norm": 1.5685924918047176, "language_loss": 0.56674284, "learning_rate": 3.176843897348769e-07, "loss": 0.58840668, "num_input_tokens_seen": 147902870, "step": 6849, "time_per_iteration": 2.810746431350708 }, { "auxiliary_loss_clip": 0.01130911, "auxiliary_loss_mlp": 0.01025135, "balance_loss_clip": 1.04127824, "balance_loss_mlp": 1.01794112, "epoch": 0.8236638008777731, "flos": 17092366611840.0, "grad_norm": 3.1687785230804284, "language_loss": 0.75636572, "learning_rate": 3.1726325747102034e-07, "loss": 0.77792615, "num_input_tokens_seen": 147921245, "step": 6850, "time_per_iteration": 2.749417543411255 }, { "auxiliary_loss_clip": 0.01102765, "auxiliary_loss_mlp": 0.01025391, "balance_loss_clip": 1.03689504, "balance_loss_mlp": 1.01829219, "epoch": 0.8237840437684122, "flos": 61639982334720.0, "grad_norm": 1.5031607549176016, "language_loss": 0.64180243, "learning_rate": 3.1684238047698974e-07, "loss": 0.66308403, "num_input_tokens_seen": 147949515, "step": 6851, "time_per_iteration": 3.1159825325012207 }, { "auxiliary_loss_clip": 0.01140625, "auxiliary_loss_mlp": 0.01026406, "balance_loss_clip": 1.04314041, "balance_loss_mlp": 1.01931298, "epoch": 0.8239042866590512, "flos": 27309023821440.0, "grad_norm": 2.0323512236743406, "language_loss": 0.53235209, "learning_rate": 3.1642175881663155e-07, "loss": 0.55402237, "num_input_tokens_seen": 147969245, "step": 6852, "time_per_iteration": 2.7231178283691406 }, { "auxiliary_loss_clip": 0.01164319, "auxiliary_loss_mlp": 0.01022013, "balance_loss_clip": 1.04529381, "balance_loss_mlp": 1.01539648, "epoch": 0.8240245295496904, "flos": 21726351187200.0, "grad_norm": 2.450953557615833, "language_loss": 0.83682191, "learning_rate": 3.160013925537537e-07, "loss": 0.85868526, "num_input_tokens_seen": 147990080, "step": 6853, "time_per_iteration": 2.653960704803467 }, { "auxiliary_loss_clip": 0.01127298, "auxiliary_loss_mlp": 0.01020404, "balance_loss_clip": 1.04135597, "balance_loss_mlp": 1.01324272, "epoch": 0.8241447724403295, "flos": 20009318279040.0, "grad_norm": 2.173573669098559, "language_loss": 0.75777602, "learning_rate": 3.155812817521266e-07, "loss": 0.77925301, "num_input_tokens_seen": 148010455, "step": 6854, "time_per_iteration": 2.707383632659912 }, { "auxiliary_loss_clip": 0.01142311, "auxiliary_loss_mlp": 0.01024594, "balance_loss_clip": 1.04642439, "balance_loss_mlp": 1.01746476, "epoch": 0.8242650153309685, "flos": 22272983337600.0, "grad_norm": 2.093092814069572, "language_loss": 0.78101254, "learning_rate": 3.151614264754787e-07, "loss": 0.80268157, "num_input_tokens_seen": 148028400, "step": 6855, "time_per_iteration": 2.6370902061462402 }, { "auxiliary_loss_clip": 0.0116525, "auxiliary_loss_mlp": 0.01022293, "balance_loss_clip": 1.04357171, "balance_loss_mlp": 1.01489592, "epoch": 0.8243852582216077, "flos": 22309971367680.0, "grad_norm": 2.951230582783915, "language_loss": 0.79173386, "learning_rate": 3.147418267875035e-07, "loss": 0.81360924, "num_input_tokens_seen": 148046530, "step": 6856, "time_per_iteration": 2.67295503616333 }, { "auxiliary_loss_clip": 0.01090905, "auxiliary_loss_mlp": 0.00762078, "balance_loss_clip": 1.03502238, "balance_loss_mlp": 1.00030792, "epoch": 0.8245055011122467, "flos": 24645421756800.0, "grad_norm": 2.852149866676685, "language_loss": 0.65997863, "learning_rate": 3.1432248275185315e-07, "loss": 0.67850852, "num_input_tokens_seen": 148067040, "step": 6857, "time_per_iteration": 2.7662220001220703 }, { "auxiliary_loss_clip": 0.01149363, "auxiliary_loss_mlp": 0.01026665, "balance_loss_clip": 1.04480612, "balance_loss_mlp": 1.01977432, "epoch": 0.8246257440028858, "flos": 17487275713920.0, "grad_norm": 2.022430315648165, "language_loss": 0.76989788, "learning_rate": 3.139033944321412e-07, "loss": 0.79165816, "num_input_tokens_seen": 148084400, "step": 6858, "time_per_iteration": 2.600883722305298 }, { "auxiliary_loss_clip": 0.01154287, "auxiliary_loss_mlp": 0.01026902, "balance_loss_clip": 1.04316688, "balance_loss_mlp": 1.01956475, "epoch": 0.824745986893525, "flos": 25010130499200.0, "grad_norm": 2.3256419937891617, "language_loss": 0.78577185, "learning_rate": 3.1348456189194507e-07, "loss": 0.80758375, "num_input_tokens_seen": 148104860, "step": 6859, "time_per_iteration": 2.6353957653045654 }, { "auxiliary_loss_clip": 0.01113992, "auxiliary_loss_mlp": 0.01026649, "balance_loss_clip": 1.03877759, "balance_loss_mlp": 1.01937735, "epoch": 0.824866229784164, "flos": 18772698798720.0, "grad_norm": 1.5531458437604966, "language_loss": 0.82940346, "learning_rate": 3.1306598519479876e-07, "loss": 0.85080987, "num_input_tokens_seen": 148124680, "step": 6860, "time_per_iteration": 2.679022789001465 }, { "auxiliary_loss_clip": 0.0113496, "auxiliary_loss_mlp": 0.0102252, "balance_loss_clip": 1.04322004, "balance_loss_mlp": 1.01583207, "epoch": 0.8249864726748031, "flos": 23842171866240.0, "grad_norm": 1.7889739653856476, "language_loss": 0.78536642, "learning_rate": 3.1264766440420177e-07, "loss": 0.80694127, "num_input_tokens_seen": 148147150, "step": 6861, "time_per_iteration": 2.7648589611053467 }, { "auxiliary_loss_clip": 0.01146401, "auxiliary_loss_mlp": 0.01028113, "balance_loss_clip": 1.04280543, "balance_loss_mlp": 1.02133858, "epoch": 0.8251067155654422, "flos": 20303103617280.0, "grad_norm": 1.949835237882574, "language_loss": 0.69111681, "learning_rate": 3.122295995836124e-07, "loss": 0.7128619, "num_input_tokens_seen": 148167020, "step": 6862, "time_per_iteration": 2.65055775642395 }, { "auxiliary_loss_clip": 0.01154074, "auxiliary_loss_mlp": 0.01026739, "balance_loss_clip": 1.04191899, "balance_loss_mlp": 1.01916885, "epoch": 0.8252269584560813, "flos": 25009699536000.0, "grad_norm": 2.199715816738437, "language_loss": 0.77306986, "learning_rate": 3.118117907964508e-07, "loss": 0.79487789, "num_input_tokens_seen": 148188965, "step": 6863, "time_per_iteration": 2.6835062503814697 }, { "auxiliary_loss_clip": 0.01130718, "auxiliary_loss_mlp": 0.01026144, "balance_loss_clip": 1.04249382, "balance_loss_mlp": 1.01942337, "epoch": 0.8253472013467203, "flos": 17128564542720.0, "grad_norm": 1.9240802559531334, "language_loss": 0.80223799, "learning_rate": 3.1139423810609856e-07, "loss": 0.82380658, "num_input_tokens_seen": 148205660, "step": 6864, "time_per_iteration": 2.684217691421509 }, { "auxiliary_loss_clip": 0.01163564, "auxiliary_loss_mlp": 0.01023364, "balance_loss_clip": 1.04194605, "balance_loss_mlp": 1.01572251, "epoch": 0.8254674442373595, "flos": 22414794232320.0, "grad_norm": 2.1033585067865097, "language_loss": 0.75494015, "learning_rate": 3.1097694157589714e-07, "loss": 0.77680945, "num_input_tokens_seen": 148225545, "step": 6865, "time_per_iteration": 2.668067455291748 }, { "auxiliary_loss_clip": 0.01150944, "auxiliary_loss_mlp": 0.0102819, "balance_loss_clip": 1.04736543, "balance_loss_mlp": 1.02096617, "epoch": 0.8255876871279986, "flos": 24786765774720.0, "grad_norm": 2.964766454436364, "language_loss": 0.7608614, "learning_rate": 3.105599012691511e-07, "loss": 0.78265274, "num_input_tokens_seen": 148243975, "step": 6866, "time_per_iteration": 2.691396951675415 }, { "auxiliary_loss_clip": 0.01148503, "auxiliary_loss_mlp": 0.01023933, "balance_loss_clip": 1.04434156, "balance_loss_mlp": 1.0172869, "epoch": 0.8257079300186376, "flos": 27455431656960.0, "grad_norm": 1.8177448676157129, "language_loss": 0.82310081, "learning_rate": 3.101431172491249e-07, "loss": 0.84482515, "num_input_tokens_seen": 148265520, "step": 6867, "time_per_iteration": 3.667149066925049 }, { "auxiliary_loss_clip": 0.01126367, "auxiliary_loss_mlp": 0.00762474, "balance_loss_clip": 1.03956985, "balance_loss_mlp": 1.00035715, "epoch": 0.8258281729092768, "flos": 16471866142080.0, "grad_norm": 2.231629388582422, "language_loss": 0.72044814, "learning_rate": 3.097265895790444e-07, "loss": 0.73933649, "num_input_tokens_seen": 148283730, "step": 6868, "time_per_iteration": 2.781785726547241 }, { "auxiliary_loss_clip": 0.01125108, "auxiliary_loss_mlp": 0.01027682, "balance_loss_clip": 1.04231453, "balance_loss_mlp": 1.02039266, "epoch": 0.8259484157999158, "flos": 21433822824960.0, "grad_norm": 2.115833492978135, "language_loss": 0.83084416, "learning_rate": 3.093103183220962e-07, "loss": 0.85237217, "num_input_tokens_seen": 148303775, "step": 6869, "time_per_iteration": 3.7169461250305176 }, { "auxiliary_loss_clip": 0.01054864, "auxiliary_loss_mlp": 0.01000525, "balance_loss_clip": 1.01064658, "balance_loss_mlp": 0.99936849, "epoch": 0.8260686586905549, "flos": 58322342453760.0, "grad_norm": 0.8225836507519573, "language_loss": 0.59376675, "learning_rate": 3.0889430354142796e-07, "loss": 0.61432064, "num_input_tokens_seen": 148365285, "step": 6870, "time_per_iteration": 3.225717067718506 }, { "auxiliary_loss_clip": 0.01123529, "auxiliary_loss_mlp": 0.01025522, "balance_loss_clip": 1.03915203, "balance_loss_mlp": 1.01804781, "epoch": 0.826188901581194, "flos": 27527288814720.0, "grad_norm": 1.9075894233445656, "language_loss": 0.6996727, "learning_rate": 3.084785453001497e-07, "loss": 0.72116321, "num_input_tokens_seen": 148386200, "step": 6871, "time_per_iteration": 2.730030059814453 }, { "auxiliary_loss_clip": 0.01141599, "auxiliary_loss_mlp": 0.00761658, "balance_loss_clip": 1.04596353, "balance_loss_mlp": 1.0003159, "epoch": 0.8263091444718331, "flos": 23696051339520.0, "grad_norm": 2.7753753906915457, "language_loss": 0.8199389, "learning_rate": 3.080630436613314e-07, "loss": 0.8389715, "num_input_tokens_seen": 148403970, "step": 6872, "time_per_iteration": 4.576741695404053 }, { "auxiliary_loss_clip": 0.01142384, "auxiliary_loss_mlp": 0.0102135, "balance_loss_clip": 1.04177213, "balance_loss_mlp": 1.01445079, "epoch": 0.8264293873624722, "flos": 17165157523200.0, "grad_norm": 2.1358032959107187, "language_loss": 0.86287141, "learning_rate": 3.076477986880039e-07, "loss": 0.88450873, "num_input_tokens_seen": 148421765, "step": 6873, "time_per_iteration": 2.6175484657287598 }, { "auxiliary_loss_clip": 0.01136373, "auxiliary_loss_mlp": 0.01024142, "balance_loss_clip": 1.0443691, "balance_loss_mlp": 1.01696587, "epoch": 0.8265496302531112, "flos": 24098645952000.0, "grad_norm": 2.253268624515649, "language_loss": 0.69369936, "learning_rate": 3.0723281044315986e-07, "loss": 0.71530455, "num_input_tokens_seen": 148443720, "step": 6874, "time_per_iteration": 2.655113697052002 }, { "auxiliary_loss_clip": 0.01162482, "auxiliary_loss_mlp": 0.01025041, "balance_loss_clip": 1.04431462, "balance_loss_mlp": 1.01839757, "epoch": 0.8266698731437504, "flos": 14099894599680.0, "grad_norm": 2.072491775220985, "language_loss": 0.76558852, "learning_rate": 3.068180789897521e-07, "loss": 0.78746367, "num_input_tokens_seen": 148462130, "step": 6875, "time_per_iteration": 2.5548360347747803 }, { "auxiliary_loss_clip": 0.01153416, "auxiliary_loss_mlp": 0.01028534, "balance_loss_clip": 1.04295528, "balance_loss_mlp": 1.02091026, "epoch": 0.8267901160343895, "flos": 30777563715840.0, "grad_norm": 1.458009570379395, "language_loss": 0.81511939, "learning_rate": 3.064036043906966e-07, "loss": 0.83693892, "num_input_tokens_seen": 148485570, "step": 6876, "time_per_iteration": 2.671701431274414 }, { "auxiliary_loss_clip": 0.01133442, "auxiliary_loss_mlp": 0.01023876, "balance_loss_clip": 1.04322791, "balance_loss_mlp": 1.01637173, "epoch": 0.8269103589250285, "flos": 40624915242240.0, "grad_norm": 1.9827016228190217, "language_loss": 0.67826402, "learning_rate": 3.059893867088668e-07, "loss": 0.69983721, "num_input_tokens_seen": 148509715, "step": 6877, "time_per_iteration": 2.8650858402252197 }, { "auxiliary_loss_clip": 0.0115195, "auxiliary_loss_mlp": 0.01025434, "balance_loss_clip": 1.04621649, "balance_loss_mlp": 1.01837957, "epoch": 0.8270306018156677, "flos": 30263645877120.0, "grad_norm": 1.998923847080477, "language_loss": 0.67202789, "learning_rate": 3.055754260071004e-07, "loss": 0.6938017, "num_input_tokens_seen": 148532010, "step": 6878, "time_per_iteration": 2.6947569847106934 }, { "auxiliary_loss_clip": 0.01152336, "auxiliary_loss_mlp": 0.01023093, "balance_loss_clip": 1.04327726, "balance_loss_mlp": 1.01660514, "epoch": 0.8271508447063067, "flos": 25226599812480.0, "grad_norm": 4.023478785188813, "language_loss": 0.73942006, "learning_rate": 3.051617223481948e-07, "loss": 0.76117432, "num_input_tokens_seen": 148553330, "step": 6879, "time_per_iteration": 2.6372623443603516 }, { "auxiliary_loss_clip": 0.0113727, "auxiliary_loss_mlp": 0.01027581, "balance_loss_clip": 1.04294944, "balance_loss_mlp": 1.01989818, "epoch": 0.8272710875969458, "flos": 17566602900480.0, "grad_norm": 1.938147120761069, "language_loss": 0.75182259, "learning_rate": 3.047482757949078e-07, "loss": 0.77347112, "num_input_tokens_seen": 148570960, "step": 6880, "time_per_iteration": 2.675318956375122 }, { "auxiliary_loss_clip": 0.01120053, "auxiliary_loss_mlp": 0.00762017, "balance_loss_clip": 1.03933787, "balance_loss_mlp": 1.00028944, "epoch": 0.827391330487585, "flos": 19755465886080.0, "grad_norm": 2.021647288884602, "language_loss": 0.85845852, "learning_rate": 3.043350864099605e-07, "loss": 0.87727928, "num_input_tokens_seen": 148589520, "step": 6881, "time_per_iteration": 2.6630618572235107 }, { "auxiliary_loss_clip": 0.01154736, "auxiliary_loss_mlp": 0.01023861, "balance_loss_clip": 1.0430789, "balance_loss_mlp": 1.01655674, "epoch": 0.827511573378224, "flos": 16835174254080.0, "grad_norm": 2.243049787858883, "language_loss": 0.80511105, "learning_rate": 3.039221542560315e-07, "loss": 0.82689703, "num_input_tokens_seen": 148606085, "step": 6882, "time_per_iteration": 2.5902864933013916 }, { "auxiliary_loss_clip": 0.01153419, "auxiliary_loss_mlp": 0.01020627, "balance_loss_clip": 1.04627717, "balance_loss_mlp": 1.01373923, "epoch": 0.8276318162688631, "flos": 18369242259840.0, "grad_norm": 1.8147681616399893, "language_loss": 0.73521298, "learning_rate": 3.0350947939576356e-07, "loss": 0.75695348, "num_input_tokens_seen": 148625240, "step": 6883, "time_per_iteration": 2.6904993057250977 }, { "auxiliary_loss_clip": 0.01158903, "auxiliary_loss_mlp": 0.01026369, "balance_loss_clip": 1.04635811, "balance_loss_mlp": 1.01894212, "epoch": 0.8277520591595022, "flos": 19352691705600.0, "grad_norm": 1.649741636592778, "language_loss": 0.72427964, "learning_rate": 3.0309706189175876e-07, "loss": 0.74613237, "num_input_tokens_seen": 148645075, "step": 6884, "time_per_iteration": 2.7054338455200195 }, { "auxiliary_loss_clip": 0.0104523, "auxiliary_loss_mlp": 0.01000808, "balance_loss_clip": 1.01041222, "balance_loss_mlp": 0.99968702, "epoch": 0.8278723020501413, "flos": 67918858329600.0, "grad_norm": 0.7696649978691301, "language_loss": 0.57408834, "learning_rate": 3.0268490180658045e-07, "loss": 0.5945487, "num_input_tokens_seen": 148707855, "step": 6885, "time_per_iteration": 3.2412540912628174 }, { "auxiliary_loss_clip": 0.01171355, "auxiliary_loss_mlp": 0.01026037, "balance_loss_clip": 1.04918337, "balance_loss_mlp": 1.01908994, "epoch": 0.8279925449407803, "flos": 18185738653440.0, "grad_norm": 2.8077949817409174, "language_loss": 0.79323328, "learning_rate": 3.0227299920275305e-07, "loss": 0.81520724, "num_input_tokens_seen": 148724170, "step": 6886, "time_per_iteration": 2.5954487323760986 }, { "auxiliary_loss_clip": 0.01128624, "auxiliary_loss_mlp": 0.01026779, "balance_loss_clip": 1.04439616, "balance_loss_mlp": 1.01874399, "epoch": 0.8281127878314195, "flos": 20631434860800.0, "grad_norm": 2.1696416595507713, "language_loss": 0.85906786, "learning_rate": 3.018613541427613e-07, "loss": 0.88062191, "num_input_tokens_seen": 148743690, "step": 6887, "time_per_iteration": 2.7092034816741943 }, { "auxiliary_loss_clip": 0.01162889, "auxiliary_loss_mlp": 0.01026806, "balance_loss_clip": 1.04309034, "balance_loss_mlp": 1.01963246, "epoch": 0.8282330307220586, "flos": 18004282122240.0, "grad_norm": 1.7644289077917026, "language_loss": 0.73549163, "learning_rate": 3.0144996668905243e-07, "loss": 0.75738859, "num_input_tokens_seen": 148761070, "step": 6888, "time_per_iteration": 2.56958270072937 }, { "auxiliary_loss_clip": 0.01098022, "auxiliary_loss_mlp": 0.00761963, "balance_loss_clip": 1.03678775, "balance_loss_mlp": 1.00029647, "epoch": 0.8283532736126976, "flos": 20084120352000.0, "grad_norm": 1.9885599824076756, "language_loss": 0.82546413, "learning_rate": 3.010388369040331e-07, "loss": 0.844064, "num_input_tokens_seen": 148779730, "step": 6889, "time_per_iteration": 2.8491427898406982 }, { "auxiliary_loss_clip": 0.01148727, "auxiliary_loss_mlp": 0.01027118, "balance_loss_clip": 1.04416275, "balance_loss_mlp": 1.0195725, "epoch": 0.8284735165033368, "flos": 31868421805440.0, "grad_norm": 2.1587240715254765, "language_loss": 0.8301208, "learning_rate": 3.0062796485007156e-07, "loss": 0.85187924, "num_input_tokens_seen": 148800670, "step": 6890, "time_per_iteration": 2.711315631866455 }, { "auxiliary_loss_clip": 0.01164732, "auxiliary_loss_mlp": 0.0076239, "balance_loss_clip": 1.04417527, "balance_loss_mlp": 1.00032163, "epoch": 0.8285937593939758, "flos": 26651319840000.0, "grad_norm": 3.064607766289369, "language_loss": 0.65458012, "learning_rate": 3.002173505894965e-07, "loss": 0.67385131, "num_input_tokens_seen": 148819820, "step": 6891, "time_per_iteration": 2.621171236038208 }, { "auxiliary_loss_clip": 0.0115783, "auxiliary_loss_mlp": 0.01028173, "balance_loss_clip": 1.04501128, "balance_loss_mlp": 1.02051091, "epoch": 0.8287140022846149, "flos": 20193683811840.0, "grad_norm": 4.732032073761448, "language_loss": 0.63101268, "learning_rate": 2.998069941845973e-07, "loss": 0.65287268, "num_input_tokens_seen": 148838890, "step": 6892, "time_per_iteration": 2.5424044132232666 }, { "auxiliary_loss_clip": 0.010641, "auxiliary_loss_mlp": 0.01001911, "balance_loss_clip": 1.0102818, "balance_loss_mlp": 1.00083256, "epoch": 0.8288342451752541, "flos": 70755980019840.0, "grad_norm": 0.7639893419810463, "language_loss": 0.5742805, "learning_rate": 2.993968956976258e-07, "loss": 0.59494054, "num_input_tokens_seen": 148906635, "step": 6893, "time_per_iteration": 4.207201719284058 }, { "auxiliary_loss_clip": 0.01173939, "auxiliary_loss_mlp": 0.01026638, "balance_loss_clip": 1.04838729, "balance_loss_mlp": 1.01946783, "epoch": 0.8289544880658931, "flos": 24572235795840.0, "grad_norm": 2.629436096199152, "language_loss": 0.70249015, "learning_rate": 2.9898705519079313e-07, "loss": 0.72449589, "num_input_tokens_seen": 148925740, "step": 6894, "time_per_iteration": 2.6489667892456055 }, { "auxiliary_loss_clip": 0.01131731, "auxiliary_loss_mlp": 0.01020901, "balance_loss_clip": 1.04176688, "balance_loss_mlp": 1.01403785, "epoch": 0.8290747309565322, "flos": 22273378387200.0, "grad_norm": 1.7304504399810638, "language_loss": 0.74345016, "learning_rate": 2.985774727262715e-07, "loss": 0.76497638, "num_input_tokens_seen": 148944585, "step": 6895, "time_per_iteration": 2.8332114219665527 }, { "auxiliary_loss_clip": 0.01163331, "auxiliary_loss_mlp": 0.01023846, "balance_loss_clip": 1.04386532, "balance_loss_mlp": 1.01690769, "epoch": 0.8291949738471713, "flos": 23255570856960.0, "grad_norm": 1.9033257590703767, "language_loss": 0.81624132, "learning_rate": 2.981681483661949e-07, "loss": 0.83811307, "num_input_tokens_seen": 148964170, "step": 6896, "time_per_iteration": 3.7558043003082275 }, { "auxiliary_loss_clip": 0.01153868, "auxiliary_loss_mlp": 0.0102583, "balance_loss_clip": 1.04669404, "balance_loss_mlp": 1.01862717, "epoch": 0.8293152167378104, "flos": 52555768185600.0, "grad_norm": 1.8195632971407565, "language_loss": 0.7141301, "learning_rate": 2.9775908217265633e-07, "loss": 0.7359271, "num_input_tokens_seen": 148989405, "step": 6897, "time_per_iteration": 2.959160804748535 }, { "auxiliary_loss_clip": 0.01011354, "auxiliary_loss_mlp": 0.01000949, "balance_loss_clip": 1.01029181, "balance_loss_mlp": 0.99979228, "epoch": 0.8294354596284494, "flos": 63356156294400.0, "grad_norm": 0.8293398263021645, "language_loss": 0.5034461, "learning_rate": 2.9735027420771253e-07, "loss": 0.52356911, "num_input_tokens_seen": 149049740, "step": 6898, "time_per_iteration": 4.367146015167236 }, { "auxiliary_loss_clip": 0.01132803, "auxiliary_loss_mlp": 0.01023917, "balance_loss_clip": 1.04462063, "balance_loss_mlp": 1.01734877, "epoch": 0.8295557025190886, "flos": 24827021942400.0, "grad_norm": 1.6132516790129687, "language_loss": 0.71373212, "learning_rate": 2.969417245333774e-07, "loss": 0.73529935, "num_input_tokens_seen": 149069120, "step": 6899, "time_per_iteration": 4.0404932498931885 }, { "auxiliary_loss_clip": 0.0111941, "auxiliary_loss_mlp": 0.01020754, "balance_loss_clip": 1.04331577, "balance_loss_mlp": 1.01393509, "epoch": 0.8296759454097277, "flos": 25118580637440.0, "grad_norm": 2.296449887913235, "language_loss": 0.77969223, "learning_rate": 2.9653343321162915e-07, "loss": 0.80109388, "num_input_tokens_seen": 149088630, "step": 6900, "time_per_iteration": 2.9579837322235107 }, { "auxiliary_loss_clip": 0.01125565, "auxiliary_loss_mlp": 0.01030646, "balance_loss_clip": 1.04472113, "balance_loss_mlp": 1.02315116, "epoch": 0.8297961883003667, "flos": 24132581326080.0, "grad_norm": 2.1239738408946214, "language_loss": 0.65194774, "learning_rate": 2.9612540030440446e-07, "loss": 0.67350984, "num_input_tokens_seen": 149109175, "step": 6901, "time_per_iteration": 2.7515783309936523 }, { "auxiliary_loss_clip": 0.01044333, "auxiliary_loss_mlp": 0.01001402, "balance_loss_clip": 1.01100278, "balance_loss_mlp": 1.00024021, "epoch": 0.8299164311910058, "flos": 67446561375360.0, "grad_norm": 0.8484105544830768, "language_loss": 0.64074349, "learning_rate": 2.9571762587360206e-07, "loss": 0.66120088, "num_input_tokens_seen": 149165560, "step": 6902, "time_per_iteration": 3.1909103393554688 }, { "auxiliary_loss_clip": 0.01105254, "auxiliary_loss_mlp": 0.01029135, "balance_loss_clip": 1.03401589, "balance_loss_mlp": 1.02185166, "epoch": 0.8300366740816449, "flos": 25228682801280.0, "grad_norm": 1.8234935162403185, "language_loss": 0.74050653, "learning_rate": 2.953101099810806e-07, "loss": 0.76185042, "num_input_tokens_seen": 149185165, "step": 6903, "time_per_iteration": 2.736765146255493 }, { "auxiliary_loss_clip": 0.01148841, "auxiliary_loss_mlp": 0.01026552, "balance_loss_clip": 1.04657149, "balance_loss_mlp": 1.01981354, "epoch": 0.830156916972284, "flos": 18041018757120.0, "grad_norm": 2.201152945331171, "language_loss": 0.8320533, "learning_rate": 2.9490285268865965e-07, "loss": 0.85380727, "num_input_tokens_seen": 149202655, "step": 6904, "time_per_iteration": 2.5887012481689453 }, { "auxiliary_loss_clip": 0.01158876, "auxiliary_loss_mlp": 0.01027848, "balance_loss_clip": 1.04755235, "balance_loss_mlp": 1.02074337, "epoch": 0.830277159862923, "flos": 26322485806080.0, "grad_norm": 2.049425247208737, "language_loss": 0.79858911, "learning_rate": 2.9449585405812085e-07, "loss": 0.82045633, "num_input_tokens_seen": 149220035, "step": 6905, "time_per_iteration": 2.7241194248199463 }, { "auxiliary_loss_clip": 0.01127116, "auxiliary_loss_mlp": 0.01025406, "balance_loss_clip": 1.042521, "balance_loss_mlp": 1.01827765, "epoch": 0.8303974027535622, "flos": 19938861751680.0, "grad_norm": 1.8403439441451048, "language_loss": 0.73768473, "learning_rate": 2.940891141512043e-07, "loss": 0.75920999, "num_input_tokens_seen": 149238055, "step": 6906, "time_per_iteration": 2.7087695598602295 }, { "auxiliary_loss_clip": 0.01137095, "auxiliary_loss_mlp": 0.01030369, "balance_loss_clip": 1.04198551, "balance_loss_mlp": 1.02312124, "epoch": 0.8305176456442013, "flos": 17165552572800.0, "grad_norm": 2.198733560498124, "language_loss": 0.72272718, "learning_rate": 2.9368263302961385e-07, "loss": 0.74440181, "num_input_tokens_seen": 149256755, "step": 6907, "time_per_iteration": 2.6908416748046875 }, { "auxiliary_loss_clip": 0.010942, "auxiliary_loss_mlp": 0.01025847, "balance_loss_clip": 1.03566265, "balance_loss_mlp": 1.01814342, "epoch": 0.8306378885348403, "flos": 25627614226560.0, "grad_norm": 1.9231458316807981, "language_loss": 0.79334396, "learning_rate": 2.9327641075501075e-07, "loss": 0.81454444, "num_input_tokens_seen": 149275745, "step": 6908, "time_per_iteration": 2.8732964992523193 }, { "auxiliary_loss_clip": 0.01130173, "auxiliary_loss_mlp": 0.01027956, "balance_loss_clip": 1.03926826, "balance_loss_mlp": 1.02036285, "epoch": 0.8307581314254795, "flos": 33947864985600.0, "grad_norm": 2.8896379394197784, "language_loss": 0.66836548, "learning_rate": 2.9287044738901866e-07, "loss": 0.68994671, "num_input_tokens_seen": 149293730, "step": 6909, "time_per_iteration": 2.906161069869995 }, { "auxiliary_loss_clip": 0.01152362, "auxiliary_loss_mlp": 0.00761856, "balance_loss_clip": 1.043347, "balance_loss_mlp": 1.00036192, "epoch": 0.8308783743161186, "flos": 17562724231680.0, "grad_norm": 2.2784822058040493, "language_loss": 0.90547252, "learning_rate": 2.9246474299322274e-07, "loss": 0.92461467, "num_input_tokens_seen": 149309290, "step": 6910, "time_per_iteration": 2.729814052581787 }, { "auxiliary_loss_clip": 0.01030367, "auxiliary_loss_mlp": 0.01002008, "balance_loss_clip": 1.00917411, "balance_loss_mlp": 1.00093496, "epoch": 0.8309986172067576, "flos": 69412885649280.0, "grad_norm": 0.8876647815973384, "language_loss": 0.63141698, "learning_rate": 2.920592976291678e-07, "loss": 0.65174073, "num_input_tokens_seen": 149366620, "step": 6911, "time_per_iteration": 3.222320795059204 }, { "auxiliary_loss_clip": 0.01150811, "auxiliary_loss_mlp": 0.01024274, "balance_loss_clip": 1.04295039, "balance_loss_mlp": 1.01654959, "epoch": 0.8311188600973968, "flos": 22309755886080.0, "grad_norm": 1.965992516192072, "language_loss": 0.80837864, "learning_rate": 2.916541113583595e-07, "loss": 0.8301295, "num_input_tokens_seen": 149385120, "step": 6912, "time_per_iteration": 2.684483766555786 }, { "auxiliary_loss_clip": 0.01130691, "auxiliary_loss_mlp": 0.01026734, "balance_loss_clip": 1.04552412, "balance_loss_mlp": 1.0190568, "epoch": 0.8312391029880358, "flos": 18770077105920.0, "grad_norm": 2.1070939777772084, "language_loss": 0.66598034, "learning_rate": 2.912491842422642e-07, "loss": 0.6875546, "num_input_tokens_seen": 149402825, "step": 6913, "time_per_iteration": 2.6930036544799805 }, { "auxiliary_loss_clip": 0.0115166, "auxiliary_loss_mlp": 0.01027434, "balance_loss_clip": 1.04362988, "balance_loss_mlp": 1.02048993, "epoch": 0.8313593458786749, "flos": 20376648714240.0, "grad_norm": 1.6819199638623243, "language_loss": 0.71018171, "learning_rate": 2.9084451634230857e-07, "loss": 0.73197263, "num_input_tokens_seen": 149422125, "step": 6914, "time_per_iteration": 2.6302640438079834 }, { "auxiliary_loss_clip": 0.01123428, "auxiliary_loss_mlp": 0.01028328, "balance_loss_clip": 1.04107094, "balance_loss_mlp": 1.02105951, "epoch": 0.831479588769314, "flos": 32124069878400.0, "grad_norm": 2.3521413154079207, "language_loss": 0.71246755, "learning_rate": 2.9044010771988125e-07, "loss": 0.73398513, "num_input_tokens_seen": 149441940, "step": 6915, "time_per_iteration": 2.8047239780426025 }, { "auxiliary_loss_clip": 0.0113191, "auxiliary_loss_mlp": 0.01030803, "balance_loss_clip": 1.0428772, "balance_loss_mlp": 1.02365637, "epoch": 0.8315998316599531, "flos": 45185929338240.0, "grad_norm": 1.8169549431571592, "language_loss": 0.71967554, "learning_rate": 2.900359584363303e-07, "loss": 0.74130267, "num_input_tokens_seen": 149465045, "step": 6916, "time_per_iteration": 2.9225351810455322 }, { "auxiliary_loss_clip": 0.01107633, "auxiliary_loss_mlp": 0.01024775, "balance_loss_clip": 1.04263592, "balance_loss_mlp": 1.01683879, "epoch": 0.8317200745505922, "flos": 18363747479040.0, "grad_norm": 2.292316795099922, "language_loss": 0.84706467, "learning_rate": 2.8963206855296494e-07, "loss": 0.86838871, "num_input_tokens_seen": 149481285, "step": 6917, "time_per_iteration": 2.7519233226776123 }, { "auxiliary_loss_clip": 0.01154833, "auxiliary_loss_mlp": 0.01026699, "balance_loss_clip": 1.04508829, "balance_loss_mlp": 1.01963043, "epoch": 0.8318403174412313, "flos": 24206557386240.0, "grad_norm": 1.8113582486503708, "language_loss": 0.77350533, "learning_rate": 2.892284381310548e-07, "loss": 0.79532069, "num_input_tokens_seen": 149502700, "step": 6918, "time_per_iteration": 2.731346607208252 }, { "auxiliary_loss_clip": 0.01134044, "auxiliary_loss_mlp": 0.01026609, "balance_loss_clip": 1.04204237, "balance_loss_mlp": 1.01911092, "epoch": 0.8319605603318704, "flos": 22418780641920.0, "grad_norm": 4.695828968899663, "language_loss": 0.72206217, "learning_rate": 2.888250672318302e-07, "loss": 0.74366874, "num_input_tokens_seen": 149520100, "step": 6919, "time_per_iteration": 3.507978677749634 }, { "auxiliary_loss_clip": 0.01169787, "auxiliary_loss_mlp": 0.01025642, "balance_loss_clip": 1.04887509, "balance_loss_mlp": 1.01849198, "epoch": 0.8320808032225094, "flos": 37414501459200.0, "grad_norm": 7.529636998992757, "language_loss": 0.68715096, "learning_rate": 2.884219559164831e-07, "loss": 0.70910525, "num_input_tokens_seen": 149543245, "step": 6920, "time_per_iteration": 2.78218936920166 }, { "auxiliary_loss_clip": 0.01152442, "auxiliary_loss_mlp": 0.01022702, "balance_loss_clip": 1.04595542, "balance_loss_mlp": 1.01522791, "epoch": 0.8322010461131486, "flos": 12787395638400.0, "grad_norm": 2.499125854533929, "language_loss": 0.81368315, "learning_rate": 2.880191042461635e-07, "loss": 0.83543456, "num_input_tokens_seen": 149559185, "step": 6921, "time_per_iteration": 3.8115532398223877 }, { "auxiliary_loss_clip": 0.0111557, "auxiliary_loss_mlp": 0.01022635, "balance_loss_clip": 1.03995848, "balance_loss_mlp": 1.01557469, "epoch": 0.8323212890037877, "flos": 15815455050240.0, "grad_norm": 2.392163793002885, "language_loss": 0.80242693, "learning_rate": 2.876165122819849e-07, "loss": 0.82380903, "num_input_tokens_seen": 149577165, "step": 6922, "time_per_iteration": 2.747318744659424 }, { "auxiliary_loss_clip": 0.01163222, "auxiliary_loss_mlp": 0.0102518, "balance_loss_clip": 1.04446042, "balance_loss_mlp": 1.01795316, "epoch": 0.8324415318944267, "flos": 21719276208000.0, "grad_norm": 1.5913941281476578, "language_loss": 0.79572314, "learning_rate": 2.872141800850201e-07, "loss": 0.81760716, "num_input_tokens_seen": 149594340, "step": 6923, "time_per_iteration": 2.5436248779296875 }, { "auxiliary_loss_clip": 0.01165914, "auxiliary_loss_mlp": 0.01023858, "balance_loss_clip": 1.04652572, "balance_loss_mlp": 1.01684248, "epoch": 0.8325617747850659, "flos": 34198700636160.0, "grad_norm": 1.820792424788784, "language_loss": 0.73310322, "learning_rate": 2.868121077163024e-07, "loss": 0.75500095, "num_input_tokens_seen": 149613895, "step": 6924, "time_per_iteration": 2.6740427017211914 }, { "auxiliary_loss_clip": 0.01153397, "auxiliary_loss_mlp": 0.01022502, "balance_loss_clip": 1.04200625, "balance_loss_mlp": 1.01558518, "epoch": 0.8326820176757049, "flos": 18369457741440.0, "grad_norm": 1.8680298109177198, "language_loss": 0.72317708, "learning_rate": 2.864102952368257e-07, "loss": 0.74493611, "num_input_tokens_seen": 149631820, "step": 6925, "time_per_iteration": 4.370633125305176 }, { "auxiliary_loss_clip": 0.01096188, "auxiliary_loss_mlp": 0.01026774, "balance_loss_clip": 1.03367221, "balance_loss_mlp": 1.01985359, "epoch": 0.832802260566344, "flos": 35991325716480.0, "grad_norm": 1.552991814931484, "language_loss": 0.59365404, "learning_rate": 2.860087427075444e-07, "loss": 0.6148836, "num_input_tokens_seen": 149656070, "step": 6926, "time_per_iteration": 2.9004361629486084 }, { "auxiliary_loss_clip": 0.01131762, "auxiliary_loss_mlp": 0.01026724, "balance_loss_clip": 1.04137349, "balance_loss_mlp": 1.02005768, "epoch": 0.8329225034569832, "flos": 14244434928000.0, "grad_norm": 2.4404291254182287, "language_loss": 0.85973775, "learning_rate": 2.856074501893744e-07, "loss": 0.88132262, "num_input_tokens_seen": 149671270, "step": 6927, "time_per_iteration": 2.6743197441101074 }, { "auxiliary_loss_clip": 0.01155476, "auxiliary_loss_mlp": 0.01026991, "balance_loss_clip": 1.04830277, "balance_loss_mlp": 1.01969838, "epoch": 0.8330427463476222, "flos": 18077468083200.0, "grad_norm": 1.8192466664158928, "language_loss": 0.81531829, "learning_rate": 2.8520641774319054e-07, "loss": 0.83714294, "num_input_tokens_seen": 149689360, "step": 6928, "time_per_iteration": 2.6387438774108887 }, { "auxiliary_loss_clip": 0.01136606, "auxiliary_loss_mlp": 0.01025544, "balance_loss_clip": 1.03857946, "balance_loss_mlp": 1.01830459, "epoch": 0.8331629892382613, "flos": 18040839189120.0, "grad_norm": 1.8777610140858128, "language_loss": 0.75467509, "learning_rate": 2.848056454298309e-07, "loss": 0.77629662, "num_input_tokens_seen": 149706685, "step": 6929, "time_per_iteration": 2.6104047298431396 }, { "auxiliary_loss_clip": 0.01137764, "auxiliary_loss_mlp": 0.01026921, "balance_loss_clip": 1.04410315, "balance_loss_mlp": 1.01916659, "epoch": 0.8332832321289004, "flos": 17457398576640.0, "grad_norm": 2.282239066253034, "language_loss": 0.65346092, "learning_rate": 2.844051333100905e-07, "loss": 0.67510778, "num_input_tokens_seen": 149724230, "step": 6930, "time_per_iteration": 2.6615798473358154 }, { "auxiliary_loss_clip": 0.01138217, "auxiliary_loss_mlp": 0.01023013, "balance_loss_clip": 1.04612589, "balance_loss_mlp": 1.01610756, "epoch": 0.8334034750195395, "flos": 15084852416640.0, "grad_norm": 1.9005960920133897, "language_loss": 0.83848256, "learning_rate": 2.840048814447269e-07, "loss": 0.86009485, "num_input_tokens_seen": 149742395, "step": 6931, "time_per_iteration": 2.655243158340454 }, { "auxiliary_loss_clip": 0.01129273, "auxiliary_loss_mlp": 0.01028132, "balance_loss_clip": 1.04056931, "balance_loss_mlp": 1.02051115, "epoch": 0.8335237179101785, "flos": 19427170556160.0, "grad_norm": 2.445454972383859, "language_loss": 0.74034965, "learning_rate": 2.836048898944587e-07, "loss": 0.76192373, "num_input_tokens_seen": 149760820, "step": 6932, "time_per_iteration": 2.645317554473877 }, { "auxiliary_loss_clip": 0.01137909, "auxiliary_loss_mlp": 0.01024226, "balance_loss_clip": 1.04434729, "balance_loss_mlp": 1.01773834, "epoch": 0.8336439608008177, "flos": 21762046327680.0, "grad_norm": 2.617799933526437, "language_loss": 0.72997952, "learning_rate": 2.832051587199642e-07, "loss": 0.75160092, "num_input_tokens_seen": 149778075, "step": 6933, "time_per_iteration": 2.659097671508789 }, { "auxiliary_loss_clip": 0.01055053, "auxiliary_loss_mlp": 0.01001661, "balance_loss_clip": 1.0094887, "balance_loss_mlp": 1.00057602, "epoch": 0.8337642036914568, "flos": 59702783990400.0, "grad_norm": 0.8106477396634857, "language_loss": 0.57702243, "learning_rate": 2.828056879818821e-07, "loss": 0.59758961, "num_input_tokens_seen": 149837150, "step": 6934, "time_per_iteration": 3.156630277633667 }, { "auxiliary_loss_clip": 0.01121564, "auxiliary_loss_mlp": 0.01024203, "balance_loss_clip": 1.03780556, "balance_loss_mlp": 1.01773608, "epoch": 0.8338844465820958, "flos": 27162185022720.0, "grad_norm": 1.9561354136624716, "language_loss": 0.83049148, "learning_rate": 2.824064777408117e-07, "loss": 0.85194921, "num_input_tokens_seen": 149856940, "step": 6935, "time_per_iteration": 2.742537260055542 }, { "auxiliary_loss_clip": 0.0115133, "auxiliary_loss_mlp": 0.01026984, "balance_loss_clip": 1.04536891, "balance_loss_mlp": 1.01985502, "epoch": 0.8340046894727349, "flos": 30481264425600.0, "grad_norm": 2.30564830791528, "language_loss": 0.75979996, "learning_rate": 2.8200752805731263e-07, "loss": 0.78158313, "num_input_tokens_seen": 149879930, "step": 6936, "time_per_iteration": 2.7043325901031494 }, { "auxiliary_loss_clip": 0.01151034, "auxiliary_loss_mlp": 0.01026543, "balance_loss_clip": 1.04521227, "balance_loss_mlp": 1.01992404, "epoch": 0.834124932363374, "flos": 27126166659840.0, "grad_norm": 1.4320006619683023, "language_loss": 0.81088042, "learning_rate": 2.8160883899190625e-07, "loss": 0.83265615, "num_input_tokens_seen": 149903200, "step": 6937, "time_per_iteration": 2.8850889205932617 }, { "auxiliary_loss_clip": 0.01115015, "auxiliary_loss_mlp": 0.01022758, "balance_loss_clip": 1.04166412, "balance_loss_mlp": 1.01537263, "epoch": 0.8342451752540131, "flos": 24569865498240.0, "grad_norm": 2.0764549710500733, "language_loss": 0.73394012, "learning_rate": 2.8121041060507234e-07, "loss": 0.75531781, "num_input_tokens_seen": 149922230, "step": 6938, "time_per_iteration": 2.752775192260742 }, { "auxiliary_loss_clip": 0.01155761, "auxiliary_loss_mlp": 0.01024121, "balance_loss_clip": 1.04410243, "balance_loss_mlp": 1.01669478, "epoch": 0.8343654181446521, "flos": 26615085995520.0, "grad_norm": 1.9083064553441433, "language_loss": 0.71075499, "learning_rate": 2.808122429572528e-07, "loss": 0.73255378, "num_input_tokens_seen": 149942435, "step": 6939, "time_per_iteration": 2.7183587551116943 }, { "auxiliary_loss_clip": 0.01130593, "auxiliary_loss_mlp": 0.01022488, "balance_loss_clip": 1.04147267, "balance_loss_mlp": 1.01545501, "epoch": 0.8344856610352913, "flos": 20777268078720.0, "grad_norm": 3.195150156041863, "language_loss": 0.76000774, "learning_rate": 2.804143361088489e-07, "loss": 0.78153849, "num_input_tokens_seen": 149961615, "step": 6940, "time_per_iteration": 2.7594761848449707 }, { "auxiliary_loss_clip": 0.01131744, "auxiliary_loss_mlp": 0.01024999, "balance_loss_clip": 1.04161334, "balance_loss_mlp": 1.01825225, "epoch": 0.8346059039259304, "flos": 26095960684800.0, "grad_norm": 2.289741030076065, "language_loss": 0.78059584, "learning_rate": 2.8001669012022277e-07, "loss": 0.80216324, "num_input_tokens_seen": 149979585, "step": 6941, "time_per_iteration": 2.683696746826172 }, { "auxiliary_loss_clip": 0.01153803, "auxiliary_loss_mlp": 0.01023888, "balance_loss_clip": 1.04694963, "balance_loss_mlp": 1.01647317, "epoch": 0.8347261468165694, "flos": 29027708755200.0, "grad_norm": 1.603878145062125, "language_loss": 0.69060361, "learning_rate": 2.7961930505169795e-07, "loss": 0.71238053, "num_input_tokens_seen": 150003830, "step": 6942, "time_per_iteration": 2.7634029388427734 }, { "auxiliary_loss_clip": 0.01156569, "auxiliary_loss_mlp": 0.00762421, "balance_loss_clip": 1.04647732, "balance_loss_mlp": 1.0003109, "epoch": 0.8348463897072086, "flos": 26396461866240.0, "grad_norm": 1.9879392064093055, "language_loss": 0.76688904, "learning_rate": 2.792221809635558e-07, "loss": 0.78607893, "num_input_tokens_seen": 150024460, "step": 6943, "time_per_iteration": 2.667833089828491 }, { "auxiliary_loss_clip": 0.01084255, "auxiliary_loss_mlp": 0.01027849, "balance_loss_clip": 1.03934729, "balance_loss_mlp": 1.02080345, "epoch": 0.8349666325978476, "flos": 23367720096000.0, "grad_norm": 2.1795786053919226, "language_loss": 0.75003266, "learning_rate": 2.788253179160411e-07, "loss": 0.77115369, "num_input_tokens_seen": 150045620, "step": 6944, "time_per_iteration": 2.8700051307678223 }, { "auxiliary_loss_clip": 0.01137276, "auxiliary_loss_mlp": 0.01021681, "balance_loss_clip": 1.04302907, "balance_loss_mlp": 1.0149132, "epoch": 0.8350868754884867, "flos": 12896528135040.0, "grad_norm": 2.1806518001903665, "language_loss": 0.64790952, "learning_rate": 2.7842871596935725e-07, "loss": 0.66949916, "num_input_tokens_seen": 150064135, "step": 6945, "time_per_iteration": 3.842698574066162 }, { "auxiliary_loss_clip": 0.01156784, "auxiliary_loss_mlp": 0.01025912, "balance_loss_clip": 1.04413915, "balance_loss_mlp": 1.01897717, "epoch": 0.8352071183791259, "flos": 26505522535680.0, "grad_norm": 1.6410961565916997, "language_loss": 0.69138777, "learning_rate": 2.780323751836682e-07, "loss": 0.71321476, "num_input_tokens_seen": 150085350, "step": 6946, "time_per_iteration": 2.6883273124694824 }, { "auxiliary_loss_clip": 0.01136112, "auxiliary_loss_mlp": 0.00762105, "balance_loss_clip": 1.0402813, "balance_loss_mlp": 1.00033724, "epoch": 0.8353273612697649, "flos": 20668063754880.0, "grad_norm": 1.6535325408017343, "language_loss": 0.78626525, "learning_rate": 2.7763629561909876e-07, "loss": 0.80524743, "num_input_tokens_seen": 150106180, "step": 6947, "time_per_iteration": 3.6024527549743652 }, { "auxiliary_loss_clip": 0.01162621, "auxiliary_loss_mlp": 0.01022826, "balance_loss_clip": 1.04405856, "balance_loss_mlp": 1.0161767, "epoch": 0.835447604160404, "flos": 19754137082880.0, "grad_norm": 2.729547521258697, "language_loss": 0.76896966, "learning_rate": 2.772404773357335e-07, "loss": 0.79082417, "num_input_tokens_seen": 150125585, "step": 6948, "time_per_iteration": 2.680018186569214 }, { "auxiliary_loss_clip": 0.0111533, "auxiliary_loss_mlp": 0.01024085, "balance_loss_clip": 1.04051232, "balance_loss_mlp": 1.01718545, "epoch": 0.8355678470510431, "flos": 23435842239360.0, "grad_norm": 2.8562464878327902, "language_loss": 0.78414255, "learning_rate": 2.7684492039361853e-07, "loss": 0.80553675, "num_input_tokens_seen": 150144810, "step": 6949, "time_per_iteration": 2.6965789794921875 }, { "auxiliary_loss_clip": 0.01168538, "auxiliary_loss_mlp": 0.01024925, "balance_loss_clip": 1.04911804, "balance_loss_mlp": 1.01766479, "epoch": 0.8356880899416822, "flos": 21214588164480.0, "grad_norm": 1.6610893400646594, "language_loss": 0.83647728, "learning_rate": 2.764496248527586e-07, "loss": 0.85841191, "num_input_tokens_seen": 150163785, "step": 6950, "time_per_iteration": 4.2908408641815186 }, { "auxiliary_loss_clip": 0.01132121, "auxiliary_loss_mlp": 0.01022721, "balance_loss_clip": 1.04104352, "balance_loss_mlp": 1.01598287, "epoch": 0.8358083328323213, "flos": 28037543466240.0, "grad_norm": 2.5824855894385106, "language_loss": 0.78644133, "learning_rate": 2.760545907731211e-07, "loss": 0.80798978, "num_input_tokens_seen": 150184360, "step": 6951, "time_per_iteration": 2.7243332862854004 }, { "auxiliary_loss_clip": 0.01151549, "auxiliary_loss_mlp": 0.01028331, "balance_loss_clip": 1.04236186, "balance_loss_mlp": 1.0214082, "epoch": 0.8359285757229604, "flos": 27783655159680.0, "grad_norm": 1.6997004968865645, "language_loss": 0.67868143, "learning_rate": 2.75659818214631e-07, "loss": 0.70048022, "num_input_tokens_seen": 150205465, "step": 6952, "time_per_iteration": 2.682063341140747 }, { "auxiliary_loss_clip": 0.01142542, "auxiliary_loss_mlp": 0.0102313, "balance_loss_clip": 1.04373074, "balance_loss_mlp": 1.01573312, "epoch": 0.8360488186135995, "flos": 21435115714560.0, "grad_norm": 1.7247182491966533, "language_loss": 0.78227949, "learning_rate": 2.752653072371749e-07, "loss": 0.80393624, "num_input_tokens_seen": 150224900, "step": 6953, "time_per_iteration": 2.656730890274048 }, { "auxiliary_loss_clip": 0.01117231, "auxiliary_loss_mlp": 0.01029805, "balance_loss_clip": 1.0417397, "balance_loss_mlp": 1.02322459, "epoch": 0.8361690615042385, "flos": 27632327160960.0, "grad_norm": 1.7212566432344993, "language_loss": 0.74942958, "learning_rate": 2.7487105790060105e-07, "loss": 0.77089989, "num_input_tokens_seen": 150244310, "step": 6954, "time_per_iteration": 2.7086880207061768 }, { "auxiliary_loss_clip": 0.01153625, "auxiliary_loss_mlp": 0.0102235, "balance_loss_clip": 1.04428899, "balance_loss_mlp": 1.01546609, "epoch": 0.8362893043948777, "flos": 39202529598720.0, "grad_norm": 1.8320703755418757, "language_loss": 0.69461942, "learning_rate": 2.7447707026471587e-07, "loss": 0.71637917, "num_input_tokens_seen": 150267285, "step": 6955, "time_per_iteration": 2.7473676204681396 }, { "auxiliary_loss_clip": 0.01122806, "auxiliary_loss_mlp": 0.01028273, "balance_loss_clip": 1.04002833, "balance_loss_mlp": 1.02145696, "epoch": 0.8364095472855168, "flos": 24785329230720.0, "grad_norm": 2.5912299531918768, "language_loss": 0.7976712, "learning_rate": 2.740833443892874e-07, "loss": 0.81918204, "num_input_tokens_seen": 150285455, "step": 6956, "time_per_iteration": 2.709413528442383 }, { "auxiliary_loss_clip": 0.01136542, "auxiliary_loss_mlp": 0.01024544, "balance_loss_clip": 1.04131198, "balance_loss_mlp": 1.01793671, "epoch": 0.8365297901761558, "flos": 22743412784640.0, "grad_norm": 1.7398432603468486, "language_loss": 0.79643595, "learning_rate": 2.7368988033404327e-07, "loss": 0.81804681, "num_input_tokens_seen": 150302970, "step": 6957, "time_per_iteration": 2.691215753555298 }, { "auxiliary_loss_clip": 0.0112513, "auxiliary_loss_mlp": 0.0102357, "balance_loss_clip": 1.04152334, "balance_loss_mlp": 1.01717424, "epoch": 0.836650033066795, "flos": 28396003242240.0, "grad_norm": 1.5622276238943975, "language_loss": 0.8457827, "learning_rate": 2.732966781586712e-07, "loss": 0.86726964, "num_input_tokens_seen": 150322715, "step": 6958, "time_per_iteration": 2.773498058319092 }, { "auxiliary_loss_clip": 0.01144043, "auxiliary_loss_mlp": 0.01025315, "balance_loss_clip": 1.04143167, "balance_loss_mlp": 1.01840949, "epoch": 0.836770275957434, "flos": 22236857233920.0, "grad_norm": 1.6616435272934111, "language_loss": 0.66622663, "learning_rate": 2.729037379228205e-07, "loss": 0.68792021, "num_input_tokens_seen": 150342900, "step": 6959, "time_per_iteration": 2.6601452827453613 }, { "auxiliary_loss_clip": 0.01140661, "auxiliary_loss_mlp": 0.01029079, "balance_loss_clip": 1.04701686, "balance_loss_mlp": 1.02225089, "epoch": 0.8368905188480731, "flos": 22491930689280.0, "grad_norm": 1.5298461478555523, "language_loss": 0.80395037, "learning_rate": 2.725110596860998e-07, "loss": 0.82564777, "num_input_tokens_seen": 150363580, "step": 6960, "time_per_iteration": 2.61474609375 }, { "auxiliary_loss_clip": 0.01107096, "auxiliary_loss_mlp": 0.01023896, "balance_loss_clip": 1.04175925, "balance_loss_mlp": 1.01687777, "epoch": 0.8370107617387123, "flos": 13370405287680.0, "grad_norm": 2.022454069160928, "language_loss": 0.70238638, "learning_rate": 2.7211864350807776e-07, "loss": 0.72369635, "num_input_tokens_seen": 150381780, "step": 6961, "time_per_iteration": 2.759600877761841 }, { "auxiliary_loss_clip": 0.01167546, "auxiliary_loss_mlp": 0.0102807, "balance_loss_clip": 1.04706633, "balance_loss_mlp": 1.02117944, "epoch": 0.8371310046293513, "flos": 25261289372160.0, "grad_norm": 1.6360609389194645, "language_loss": 0.7353406, "learning_rate": 2.717264894482836e-07, "loss": 0.7572968, "num_input_tokens_seen": 150402120, "step": 6962, "time_per_iteration": 2.682410717010498 }, { "auxiliary_loss_clip": 0.01154549, "auxiliary_loss_mlp": 0.0102506, "balance_loss_clip": 1.04635978, "balance_loss_mlp": 1.0176152, "epoch": 0.8372512475199904, "flos": 19792705311360.0, "grad_norm": 2.1787163153914637, "language_loss": 0.80855393, "learning_rate": 2.7133459756620646e-07, "loss": 0.83035004, "num_input_tokens_seen": 150419315, "step": 6963, "time_per_iteration": 2.649040460586548 }, { "auxiliary_loss_clip": 0.0114648, "auxiliary_loss_mlp": 0.01018782, "balance_loss_clip": 1.04381657, "balance_loss_mlp": 1.01194799, "epoch": 0.8373714904106295, "flos": 19391224020480.0, "grad_norm": 1.6240082562631224, "language_loss": 0.73497939, "learning_rate": 2.7094296792129733e-07, "loss": 0.75663197, "num_input_tokens_seen": 150438915, "step": 6964, "time_per_iteration": 2.6730337142944336 }, { "auxiliary_loss_clip": 0.01150866, "auxiliary_loss_mlp": 0.01022725, "balance_loss_clip": 1.043414, "balance_loss_mlp": 1.01542282, "epoch": 0.8374917333012686, "flos": 14975935401600.0, "grad_norm": 1.7661703526574635, "language_loss": 0.75496614, "learning_rate": 2.7055160057296424e-07, "loss": 0.77670205, "num_input_tokens_seen": 150456155, "step": 6965, "time_per_iteration": 2.6447768211364746 }, { "auxiliary_loss_clip": 0.01123953, "auxiliary_loss_mlp": 0.01026921, "balance_loss_clip": 1.04136562, "balance_loss_mlp": 1.01976275, "epoch": 0.8376119761919076, "flos": 30331839847680.0, "grad_norm": 1.6031496584173668, "language_loss": 0.72237158, "learning_rate": 2.7016049558057896e-07, "loss": 0.74388039, "num_input_tokens_seen": 150478115, "step": 6966, "time_per_iteration": 2.805903434753418 }, { "auxiliary_loss_clip": 0.01153846, "auxiliary_loss_mlp": 0.01023246, "balance_loss_clip": 1.04736912, "balance_loss_mlp": 1.01602149, "epoch": 0.8377322190825467, "flos": 29423336129280.0, "grad_norm": 1.6583781180368389, "language_loss": 0.70793593, "learning_rate": 2.6976965300347074e-07, "loss": 0.72970682, "num_input_tokens_seen": 150500725, "step": 6967, "time_per_iteration": 2.699808359146118 }, { "auxiliary_loss_clip": 0.01132811, "auxiliary_loss_mlp": 0.01026507, "balance_loss_clip": 1.04148948, "balance_loss_mlp": 1.01961422, "epoch": 0.8378524619731859, "flos": 26687086807680.0, "grad_norm": 2.9504190943416244, "language_loss": 0.6922363, "learning_rate": 2.693790729009309e-07, "loss": 0.71382952, "num_input_tokens_seen": 150522335, "step": 6968, "time_per_iteration": 2.7191340923309326 }, { "auxiliary_loss_clip": 0.01137376, "auxiliary_loss_mlp": 0.01021762, "balance_loss_clip": 1.04343688, "balance_loss_mlp": 1.01468658, "epoch": 0.8379727048638249, "flos": 20703866636160.0, "grad_norm": 1.7418338712959534, "language_loss": 0.88372296, "learning_rate": 2.6898875533220946e-07, "loss": 0.90531433, "num_input_tokens_seen": 150541640, "step": 6969, "time_per_iteration": 2.7307398319244385 }, { "auxiliary_loss_clip": 0.01161439, "auxiliary_loss_mlp": 0.01019904, "balance_loss_clip": 1.04563558, "balance_loss_mlp": 1.01363683, "epoch": 0.838092947754464, "flos": 20084084438400.0, "grad_norm": 2.0010800646827835, "language_loss": 0.81853807, "learning_rate": 2.685987003565171e-07, "loss": 0.84035152, "num_input_tokens_seen": 150559680, "step": 6970, "time_per_iteration": 2.6180191040039062 }, { "auxiliary_loss_clip": 0.01113633, "auxiliary_loss_mlp": 0.01022113, "balance_loss_clip": 1.04127896, "balance_loss_mlp": 1.01500487, "epoch": 0.8382131906451031, "flos": 18113270964480.0, "grad_norm": 2.6728955739784306, "language_loss": 0.75138009, "learning_rate": 2.6820890803302566e-07, "loss": 0.77273756, "num_input_tokens_seen": 150575205, "step": 6971, "time_per_iteration": 3.5802676677703857 }, { "auxiliary_loss_clip": 0.01136773, "auxiliary_loss_mlp": 0.01025168, "balance_loss_clip": 1.04504228, "balance_loss_mlp": 1.01831353, "epoch": 0.8383334335357422, "flos": 17092653920640.0, "grad_norm": 2.0663337904546912, "language_loss": 0.81727618, "learning_rate": 2.6781937842086557e-07, "loss": 0.83889556, "num_input_tokens_seen": 150593995, "step": 6972, "time_per_iteration": 2.688053607940674 }, { "auxiliary_loss_clip": 0.01153575, "auxiliary_loss_mlp": 0.01023938, "balance_loss_clip": 1.04478717, "balance_loss_mlp": 1.01690483, "epoch": 0.8384536764263812, "flos": 20704728562560.0, "grad_norm": 2.071723705513684, "language_loss": 0.67204165, "learning_rate": 2.6743011157912933e-07, "loss": 0.69381678, "num_input_tokens_seen": 150613715, "step": 6973, "time_per_iteration": 3.601346015930176 }, { "auxiliary_loss_clip": 0.0110494, "auxiliary_loss_mlp": 0.01021795, "balance_loss_clip": 1.0343436, "balance_loss_mlp": 1.01432633, "epoch": 0.8385739193170204, "flos": 28986842056320.0, "grad_norm": 2.2157827594547, "language_loss": 0.65132111, "learning_rate": 2.6704110756686725e-07, "loss": 0.67258847, "num_input_tokens_seen": 150634540, "step": 6974, "time_per_iteration": 2.7911980152130127 }, { "auxiliary_loss_clip": 0.01134653, "auxiliary_loss_mlp": 0.00762102, "balance_loss_clip": 1.04156375, "balance_loss_mlp": 1.00028193, "epoch": 0.8386941622076595, "flos": 23438068882560.0, "grad_norm": 1.84554909251744, "language_loss": 0.83899009, "learning_rate": 2.6665236644309085e-07, "loss": 0.85795766, "num_input_tokens_seen": 150654850, "step": 6975, "time_per_iteration": 2.7316012382507324 }, { "auxiliary_loss_clip": 0.01152296, "auxiliary_loss_mlp": 0.01025471, "balance_loss_clip": 1.04457295, "balance_loss_mlp": 1.01907289, "epoch": 0.8388144050982985, "flos": 23002724044800.0, "grad_norm": 1.838204913583312, "language_loss": 0.79792708, "learning_rate": 2.662638882667727e-07, "loss": 0.81970477, "num_input_tokens_seen": 150673790, "step": 6976, "time_per_iteration": 4.411527395248413 }, { "auxiliary_loss_clip": 0.01167, "auxiliary_loss_mlp": 0.0102742, "balance_loss_clip": 1.04473424, "balance_loss_mlp": 1.02035403, "epoch": 0.8389346479889377, "flos": 24280353878400.0, "grad_norm": 2.0631165611587643, "language_loss": 0.72908771, "learning_rate": 2.658756730968443e-07, "loss": 0.75103199, "num_input_tokens_seen": 150692255, "step": 6977, "time_per_iteration": 2.6788253784179688 }, { "auxiliary_loss_clip": 0.01141242, "auxiliary_loss_mlp": 0.01022056, "balance_loss_clip": 1.04516315, "balance_loss_mlp": 1.01516569, "epoch": 0.8390548908795767, "flos": 21215019127680.0, "grad_norm": 1.945889137960177, "language_loss": 0.88383639, "learning_rate": 2.654877209921975e-07, "loss": 0.90546942, "num_input_tokens_seen": 150709790, "step": 6978, "time_per_iteration": 2.7129976749420166 }, { "auxiliary_loss_clip": 0.01115461, "auxiliary_loss_mlp": 0.01030892, "balance_loss_clip": 1.03854418, "balance_loss_mlp": 1.02288079, "epoch": 0.8391751337702158, "flos": 35627299332480.0, "grad_norm": 2.293842820281841, "language_loss": 0.62911761, "learning_rate": 2.651000320116843e-07, "loss": 0.65058112, "num_input_tokens_seen": 150730675, "step": 6979, "time_per_iteration": 2.831265926361084 }, { "auxiliary_loss_clip": 0.0111999, "auxiliary_loss_mlp": 0.00763164, "balance_loss_clip": 1.04119182, "balance_loss_mlp": 1.00031948, "epoch": 0.839295376660855, "flos": 21325229032320.0, "grad_norm": 2.211234443006644, "language_loss": 0.76118541, "learning_rate": 2.647126062141163e-07, "loss": 0.78001696, "num_input_tokens_seen": 150749750, "step": 6980, "time_per_iteration": 2.7487103939056396 }, { "auxiliary_loss_clip": 0.01140194, "auxiliary_loss_mlp": 0.01024791, "balance_loss_clip": 1.04015064, "balance_loss_mlp": 1.0178858, "epoch": 0.839415619551494, "flos": 18442535961600.0, "grad_norm": 1.9620609120349832, "language_loss": 0.83808976, "learning_rate": 2.643254436582669e-07, "loss": 0.85973966, "num_input_tokens_seen": 150769240, "step": 6981, "time_per_iteration": 2.7452313899993896 }, { "auxiliary_loss_clip": 0.01113608, "auxiliary_loss_mlp": 0.0102325, "balance_loss_clip": 1.04101145, "balance_loss_mlp": 1.01576352, "epoch": 0.8395358624421331, "flos": 23221958705280.0, "grad_norm": 1.738223522358968, "language_loss": 0.82014567, "learning_rate": 2.6393854440286743e-07, "loss": 0.84151423, "num_input_tokens_seen": 150788410, "step": 6982, "time_per_iteration": 2.749258279800415 }, { "auxiliary_loss_clip": 0.01167569, "auxiliary_loss_mlp": 0.0103245, "balance_loss_clip": 1.04897523, "balance_loss_mlp": 1.02578616, "epoch": 0.8396561053327722, "flos": 24381657210240.0, "grad_norm": 2.0963858468313434, "language_loss": 0.70762157, "learning_rate": 2.6355190850661045e-07, "loss": 0.72962177, "num_input_tokens_seen": 150805245, "step": 6983, "time_per_iteration": 2.5884456634521484 }, { "auxiliary_loss_clip": 0.01138932, "auxiliary_loss_mlp": 0.01022086, "balance_loss_clip": 1.04584336, "balance_loss_mlp": 1.01494551, "epoch": 0.8397763482234113, "flos": 22237755073920.0, "grad_norm": 1.7944106453538728, "language_loss": 0.86556828, "learning_rate": 2.631655360281486e-07, "loss": 0.88717842, "num_input_tokens_seen": 150824920, "step": 6984, "time_per_iteration": 2.659560203552246 }, { "auxiliary_loss_clip": 0.0115766, "auxiliary_loss_mlp": 0.00762414, "balance_loss_clip": 1.04434347, "balance_loss_mlp": 1.00032902, "epoch": 0.8398965911140504, "flos": 22163743100160.0, "grad_norm": 2.04143474504253, "language_loss": 0.65331852, "learning_rate": 2.6277942702609323e-07, "loss": 0.67251927, "num_input_tokens_seen": 150844400, "step": 6985, "time_per_iteration": 2.5993106365203857 }, { "auxiliary_loss_clip": 0.01125262, "auxiliary_loss_mlp": 0.0102337, "balance_loss_clip": 1.04267311, "balance_loss_mlp": 1.01626539, "epoch": 0.8400168340046895, "flos": 21542775753600.0, "grad_norm": 2.2294006909275677, "language_loss": 0.87365502, "learning_rate": 2.623935815590186e-07, "loss": 0.8951413, "num_input_tokens_seen": 150862780, "step": 6986, "time_per_iteration": 2.715299606323242 }, { "auxiliary_loss_clip": 0.0113987, "auxiliary_loss_mlp": 0.010237, "balance_loss_clip": 1.04602766, "balance_loss_mlp": 1.01657724, "epoch": 0.8401370768953286, "flos": 22491966602880.0, "grad_norm": 5.49628294358022, "language_loss": 0.80774987, "learning_rate": 2.6200799968545516e-07, "loss": 0.82938552, "num_input_tokens_seen": 150883075, "step": 6987, "time_per_iteration": 2.657796859741211 }, { "auxiliary_loss_clip": 0.0104398, "auxiliary_loss_mlp": 0.01001793, "balance_loss_clip": 1.01385546, "balance_loss_mlp": 1.00069666, "epoch": 0.8402573197859676, "flos": 59238890818560.0, "grad_norm": 1.2369501984306077, "language_loss": 0.56422448, "learning_rate": 2.616226814638969e-07, "loss": 0.58468223, "num_input_tokens_seen": 150948180, "step": 6988, "time_per_iteration": 3.2862749099731445 }, { "auxiliary_loss_clip": 0.01141679, "auxiliary_loss_mlp": 0.01021775, "balance_loss_clip": 1.04574263, "balance_loss_mlp": 1.01483691, "epoch": 0.8403775626766068, "flos": 22674608282880.0, "grad_norm": 1.944297281431451, "language_loss": 0.77621794, "learning_rate": 2.612376269527954e-07, "loss": 0.79785246, "num_input_tokens_seen": 150967885, "step": 6989, "time_per_iteration": 2.6740596294403076 }, { "auxiliary_loss_clip": 0.01136484, "auxiliary_loss_mlp": 0.01029104, "balance_loss_clip": 1.04451239, "balance_loss_mlp": 1.02229404, "epoch": 0.8404978055672458, "flos": 19609704495360.0, "grad_norm": 1.8160304535090925, "language_loss": 0.67219532, "learning_rate": 2.608528362105635e-07, "loss": 0.69385117, "num_input_tokens_seen": 150987255, "step": 6990, "time_per_iteration": 2.645740270614624 }, { "auxiliary_loss_clip": 0.01125828, "auxiliary_loss_mlp": 0.010257, "balance_loss_clip": 1.04073668, "balance_loss_mlp": 1.0183568, "epoch": 0.8406180484578849, "flos": 27526929678720.0, "grad_norm": 1.8435575235186863, "language_loss": 0.73242462, "learning_rate": 2.6046830929557374e-07, "loss": 0.75393993, "num_input_tokens_seen": 151006905, "step": 6991, "time_per_iteration": 2.8330512046813965 }, { "auxiliary_loss_clip": 0.01119346, "auxiliary_loss_mlp": 0.0103008, "balance_loss_clip": 1.04183793, "balance_loss_mlp": 1.0229187, "epoch": 0.8407382913485241, "flos": 22127473342080.0, "grad_norm": 1.7899578276103818, "language_loss": 0.84913933, "learning_rate": 2.6008404626615776e-07, "loss": 0.8706336, "num_input_tokens_seen": 151025405, "step": 6992, "time_per_iteration": 2.797581911087036 }, { "auxiliary_loss_clip": 0.01156149, "auxiliary_loss_mlp": 0.01028373, "balance_loss_clip": 1.04540968, "balance_loss_mlp": 1.02114296, "epoch": 0.8408585342391631, "flos": 13918473982080.0, "grad_norm": 6.064781602813566, "language_loss": 0.7370891, "learning_rate": 2.597000471806092e-07, "loss": 0.75893432, "num_input_tokens_seen": 151041970, "step": 6993, "time_per_iteration": 2.633920431137085 }, { "auxiliary_loss_clip": 0.01135768, "auxiliary_loss_mlp": 0.01028484, "balance_loss_clip": 1.0458045, "balance_loss_mlp": 1.02072895, "epoch": 0.8409787771298022, "flos": 20187865808640.0, "grad_norm": 1.9524396115806804, "language_loss": 0.73114771, "learning_rate": 2.593163120971793e-07, "loss": 0.75279021, "num_input_tokens_seen": 151060835, "step": 6994, "time_per_iteration": 2.6739163398742676 }, { "auxiliary_loss_clip": 0.01100822, "auxiliary_loss_mlp": 0.01023529, "balance_loss_clip": 1.03693962, "balance_loss_mlp": 1.01669538, "epoch": 0.8410990200204413, "flos": 23142523777920.0, "grad_norm": 1.8553947982649048, "language_loss": 0.69391561, "learning_rate": 2.5893284107408165e-07, "loss": 0.71515912, "num_input_tokens_seen": 151078205, "step": 6995, "time_per_iteration": 2.762742519378662 }, { "auxiliary_loss_clip": 0.01108347, "auxiliary_loss_mlp": 0.01024856, "balance_loss_clip": 1.04152846, "balance_loss_mlp": 1.01787341, "epoch": 0.8412192629110804, "flos": 24027219757440.0, "grad_norm": 1.7060053288870858, "language_loss": 0.77827573, "learning_rate": 2.5854963416948726e-07, "loss": 0.79960769, "num_input_tokens_seen": 151100470, "step": 6996, "time_per_iteration": 2.802072763442993 }, { "auxiliary_loss_clip": 0.01108827, "auxiliary_loss_mlp": 0.01025264, "balance_loss_clip": 1.03562391, "balance_loss_mlp": 1.01812911, "epoch": 0.8413395058017195, "flos": 25591703604480.0, "grad_norm": 1.6374101168180368, "language_loss": 0.69195354, "learning_rate": 2.5816669144152816e-07, "loss": 0.71329439, "num_input_tokens_seen": 151121650, "step": 6997, "time_per_iteration": 3.707207441329956 }, { "auxiliary_loss_clip": 0.0106359, "auxiliary_loss_mlp": 0.0100144, "balance_loss_clip": 1.00962865, "balance_loss_mlp": 1.00037336, "epoch": 0.8414597486923585, "flos": 63635396624640.0, "grad_norm": 0.8472904490178057, "language_loss": 0.6626665, "learning_rate": 2.5778401294829777e-07, "loss": 0.68331683, "num_input_tokens_seen": 151180390, "step": 6998, "time_per_iteration": 3.2839386463165283 }, { "auxiliary_loss_clip": 0.0114934, "auxiliary_loss_mlp": 0.00762232, "balance_loss_clip": 1.04424119, "balance_loss_mlp": 1.00030518, "epoch": 0.8415799915829977, "flos": 19098731571840.0, "grad_norm": 1.739850638902741, "language_loss": 0.65066063, "learning_rate": 2.574015987478473e-07, "loss": 0.66977632, "num_input_tokens_seen": 151198520, "step": 6999, "time_per_iteration": 3.7726216316223145 }, { "auxiliary_loss_clip": 0.01144831, "auxiliary_loss_mlp": 0.01025158, "balance_loss_clip": 1.0449903, "balance_loss_mlp": 1.01810944, "epoch": 0.8417002344736367, "flos": 19821612781440.0, "grad_norm": 2.208586185950456, "language_loss": 0.87014008, "learning_rate": 2.570194488981887e-07, "loss": 0.89183998, "num_input_tokens_seen": 151215065, "step": 7000, "time_per_iteration": 2.639275312423706 }, { "auxiliary_loss_clip": 0.01063595, "auxiliary_loss_mlp": 0.01000695, "balance_loss_clip": 1.00962543, "balance_loss_mlp": 0.99964601, "epoch": 0.8418204773642758, "flos": 62161516834560.0, "grad_norm": 0.8367606008631114, "language_loss": 0.60348427, "learning_rate": 2.566375634572939e-07, "loss": 0.62412721, "num_input_tokens_seen": 151275705, "step": 7001, "time_per_iteration": 3.1223394870758057 }, { "auxiliary_loss_clip": 0.0113068, "auxiliary_loss_mlp": 0.01021491, "balance_loss_clip": 1.04214501, "balance_loss_mlp": 1.01444578, "epoch": 0.841940720254915, "flos": 17092905315840.0, "grad_norm": 2.159089342480216, "language_loss": 0.76371706, "learning_rate": 2.562559424830943e-07, "loss": 0.78523874, "num_input_tokens_seen": 151293665, "step": 7002, "time_per_iteration": 4.6003031730651855 }, { "auxiliary_loss_clip": 0.01134261, "auxiliary_loss_mlp": 0.01026676, "balance_loss_clip": 1.04182374, "balance_loss_mlp": 1.01929712, "epoch": 0.842060963145554, "flos": 16283586026880.0, "grad_norm": 2.4084769928116248, "language_loss": 0.7027722, "learning_rate": 2.5587458603348256e-07, "loss": 0.72438157, "num_input_tokens_seen": 151310955, "step": 7003, "time_per_iteration": 2.723414421081543 }, { "auxiliary_loss_clip": 0.01118105, "auxiliary_loss_mlp": 0.01025449, "balance_loss_clip": 1.03960752, "balance_loss_mlp": 1.01812959, "epoch": 0.8421812060361931, "flos": 21908238681600.0, "grad_norm": 1.8636974819042524, "language_loss": 0.84031522, "learning_rate": 2.554934941663085e-07, "loss": 0.86175072, "num_input_tokens_seen": 151328490, "step": 7004, "time_per_iteration": 2.7512331008911133 }, { "auxiliary_loss_clip": 0.01123125, "auxiliary_loss_mlp": 0.0102224, "balance_loss_clip": 1.0409168, "balance_loss_mlp": 1.01462245, "epoch": 0.8423014489268322, "flos": 27777693502080.0, "grad_norm": 2.0728739590679375, "language_loss": 0.73428416, "learning_rate": 2.5511266693938484e-07, "loss": 0.75573778, "num_input_tokens_seen": 151346950, "step": 7005, "time_per_iteration": 2.7689740657806396 }, { "auxiliary_loss_clip": 0.01137224, "auxiliary_loss_mlp": 0.0102759, "balance_loss_clip": 1.0451864, "balance_loss_mlp": 1.01981497, "epoch": 0.8424216918174713, "flos": 25117610970240.0, "grad_norm": 1.5590548070233865, "language_loss": 0.77958858, "learning_rate": 2.547321044104822e-07, "loss": 0.80123675, "num_input_tokens_seen": 151368445, "step": 7006, "time_per_iteration": 2.7984139919281006 }, { "auxiliary_loss_clip": 0.01172034, "auxiliary_loss_mlp": 0.0102309, "balance_loss_clip": 1.04991269, "balance_loss_mlp": 1.01585948, "epoch": 0.8425419347081103, "flos": 24748448941440.0, "grad_norm": 1.8094242763878217, "language_loss": 0.7691561, "learning_rate": 2.5435180663733113e-07, "loss": 0.7911073, "num_input_tokens_seen": 151388745, "step": 7007, "time_per_iteration": 2.710113763809204 }, { "auxiliary_loss_clip": 0.01120089, "auxiliary_loss_mlp": 0.01023057, "balance_loss_clip": 1.04015744, "balance_loss_mlp": 1.0158751, "epoch": 0.8426621775987495, "flos": 24820916630400.0, "grad_norm": 2.336410961902473, "language_loss": 0.71521246, "learning_rate": 2.539717736776241e-07, "loss": 0.73664391, "num_input_tokens_seen": 151404970, "step": 7008, "time_per_iteration": 2.7850589752197266 }, { "auxiliary_loss_clip": 0.01148829, "auxiliary_loss_mlp": 0.01020958, "balance_loss_clip": 1.04475522, "balance_loss_mlp": 1.01420176, "epoch": 0.8427824204893886, "flos": 23550074467200.0, "grad_norm": 1.529002743032788, "language_loss": 0.76523453, "learning_rate": 2.535920055890097e-07, "loss": 0.78693247, "num_input_tokens_seen": 151426265, "step": 7009, "time_per_iteration": 2.6552512645721436 }, { "auxiliary_loss_clip": 0.01105516, "auxiliary_loss_mlp": 0.01027151, "balance_loss_clip": 1.03949189, "balance_loss_mlp": 1.01965904, "epoch": 0.8429026633800276, "flos": 16143858120960.0, "grad_norm": 8.999618408915026, "language_loss": 0.64795172, "learning_rate": 2.5321250242910006e-07, "loss": 0.66927838, "num_input_tokens_seen": 151444180, "step": 7010, "time_per_iteration": 2.7009873390197754 }, { "auxiliary_loss_clip": 0.01166204, "auxiliary_loss_mlp": 0.01024535, "balance_loss_clip": 1.04684448, "balance_loss_mlp": 1.01788008, "epoch": 0.8430229062706668, "flos": 22198540400640.0, "grad_norm": 1.74431619907716, "language_loss": 0.86437142, "learning_rate": 2.5283326425546493e-07, "loss": 0.88627881, "num_input_tokens_seen": 151463290, "step": 7011, "time_per_iteration": 2.6431782245635986 }, { "auxiliary_loss_clip": 0.01116967, "auxiliary_loss_mlp": 0.01019903, "balance_loss_clip": 1.04379439, "balance_loss_mlp": 1.01368964, "epoch": 0.8431431491613058, "flos": 35330317683840.0, "grad_norm": 1.9529512025556373, "language_loss": 0.69271219, "learning_rate": 2.5245429112563443e-07, "loss": 0.71408093, "num_input_tokens_seen": 151483965, "step": 7012, "time_per_iteration": 2.8054752349853516 }, { "auxiliary_loss_clip": 0.01151978, "auxiliary_loss_mlp": 0.01026271, "balance_loss_clip": 1.04676676, "balance_loss_mlp": 1.01891005, "epoch": 0.8432633920519449, "flos": 25812374808960.0, "grad_norm": 2.0137130052381216, "language_loss": 0.82254303, "learning_rate": 2.5207558309709865e-07, "loss": 0.84432554, "num_input_tokens_seen": 151503700, "step": 7013, "time_per_iteration": 2.693586587905884 }, { "auxiliary_loss_clip": 0.01037977, "auxiliary_loss_mlp": 0.00753656, "balance_loss_clip": 1.00973487, "balance_loss_mlp": 1.0001626, "epoch": 0.8433836349425841, "flos": 64959531592320.0, "grad_norm": 0.6570822625597135, "language_loss": 0.56242132, "learning_rate": 2.516971402273065e-07, "loss": 0.58033764, "num_input_tokens_seen": 151569765, "step": 7014, "time_per_iteration": 3.2729172706604004 }, { "auxiliary_loss_clip": 0.01134999, "auxiliary_loss_mlp": 0.01027392, "balance_loss_clip": 1.04065633, "balance_loss_mlp": 1.02075469, "epoch": 0.8435038778332231, "flos": 20229989483520.0, "grad_norm": 3.146310841218998, "language_loss": 0.67463106, "learning_rate": 2.513189625736687e-07, "loss": 0.69625497, "num_input_tokens_seen": 151586660, "step": 7015, "time_per_iteration": 2.687856435775757 }, { "auxiliary_loss_clip": 0.01129493, "auxiliary_loss_mlp": 0.0102521, "balance_loss_clip": 1.04158998, "balance_loss_mlp": 1.01777434, "epoch": 0.8436241207238622, "flos": 20992229020800.0, "grad_norm": 2.2554639897278412, "language_loss": 0.71552044, "learning_rate": 2.509410501935534e-07, "loss": 0.7370674, "num_input_tokens_seen": 151602295, "step": 7016, "time_per_iteration": 2.6681063175201416 }, { "auxiliary_loss_clip": 0.01141635, "auxiliary_loss_mlp": 0.01025483, "balance_loss_clip": 1.04518795, "balance_loss_mlp": 1.0181458, "epoch": 0.8437443636145013, "flos": 14682257804160.0, "grad_norm": 2.448269051211799, "language_loss": 0.75735784, "learning_rate": 2.5056340314429116e-07, "loss": 0.77902901, "num_input_tokens_seen": 151619760, "step": 7017, "time_per_iteration": 2.631019115447998 }, { "auxiliary_loss_clip": 0.01110437, "auxiliary_loss_mlp": 0.01022451, "balance_loss_clip": 1.03904843, "balance_loss_mlp": 1.01502419, "epoch": 0.8438646065051404, "flos": 21608814908160.0, "grad_norm": 2.1716708436909102, "language_loss": 0.80142826, "learning_rate": 2.5018602148316904e-07, "loss": 0.82275712, "num_input_tokens_seen": 151635795, "step": 7018, "time_per_iteration": 2.677196741104126 }, { "auxiliary_loss_clip": 0.01165058, "auxiliary_loss_mlp": 0.01019599, "balance_loss_clip": 1.04740286, "balance_loss_mlp": 1.01312041, "epoch": 0.8439848493957794, "flos": 23289937194240.0, "grad_norm": 1.7484482050868222, "language_loss": 0.80210567, "learning_rate": 2.498089052674359e-07, "loss": 0.82395226, "num_input_tokens_seen": 151653770, "step": 7019, "time_per_iteration": 2.6859614849090576 }, { "auxiliary_loss_clip": 0.01153381, "auxiliary_loss_mlp": 0.01028089, "balance_loss_clip": 1.04529476, "balance_loss_mlp": 1.02100241, "epoch": 0.8441050922864186, "flos": 19719339782400.0, "grad_norm": 1.9056904206983771, "language_loss": 0.75338936, "learning_rate": 2.494320545543007e-07, "loss": 0.77520406, "num_input_tokens_seen": 151673340, "step": 7020, "time_per_iteration": 2.673088312149048 }, { "auxiliary_loss_clip": 0.011689, "auxiliary_loss_mlp": 0.0102658, "balance_loss_clip": 1.04656053, "balance_loss_mlp": 1.01940954, "epoch": 0.8442253351770577, "flos": 21835268202240.0, "grad_norm": 1.7180170385952607, "language_loss": 0.66982859, "learning_rate": 2.490554694009308e-07, "loss": 0.69178343, "num_input_tokens_seen": 151694205, "step": 7021, "time_per_iteration": 2.724266529083252 }, { "auxiliary_loss_clip": 0.01154934, "auxiliary_loss_mlp": 0.01020967, "balance_loss_clip": 1.04336262, "balance_loss_mlp": 1.014431, "epoch": 0.8443455780676967, "flos": 34346365447680.0, "grad_norm": 1.5585749575031145, "language_loss": 0.78261769, "learning_rate": 2.4867914986445426e-07, "loss": 0.8043766, "num_input_tokens_seen": 151716595, "step": 7022, "time_per_iteration": 3.7058355808258057 }, { "auxiliary_loss_clip": 0.01141334, "auxiliary_loss_mlp": 0.01024842, "balance_loss_clip": 1.04170442, "balance_loss_mlp": 1.01817775, "epoch": 0.8444658209583359, "flos": 48214599281280.0, "grad_norm": 2.4291379961990964, "language_loss": 0.71314448, "learning_rate": 2.483030960019581e-07, "loss": 0.73480624, "num_input_tokens_seen": 151740525, "step": 7023, "time_per_iteration": 2.966651439666748 }, { "auxiliary_loss_clip": 0.01019181, "auxiliary_loss_mlp": 0.01000878, "balance_loss_clip": 1.00808215, "balance_loss_mlp": 0.99979371, "epoch": 0.8445860638489749, "flos": 68484773105280.0, "grad_norm": 0.7338019974225316, "language_loss": 0.55466616, "learning_rate": 2.479273078704891e-07, "loss": 0.57486671, "num_input_tokens_seen": 151793890, "step": 7024, "time_per_iteration": 3.2191364765167236 }, { "auxiliary_loss_clip": 0.01018302, "auxiliary_loss_mlp": 0.01000342, "balance_loss_clip": 1.01334238, "balance_loss_mlp": 0.99920958, "epoch": 0.844706306739614, "flos": 62833331882880.0, "grad_norm": 0.7741569441432007, "language_loss": 0.64670718, "learning_rate": 2.475517855270552e-07, "loss": 0.66689372, "num_input_tokens_seen": 151853970, "step": 7025, "time_per_iteration": 4.198790073394775 }, { "auxiliary_loss_clip": 0.01163936, "auxiliary_loss_mlp": 0.01028206, "balance_loss_clip": 1.04598677, "balance_loss_mlp": 1.02153897, "epoch": 0.8448265496302532, "flos": 14976114969600.0, "grad_norm": 2.0759049913267007, "language_loss": 0.7254976, "learning_rate": 2.4717652902862143e-07, "loss": 0.747419, "num_input_tokens_seen": 151872945, "step": 7026, "time_per_iteration": 2.5440597534179688 }, { "auxiliary_loss_clip": 0.01140994, "auxiliary_loss_mlp": 0.01027947, "balance_loss_clip": 1.0416491, "balance_loss_mlp": 1.02110124, "epoch": 0.8449467925208922, "flos": 23441265192960.0, "grad_norm": 1.7079991438787616, "language_loss": 0.81183183, "learning_rate": 2.4680153843211495e-07, "loss": 0.83352125, "num_input_tokens_seen": 151892875, "step": 7027, "time_per_iteration": 2.6581881046295166 }, { "auxiliary_loss_clip": 0.01137541, "auxiliary_loss_mlp": 0.0102712, "balance_loss_clip": 1.04580116, "balance_loss_mlp": 1.0192821, "epoch": 0.8450670354115313, "flos": 22748045639040.0, "grad_norm": 2.096913556250651, "language_loss": 0.72426206, "learning_rate": 2.464268137944212e-07, "loss": 0.74590874, "num_input_tokens_seen": 151914170, "step": 7028, "time_per_iteration": 3.611950635910034 }, { "auxiliary_loss_clip": 0.01097353, "auxiliary_loss_mlp": 0.01026593, "balance_loss_clip": 1.03791738, "balance_loss_mlp": 1.01949668, "epoch": 0.8451872783021703, "flos": 29825571605760.0, "grad_norm": 1.790828883094282, "language_loss": 0.78204376, "learning_rate": 2.46052355172385e-07, "loss": 0.80328321, "num_input_tokens_seen": 151932210, "step": 7029, "time_per_iteration": 2.7777621746063232 }, { "auxiliary_loss_clip": 0.01167353, "auxiliary_loss_mlp": 0.0102024, "balance_loss_clip": 1.04588389, "balance_loss_mlp": 1.01309979, "epoch": 0.8453075211928095, "flos": 21870029589120.0, "grad_norm": 2.058876764666817, "language_loss": 0.74269986, "learning_rate": 2.456781626228128e-07, "loss": 0.76457572, "num_input_tokens_seen": 151951715, "step": 7030, "time_per_iteration": 2.6207592487335205 }, { "auxiliary_loss_clip": 0.01022978, "auxiliary_loss_mlp": 0.00753964, "balance_loss_clip": 1.0097456, "balance_loss_mlp": 1.00023866, "epoch": 0.8454277640834486, "flos": 58751869288320.0, "grad_norm": 0.9294692087443818, "language_loss": 0.66250712, "learning_rate": 2.453042362024675e-07, "loss": 0.68027657, "num_input_tokens_seen": 152004960, "step": 7031, "time_per_iteration": 3.3698291778564453 }, { "auxiliary_loss_clip": 0.01163406, "auxiliary_loss_mlp": 0.01025278, "balance_loss_clip": 1.04462099, "balance_loss_mlp": 1.01822042, "epoch": 0.8455480069740876, "flos": 27090076469760.0, "grad_norm": 1.650540415263846, "language_loss": 0.73158592, "learning_rate": 2.449305759680751e-07, "loss": 0.75347275, "num_input_tokens_seen": 152026285, "step": 7032, "time_per_iteration": 2.8203601837158203 }, { "auxiliary_loss_clip": 0.01123231, "auxiliary_loss_mlp": 0.01025167, "balance_loss_clip": 1.04423881, "balance_loss_mlp": 1.01782322, "epoch": 0.8456682498647268, "flos": 27198670262400.0, "grad_norm": 1.5311474882454255, "language_loss": 0.75364614, "learning_rate": 2.445571819763188e-07, "loss": 0.77513009, "num_input_tokens_seen": 152048585, "step": 7033, "time_per_iteration": 2.74275541305542 }, { "auxiliary_loss_clip": 0.01165106, "auxiliary_loss_mlp": 0.01024014, "balance_loss_clip": 1.04581904, "balance_loss_mlp": 1.01724303, "epoch": 0.8457884927553658, "flos": 20631901737600.0, "grad_norm": 1.744691832268476, "language_loss": 0.58519399, "learning_rate": 2.4418405428384227e-07, "loss": 0.60708523, "num_input_tokens_seen": 152068795, "step": 7034, "time_per_iteration": 2.6598153114318848 }, { "auxiliary_loss_clip": 0.01164135, "auxiliary_loss_mlp": 0.0076206, "balance_loss_clip": 1.04432368, "balance_loss_mlp": 1.0003264, "epoch": 0.8459087356460049, "flos": 15299023259520.0, "grad_norm": 1.7901131306148133, "language_loss": 0.71786171, "learning_rate": 2.4381119294724864e-07, "loss": 0.73712373, "num_input_tokens_seen": 152086240, "step": 7035, "time_per_iteration": 2.750171184539795 }, { "auxiliary_loss_clip": 0.01165368, "auxiliary_loss_mlp": 0.01025723, "balance_loss_clip": 1.04599094, "balance_loss_mlp": 1.01909554, "epoch": 0.846028978536644, "flos": 18843155326080.0, "grad_norm": 6.305948687171912, "language_loss": 0.53793579, "learning_rate": 2.434385980231004e-07, "loss": 0.5598467, "num_input_tokens_seen": 152105080, "step": 7036, "time_per_iteration": 2.637657403945923 }, { "auxiliary_loss_clip": 0.0115339, "auxiliary_loss_mlp": 0.01025476, "balance_loss_clip": 1.04558074, "balance_loss_mlp": 1.01874328, "epoch": 0.8461492214272831, "flos": 52661740285440.0, "grad_norm": 1.7613863885543728, "language_loss": 0.65282613, "learning_rate": 2.4306626956792043e-07, "loss": 0.67461479, "num_input_tokens_seen": 152130025, "step": 7037, "time_per_iteration": 2.9432764053344727 }, { "auxiliary_loss_clip": 0.01150433, "auxiliary_loss_mlp": 0.01028093, "balance_loss_clip": 1.04275322, "balance_loss_mlp": 1.02119339, "epoch": 0.8462694643179222, "flos": 18588405093120.0, "grad_norm": 3.6945433498684905, "language_loss": 0.75846136, "learning_rate": 2.4269420763819017e-07, "loss": 0.78024662, "num_input_tokens_seen": 152148070, "step": 7038, "time_per_iteration": 2.576801061630249 }, { "auxiliary_loss_clip": 0.01147919, "auxiliary_loss_mlp": 0.01027504, "balance_loss_clip": 1.0434804, "balance_loss_mlp": 1.02045321, "epoch": 0.8463897072085613, "flos": 24387080163840.0, "grad_norm": 5.254667853083792, "language_loss": 0.83874106, "learning_rate": 2.4232241229035223e-07, "loss": 0.86049527, "num_input_tokens_seen": 152165825, "step": 7039, "time_per_iteration": 2.617311954498291 }, { "auxiliary_loss_clip": 0.01055637, "auxiliary_loss_mlp": 0.01000542, "balance_loss_clip": 1.01017976, "balance_loss_mlp": 0.99946958, "epoch": 0.8465099500992004, "flos": 68702140258560.0, "grad_norm": 0.7992574556891276, "language_loss": 0.56714725, "learning_rate": 2.419508835808064e-07, "loss": 0.58770907, "num_input_tokens_seen": 152222380, "step": 7040, "time_per_iteration": 3.148341655731201 }, { "auxiliary_loss_clip": 0.01138112, "auxiliary_loss_mlp": 0.01023773, "balance_loss_clip": 1.04408526, "balance_loss_mlp": 1.0161109, "epoch": 0.8466301929898394, "flos": 13735724561280.0, "grad_norm": 2.2954023563087227, "language_loss": 0.63160312, "learning_rate": 2.415796215659134e-07, "loss": 0.65322196, "num_input_tokens_seen": 152239085, "step": 7041, "time_per_iteration": 2.711677312850952 }, { "auxiliary_loss_clip": 0.01125935, "auxiliary_loss_mlp": 0.01027803, "balance_loss_clip": 1.03857005, "balance_loss_mlp": 1.02041793, "epoch": 0.8467504358804786, "flos": 19241260738560.0, "grad_norm": 1.9916184455094037, "language_loss": 0.7761305, "learning_rate": 2.412086263019939e-07, "loss": 0.79766786, "num_input_tokens_seen": 152257110, "step": 7042, "time_per_iteration": 2.719024896621704 }, { "auxiliary_loss_clip": 0.01161164, "auxiliary_loss_mlp": 0.01026071, "balance_loss_clip": 1.04586339, "balance_loss_mlp": 1.01968145, "epoch": 0.8468706787711177, "flos": 21324115710720.0, "grad_norm": 2.160140082011248, "language_loss": 0.79931897, "learning_rate": 2.408378978453276e-07, "loss": 0.82119131, "num_input_tokens_seen": 152277230, "step": 7043, "time_per_iteration": 2.6342647075653076 }, { "auxiliary_loss_clip": 0.01055115, "auxiliary_loss_mlp": 0.01002697, "balance_loss_clip": 1.00955546, "balance_loss_mlp": 1.00158262, "epoch": 0.8469909216617567, "flos": 64877439058560.0, "grad_norm": 0.8161229485733912, "language_loss": 0.63929856, "learning_rate": 2.404674362521533e-07, "loss": 0.6598767, "num_input_tokens_seen": 152335725, "step": 7044, "time_per_iteration": 3.118412971496582 }, { "auxiliary_loss_clip": 0.01151388, "auxiliary_loss_mlp": 0.01028961, "balance_loss_clip": 1.045223, "balance_loss_mlp": 1.02175784, "epoch": 0.8471111645523959, "flos": 19280583152640.0, "grad_norm": 2.2791399808911894, "language_loss": 0.74284536, "learning_rate": 2.4009724157866997e-07, "loss": 0.7646488, "num_input_tokens_seen": 152352785, "step": 7045, "time_per_iteration": 2.553032398223877 }, { "auxiliary_loss_clip": 0.01162876, "auxiliary_loss_mlp": 0.01022107, "balance_loss_clip": 1.04493117, "balance_loss_mlp": 1.01523507, "epoch": 0.8472314074430349, "flos": 22015826893440.0, "grad_norm": 2.0631344794248725, "language_loss": 0.76700121, "learning_rate": 2.3972731388103564e-07, "loss": 0.78885102, "num_input_tokens_seen": 152371265, "step": 7046, "time_per_iteration": 2.6717097759246826 }, { "auxiliary_loss_clip": 0.01005423, "auxiliary_loss_mlp": 0.01003665, "balance_loss_clip": 1.01020527, "balance_loss_mlp": 1.00258625, "epoch": 0.847351650333674, "flos": 57882580243200.0, "grad_norm": 0.8119867572220022, "language_loss": 0.62355244, "learning_rate": 2.393576532153687e-07, "loss": 0.64364332, "num_input_tokens_seen": 152435050, "step": 7047, "time_per_iteration": 3.4974520206451416 }, { "auxiliary_loss_clip": 0.01052422, "auxiliary_loss_mlp": 0.01001826, "balance_loss_clip": 1.01044035, "balance_loss_mlp": 1.00075328, "epoch": 0.8474718932243132, "flos": 41284238313600.0, "grad_norm": 0.9268370997294185, "language_loss": 0.57778144, "learning_rate": 2.389882596377453e-07, "loss": 0.59832394, "num_input_tokens_seen": 152489315, "step": 7048, "time_per_iteration": 4.663876056671143 }, { "auxiliary_loss_clip": 0.0116231, "auxiliary_loss_mlp": 0.0102718, "balance_loss_clip": 1.04331541, "balance_loss_mlp": 1.01999116, "epoch": 0.8475921361149522, "flos": 38180906974080.0, "grad_norm": 1.8488675169436, "language_loss": 0.76770693, "learning_rate": 2.386191332042031e-07, "loss": 0.7896018, "num_input_tokens_seen": 152511210, "step": 7049, "time_per_iteration": 2.7592718601226807 }, { "auxiliary_loss_clip": 0.01168773, "auxiliary_loss_mlp": 0.01027821, "balance_loss_clip": 1.04669166, "balance_loss_mlp": 1.0205102, "epoch": 0.8477123790055913, "flos": 25375054723200.0, "grad_norm": 1.6532979096179117, "language_loss": 0.72595763, "learning_rate": 2.3825027397073794e-07, "loss": 0.74792361, "num_input_tokens_seen": 152531685, "step": 7050, "time_per_iteration": 3.645735502243042 }, { "auxiliary_loss_clip": 0.01150227, "auxiliary_loss_mlp": 0.01023875, "balance_loss_clip": 1.04850125, "balance_loss_mlp": 1.01698756, "epoch": 0.8478326218962304, "flos": 30225185389440.0, "grad_norm": 2.6993129632934343, "language_loss": 0.66546714, "learning_rate": 2.3788168199330515e-07, "loss": 0.68720824, "num_input_tokens_seen": 152553245, "step": 7051, "time_per_iteration": 2.683839797973633 }, { "auxiliary_loss_clip": 0.01121504, "auxiliary_loss_mlp": 0.01024045, "balance_loss_clip": 1.0369426, "balance_loss_mlp": 1.01723528, "epoch": 0.8479528647868695, "flos": 38213800853760.0, "grad_norm": 1.652982133840298, "language_loss": 0.7270661, "learning_rate": 2.3751335732782074e-07, "loss": 0.74852157, "num_input_tokens_seen": 152574505, "step": 7052, "time_per_iteration": 2.813793897628784 }, { "auxiliary_loss_clip": 0.01151843, "auxiliary_loss_mlp": 0.01021552, "balance_loss_clip": 1.04649878, "balance_loss_mlp": 1.01481366, "epoch": 0.8480731076775085, "flos": 20957790856320.0, "grad_norm": 2.0146958948031295, "language_loss": 0.79529226, "learning_rate": 2.371453000301582e-07, "loss": 0.81702626, "num_input_tokens_seen": 152593190, "step": 7053, "time_per_iteration": 2.6675665378570557 }, { "auxiliary_loss_clip": 0.01119206, "auxiliary_loss_mlp": 0.01027432, "balance_loss_clip": 1.04158258, "balance_loss_mlp": 1.02029765, "epoch": 0.8481933505681477, "flos": 32596510487040.0, "grad_norm": 1.9168889550430275, "language_loss": 0.74312669, "learning_rate": 2.3677751015615222e-07, "loss": 0.76459306, "num_input_tokens_seen": 152615265, "step": 7054, "time_per_iteration": 4.62365984916687 }, { "auxiliary_loss_clip": 0.01126451, "auxiliary_loss_mlp": 0.01025454, "balance_loss_clip": 1.03987098, "balance_loss_mlp": 1.0184207, "epoch": 0.8483135934587868, "flos": 20741177888640.0, "grad_norm": 2.191017697170778, "language_loss": 0.8551929, "learning_rate": 2.3640998776159593e-07, "loss": 0.87671196, "num_input_tokens_seen": 152632770, "step": 7055, "time_per_iteration": 2.774322748184204 }, { "auxiliary_loss_clip": 0.01136604, "auxiliary_loss_mlp": 0.01023614, "balance_loss_clip": 1.04237628, "balance_loss_mlp": 1.0173521, "epoch": 0.8484338363494258, "flos": 21653057485440.0, "grad_norm": 1.827601702448024, "language_loss": 0.81193769, "learning_rate": 2.3604273290224253e-07, "loss": 0.8335399, "num_input_tokens_seen": 152653485, "step": 7056, "time_per_iteration": 2.6664516925811768 }, { "auxiliary_loss_clip": 0.0113947, "auxiliary_loss_mlp": 0.01026124, "balance_loss_clip": 1.04442155, "balance_loss_mlp": 1.01858664, "epoch": 0.848554079240065, "flos": 15013964926080.0, "grad_norm": 1.8581342283989246, "language_loss": 0.74364591, "learning_rate": 2.356757456338039e-07, "loss": 0.76530182, "num_input_tokens_seen": 152670970, "step": 7057, "time_per_iteration": 2.6567416191101074 }, { "auxiliary_loss_clip": 0.01039184, "auxiliary_loss_mlp": 0.01002229, "balance_loss_clip": 1.00913858, "balance_loss_mlp": 1.00117385, "epoch": 0.848674322130704, "flos": 68060453742720.0, "grad_norm": 0.751717422672945, "language_loss": 0.59034991, "learning_rate": 2.3530902601195147e-07, "loss": 0.61076403, "num_input_tokens_seen": 152739460, "step": 7058, "time_per_iteration": 3.301959276199341 }, { "auxiliary_loss_clip": 0.01150516, "auxiliary_loss_mlp": 0.01028518, "balance_loss_clip": 1.04451203, "balance_loss_mlp": 1.02086449, "epoch": 0.8487945650213431, "flos": 18475788977280.0, "grad_norm": 2.0924921533219027, "language_loss": 0.79036587, "learning_rate": 2.34942574092317e-07, "loss": 0.8121562, "num_input_tokens_seen": 152754710, "step": 7059, "time_per_iteration": 2.6350607872009277 }, { "auxiliary_loss_clip": 0.01156126, "auxiliary_loss_mlp": 0.01023368, "balance_loss_clip": 1.04537535, "balance_loss_mlp": 1.01639414, "epoch": 0.8489148079119821, "flos": 23473189405440.0, "grad_norm": 1.8922180567389206, "language_loss": 0.76386482, "learning_rate": 2.3457638993049045e-07, "loss": 0.78565979, "num_input_tokens_seen": 152772700, "step": 7060, "time_per_iteration": 2.635366201400757 }, { "auxiliary_loss_clip": 0.01098113, "auxiliary_loss_mlp": 0.01024242, "balance_loss_clip": 1.04231179, "balance_loss_mlp": 1.01685381, "epoch": 0.8490350508026213, "flos": 19937604775680.0, "grad_norm": 1.9923190405533386, "language_loss": 0.63618362, "learning_rate": 2.3421047358202252e-07, "loss": 0.65740722, "num_input_tokens_seen": 152791550, "step": 7061, "time_per_iteration": 2.800842761993408 }, { "auxiliary_loss_clip": 0.01153311, "auxiliary_loss_mlp": 0.01023833, "balance_loss_clip": 1.04460144, "balance_loss_mlp": 1.01626873, "epoch": 0.8491552936932604, "flos": 24279958828800.0, "grad_norm": 4.274903663826911, "language_loss": 0.83620596, "learning_rate": 2.3384482510242144e-07, "loss": 0.85797745, "num_input_tokens_seen": 152809410, "step": 7062, "time_per_iteration": 2.669896125793457 }, { "auxiliary_loss_clip": 0.01166725, "auxiliary_loss_mlp": 0.01027486, "balance_loss_clip": 1.04463291, "balance_loss_mlp": 1.02065849, "epoch": 0.8492755365838994, "flos": 22522526098560.0, "grad_norm": 1.9549202490808835, "language_loss": 0.77283823, "learning_rate": 2.3347944454715575e-07, "loss": 0.79478031, "num_input_tokens_seen": 152825800, "step": 7063, "time_per_iteration": 2.6135904788970947 }, { "auxiliary_loss_clip": 0.01167854, "auxiliary_loss_mlp": 0.01030056, "balance_loss_clip": 1.04610193, "balance_loss_mlp": 1.02259374, "epoch": 0.8493957794745386, "flos": 26980441182720.0, "grad_norm": 1.6891355432433717, "language_loss": 0.67207617, "learning_rate": 2.331143319716542e-07, "loss": 0.6940552, "num_input_tokens_seen": 152845330, "step": 7064, "time_per_iteration": 2.7460579872131348 }, { "auxiliary_loss_clip": 0.01128961, "auxiliary_loss_mlp": 0.01022693, "balance_loss_clip": 1.04186976, "balance_loss_mlp": 1.01542735, "epoch": 0.8495160223651776, "flos": 29861985018240.0, "grad_norm": 2.5246985088780667, "language_loss": 0.65696877, "learning_rate": 2.3274948743130363e-07, "loss": 0.67848533, "num_input_tokens_seen": 152865165, "step": 7065, "time_per_iteration": 2.8012807369232178 }, { "auxiliary_loss_clip": 0.01163962, "auxiliary_loss_mlp": 0.01024213, "balance_loss_clip": 1.04373598, "balance_loss_mlp": 1.01723289, "epoch": 0.8496362652558167, "flos": 23075443128960.0, "grad_norm": 1.8131028880251328, "language_loss": 0.79195386, "learning_rate": 2.3238491098145085e-07, "loss": 0.81383562, "num_input_tokens_seen": 152884695, "step": 7066, "time_per_iteration": 2.6099555492401123 }, { "auxiliary_loss_clip": 0.01152292, "auxiliary_loss_mlp": 0.01023056, "balance_loss_clip": 1.04537451, "balance_loss_mlp": 1.01535547, "epoch": 0.8497565081464559, "flos": 14609107756800.0, "grad_norm": 2.163188294058064, "language_loss": 0.73331773, "learning_rate": 2.3202060267740141e-07, "loss": 0.75507128, "num_input_tokens_seen": 152902220, "step": 7067, "time_per_iteration": 2.5986642837524414 }, { "auxiliary_loss_clip": 0.01102972, "auxiliary_loss_mlp": 0.0102424, "balance_loss_clip": 1.03673494, "balance_loss_mlp": 1.01700985, "epoch": 0.8498767510370949, "flos": 21136446126720.0, "grad_norm": 2.1650669952287505, "language_loss": 0.77689844, "learning_rate": 2.3165656257442044e-07, "loss": 0.79817057, "num_input_tokens_seen": 152920740, "step": 7068, "time_per_iteration": 2.736037254333496 }, { "auxiliary_loss_clip": 0.01147295, "auxiliary_loss_mlp": 0.01024972, "balance_loss_clip": 1.0445056, "balance_loss_mlp": 1.01859462, "epoch": 0.849996993927734, "flos": 23654538195840.0, "grad_norm": 1.970296837928355, "language_loss": 0.90339291, "learning_rate": 2.31292790727734e-07, "loss": 0.92511559, "num_input_tokens_seen": 152938305, "step": 7069, "time_per_iteration": 2.6501681804656982 }, { "auxiliary_loss_clip": 0.01161699, "auxiliary_loss_mlp": 0.01022073, "balance_loss_clip": 1.04326999, "balance_loss_mlp": 1.0152452, "epoch": 0.8501172368183731, "flos": 20558069331840.0, "grad_norm": 2.7157938259234595, "language_loss": 0.80252659, "learning_rate": 2.3092928719252392e-07, "loss": 0.8243643, "num_input_tokens_seen": 152956705, "step": 7070, "time_per_iteration": 2.617213487625122 }, { "auxiliary_loss_clip": 0.01149561, "auxiliary_loss_mlp": 0.01027915, "balance_loss_clip": 1.04398203, "balance_loss_mlp": 1.02051854, "epoch": 0.8502374797090122, "flos": 22272624201600.0, "grad_norm": 2.1384330409842507, "language_loss": 0.78838897, "learning_rate": 2.3056605202393475e-07, "loss": 0.81016374, "num_input_tokens_seen": 152974265, "step": 7071, "time_per_iteration": 2.6039540767669678 }, { "auxiliary_loss_clip": 0.01144646, "auxiliary_loss_mlp": 0.00762449, "balance_loss_clip": 1.03988528, "balance_loss_mlp": 1.00029945, "epoch": 0.8503577225996513, "flos": 23659817495040.0, "grad_norm": 2.141804217037687, "language_loss": 0.66768324, "learning_rate": 2.3020308527706888e-07, "loss": 0.68675423, "num_input_tokens_seen": 152993680, "step": 7072, "time_per_iteration": 2.6783671379089355 }, { "auxiliary_loss_clip": 0.01140746, "auxiliary_loss_mlp": 0.01029438, "balance_loss_clip": 1.04054523, "balance_loss_mlp": 1.02291739, "epoch": 0.8504779654902904, "flos": 26758513002240.0, "grad_norm": 1.7001826320622906, "language_loss": 0.88688457, "learning_rate": 2.2984038700698715e-07, "loss": 0.90858644, "num_input_tokens_seen": 153012990, "step": 7073, "time_per_iteration": 2.6809561252593994 }, { "auxiliary_loss_clip": 0.01149177, "auxiliary_loss_mlp": 0.01031452, "balance_loss_clip": 1.04541969, "balance_loss_mlp": 1.02455616, "epoch": 0.8505982083809295, "flos": 26468247196800.0, "grad_norm": 1.5823771237171476, "language_loss": 0.78990161, "learning_rate": 2.2947795726871222e-07, "loss": 0.81170785, "num_input_tokens_seen": 153034015, "step": 7074, "time_per_iteration": 3.6629855632781982 }, { "auxiliary_loss_clip": 0.01150307, "auxiliary_loss_mlp": 0.00762433, "balance_loss_clip": 1.04792476, "balance_loss_mlp": 1.00032783, "epoch": 0.8507184512715685, "flos": 20303390926080.0, "grad_norm": 1.7353819702728206, "language_loss": 0.85607344, "learning_rate": 2.2911579611722253e-07, "loss": 0.87520087, "num_input_tokens_seen": 153053160, "step": 7075, "time_per_iteration": 2.61295485496521 }, { "auxiliary_loss_clip": 0.0113381, "auxiliary_loss_mlp": 0.01029057, "balance_loss_clip": 1.04171646, "balance_loss_mlp": 1.02221155, "epoch": 0.8508386941622077, "flos": 19025186474880.0, "grad_norm": 1.8119139228994352, "language_loss": 0.8735351, "learning_rate": 2.2875390360745905e-07, "loss": 0.89516377, "num_input_tokens_seen": 153072565, "step": 7076, "time_per_iteration": 3.5926101207733154 }, { "auxiliary_loss_clip": 0.0112874, "auxiliary_loss_mlp": 0.01025363, "balance_loss_clip": 1.04088712, "balance_loss_mlp": 1.01793611, "epoch": 0.8509589370528468, "flos": 16433405654400.0, "grad_norm": 1.9384262258085811, "language_loss": 0.77594537, "learning_rate": 2.2839227979432008e-07, "loss": 0.79748642, "num_input_tokens_seen": 153090215, "step": 7077, "time_per_iteration": 2.831124782562256 }, { "auxiliary_loss_clip": 0.0113754, "auxiliary_loss_mlp": 0.01020834, "balance_loss_clip": 1.04098701, "balance_loss_mlp": 1.01420879, "epoch": 0.8510791799434858, "flos": 18259714713600.0, "grad_norm": 2.5612378665857287, "language_loss": 0.85377669, "learning_rate": 2.2803092473266373e-07, "loss": 0.87536049, "num_input_tokens_seen": 153107740, "step": 7078, "time_per_iteration": 2.7760732173919678 }, { "auxiliary_loss_clip": 0.01167695, "auxiliary_loss_mlp": 0.01030133, "balance_loss_clip": 1.04660594, "balance_loss_mlp": 1.02268267, "epoch": 0.851199422834125, "flos": 23441372933760.0, "grad_norm": 2.2422804676477193, "language_loss": 0.86557108, "learning_rate": 2.2766983847730724e-07, "loss": 0.8875494, "num_input_tokens_seen": 153127410, "step": 7079, "time_per_iteration": 2.698514699935913 }, { "auxiliary_loss_clip": 0.01132373, "auxiliary_loss_mlp": 0.01029787, "balance_loss_clip": 1.04086173, "balance_loss_mlp": 1.02166915, "epoch": 0.851319665724764, "flos": 16289404030080.0, "grad_norm": 1.8402620885932484, "language_loss": 0.66402376, "learning_rate": 2.2730902108302663e-07, "loss": 0.68564534, "num_input_tokens_seen": 153144325, "step": 7080, "time_per_iteration": 4.4326677322387695 }, { "auxiliary_loss_clip": 0.01129637, "auxiliary_loss_mlp": 0.01027865, "balance_loss_clip": 1.03969181, "balance_loss_mlp": 1.02057505, "epoch": 0.8514399086154031, "flos": 18989347680000.0, "grad_norm": 2.5865598695513774, "language_loss": 0.68666607, "learning_rate": 2.269484726045583e-07, "loss": 0.70824111, "num_input_tokens_seen": 153163240, "step": 7081, "time_per_iteration": 2.653665065765381 }, { "auxiliary_loss_clip": 0.01128071, "auxiliary_loss_mlp": 0.01023198, "balance_loss_clip": 1.04253709, "balance_loss_mlp": 1.01634955, "epoch": 0.8515601515060423, "flos": 24571194301440.0, "grad_norm": 1.6435528000268327, "language_loss": 0.79199278, "learning_rate": 2.2658819309659672e-07, "loss": 0.81350553, "num_input_tokens_seen": 153183440, "step": 7082, "time_per_iteration": 2.714900255203247 }, { "auxiliary_loss_clip": 0.01134288, "auxiliary_loss_mlp": 0.01024306, "balance_loss_clip": 1.04427671, "balance_loss_mlp": 1.01786315, "epoch": 0.8516803943966813, "flos": 19529443555200.0, "grad_norm": 1.9784495074987636, "language_loss": 0.84981954, "learning_rate": 2.2622818261379706e-07, "loss": 0.87140548, "num_input_tokens_seen": 153200460, "step": 7083, "time_per_iteration": 2.6477298736572266 }, { "auxiliary_loss_clip": 0.01135917, "auxiliary_loss_mlp": 0.01025897, "balance_loss_clip": 1.04232275, "balance_loss_mlp": 1.01854444, "epoch": 0.8518006372873204, "flos": 20265792364800.0, "grad_norm": 1.705143643502875, "language_loss": 0.74859035, "learning_rate": 2.2586844121077142e-07, "loss": 0.77020854, "num_input_tokens_seen": 153218970, "step": 7084, "time_per_iteration": 2.6346898078918457 }, { "auxiliary_loss_clip": 0.0111266, "auxiliary_loss_mlp": 0.01026998, "balance_loss_clip": 1.03935778, "balance_loss_mlp": 1.01955295, "epoch": 0.8519208801779595, "flos": 24133227770880.0, "grad_norm": 2.0887545921494977, "language_loss": 0.71639442, "learning_rate": 2.2550896894209215e-07, "loss": 0.73779106, "num_input_tokens_seen": 153238485, "step": 7085, "time_per_iteration": 2.7445759773254395 }, { "auxiliary_loss_clip": 0.01014058, "auxiliary_loss_mlp": 0.01001973, "balance_loss_clip": 1.01144266, "balance_loss_mlp": 1.00085247, "epoch": 0.8520411230685986, "flos": 63035223252480.0, "grad_norm": 0.6889700179101199, "language_loss": 0.56611234, "learning_rate": 2.2514976586229184e-07, "loss": 0.58627266, "num_input_tokens_seen": 153306430, "step": 7086, "time_per_iteration": 3.5789618492126465 }, { "auxiliary_loss_clip": 0.01055412, "auxiliary_loss_mlp": 0.01001301, "balance_loss_clip": 1.01069546, "balance_loss_mlp": 1.00024021, "epoch": 0.8521613659592376, "flos": 65836865283840.0, "grad_norm": 0.7493191452507887, "language_loss": 0.54590893, "learning_rate": 2.247908320258609e-07, "loss": 0.56647611, "num_input_tokens_seen": 153366520, "step": 7087, "time_per_iteration": 3.393077850341797 }, { "auxiliary_loss_clip": 0.01104357, "auxiliary_loss_mlp": 0.01024781, "balance_loss_clip": 1.04156566, "balance_loss_mlp": 1.01749992, "epoch": 0.8522816088498768, "flos": 23112323418240.0, "grad_norm": 2.2504354816639105, "language_loss": 0.79599226, "learning_rate": 2.2443216748724914e-07, "loss": 0.81728369, "num_input_tokens_seen": 153387230, "step": 7088, "time_per_iteration": 2.785992383956909 }, { "auxiliary_loss_clip": 0.01155361, "auxiliary_loss_mlp": 0.00762343, "balance_loss_clip": 1.04596686, "balance_loss_mlp": 1.00036216, "epoch": 0.8524018517405159, "flos": 31758140073600.0, "grad_norm": 1.8550164917537284, "language_loss": 0.74374211, "learning_rate": 2.2407377230086588e-07, "loss": 0.76291919, "num_input_tokens_seen": 153409585, "step": 7089, "time_per_iteration": 2.7844386100769043 }, { "auxiliary_loss_clip": 0.01120924, "auxiliary_loss_mlp": 0.01024855, "balance_loss_clip": 1.04258204, "balance_loss_mlp": 1.01760161, "epoch": 0.8525220946311549, "flos": 18690318956160.0, "grad_norm": 2.0523451367161853, "language_loss": 0.83916366, "learning_rate": 2.23715646521079e-07, "loss": 0.86062145, "num_input_tokens_seen": 153427105, "step": 7090, "time_per_iteration": 2.854074239730835 }, { "auxiliary_loss_clip": 0.01154577, "auxiliary_loss_mlp": 0.00762573, "balance_loss_clip": 1.04452956, "balance_loss_mlp": 1.00036705, "epoch": 0.852642337521794, "flos": 21793216354560.0, "grad_norm": 4.291956274384828, "language_loss": 0.84273869, "learning_rate": 2.2335779020221724e-07, "loss": 0.86191016, "num_input_tokens_seen": 153443725, "step": 7091, "time_per_iteration": 2.7256882190704346 }, { "auxiliary_loss_clip": 0.01052688, "auxiliary_loss_mlp": 0.01000873, "balance_loss_clip": 1.01417351, "balance_loss_mlp": 0.99979991, "epoch": 0.8527625804124331, "flos": 69040132260480.0, "grad_norm": 0.8048091126206192, "language_loss": 0.56422871, "learning_rate": 2.2300020339856497e-07, "loss": 0.58476436, "num_input_tokens_seen": 153506410, "step": 7092, "time_per_iteration": 3.238832473754883 }, { "auxiliary_loss_clip": 0.01134335, "auxiliary_loss_mlp": 0.01024407, "balance_loss_clip": 1.0425086, "balance_loss_mlp": 1.01706934, "epoch": 0.8528828233030722, "flos": 26979399688320.0, "grad_norm": 2.262614103193169, "language_loss": 0.78400278, "learning_rate": 2.2264288616436966e-07, "loss": 0.80559015, "num_input_tokens_seen": 153526665, "step": 7093, "time_per_iteration": 2.6890523433685303 }, { "auxiliary_loss_clip": 0.01134363, "auxiliary_loss_mlp": 0.01029566, "balance_loss_clip": 1.04337466, "balance_loss_mlp": 1.02256227, "epoch": 0.8530030661937112, "flos": 17487598936320.0, "grad_norm": 2.2023646900109033, "language_loss": 0.7245028, "learning_rate": 2.222858385538351e-07, "loss": 0.74614215, "num_input_tokens_seen": 153543465, "step": 7094, "time_per_iteration": 2.7223308086395264 }, { "auxiliary_loss_clip": 0.0114836, "auxiliary_loss_mlp": 0.01021044, "balance_loss_clip": 1.04345775, "balance_loss_mlp": 1.01439774, "epoch": 0.8531233090843504, "flos": 22160798184960.0, "grad_norm": 1.6399211981645385, "language_loss": 0.68004733, "learning_rate": 2.2192906062112527e-07, "loss": 0.70174134, "num_input_tokens_seen": 153563340, "step": 7095, "time_per_iteration": 2.6311800479888916 }, { "auxiliary_loss_clip": 0.01164545, "auxiliary_loss_mlp": 0.01023573, "balance_loss_clip": 1.04381084, "balance_loss_mlp": 1.01725459, "epoch": 0.8532435519749895, "flos": 37635388145280.0, "grad_norm": 1.6584030410558421, "language_loss": 0.70693767, "learning_rate": 2.2157255242036377e-07, "loss": 0.72881883, "num_input_tokens_seen": 153587005, "step": 7096, "time_per_iteration": 2.7560884952545166 }, { "auxiliary_loss_clip": 0.01119321, "auxiliary_loss_mlp": 0.01027299, "balance_loss_clip": 1.04067373, "balance_loss_mlp": 1.01914525, "epoch": 0.8533637948656285, "flos": 21398163598080.0, "grad_norm": 2.076162762638185, "language_loss": 0.74634242, "learning_rate": 2.2121631400563135e-07, "loss": 0.76780868, "num_input_tokens_seen": 153606835, "step": 7097, "time_per_iteration": 2.74774169921875 }, { "auxiliary_loss_clip": 0.0104845, "auxiliary_loss_mlp": 0.01001789, "balance_loss_clip": 1.00860381, "balance_loss_mlp": 1.00074565, "epoch": 0.8534840377562677, "flos": 53345122490880.0, "grad_norm": 0.7930277828687593, "language_loss": 0.52901196, "learning_rate": 2.208603454309701e-07, "loss": 0.54951441, "num_input_tokens_seen": 153664925, "step": 7098, "time_per_iteration": 3.1556763648986816 }, { "auxiliary_loss_clip": 0.01108362, "auxiliary_loss_mlp": 0.01025402, "balance_loss_clip": 1.0405122, "balance_loss_mlp": 1.01813912, "epoch": 0.8536042806469067, "flos": 20814148368000.0, "grad_norm": 3.540980250591467, "language_loss": 0.7061255, "learning_rate": 2.2050464675037994e-07, "loss": 0.72746313, "num_input_tokens_seen": 153683550, "step": 7099, "time_per_iteration": 2.7320668697357178 }, { "auxiliary_loss_clip": 0.01136601, "auxiliary_loss_mlp": 0.01029218, "balance_loss_clip": 1.04442334, "balance_loss_mlp": 1.02215147, "epoch": 0.8537245235375458, "flos": 24681368292480.0, "grad_norm": 2.086214535371896, "language_loss": 0.72838306, "learning_rate": 2.2014921801782016e-07, "loss": 0.75004125, "num_input_tokens_seen": 153703040, "step": 7100, "time_per_iteration": 3.6332132816314697 }, { "auxiliary_loss_clip": 0.01137397, "auxiliary_loss_mlp": 0.01023724, "balance_loss_clip": 1.03944874, "balance_loss_mlp": 1.01654124, "epoch": 0.853844766428185, "flos": 24384817607040.0, "grad_norm": 1.9552713810402769, "language_loss": 0.7394681, "learning_rate": 2.1979405928720872e-07, "loss": 0.76107937, "num_input_tokens_seen": 153722695, "step": 7101, "time_per_iteration": 2.6510138511657715 }, { "auxiliary_loss_clip": 0.01138395, "auxiliary_loss_mlp": 0.0102267, "balance_loss_clip": 1.04154146, "balance_loss_mlp": 1.01558876, "epoch": 0.853965009318824, "flos": 20955707867520.0, "grad_norm": 1.4835219857056998, "language_loss": 0.79484034, "learning_rate": 2.1943917061242257e-07, "loss": 0.81645095, "num_input_tokens_seen": 153742550, "step": 7102, "time_per_iteration": 4.112707853317261 }, { "auxiliary_loss_clip": 0.01158572, "auxiliary_loss_mlp": 0.00762855, "balance_loss_clip": 1.04527473, "balance_loss_mlp": 1.00036097, "epoch": 0.8540852522094631, "flos": 24201816791040.0, "grad_norm": 1.564708585845891, "language_loss": 0.66497183, "learning_rate": 2.1908455204729903e-07, "loss": 0.6841861, "num_input_tokens_seen": 153761700, "step": 7103, "time_per_iteration": 2.779060125350952 }, { "auxiliary_loss_clip": 0.01135353, "auxiliary_loss_mlp": 0.01025456, "balance_loss_clip": 1.04021275, "balance_loss_mlp": 1.01821375, "epoch": 0.8542054951001022, "flos": 25082921410560.0, "grad_norm": 2.1065440857517164, "language_loss": 0.78374982, "learning_rate": 2.1873020364563265e-07, "loss": 0.80535787, "num_input_tokens_seen": 153780765, "step": 7104, "time_per_iteration": 2.723001003265381 }, { "auxiliary_loss_clip": 0.01148081, "auxiliary_loss_mlp": 0.01021871, "balance_loss_clip": 1.04430282, "balance_loss_mlp": 1.01515317, "epoch": 0.8543257379907413, "flos": 24316551809280.0, "grad_norm": 2.400422158840647, "language_loss": 0.76084709, "learning_rate": 2.183761254611789e-07, "loss": 0.78254664, "num_input_tokens_seen": 153801090, "step": 7105, "time_per_iteration": 2.7377679347991943 }, { "auxiliary_loss_clip": 0.01149169, "auxiliary_loss_mlp": 0.01027401, "balance_loss_clip": 1.04580855, "balance_loss_mlp": 1.0206089, "epoch": 0.8544459808813804, "flos": 55286630467200.0, "grad_norm": 2.143652440550313, "language_loss": 0.69949925, "learning_rate": 2.1802231754764987e-07, "loss": 0.72126496, "num_input_tokens_seen": 153826530, "step": 7106, "time_per_iteration": 4.792828559875488 }, { "auxiliary_loss_clip": 0.01137755, "auxiliary_loss_mlp": 0.01024947, "balance_loss_clip": 1.04046607, "balance_loss_mlp": 1.01799083, "epoch": 0.8545662237720195, "flos": 25776248705280.0, "grad_norm": 1.9615746322677676, "language_loss": 0.76389998, "learning_rate": 2.17668779958718e-07, "loss": 0.78552699, "num_input_tokens_seen": 153849110, "step": 7107, "time_per_iteration": 2.67134428024292 }, { "auxiliary_loss_clip": 0.01165664, "auxiliary_loss_mlp": 0.01024269, "balance_loss_clip": 1.04648924, "balance_loss_mlp": 1.01675034, "epoch": 0.8546864666626586, "flos": 11108320427520.0, "grad_norm": 3.3725118989668466, "language_loss": 0.80724597, "learning_rate": 2.1731551274801553e-07, "loss": 0.82914531, "num_input_tokens_seen": 153865550, "step": 7108, "time_per_iteration": 2.6561856269836426 }, { "auxiliary_loss_clip": 0.01138656, "auxiliary_loss_mlp": 0.01022654, "balance_loss_clip": 1.0438087, "balance_loss_mlp": 1.0157876, "epoch": 0.8548067095532976, "flos": 25520169669120.0, "grad_norm": 1.9231641265825687, "language_loss": 0.61299968, "learning_rate": 2.169625159691324e-07, "loss": 0.6346128, "num_input_tokens_seen": 153885425, "step": 7109, "time_per_iteration": 2.714599132537842 }, { "auxiliary_loss_clip": 0.01116069, "auxiliary_loss_mlp": 0.01024428, "balance_loss_clip": 1.03891826, "balance_loss_mlp": 1.01629782, "epoch": 0.8549269524439368, "flos": 24717853532160.0, "grad_norm": 1.9540620734166119, "language_loss": 0.74674201, "learning_rate": 2.1660978967561784e-07, "loss": 0.76814699, "num_input_tokens_seen": 153904760, "step": 7110, "time_per_iteration": 2.840850591659546 }, { "auxiliary_loss_clip": 0.01162628, "auxiliary_loss_mlp": 0.01024728, "balance_loss_clip": 1.04370904, "balance_loss_mlp": 1.01795697, "epoch": 0.8550471953345758, "flos": 19825599191040.0, "grad_norm": 8.319640844703247, "language_loss": 0.78994304, "learning_rate": 2.1625733392098035e-07, "loss": 0.81181657, "num_input_tokens_seen": 153920370, "step": 7111, "time_per_iteration": 2.6063499450683594 }, { "auxiliary_loss_clip": 0.01163846, "auxiliary_loss_mlp": 0.01023666, "balance_loss_clip": 1.04456425, "balance_loss_mlp": 1.01710665, "epoch": 0.8551674382252149, "flos": 22820441500800.0, "grad_norm": 2.0938802725730548, "language_loss": 0.79841703, "learning_rate": 2.159051487586867e-07, "loss": 0.82029212, "num_input_tokens_seen": 153940500, "step": 7112, "time_per_iteration": 2.582059144973755 }, { "auxiliary_loss_clip": 0.01142514, "auxiliary_loss_mlp": 0.01026769, "balance_loss_clip": 1.04594505, "balance_loss_mlp": 1.01888061, "epoch": 0.8552876811158541, "flos": 20631255292800.0, "grad_norm": 2.394891833624902, "language_loss": 0.72912782, "learning_rate": 2.155532342421642e-07, "loss": 0.75082064, "num_input_tokens_seen": 153958500, "step": 7113, "time_per_iteration": 2.6642394065856934 }, { "auxiliary_loss_clip": 0.01156171, "auxiliary_loss_mlp": 0.01029846, "balance_loss_clip": 1.04496002, "balance_loss_mlp": 1.02274752, "epoch": 0.8554079240064931, "flos": 23112359331840.0, "grad_norm": 1.6664629764834973, "language_loss": 0.78411943, "learning_rate": 2.1520159042479636e-07, "loss": 0.80597961, "num_input_tokens_seen": 153976790, "step": 7114, "time_per_iteration": 2.664931535720825 }, { "auxiliary_loss_clip": 0.01149161, "auxiliary_loss_mlp": 0.01025135, "balance_loss_clip": 1.04433656, "balance_loss_mlp": 1.01834917, "epoch": 0.8555281668971322, "flos": 22128047959680.0, "grad_norm": 1.9371589484070706, "language_loss": 0.70808703, "learning_rate": 2.148502173599287e-07, "loss": 0.72982997, "num_input_tokens_seen": 153994930, "step": 7115, "time_per_iteration": 2.603637218475342 }, { "auxiliary_loss_clip": 0.01132291, "auxiliary_loss_mlp": 0.0102597, "balance_loss_clip": 1.04285383, "balance_loss_mlp": 1.01785481, "epoch": 0.8556484097877713, "flos": 31139040234240.0, "grad_norm": 1.6940619583707974, "language_loss": 0.65812707, "learning_rate": 2.1449911510086372e-07, "loss": 0.67970967, "num_input_tokens_seen": 154014400, "step": 7116, "time_per_iteration": 2.746556043624878 }, { "auxiliary_loss_clip": 0.0114909, "auxiliary_loss_mlp": 0.01025169, "balance_loss_clip": 1.04360104, "balance_loss_mlp": 1.01854348, "epoch": 0.8557686526784104, "flos": 24316551809280.0, "grad_norm": 2.0868054024809357, "language_loss": 0.77025682, "learning_rate": 2.141482837008628e-07, "loss": 0.7919994, "num_input_tokens_seen": 154034940, "step": 7117, "time_per_iteration": 2.7009174823760986 }, { "auxiliary_loss_clip": 0.01144076, "auxiliary_loss_mlp": 0.01022784, "balance_loss_clip": 1.04253578, "balance_loss_mlp": 1.01579571, "epoch": 0.8558888955690495, "flos": 17712723427200.0, "grad_norm": 1.8645667339833119, "language_loss": 0.72031069, "learning_rate": 2.1379772321314826e-07, "loss": 0.74197924, "num_input_tokens_seen": 154052985, "step": 7118, "time_per_iteration": 2.709346055984497 }, { "auxiliary_loss_clip": 0.01083568, "auxiliary_loss_mlp": 0.01027137, "balance_loss_clip": 1.03798127, "balance_loss_mlp": 1.02012157, "epoch": 0.8560091384596886, "flos": 19171702051200.0, "grad_norm": 2.168572080530982, "language_loss": 0.82049555, "learning_rate": 2.1344743369089802e-07, "loss": 0.84160262, "num_input_tokens_seen": 154068765, "step": 7119, "time_per_iteration": 2.8381388187408447 }, { "auxiliary_loss_clip": 0.01137011, "auxiliary_loss_mlp": 0.01027397, "balance_loss_clip": 1.04447985, "balance_loss_mlp": 1.02016377, "epoch": 0.8561293813503277, "flos": 23914855036800.0, "grad_norm": 1.7845548049348547, "language_loss": 0.82039738, "learning_rate": 2.130974151872522e-07, "loss": 0.84204143, "num_input_tokens_seen": 154089100, "step": 7120, "time_per_iteration": 2.689147472381592 }, { "auxiliary_loss_clip": 0.01124545, "auxiliary_loss_mlp": 0.01020391, "balance_loss_clip": 1.04212987, "balance_loss_mlp": 1.01365912, "epoch": 0.8562496242409667, "flos": 22529206028160.0, "grad_norm": 1.8358423196909646, "language_loss": 0.78501606, "learning_rate": 2.1274766775530773e-07, "loss": 0.80646539, "num_input_tokens_seen": 154108965, "step": 7121, "time_per_iteration": 2.8025741577148438 }, { "auxiliary_loss_clip": 0.01169745, "auxiliary_loss_mlp": 0.01025096, "balance_loss_clip": 1.04653645, "balance_loss_mlp": 1.01693296, "epoch": 0.8563698671316058, "flos": 14712745472640.0, "grad_norm": 3.0040132047351253, "language_loss": 0.79582512, "learning_rate": 2.1239819144812077e-07, "loss": 0.81777346, "num_input_tokens_seen": 154123425, "step": 7122, "time_per_iteration": 2.608166456222534 }, { "auxiliary_loss_clip": 0.0111509, "auxiliary_loss_mlp": 0.01028697, "balance_loss_clip": 1.03808188, "balance_loss_mlp": 1.02119255, "epoch": 0.856490110022245, "flos": 39167768211840.0, "grad_norm": 1.729891099056536, "language_loss": 0.69833261, "learning_rate": 2.1204898631870716e-07, "loss": 0.71977049, "num_input_tokens_seen": 154148315, "step": 7123, "time_per_iteration": 2.915086030960083 }, { "auxiliary_loss_clip": 0.0113926, "auxiliary_loss_mlp": 0.01021249, "balance_loss_clip": 1.04589844, "balance_loss_mlp": 1.01444852, "epoch": 0.856610352912884, "flos": 29059345658880.0, "grad_norm": 1.740348403264222, "language_loss": 0.7593357, "learning_rate": 2.1170005242004006e-07, "loss": 0.78094077, "num_input_tokens_seen": 154169665, "step": 7124, "time_per_iteration": 2.749047040939331 }, { "auxiliary_loss_clip": 0.01140945, "auxiliary_loss_mlp": 0.01021662, "balance_loss_clip": 1.04142749, "balance_loss_mlp": 1.01517463, "epoch": 0.8567305958035231, "flos": 23878333883520.0, "grad_norm": 1.777643815422862, "language_loss": 0.7831949, "learning_rate": 2.1135138980505384e-07, "loss": 0.80482101, "num_input_tokens_seen": 154190335, "step": 7125, "time_per_iteration": 3.67966365814209 }, { "auxiliary_loss_clip": 0.0113193, "auxiliary_loss_mlp": 0.01024986, "balance_loss_clip": 1.04251599, "balance_loss_mlp": 1.0184381, "epoch": 0.8568508386941622, "flos": 22200120599040.0, "grad_norm": 1.637236405821452, "language_loss": 0.722251, "learning_rate": 2.110029985266395e-07, "loss": 0.74382013, "num_input_tokens_seen": 154210040, "step": 7126, "time_per_iteration": 2.73753023147583 }, { "auxiliary_loss_clip": 0.01142892, "auxiliary_loss_mlp": 0.01023662, "balance_loss_clip": 1.04302812, "balance_loss_mlp": 1.01634896, "epoch": 0.8569710815848013, "flos": 17307507121920.0, "grad_norm": 3.954711404580165, "language_loss": 0.73940802, "learning_rate": 2.1065487863764787e-07, "loss": 0.76107359, "num_input_tokens_seen": 154228385, "step": 7127, "time_per_iteration": 2.7066574096679688 }, { "auxiliary_loss_clip": 0.01100546, "auxiliary_loss_mlp": 0.01023769, "balance_loss_clip": 1.0358429, "balance_loss_mlp": 1.01662242, "epoch": 0.8570913244754403, "flos": 23732285184000.0, "grad_norm": 1.5619274477599348, "language_loss": 0.858603, "learning_rate": 2.1030703019088846e-07, "loss": 0.87984622, "num_input_tokens_seen": 154249015, "step": 7128, "time_per_iteration": 3.8305954933166504 }, { "auxiliary_loss_clip": 0.01146414, "auxiliary_loss_mlp": 0.01019989, "balance_loss_clip": 1.04410338, "balance_loss_mlp": 1.01348591, "epoch": 0.8572115673660795, "flos": 20048748433920.0, "grad_norm": 1.9929363539919407, "language_loss": 0.7077477, "learning_rate": 2.099594532391291e-07, "loss": 0.72941172, "num_input_tokens_seen": 154267700, "step": 7129, "time_per_iteration": 2.6667532920837402 }, { "auxiliary_loss_clip": 0.01142899, "auxiliary_loss_mlp": 0.01025046, "balance_loss_clip": 1.04275405, "balance_loss_mlp": 1.01774466, "epoch": 0.8573318102567186, "flos": 27160389342720.0, "grad_norm": 1.5429759433785004, "language_loss": 0.78972203, "learning_rate": 2.0961214783509806e-07, "loss": 0.81140149, "num_input_tokens_seen": 154290580, "step": 7130, "time_per_iteration": 2.680532217025757 }, { "auxiliary_loss_clip": 0.01139297, "auxiliary_loss_mlp": 0.01023983, "balance_loss_clip": 1.04012167, "balance_loss_mlp": 1.01700318, "epoch": 0.8574520531473576, "flos": 24936585402240.0, "grad_norm": 1.8476976488686694, "language_loss": 0.74702221, "learning_rate": 2.0926511403148051e-07, "loss": 0.768655, "num_input_tokens_seen": 154309545, "step": 7131, "time_per_iteration": 2.7055540084838867 }, { "auxiliary_loss_clip": 0.01131429, "auxiliary_loss_mlp": 0.01020751, "balance_loss_clip": 1.04433978, "balance_loss_mlp": 1.01409006, "epoch": 0.8575722960379968, "flos": 18771154513920.0, "grad_norm": 1.8998857565928924, "language_loss": 0.7549572, "learning_rate": 2.0891835188092143e-07, "loss": 0.77647901, "num_input_tokens_seen": 154326545, "step": 7132, "time_per_iteration": 4.546796560287476 }, { "auxiliary_loss_clip": 0.01130517, "auxiliary_loss_mlp": 0.01028405, "balance_loss_clip": 1.04222476, "balance_loss_mlp": 1.02164006, "epoch": 0.8576925389286358, "flos": 22200300167040.0, "grad_norm": 1.9270937953793448, "language_loss": 0.81542766, "learning_rate": 2.0857186143602434e-07, "loss": 0.83701694, "num_input_tokens_seen": 154345190, "step": 7133, "time_per_iteration": 2.7012603282928467 }, { "auxiliary_loss_clip": 0.01115542, "auxiliary_loss_mlp": 0.01024615, "balance_loss_clip": 1.04092193, "balance_loss_mlp": 1.01736724, "epoch": 0.8578127818192749, "flos": 22894345733760.0, "grad_norm": 1.7233212797318922, "language_loss": 0.67682958, "learning_rate": 2.0822564274935094e-07, "loss": 0.69823116, "num_input_tokens_seen": 154364615, "step": 7134, "time_per_iteration": 2.6446781158447266 }, { "auxiliary_loss_clip": 0.01136986, "auxiliary_loss_mlp": 0.01028975, "balance_loss_clip": 1.04500258, "balance_loss_mlp": 1.02078485, "epoch": 0.8579330247099141, "flos": 34824839541120.0, "grad_norm": 2.055907725644711, "language_loss": 0.67079622, "learning_rate": 2.078796958734239e-07, "loss": 0.69245577, "num_input_tokens_seen": 154387335, "step": 7135, "time_per_iteration": 2.792418956756592 }, { "auxiliary_loss_clip": 0.01150448, "auxiliary_loss_mlp": 0.01023773, "balance_loss_clip": 1.0451895, "balance_loss_mlp": 1.01648366, "epoch": 0.8580532676005531, "flos": 19755681367680.0, "grad_norm": 2.0994298973306997, "language_loss": 0.75208074, "learning_rate": 2.0753402086072124e-07, "loss": 0.77382296, "num_input_tokens_seen": 154405965, "step": 7136, "time_per_iteration": 2.723586082458496 }, { "auxiliary_loss_clip": 0.0109364, "auxiliary_loss_mlp": 0.01028876, "balance_loss_clip": 1.03901255, "balance_loss_mlp": 1.02185142, "epoch": 0.8581735104911922, "flos": 22739318634240.0, "grad_norm": 7.691147728024715, "language_loss": 0.74857146, "learning_rate": 2.071886177636828e-07, "loss": 0.76979661, "num_input_tokens_seen": 154422750, "step": 7137, "time_per_iteration": 2.9094910621643066 }, { "auxiliary_loss_clip": 0.01149475, "auxiliary_loss_mlp": 0.01024923, "balance_loss_clip": 1.04564857, "balance_loss_mlp": 1.01760912, "epoch": 0.8582937533818313, "flos": 23149131880320.0, "grad_norm": 1.903500849230241, "language_loss": 0.82877076, "learning_rate": 2.0684348663470575e-07, "loss": 0.85051465, "num_input_tokens_seen": 154442930, "step": 7138, "time_per_iteration": 2.988851547241211 }, { "auxiliary_loss_clip": 0.01135026, "auxiliary_loss_mlp": 0.01022303, "balance_loss_clip": 1.03903675, "balance_loss_mlp": 1.01420236, "epoch": 0.8584139962724704, "flos": 19498668577920.0, "grad_norm": 1.9087255683714992, "language_loss": 0.6134125, "learning_rate": 2.0649862752614555e-07, "loss": 0.6349858, "num_input_tokens_seen": 154461640, "step": 7139, "time_per_iteration": 2.6334290504455566 }, { "auxiliary_loss_clip": 0.01044564, "auxiliary_loss_mlp": 0.01001559, "balance_loss_clip": 1.01009202, "balance_loss_mlp": 1.00039637, "epoch": 0.8585342391631094, "flos": 71276577788160.0, "grad_norm": 0.785123412092843, "language_loss": 0.56996667, "learning_rate": 2.0615404049031838e-07, "loss": 0.59042799, "num_input_tokens_seen": 154518610, "step": 7140, "time_per_iteration": 3.229763984680176 }, { "auxiliary_loss_clip": 0.011524, "auxiliary_loss_mlp": 0.010243, "balance_loss_clip": 1.045385, "balance_loss_mlp": 1.01652753, "epoch": 0.8586544820537486, "flos": 10815432929280.0, "grad_norm": 2.3789058260143565, "language_loss": 0.7822358, "learning_rate": 2.0580972557949616e-07, "loss": 0.80400276, "num_input_tokens_seen": 154533700, "step": 7141, "time_per_iteration": 2.572829484939575 }, { "auxiliary_loss_clip": 0.0105548, "auxiliary_loss_mlp": 0.01000308, "balance_loss_clip": 1.00970387, "balance_loss_mlp": 0.99925894, "epoch": 0.8587747249443877, "flos": 64811184422400.0, "grad_norm": 0.793529180126071, "language_loss": 0.54204392, "learning_rate": 2.054656828459125e-07, "loss": 0.5626018, "num_input_tokens_seen": 154597810, "step": 7142, "time_per_iteration": 3.251307249069214 }, { "auxiliary_loss_clip": 0.01106037, "auxiliary_loss_mlp": 0.01023177, "balance_loss_clip": 1.04059243, "balance_loss_mlp": 1.0159831, "epoch": 0.8588949678350267, "flos": 26834607964800.0, "grad_norm": 1.9312799700060816, "language_loss": 0.77271008, "learning_rate": 2.051219123417578e-07, "loss": 0.79400223, "num_input_tokens_seen": 154617870, "step": 7143, "time_per_iteration": 2.7867801189422607 }, { "auxiliary_loss_clip": 0.01167322, "auxiliary_loss_mlp": 0.01024897, "balance_loss_clip": 1.04582918, "balance_loss_mlp": 1.01688576, "epoch": 0.8590152107256659, "flos": 26104256726400.0, "grad_norm": 2.215683153906872, "language_loss": 0.60338986, "learning_rate": 2.0477841411918196e-07, "loss": 0.62531209, "num_input_tokens_seen": 154637395, "step": 7144, "time_per_iteration": 2.6101300716400146 }, { "auxiliary_loss_clip": 0.01145377, "auxiliary_loss_mlp": 0.01023639, "balance_loss_clip": 1.04185092, "balance_loss_mlp": 1.01686454, "epoch": 0.859135453616305, "flos": 26140885620480.0, "grad_norm": 1.9594809220030802, "language_loss": 0.74833429, "learning_rate": 2.0443518823029326e-07, "loss": 0.77002442, "num_input_tokens_seen": 154657935, "step": 7145, "time_per_iteration": 2.7533867359161377 }, { "auxiliary_loss_clip": 0.01115643, "auxiliary_loss_mlp": 0.01024488, "balance_loss_clip": 1.03999734, "balance_loss_mlp": 1.01758575, "epoch": 0.859255696506944, "flos": 12969319046400.0, "grad_norm": 2.2634185528288655, "language_loss": 0.76531649, "learning_rate": 2.0409223472715854e-07, "loss": 0.78671777, "num_input_tokens_seen": 154675080, "step": 7146, "time_per_iteration": 2.6953179836273193 }, { "auxiliary_loss_clip": 0.01121682, "auxiliary_loss_mlp": 0.00761505, "balance_loss_clip": 1.04234374, "balance_loss_mlp": 1.00034237, "epoch": 0.8593759393975832, "flos": 18475753063680.0, "grad_norm": 2.2651036618379576, "language_loss": 0.74923873, "learning_rate": 2.0374955366180434e-07, "loss": 0.7680707, "num_input_tokens_seen": 154692720, "step": 7147, "time_per_iteration": 2.6443030834198 }, { "auxiliary_loss_clip": 0.01128121, "auxiliary_loss_mlp": 0.01024694, "balance_loss_clip": 1.04130733, "balance_loss_mlp": 1.01771736, "epoch": 0.8594961822882222, "flos": 22200156512640.0, "grad_norm": 1.698857684184007, "language_loss": 0.72930765, "learning_rate": 2.034071450862147e-07, "loss": 0.7508359, "num_input_tokens_seen": 154710190, "step": 7148, "time_per_iteration": 2.682011842727661 }, { "auxiliary_loss_clip": 0.01141911, "auxiliary_loss_mlp": 0.01030984, "balance_loss_clip": 1.04224062, "balance_loss_mlp": 1.02324772, "epoch": 0.8596164251788613, "flos": 23294749616640.0, "grad_norm": 1.8538445190991428, "language_loss": 0.77062947, "learning_rate": 2.030650090523327e-07, "loss": 0.7923584, "num_input_tokens_seen": 154729380, "step": 7149, "time_per_iteration": 2.6947977542877197 }, { "auxiliary_loss_clip": 0.01117665, "auxiliary_loss_mlp": 0.01029392, "balance_loss_clip": 1.03898692, "balance_loss_mlp": 1.02183664, "epoch": 0.8597366680695004, "flos": 31649905416960.0, "grad_norm": 2.2391420338666252, "language_loss": 0.59360737, "learning_rate": 2.0272314561205995e-07, "loss": 0.61507797, "num_input_tokens_seen": 154749775, "step": 7150, "time_per_iteration": 2.7641103267669678 }, { "auxiliary_loss_clip": 0.0111533, "auxiliary_loss_mlp": 0.01022015, "balance_loss_clip": 1.03594851, "balance_loss_mlp": 1.01482987, "epoch": 0.8598569109601395, "flos": 21287738211840.0, "grad_norm": 1.882668454778394, "language_loss": 0.7275129, "learning_rate": 2.023815548172567e-07, "loss": 0.74888635, "num_input_tokens_seen": 154769845, "step": 7151, "time_per_iteration": 3.66994047164917 }, { "auxiliary_loss_clip": 0.01149508, "auxiliary_loss_mlp": 0.01022741, "balance_loss_clip": 1.04240417, "balance_loss_mlp": 1.01571095, "epoch": 0.8599771538507786, "flos": 25447809720960.0, "grad_norm": 1.9292665823031423, "language_loss": 0.66239846, "learning_rate": 2.0204023671974267e-07, "loss": 0.68412089, "num_input_tokens_seen": 154789230, "step": 7152, "time_per_iteration": 2.6786062717437744 }, { "auxiliary_loss_clip": 0.01146366, "auxiliary_loss_mlp": 0.01026267, "balance_loss_clip": 1.04216826, "balance_loss_mlp": 1.01951361, "epoch": 0.8600973967414177, "flos": 16723958768640.0, "grad_norm": 2.1090072728378266, "language_loss": 0.80818725, "learning_rate": 2.0169919137129532e-07, "loss": 0.8299135, "num_input_tokens_seen": 154807670, "step": 7153, "time_per_iteration": 2.5562543869018555 }, { "auxiliary_loss_clip": 0.01153562, "auxiliary_loss_mlp": 0.0102597, "balance_loss_clip": 1.04688716, "balance_loss_mlp": 1.01822102, "epoch": 0.8602176396320568, "flos": 25227928615680.0, "grad_norm": 2.1889175461436214, "language_loss": 0.7088182, "learning_rate": 2.013584188236508e-07, "loss": 0.73061347, "num_input_tokens_seen": 154825575, "step": 7154, "time_per_iteration": 3.629422903060913 }, { "auxiliary_loss_clip": 0.01167114, "auxiliary_loss_mlp": 0.01027963, "balance_loss_clip": 1.04594326, "balance_loss_mlp": 1.02078664, "epoch": 0.8603378825226958, "flos": 20412236113920.0, "grad_norm": 1.7833954588102878, "language_loss": 0.79483163, "learning_rate": 2.0101791912850396e-07, "loss": 0.81678236, "num_input_tokens_seen": 154845115, "step": 7155, "time_per_iteration": 2.5893263816833496 }, { "auxiliary_loss_clip": 0.01139264, "auxiliary_loss_mlp": 0.01028248, "balance_loss_clip": 1.04567194, "balance_loss_mlp": 1.02096176, "epoch": 0.8604581254133349, "flos": 34930201109760.0, "grad_norm": 2.0816786865580155, "language_loss": 0.64101493, "learning_rate": 2.006776923375082e-07, "loss": 0.66269004, "num_input_tokens_seen": 154866770, "step": 7156, "time_per_iteration": 2.758556604385376 }, { "auxiliary_loss_clip": 0.01163413, "auxiliary_loss_mlp": 0.01019268, "balance_loss_clip": 1.04401803, "balance_loss_mlp": 1.01216316, "epoch": 0.860578368303974, "flos": 22596538072320.0, "grad_norm": 1.7339618308165046, "language_loss": 0.71121514, "learning_rate": 2.003377385022764e-07, "loss": 0.733042, "num_input_tokens_seen": 154885595, "step": 7157, "time_per_iteration": 3.5030624866485596 }, { "auxiliary_loss_clip": 0.01137622, "auxiliary_loss_mlp": 0.01025614, "balance_loss_clip": 1.04236126, "balance_loss_mlp": 1.01883352, "epoch": 0.8606986111946131, "flos": 21324331192320.0, "grad_norm": 2.145581417479779, "language_loss": 0.77588087, "learning_rate": 1.9999805767437826e-07, "loss": 0.79751325, "num_input_tokens_seen": 154904485, "step": 7158, "time_per_iteration": 2.6859681606292725 }, { "auxiliary_loss_clip": 0.01129234, "auxiliary_loss_mlp": 0.01022119, "balance_loss_clip": 1.04004693, "balance_loss_mlp": 1.01522541, "epoch": 0.8608188540852522, "flos": 28877206769280.0, "grad_norm": 1.716529933886509, "language_loss": 0.71930081, "learning_rate": 1.9965864990534386e-07, "loss": 0.74081433, "num_input_tokens_seen": 154925010, "step": 7159, "time_per_iteration": 3.5790255069732666 }, { "auxiliary_loss_clip": 0.01116766, "auxiliary_loss_mlp": 0.01022545, "balance_loss_clip": 1.03792489, "balance_loss_mlp": 1.01536274, "epoch": 0.8609390969758913, "flos": 29716187713920.0, "grad_norm": 1.9911643598488806, "language_loss": 0.77608061, "learning_rate": 1.9931951524666092e-07, "loss": 0.79747367, "num_input_tokens_seen": 154946100, "step": 7160, "time_per_iteration": 2.75137996673584 }, { "auxiliary_loss_clip": 0.01155995, "auxiliary_loss_mlp": 0.00762244, "balance_loss_clip": 1.04518652, "balance_loss_mlp": 1.00033522, "epoch": 0.8610593398665304, "flos": 21249349551360.0, "grad_norm": 1.845537916430274, "language_loss": 0.81207287, "learning_rate": 1.9898065374977534e-07, "loss": 0.8312552, "num_input_tokens_seen": 154966305, "step": 7161, "time_per_iteration": 2.657897710800171 }, { "auxiliary_loss_clip": 0.01121344, "auxiliary_loss_mlp": 0.01017375, "balance_loss_clip": 1.03978992, "balance_loss_mlp": 1.01117277, "epoch": 0.8611795827571694, "flos": 14830102183680.0, "grad_norm": 2.084375488838697, "language_loss": 0.72797596, "learning_rate": 1.9864206546609342e-07, "loss": 0.74936318, "num_input_tokens_seen": 154985145, "step": 7162, "time_per_iteration": 2.6752803325653076 }, { "auxiliary_loss_clip": 0.0116406, "auxiliary_loss_mlp": 0.01022987, "balance_loss_clip": 1.04373062, "balance_loss_mlp": 1.01629877, "epoch": 0.8612998256478086, "flos": 24243258107520.0, "grad_norm": 1.831737609914979, "language_loss": 0.84219527, "learning_rate": 1.983037504469771e-07, "loss": 0.86406577, "num_input_tokens_seen": 155003855, "step": 7163, "time_per_iteration": 2.5494441986083984 }, { "auxiliary_loss_clip": 0.01151161, "auxiliary_loss_mlp": 0.01029651, "balance_loss_clip": 1.04430914, "balance_loss_mlp": 1.02259135, "epoch": 0.8614200685384477, "flos": 21252653602560.0, "grad_norm": 1.832595670008331, "language_loss": 0.66648054, "learning_rate": 1.9796570874374984e-07, "loss": 0.68828863, "num_input_tokens_seen": 155023960, "step": 7164, "time_per_iteration": 2.5694549083709717 }, { "auxiliary_loss_clip": 0.01140723, "auxiliary_loss_mlp": 0.01027012, "balance_loss_clip": 1.0433042, "balance_loss_mlp": 1.02004445, "epoch": 0.8615403114290867, "flos": 20007738080640.0, "grad_norm": 1.6617486799965422, "language_loss": 0.77754819, "learning_rate": 1.976279404076917e-07, "loss": 0.79922551, "num_input_tokens_seen": 155043360, "step": 7165, "time_per_iteration": 2.5954620838165283 }, { "auxiliary_loss_clip": 0.01122546, "auxiliary_loss_mlp": 0.01026582, "balance_loss_clip": 1.04318786, "balance_loss_mlp": 1.01891696, "epoch": 0.8616605543197259, "flos": 29789373674880.0, "grad_norm": 1.856242811945421, "language_loss": 0.76453954, "learning_rate": 1.9729044549004193e-07, "loss": 0.78603083, "num_input_tokens_seen": 155064745, "step": 7166, "time_per_iteration": 2.6513164043426514 }, { "auxiliary_loss_clip": 0.01149459, "auxiliary_loss_mlp": 0.01025069, "balance_loss_clip": 1.043733, "balance_loss_mlp": 1.01859629, "epoch": 0.8617807972103649, "flos": 28911609020160.0, "grad_norm": 1.7675755679526302, "language_loss": 0.70370877, "learning_rate": 1.9695322404199822e-07, "loss": 0.72545397, "num_input_tokens_seen": 155086790, "step": 7167, "time_per_iteration": 2.6939780712127686 }, { "auxiliary_loss_clip": 0.01134043, "auxiliary_loss_mlp": 0.010261, "balance_loss_clip": 1.04174268, "balance_loss_mlp": 1.01837277, "epoch": 0.861901040101004, "flos": 27673804391040.0, "grad_norm": 1.9456807754740857, "language_loss": 0.82074732, "learning_rate": 1.9661627611471654e-07, "loss": 0.84234869, "num_input_tokens_seen": 155106585, "step": 7168, "time_per_iteration": 2.7579081058502197 }, { "auxiliary_loss_clip": 0.01144509, "auxiliary_loss_mlp": 0.0102288, "balance_loss_clip": 1.04288769, "balance_loss_mlp": 1.01531577, "epoch": 0.8620212829916432, "flos": 49748056755840.0, "grad_norm": 16.158414512943875, "language_loss": 0.70402759, "learning_rate": 1.9627960175931246e-07, "loss": 0.72570145, "num_input_tokens_seen": 155131285, "step": 7169, "time_per_iteration": 2.940899610519409 }, { "auxiliary_loss_clip": 0.01152602, "auxiliary_loss_mlp": 0.01027342, "balance_loss_clip": 1.04576457, "balance_loss_mlp": 1.02081847, "epoch": 0.8621415258822822, "flos": 21138672769920.0, "grad_norm": 2.6496488080516363, "language_loss": 0.74570143, "learning_rate": 1.9594320102685847e-07, "loss": 0.76750082, "num_input_tokens_seen": 155150555, "step": 7170, "time_per_iteration": 2.6414034366607666 }, { "auxiliary_loss_clip": 0.01127143, "auxiliary_loss_mlp": 0.00762062, "balance_loss_clip": 1.04004812, "balance_loss_mlp": 1.00034177, "epoch": 0.8622617687729213, "flos": 21689039934720.0, "grad_norm": 2.6527758173417766, "language_loss": 0.64341938, "learning_rate": 1.956070739683864e-07, "loss": 0.66231143, "num_input_tokens_seen": 155169890, "step": 7171, "time_per_iteration": 2.6548707485198975 }, { "auxiliary_loss_clip": 0.01106596, "auxiliary_loss_mlp": 0.01019376, "balance_loss_clip": 1.03721023, "balance_loss_mlp": 1.0127598, "epoch": 0.8623820116635604, "flos": 26250592734720.0, "grad_norm": 1.5085956852081983, "language_loss": 0.74213332, "learning_rate": 1.9527122063488678e-07, "loss": 0.76339298, "num_input_tokens_seen": 155191005, "step": 7172, "time_per_iteration": 2.7063567638397217 }, { "auxiliary_loss_clip": 0.01133647, "auxiliary_loss_mlp": 0.01022562, "balance_loss_clip": 1.0385344, "balance_loss_mlp": 1.01534057, "epoch": 0.8625022545541995, "flos": 19647554451840.0, "grad_norm": 1.7140280899912888, "language_loss": 0.80425847, "learning_rate": 1.9493564107730755e-07, "loss": 0.82582057, "num_input_tokens_seen": 155211005, "step": 7173, "time_per_iteration": 2.6357126235961914 }, { "auxiliary_loss_clip": 0.01131145, "auxiliary_loss_mlp": 0.01023876, "balance_loss_clip": 1.03975296, "balance_loss_mlp": 1.01717353, "epoch": 0.8626224974448385, "flos": 21908382336000.0, "grad_norm": 1.9049073428580094, "language_loss": 0.60879517, "learning_rate": 1.9460033534655684e-07, "loss": 0.63034534, "num_input_tokens_seen": 155230365, "step": 7174, "time_per_iteration": 2.6813619136810303 }, { "auxiliary_loss_clip": 0.01130539, "auxiliary_loss_mlp": 0.01025889, "balance_loss_clip": 1.03774881, "balance_loss_mlp": 1.0189271, "epoch": 0.8627427403354777, "flos": 23331198942720.0, "grad_norm": 1.635754109214876, "language_loss": 0.84502518, "learning_rate": 1.9426530349349978e-07, "loss": 0.86658955, "num_input_tokens_seen": 155250815, "step": 7175, "time_per_iteration": 2.686558485031128 }, { "auxiliary_loss_clip": 0.01148936, "auxiliary_loss_mlp": 0.00761872, "balance_loss_clip": 1.042647, "balance_loss_mlp": 1.00031614, "epoch": 0.8628629832261168, "flos": 16362877299840.0, "grad_norm": 1.855064150178447, "language_loss": 0.64787221, "learning_rate": 1.9393054556896038e-07, "loss": 0.66698027, "num_input_tokens_seen": 155268515, "step": 7176, "time_per_iteration": 2.6345980167388916 }, { "auxiliary_loss_clip": 0.01121205, "auxiliary_loss_mlp": 0.01025425, "balance_loss_clip": 1.04006135, "balance_loss_mlp": 1.01824558, "epoch": 0.8629832261167558, "flos": 28103941756800.0, "grad_norm": 2.336250220930642, "language_loss": 0.68790728, "learning_rate": 1.9359606162372133e-07, "loss": 0.70937353, "num_input_tokens_seen": 155290120, "step": 7177, "time_per_iteration": 3.771299123764038 }, { "auxiliary_loss_clip": 0.01165493, "auxiliary_loss_mlp": 0.01020642, "balance_loss_clip": 1.0470736, "balance_loss_mlp": 1.01319122, "epoch": 0.863103469007395, "flos": 20230061310720.0, "grad_norm": 1.7739054333130502, "language_loss": 0.70558107, "learning_rate": 1.9326185170852293e-07, "loss": 0.72744244, "num_input_tokens_seen": 155309085, "step": 7178, "time_per_iteration": 2.6541996002197266 }, { "auxiliary_loss_clip": 0.01150113, "auxiliary_loss_mlp": 0.01026302, "balance_loss_clip": 1.04409266, "balance_loss_mlp": 1.0191052, "epoch": 0.863223711898034, "flos": 24498547044480.0, "grad_norm": 1.9369858039736345, "language_loss": 0.72229439, "learning_rate": 1.9292791587406598e-07, "loss": 0.74405855, "num_input_tokens_seen": 155327945, "step": 7179, "time_per_iteration": 2.672797203063965 }, { "auxiliary_loss_clip": 0.01149387, "auxiliary_loss_mlp": 0.00762354, "balance_loss_clip": 1.04306555, "balance_loss_mlp": 1.00031364, "epoch": 0.8633439547886731, "flos": 17675376261120.0, "grad_norm": 2.09837599603976, "language_loss": 0.86848027, "learning_rate": 1.9259425417100661e-07, "loss": 0.88759762, "num_input_tokens_seen": 155344060, "step": 7180, "time_per_iteration": 3.5816879272460938 }, { "auxiliary_loss_clip": 0.01088739, "auxiliary_loss_mlp": 0.01026757, "balance_loss_clip": 1.03139722, "balance_loss_mlp": 1.01927078, "epoch": 0.8634641976793123, "flos": 12895055677440.0, "grad_norm": 1.9428146577784717, "language_loss": 0.74645793, "learning_rate": 1.9226086664996234e-07, "loss": 0.76761281, "num_input_tokens_seen": 155362305, "step": 7181, "time_per_iteration": 2.718672752380371 }, { "auxiliary_loss_clip": 0.011415, "auxiliary_loss_mlp": 0.01018964, "balance_loss_clip": 1.04517245, "balance_loss_mlp": 1.01209772, "epoch": 0.8635844405699513, "flos": 23878980328320.0, "grad_norm": 3.613124617743116, "language_loss": 0.7414974, "learning_rate": 1.9192775336150712e-07, "loss": 0.76310205, "num_input_tokens_seen": 155382605, "step": 7182, "time_per_iteration": 2.690648078918457 }, { "auxiliary_loss_clip": 0.0104943, "auxiliary_loss_mlp": 0.01002038, "balance_loss_clip": 1.00822604, "balance_loss_mlp": 1.00095963, "epoch": 0.8637046834605904, "flos": 60453387521280.0, "grad_norm": 0.7622576332722774, "language_loss": 0.56289518, "learning_rate": 1.915949143561739e-07, "loss": 0.58340991, "num_input_tokens_seen": 155437280, "step": 7183, "time_per_iteration": 4.05260443687439 }, { "auxiliary_loss_clip": 0.0115187, "auxiliary_loss_mlp": 0.01030617, "balance_loss_clip": 1.0453651, "balance_loss_mlp": 1.02299023, "epoch": 0.8638249263512295, "flos": 20558751690240.0, "grad_norm": 1.7627645974046182, "language_loss": 0.7767421, "learning_rate": 1.9126234968445498e-07, "loss": 0.79856694, "num_input_tokens_seen": 155456970, "step": 7184, "time_per_iteration": 3.543426513671875 }, { "auxiliary_loss_clip": 0.01167325, "auxiliary_loss_mlp": 0.01025116, "balance_loss_clip": 1.04744649, "balance_loss_mlp": 1.01844347, "epoch": 0.8639451692418686, "flos": 26615768353920.0, "grad_norm": 1.5606417652399565, "language_loss": 0.67630446, "learning_rate": 1.9093005939679884e-07, "loss": 0.69822884, "num_input_tokens_seen": 155478925, "step": 7185, "time_per_iteration": 2.6614227294921875 }, { "auxiliary_loss_clip": 0.01151543, "auxiliary_loss_mlp": 0.01027733, "balance_loss_clip": 1.04621899, "balance_loss_mlp": 1.0204103, "epoch": 0.8640654121325076, "flos": 15122450977920.0, "grad_norm": 1.8831870948660911, "language_loss": 0.76740342, "learning_rate": 1.9059804354361452e-07, "loss": 0.78919613, "num_input_tokens_seen": 155496700, "step": 7186, "time_per_iteration": 2.5526468753814697 }, { "auxiliary_loss_clip": 0.01129767, "auxiliary_loss_mlp": 0.01022783, "balance_loss_clip": 1.03927016, "balance_loss_mlp": 1.0160979, "epoch": 0.8641856550231467, "flos": 31869068250240.0, "grad_norm": 2.3492456012954324, "language_loss": 0.70625961, "learning_rate": 1.902663021752684e-07, "loss": 0.72778517, "num_input_tokens_seen": 155518130, "step": 7187, "time_per_iteration": 2.7872605323791504 }, { "auxiliary_loss_clip": 0.01167846, "auxiliary_loss_mlp": 0.01026486, "balance_loss_clip": 1.04684472, "balance_loss_mlp": 1.01898789, "epoch": 0.8643058979137859, "flos": 14976545932800.0, "grad_norm": 2.2005799720253862, "language_loss": 0.82768297, "learning_rate": 1.8993483534208556e-07, "loss": 0.8496263, "num_input_tokens_seen": 155537040, "step": 7188, "time_per_iteration": 2.691890001296997 }, { "auxiliary_loss_clip": 0.01130549, "auxiliary_loss_mlp": 0.01021462, "balance_loss_clip": 1.04274797, "balance_loss_mlp": 1.01439333, "epoch": 0.8644261408044249, "flos": 13115726881920.0, "grad_norm": 2.495818442137677, "language_loss": 0.75188887, "learning_rate": 1.8960364309434884e-07, "loss": 0.77340901, "num_input_tokens_seen": 155554535, "step": 7189, "time_per_iteration": 2.6328091621398926 }, { "auxiliary_loss_clip": 0.01091966, "auxiliary_loss_mlp": 0.0076175, "balance_loss_clip": 1.03875136, "balance_loss_mlp": 1.00033545, "epoch": 0.864546383695064, "flos": 20850920916480.0, "grad_norm": 1.7993057065347131, "language_loss": 0.77976131, "learning_rate": 1.8927272548229967e-07, "loss": 0.79829854, "num_input_tokens_seen": 155574225, "step": 7190, "time_per_iteration": 2.794088125228882 }, { "auxiliary_loss_clip": 0.01107659, "auxiliary_loss_mlp": 0.01029736, "balance_loss_clip": 1.0389297, "balance_loss_mlp": 1.02262259, "epoch": 0.8646666265857031, "flos": 21324582587520.0, "grad_norm": 1.7727385853435105, "language_loss": 0.83027005, "learning_rate": 1.8894208255613876e-07, "loss": 0.85164404, "num_input_tokens_seen": 155593540, "step": 7191, "time_per_iteration": 2.7616939544677734 }, { "auxiliary_loss_clip": 0.01164931, "auxiliary_loss_mlp": 0.01023697, "balance_loss_clip": 1.04640579, "balance_loss_mlp": 1.01684237, "epoch": 0.8647868694763422, "flos": 19750833031680.0, "grad_norm": 1.858189988932058, "language_loss": 0.7791276, "learning_rate": 1.8861171436602397e-07, "loss": 0.80101389, "num_input_tokens_seen": 155610655, "step": 7192, "time_per_iteration": 2.594428777694702 }, { "auxiliary_loss_clip": 0.01153888, "auxiliary_loss_mlp": 0.01024697, "balance_loss_clip": 1.04623413, "balance_loss_mlp": 1.01759553, "epoch": 0.8649071123669813, "flos": 26176760328960.0, "grad_norm": 2.4900749878375836, "language_loss": 0.80438733, "learning_rate": 1.882816209620719e-07, "loss": 0.82617325, "num_input_tokens_seen": 155627365, "step": 7193, "time_per_iteration": 2.7147698402404785 }, { "auxiliary_loss_clip": 0.01141652, "auxiliary_loss_mlp": 0.01028338, "balance_loss_clip": 1.04749024, "balance_loss_mlp": 1.02108717, "epoch": 0.8650273552576204, "flos": 20302888135680.0, "grad_norm": 1.901977928179352, "language_loss": 0.76730353, "learning_rate": 1.8795180239435738e-07, "loss": 0.78900349, "num_input_tokens_seen": 155646220, "step": 7194, "time_per_iteration": 2.7062807083129883 }, { "auxiliary_loss_clip": 0.0114419, "auxiliary_loss_mlp": 0.01026291, "balance_loss_clip": 1.04384005, "balance_loss_mlp": 1.01916862, "epoch": 0.8651475981482595, "flos": 23951088881280.0, "grad_norm": 3.9397989089396757, "language_loss": 0.7558893, "learning_rate": 1.8762225871291348e-07, "loss": 0.77759415, "num_input_tokens_seen": 155662095, "step": 7195, "time_per_iteration": 2.7234749794006348 }, { "auxiliary_loss_clip": 0.01164864, "auxiliary_loss_mlp": 0.00762002, "balance_loss_clip": 1.04532468, "balance_loss_mlp": 1.00028634, "epoch": 0.8652678410388985, "flos": 21684622561920.0, "grad_norm": 1.8001298630211915, "language_loss": 0.8074317, "learning_rate": 1.8729298996773201e-07, "loss": 0.82670027, "num_input_tokens_seen": 155680845, "step": 7196, "time_per_iteration": 2.663961410522461 }, { "auxiliary_loss_clip": 0.01049082, "auxiliary_loss_mlp": 0.01001778, "balance_loss_clip": 1.00929379, "balance_loss_mlp": 1.0007056, "epoch": 0.8653880839295377, "flos": 65224660855680.0, "grad_norm": 0.830508887732276, "language_loss": 0.60882175, "learning_rate": 1.8696399620876301e-07, "loss": 0.6293304, "num_input_tokens_seen": 155737875, "step": 7197, "time_per_iteration": 3.157330274581909 }, { "auxiliary_loss_clip": 0.0111843, "auxiliary_loss_mlp": 0.01026771, "balance_loss_clip": 1.03613019, "balance_loss_mlp": 1.01905274, "epoch": 0.8655083268201768, "flos": 17749172753280.0, "grad_norm": 2.6192545803710505, "language_loss": 0.78811312, "learning_rate": 1.866352774859141e-07, "loss": 0.80956513, "num_input_tokens_seen": 155753100, "step": 7198, "time_per_iteration": 2.7168991565704346 }, { "auxiliary_loss_clip": 0.0112618, "auxiliary_loss_mlp": 0.01021117, "balance_loss_clip": 1.03973413, "balance_loss_mlp": 1.01481056, "epoch": 0.8656285697108158, "flos": 20703974376960.0, "grad_norm": 2.195829167103854, "language_loss": 0.69223666, "learning_rate": 1.8630683384905188e-07, "loss": 0.71370959, "num_input_tokens_seen": 155772430, "step": 7199, "time_per_iteration": 2.795522689819336 }, { "auxiliary_loss_clip": 0.01165234, "auxiliary_loss_mlp": 0.00762218, "balance_loss_clip": 1.04666197, "balance_loss_mlp": 1.00031996, "epoch": 0.865748812601455, "flos": 18653833716480.0, "grad_norm": 1.995233305081438, "language_loss": 0.88604444, "learning_rate": 1.8597866534800045e-07, "loss": 0.90531898, "num_input_tokens_seen": 155787545, "step": 7200, "time_per_iteration": 2.6190578937530518 }, { "auxiliary_loss_clip": 0.01153167, "auxiliary_loss_mlp": 0.00762255, "balance_loss_clip": 1.04418015, "balance_loss_mlp": 1.0003581, "epoch": 0.865869055492094, "flos": 70652554807680.0, "grad_norm": 2.1468728449959924, "language_loss": 0.74912989, "learning_rate": 1.8565077203254398e-07, "loss": 0.76828414, "num_input_tokens_seen": 155813005, "step": 7201, "time_per_iteration": 3.031463146209717 }, { "auxiliary_loss_clip": 0.0112532, "auxiliary_loss_mlp": 0.01023572, "balance_loss_clip": 1.04522526, "balance_loss_mlp": 1.01571035, "epoch": 0.8659892983827331, "flos": 17383961220480.0, "grad_norm": 3.655888372550851, "language_loss": 0.72210425, "learning_rate": 1.8532315395242203e-07, "loss": 0.74359322, "num_input_tokens_seen": 155829455, "step": 7202, "time_per_iteration": 2.664135456085205 }, { "auxiliary_loss_clip": 0.01127116, "auxiliary_loss_mlp": 0.01022045, "balance_loss_clip": 1.0418725, "balance_loss_mlp": 1.01562858, "epoch": 0.8661095412733723, "flos": 17895221452800.0, "grad_norm": 2.0166007637412906, "language_loss": 0.72529364, "learning_rate": 1.849958111573353e-07, "loss": 0.74678528, "num_input_tokens_seen": 155848060, "step": 7203, "time_per_iteration": 3.616074800491333 }, { "auxiliary_loss_clip": 0.01162221, "auxiliary_loss_mlp": 0.01027761, "balance_loss_clip": 1.04477561, "balance_loss_mlp": 1.02032518, "epoch": 0.8662297841640113, "flos": 18224163227520.0, "grad_norm": 1.931478183790679, "language_loss": 0.64171195, "learning_rate": 1.8466874369694074e-07, "loss": 0.66361177, "num_input_tokens_seen": 155865755, "step": 7204, "time_per_iteration": 2.596609592437744 }, { "auxiliary_loss_clip": 0.01122692, "auxiliary_loss_mlp": 0.01022803, "balance_loss_clip": 1.03742886, "balance_loss_mlp": 1.01584756, "epoch": 0.8663500270546504, "flos": 16362159027840.0, "grad_norm": 4.022737778391404, "language_loss": 0.6999073, "learning_rate": 1.843419516208542e-07, "loss": 0.72136235, "num_input_tokens_seen": 155882680, "step": 7205, "time_per_iteration": 2.7297699451446533 }, { "auxiliary_loss_clip": 0.01154197, "auxiliary_loss_mlp": 0.01024433, "balance_loss_clip": 1.04617679, "balance_loss_mlp": 1.01660061, "epoch": 0.8664702699452895, "flos": 17894431353600.0, "grad_norm": 2.1236357310180662, "language_loss": 0.79577732, "learning_rate": 1.8401543497865047e-07, "loss": 0.81756359, "num_input_tokens_seen": 155900680, "step": 7206, "time_per_iteration": 3.646923065185547 }, { "auxiliary_loss_clip": 0.01152389, "auxiliary_loss_mlp": 0.00762054, "balance_loss_clip": 1.04297161, "balance_loss_mlp": 1.00031734, "epoch": 0.8665905128359286, "flos": 30736373794560.0, "grad_norm": 2.6056777347025983, "language_loss": 0.64004844, "learning_rate": 1.836891938198608e-07, "loss": 0.65919286, "num_input_tokens_seen": 155921105, "step": 7207, "time_per_iteration": 2.723679304122925 }, { "auxiliary_loss_clip": 0.01136782, "auxiliary_loss_mlp": 0.01022211, "balance_loss_clip": 1.04313457, "balance_loss_mlp": 1.01520395, "epoch": 0.8667107557265676, "flos": 18656419495680.0, "grad_norm": 2.2396151388134906, "language_loss": 0.7114675, "learning_rate": 1.8336322819397677e-07, "loss": 0.7330575, "num_input_tokens_seen": 155938640, "step": 7208, "time_per_iteration": 2.6658427715301514 }, { "auxiliary_loss_clip": 0.01125988, "auxiliary_loss_mlp": 0.01025288, "balance_loss_clip": 1.03821874, "balance_loss_mlp": 1.01765847, "epoch": 0.8668309986172068, "flos": 20083725302400.0, "grad_norm": 1.7757728704907143, "language_loss": 0.62684739, "learning_rate": 1.8303753815044654e-07, "loss": 0.64836007, "num_input_tokens_seen": 155957945, "step": 7209, "time_per_iteration": 3.5742831230163574 }, { "auxiliary_loss_clip": 0.01148961, "auxiliary_loss_mlp": 0.0102705, "balance_loss_clip": 1.04438746, "balance_loss_mlp": 1.01922143, "epoch": 0.8669512415078459, "flos": 21615099788160.0, "grad_norm": 2.3340052585193938, "language_loss": 0.70265317, "learning_rate": 1.827121237386773e-07, "loss": 0.72441328, "num_input_tokens_seen": 155975390, "step": 7210, "time_per_iteration": 3.5693185329437256 }, { "auxiliary_loss_clip": 0.01138295, "auxiliary_loss_mlp": 0.01022976, "balance_loss_clip": 1.04270625, "balance_loss_mlp": 1.0154537, "epoch": 0.8670714843984849, "flos": 17703601372800.0, "grad_norm": 2.872965685661625, "language_loss": 0.7505762, "learning_rate": 1.8238698500803374e-07, "loss": 0.7721889, "num_input_tokens_seen": 155988155, "step": 7211, "time_per_iteration": 2.6012954711914062 }, { "auxiliary_loss_clip": 0.01056229, "auxiliary_loss_mlp": 0.0100083, "balance_loss_clip": 1.01030433, "balance_loss_mlp": 0.99973947, "epoch": 0.8671917272891241, "flos": 60705483125760.0, "grad_norm": 0.7205851947943182, "language_loss": 0.56258273, "learning_rate": 1.820621220078391e-07, "loss": 0.58315331, "num_input_tokens_seen": 156052065, "step": 7212, "time_per_iteration": 3.2570459842681885 }, { "auxiliary_loss_clip": 0.01166263, "auxiliary_loss_mlp": 0.0102452, "balance_loss_clip": 1.04535317, "balance_loss_mlp": 1.01686716, "epoch": 0.8673119701797631, "flos": 20451881750400.0, "grad_norm": 1.6422282465976248, "language_loss": 0.67696047, "learning_rate": 1.8173753478737553e-07, "loss": 0.69886827, "num_input_tokens_seen": 156072500, "step": 7213, "time_per_iteration": 2.6624302864074707 }, { "auxiliary_loss_clip": 0.01167599, "auxiliary_loss_mlp": 0.01023269, "balance_loss_clip": 1.04588246, "balance_loss_mlp": 1.01527572, "epoch": 0.8674322130704022, "flos": 19647410797440.0, "grad_norm": 2.2210793817714927, "language_loss": 0.79784238, "learning_rate": 1.8141322339588205e-07, "loss": 0.81975108, "num_input_tokens_seen": 156089840, "step": 7214, "time_per_iteration": 2.553856372833252 }, { "auxiliary_loss_clip": 0.01164467, "auxiliary_loss_mlp": 0.01024759, "balance_loss_clip": 1.04610479, "balance_loss_mlp": 1.01754737, "epoch": 0.8675524559610414, "flos": 26025001367040.0, "grad_norm": 2.240022714926397, "language_loss": 0.70142567, "learning_rate": 1.810891878825569e-07, "loss": 0.72331792, "num_input_tokens_seen": 156109815, "step": 7215, "time_per_iteration": 2.6321849822998047 }, { "auxiliary_loss_clip": 0.01137748, "auxiliary_loss_mlp": 0.01029971, "balance_loss_clip": 1.04171181, "balance_loss_mlp": 1.02256823, "epoch": 0.8676726988516804, "flos": 15049444584960.0, "grad_norm": 2.002437494060461, "language_loss": 0.71926832, "learning_rate": 1.8076542829655561e-07, "loss": 0.74094546, "num_input_tokens_seen": 156128620, "step": 7216, "time_per_iteration": 2.6315650939941406 }, { "auxiliary_loss_clip": 0.01138441, "auxiliary_loss_mlp": 0.01024903, "balance_loss_clip": 1.04438508, "balance_loss_mlp": 1.01757753, "epoch": 0.8677929417423195, "flos": 16288111140480.0, "grad_norm": 2.132253532979228, "language_loss": 0.79346716, "learning_rate": 1.8044194468699203e-07, "loss": 0.81510061, "num_input_tokens_seen": 156145930, "step": 7217, "time_per_iteration": 2.6922640800476074 }, { "auxiliary_loss_clip": 0.01136824, "auxiliary_loss_mlp": 0.01022512, "balance_loss_clip": 1.04491985, "balance_loss_mlp": 1.01522875, "epoch": 0.8679131846329585, "flos": 18844160906880.0, "grad_norm": 2.8580772045960576, "language_loss": 0.75840545, "learning_rate": 1.8011873710293912e-07, "loss": 0.77999878, "num_input_tokens_seen": 156164435, "step": 7218, "time_per_iteration": 2.6042444705963135 }, { "auxiliary_loss_clip": 0.01149284, "auxiliary_loss_mlp": 0.01029951, "balance_loss_clip": 1.04486108, "balance_loss_mlp": 1.02225924, "epoch": 0.8680334275235977, "flos": 33620718890880.0, "grad_norm": 3.8647837018938676, "language_loss": 0.6914556, "learning_rate": 1.7979580559342677e-07, "loss": 0.71324795, "num_input_tokens_seen": 156185165, "step": 7219, "time_per_iteration": 2.7507784366607666 }, { "auxiliary_loss_clip": 0.01135776, "auxiliary_loss_mlp": 0.01025354, "balance_loss_clip": 1.04287779, "balance_loss_mlp": 1.01808548, "epoch": 0.8681536704142367, "flos": 24681152810880.0, "grad_norm": 1.6907430229472833, "language_loss": 0.66703421, "learning_rate": 1.7947315020744358e-07, "loss": 0.68864548, "num_input_tokens_seen": 156206260, "step": 7220, "time_per_iteration": 2.720121383666992 }, { "auxiliary_loss_clip": 0.0113593, "auxiliary_loss_mlp": 0.01021411, "balance_loss_clip": 1.04156065, "balance_loss_mlp": 1.01510811, "epoch": 0.8682739133048758, "flos": 20011042131840.0, "grad_norm": 2.212574643950438, "language_loss": 0.80232763, "learning_rate": 1.7915077099393594e-07, "loss": 0.82390106, "num_input_tokens_seen": 156222860, "step": 7221, "time_per_iteration": 2.745957136154175 }, { "auxiliary_loss_clip": 0.01153586, "auxiliary_loss_mlp": 0.01023046, "balance_loss_clip": 1.04350495, "balance_loss_mlp": 1.01583123, "epoch": 0.868394156195515, "flos": 16654759217280.0, "grad_norm": 2.0511217319288586, "language_loss": 0.73335361, "learning_rate": 1.788286680018083e-07, "loss": 0.75511992, "num_input_tokens_seen": 156241570, "step": 7222, "time_per_iteration": 2.718672275543213 }, { "auxiliary_loss_clip": 0.01140234, "auxiliary_loss_mlp": 0.01023217, "balance_loss_clip": 1.044065, "balance_loss_mlp": 1.016047, "epoch": 0.868514399086154, "flos": 28001381448960.0, "grad_norm": 1.6637833227843744, "language_loss": 0.72353375, "learning_rate": 1.7850684127992443e-07, "loss": 0.74516827, "num_input_tokens_seen": 156261315, "step": 7223, "time_per_iteration": 2.8501052856445312 }, { "auxiliary_loss_clip": 0.01125389, "auxiliary_loss_mlp": 0.0102582, "balance_loss_clip": 1.04443371, "balance_loss_mlp": 1.0184201, "epoch": 0.8686346419767931, "flos": 20084587228800.0, "grad_norm": 2.0484443985792167, "language_loss": 0.70362473, "learning_rate": 1.7818529087710378e-07, "loss": 0.72513688, "num_input_tokens_seen": 156281670, "step": 7224, "time_per_iteration": 2.725290060043335 }, { "auxiliary_loss_clip": 0.01150527, "auxiliary_loss_mlp": 0.00762105, "balance_loss_clip": 1.04430628, "balance_loss_mlp": 1.00031936, "epoch": 0.8687548848674322, "flos": 18223516782720.0, "grad_norm": 1.7681518154399882, "language_loss": 0.84138501, "learning_rate": 1.7786401684212637e-07, "loss": 0.86051136, "num_input_tokens_seen": 156300500, "step": 7225, "time_per_iteration": 2.6578962802886963 }, { "auxiliary_loss_clip": 0.01034196, "auxiliary_loss_mlp": 0.01001643, "balance_loss_clip": 1.01321983, "balance_loss_mlp": 1.00058222, "epoch": 0.8688751277580713, "flos": 70457885049600.0, "grad_norm": 0.7337689220900175, "language_loss": 0.55913556, "learning_rate": 1.7754301922372883e-07, "loss": 0.579494, "num_input_tokens_seen": 156350145, "step": 7226, "time_per_iteration": 3.1123263835906982 }, { "auxiliary_loss_clip": 0.01100191, "auxiliary_loss_mlp": 0.01023015, "balance_loss_clip": 1.03839636, "balance_loss_mlp": 1.01631296, "epoch": 0.8689953706487104, "flos": 26906788344960.0, "grad_norm": 1.955616906942337, "language_loss": 0.8109175, "learning_rate": 1.7722229807060617e-07, "loss": 0.83214962, "num_input_tokens_seen": 156368725, "step": 7227, "time_per_iteration": 2.8001925945281982 }, { "auxiliary_loss_clip": 0.01112941, "auxiliary_loss_mlp": 0.01025517, "balance_loss_clip": 1.03805697, "balance_loss_mlp": 1.01820302, "epoch": 0.8691156135393495, "flos": 34637385438720.0, "grad_norm": 1.9755535299927285, "language_loss": 0.81624377, "learning_rate": 1.7690185343141172e-07, "loss": 0.83762836, "num_input_tokens_seen": 156388640, "step": 7228, "time_per_iteration": 2.8757705688476562 }, { "auxiliary_loss_clip": 0.01135437, "auxiliary_loss_mlp": 0.01027087, "balance_loss_clip": 1.03868949, "balance_loss_mlp": 1.02036619, "epoch": 0.8692358564299886, "flos": 18989814556800.0, "grad_norm": 2.7504030986500427, "language_loss": 0.70152318, "learning_rate": 1.7658168535475615e-07, "loss": 0.72314847, "num_input_tokens_seen": 156406425, "step": 7229, "time_per_iteration": 3.6369171142578125 }, { "auxiliary_loss_clip": 0.01141683, "auxiliary_loss_mlp": 0.0103175, "balance_loss_clip": 1.04413331, "balance_loss_mlp": 1.02438831, "epoch": 0.8693560993206276, "flos": 30370839039360.0, "grad_norm": 1.5100824508337627, "language_loss": 0.64298582, "learning_rate": 1.7626179388920948e-07, "loss": 0.66472018, "num_input_tokens_seen": 156427705, "step": 7230, "time_per_iteration": 2.7803828716278076 }, { "auxiliary_loss_clip": 0.01138281, "auxiliary_loss_mlp": 0.00762263, "balance_loss_clip": 1.04307604, "balance_loss_mlp": 1.0003047, "epoch": 0.8694763422112668, "flos": 27200430028800.0, "grad_norm": 1.6504425114459482, "language_loss": 0.80373943, "learning_rate": 1.7594217908329866e-07, "loss": 0.82274485, "num_input_tokens_seen": 156449890, "step": 7231, "time_per_iteration": 2.756636142730713 }, { "auxiliary_loss_clip": 0.01129687, "auxiliary_loss_mlp": 0.01024563, "balance_loss_clip": 1.04222608, "balance_loss_mlp": 1.01769674, "epoch": 0.8695965851019059, "flos": 26139161767680.0, "grad_norm": 2.0008207769322444, "language_loss": 0.74344909, "learning_rate": 1.7562284098550895e-07, "loss": 0.76499152, "num_input_tokens_seen": 156469600, "step": 7232, "time_per_iteration": 2.8242292404174805 }, { "auxiliary_loss_clip": 0.01035967, "auxiliary_loss_mlp": 0.01000965, "balance_loss_clip": 1.00848031, "balance_loss_mlp": 0.9999755, "epoch": 0.8697168279925449, "flos": 67332616456320.0, "grad_norm": 0.8339608653136719, "language_loss": 0.62236714, "learning_rate": 1.753037796442838e-07, "loss": 0.64273643, "num_input_tokens_seen": 156529040, "step": 7233, "time_per_iteration": 4.549138307571411 }, { "auxiliary_loss_clip": 0.0116257, "auxiliary_loss_mlp": 0.01026912, "balance_loss_clip": 1.04277492, "balance_loss_mlp": 1.01962793, "epoch": 0.8698370708831841, "flos": 19718693337600.0, "grad_norm": 2.149235800836467, "language_loss": 0.75099307, "learning_rate": 1.74984995108024e-07, "loss": 0.77288795, "num_input_tokens_seen": 156546970, "step": 7234, "time_per_iteration": 2.6407880783081055 }, { "auxiliary_loss_clip": 0.01153715, "auxiliary_loss_mlp": 0.01026798, "balance_loss_clip": 1.04458392, "balance_loss_mlp": 1.01929998, "epoch": 0.8699573137738231, "flos": 12859971068160.0, "grad_norm": 2.651492010645613, "language_loss": 0.83160651, "learning_rate": 1.7466648742508981e-07, "loss": 0.85341161, "num_input_tokens_seen": 156563155, "step": 7235, "time_per_iteration": 3.7609012126922607 }, { "auxiliary_loss_clip": 0.01135787, "auxiliary_loss_mlp": 0.01023268, "balance_loss_clip": 1.04352427, "balance_loss_mlp": 1.01633644, "epoch": 0.8700775566644622, "flos": 17420733768960.0, "grad_norm": 3.1864419325609523, "language_loss": 0.84572554, "learning_rate": 1.7434825664379837e-07, "loss": 0.86731607, "num_input_tokens_seen": 156581660, "step": 7236, "time_per_iteration": 2.6378731727600098 }, { "auxiliary_loss_clip": 0.01153978, "auxiliary_loss_mlp": 0.01028861, "balance_loss_clip": 1.0454092, "balance_loss_mlp": 1.02161944, "epoch": 0.8701977995551013, "flos": 13735221770880.0, "grad_norm": 2.6545530333264846, "language_loss": 0.86445045, "learning_rate": 1.740303028124246e-07, "loss": 0.88627887, "num_input_tokens_seen": 156597720, "step": 7237, "time_per_iteration": 3.5886640548706055 }, { "auxiliary_loss_clip": 0.01082294, "auxiliary_loss_mlp": 0.01022426, "balance_loss_clip": 1.03384101, "balance_loss_mlp": 1.01522613, "epoch": 0.8703180424457404, "flos": 30555707362560.0, "grad_norm": 2.0125780664581963, "language_loss": 0.75790465, "learning_rate": 1.7371262597920212e-07, "loss": 0.77895194, "num_input_tokens_seen": 156619780, "step": 7238, "time_per_iteration": 2.837843894958496 }, { "auxiliary_loss_clip": 0.01107736, "auxiliary_loss_mlp": 0.01023909, "balance_loss_clip": 1.0428133, "balance_loss_mlp": 1.01687312, "epoch": 0.8704382853363795, "flos": 19608986223360.0, "grad_norm": 1.650254867927733, "language_loss": 0.7632935, "learning_rate": 1.7339522619232195e-07, "loss": 0.78460991, "num_input_tokens_seen": 156638160, "step": 7239, "time_per_iteration": 2.702130079269409 }, { "auxiliary_loss_clip": 0.0114643, "auxiliary_loss_mlp": 0.0102498, "balance_loss_clip": 1.04256904, "balance_loss_mlp": 1.0174247, "epoch": 0.8705585282270186, "flos": 26613900846720.0, "grad_norm": 1.842181176382965, "language_loss": 0.75373083, "learning_rate": 1.730781034999338e-07, "loss": 0.77544498, "num_input_tokens_seen": 156659740, "step": 7240, "time_per_iteration": 2.714712619781494 }, { "auxiliary_loss_clip": 0.01161791, "auxiliary_loss_mlp": 0.01022555, "balance_loss_clip": 1.04601431, "balance_loss_mlp": 1.01582289, "epoch": 0.8706787711176577, "flos": 34090465979520.0, "grad_norm": 1.7260359105662368, "language_loss": 0.73121929, "learning_rate": 1.7276125795014497e-07, "loss": 0.75306273, "num_input_tokens_seen": 156678190, "step": 7241, "time_per_iteration": 2.6628572940826416 }, { "auxiliary_loss_clip": 0.01140314, "auxiliary_loss_mlp": 0.01026252, "balance_loss_clip": 1.04115093, "balance_loss_mlp": 1.01844931, "epoch": 0.8707990140082967, "flos": 14611513968000.0, "grad_norm": 3.111027694617384, "language_loss": 0.67362064, "learning_rate": 1.7244468959102054e-07, "loss": 0.69528627, "num_input_tokens_seen": 156695245, "step": 7242, "time_per_iteration": 2.704679250717163 }, { "auxiliary_loss_clip": 0.01150679, "auxiliary_loss_mlp": 0.01023879, "balance_loss_clip": 1.04519892, "balance_loss_mlp": 1.0170033, "epoch": 0.8709192568989359, "flos": 20084156265600.0, "grad_norm": 2.121500619976293, "language_loss": 0.85637796, "learning_rate": 1.7212839847058348e-07, "loss": 0.87812364, "num_input_tokens_seen": 156710375, "step": 7243, "time_per_iteration": 2.7049427032470703 }, { "auxiliary_loss_clip": 0.01103923, "auxiliary_loss_mlp": 0.0102499, "balance_loss_clip": 1.03817129, "balance_loss_mlp": 1.01793003, "epoch": 0.871039499789575, "flos": 16727083251840.0, "grad_norm": 5.602977235714113, "language_loss": 0.74029577, "learning_rate": 1.718123846368147e-07, "loss": 0.76158494, "num_input_tokens_seen": 156729420, "step": 7244, "time_per_iteration": 2.7712364196777344 }, { "auxiliary_loss_clip": 0.01135145, "auxiliary_loss_mlp": 0.00762053, "balance_loss_clip": 1.0429002, "balance_loss_mlp": 1.0002805, "epoch": 0.871159742680214, "flos": 21068790860160.0, "grad_norm": 1.729561588735784, "language_loss": 0.71534711, "learning_rate": 1.714966481376543e-07, "loss": 0.73431909, "num_input_tokens_seen": 156746100, "step": 7245, "time_per_iteration": 2.6597390174865723 }, { "auxiliary_loss_clip": 0.01150242, "auxiliary_loss_mlp": 0.0102393, "balance_loss_clip": 1.04326999, "balance_loss_mlp": 1.016572, "epoch": 0.8712799855708532, "flos": 28256526731520.0, "grad_norm": 1.9541527202810542, "language_loss": 0.83348507, "learning_rate": 1.7118118902099797e-07, "loss": 0.85522681, "num_input_tokens_seen": 156764185, "step": 7246, "time_per_iteration": 2.7947535514831543 }, { "auxiliary_loss_clip": 0.01150411, "auxiliary_loss_mlp": 0.01025654, "balance_loss_clip": 1.04417133, "balance_loss_mlp": 1.01869226, "epoch": 0.8714002284614922, "flos": 22236677665920.0, "grad_norm": 1.7324966375242503, "language_loss": 0.80830705, "learning_rate": 1.7086600733470146e-07, "loss": 0.83006775, "num_input_tokens_seen": 156784855, "step": 7247, "time_per_iteration": 2.6020960807800293 }, { "auxiliary_loss_clip": 0.01146129, "auxiliary_loss_mlp": 0.01018448, "balance_loss_clip": 1.04227912, "balance_loss_mlp": 1.01222551, "epoch": 0.8715204713521313, "flos": 21431919404160.0, "grad_norm": 2.079136764264898, "language_loss": 0.77104324, "learning_rate": 1.7055110312657738e-07, "loss": 0.79268897, "num_input_tokens_seen": 156804350, "step": 7248, "time_per_iteration": 2.6741738319396973 }, { "auxiliary_loss_clip": 0.01131874, "auxiliary_loss_mlp": 0.01027688, "balance_loss_clip": 1.04178238, "balance_loss_mlp": 1.02042508, "epoch": 0.8716407142427703, "flos": 23440439180160.0, "grad_norm": 3.4059993289215647, "language_loss": 0.73671281, "learning_rate": 1.702364764443962e-07, "loss": 0.75830847, "num_input_tokens_seen": 156823425, "step": 7249, "time_per_iteration": 2.640627145767212 }, { "auxiliary_loss_clip": 0.01092552, "auxiliary_loss_mlp": 0.01023438, "balance_loss_clip": 1.03645611, "balance_loss_mlp": 1.01614881, "epoch": 0.8717609571334095, "flos": 27958683156480.0, "grad_norm": 1.8956472465721956, "language_loss": 0.72415388, "learning_rate": 1.6992212733588685e-07, "loss": 0.74531376, "num_input_tokens_seen": 156843090, "step": 7250, "time_per_iteration": 2.778545379638672 }, { "auxiliary_loss_clip": 0.01134993, "auxiliary_loss_mlp": 0.01024177, "balance_loss_clip": 1.04272413, "balance_loss_mlp": 1.0172267, "epoch": 0.8718812000240486, "flos": 25479482538240.0, "grad_norm": 1.9153419237576017, "language_loss": 0.75269252, "learning_rate": 1.6960805584873538e-07, "loss": 0.77428418, "num_input_tokens_seen": 156861090, "step": 7251, "time_per_iteration": 2.6828668117523193 }, { "auxiliary_loss_clip": 0.01110826, "auxiliary_loss_mlp": 0.01022842, "balance_loss_clip": 1.03825974, "balance_loss_mlp": 1.01575792, "epoch": 0.8720014429146876, "flos": 23403056100480.0, "grad_norm": 1.5621852482781111, "language_loss": 0.78255045, "learning_rate": 1.6929426203058684e-07, "loss": 0.80388719, "num_input_tokens_seen": 156881515, "step": 7252, "time_per_iteration": 2.7120277881622314 }, { "auxiliary_loss_clip": 0.01166542, "auxiliary_loss_mlp": 0.00762962, "balance_loss_clip": 1.04314256, "balance_loss_mlp": 1.00033724, "epoch": 0.8721216858053268, "flos": 24352821567360.0, "grad_norm": 2.481391710407773, "language_loss": 0.80192971, "learning_rate": 1.689807459290431e-07, "loss": 0.82122481, "num_input_tokens_seen": 156900170, "step": 7253, "time_per_iteration": 2.660639524459839 }, { "auxiliary_loss_clip": 0.01139246, "auxiliary_loss_mlp": 0.01027833, "balance_loss_clip": 1.04419446, "balance_loss_mlp": 1.02075529, "epoch": 0.8722419286959658, "flos": 33869687034240.0, "grad_norm": 2.1118929973916423, "language_loss": 0.70486695, "learning_rate": 1.6866750759166437e-07, "loss": 0.72653776, "num_input_tokens_seen": 156920150, "step": 7254, "time_per_iteration": 2.7880444526672363 }, { "auxiliary_loss_clip": 0.01118129, "auxiliary_loss_mlp": 0.01023806, "balance_loss_clip": 1.03703618, "balance_loss_mlp": 1.01653099, "epoch": 0.8723621715866049, "flos": 18369385914240.0, "grad_norm": 2.722148056159217, "language_loss": 0.77344733, "learning_rate": 1.6835454706596865e-07, "loss": 0.79486668, "num_input_tokens_seen": 156937980, "step": 7255, "time_per_iteration": 3.7212982177734375 }, { "auxiliary_loss_clip": 0.01166938, "auxiliary_loss_mlp": 0.01025918, "balance_loss_clip": 1.04768622, "balance_loss_mlp": 1.01846147, "epoch": 0.8724824144772441, "flos": 22013348855040.0, "grad_norm": 1.841066691904238, "language_loss": 0.73459494, "learning_rate": 1.680418643994317e-07, "loss": 0.75652349, "num_input_tokens_seen": 156956550, "step": 7256, "time_per_iteration": 2.613248109817505 }, { "auxiliary_loss_clip": 0.01063779, "auxiliary_loss_mlp": 0.01000399, "balance_loss_clip": 1.00982738, "balance_loss_mlp": 0.99928439, "epoch": 0.8726026573678831, "flos": 66698720213760.0, "grad_norm": 0.8866463219472269, "language_loss": 0.64473844, "learning_rate": 1.6772945963948738e-07, "loss": 0.66538012, "num_input_tokens_seen": 157014715, "step": 7257, "time_per_iteration": 3.1832034587860107 }, { "auxiliary_loss_clip": 0.01136705, "auxiliary_loss_mlp": 0.0102477, "balance_loss_clip": 1.04436493, "balance_loss_mlp": 1.01748335, "epoch": 0.8727229002585222, "flos": 13370908078080.0, "grad_norm": 2.5237168916348076, "language_loss": 0.7744329, "learning_rate": 1.6741733283352733e-07, "loss": 0.79604769, "num_input_tokens_seen": 157032320, "step": 7258, "time_per_iteration": 2.6153995990753174 }, { "auxiliary_loss_clip": 0.01111996, "auxiliary_loss_mlp": 0.0102604, "balance_loss_clip": 1.03997302, "balance_loss_mlp": 1.01864934, "epoch": 0.8728431431491613, "flos": 21796987282560.0, "grad_norm": 1.5407311822511656, "language_loss": 0.83775401, "learning_rate": 1.6710548402890102e-07, "loss": 0.85913444, "num_input_tokens_seen": 157052845, "step": 7259, "time_per_iteration": 3.683941602706909 }, { "auxiliary_loss_clip": 0.01169614, "auxiliary_loss_mlp": 0.01026806, "balance_loss_clip": 1.04579294, "balance_loss_mlp": 1.01900995, "epoch": 0.8729633860398004, "flos": 36173823742080.0, "grad_norm": 1.9671246123920263, "language_loss": 0.6712265, "learning_rate": 1.6679391327291527e-07, "loss": 0.69319069, "num_input_tokens_seen": 157074050, "step": 7260, "time_per_iteration": 2.754448413848877 }, { "auxiliary_loss_clip": 0.01137933, "auxiliary_loss_mlp": 0.01024907, "balance_loss_clip": 1.04058909, "balance_loss_mlp": 1.01787984, "epoch": 0.8730836289304394, "flos": 16359680989440.0, "grad_norm": 3.3832735154254676, "language_loss": 0.68257892, "learning_rate": 1.6648262061283492e-07, "loss": 0.7042073, "num_input_tokens_seen": 157089350, "step": 7261, "time_per_iteration": 3.443157434463501 }, { "auxiliary_loss_clip": 0.01122288, "auxiliary_loss_mlp": 0.01019692, "balance_loss_clip": 1.03860259, "balance_loss_mlp": 1.01318038, "epoch": 0.8732038718210786, "flos": 21215126868480.0, "grad_norm": 1.7893717377168012, "language_loss": 0.73836637, "learning_rate": 1.6617160609588353e-07, "loss": 0.75978613, "num_input_tokens_seen": 157108525, "step": 7262, "time_per_iteration": 2.699214458465576 }, { "auxiliary_loss_clip": 0.01140241, "auxiliary_loss_mlp": 0.01024862, "balance_loss_clip": 1.04215324, "balance_loss_mlp": 1.01767695, "epoch": 0.8733241147117177, "flos": 16610696208000.0, "grad_norm": 2.106505762121692, "language_loss": 0.72605437, "learning_rate": 1.6586086976924163e-07, "loss": 0.7477054, "num_input_tokens_seen": 157124025, "step": 7263, "time_per_iteration": 3.587695598602295 }, { "auxiliary_loss_clip": 0.01151199, "auxiliary_loss_mlp": 0.01024519, "balance_loss_clip": 1.04277718, "balance_loss_mlp": 1.01761961, "epoch": 0.8734443576023567, "flos": 20193935207040.0, "grad_norm": 2.072181274733871, "language_loss": 0.78584707, "learning_rate": 1.6555041168004747e-07, "loss": 0.80760419, "num_input_tokens_seen": 157143345, "step": 7264, "time_per_iteration": 2.577897310256958 }, { "auxiliary_loss_clip": 0.01133803, "auxiliary_loss_mlp": 0.0103179, "balance_loss_clip": 1.04284883, "balance_loss_mlp": 1.02482867, "epoch": 0.8735646004929959, "flos": 18041162411520.0, "grad_norm": 2.0580591432551896, "language_loss": 0.69129175, "learning_rate": 1.6524023187539715e-07, "loss": 0.71294767, "num_input_tokens_seen": 157161630, "step": 7265, "time_per_iteration": 2.698284387588501 }, { "auxiliary_loss_clip": 0.01134231, "auxiliary_loss_mlp": 0.01024626, "balance_loss_clip": 1.04109156, "balance_loss_mlp": 1.01741362, "epoch": 0.873684843383635, "flos": 20262344659200.0, "grad_norm": 2.132302142650764, "language_loss": 0.75218976, "learning_rate": 1.649303304023446e-07, "loss": 0.77377832, "num_input_tokens_seen": 157181385, "step": 7266, "time_per_iteration": 2.621762752532959 }, { "auxiliary_loss_clip": 0.011181, "auxiliary_loss_mlp": 0.0102518, "balance_loss_clip": 1.04323363, "balance_loss_mlp": 1.01839066, "epoch": 0.873805086274274, "flos": 16947287579520.0, "grad_norm": 1.9028484610913756, "language_loss": 0.7883181, "learning_rate": 1.6462070730790246e-07, "loss": 0.80975091, "num_input_tokens_seen": 157200545, "step": 7267, "time_per_iteration": 2.6563141345977783 }, { "auxiliary_loss_clip": 0.01131091, "auxiliary_loss_mlp": 0.01022847, "balance_loss_clip": 1.03855038, "balance_loss_mlp": 1.01599836, "epoch": 0.8739253291649132, "flos": 18041270152320.0, "grad_norm": 2.5014693423972068, "language_loss": 0.78546011, "learning_rate": 1.6431136263903912e-07, "loss": 0.80699944, "num_input_tokens_seen": 157219545, "step": 7268, "time_per_iteration": 2.609837055206299 }, { "auxiliary_loss_clip": 0.01152752, "auxiliary_loss_mlp": 0.0076242, "balance_loss_clip": 1.04156637, "balance_loss_mlp": 1.00030661, "epoch": 0.8740455720555522, "flos": 21325085377920.0, "grad_norm": 3.18531652504852, "language_loss": 0.73314148, "learning_rate": 1.6400229644268282e-07, "loss": 0.75229323, "num_input_tokens_seen": 157237900, "step": 7269, "time_per_iteration": 2.628361701965332 }, { "auxiliary_loss_clip": 0.01120832, "auxiliary_loss_mlp": 0.0102603, "balance_loss_clip": 1.04594326, "balance_loss_mlp": 1.01887119, "epoch": 0.8741658149461913, "flos": 15158684822400.0, "grad_norm": 2.0070455872733595, "language_loss": 0.81079817, "learning_rate": 1.6369350876571852e-07, "loss": 0.83226675, "num_input_tokens_seen": 157256055, "step": 7270, "time_per_iteration": 2.630439281463623 }, { "auxiliary_loss_clip": 0.01105183, "auxiliary_loss_mlp": 0.01026293, "balance_loss_clip": 1.03800571, "balance_loss_mlp": 1.01937246, "epoch": 0.8742860578368304, "flos": 23039855729280.0, "grad_norm": 1.967326702373711, "language_loss": 0.81462806, "learning_rate": 1.6338499965498874e-07, "loss": 0.83594286, "num_input_tokens_seen": 157274785, "step": 7271, "time_per_iteration": 2.7288546562194824 }, { "auxiliary_loss_clip": 0.01117921, "auxiliary_loss_mlp": 0.0102348, "balance_loss_clip": 1.03998494, "balance_loss_mlp": 1.01601148, "epoch": 0.8744063007274695, "flos": 28145347159680.0, "grad_norm": 1.5531408435442666, "language_loss": 0.77211964, "learning_rate": 1.630767691572943e-07, "loss": 0.79353362, "num_input_tokens_seen": 157294805, "step": 7272, "time_per_iteration": 2.708362340927124 }, { "auxiliary_loss_clip": 0.0104614, "auxiliary_loss_mlp": 0.01000423, "balance_loss_clip": 1.00983977, "balance_loss_mlp": 0.99931461, "epoch": 0.8745265436181086, "flos": 64034076654720.0, "grad_norm": 0.7395824641449439, "language_loss": 0.53510451, "learning_rate": 1.6276881731939306e-07, "loss": 0.55557013, "num_input_tokens_seen": 157356695, "step": 7273, "time_per_iteration": 3.2982938289642334 }, { "auxiliary_loss_clip": 0.01147267, "auxiliary_loss_mlp": 0.01023166, "balance_loss_clip": 1.0440836, "balance_loss_mlp": 1.0160135, "epoch": 0.8746467865087477, "flos": 28658618553600.0, "grad_norm": 2.8046853219473564, "language_loss": 0.75306726, "learning_rate": 1.6246114418800193e-07, "loss": 0.77477157, "num_input_tokens_seen": 157376975, "step": 7274, "time_per_iteration": 2.6496031284332275 }, { "auxiliary_loss_clip": 0.0114845, "auxiliary_loss_mlp": 0.01030917, "balance_loss_clip": 1.04471493, "balance_loss_mlp": 1.02374625, "epoch": 0.8747670293993868, "flos": 23985850268160.0, "grad_norm": 1.6831792379573156, "language_loss": 0.76723397, "learning_rate": 1.6215374980979423e-07, "loss": 0.78902763, "num_input_tokens_seen": 157397385, "step": 7275, "time_per_iteration": 2.636554002761841 }, { "auxiliary_loss_clip": 0.01149804, "auxiliary_loss_mlp": 0.01024321, "balance_loss_clip": 1.04678965, "balance_loss_mlp": 1.01728177, "epoch": 0.8748872722900258, "flos": 45221624478720.0, "grad_norm": 2.3883594851199224, "language_loss": 0.68456042, "learning_rate": 1.6184663423140133e-07, "loss": 0.70630169, "num_input_tokens_seen": 157417685, "step": 7276, "time_per_iteration": 2.8852758407592773 }, { "auxiliary_loss_clip": 0.01111369, "auxiliary_loss_mlp": 0.01032495, "balance_loss_clip": 1.0406177, "balance_loss_mlp": 1.0258106, "epoch": 0.875007515180665, "flos": 19754280737280.0, "grad_norm": 3.426875908680601, "language_loss": 0.63856876, "learning_rate": 1.615397974994126e-07, "loss": 0.66000742, "num_input_tokens_seen": 157435490, "step": 7277, "time_per_iteration": 2.7091591358184814 }, { "auxiliary_loss_clip": 0.0116426, "auxiliary_loss_mlp": 0.01024694, "balance_loss_clip": 1.04661441, "balance_loss_mlp": 1.01808119, "epoch": 0.875127758071304, "flos": 22710734386560.0, "grad_norm": 1.5091558039609847, "language_loss": 0.80714977, "learning_rate": 1.6123323966037438e-07, "loss": 0.82903922, "num_input_tokens_seen": 157454010, "step": 7278, "time_per_iteration": 2.6624274253845215 }, { "auxiliary_loss_clip": 0.01165226, "auxiliary_loss_mlp": 0.0102558, "balance_loss_clip": 1.04691386, "balance_loss_mlp": 1.01844549, "epoch": 0.8752480009619431, "flos": 23403846199680.0, "grad_norm": 2.362465263535424, "language_loss": 0.78463286, "learning_rate": 1.6092696076079216e-07, "loss": 0.80654097, "num_input_tokens_seen": 157472385, "step": 7279, "time_per_iteration": 2.585635185241699 }, { "auxiliary_loss_clip": 0.01110085, "auxiliary_loss_mlp": 0.01023621, "balance_loss_clip": 1.03865314, "balance_loss_mlp": 1.01689792, "epoch": 0.8753682438525822, "flos": 26213101914240.0, "grad_norm": 1.6492561958061493, "language_loss": 0.73807371, "learning_rate": 1.6062096084712785e-07, "loss": 0.75941074, "num_input_tokens_seen": 157493735, "step": 7280, "time_per_iteration": 2.8308045864105225 }, { "auxiliary_loss_clip": 0.01127536, "auxiliary_loss_mlp": 0.00762087, "balance_loss_clip": 1.0392319, "balance_loss_mlp": 1.00034499, "epoch": 0.8754884867432213, "flos": 23326745656320.0, "grad_norm": 2.180551441267642, "language_loss": 0.70590103, "learning_rate": 1.6031523996580098e-07, "loss": 0.72479725, "num_input_tokens_seen": 157511295, "step": 7281, "time_per_iteration": 3.6046698093414307 }, { "auxiliary_loss_clip": 0.01129718, "auxiliary_loss_mlp": 0.01025158, "balance_loss_clip": 1.04172122, "balance_loss_mlp": 1.01733172, "epoch": 0.8756087296338604, "flos": 12495226412160.0, "grad_norm": 2.0295094836304406, "language_loss": 0.66268319, "learning_rate": 1.6000979816318981e-07, "loss": 0.68423194, "num_input_tokens_seen": 157529760, "step": 7282, "time_per_iteration": 2.618992567062378 }, { "auxiliary_loss_clip": 0.01146471, "auxiliary_loss_mlp": 0.01024356, "balance_loss_clip": 1.04449463, "balance_loss_mlp": 1.01699483, "epoch": 0.8757289725244994, "flos": 18952898353920.0, "grad_norm": 2.3779890626651072, "language_loss": 0.75254142, "learning_rate": 1.5970463548562886e-07, "loss": 0.77424967, "num_input_tokens_seen": 157548915, "step": 7283, "time_per_iteration": 2.7662734985351562 }, { "auxiliary_loss_clip": 0.01132961, "auxiliary_loss_mlp": 0.01021469, "balance_loss_clip": 1.04234564, "balance_loss_mlp": 1.01414979, "epoch": 0.8758492154151386, "flos": 25265958140160.0, "grad_norm": 1.8332834981617228, "language_loss": 0.71480161, "learning_rate": 1.5939975197941192e-07, "loss": 0.73634589, "num_input_tokens_seen": 157570570, "step": 7284, "time_per_iteration": 2.7201197147369385 }, { "auxiliary_loss_clip": 0.01045485, "auxiliary_loss_mlp": 0.01000387, "balance_loss_clip": 1.01040816, "balance_loss_mlp": 0.99928993, "epoch": 0.8759694583057777, "flos": 65571664193280.0, "grad_norm": 0.8068913526430259, "language_loss": 0.53328645, "learning_rate": 1.5909514769078892e-07, "loss": 0.55374515, "num_input_tokens_seen": 157635675, "step": 7285, "time_per_iteration": 4.465670824050903 }, { "auxiliary_loss_clip": 0.01114719, "auxiliary_loss_mlp": 0.01029068, "balance_loss_clip": 1.04323983, "balance_loss_mlp": 1.02235317, "epoch": 0.8760897011964167, "flos": 25446193608960.0, "grad_norm": 1.518160326874458, "language_loss": 0.77572083, "learning_rate": 1.5879082266596867e-07, "loss": 0.79715872, "num_input_tokens_seen": 157657015, "step": 7286, "time_per_iteration": 2.7442522048950195 }, { "auxiliary_loss_clip": 0.01127476, "auxiliary_loss_mlp": 0.01023479, "balance_loss_clip": 1.03642857, "balance_loss_mlp": 1.01663077, "epoch": 0.8762099440870559, "flos": 28984830894720.0, "grad_norm": 2.175116705612869, "language_loss": 0.72145331, "learning_rate": 1.5848677695111645e-07, "loss": 0.7429629, "num_input_tokens_seen": 157678615, "step": 7287, "time_per_iteration": 3.588203191757202 }, { "auxiliary_loss_clip": 0.01131906, "auxiliary_loss_mlp": 0.0102708, "balance_loss_clip": 1.0443902, "balance_loss_mlp": 1.01956725, "epoch": 0.8763301869776949, "flos": 21609461352960.0, "grad_norm": 2.153063874875102, "language_loss": 0.6894412, "learning_rate": 1.5818301059235562e-07, "loss": 0.71103108, "num_input_tokens_seen": 157693790, "step": 7288, "time_per_iteration": 2.6855738162994385 }, { "auxiliary_loss_clip": 0.0114031, "auxiliary_loss_mlp": 0.01032621, "balance_loss_clip": 1.04395461, "balance_loss_mlp": 1.0256263, "epoch": 0.876450429868334, "flos": 24644416176000.0, "grad_norm": 1.6316562276810045, "language_loss": 0.81172907, "learning_rate": 1.578795236357684e-07, "loss": 0.83345842, "num_input_tokens_seen": 157715255, "step": 7289, "time_per_iteration": 3.6084144115448 }, { "auxiliary_loss_clip": 0.01139863, "auxiliary_loss_mlp": 0.01023525, "balance_loss_clip": 1.04496586, "balance_loss_mlp": 1.01647687, "epoch": 0.8765706727589732, "flos": 20260046188800.0, "grad_norm": 2.164144388502614, "language_loss": 0.85402048, "learning_rate": 1.5757631612739218e-07, "loss": 0.87565434, "num_input_tokens_seen": 157728800, "step": 7290, "time_per_iteration": 2.6464099884033203 }, { "auxiliary_loss_clip": 0.01063688, "auxiliary_loss_mlp": 0.01001535, "balance_loss_clip": 1.00987816, "balance_loss_mlp": 1.00046194, "epoch": 0.8766909156496122, "flos": 71371165276800.0, "grad_norm": 0.7780697193850279, "language_loss": 0.61451524, "learning_rate": 1.572733881132242e-07, "loss": 0.63516754, "num_input_tokens_seen": 157789445, "step": 7291, "time_per_iteration": 3.277047872543335 }, { "auxiliary_loss_clip": 0.01028041, "auxiliary_loss_mlp": 0.01001151, "balance_loss_clip": 1.00982809, "balance_loss_mlp": 1.0001322, "epoch": 0.8768111585402513, "flos": 69523490603520.0, "grad_norm": 0.7776909159034786, "language_loss": 0.5852173, "learning_rate": 1.5697073963921814e-07, "loss": 0.60550922, "num_input_tokens_seen": 157848685, "step": 7292, "time_per_iteration": 3.2369918823242188 }, { "auxiliary_loss_clip": 0.01150697, "auxiliary_loss_mlp": 0.01024459, "balance_loss_clip": 1.04483223, "balance_loss_mlp": 1.01706839, "epoch": 0.8769314014308904, "flos": 18838558385280.0, "grad_norm": 8.11853627589704, "language_loss": 0.85482097, "learning_rate": 1.566683707512857e-07, "loss": 0.87657249, "num_input_tokens_seen": 157866360, "step": 7293, "time_per_iteration": 2.6047167778015137 }, { "auxiliary_loss_clip": 0.01132545, "auxiliary_loss_mlp": 0.01025128, "balance_loss_clip": 1.04087603, "balance_loss_mlp": 1.01779985, "epoch": 0.8770516443215295, "flos": 14976402278400.0, "grad_norm": 1.9239766604471977, "language_loss": 0.79340392, "learning_rate": 1.5636628149529553e-07, "loss": 0.81498069, "num_input_tokens_seen": 157884150, "step": 7294, "time_per_iteration": 2.6527507305145264 }, { "auxiliary_loss_clip": 0.01135643, "auxiliary_loss_mlp": 0.01024819, "balance_loss_clip": 1.04136741, "balance_loss_mlp": 1.01852143, "epoch": 0.8771718872121685, "flos": 31649654021760.0, "grad_norm": 2.155165612399429, "language_loss": 0.79473811, "learning_rate": 1.560644719170743e-07, "loss": 0.81634271, "num_input_tokens_seen": 157905020, "step": 7295, "time_per_iteration": 2.693279266357422 }, { "auxiliary_loss_clip": 0.01121506, "auxiliary_loss_mlp": 0.01025904, "balance_loss_clip": 1.03929377, "balance_loss_mlp": 1.01829505, "epoch": 0.8772921301028077, "flos": 36095466222720.0, "grad_norm": 1.9870419563178132, "language_loss": 0.720456, "learning_rate": 1.5576294206240692e-07, "loss": 0.74193013, "num_input_tokens_seen": 157924545, "step": 7296, "time_per_iteration": 2.8292758464813232 }, { "auxiliary_loss_clip": 0.01133083, "auxiliary_loss_mlp": 0.01025522, "balance_loss_clip": 1.04272318, "balance_loss_mlp": 1.01822364, "epoch": 0.8774123729934468, "flos": 57116961849600.0, "grad_norm": 1.7109750929582392, "language_loss": 0.680282, "learning_rate": 1.5546169197703507e-07, "loss": 0.70186806, "num_input_tokens_seen": 157950820, "step": 7297, "time_per_iteration": 3.017703056335449 }, { "auxiliary_loss_clip": 0.01141063, "auxiliary_loss_mlp": 0.01029456, "balance_loss_clip": 1.04113734, "balance_loss_mlp": 1.02262509, "epoch": 0.8775326158840858, "flos": 23914495900800.0, "grad_norm": 2.8667188378766455, "language_loss": 0.77309406, "learning_rate": 1.5516072170665774e-07, "loss": 0.79479921, "num_input_tokens_seen": 157968790, "step": 7298, "time_per_iteration": 2.6317033767700195 }, { "auxiliary_loss_clip": 0.01153034, "auxiliary_loss_mlp": 0.01023293, "balance_loss_clip": 1.04476976, "balance_loss_mlp": 1.01659298, "epoch": 0.877652858774725, "flos": 17123285243520.0, "grad_norm": 1.812087348607546, "language_loss": 0.86755735, "learning_rate": 1.5486003129693214e-07, "loss": 0.88932061, "num_input_tokens_seen": 157986155, "step": 7299, "time_per_iteration": 2.555802822113037 }, { "auxiliary_loss_clip": 0.01154185, "auxiliary_loss_mlp": 0.01022068, "balance_loss_clip": 1.04455984, "balance_loss_mlp": 1.0148797, "epoch": 0.877773101665364, "flos": 16508961912960.0, "grad_norm": 2.2403401874713147, "language_loss": 0.7796911, "learning_rate": 1.545596207934725e-07, "loss": 0.80145359, "num_input_tokens_seen": 158004640, "step": 7300, "time_per_iteration": 2.6984965801239014 }, { "auxiliary_loss_clip": 0.01128927, "auxiliary_loss_mlp": 0.01023965, "balance_loss_clip": 1.03950858, "balance_loss_mlp": 1.01722062, "epoch": 0.8778933445560031, "flos": 22053209973120.0, "grad_norm": 1.7561554186803003, "language_loss": 0.77897739, "learning_rate": 1.5425949024185147e-07, "loss": 0.80050635, "num_input_tokens_seen": 158024665, "step": 7301, "time_per_iteration": 2.6678168773651123 }, { "auxiliary_loss_clip": 0.0113636, "auxiliary_loss_mlp": 0.01033852, "balance_loss_clip": 1.03970563, "balance_loss_mlp": 1.0269289, "epoch": 0.8780135874466423, "flos": 22564757514240.0, "grad_norm": 1.8473028573438084, "language_loss": 0.67654943, "learning_rate": 1.5395963968759818e-07, "loss": 0.69825149, "num_input_tokens_seen": 158044940, "step": 7302, "time_per_iteration": 2.6924238204956055 }, { "auxiliary_loss_clip": 0.01137139, "auxiliary_loss_mlp": 0.01025727, "balance_loss_clip": 1.04152226, "balance_loss_mlp": 1.01868165, "epoch": 0.8781338303372813, "flos": 61531999073280.0, "grad_norm": 1.8836446766541677, "language_loss": 0.643996, "learning_rate": 1.536600691761998e-07, "loss": 0.66562462, "num_input_tokens_seen": 158070770, "step": 7303, "time_per_iteration": 3.019775152206421 }, { "auxiliary_loss_clip": 0.01126032, "auxiliary_loss_mlp": 0.01022588, "balance_loss_clip": 1.04357266, "balance_loss_mlp": 1.01602244, "epoch": 0.8782540732279204, "flos": 22674751937280.0, "grad_norm": 1.709293467872228, "language_loss": 0.71925914, "learning_rate": 1.5336077875310084e-07, "loss": 0.74074537, "num_input_tokens_seen": 158089995, "step": 7304, "time_per_iteration": 2.684757709503174 }, { "auxiliary_loss_clip": 0.01111025, "auxiliary_loss_mlp": 0.0102203, "balance_loss_clip": 1.03975415, "balance_loss_mlp": 1.01536906, "epoch": 0.8783743161185595, "flos": 16070348937600.0, "grad_norm": 1.856272974650716, "language_loss": 0.74417388, "learning_rate": 1.5306176846370321e-07, "loss": 0.76550448, "num_input_tokens_seen": 158108140, "step": 7305, "time_per_iteration": 2.7901532649993896 }, { "auxiliary_loss_clip": 0.01144548, "auxiliary_loss_mlp": 0.01024718, "balance_loss_clip": 1.04204941, "balance_loss_mlp": 1.0177412, "epoch": 0.8784945590091986, "flos": 26067879227520.0, "grad_norm": 1.8957695556898477, "language_loss": 0.74132109, "learning_rate": 1.5276303835336712e-07, "loss": 0.76301378, "num_input_tokens_seen": 158128680, "step": 7306, "time_per_iteration": 2.722118854522705 }, { "auxiliary_loss_clip": 0.01055053, "auxiliary_loss_mlp": 0.01001821, "balance_loss_clip": 1.01033807, "balance_loss_mlp": 1.00074196, "epoch": 0.8786148018998376, "flos": 62720643939840.0, "grad_norm": 0.7599929587460502, "language_loss": 0.53532767, "learning_rate": 1.524645884674094e-07, "loss": 0.55589646, "num_input_tokens_seen": 158185610, "step": 7307, "time_per_iteration": 4.130972862243652 }, { "auxiliary_loss_clip": 0.01168498, "auxiliary_loss_mlp": 0.00762748, "balance_loss_clip": 1.04665542, "balance_loss_mlp": 1.00033689, "epoch": 0.8787350447904768, "flos": 21652734263040.0, "grad_norm": 1.9801854911587087, "language_loss": 0.7901355, "learning_rate": 1.521664188511047e-07, "loss": 0.809448, "num_input_tokens_seen": 158205635, "step": 7308, "time_per_iteration": 2.6811304092407227 }, { "auxiliary_loss_clip": 0.01135825, "auxiliary_loss_mlp": 0.00761913, "balance_loss_clip": 1.04574776, "balance_loss_mlp": 1.00032067, "epoch": 0.8788552876811159, "flos": 25478476957440.0, "grad_norm": 1.8161530161664998, "language_loss": 0.80703342, "learning_rate": 1.518685295496851e-07, "loss": 0.82601082, "num_input_tokens_seen": 158223495, "step": 7309, "time_per_iteration": 2.680771827697754 }, { "auxiliary_loss_clip": 0.01150382, "auxiliary_loss_mlp": 0.01025273, "balance_loss_clip": 1.04215217, "balance_loss_mlp": 1.01821232, "epoch": 0.8789755305717549, "flos": 22310222762880.0, "grad_norm": 1.842008748910858, "language_loss": 0.85548961, "learning_rate": 1.5157092060833975e-07, "loss": 0.87724614, "num_input_tokens_seen": 158243145, "step": 7310, "time_per_iteration": 2.646902084350586 }, { "auxiliary_loss_clip": 0.01134012, "auxiliary_loss_mlp": 0.01022448, "balance_loss_clip": 1.04064727, "balance_loss_mlp": 1.01546526, "epoch": 0.879095773462394, "flos": 29310971408640.0, "grad_norm": 1.710386695610177, "language_loss": 0.65955323, "learning_rate": 1.5127359207221658e-07, "loss": 0.68111777, "num_input_tokens_seen": 158262625, "step": 7311, "time_per_iteration": 3.7048845291137695 }, { "auxiliary_loss_clip": 0.01082797, "auxiliary_loss_mlp": 0.01024492, "balance_loss_clip": 1.03319788, "balance_loss_mlp": 1.01727974, "epoch": 0.8792160163530331, "flos": 16690023394560.0, "grad_norm": 1.872200122242252, "language_loss": 0.73467815, "learning_rate": 1.5097654398641923e-07, "loss": 0.75575107, "num_input_tokens_seen": 158280530, "step": 7312, "time_per_iteration": 2.8412325382232666 }, { "auxiliary_loss_clip": 0.01155188, "auxiliary_loss_mlp": 0.01024128, "balance_loss_clip": 1.04535174, "balance_loss_mlp": 1.01757717, "epoch": 0.8793362592436722, "flos": 24499301230080.0, "grad_norm": 1.3956736803658787, "language_loss": 0.731996, "learning_rate": 1.5067977639601014e-07, "loss": 0.75378919, "num_input_tokens_seen": 158303290, "step": 7313, "time_per_iteration": 3.5221035480499268 }, { "auxiliary_loss_clip": 0.01134914, "auxiliary_loss_mlp": 0.01023593, "balance_loss_clip": 1.04295158, "balance_loss_mlp": 1.01642871, "epoch": 0.8794565021343113, "flos": 14538399834240.0, "grad_norm": 2.022052663508477, "language_loss": 0.70458853, "learning_rate": 1.5038328934600864e-07, "loss": 0.72617358, "num_input_tokens_seen": 158319925, "step": 7314, "time_per_iteration": 3.540907621383667 }, { "auxiliary_loss_clip": 0.01137812, "auxiliary_loss_mlp": 0.01025539, "balance_loss_clip": 1.04503047, "balance_loss_mlp": 1.01868451, "epoch": 0.8795767450249504, "flos": 39530286224640.0, "grad_norm": 1.8319473022924495, "language_loss": 0.6992141, "learning_rate": 1.5008708288139161e-07, "loss": 0.72084755, "num_input_tokens_seen": 158342285, "step": 7315, "time_per_iteration": 2.796792507171631 }, { "auxiliary_loss_clip": 0.01150985, "auxiliary_loss_mlp": 0.01022016, "balance_loss_clip": 1.04665208, "balance_loss_mlp": 1.01499486, "epoch": 0.8796969879155895, "flos": 22960672197120.0, "grad_norm": 2.3463255076044236, "language_loss": 0.73192626, "learning_rate": 1.497911570470931e-07, "loss": 0.75365627, "num_input_tokens_seen": 158362290, "step": 7316, "time_per_iteration": 2.619607448577881 }, { "auxiliary_loss_clip": 0.01111408, "auxiliary_loss_mlp": 0.0102369, "balance_loss_clip": 1.03940582, "balance_loss_mlp": 1.0164839, "epoch": 0.8798172308062285, "flos": 28362427004160.0, "grad_norm": 1.6201314254989154, "language_loss": 0.85595918, "learning_rate": 1.494955118880048e-07, "loss": 0.87731016, "num_input_tokens_seen": 158383275, "step": 7317, "time_per_iteration": 2.7967560291290283 }, { "auxiliary_loss_clip": 0.01150308, "auxiliary_loss_mlp": 0.01021643, "balance_loss_clip": 1.04371107, "balance_loss_mlp": 1.0151844, "epoch": 0.8799374736968677, "flos": 23988974751360.0, "grad_norm": 1.8625832051271416, "language_loss": 0.72841799, "learning_rate": 1.4920014744897634e-07, "loss": 0.75013757, "num_input_tokens_seen": 158402690, "step": 7318, "time_per_iteration": 2.68392014503479 }, { "auxiliary_loss_clip": 0.01129351, "auxiliary_loss_mlp": 0.01027001, "balance_loss_clip": 1.04272437, "balance_loss_mlp": 1.01978898, "epoch": 0.8800577165875068, "flos": 25630271832960.0, "grad_norm": 1.943495049244269, "language_loss": 0.86672628, "learning_rate": 1.4890506377481392e-07, "loss": 0.88828981, "num_input_tokens_seen": 158421780, "step": 7319, "time_per_iteration": 2.7370026111602783 }, { "auxiliary_loss_clip": 0.01090704, "auxiliary_loss_mlp": 0.01023551, "balance_loss_clip": 1.03720713, "balance_loss_mlp": 1.01742101, "epoch": 0.8801779594781458, "flos": 23440331439360.0, "grad_norm": 1.7030834533872041, "language_loss": 0.64356792, "learning_rate": 1.486102609102815e-07, "loss": 0.66471052, "num_input_tokens_seen": 158442330, "step": 7320, "time_per_iteration": 2.89265513420105 }, { "auxiliary_loss_clip": 0.01132573, "auxiliary_loss_mlp": 0.0102223, "balance_loss_clip": 1.04236782, "balance_loss_mlp": 1.01546466, "epoch": 0.880298202368785, "flos": 11508580656000.0, "grad_norm": 2.634447978451085, "language_loss": 0.85916805, "learning_rate": 1.483157389001004e-07, "loss": 0.88071609, "num_input_tokens_seen": 158459890, "step": 7321, "time_per_iteration": 2.8615598678588867 }, { "auxiliary_loss_clip": 0.01136212, "auxiliary_loss_mlp": 0.0102064, "balance_loss_clip": 1.04082918, "balance_loss_mlp": 1.01291561, "epoch": 0.880418445259424, "flos": 22671447886080.0, "grad_norm": 2.0660175655852324, "language_loss": 0.79083145, "learning_rate": 1.4802149778894933e-07, "loss": 0.81239998, "num_input_tokens_seen": 158478680, "step": 7322, "time_per_iteration": 2.7057135105133057 }, { "auxiliary_loss_clip": 0.01138257, "auxiliary_loss_mlp": 0.01023973, "balance_loss_clip": 1.03821075, "balance_loss_mlp": 1.01768219, "epoch": 0.8805386881500631, "flos": 20522158709760.0, "grad_norm": 1.6663683610417948, "language_loss": 0.87769306, "learning_rate": 1.4772753762146484e-07, "loss": 0.89931536, "num_input_tokens_seen": 158497935, "step": 7323, "time_per_iteration": 2.591421127319336 }, { "auxiliary_loss_clip": 0.0114452, "auxiliary_loss_mlp": 0.01021278, "balance_loss_clip": 1.04076314, "balance_loss_mlp": 1.01430142, "epoch": 0.8806589310407023, "flos": 36538891620480.0, "grad_norm": 1.581750167520751, "language_loss": 0.70491219, "learning_rate": 1.474338584422401e-07, "loss": 0.72657013, "num_input_tokens_seen": 158523145, "step": 7324, "time_per_iteration": 2.770631790161133 }, { "auxiliary_loss_clip": 0.01147048, "auxiliary_loss_mlp": 0.01022299, "balance_loss_clip": 1.04438233, "balance_loss_mlp": 1.01538801, "epoch": 0.8807791739313413, "flos": 23440187784960.0, "grad_norm": 3.0589357484176602, "language_loss": 0.7582376, "learning_rate": 1.4714046029582595e-07, "loss": 0.77993107, "num_input_tokens_seen": 158542210, "step": 7325, "time_per_iteration": 2.6987175941467285 }, { "auxiliary_loss_clip": 0.01125955, "auxiliary_loss_mlp": 0.01023172, "balance_loss_clip": 1.040025, "balance_loss_mlp": 1.01621342, "epoch": 0.8808994168219804, "flos": 25956843310080.0, "grad_norm": 1.7545723290700608, "language_loss": 0.75767952, "learning_rate": 1.46847343226731e-07, "loss": 0.77917081, "num_input_tokens_seen": 158563250, "step": 7326, "time_per_iteration": 2.716244697570801 }, { "auxiliary_loss_clip": 0.01155225, "auxiliary_loss_mlp": 0.010264, "balance_loss_clip": 1.04429901, "balance_loss_mlp": 1.01941395, "epoch": 0.8810196597126195, "flos": 17092079303040.0, "grad_norm": 2.0978701804729387, "language_loss": 0.69515151, "learning_rate": 1.465545072794203e-07, "loss": 0.7169677, "num_input_tokens_seen": 158581125, "step": 7327, "time_per_iteration": 2.6343047618865967 }, { "auxiliary_loss_clip": 0.011042, "auxiliary_loss_mlp": 0.01022801, "balance_loss_clip": 1.04166448, "balance_loss_mlp": 1.01592278, "epoch": 0.8811399026032586, "flos": 23002831785600.0, "grad_norm": 1.674097396754825, "language_loss": 0.76057196, "learning_rate": 1.4626195249831774e-07, "loss": 0.78184199, "num_input_tokens_seen": 158602025, "step": 7328, "time_per_iteration": 2.824004650115967 }, { "auxiliary_loss_clip": 0.01146759, "auxiliary_loss_mlp": 0.01025543, "balance_loss_clip": 1.04159081, "balance_loss_mlp": 1.01844358, "epoch": 0.8812601454938976, "flos": 14463813242880.0, "grad_norm": 1.78945363535501, "language_loss": 0.71841609, "learning_rate": 1.4596967892780244e-07, "loss": 0.74013913, "num_input_tokens_seen": 158618355, "step": 7329, "time_per_iteration": 2.6211071014404297 }, { "auxiliary_loss_clip": 0.0116262, "auxiliary_loss_mlp": 0.01025428, "balance_loss_clip": 1.04487669, "balance_loss_mlp": 1.01906562, "epoch": 0.8813803883845368, "flos": 22493223578880.0, "grad_norm": 1.6654149330345132, "language_loss": 0.74991429, "learning_rate": 1.4567768661221314e-07, "loss": 0.7717948, "num_input_tokens_seen": 158638925, "step": 7330, "time_per_iteration": 2.602931022644043 }, { "auxiliary_loss_clip": 0.01153674, "auxiliary_loss_mlp": 0.00763165, "balance_loss_clip": 1.04493344, "balance_loss_mlp": 1.00031853, "epoch": 0.8815006312751759, "flos": 21506901045120.0, "grad_norm": 1.9997754687501357, "language_loss": 0.74582636, "learning_rate": 1.4538597559584442e-07, "loss": 0.76499474, "num_input_tokens_seen": 158656715, "step": 7331, "time_per_iteration": 2.7275524139404297 }, { "auxiliary_loss_clip": 0.01131506, "auxiliary_loss_mlp": 0.01025426, "balance_loss_clip": 1.04142213, "balance_loss_mlp": 1.01807642, "epoch": 0.8816208741658149, "flos": 22784566792320.0, "grad_norm": 1.7754792428530966, "language_loss": 0.78790021, "learning_rate": 1.4509454592294823e-07, "loss": 0.80946952, "num_input_tokens_seen": 158677200, "step": 7332, "time_per_iteration": 2.748170852661133 }, { "auxiliary_loss_clip": 0.01125001, "auxiliary_loss_mlp": 0.00761745, "balance_loss_clip": 1.04391074, "balance_loss_mlp": 1.00036526, "epoch": 0.8817411170564541, "flos": 17779409026560.0, "grad_norm": 1.875482080855588, "language_loss": 0.78980607, "learning_rate": 1.448033976377354e-07, "loss": 0.8086735, "num_input_tokens_seen": 158692185, "step": 7333, "time_per_iteration": 3.6027653217315674 }, { "auxiliary_loss_clip": 0.01151279, "auxiliary_loss_mlp": 0.01024031, "balance_loss_clip": 1.0421375, "balance_loss_mlp": 1.01730752, "epoch": 0.8818613599470931, "flos": 18551812112640.0, "grad_norm": 2.5822906913678882, "language_loss": 0.74189335, "learning_rate": 1.445125307843713e-07, "loss": 0.76364648, "num_input_tokens_seen": 158710410, "step": 7334, "time_per_iteration": 2.6831414699554443 }, { "auxiliary_loss_clip": 0.0114729, "auxiliary_loss_mlp": 0.01022895, "balance_loss_clip": 1.04379129, "balance_loss_mlp": 1.01598716, "epoch": 0.8819816028377322, "flos": 27599792417280.0, "grad_norm": 1.5681874134326286, "language_loss": 0.75440156, "learning_rate": 1.442219454069813e-07, "loss": 0.77610338, "num_input_tokens_seen": 158731435, "step": 7335, "time_per_iteration": 2.6399829387664795 }, { "auxiliary_loss_clip": 0.0111172, "auxiliary_loss_mlp": 0.01026191, "balance_loss_clip": 1.03918386, "balance_loss_mlp": 1.01859117, "epoch": 0.8821018457283714, "flos": 23404600385280.0, "grad_norm": 1.805800720670328, "language_loss": 0.6637519, "learning_rate": 1.4393164154964676e-07, "loss": 0.68513095, "num_input_tokens_seen": 158750965, "step": 7336, "time_per_iteration": 2.7591781616210938 }, { "auxiliary_loss_clip": 0.01148691, "auxiliary_loss_mlp": 0.01023947, "balance_loss_clip": 1.04549265, "balance_loss_mlp": 1.01717901, "epoch": 0.8822220886190104, "flos": 29132459792640.0, "grad_norm": 1.8413131493111239, "language_loss": 0.9398514, "learning_rate": 1.4364161925640649e-07, "loss": 0.96157777, "num_input_tokens_seen": 158772365, "step": 7337, "time_per_iteration": 3.6458804607391357 }, { "auxiliary_loss_clip": 0.01162947, "auxiliary_loss_mlp": 0.01027558, "balance_loss_clip": 1.04454565, "balance_loss_mlp": 1.02119195, "epoch": 0.8823423315096495, "flos": 20485422074880.0, "grad_norm": 1.8130564564364458, "language_loss": 0.85298634, "learning_rate": 1.4335187857125663e-07, "loss": 0.87489134, "num_input_tokens_seen": 158791065, "step": 7338, "time_per_iteration": 2.6147055625915527 }, { "auxiliary_loss_clip": 0.01152828, "auxiliary_loss_mlp": 0.01022221, "balance_loss_clip": 1.04498184, "balance_loss_mlp": 1.01513386, "epoch": 0.8824625744002886, "flos": 24206377818240.0, "grad_norm": 2.4640067973388304, "language_loss": 0.75609112, "learning_rate": 1.4306241953815023e-07, "loss": 0.77784157, "num_input_tokens_seen": 158812125, "step": 7339, "time_per_iteration": 3.616642475128174 }, { "auxiliary_loss_clip": 0.01152349, "auxiliary_loss_mlp": 0.01031783, "balance_loss_clip": 1.04412913, "balance_loss_mlp": 1.02455878, "epoch": 0.8825828172909277, "flos": 24679500785280.0, "grad_norm": 1.6382755399289484, "language_loss": 0.70902061, "learning_rate": 1.4277324220099862e-07, "loss": 0.73086202, "num_input_tokens_seen": 158834035, "step": 7340, "time_per_iteration": 3.551053524017334 }, { "auxiliary_loss_clip": 0.01116226, "auxiliary_loss_mlp": 0.01023607, "balance_loss_clip": 1.03718746, "balance_loss_mlp": 1.0169518, "epoch": 0.8827030601815667, "flos": 22456163721600.0, "grad_norm": 1.967168713989707, "language_loss": 0.74310088, "learning_rate": 1.4248434660366938e-07, "loss": 0.76449925, "num_input_tokens_seen": 158853510, "step": 7341, "time_per_iteration": 2.743844985961914 }, { "auxiliary_loss_clip": 0.0113756, "auxiliary_loss_mlp": 0.0102826, "balance_loss_clip": 1.04477715, "balance_loss_mlp": 1.02093506, "epoch": 0.8828233030722058, "flos": 19865639877120.0, "grad_norm": 1.9758013145910316, "language_loss": 0.70900273, "learning_rate": 1.4219573278998808e-07, "loss": 0.73066092, "num_input_tokens_seen": 158871970, "step": 7342, "time_per_iteration": 2.707883596420288 }, { "auxiliary_loss_clip": 0.01133696, "auxiliary_loss_mlp": 0.01023658, "balance_loss_clip": 1.04004216, "balance_loss_mlp": 1.01608551, "epoch": 0.882943545962845, "flos": 39347213581440.0, "grad_norm": 2.5088557478473077, "language_loss": 0.6489374, "learning_rate": 1.4190740080373685e-07, "loss": 0.67051095, "num_input_tokens_seen": 158892250, "step": 7343, "time_per_iteration": 2.7806074619293213 }, { "auxiliary_loss_clip": 0.01108178, "auxiliary_loss_mlp": 0.01026609, "balance_loss_clip": 1.04177892, "balance_loss_mlp": 1.01940584, "epoch": 0.883063788853484, "flos": 19054524908160.0, "grad_norm": 1.850247342726599, "language_loss": 0.83969223, "learning_rate": 1.4161935068865538e-07, "loss": 0.86104012, "num_input_tokens_seen": 158907395, "step": 7344, "time_per_iteration": 2.950233221054077 }, { "auxiliary_loss_clip": 0.01162829, "auxiliary_loss_mlp": 0.01025502, "balance_loss_clip": 1.04365587, "balance_loss_mlp": 1.01830792, "epoch": 0.8831840317441231, "flos": 18733196816640.0, "grad_norm": 2.221890758178015, "language_loss": 0.75689912, "learning_rate": 1.4133158248844113e-07, "loss": 0.77878249, "num_input_tokens_seen": 158926300, "step": 7345, "time_per_iteration": 2.6039254665374756 }, { "auxiliary_loss_clip": 0.01126108, "auxiliary_loss_mlp": 0.01023585, "balance_loss_clip": 1.04221749, "balance_loss_mlp": 1.01604533, "epoch": 0.8833042746347622, "flos": 26827712553600.0, "grad_norm": 1.7954049940125767, "language_loss": 0.73467028, "learning_rate": 1.4104409624674785e-07, "loss": 0.75616717, "num_input_tokens_seen": 158946085, "step": 7346, "time_per_iteration": 2.768541097640991 }, { "auxiliary_loss_clip": 0.01152698, "auxiliary_loss_mlp": 0.01033663, "balance_loss_clip": 1.04663908, "balance_loss_mlp": 1.02676678, "epoch": 0.8834245175254013, "flos": 26104077158400.0, "grad_norm": 1.8418977718763025, "language_loss": 0.78609478, "learning_rate": 1.407568920071873e-07, "loss": 0.80795842, "num_input_tokens_seen": 158964950, "step": 7347, "time_per_iteration": 2.709946632385254 }, { "auxiliary_loss_clip": 0.01169526, "auxiliary_loss_mlp": 0.01020615, "balance_loss_clip": 1.04660726, "balance_loss_mlp": 1.01322377, "epoch": 0.8835447604160404, "flos": 30629036977920.0, "grad_norm": 3.1903350754573427, "language_loss": 0.68383771, "learning_rate": 1.4046996981332782e-07, "loss": 0.70573914, "num_input_tokens_seen": 158984835, "step": 7348, "time_per_iteration": 2.6688780784606934 }, { "auxiliary_loss_clip": 0.01122275, "auxiliary_loss_mlp": 0.01022748, "balance_loss_clip": 1.03988028, "balance_loss_mlp": 1.01537776, "epoch": 0.8836650033066795, "flos": 24718356322560.0, "grad_norm": 2.1526581492396475, "language_loss": 0.78630245, "learning_rate": 1.4018332970869516e-07, "loss": 0.80775267, "num_input_tokens_seen": 159002775, "step": 7349, "time_per_iteration": 2.7519280910491943 }, { "auxiliary_loss_clip": 0.01128344, "auxiliary_loss_mlp": 0.01023141, "balance_loss_clip": 1.04196525, "balance_loss_mlp": 1.01639056, "epoch": 0.8837852461973186, "flos": 25413371556480.0, "grad_norm": 1.8540547039930992, "language_loss": 0.84751916, "learning_rate": 1.398969717367733e-07, "loss": 0.86903405, "num_input_tokens_seen": 159024100, "step": 7350, "time_per_iteration": 2.784491539001465 }, { "auxiliary_loss_clip": 0.01107498, "auxiliary_loss_mlp": 0.01025199, "balance_loss_clip": 1.04319572, "balance_loss_mlp": 1.01899743, "epoch": 0.8839054890879576, "flos": 17822574195840.0, "grad_norm": 1.773543566915141, "language_loss": 0.76390219, "learning_rate": 1.396108959410014e-07, "loss": 0.78522921, "num_input_tokens_seen": 159043315, "step": 7351, "time_per_iteration": 2.7038753032684326 }, { "auxiliary_loss_clip": 0.01148445, "auxiliary_loss_mlp": 0.00762621, "balance_loss_clip": 1.04500461, "balance_loss_mlp": 1.00032449, "epoch": 0.8840257319785968, "flos": 23769021818880.0, "grad_norm": 1.6672788801270013, "language_loss": 0.81081527, "learning_rate": 1.3932510236477745e-07, "loss": 0.82992595, "num_input_tokens_seen": 159063985, "step": 7352, "time_per_iteration": 2.7274186611175537 }, { "auxiliary_loss_clip": 0.0114843, "auxiliary_loss_mlp": 0.01024221, "balance_loss_clip": 1.04169989, "balance_loss_mlp": 1.01686621, "epoch": 0.8841459748692359, "flos": 29059776622080.0, "grad_norm": 2.103254462457511, "language_loss": 0.56121218, "learning_rate": 1.3903959105145636e-07, "loss": 0.58293867, "num_input_tokens_seen": 159084475, "step": 7353, "time_per_iteration": 2.6778509616851807 }, { "auxiliary_loss_clip": 0.01165703, "auxiliary_loss_mlp": 0.01023482, "balance_loss_clip": 1.04520071, "balance_loss_mlp": 1.01664805, "epoch": 0.8842662177598749, "flos": 24311523905280.0, "grad_norm": 2.0493406692423997, "language_loss": 0.83196855, "learning_rate": 1.387543620443492e-07, "loss": 0.8538605, "num_input_tokens_seen": 159101320, "step": 7354, "time_per_iteration": 2.5573666095733643 }, { "auxiliary_loss_clip": 0.01163211, "auxiliary_loss_mlp": 0.01025649, "balance_loss_clip": 1.04571044, "balance_loss_mlp": 1.01912475, "epoch": 0.8843864606505141, "flos": 25007867942400.0, "grad_norm": 1.7246239052273389, "language_loss": 0.84399056, "learning_rate": 1.3846941538672606e-07, "loss": 0.86587912, "num_input_tokens_seen": 159120025, "step": 7355, "time_per_iteration": 2.562065839767456 }, { "auxiliary_loss_clip": 0.01113544, "auxiliary_loss_mlp": 0.01022807, "balance_loss_clip": 1.04273701, "balance_loss_mlp": 1.01596439, "epoch": 0.8845067035411531, "flos": 28183915388160.0, "grad_norm": 2.1635010084432094, "language_loss": 0.80948305, "learning_rate": 1.3818475112181193e-07, "loss": 0.83084661, "num_input_tokens_seen": 159138820, "step": 7356, "time_per_iteration": 2.777174472808838 }, { "auxiliary_loss_clip": 0.01135126, "auxiliary_loss_mlp": 0.01019693, "balance_loss_clip": 1.04207063, "balance_loss_mlp": 1.01291859, "epoch": 0.8846269464317922, "flos": 12853219311360.0, "grad_norm": 2.284630376184178, "language_loss": 0.7978726, "learning_rate": 1.3790036929279091e-07, "loss": 0.8194207, "num_input_tokens_seen": 159155975, "step": 7357, "time_per_iteration": 2.600107192993164 }, { "auxiliary_loss_clip": 0.011514, "auxiliary_loss_mlp": 0.00761869, "balance_loss_clip": 1.04509449, "balance_loss_mlp": 1.00028908, "epoch": 0.8847471893224313, "flos": 18624351628800.0, "grad_norm": 2.219289943254991, "language_loss": 0.58664739, "learning_rate": 1.3761626994280363e-07, "loss": 0.60578012, "num_input_tokens_seen": 159173445, "step": 7358, "time_per_iteration": 2.617225408554077 }, { "auxiliary_loss_clip": 0.01127864, "auxiliary_loss_mlp": 0.01020232, "balance_loss_clip": 1.04177177, "balance_loss_mlp": 1.01345778, "epoch": 0.8848674322130704, "flos": 35769433449600.0, "grad_norm": 2.1394910609561, "language_loss": 0.7356993, "learning_rate": 1.3733245311494735e-07, "loss": 0.75718021, "num_input_tokens_seen": 159196100, "step": 7359, "time_per_iteration": 3.7582991123199463 }, { "auxiliary_loss_clip": 0.01152327, "auxiliary_loss_mlp": 0.01026149, "balance_loss_clip": 1.04488754, "balance_loss_mlp": 1.01926517, "epoch": 0.8849876751037095, "flos": 24243760897920.0, "grad_norm": 2.2790324635833143, "language_loss": 0.70360363, "learning_rate": 1.3704891885227676e-07, "loss": 0.72538841, "num_input_tokens_seen": 159216145, "step": 7360, "time_per_iteration": 2.697330951690674 }, { "auxiliary_loss_clip": 0.01121819, "auxiliary_loss_mlp": 0.01029825, "balance_loss_clip": 1.03827369, "balance_loss_mlp": 1.02216029, "epoch": 0.8851079179943486, "flos": 21500580251520.0, "grad_norm": 1.8611025010608084, "language_loss": 0.78432369, "learning_rate": 1.367656671978037e-07, "loss": 0.80584019, "num_input_tokens_seen": 159233610, "step": 7361, "time_per_iteration": 2.783442974090576 }, { "auxiliary_loss_clip": 0.01143257, "auxiliary_loss_mlp": 0.01021491, "balance_loss_clip": 1.04248333, "balance_loss_mlp": 1.01492882, "epoch": 0.8852281608849877, "flos": 15300711198720.0, "grad_norm": 2.010836178451157, "language_loss": 0.73684764, "learning_rate": 1.36482698194498e-07, "loss": 0.75849515, "num_input_tokens_seen": 159250155, "step": 7362, "time_per_iteration": 3.643077850341797 }, { "auxiliary_loss_clip": 0.01133831, "auxiliary_loss_mlp": 0.01023013, "balance_loss_clip": 1.03919125, "balance_loss_mlp": 1.0158937, "epoch": 0.8853484037756267, "flos": 23295719283840.0, "grad_norm": 1.9332913168759227, "language_loss": 0.72161436, "learning_rate": 1.3620001188528506e-07, "loss": 0.74318278, "num_input_tokens_seen": 159270875, "step": 7363, "time_per_iteration": 2.769676923751831 }, { "auxiliary_loss_clip": 0.01155121, "auxiliary_loss_mlp": 0.01025147, "balance_loss_clip": 1.04482973, "balance_loss_mlp": 1.01802468, "epoch": 0.8854686466662659, "flos": 25114773795840.0, "grad_norm": 7.312806073921662, "language_loss": 0.73645079, "learning_rate": 1.3591760831304865e-07, "loss": 0.75825351, "num_input_tokens_seen": 159288565, "step": 7364, "time_per_iteration": 3.544142723083496 }, { "auxiliary_loss_clip": 0.01164339, "auxiliary_loss_mlp": 0.0102499, "balance_loss_clip": 1.04581857, "balance_loss_mlp": 1.01784301, "epoch": 0.885588889556905, "flos": 21390873137280.0, "grad_norm": 1.8225185455261672, "language_loss": 0.79230481, "learning_rate": 1.356354875206287e-07, "loss": 0.81419814, "num_input_tokens_seen": 159306400, "step": 7365, "time_per_iteration": 2.5810978412628174 }, { "auxiliary_loss_clip": 0.01124068, "auxiliary_loss_mlp": 0.0102798, "balance_loss_clip": 1.04460168, "balance_loss_mlp": 1.02100921, "epoch": 0.885709132447544, "flos": 26906752431360.0, "grad_norm": 1.9013590449566793, "language_loss": 0.70093209, "learning_rate": 1.3535364955082296e-07, "loss": 0.72245252, "num_input_tokens_seen": 159326250, "step": 7366, "time_per_iteration": 2.732025146484375 }, { "auxiliary_loss_clip": 0.01163115, "auxiliary_loss_mlp": 0.01027907, "balance_loss_clip": 1.04573703, "balance_loss_mlp": 1.02103186, "epoch": 0.8858293753381832, "flos": 26103394800000.0, "grad_norm": 1.8603167090734614, "language_loss": 0.64635551, "learning_rate": 1.3507209444638613e-07, "loss": 0.6682657, "num_input_tokens_seen": 159348250, "step": 7367, "time_per_iteration": 3.595496654510498 }, { "auxiliary_loss_clip": 0.01149152, "auxiliary_loss_mlp": 0.0101986, "balance_loss_clip": 1.04338408, "balance_loss_mlp": 1.01268101, "epoch": 0.8859496182288222, "flos": 23292810282240.0, "grad_norm": 1.8646209441098294, "language_loss": 0.73738873, "learning_rate": 1.347908222500298e-07, "loss": 0.7590788, "num_input_tokens_seen": 159368325, "step": 7368, "time_per_iteration": 2.643167018890381 }, { "auxiliary_loss_clip": 0.01107453, "auxiliary_loss_mlp": 0.01025938, "balance_loss_clip": 1.03955829, "balance_loss_mlp": 1.01929832, "epoch": 0.8860698611194613, "flos": 16872916469760.0, "grad_norm": 1.792211356794916, "language_loss": 0.69607604, "learning_rate": 1.3450983300442276e-07, "loss": 0.71740997, "num_input_tokens_seen": 159387555, "step": 7369, "time_per_iteration": 2.6939852237701416 }, { "auxiliary_loss_clip": 0.01153874, "auxiliary_loss_mlp": 0.01022192, "balance_loss_clip": 1.04477048, "balance_loss_mlp": 1.01558447, "epoch": 0.8861901040101005, "flos": 24681404206080.0, "grad_norm": 1.9348951312408165, "language_loss": 0.73389578, "learning_rate": 1.3422912675219068e-07, "loss": 0.75565648, "num_input_tokens_seen": 159407310, "step": 7370, "time_per_iteration": 2.695908308029175 }, { "auxiliary_loss_clip": 0.01163779, "auxiliary_loss_mlp": 0.01026474, "balance_loss_clip": 1.04623079, "balance_loss_mlp": 1.02013254, "epoch": 0.8863103469007395, "flos": 24423026699520.0, "grad_norm": 1.7604677451137405, "language_loss": 0.790034, "learning_rate": 1.339487035359166e-07, "loss": 0.81193656, "num_input_tokens_seen": 159427680, "step": 7371, "time_per_iteration": 2.6354246139526367 }, { "auxiliary_loss_clip": 0.01139103, "auxiliary_loss_mlp": 0.0076147, "balance_loss_clip": 1.04451656, "balance_loss_mlp": 1.00032055, "epoch": 0.8864305897913786, "flos": 22053964158720.0, "grad_norm": 1.5272410789901996, "language_loss": 0.84692836, "learning_rate": 1.336685633981409e-07, "loss": 0.86593407, "num_input_tokens_seen": 159448765, "step": 7372, "time_per_iteration": 2.6439337730407715 }, { "auxiliary_loss_clip": 0.0115295, "auxiliary_loss_mlp": 0.01022818, "balance_loss_clip": 1.04420316, "balance_loss_mlp": 1.01570153, "epoch": 0.8865508326820177, "flos": 19099449843840.0, "grad_norm": 1.8028232393706307, "language_loss": 0.75050825, "learning_rate": 1.333887063813597e-07, "loss": 0.77226591, "num_input_tokens_seen": 159466870, "step": 7373, "time_per_iteration": 2.6081502437591553 }, { "auxiliary_loss_clip": 0.01138833, "auxiliary_loss_mlp": 0.01023412, "balance_loss_clip": 1.04127204, "balance_loss_mlp": 1.01663446, "epoch": 0.8866710755726568, "flos": 15414189240960.0, "grad_norm": 1.8511753317731303, "language_loss": 0.66225541, "learning_rate": 1.331091325280278e-07, "loss": 0.68387783, "num_input_tokens_seen": 159485840, "step": 7374, "time_per_iteration": 2.7389426231384277 }, { "auxiliary_loss_clip": 0.01099943, "auxiliary_loss_mlp": 0.01024738, "balance_loss_clip": 1.03805161, "balance_loss_mlp": 1.01729333, "epoch": 0.8867913184632958, "flos": 20083689388800.0, "grad_norm": 1.6701040773265599, "language_loss": 0.78623968, "learning_rate": 1.3282984188055625e-07, "loss": 0.80748653, "num_input_tokens_seen": 159505630, "step": 7375, "time_per_iteration": 2.795354127883911 }, { "auxiliary_loss_clip": 0.01164006, "auxiliary_loss_mlp": 0.01023322, "balance_loss_clip": 1.0448277, "balance_loss_mlp": 1.0162828, "epoch": 0.8869115613539349, "flos": 23365852588800.0, "grad_norm": 2.028403330495967, "language_loss": 0.79512191, "learning_rate": 1.3255083448131288e-07, "loss": 0.8169952, "num_input_tokens_seen": 159524675, "step": 7376, "time_per_iteration": 2.6071555614471436 }, { "auxiliary_loss_clip": 0.01153872, "auxiliary_loss_mlp": 0.01024493, "balance_loss_clip": 1.04262555, "balance_loss_mlp": 1.01771879, "epoch": 0.8870318042445741, "flos": 21286840371840.0, "grad_norm": 1.97674660023906, "language_loss": 0.79323483, "learning_rate": 1.3227211037262365e-07, "loss": 0.81501853, "num_input_tokens_seen": 159541915, "step": 7377, "time_per_iteration": 2.6521496772766113 }, { "auxiliary_loss_clip": 0.01111302, "auxiliary_loss_mlp": 0.01025049, "balance_loss_clip": 1.03887558, "balance_loss_mlp": 1.01797354, "epoch": 0.8871520471352131, "flos": 20010862563840.0, "grad_norm": 2.3412854355689694, "language_loss": 0.8546561, "learning_rate": 1.319936695967696e-07, "loss": 0.8760196, "num_input_tokens_seen": 159559740, "step": 7378, "time_per_iteration": 2.7119977474212646 }, { "auxiliary_loss_clip": 0.01170949, "auxiliary_loss_mlp": 0.01026749, "balance_loss_clip": 1.04608703, "balance_loss_mlp": 1.01923025, "epoch": 0.8872722900258522, "flos": 22601422321920.0, "grad_norm": 2.4394303057842177, "language_loss": 0.82174218, "learning_rate": 1.3171551219599097e-07, "loss": 0.84371918, "num_input_tokens_seen": 159578265, "step": 7379, "time_per_iteration": 2.5848402976989746 }, { "auxiliary_loss_clip": 0.01167089, "auxiliary_loss_mlp": 0.01029529, "balance_loss_clip": 1.04820573, "balance_loss_mlp": 1.02212608, "epoch": 0.8873925329164913, "flos": 22163276223360.0, "grad_norm": 2.2537615883002973, "language_loss": 0.78105557, "learning_rate": 1.3143763821248377e-07, "loss": 0.80302179, "num_input_tokens_seen": 159595350, "step": 7380, "time_per_iteration": 2.603397846221924 }, { "auxiliary_loss_clip": 0.01163759, "auxiliary_loss_mlp": 0.01023578, "balance_loss_clip": 1.04611015, "balance_loss_mlp": 1.01701295, "epoch": 0.8875127758071304, "flos": 19208223204480.0, "grad_norm": 1.865175826780953, "language_loss": 0.72029305, "learning_rate": 1.3116004768840118e-07, "loss": 0.74216646, "num_input_tokens_seen": 159613725, "step": 7381, "time_per_iteration": 2.6400012969970703 }, { "auxiliary_loss_clip": 0.01165454, "auxiliary_loss_mlp": 0.0102699, "balance_loss_clip": 1.04528236, "balance_loss_mlp": 1.01995969, "epoch": 0.8876330186977694, "flos": 18110900666880.0, "grad_norm": 1.673894447149643, "language_loss": 0.74009037, "learning_rate": 1.3088274066585348e-07, "loss": 0.76201487, "num_input_tokens_seen": 159631335, "step": 7382, "time_per_iteration": 2.6107428073883057 }, { "auxiliary_loss_clip": 0.01126876, "auxiliary_loss_mlp": 0.01022011, "balance_loss_clip": 1.04068291, "balance_loss_mlp": 1.01486993, "epoch": 0.8877532615884086, "flos": 22009434272640.0, "grad_norm": 2.2974769830762467, "language_loss": 0.90249467, "learning_rate": 1.3060571718690749e-07, "loss": 0.92398351, "num_input_tokens_seen": 159648830, "step": 7383, "time_per_iteration": 2.7659108638763428 }, { "auxiliary_loss_clip": 0.01035105, "auxiliary_loss_mlp": 0.00753812, "balance_loss_clip": 1.00983679, "balance_loss_mlp": 1.00020826, "epoch": 0.8878735044790477, "flos": 72136924346880.0, "grad_norm": 0.787896062108697, "language_loss": 0.56860304, "learning_rate": 1.3032897729358805e-07, "loss": 0.58649218, "num_input_tokens_seen": 159709785, "step": 7384, "time_per_iteration": 3.31854248046875 }, { "auxiliary_loss_clip": 0.0108274, "auxiliary_loss_mlp": 0.00762478, "balance_loss_clip": 1.03461778, "balance_loss_mlp": 1.00040317, "epoch": 0.8879937473696867, "flos": 27526355061120.0, "grad_norm": 2.6961100516643857, "language_loss": 0.80360633, "learning_rate": 1.3005252102787645e-07, "loss": 0.8220585, "num_input_tokens_seen": 159728725, "step": 7385, "time_per_iteration": 3.675999164581299 }, { "auxiliary_loss_clip": 0.01154768, "auxiliary_loss_mlp": 0.01022629, "balance_loss_clip": 1.0451169, "balance_loss_mlp": 1.01560116, "epoch": 0.8881139902603259, "flos": 22234091886720.0, "grad_norm": 1.5266200932494387, "language_loss": 0.73249799, "learning_rate": 1.297763484317105e-07, "loss": 0.75427192, "num_input_tokens_seen": 159747020, "step": 7386, "time_per_iteration": 2.6809632778167725 }, { "auxiliary_loss_clip": 0.01105892, "auxiliary_loss_mlp": 0.00762652, "balance_loss_clip": 1.0379231, "balance_loss_mlp": 1.00037611, "epoch": 0.888234233150965, "flos": 20299548170880.0, "grad_norm": 2.3694038341931183, "language_loss": 0.70259523, "learning_rate": 1.2950045954698551e-07, "loss": 0.72128069, "num_input_tokens_seen": 159764855, "step": 7387, "time_per_iteration": 2.7491867542266846 }, { "auxiliary_loss_clip": 0.0111607, "auxiliary_loss_mlp": 0.01025384, "balance_loss_clip": 1.04188037, "balance_loss_mlp": 1.0185746, "epoch": 0.888354476041604, "flos": 18147996437760.0, "grad_norm": 1.6221960725881368, "language_loss": 0.7580508, "learning_rate": 1.2922485441555343e-07, "loss": 0.77946532, "num_input_tokens_seen": 159783935, "step": 7388, "time_per_iteration": 3.7265148162841797 }, { "auxiliary_loss_clip": 0.01163191, "auxiliary_loss_mlp": 0.01022048, "balance_loss_clip": 1.0442189, "balance_loss_mlp": 1.0156343, "epoch": 0.8884747189322432, "flos": 22014282608640.0, "grad_norm": 1.7435569515816058, "language_loss": 0.81899333, "learning_rate": 1.2894953307922363e-07, "loss": 0.8408457, "num_input_tokens_seen": 159802895, "step": 7389, "time_per_iteration": 2.636754035949707 }, { "auxiliary_loss_clip": 0.01116072, "auxiliary_loss_mlp": 0.01024128, "balance_loss_clip": 1.03983819, "balance_loss_mlp": 1.0171634, "epoch": 0.8885949618228822, "flos": 19786779567360.0, "grad_norm": 3.3929040804211277, "language_loss": 0.84267282, "learning_rate": 1.2867449557976208e-07, "loss": 0.86407483, "num_input_tokens_seen": 159820995, "step": 7390, "time_per_iteration": 2.7651143074035645 }, { "auxiliary_loss_clip": 0.01149059, "auxiliary_loss_mlp": 0.01025448, "balance_loss_clip": 1.04450893, "balance_loss_mlp": 1.01807213, "epoch": 0.8887152047135213, "flos": 20047599198720.0, "grad_norm": 1.9039674970027378, "language_loss": 0.75708103, "learning_rate": 1.283997419588916e-07, "loss": 0.77882612, "num_input_tokens_seen": 159840465, "step": 7391, "time_per_iteration": 3.5311691761016846 }, { "auxiliary_loss_clip": 0.01153123, "auxiliary_loss_mlp": 0.01023488, "balance_loss_clip": 1.04279423, "balance_loss_mlp": 1.01626086, "epoch": 0.8888354476041604, "flos": 18588117784320.0, "grad_norm": 1.9259934589587973, "language_loss": 0.62112403, "learning_rate": 1.2812527225829216e-07, "loss": 0.64289016, "num_input_tokens_seen": 159858690, "step": 7392, "time_per_iteration": 2.6355276107788086 }, { "auxiliary_loss_clip": 0.01156231, "auxiliary_loss_mlp": 0.01030551, "balance_loss_clip": 1.04504287, "balance_loss_mlp": 1.02259946, "epoch": 0.8889556904947995, "flos": 21689794120320.0, "grad_norm": 1.937035248817835, "language_loss": 0.76479155, "learning_rate": 1.2785108651960052e-07, "loss": 0.78665936, "num_input_tokens_seen": 159880325, "step": 7393, "time_per_iteration": 3.454632520675659 }, { "auxiliary_loss_clip": 0.01151753, "auxiliary_loss_mlp": 0.01024198, "balance_loss_clip": 1.04273808, "balance_loss_mlp": 1.01742685, "epoch": 0.8890759333854386, "flos": 27381204201600.0, "grad_norm": 2.304090441109236, "language_loss": 0.8080591, "learning_rate": 1.2757718478441094e-07, "loss": 0.82981861, "num_input_tokens_seen": 159901070, "step": 7394, "time_per_iteration": 2.694096088409424 }, { "auxiliary_loss_clip": 0.01133868, "auxiliary_loss_mlp": 0.01026454, "balance_loss_clip": 1.04091358, "balance_loss_mlp": 1.0193193, "epoch": 0.8891961762760777, "flos": 24498834353280.0, "grad_norm": 2.116831459498727, "language_loss": 0.77399015, "learning_rate": 1.2730356709427302e-07, "loss": 0.79559332, "num_input_tokens_seen": 159919750, "step": 7395, "time_per_iteration": 2.6911487579345703 }, { "auxiliary_loss_clip": 0.01148982, "auxiliary_loss_mlp": 0.01027635, "balance_loss_clip": 1.04610872, "balance_loss_mlp": 1.02071774, "epoch": 0.8893164191667168, "flos": 41499770895360.0, "grad_norm": 1.643883142257371, "language_loss": 0.59863508, "learning_rate": 1.2703023349069542e-07, "loss": 0.62040126, "num_input_tokens_seen": 159944600, "step": 7396, "time_per_iteration": 2.861041307449341 }, { "auxiliary_loss_clip": 0.01145512, "auxiliary_loss_mlp": 0.01027683, "balance_loss_clip": 1.04308057, "balance_loss_mlp": 1.02069163, "epoch": 0.8894366620573558, "flos": 33583623120000.0, "grad_norm": 1.8091600979618832, "language_loss": 0.61711574, "learning_rate": 1.2675718401514223e-07, "loss": 0.63884765, "num_input_tokens_seen": 159968780, "step": 7397, "time_per_iteration": 2.697268486022949 }, { "auxiliary_loss_clip": 0.01136209, "auxiliary_loss_mlp": 0.01023013, "balance_loss_clip": 1.04240656, "balance_loss_mlp": 1.01581025, "epoch": 0.889556904947995, "flos": 16909832672640.0, "grad_norm": 2.245038101603853, "language_loss": 0.74476713, "learning_rate": 1.264844187090346e-07, "loss": 0.76635939, "num_input_tokens_seen": 159985905, "step": 7398, "time_per_iteration": 2.630293607711792 }, { "auxiliary_loss_clip": 0.01130249, "auxiliary_loss_mlp": 0.0102663, "balance_loss_clip": 1.0392344, "balance_loss_mlp": 1.01960266, "epoch": 0.889677147838634, "flos": 26030855283840.0, "grad_norm": 1.6820301144502565, "language_loss": 0.75320792, "learning_rate": 1.262119376137516e-07, "loss": 0.7747767, "num_input_tokens_seen": 160006965, "step": 7399, "time_per_iteration": 2.688326120376587 }, { "auxiliary_loss_clip": 0.01140018, "auxiliary_loss_mlp": 0.01019359, "balance_loss_clip": 1.03986168, "balance_loss_mlp": 1.01251626, "epoch": 0.8897973907292731, "flos": 26468283110400.0, "grad_norm": 1.6443331574761637, "language_loss": 0.84982324, "learning_rate": 1.2593974077062707e-07, "loss": 0.87141699, "num_input_tokens_seen": 160028585, "step": 7400, "time_per_iteration": 2.6313655376434326 }, { "auxiliary_loss_clip": 0.01114415, "auxiliary_loss_mlp": 0.01024972, "balance_loss_clip": 1.04022384, "balance_loss_mlp": 1.01753306, "epoch": 0.8899176336199123, "flos": 26249694894720.0, "grad_norm": 2.007878851886122, "language_loss": 0.6344924, "learning_rate": 1.2566782822095423e-07, "loss": 0.65588617, "num_input_tokens_seen": 160048840, "step": 7401, "time_per_iteration": 2.6943907737731934 }, { "auxiliary_loss_clip": 0.01128924, "auxiliary_loss_mlp": 0.01023161, "balance_loss_clip": 1.04418802, "balance_loss_mlp": 1.01629734, "epoch": 0.8900378765105513, "flos": 20811742156800.0, "grad_norm": 2.688259023987226, "language_loss": 0.7130596, "learning_rate": 1.2539620000598162e-07, "loss": 0.73458046, "num_input_tokens_seen": 160068175, "step": 7402, "time_per_iteration": 2.7264931201934814 }, { "auxiliary_loss_clip": 0.01162421, "auxiliary_loss_mlp": 0.01023305, "balance_loss_clip": 1.04402065, "balance_loss_mlp": 1.01592922, "epoch": 0.8901581194011904, "flos": 16472333018880.0, "grad_norm": 1.8415711332903504, "language_loss": 0.79644769, "learning_rate": 1.2512485616691492e-07, "loss": 0.81830502, "num_input_tokens_seen": 160085230, "step": 7403, "time_per_iteration": 2.6056599617004395 }, { "auxiliary_loss_clip": 0.01126001, "auxiliary_loss_mlp": 0.01031322, "balance_loss_clip": 1.04224563, "balance_loss_mlp": 1.02393162, "epoch": 0.8902783622918296, "flos": 35155253773440.0, "grad_norm": 1.4241507269050806, "language_loss": 0.80785096, "learning_rate": 1.2485379674491681e-07, "loss": 0.8294242, "num_input_tokens_seen": 160111425, "step": 7404, "time_per_iteration": 2.8429160118103027 }, { "auxiliary_loss_clip": 0.0113867, "auxiliary_loss_mlp": 0.01027726, "balance_loss_clip": 1.04542136, "balance_loss_mlp": 1.0202992, "epoch": 0.8903986051824686, "flos": 17201068145280.0, "grad_norm": 2.5302309011058313, "language_loss": 0.79112697, "learning_rate": 1.2458302178110657e-07, "loss": 0.81279093, "num_input_tokens_seen": 160129790, "step": 7405, "time_per_iteration": 2.681103229522705 }, { "auxiliary_loss_clip": 0.0110937, "auxiliary_loss_mlp": 0.01021859, "balance_loss_clip": 1.03756368, "balance_loss_mlp": 1.01519537, "epoch": 0.8905188480731077, "flos": 25483863997440.0, "grad_norm": 2.4855416708025064, "language_loss": 0.82375866, "learning_rate": 1.2431253131656118e-07, "loss": 0.8450709, "num_input_tokens_seen": 160149265, "step": 7406, "time_per_iteration": 2.7609238624572754 }, { "auxiliary_loss_clip": 0.01130915, "auxiliary_loss_mlp": 0.01020843, "balance_loss_clip": 1.04296982, "balance_loss_mlp": 1.01394665, "epoch": 0.8906390909637467, "flos": 23365888502400.0, "grad_norm": 2.1644730437035204, "language_loss": 0.76758629, "learning_rate": 1.240423253923133e-07, "loss": 0.78910387, "num_input_tokens_seen": 160168870, "step": 7407, "time_per_iteration": 2.6819076538085938 }, { "auxiliary_loss_clip": 0.0115305, "auxiliary_loss_mlp": 0.0102611, "balance_loss_clip": 1.04401493, "balance_loss_mlp": 1.01913679, "epoch": 0.8907593338543859, "flos": 21068790860160.0, "grad_norm": 8.16772683438691, "language_loss": 0.69314778, "learning_rate": 1.237724040493533e-07, "loss": 0.71493936, "num_input_tokens_seen": 160187495, "step": 7408, "time_per_iteration": 2.6372807025909424 }, { "auxiliary_loss_clip": 0.01170458, "auxiliary_loss_mlp": 0.01027479, "balance_loss_clip": 1.04835653, "balance_loss_mlp": 1.01962876, "epoch": 0.8908795767450249, "flos": 21869562712320.0, "grad_norm": 2.812310645491133, "language_loss": 0.73079407, "learning_rate": 1.2350276732862773e-07, "loss": 0.75277352, "num_input_tokens_seen": 160208520, "step": 7409, "time_per_iteration": 2.6081199645996094 }, { "auxiliary_loss_clip": 0.01054872, "auxiliary_loss_mlp": 0.010014, "balance_loss_clip": 1.00972545, "balance_loss_mlp": 1.00033271, "epoch": 0.890999819635664, "flos": 66307869348480.0, "grad_norm": 0.847639537069281, "language_loss": 0.56649709, "learning_rate": 1.2323341527103993e-07, "loss": 0.58705974, "num_input_tokens_seen": 160263720, "step": 7410, "time_per_iteration": 3.202141761779785 }, { "auxiliary_loss_clip": 0.01162537, "auxiliary_loss_mlp": 0.01023133, "balance_loss_clip": 1.04449892, "balance_loss_mlp": 1.01674032, "epoch": 0.8911200625263032, "flos": 26869908055680.0, "grad_norm": 2.34894650105175, "language_loss": 0.85175741, "learning_rate": 1.2296434791745135e-07, "loss": 0.87361413, "num_input_tokens_seen": 160282170, "step": 7411, "time_per_iteration": 3.529440402984619 }, { "auxiliary_loss_clip": 0.01154273, "auxiliary_loss_mlp": 0.01020587, "balance_loss_clip": 1.04659677, "balance_loss_mlp": 1.01366103, "epoch": 0.8912403054169422, "flos": 20885825957760.0, "grad_norm": 2.073090076347989, "language_loss": 0.76717162, "learning_rate": 1.2269556530867875e-07, "loss": 0.78892016, "num_input_tokens_seen": 160300725, "step": 7412, "time_per_iteration": 2.6516244411468506 }, { "auxiliary_loss_clip": 0.01172062, "auxiliary_loss_mlp": 0.01030184, "balance_loss_clip": 1.04812431, "balance_loss_mlp": 1.0220691, "epoch": 0.8913605483075813, "flos": 27016567286400.0, "grad_norm": 2.8788792592715797, "language_loss": 0.81963843, "learning_rate": 1.2242706748549614e-07, "loss": 0.84166086, "num_input_tokens_seen": 160318720, "step": 7413, "time_per_iteration": 2.666343927383423 }, { "auxiliary_loss_clip": 0.01135986, "auxiliary_loss_mlp": 0.01023411, "balance_loss_clip": 1.03840721, "balance_loss_mlp": 1.01655602, "epoch": 0.8914807911982204, "flos": 23621500661760.0, "grad_norm": 4.793369530617937, "language_loss": 0.82423884, "learning_rate": 1.2215885448863473e-07, "loss": 0.84583282, "num_input_tokens_seen": 160339595, "step": 7414, "time_per_iteration": 3.8039047718048096 }, { "auxiliary_loss_clip": 0.01137541, "auxiliary_loss_mlp": 0.01024642, "balance_loss_clip": 1.04370093, "balance_loss_mlp": 1.01774871, "epoch": 0.8916010340888595, "flos": 24462277286400.0, "grad_norm": 5.219229518513067, "language_loss": 0.80634522, "learning_rate": 1.2189092635878152e-07, "loss": 0.82796705, "num_input_tokens_seen": 160361045, "step": 7415, "time_per_iteration": 2.712311029434204 }, { "auxiliary_loss_clip": 0.01113601, "auxiliary_loss_mlp": 0.01025452, "balance_loss_clip": 1.03950107, "balance_loss_mlp": 1.01832342, "epoch": 0.8917212769794985, "flos": 21215773313280.0, "grad_norm": 1.6380240561103458, "language_loss": 0.77288592, "learning_rate": 1.216232831365822e-07, "loss": 0.79427648, "num_input_tokens_seen": 160379990, "step": 7416, "time_per_iteration": 2.7154834270477295 }, { "auxiliary_loss_clip": 0.01142639, "auxiliary_loss_mlp": 0.01025782, "balance_loss_clip": 1.04339504, "balance_loss_mlp": 1.01806271, "epoch": 0.8918415198701377, "flos": 25513992529920.0, "grad_norm": 1.7292768916864134, "language_loss": 0.80725074, "learning_rate": 1.2135592486263678e-07, "loss": 0.82893491, "num_input_tokens_seen": 160399240, "step": 7417, "time_per_iteration": 3.607553243637085 }, { "auxiliary_loss_clip": 0.01134989, "auxiliary_loss_mlp": 0.0102213, "balance_loss_clip": 1.0419867, "balance_loss_mlp": 1.01518881, "epoch": 0.8919617627607768, "flos": 37853006693760.0, "grad_norm": 1.7062655248543732, "language_loss": 0.61033928, "learning_rate": 1.2108885157750415e-07, "loss": 0.63191044, "num_input_tokens_seen": 160421600, "step": 7418, "time_per_iteration": 3.7237048149108887 }, { "auxiliary_loss_clip": 0.01117704, "auxiliary_loss_mlp": 0.0076164, "balance_loss_clip": 1.04309857, "balance_loss_mlp": 1.00030851, "epoch": 0.8920820056514158, "flos": 26213676531840.0, "grad_norm": 1.865435452240683, "language_loss": 0.80500793, "learning_rate": 1.2082206332169897e-07, "loss": 0.8238014, "num_input_tokens_seen": 160441695, "step": 7419, "time_per_iteration": 2.8149659633636475 }, { "auxiliary_loss_clip": 0.01133172, "auxiliary_loss_mlp": 0.0102642, "balance_loss_clip": 1.04403472, "balance_loss_mlp": 1.01980984, "epoch": 0.892202248542055, "flos": 17383135207680.0, "grad_norm": 2.4215260767518427, "language_loss": 0.73088449, "learning_rate": 1.2055556013569225e-07, "loss": 0.75248039, "num_input_tokens_seen": 160457205, "step": 7420, "time_per_iteration": 2.627798318862915 }, { "auxiliary_loss_clip": 0.01138333, "auxiliary_loss_mlp": 0.0102217, "balance_loss_clip": 1.04181409, "balance_loss_mlp": 1.01458836, "epoch": 0.892322491432694, "flos": 21324223451520.0, "grad_norm": 1.7985816818642788, "language_loss": 0.82247329, "learning_rate": 1.2028934205991315e-07, "loss": 0.84407836, "num_input_tokens_seen": 160476525, "step": 7421, "time_per_iteration": 2.6140503883361816 }, { "auxiliary_loss_clip": 0.0114898, "auxiliary_loss_mlp": 0.01028081, "balance_loss_clip": 1.04147053, "balance_loss_mlp": 1.02134538, "epoch": 0.8924427343233331, "flos": 24029374573440.0, "grad_norm": 1.4390806320399865, "language_loss": 0.76475668, "learning_rate": 1.2002340913474607e-07, "loss": 0.78652728, "num_input_tokens_seen": 160500160, "step": 7422, "time_per_iteration": 2.706087589263916 }, { "auxiliary_loss_clip": 0.01168299, "auxiliary_loss_mlp": 0.01022978, "balance_loss_clip": 1.04721308, "balance_loss_mlp": 1.01535439, "epoch": 0.8925629772139723, "flos": 30008069631360.0, "grad_norm": 1.9321722516506206, "language_loss": 0.7388947, "learning_rate": 1.1975776140053317e-07, "loss": 0.76080751, "num_input_tokens_seen": 160520130, "step": 7423, "time_per_iteration": 2.659843683242798 }, { "auxiliary_loss_clip": 0.01110478, "auxiliary_loss_mlp": 0.01025422, "balance_loss_clip": 1.03987348, "balance_loss_mlp": 1.01771522, "epoch": 0.8926832201046113, "flos": 22601709630720.0, "grad_norm": 3.940942586809801, "language_loss": 0.73182666, "learning_rate": 1.194923988975729e-07, "loss": 0.75318563, "num_input_tokens_seen": 160539730, "step": 7424, "time_per_iteration": 2.7696735858917236 }, { "auxiliary_loss_clip": 0.0111957, "auxiliary_loss_mlp": 0.01020564, "balance_loss_clip": 1.04199338, "balance_loss_mlp": 1.01338124, "epoch": 0.8928034629952504, "flos": 13297722117120.0, "grad_norm": 3.144243989824138, "language_loss": 0.73608142, "learning_rate": 1.192273216661206e-07, "loss": 0.75748277, "num_input_tokens_seen": 160557820, "step": 7425, "time_per_iteration": 2.665731906890869 }, { "auxiliary_loss_clip": 0.01011678, "auxiliary_loss_mlp": 0.01000526, "balance_loss_clip": 1.01052523, "balance_loss_mlp": 0.99945349, "epoch": 0.8929237058858895, "flos": 54854556744960.0, "grad_norm": 0.763635780715979, "language_loss": 0.57453543, "learning_rate": 1.189625297463881e-07, "loss": 0.59465742, "num_input_tokens_seen": 160619510, "step": 7426, "time_per_iteration": 3.4184765815734863 }, { "auxiliary_loss_clip": 0.01091492, "auxiliary_loss_mlp": 0.0102441, "balance_loss_clip": 1.03698611, "balance_loss_mlp": 1.01784754, "epoch": 0.8930439487765286, "flos": 28883850785280.0, "grad_norm": 1.749773550830242, "language_loss": 0.79909915, "learning_rate": 1.1869802317854394e-07, "loss": 0.8202582, "num_input_tokens_seen": 160643295, "step": 7427, "time_per_iteration": 3.100395441055298 }, { "auxiliary_loss_clip": 0.0111043, "auxiliary_loss_mlp": 0.01023538, "balance_loss_clip": 1.03865194, "balance_loss_mlp": 1.01637614, "epoch": 0.8931641916671677, "flos": 22419283432320.0, "grad_norm": 2.0612789524427924, "language_loss": 0.72209048, "learning_rate": 1.1843380200271425e-07, "loss": 0.74343014, "num_input_tokens_seen": 160662495, "step": 7428, "time_per_iteration": 2.710832118988037 }, { "auxiliary_loss_clip": 0.01118938, "auxiliary_loss_mlp": 0.01027496, "balance_loss_clip": 1.04115832, "balance_loss_mlp": 1.01953006, "epoch": 0.8932844345578068, "flos": 25843149786240.0, "grad_norm": 3.385117097313365, "language_loss": 0.80501288, "learning_rate": 1.181698662589805e-07, "loss": 0.82647723, "num_input_tokens_seen": 160682080, "step": 7429, "time_per_iteration": 2.701418161392212 }, { "auxiliary_loss_clip": 0.0114798, "auxiliary_loss_mlp": 0.01028463, "balance_loss_clip": 1.04255867, "balance_loss_mlp": 1.02099442, "epoch": 0.8934046774484459, "flos": 22925803069440.0, "grad_norm": 2.6514332249491543, "language_loss": 0.7601729, "learning_rate": 1.1790621598738249e-07, "loss": 0.78193736, "num_input_tokens_seen": 160700395, "step": 7430, "time_per_iteration": 2.7431771755218506 }, { "auxiliary_loss_clip": 0.01163087, "auxiliary_loss_mlp": 0.010231, "balance_loss_clip": 1.04709959, "balance_loss_mlp": 1.01661468, "epoch": 0.8935249203390849, "flos": 24462097718400.0, "grad_norm": 2.788430681484839, "language_loss": 0.74693847, "learning_rate": 1.1764285122791461e-07, "loss": 0.76880032, "num_input_tokens_seen": 160721115, "step": 7431, "time_per_iteration": 2.679210662841797 }, { "auxiliary_loss_clip": 0.01152079, "auxiliary_loss_mlp": 0.01024578, "balance_loss_clip": 1.04295921, "balance_loss_mlp": 1.01802993, "epoch": 0.8936451632297241, "flos": 15742735966080.0, "grad_norm": 1.8504831465312919, "language_loss": 0.76830089, "learning_rate": 1.173797720205294e-07, "loss": 0.79006743, "num_input_tokens_seen": 160739150, "step": 7432, "time_per_iteration": 2.5991640090942383 }, { "auxiliary_loss_clip": 0.01155254, "auxiliary_loss_mlp": 0.01028444, "balance_loss_clip": 1.04613972, "balance_loss_mlp": 1.02070713, "epoch": 0.8937654061203631, "flos": 35115500396160.0, "grad_norm": 2.178020690475307, "language_loss": 0.71372533, "learning_rate": 1.1711697840513602e-07, "loss": 0.73556232, "num_input_tokens_seen": 160758585, "step": 7433, "time_per_iteration": 2.7459726333618164 }, { "auxiliary_loss_clip": 0.01142382, "auxiliary_loss_mlp": 0.01020416, "balance_loss_clip": 1.04116237, "balance_loss_mlp": 1.01390743, "epoch": 0.8938856490110022, "flos": 16107444708480.0, "grad_norm": 1.9325758414998935, "language_loss": 0.71199518, "learning_rate": 1.1685447042160012e-07, "loss": 0.73362321, "num_input_tokens_seen": 160776620, "step": 7434, "time_per_iteration": 2.6141843795776367 }, { "auxiliary_loss_clip": 0.01164859, "auxiliary_loss_mlp": 0.01029155, "balance_loss_clip": 1.04384351, "balance_loss_mlp": 1.02148962, "epoch": 0.8940058919016414, "flos": 20704189858560.0, "grad_norm": 1.530142484748639, "language_loss": 0.71317071, "learning_rate": 1.1659224810974367e-07, "loss": 0.73511088, "num_input_tokens_seen": 160796580, "step": 7435, "time_per_iteration": 2.5673561096191406 }, { "auxiliary_loss_clip": 0.01135345, "auxiliary_loss_mlp": 0.01028864, "balance_loss_clip": 1.04347646, "balance_loss_mlp": 1.0211513, "epoch": 0.8941261347922804, "flos": 25229041937280.0, "grad_norm": 1.5126272640937286, "language_loss": 0.68308073, "learning_rate": 1.1633031150934591e-07, "loss": 0.70472276, "num_input_tokens_seen": 160819610, "step": 7436, "time_per_iteration": 2.7280995845794678 }, { "auxiliary_loss_clip": 0.01153379, "auxiliary_loss_mlp": 0.01021751, "balance_loss_clip": 1.04577756, "balance_loss_mlp": 1.01421058, "epoch": 0.8942463776829195, "flos": 19537236806400.0, "grad_norm": 1.8953367033192885, "language_loss": 0.79640096, "learning_rate": 1.1606866066014176e-07, "loss": 0.81815231, "num_input_tokens_seen": 160838660, "step": 7437, "time_per_iteration": 3.942559242248535 }, { "auxiliary_loss_clip": 0.01120033, "auxiliary_loss_mlp": 0.01021318, "balance_loss_clip": 1.04087043, "balance_loss_mlp": 1.01407349, "epoch": 0.8943666205735585, "flos": 22301567585280.0, "grad_norm": 2.488436171341624, "language_loss": 0.75217509, "learning_rate": 1.1580729560182434e-07, "loss": 0.77358854, "num_input_tokens_seen": 160854515, "step": 7438, "time_per_iteration": 2.72472882270813 }, { "auxiliary_loss_clip": 0.01167693, "auxiliary_loss_mlp": 0.00761634, "balance_loss_clip": 1.04682326, "balance_loss_mlp": 1.00031435, "epoch": 0.8944868634641977, "flos": 18912893581440.0, "grad_norm": 1.6584847870898263, "language_loss": 0.71202075, "learning_rate": 1.1554621637404171e-07, "loss": 0.73131406, "num_input_tokens_seen": 160872605, "step": 7439, "time_per_iteration": 2.5792744159698486 }, { "auxiliary_loss_clip": 0.01148945, "auxiliary_loss_mlp": 0.01019307, "balance_loss_clip": 1.04319513, "balance_loss_mlp": 1.01221454, "epoch": 0.8946071063548368, "flos": 14460904241280.0, "grad_norm": 3.1879553021463725, "language_loss": 0.60706693, "learning_rate": 1.1528542301639999e-07, "loss": 0.62874949, "num_input_tokens_seen": 160889395, "step": 7440, "time_per_iteration": 3.5832526683807373 }, { "auxiliary_loss_clip": 0.01122519, "auxiliary_loss_mlp": 0.01024536, "balance_loss_clip": 1.0398519, "balance_loss_mlp": 1.01756859, "epoch": 0.8947273492454758, "flos": 20084084438400.0, "grad_norm": 2.330444272556525, "language_loss": 0.82260215, "learning_rate": 1.1502491556846105e-07, "loss": 0.8440727, "num_input_tokens_seen": 160907890, "step": 7441, "time_per_iteration": 2.705014228820801 }, { "auxiliary_loss_clip": 0.01135763, "auxiliary_loss_mlp": 0.01022387, "balance_loss_clip": 1.04189277, "balance_loss_mlp": 1.01515138, "epoch": 0.894847592136115, "flos": 18550555136640.0, "grad_norm": 2.747932214351184, "language_loss": 0.81284738, "learning_rate": 1.1476469406974331e-07, "loss": 0.83442891, "num_input_tokens_seen": 160923490, "step": 7442, "time_per_iteration": 3.553168535232544 }, { "auxiliary_loss_clip": 0.01162049, "auxiliary_loss_mlp": 0.01024867, "balance_loss_clip": 1.04543829, "balance_loss_mlp": 1.01801848, "epoch": 0.894967835026754, "flos": 23478468704640.0, "grad_norm": 1.7217554917059674, "language_loss": 0.77182031, "learning_rate": 1.1450475855972341e-07, "loss": 0.79368955, "num_input_tokens_seen": 160944280, "step": 7443, "time_per_iteration": 2.64827561378479 }, { "auxiliary_loss_clip": 0.01133327, "auxiliary_loss_mlp": 0.00762572, "balance_loss_clip": 1.03969169, "balance_loss_mlp": 1.00038099, "epoch": 0.8950880779173931, "flos": 15188310564480.0, "grad_norm": 1.971946134054133, "language_loss": 0.70522928, "learning_rate": 1.1424510907783158e-07, "loss": 0.72418833, "num_input_tokens_seen": 160961560, "step": 7444, "time_per_iteration": 3.6086862087249756 }, { "auxiliary_loss_clip": 0.01138953, "auxiliary_loss_mlp": 0.01020634, "balance_loss_clip": 1.04027367, "balance_loss_mlp": 1.01434278, "epoch": 0.8952083208080323, "flos": 22091957769600.0, "grad_norm": 1.71738364927884, "language_loss": 0.82828802, "learning_rate": 1.1398574566345787e-07, "loss": 0.84988391, "num_input_tokens_seen": 160982195, "step": 7445, "time_per_iteration": 2.6523241996765137 }, { "auxiliary_loss_clip": 0.01140599, "auxiliary_loss_mlp": 0.01022676, "balance_loss_clip": 1.04015625, "balance_loss_mlp": 1.01516294, "epoch": 0.8953285636986713, "flos": 23254026572160.0, "grad_norm": 2.3488273394815504, "language_loss": 0.82372141, "learning_rate": 1.1372666835594702e-07, "loss": 0.8453542, "num_input_tokens_seen": 161000520, "step": 7446, "time_per_iteration": 2.650315523147583 }, { "auxiliary_loss_clip": 0.01136933, "auxiliary_loss_mlp": 0.01021151, "balance_loss_clip": 1.04414439, "balance_loss_mlp": 1.013834, "epoch": 0.8954488065893104, "flos": 16362661818240.0, "grad_norm": 1.8575356389970492, "language_loss": 0.71789634, "learning_rate": 1.1346787719460071e-07, "loss": 0.73947716, "num_input_tokens_seen": 161019405, "step": 7447, "time_per_iteration": 2.6332733631134033 }, { "auxiliary_loss_clip": 0.01133584, "auxiliary_loss_mlp": 0.0102496, "balance_loss_clip": 1.04170942, "balance_loss_mlp": 1.01827204, "epoch": 0.8955690494799495, "flos": 18257883120000.0, "grad_norm": 1.7460261340910948, "language_loss": 0.71859759, "learning_rate": 1.1320937221867732e-07, "loss": 0.740183, "num_input_tokens_seen": 161036985, "step": 7448, "time_per_iteration": 2.6681582927703857 }, { "auxiliary_loss_clip": 0.01133794, "auxiliary_loss_mlp": 0.01022578, "balance_loss_clip": 1.04031754, "balance_loss_mlp": 1.01589322, "epoch": 0.8956892923705886, "flos": 25447486498560.0, "grad_norm": 1.8967884547298888, "language_loss": 0.79698253, "learning_rate": 1.1295115346739192e-07, "loss": 0.8185463, "num_input_tokens_seen": 161056985, "step": 7449, "time_per_iteration": 2.659618377685547 }, { "auxiliary_loss_clip": 0.0113992, "auxiliary_loss_mlp": 0.01026927, "balance_loss_clip": 1.04264605, "balance_loss_mlp": 1.01917219, "epoch": 0.8958095352612276, "flos": 52661883939840.0, "grad_norm": 2.806370654231317, "language_loss": 0.73193061, "learning_rate": 1.1269322097991629e-07, "loss": 0.75359905, "num_input_tokens_seen": 161080270, "step": 7450, "time_per_iteration": 2.930205821990967 }, { "auxiliary_loss_clip": 0.0115641, "auxiliary_loss_mlp": 0.0102947, "balance_loss_clip": 1.04591346, "balance_loss_mlp": 1.0220077, "epoch": 0.8959297781518668, "flos": 23186335392000.0, "grad_norm": 5.356569425663245, "language_loss": 0.6796906, "learning_rate": 1.1243557479537846e-07, "loss": 0.70154941, "num_input_tokens_seen": 161100160, "step": 7451, "time_per_iteration": 2.632756233215332 }, { "auxiliary_loss_clip": 0.01163021, "auxiliary_loss_mlp": 0.01022517, "balance_loss_clip": 1.04302394, "balance_loss_mlp": 1.01519799, "epoch": 0.8960500210425059, "flos": 20334309557760.0, "grad_norm": 1.98928265159776, "language_loss": 0.68950129, "learning_rate": 1.121782149528634e-07, "loss": 0.71135664, "num_input_tokens_seen": 161117260, "step": 7452, "time_per_iteration": 2.6182119846343994 }, { "auxiliary_loss_clip": 0.01139385, "auxiliary_loss_mlp": 0.01027904, "balance_loss_clip": 1.04454505, "balance_loss_mlp": 1.02131224, "epoch": 0.8961702639331449, "flos": 19901694153600.0, "grad_norm": 1.9835677328775578, "language_loss": 0.78941154, "learning_rate": 1.1192114149141208e-07, "loss": 0.81108445, "num_input_tokens_seen": 161136895, "step": 7453, "time_per_iteration": 2.6843714714050293 }, { "auxiliary_loss_clip": 0.01141766, "auxiliary_loss_mlp": 0.01025397, "balance_loss_clip": 1.0429728, "balance_loss_mlp": 1.01804733, "epoch": 0.8962905068237841, "flos": 12896348567040.0, "grad_norm": 2.4932292350177017, "language_loss": 0.65137076, "learning_rate": 1.1166435445002197e-07, "loss": 0.67304242, "num_input_tokens_seen": 161154565, "step": 7454, "time_per_iteration": 2.622429609298706 }, { "auxiliary_loss_clip": 0.01153929, "auxiliary_loss_mlp": 0.01026143, "balance_loss_clip": 1.04527688, "balance_loss_mlp": 1.01902902, "epoch": 0.8964107497144231, "flos": 23440331439360.0, "grad_norm": 2.0570353185960815, "language_loss": 0.68534786, "learning_rate": 1.1140785386764818e-07, "loss": 0.70714855, "num_input_tokens_seen": 161173265, "step": 7455, "time_per_iteration": 2.664933681488037 }, { "auxiliary_loss_clip": 0.01144371, "auxiliary_loss_mlp": 0.01022719, "balance_loss_clip": 1.04161286, "balance_loss_mlp": 1.01575446, "epoch": 0.8965309926050622, "flos": 19500176949120.0, "grad_norm": 2.107377953890634, "language_loss": 0.69888914, "learning_rate": 1.1115163978320153e-07, "loss": 0.72056001, "num_input_tokens_seen": 161191995, "step": 7456, "time_per_iteration": 2.5850014686584473 }, { "auxiliary_loss_clip": 0.01155747, "auxiliary_loss_mlp": 0.00762578, "balance_loss_clip": 1.04469156, "balance_loss_mlp": 1.00034785, "epoch": 0.8966512354957014, "flos": 28658008022400.0, "grad_norm": 2.1364327448431246, "language_loss": 0.82588494, "learning_rate": 1.1089571223554917e-07, "loss": 0.84506816, "num_input_tokens_seen": 161212880, "step": 7457, "time_per_iteration": 2.7034072875976562 }, { "auxiliary_loss_clip": 0.01151984, "auxiliary_loss_mlp": 0.0102524, "balance_loss_clip": 1.04275131, "balance_loss_mlp": 1.0184927, "epoch": 0.8967714783863404, "flos": 23370916406400.0, "grad_norm": 1.9829344362950219, "language_loss": 0.85416275, "learning_rate": 1.1064007126351537e-07, "loss": 0.87593508, "num_input_tokens_seen": 161233595, "step": 7458, "time_per_iteration": 2.656642436981201 }, { "auxiliary_loss_clip": 0.01132287, "auxiliary_loss_mlp": 0.01031591, "balance_loss_clip": 1.0435338, "balance_loss_mlp": 1.02449834, "epoch": 0.8968917212769795, "flos": 24535175938560.0, "grad_norm": 2.4169906311501608, "language_loss": 0.76247895, "learning_rate": 1.1038471690588003e-07, "loss": 0.7841177, "num_input_tokens_seen": 161252740, "step": 7459, "time_per_iteration": 2.7051680088043213 }, { "auxiliary_loss_clip": 0.01108159, "auxiliary_loss_mlp": 0.01025251, "balance_loss_clip": 1.04040778, "balance_loss_mlp": 1.01831615, "epoch": 0.8970119641676186, "flos": 23475416048640.0, "grad_norm": 1.9187004965295893, "language_loss": 0.80185378, "learning_rate": 1.1012964920138145e-07, "loss": 0.82318795, "num_input_tokens_seen": 161272325, "step": 7460, "time_per_iteration": 2.7652335166931152 }, { "auxiliary_loss_clip": 0.01131332, "auxiliary_loss_mlp": 0.01021168, "balance_loss_clip": 1.03952754, "balance_loss_mlp": 1.01469755, "epoch": 0.8971322070582577, "flos": 24538192680960.0, "grad_norm": 1.5685500694359769, "language_loss": 0.75856066, "learning_rate": 1.0987486818871205e-07, "loss": 0.78008568, "num_input_tokens_seen": 161295915, "step": 7461, "time_per_iteration": 2.7367734909057617 }, { "auxiliary_loss_clip": 0.01150157, "auxiliary_loss_mlp": 0.00761804, "balance_loss_clip": 1.04405928, "balance_loss_mlp": 1.00041413, "epoch": 0.8972524499488967, "flos": 21797454159360.0, "grad_norm": 2.4364966682686418, "language_loss": 0.73503733, "learning_rate": 1.0962037390652245e-07, "loss": 0.75415695, "num_input_tokens_seen": 161314935, "step": 7462, "time_per_iteration": 2.6520724296569824 }, { "auxiliary_loss_clip": 0.01136764, "auxiliary_loss_mlp": 0.01028894, "balance_loss_clip": 1.04324317, "balance_loss_mlp": 1.02171433, "epoch": 0.8973726928395359, "flos": 21726243446400.0, "grad_norm": 1.656673793861709, "language_loss": 0.71796644, "learning_rate": 1.0936616639341911e-07, "loss": 0.73962301, "num_input_tokens_seen": 161335225, "step": 7463, "time_per_iteration": 3.623293876647949 }, { "auxiliary_loss_clip": 0.01051061, "auxiliary_loss_mlp": 0.01001871, "balance_loss_clip": 1.01240206, "balance_loss_mlp": 1.00082803, "epoch": 0.897492935730175, "flos": 53837100097920.0, "grad_norm": 0.7363434419277795, "language_loss": 0.54720843, "learning_rate": 1.0911224568796473e-07, "loss": 0.56773782, "num_input_tokens_seen": 161393420, "step": 7464, "time_per_iteration": 3.2166359424591064 }, { "auxiliary_loss_clip": 0.01149107, "auxiliary_loss_mlp": 0.010238, "balance_loss_clip": 1.04502499, "balance_loss_mlp": 1.01700211, "epoch": 0.897613178620814, "flos": 18290346036480.0, "grad_norm": 2.4335203194975197, "language_loss": 0.71045792, "learning_rate": 1.0885861182867984e-07, "loss": 0.73218703, "num_input_tokens_seen": 161411525, "step": 7465, "time_per_iteration": 2.5866880416870117 }, { "auxiliary_loss_clip": 0.01138681, "auxiliary_loss_mlp": 0.01029263, "balance_loss_clip": 1.04242885, "balance_loss_mlp": 1.02240205, "epoch": 0.8977334215114532, "flos": 32993718059520.0, "grad_norm": 1.7809748942484074, "language_loss": 0.70909262, "learning_rate": 1.0860526485403942e-07, "loss": 0.73077214, "num_input_tokens_seen": 161432800, "step": 7466, "time_per_iteration": 3.6939661502838135 }, { "auxiliary_loss_clip": 0.01162435, "auxiliary_loss_mlp": 0.01026655, "balance_loss_clip": 1.04414499, "balance_loss_mlp": 1.02022338, "epoch": 0.8978536644020922, "flos": 15195636938880.0, "grad_norm": 1.6034380364275431, "language_loss": 0.76846254, "learning_rate": 1.0835220480247675e-07, "loss": 0.79035342, "num_input_tokens_seen": 161451295, "step": 7467, "time_per_iteration": 2.5587549209594727 }, { "auxiliary_loss_clip": 0.0113556, "auxiliary_loss_mlp": 0.01025083, "balance_loss_clip": 1.04268861, "balance_loss_mlp": 1.01793671, "epoch": 0.8979739072927313, "flos": 18004389863040.0, "grad_norm": 1.9043424507471454, "language_loss": 0.83555281, "learning_rate": 1.0809943171238067e-07, "loss": 0.85715926, "num_input_tokens_seen": 161469220, "step": 7468, "time_per_iteration": 3.5206382274627686 }, { "auxiliary_loss_clip": 0.01146144, "auxiliary_loss_mlp": 0.01031053, "balance_loss_clip": 1.04443288, "balance_loss_mlp": 1.02326822, "epoch": 0.8980941501833704, "flos": 22271546793600.0, "grad_norm": 2.0831953382323665, "language_loss": 0.62651372, "learning_rate": 1.078469456220965e-07, "loss": 0.64828575, "num_input_tokens_seen": 161489375, "step": 7469, "time_per_iteration": 2.6650235652923584 }, { "auxiliary_loss_clip": 0.0115022, "auxiliary_loss_mlp": 0.01025155, "balance_loss_clip": 1.04174829, "balance_loss_mlp": 1.0181663, "epoch": 0.8982143930740095, "flos": 37560729726720.0, "grad_norm": 1.7762213474101405, "language_loss": 0.6986059, "learning_rate": 1.0759474656992606e-07, "loss": 0.72035968, "num_input_tokens_seen": 161512145, "step": 7470, "time_per_iteration": 2.7258965969085693 }, { "auxiliary_loss_clip": 0.01141684, "auxiliary_loss_mlp": 0.01022982, "balance_loss_clip": 1.04008782, "balance_loss_mlp": 1.01568639, "epoch": 0.8983346359646486, "flos": 18076893465600.0, "grad_norm": 2.987719323287399, "language_loss": 0.78198719, "learning_rate": 1.0734283459412785e-07, "loss": 0.80363381, "num_input_tokens_seen": 161528995, "step": 7471, "time_per_iteration": 3.5803396701812744 }, { "auxiliary_loss_clip": 0.01112282, "auxiliary_loss_mlp": 0.01024141, "balance_loss_clip": 1.0388639, "balance_loss_mlp": 1.01633286, "epoch": 0.8984548788552876, "flos": 20558895344640.0, "grad_norm": 2.5897210527543235, "language_loss": 0.80967963, "learning_rate": 1.0709120973291707e-07, "loss": 0.83104384, "num_input_tokens_seen": 161548775, "step": 7472, "time_per_iteration": 2.7570033073425293 }, { "auxiliary_loss_clip": 0.01167845, "auxiliary_loss_mlp": 0.01029714, "balance_loss_clip": 1.0474149, "balance_loss_mlp": 1.02255213, "epoch": 0.8985751217459268, "flos": 17785442511360.0, "grad_norm": 2.220282513955287, "language_loss": 0.77384651, "learning_rate": 1.0683987202446475e-07, "loss": 0.79582208, "num_input_tokens_seen": 161566960, "step": 7473, "time_per_iteration": 2.6377439498901367 }, { "auxiliary_loss_clip": 0.01155272, "auxiliary_loss_mlp": 0.01027969, "balance_loss_clip": 1.04498696, "balance_loss_mlp": 1.02106977, "epoch": 0.8986953646365659, "flos": 21617003208960.0, "grad_norm": 1.931760119871698, "language_loss": 0.69728506, "learning_rate": 1.0658882150689862e-07, "loss": 0.71911752, "num_input_tokens_seen": 161585820, "step": 7474, "time_per_iteration": 2.7070140838623047 }, { "auxiliary_loss_clip": 0.01127006, "auxiliary_loss_mlp": 0.01026816, "balance_loss_clip": 1.04211164, "balance_loss_mlp": 1.01931822, "epoch": 0.8988156075272049, "flos": 14027355083520.0, "grad_norm": 4.705391490372896, "language_loss": 0.78556466, "learning_rate": 1.0633805821830288e-07, "loss": 0.80710292, "num_input_tokens_seen": 161602505, "step": 7475, "time_per_iteration": 2.7412021160125732 }, { "auxiliary_loss_clip": 0.01139145, "auxiliary_loss_mlp": 0.0102231, "balance_loss_clip": 1.04458427, "balance_loss_mlp": 1.01546121, "epoch": 0.8989358504178441, "flos": 29059202004480.0, "grad_norm": 2.3817809681620283, "language_loss": 0.82976413, "learning_rate": 1.0608758219671753e-07, "loss": 0.85137862, "num_input_tokens_seen": 161621545, "step": 7476, "time_per_iteration": 2.8094987869262695 }, { "auxiliary_loss_clip": 0.01142813, "auxiliary_loss_mlp": 0.01026727, "balance_loss_clip": 1.04343736, "balance_loss_mlp": 1.0200218, "epoch": 0.8990560933084831, "flos": 20230420446720.0, "grad_norm": 1.6013338599733564, "language_loss": 0.70772678, "learning_rate": 1.0583739348014065e-07, "loss": 0.72942215, "num_input_tokens_seen": 161642630, "step": 7477, "time_per_iteration": 2.6500439643859863 }, { "auxiliary_loss_clip": 0.01167009, "auxiliary_loss_mlp": 0.01021757, "balance_loss_clip": 1.04803276, "balance_loss_mlp": 1.01495564, "epoch": 0.8991763361991222, "flos": 25520672459520.0, "grad_norm": 3.057212763917033, "language_loss": 0.84628826, "learning_rate": 1.0558749210652518e-07, "loss": 0.86817586, "num_input_tokens_seen": 161662560, "step": 7478, "time_per_iteration": 2.633516550064087 }, { "auxiliary_loss_clip": 0.01129169, "auxiliary_loss_mlp": 0.01024636, "balance_loss_clip": 1.04147816, "balance_loss_mlp": 1.01797819, "epoch": 0.8992965790897613, "flos": 25119191168640.0, "grad_norm": 1.688268081091111, "language_loss": 0.85692185, "learning_rate": 1.053378781137808e-07, "loss": 0.87845987, "num_input_tokens_seen": 161683480, "step": 7479, "time_per_iteration": 2.7071781158447266 }, { "auxiliary_loss_clip": 0.01142559, "auxiliary_loss_mlp": 0.01024995, "balance_loss_clip": 1.043432, "balance_loss_mlp": 1.0173893, "epoch": 0.8994168219804004, "flos": 16070815814400.0, "grad_norm": 2.6824374199823984, "language_loss": 0.77866232, "learning_rate": 1.0508855153977392e-07, "loss": 0.80033791, "num_input_tokens_seen": 161699945, "step": 7480, "time_per_iteration": 2.687398672103882 }, { "auxiliary_loss_clip": 0.01150574, "auxiliary_loss_mlp": 0.01024877, "balance_loss_clip": 1.0413357, "balance_loss_mlp": 1.01784408, "epoch": 0.8995370648710395, "flos": 24825764966400.0, "grad_norm": 2.3312294621835536, "language_loss": 0.66562444, "learning_rate": 1.0483951242232669e-07, "loss": 0.68737894, "num_input_tokens_seen": 161720420, "step": 7481, "time_per_iteration": 2.6407556533813477 }, { "auxiliary_loss_clip": 0.01063808, "auxiliary_loss_mlp": 0.01001259, "balance_loss_clip": 1.00975394, "balance_loss_mlp": 1.00021029, "epoch": 0.8996573077616786, "flos": 63116238378240.0, "grad_norm": 0.9689030899997468, "language_loss": 0.57694322, "learning_rate": 1.0459076079921936e-07, "loss": 0.59759384, "num_input_tokens_seen": 161773080, "step": 7482, "time_per_iteration": 3.2190961837768555 }, { "auxiliary_loss_clip": 0.0113227, "auxiliary_loss_mlp": 0.01026154, "balance_loss_clip": 1.04362512, "balance_loss_mlp": 1.0189805, "epoch": 0.8997775506523177, "flos": 18219674027520.0, "grad_norm": 2.25796825242338, "language_loss": 0.84744614, "learning_rate": 1.0434229670818618e-07, "loss": 0.86903036, "num_input_tokens_seen": 161789755, "step": 7483, "time_per_iteration": 2.6516647338867188 }, { "auxiliary_loss_clip": 0.01129451, "auxiliary_loss_mlp": 0.01031402, "balance_loss_clip": 1.04130197, "balance_loss_mlp": 1.02452064, "epoch": 0.8998977935429567, "flos": 24166768095360.0, "grad_norm": 1.5540545793543252, "language_loss": 0.80122232, "learning_rate": 1.0409412018691944e-07, "loss": 0.82283086, "num_input_tokens_seen": 161810220, "step": 7484, "time_per_iteration": 2.711315631866455 }, { "auxiliary_loss_clip": 0.01135366, "auxiliary_loss_mlp": 0.01021026, "balance_loss_clip": 1.04350448, "balance_loss_mlp": 1.01328349, "epoch": 0.9000180364335959, "flos": 20773030273920.0, "grad_norm": 2.605849447683078, "language_loss": 0.74546456, "learning_rate": 1.0384623127306724e-07, "loss": 0.76702845, "num_input_tokens_seen": 161827565, "step": 7485, "time_per_iteration": 2.7109477519989014 }, { "auxiliary_loss_clip": 0.01119638, "auxiliary_loss_mlp": 0.01021802, "balance_loss_clip": 1.03947008, "balance_loss_mlp": 1.0149864, "epoch": 0.900138279324235, "flos": 19205745166080.0, "grad_norm": 1.7203711481362576, "language_loss": 0.79460794, "learning_rate": 1.0359863000423397e-07, "loss": 0.8160224, "num_input_tokens_seen": 161845700, "step": 7486, "time_per_iteration": 2.657818078994751 }, { "auxiliary_loss_clip": 0.01164726, "auxiliary_loss_mlp": 0.01028557, "balance_loss_clip": 1.04427338, "balance_loss_mlp": 1.02184832, "epoch": 0.900258522214874, "flos": 28731158069760.0, "grad_norm": 1.7635779941819547, "language_loss": 0.71668434, "learning_rate": 1.0335131641798112e-07, "loss": 0.73861718, "num_input_tokens_seen": 161867660, "step": 7487, "time_per_iteration": 2.6508100032806396 }, { "auxiliary_loss_clip": 0.0104216, "auxiliary_loss_mlp": 0.0100099, "balance_loss_clip": 1.00967264, "balance_loss_mlp": 0.99992335, "epoch": 0.9003787651055132, "flos": 58280685655680.0, "grad_norm": 0.8042407582268606, "language_loss": 0.55619001, "learning_rate": 1.0310429055182512e-07, "loss": 0.57662153, "num_input_tokens_seen": 161921980, "step": 7488, "time_per_iteration": 4.086040735244751 }, { "auxiliary_loss_clip": 0.0112545, "auxiliary_loss_mlp": 0.01025846, "balance_loss_clip": 1.0423882, "balance_loss_mlp": 1.01887226, "epoch": 0.9004990079961522, "flos": 25556475340800.0, "grad_norm": 1.7234547273838055, "language_loss": 0.74270356, "learning_rate": 1.0285755244324024e-07, "loss": 0.76421642, "num_input_tokens_seen": 161942725, "step": 7489, "time_per_iteration": 2.711121082305908 }, { "auxiliary_loss_clip": 0.01138869, "auxiliary_loss_mlp": 0.00762184, "balance_loss_clip": 1.03988457, "balance_loss_mlp": 1.00026011, "epoch": 0.9006192508867913, "flos": 23335185352320.0, "grad_norm": 1.411741920754481, "language_loss": 0.68765146, "learning_rate": 1.0261110212965629e-07, "loss": 0.70666194, "num_input_tokens_seen": 161964520, "step": 7490, "time_per_iteration": 2.6944119930267334 }, { "auxiliary_loss_clip": 0.01134467, "auxiliary_loss_mlp": 0.01023412, "balance_loss_clip": 1.041605, "balance_loss_mlp": 1.01625657, "epoch": 0.9007394937774305, "flos": 18040300485120.0, "grad_norm": 2.643206446593076, "language_loss": 0.79130399, "learning_rate": 1.023649396484596e-07, "loss": 0.81288278, "num_input_tokens_seen": 161983575, "step": 7491, "time_per_iteration": 2.633141040802002 }, { "auxiliary_loss_clip": 0.01164539, "auxiliary_loss_mlp": 0.01028868, "balance_loss_clip": 1.0461483, "balance_loss_mlp": 1.0214982, "epoch": 0.9008597366680695, "flos": 43068456633600.0, "grad_norm": 1.9141367663972866, "language_loss": 0.67315137, "learning_rate": 1.0211906503699275e-07, "loss": 0.69508547, "num_input_tokens_seen": 162006550, "step": 7492, "time_per_iteration": 2.7811005115509033 }, { "auxiliary_loss_clip": 0.01155748, "auxiliary_loss_mlp": 0.01027391, "balance_loss_clip": 1.04628682, "balance_loss_mlp": 1.02006578, "epoch": 0.9009799795587086, "flos": 14939055112320.0, "grad_norm": 2.434508339004845, "language_loss": 0.81922352, "learning_rate": 1.0187347833255455e-07, "loss": 0.84105492, "num_input_tokens_seen": 162022455, "step": 7493, "time_per_iteration": 3.5432701110839844 }, { "auxiliary_loss_clip": 0.01165715, "auxiliary_loss_mlp": 0.01031405, "balance_loss_clip": 1.04692388, "balance_loss_mlp": 1.02443433, "epoch": 0.9011002224493477, "flos": 21579584215680.0, "grad_norm": 1.9614395438153616, "language_loss": 0.79110283, "learning_rate": 1.0162817957240056e-07, "loss": 0.81307405, "num_input_tokens_seen": 162042350, "step": 7494, "time_per_iteration": 2.585740089416504 }, { "auxiliary_loss_clip": 0.01055133, "auxiliary_loss_mlp": 0.01001501, "balance_loss_clip": 1.01080346, "balance_loss_mlp": 1.0004462, "epoch": 0.9012204653399868, "flos": 71166367883520.0, "grad_norm": 0.8914843051871135, "language_loss": 0.63008344, "learning_rate": 1.0138316879374253e-07, "loss": 0.65064979, "num_input_tokens_seen": 162111640, "step": 7495, "time_per_iteration": 4.191587448120117 }, { "auxiliary_loss_clip": 0.01141358, "auxiliary_loss_mlp": 0.0102307, "balance_loss_clip": 1.04531932, "balance_loss_mlp": 1.01585197, "epoch": 0.9013407082306258, "flos": 15594963413760.0, "grad_norm": 3.8747119074460903, "language_loss": 0.74548513, "learning_rate": 1.0113844603374833e-07, "loss": 0.76712942, "num_input_tokens_seen": 162128165, "step": 7496, "time_per_iteration": 2.578484535217285 }, { "auxiliary_loss_clip": 0.0113561, "auxiliary_loss_mlp": 0.01030425, "balance_loss_clip": 1.04095495, "balance_loss_mlp": 1.02283084, "epoch": 0.901460951121265, "flos": 15049157276160.0, "grad_norm": 2.5960247204441798, "language_loss": 0.7182011, "learning_rate": 1.0089401132954178e-07, "loss": 0.73986149, "num_input_tokens_seen": 162146145, "step": 7497, "time_per_iteration": 3.596393585205078 }, { "auxiliary_loss_clip": 0.01138036, "auxiliary_loss_mlp": 0.01029905, "balance_loss_clip": 1.04474664, "balance_loss_mlp": 1.02289224, "epoch": 0.9015811940119041, "flos": 22236857233920.0, "grad_norm": 1.5750861749138123, "language_loss": 0.72389007, "learning_rate": 1.006498647182037e-07, "loss": 0.74556947, "num_input_tokens_seen": 162164800, "step": 7498, "time_per_iteration": 2.68811297416687 }, { "auxiliary_loss_clip": 0.01093288, "auxiliary_loss_mlp": 0.01021752, "balance_loss_clip": 1.03695416, "balance_loss_mlp": 1.01459908, "epoch": 0.9017014369025431, "flos": 24973824827520.0, "grad_norm": 2.2004999555830373, "language_loss": 0.71438187, "learning_rate": 1.004060062367713e-07, "loss": 0.73553222, "num_input_tokens_seen": 162185895, "step": 7499, "time_per_iteration": 2.783907890319824 }, { "auxiliary_loss_clip": 0.01151949, "auxiliary_loss_mlp": 0.01024046, "balance_loss_clip": 1.04301107, "balance_loss_mlp": 1.01713789, "epoch": 0.9018216797931822, "flos": 18114168804480.0, "grad_norm": 2.309144761479108, "language_loss": 0.69272566, "learning_rate": 1.0016243592223728e-07, "loss": 0.71448559, "num_input_tokens_seen": 162206295, "step": 7500, "time_per_iteration": 2.622805118560791 }, { "auxiliary_loss_clip": 0.01091262, "auxiliary_loss_mlp": 0.01025826, "balance_loss_clip": 1.03768635, "balance_loss_mlp": 1.01859331, "epoch": 0.9019419226838213, "flos": 37268452759680.0, "grad_norm": 1.757979475690111, "language_loss": 0.65867019, "learning_rate": 9.991915381155114e-08, "loss": 0.6798411, "num_input_tokens_seen": 162229275, "step": 7501, "time_per_iteration": 2.9719340801239014 }, { "auxiliary_loss_clip": 0.01155395, "auxiliary_loss_mlp": 0.01029521, "balance_loss_clip": 1.04430795, "balance_loss_mlp": 1.02182651, "epoch": 0.9020621655744604, "flos": 23441121538560.0, "grad_norm": 2.380677541815729, "language_loss": 0.75073987, "learning_rate": 9.967615994161871e-08, "loss": 0.77258909, "num_input_tokens_seen": 162248935, "step": 7502, "time_per_iteration": 2.6853790283203125 }, { "auxiliary_loss_clip": 0.01164816, "auxiliary_loss_mlp": 0.01027779, "balance_loss_clip": 1.04642856, "balance_loss_mlp": 1.02114463, "epoch": 0.9021824084650995, "flos": 22857465444480.0, "grad_norm": 1.6989298696149613, "language_loss": 0.78080213, "learning_rate": 9.943345434930161e-08, "loss": 0.80272812, "num_input_tokens_seen": 162269185, "step": 7503, "time_per_iteration": 2.5978803634643555 }, { "auxiliary_loss_clip": 0.01123101, "auxiliary_loss_mlp": 0.01023962, "balance_loss_clip": 1.04342222, "balance_loss_mlp": 1.01693487, "epoch": 0.9023026513557386, "flos": 22127581082880.0, "grad_norm": 2.342494841957817, "language_loss": 0.69161689, "learning_rate": 9.919103707141885e-08, "loss": 0.71308756, "num_input_tokens_seen": 162288065, "step": 7504, "time_per_iteration": 2.760854959487915 }, { "auxiliary_loss_clip": 0.01152228, "auxiliary_loss_mlp": 0.01024093, "balance_loss_clip": 1.0444417, "balance_loss_mlp": 1.01655281, "epoch": 0.9024228942463777, "flos": 24199087357440.0, "grad_norm": 2.954755713620168, "language_loss": 0.76296848, "learning_rate": 9.89489081447441e-08, "loss": 0.78473169, "num_input_tokens_seen": 162305265, "step": 7505, "time_per_iteration": 2.652313470840454 }, { "auxiliary_loss_clip": 0.01134345, "auxiliary_loss_mlp": 0.01025552, "balance_loss_clip": 1.0406146, "balance_loss_mlp": 1.01839948, "epoch": 0.9025431371370167, "flos": 25008262992000.0, "grad_norm": 1.989528062955611, "language_loss": 0.82918298, "learning_rate": 9.870706760600844e-08, "loss": 0.85078198, "num_input_tokens_seen": 162325215, "step": 7506, "time_per_iteration": 2.7177555561065674 }, { "auxiliary_loss_clip": 0.01117131, "auxiliary_loss_mlp": 0.0102982, "balance_loss_clip": 1.04359579, "balance_loss_mlp": 1.02216101, "epoch": 0.9026633800276559, "flos": 18952862440320.0, "grad_norm": 1.9638228327656109, "language_loss": 0.72993952, "learning_rate": 9.846551549189918e-08, "loss": 0.75140905, "num_input_tokens_seen": 162344820, "step": 7507, "time_per_iteration": 2.7286295890808105 }, { "auxiliary_loss_clip": 0.01134573, "auxiliary_loss_mlp": 0.01028815, "balance_loss_clip": 1.0433135, "balance_loss_mlp": 1.02170372, "epoch": 0.902783622918295, "flos": 32416059536640.0, "grad_norm": 2.144435465108351, "language_loss": 0.69234824, "learning_rate": 9.822425183905902e-08, "loss": 0.71398211, "num_input_tokens_seen": 162365345, "step": 7508, "time_per_iteration": 2.760657787322998 }, { "auxiliary_loss_clip": 0.01033678, "auxiliary_loss_mlp": 0.01001591, "balance_loss_clip": 1.01046145, "balance_loss_mlp": 1.00054193, "epoch": 0.902903865808934, "flos": 63717453244800.0, "grad_norm": 0.936184080714115, "language_loss": 0.75099272, "learning_rate": 9.798327668408823e-08, "loss": 0.77134538, "num_input_tokens_seen": 162426980, "step": 7509, "time_per_iteration": 3.3759632110595703 }, { "auxiliary_loss_clip": 0.01169052, "auxiliary_loss_mlp": 0.01027308, "balance_loss_clip": 1.04614687, "balance_loss_mlp": 1.01877236, "epoch": 0.9030241086995732, "flos": 23804034600960.0, "grad_norm": 1.9082347842483731, "language_loss": 0.68898797, "learning_rate": 9.774259006354158e-08, "loss": 0.71095157, "num_input_tokens_seen": 162447050, "step": 7510, "time_per_iteration": 2.610992670059204 }, { "auxiliary_loss_clip": 0.01142042, "auxiliary_loss_mlp": 0.01020641, "balance_loss_clip": 1.0420835, "balance_loss_mlp": 1.01375401, "epoch": 0.9031443515902122, "flos": 26395887248640.0, "grad_norm": 2.256454837210997, "language_loss": 0.76189762, "learning_rate": 9.750219201393184e-08, "loss": 0.78352445, "num_input_tokens_seen": 162467015, "step": 7511, "time_per_iteration": 2.7084898948669434 }, { "auxiliary_loss_clip": 0.01150178, "auxiliary_loss_mlp": 0.01024607, "balance_loss_clip": 1.04366517, "balance_loss_mlp": 1.01754653, "epoch": 0.9032645944808513, "flos": 24939350749440.0, "grad_norm": 1.758395323599528, "language_loss": 0.77696788, "learning_rate": 9.726208257172697e-08, "loss": 0.79871571, "num_input_tokens_seen": 162488710, "step": 7512, "time_per_iteration": 2.6857612133026123 }, { "auxiliary_loss_clip": 0.01164509, "auxiliary_loss_mlp": 0.01025852, "balance_loss_clip": 1.04455233, "balance_loss_mlp": 1.01877117, "epoch": 0.9033848373714904, "flos": 21178821196800.0, "grad_norm": 1.989910450203253, "language_loss": 0.74446535, "learning_rate": 9.702226177335115e-08, "loss": 0.76636893, "num_input_tokens_seen": 162507205, "step": 7513, "time_per_iteration": 2.572045087814331 }, { "auxiliary_loss_clip": 0.01136841, "auxiliary_loss_mlp": 0.01027452, "balance_loss_clip": 1.04362929, "balance_loss_mlp": 1.02030504, "epoch": 0.9035050802621295, "flos": 26286359702400.0, "grad_norm": 1.6213920918709042, "language_loss": 0.72510844, "learning_rate": 9.67827296551853e-08, "loss": 0.74675131, "num_input_tokens_seen": 162528490, "step": 7514, "time_per_iteration": 3.6757595539093018 }, { "auxiliary_loss_clip": 0.01129282, "auxiliary_loss_mlp": 0.00761904, "balance_loss_clip": 1.04045236, "balance_loss_mlp": 1.00036764, "epoch": 0.9036253231527686, "flos": 24204546224640.0, "grad_norm": 1.86239407793086, "language_loss": 0.68437457, "learning_rate": 9.65434862535659e-08, "loss": 0.70328647, "num_input_tokens_seen": 162547860, "step": 7515, "time_per_iteration": 2.685185432434082 }, { "auxiliary_loss_clip": 0.01141398, "auxiliary_loss_mlp": 0.01026509, "balance_loss_clip": 1.04261494, "balance_loss_mlp": 1.01887345, "epoch": 0.9037455660434077, "flos": 18072655660800.0, "grad_norm": 4.829171678960358, "language_loss": 0.64984804, "learning_rate": 9.630453160478635e-08, "loss": 0.67152709, "num_input_tokens_seen": 162563215, "step": 7516, "time_per_iteration": 2.685162305831909 }, { "auxiliary_loss_clip": 0.01106083, "auxiliary_loss_mlp": 0.01026051, "balance_loss_clip": 1.03752637, "balance_loss_mlp": 1.01883042, "epoch": 0.9038658089340468, "flos": 24060795995520.0, "grad_norm": 1.5650476450505546, "language_loss": 0.82163799, "learning_rate": 9.60658657450959e-08, "loss": 0.84295928, "num_input_tokens_seen": 162583515, "step": 7517, "time_per_iteration": 2.824997663497925 }, { "auxiliary_loss_clip": 0.01122066, "auxiliary_loss_mlp": 0.01029205, "balance_loss_clip": 1.03749323, "balance_loss_mlp": 1.02241313, "epoch": 0.9039860518246858, "flos": 21834298535040.0, "grad_norm": 1.6757111802027558, "language_loss": 0.7991631, "learning_rate": 9.582748871069979e-08, "loss": 0.82067585, "num_input_tokens_seen": 162602955, "step": 7518, "time_per_iteration": 3.648031234741211 }, { "auxiliary_loss_clip": 0.01139367, "auxiliary_loss_mlp": 0.00761844, "balance_loss_clip": 1.04110622, "balance_loss_mlp": 1.00031078, "epoch": 0.904106294715325, "flos": 26614870513920.0, "grad_norm": 2.633312412071103, "language_loss": 0.83450294, "learning_rate": 9.558940053775954e-08, "loss": 0.85351509, "num_input_tokens_seen": 162621595, "step": 7519, "time_per_iteration": 2.7094030380249023 }, { "auxiliary_loss_clip": 0.01149903, "auxiliary_loss_mlp": 0.01028495, "balance_loss_clip": 1.04446495, "balance_loss_mlp": 1.02099991, "epoch": 0.904226537605964, "flos": 17785693906560.0, "grad_norm": 2.0165058882222717, "language_loss": 0.67766631, "learning_rate": 9.535160126239294e-08, "loss": 0.69945025, "num_input_tokens_seen": 162638220, "step": 7520, "time_per_iteration": 3.5656442642211914 }, { "auxiliary_loss_clip": 0.01149712, "auxiliary_loss_mlp": 0.01028387, "balance_loss_clip": 1.0443275, "balance_loss_mlp": 1.02110016, "epoch": 0.9043467804966031, "flos": 24790428961920.0, "grad_norm": 1.7630175880903183, "language_loss": 0.70860142, "learning_rate": 9.511409092067424e-08, "loss": 0.73038238, "num_input_tokens_seen": 162658575, "step": 7521, "time_per_iteration": 2.648160457611084 }, { "auxiliary_loss_clip": 0.01136643, "auxiliary_loss_mlp": 0.01022894, "balance_loss_clip": 1.04390883, "balance_loss_mlp": 1.01629591, "epoch": 0.9044670233872423, "flos": 22632125472000.0, "grad_norm": 1.8558894864122448, "language_loss": 0.67152953, "learning_rate": 9.487686954863327e-08, "loss": 0.69312489, "num_input_tokens_seen": 162678295, "step": 7522, "time_per_iteration": 3.522277593612671 }, { "auxiliary_loss_clip": 0.01149524, "auxiliary_loss_mlp": 0.01028615, "balance_loss_clip": 1.04288828, "balance_loss_mlp": 1.02219868, "epoch": 0.9045872662778813, "flos": 23771320289280.0, "grad_norm": 1.8660529656847549, "language_loss": 0.77287996, "learning_rate": 9.46399371822566e-08, "loss": 0.79466134, "num_input_tokens_seen": 162698070, "step": 7523, "time_per_iteration": 2.6655097007751465 }, { "auxiliary_loss_clip": 0.01165776, "auxiliary_loss_mlp": 0.01026143, "balance_loss_clip": 1.04575968, "balance_loss_mlp": 1.01918077, "epoch": 0.9047075091685204, "flos": 15191039998080.0, "grad_norm": 2.242243090004402, "language_loss": 0.72257918, "learning_rate": 9.440329385748657e-08, "loss": 0.74449837, "num_input_tokens_seen": 162715140, "step": 7524, "time_per_iteration": 2.5464425086975098 }, { "auxiliary_loss_clip": 0.0112425, "auxiliary_loss_mlp": 0.01020797, "balance_loss_clip": 1.04379082, "balance_loss_mlp": 1.01435399, "epoch": 0.9048277520591596, "flos": 18003707504640.0, "grad_norm": 2.1241468686941642, "language_loss": 0.70464754, "learning_rate": 9.416693961022137e-08, "loss": 0.72609806, "num_input_tokens_seen": 162733390, "step": 7525, "time_per_iteration": 2.687877893447876 }, { "auxiliary_loss_clip": 0.01082078, "auxiliary_loss_mlp": 0.01025042, "balance_loss_clip": 1.03600979, "balance_loss_mlp": 1.01834893, "epoch": 0.9049479949497986, "flos": 21872471713920.0, "grad_norm": 1.8035281831187637, "language_loss": 0.77182603, "learning_rate": 9.393087447631654e-08, "loss": 0.79289722, "num_input_tokens_seen": 162751670, "step": 7526, "time_per_iteration": 2.7627320289611816 }, { "auxiliary_loss_clip": 0.01136972, "auxiliary_loss_mlp": 0.01016439, "balance_loss_clip": 1.04058695, "balance_loss_mlp": 1.01015353, "epoch": 0.9050682378404377, "flos": 20773928113920.0, "grad_norm": 1.7185747801957985, "language_loss": 0.72567368, "learning_rate": 9.36950984915823e-08, "loss": 0.74720776, "num_input_tokens_seen": 162770025, "step": 7527, "time_per_iteration": 2.6017401218414307 }, { "auxiliary_loss_clip": 0.01167518, "auxiliary_loss_mlp": 0.01022498, "balance_loss_clip": 1.04706073, "balance_loss_mlp": 1.01492858, "epoch": 0.9051884807310768, "flos": 21580015178880.0, "grad_norm": 1.8152065297017723, "language_loss": 0.69522399, "learning_rate": 9.345961169178607e-08, "loss": 0.7171241, "num_input_tokens_seen": 162789710, "step": 7528, "time_per_iteration": 2.608520030975342 }, { "auxiliary_loss_clip": 0.01110317, "auxiliary_loss_mlp": 0.01024589, "balance_loss_clip": 1.04173255, "balance_loss_mlp": 1.01719773, "epoch": 0.9053087236217159, "flos": 21908059113600.0, "grad_norm": 1.476535750691194, "language_loss": 0.72860318, "learning_rate": 9.322441411265081e-08, "loss": 0.74995226, "num_input_tokens_seen": 162810695, "step": 7529, "time_per_iteration": 2.8112106323242188 }, { "auxiliary_loss_clip": 0.01134297, "auxiliary_loss_mlp": 0.01035518, "balance_loss_clip": 1.04374099, "balance_loss_mlp": 1.02859223, "epoch": 0.9054289665123549, "flos": 17055809544960.0, "grad_norm": 2.863518121874271, "language_loss": 0.73420584, "learning_rate": 9.298950578985554e-08, "loss": 0.75590402, "num_input_tokens_seen": 162827770, "step": 7530, "time_per_iteration": 2.6814639568328857 }, { "auxiliary_loss_clip": 0.01150735, "auxiliary_loss_mlp": 0.00762762, "balance_loss_clip": 1.04590631, "balance_loss_mlp": 1.00031745, "epoch": 0.905549209402994, "flos": 20777268078720.0, "grad_norm": 1.8363708311851827, "language_loss": 0.71069628, "learning_rate": 9.275488675903665e-08, "loss": 0.72983122, "num_input_tokens_seen": 162846715, "step": 7531, "time_per_iteration": 2.737701892852783 }, { "auxiliary_loss_clip": 0.01107133, "auxiliary_loss_mlp": 0.01020358, "balance_loss_clip": 1.04077506, "balance_loss_mlp": 1.0136106, "epoch": 0.9056694522936332, "flos": 21686813291520.0, "grad_norm": 2.1537953052384182, "language_loss": 0.73349404, "learning_rate": 9.252055705578454e-08, "loss": 0.75476897, "num_input_tokens_seen": 162866215, "step": 7532, "time_per_iteration": 2.7770280838012695 }, { "auxiliary_loss_clip": 0.01148908, "auxiliary_loss_mlp": 0.01023459, "balance_loss_clip": 1.04201305, "balance_loss_mlp": 1.01721263, "epoch": 0.9057896951842722, "flos": 29569133433600.0, "grad_norm": 1.6268520958680661, "language_loss": 0.72196412, "learning_rate": 9.228651671564747e-08, "loss": 0.74368775, "num_input_tokens_seen": 162888245, "step": 7533, "time_per_iteration": 2.651372194290161 }, { "auxiliary_loss_clip": 0.01105002, "auxiliary_loss_mlp": 0.01028311, "balance_loss_clip": 1.0425005, "balance_loss_mlp": 1.02130795, "epoch": 0.9059099380749113, "flos": 27892248952320.0, "grad_norm": 1.583931326710199, "language_loss": 0.77862203, "learning_rate": 9.205276577412901e-08, "loss": 0.79995519, "num_input_tokens_seen": 162911025, "step": 7534, "time_per_iteration": 2.783057928085327 }, { "auxiliary_loss_clip": 0.01145285, "auxiliary_loss_mlp": 0.0076244, "balance_loss_clip": 1.04340339, "balance_loss_mlp": 1.00036061, "epoch": 0.9060301809655504, "flos": 17748993185280.0, "grad_norm": 2.6456162105602634, "language_loss": 0.76877356, "learning_rate": 9.181930426668905e-08, "loss": 0.78785074, "num_input_tokens_seen": 162927820, "step": 7535, "time_per_iteration": 2.650944232940674 }, { "auxiliary_loss_clip": 0.01104364, "auxiliary_loss_mlp": 0.01024915, "balance_loss_clip": 1.0391326, "balance_loss_mlp": 1.0184921, "epoch": 0.9061504238561895, "flos": 31759432963200.0, "grad_norm": 1.5260771699625746, "language_loss": 0.67818153, "learning_rate": 9.158613222874346e-08, "loss": 0.69947428, "num_input_tokens_seen": 162949445, "step": 7536, "time_per_iteration": 2.8395633697509766 }, { "auxiliary_loss_clip": 0.01134593, "auxiliary_loss_mlp": 0.01024218, "balance_loss_clip": 1.04063153, "balance_loss_mlp": 1.0172112, "epoch": 0.9062706667468285, "flos": 20048066075520.0, "grad_norm": 1.517487488840768, "language_loss": 0.8205837, "learning_rate": 9.135324969566394e-08, "loss": 0.84217185, "num_input_tokens_seen": 162968945, "step": 7537, "time_per_iteration": 2.652219533920288 }, { "auxiliary_loss_clip": 0.01155623, "auxiliary_loss_mlp": 0.01024373, "balance_loss_clip": 1.04561484, "balance_loss_mlp": 1.01751792, "epoch": 0.9063909096374677, "flos": 18437292576000.0, "grad_norm": 2.310433170408504, "language_loss": 0.75534648, "learning_rate": 9.112065670277913e-08, "loss": 0.77714646, "num_input_tokens_seen": 162985310, "step": 7538, "time_per_iteration": 2.579052448272705 }, { "auxiliary_loss_clip": 0.01133621, "auxiliary_loss_mlp": 0.01022087, "balance_loss_clip": 1.03995323, "balance_loss_mlp": 1.01603985, "epoch": 0.9065111525281068, "flos": 33547353361920.0, "grad_norm": 2.187899442076199, "language_loss": 0.73284775, "learning_rate": 9.088835328537303e-08, "loss": 0.75440478, "num_input_tokens_seen": 163006900, "step": 7539, "time_per_iteration": 2.790241241455078 }, { "auxiliary_loss_clip": 0.01140371, "auxiliary_loss_mlp": 0.01022196, "balance_loss_clip": 1.04412627, "balance_loss_mlp": 1.01494217, "epoch": 0.9066313954187458, "flos": 23367863750400.0, "grad_norm": 1.9993671748199193, "language_loss": 0.7165553, "learning_rate": 9.065633947868568e-08, "loss": 0.73818099, "num_input_tokens_seen": 163026505, "step": 7540, "time_per_iteration": 2.682766914367676 }, { "auxiliary_loss_clip": 0.01122534, "auxiliary_loss_mlp": 0.00761649, "balance_loss_clip": 1.04290628, "balance_loss_mlp": 1.00032604, "epoch": 0.906751638309385, "flos": 26249623067520.0, "grad_norm": 7.352428582176788, "language_loss": 0.79932451, "learning_rate": 9.042461531791379e-08, "loss": 0.81816638, "num_input_tokens_seen": 163044925, "step": 7541, "time_per_iteration": 3.7136340141296387 }, { "auxiliary_loss_clip": 0.01161554, "auxiliary_loss_mlp": 0.01024747, "balance_loss_clip": 1.0446198, "balance_loss_mlp": 1.01800537, "epoch": 0.906871881200024, "flos": 16544477485440.0, "grad_norm": 1.7677273352814147, "language_loss": 0.77448726, "learning_rate": 9.019318083820903e-08, "loss": 0.79635036, "num_input_tokens_seen": 163063505, "step": 7542, "time_per_iteration": 2.5902040004730225 }, { "auxiliary_loss_clip": 0.01150281, "auxiliary_loss_mlp": 0.01025784, "balance_loss_clip": 1.04463542, "balance_loss_mlp": 1.01829731, "epoch": 0.9069921240906631, "flos": 24605129675520.0, "grad_norm": 1.652885743282099, "language_loss": 0.85334629, "learning_rate": 8.996203607468045e-08, "loss": 0.87510693, "num_input_tokens_seen": 163082505, "step": 7543, "time_per_iteration": 2.7205371856689453 }, { "auxiliary_loss_clip": 0.011436, "auxiliary_loss_mlp": 0.01023398, "balance_loss_clip": 1.03949761, "balance_loss_mlp": 1.01657009, "epoch": 0.9071123669813023, "flos": 25374731500800.0, "grad_norm": 1.4471665193130725, "language_loss": 0.75612509, "learning_rate": 8.973118106239241e-08, "loss": 0.77779508, "num_input_tokens_seen": 163105110, "step": 7544, "time_per_iteration": 3.639089822769165 }, { "auxiliary_loss_clip": 0.01094559, "auxiliary_loss_mlp": 0.01024351, "balance_loss_clip": 1.03490102, "balance_loss_mlp": 1.01705849, "epoch": 0.9072326098719413, "flos": 26725798690560.0, "grad_norm": 1.8874982612983875, "language_loss": 0.94777429, "learning_rate": 8.95006158363656e-08, "loss": 0.96896338, "num_input_tokens_seen": 163125295, "step": 7545, "time_per_iteration": 2.8518083095550537 }, { "auxiliary_loss_clip": 0.01153177, "auxiliary_loss_mlp": 0.01025357, "balance_loss_clip": 1.0468235, "balance_loss_mlp": 1.0180074, "epoch": 0.9073528527625804, "flos": 23878800760320.0, "grad_norm": 1.7101642751131165, "language_loss": 0.77300781, "learning_rate": 8.9270340431576e-08, "loss": 0.79479313, "num_input_tokens_seen": 163144385, "step": 7546, "time_per_iteration": 3.5462546348571777 }, { "auxiliary_loss_clip": 0.01152908, "auxiliary_loss_mlp": 0.01026916, "balance_loss_clip": 1.04404378, "balance_loss_mlp": 1.01985002, "epoch": 0.9074730956532195, "flos": 37852144767360.0, "grad_norm": 2.001078106446448, "language_loss": 0.73783016, "learning_rate": 8.904035488295658e-08, "loss": 0.75962842, "num_input_tokens_seen": 163163885, "step": 7547, "time_per_iteration": 2.874358892440796 }, { "auxiliary_loss_clip": 0.01054193, "auxiliary_loss_mlp": 0.00753663, "balance_loss_clip": 1.00993085, "balance_loss_mlp": 1.00017929, "epoch": 0.9075933385438586, "flos": 65173307385600.0, "grad_norm": 0.666049887924127, "language_loss": 0.53281349, "learning_rate": 8.881065922539632e-08, "loss": 0.55089206, "num_input_tokens_seen": 163224325, "step": 7548, "time_per_iteration": 4.027567625045776 }, { "auxiliary_loss_clip": 0.01118642, "auxiliary_loss_mlp": 0.01020538, "balance_loss_clip": 1.04331565, "balance_loss_mlp": 1.01412141, "epoch": 0.9077135814344977, "flos": 19931571290880.0, "grad_norm": 1.7157179521021957, "language_loss": 0.73469841, "learning_rate": 8.85812534937389e-08, "loss": 0.75609016, "num_input_tokens_seen": 163242425, "step": 7549, "time_per_iteration": 2.7390100955963135 }, { "auxiliary_loss_clip": 0.01157313, "auxiliary_loss_mlp": 0.01025567, "balance_loss_clip": 1.04546928, "balance_loss_mlp": 1.01839304, "epoch": 0.9078338243251368, "flos": 17529650784000.0, "grad_norm": 2.3573982909341757, "language_loss": 0.67368519, "learning_rate": 8.835213772278583e-08, "loss": 0.69551396, "num_input_tokens_seen": 163259280, "step": 7550, "time_per_iteration": 2.593651056289673 }, { "auxiliary_loss_clip": 0.01112739, "auxiliary_loss_mlp": 0.01019512, "balance_loss_clip": 1.04103625, "balance_loss_mlp": 1.01277113, "epoch": 0.9079540672157759, "flos": 28803410277120.0, "grad_norm": 1.8015758823918182, "language_loss": 0.79183877, "learning_rate": 8.812331194729373e-08, "loss": 0.81316125, "num_input_tokens_seen": 163278925, "step": 7551, "time_per_iteration": 2.7943496704101562 }, { "auxiliary_loss_clip": 0.0117083, "auxiliary_loss_mlp": 0.01029376, "balance_loss_clip": 1.04875088, "balance_loss_mlp": 1.02173209, "epoch": 0.9080743101064149, "flos": 23513840622720.0, "grad_norm": 1.8476795207579242, "language_loss": 0.71763378, "learning_rate": 8.789477620197461e-08, "loss": 0.73963588, "num_input_tokens_seen": 163298450, "step": 7552, "time_per_iteration": 2.6756112575531006 }, { "auxiliary_loss_clip": 0.01135941, "auxiliary_loss_mlp": 0.01025155, "balance_loss_clip": 1.04238939, "balance_loss_mlp": 1.01777327, "epoch": 0.9081945529970541, "flos": 22778102344320.0, "grad_norm": 5.3359523065614916, "language_loss": 0.79339969, "learning_rate": 8.766653052149831e-08, "loss": 0.81501061, "num_input_tokens_seen": 163313635, "step": 7553, "time_per_iteration": 2.6896326541900635 }, { "auxiliary_loss_clip": 0.01134449, "auxiliary_loss_mlp": 0.01022073, "balance_loss_clip": 1.04266405, "balance_loss_mlp": 1.015064, "epoch": 0.9083147958876931, "flos": 18873714821760.0, "grad_norm": 2.6474896535068826, "language_loss": 0.74551362, "learning_rate": 8.743857494048823e-08, "loss": 0.76707882, "num_input_tokens_seen": 163330450, "step": 7554, "time_per_iteration": 2.662921667098999 }, { "auxiliary_loss_clip": 0.01124985, "auxiliary_loss_mlp": 0.01025911, "balance_loss_clip": 1.04311299, "balance_loss_mlp": 1.01915765, "epoch": 0.9084350387783322, "flos": 18909374048640.0, "grad_norm": 1.9264900626783867, "language_loss": 0.62881768, "learning_rate": 8.721090949352605e-08, "loss": 0.65032661, "num_input_tokens_seen": 163346690, "step": 7555, "time_per_iteration": 2.68681001663208 }, { "auxiliary_loss_clip": 0.01161269, "auxiliary_loss_mlp": 0.01035409, "balance_loss_clip": 1.04711545, "balance_loss_mlp": 1.02710056, "epoch": 0.9085552816689714, "flos": 20595488325120.0, "grad_norm": 1.8231722980924816, "language_loss": 0.72986078, "learning_rate": 8.698353421514793e-08, "loss": 0.7518276, "num_input_tokens_seen": 163365065, "step": 7556, "time_per_iteration": 2.631441354751587 }, { "auxiliary_loss_clip": 0.01148064, "auxiliary_loss_mlp": 0.0102762, "balance_loss_clip": 1.04275632, "balance_loss_mlp": 1.0207293, "epoch": 0.9086755245596104, "flos": 18113163223680.0, "grad_norm": 4.100523558211616, "language_loss": 0.80881, "learning_rate": 8.67564491398467e-08, "loss": 0.83056682, "num_input_tokens_seen": 163382070, "step": 7557, "time_per_iteration": 2.5800838470458984 }, { "auxiliary_loss_clip": 0.01152595, "auxiliary_loss_mlp": 0.01028256, "balance_loss_clip": 1.04222643, "balance_loss_mlp": 1.0210979, "epoch": 0.9087957674502495, "flos": 19129793857920.0, "grad_norm": 1.946548101986808, "language_loss": 0.73406601, "learning_rate": 8.652965430207104e-08, "loss": 0.75587451, "num_input_tokens_seen": 163399975, "step": 7558, "time_per_iteration": 2.614776611328125 }, { "auxiliary_loss_clip": 0.01154894, "auxiliary_loss_mlp": 0.010258, "balance_loss_clip": 1.04393947, "balance_loss_mlp": 1.01896381, "epoch": 0.9089160103408886, "flos": 18109930999680.0, "grad_norm": 2.173657454733986, "language_loss": 0.65453291, "learning_rate": 8.630314973622521e-08, "loss": 0.67633986, "num_input_tokens_seen": 163417520, "step": 7559, "time_per_iteration": 2.6410157680511475 }, { "auxiliary_loss_clip": 0.01147415, "auxiliary_loss_mlp": 0.01026703, "balance_loss_clip": 1.04492664, "balance_loss_mlp": 1.01968122, "epoch": 0.9090362532315277, "flos": 33364855336320.0, "grad_norm": 2.028575082106824, "language_loss": 0.71012747, "learning_rate": 8.607693547666995e-08, "loss": 0.73186862, "num_input_tokens_seen": 163440060, "step": 7560, "time_per_iteration": 2.7446165084838867 }, { "auxiliary_loss_clip": 0.01034085, "auxiliary_loss_mlp": 0.01001634, "balance_loss_clip": 1.00928926, "balance_loss_mlp": 1.00053084, "epoch": 0.9091564961221668, "flos": 71480585082240.0, "grad_norm": 0.8842872766630178, "language_loss": 0.57948869, "learning_rate": 8.585101155772201e-08, "loss": 0.59984583, "num_input_tokens_seen": 163502180, "step": 7561, "time_per_iteration": 3.400104522705078 }, { "auxiliary_loss_clip": 0.01129703, "auxiliary_loss_mlp": 0.01023232, "balance_loss_clip": 1.03858352, "balance_loss_mlp": 1.01591289, "epoch": 0.9092767390128058, "flos": 24712574232960.0, "grad_norm": 2.0658978243693973, "language_loss": 0.68511021, "learning_rate": 8.562537801365377e-08, "loss": 0.70663959, "num_input_tokens_seen": 163521915, "step": 7562, "time_per_iteration": 2.70809006690979 }, { "auxiliary_loss_clip": 0.01166627, "auxiliary_loss_mlp": 0.01029731, "balance_loss_clip": 1.04529786, "balance_loss_mlp": 1.02227449, "epoch": 0.909396981903445, "flos": 23586487879680.0, "grad_norm": 1.9504683159369043, "language_loss": 0.70099032, "learning_rate": 8.540003487869362e-08, "loss": 0.72295386, "num_input_tokens_seen": 163543585, "step": 7563, "time_per_iteration": 2.7313616275787354 }, { "auxiliary_loss_clip": 0.01112535, "auxiliary_loss_mlp": 0.0102352, "balance_loss_clip": 1.03979373, "balance_loss_mlp": 1.01667738, "epoch": 0.909517224794084, "flos": 23404169422080.0, "grad_norm": 1.9694719006866148, "language_loss": 0.79831016, "learning_rate": 8.517498218702557e-08, "loss": 0.81967074, "num_input_tokens_seen": 163561515, "step": 7564, "time_per_iteration": 2.7416539192199707 }, { "auxiliary_loss_clip": 0.01117338, "auxiliary_loss_mlp": 0.01021145, "balance_loss_clip": 1.04021287, "balance_loss_mlp": 1.01445746, "epoch": 0.9096374676847231, "flos": 19208618254080.0, "grad_norm": 1.8523011294157627, "language_loss": 0.69645095, "learning_rate": 8.49502199727905e-08, "loss": 0.71783578, "num_input_tokens_seen": 163579540, "step": 7565, "time_per_iteration": 2.7775590419769287 }, { "auxiliary_loss_clip": 0.01145045, "auxiliary_loss_mlp": 0.01024026, "balance_loss_clip": 1.04031837, "balance_loss_mlp": 1.01656687, "epoch": 0.9097577105753623, "flos": 33292495388160.0, "grad_norm": 2.5717994917422438, "language_loss": 0.6620667, "learning_rate": 8.472574827008428e-08, "loss": 0.68375742, "num_input_tokens_seen": 163600425, "step": 7566, "time_per_iteration": 3.624068021774292 }, { "auxiliary_loss_clip": 0.01151878, "auxiliary_loss_mlp": 0.01019224, "balance_loss_clip": 1.04249239, "balance_loss_mlp": 1.012146, "epoch": 0.9098779534660013, "flos": 21906443001600.0, "grad_norm": 1.899111444721732, "language_loss": 0.83921778, "learning_rate": 8.450156711295942e-08, "loss": 0.86092883, "num_input_tokens_seen": 163620595, "step": 7567, "time_per_iteration": 2.7044951915740967 }, { "auxiliary_loss_clip": 0.01135316, "auxiliary_loss_mlp": 0.01019544, "balance_loss_clip": 1.04502118, "balance_loss_mlp": 1.0130204, "epoch": 0.9099981963566404, "flos": 25730354102400.0, "grad_norm": 2.2441126838804504, "language_loss": 0.8674525, "learning_rate": 8.427767653542383e-08, "loss": 0.88900113, "num_input_tokens_seen": 163635765, "step": 7568, "time_per_iteration": 2.6741533279418945 }, { "auxiliary_loss_clip": 0.01105929, "auxiliary_loss_mlp": 0.01023897, "balance_loss_clip": 1.03892255, "balance_loss_mlp": 1.01729012, "epoch": 0.9101184392472795, "flos": 21069437304960.0, "grad_norm": 1.9411886301653993, "language_loss": 0.7031737, "learning_rate": 8.405407657144125e-08, "loss": 0.72447193, "num_input_tokens_seen": 163654925, "step": 7569, "time_per_iteration": 3.765500545501709 }, { "auxiliary_loss_clip": 0.01132462, "auxiliary_loss_mlp": 0.01027365, "balance_loss_clip": 1.04251921, "balance_loss_mlp": 1.0205667, "epoch": 0.9102386821379186, "flos": 24752614919040.0, "grad_norm": 1.8801637001973244, "language_loss": 0.72531021, "learning_rate": 8.383076725493232e-08, "loss": 0.74690849, "num_input_tokens_seen": 163672245, "step": 7570, "time_per_iteration": 2.778780460357666 }, { "auxiliary_loss_clip": 0.0115004, "auxiliary_loss_mlp": 0.01019948, "balance_loss_clip": 1.04261577, "balance_loss_mlp": 1.01344776, "epoch": 0.9103589250285576, "flos": 22562818179840.0, "grad_norm": 1.8320344769673471, "language_loss": 0.68250501, "learning_rate": 8.360774861977216e-08, "loss": 0.70420492, "num_input_tokens_seen": 163691365, "step": 7571, "time_per_iteration": 2.6000194549560547 }, { "auxiliary_loss_clip": 0.01134425, "auxiliary_loss_mlp": 0.01021689, "balance_loss_clip": 1.03881752, "balance_loss_mlp": 1.0152632, "epoch": 0.9104791679191968, "flos": 25373474524800.0, "grad_norm": 1.8384781593638426, "language_loss": 0.7453596, "learning_rate": 8.338502069979281e-08, "loss": 0.76692075, "num_input_tokens_seen": 163711675, "step": 7572, "time_per_iteration": 3.5489108562469482 }, { "auxiliary_loss_clip": 0.01152576, "auxiliary_loss_mlp": 0.01027007, "balance_loss_clip": 1.04193687, "balance_loss_mlp": 1.01970863, "epoch": 0.9105994108098359, "flos": 14426681558400.0, "grad_norm": 3.820421690487028, "language_loss": 0.79727364, "learning_rate": 8.316258352878214e-08, "loss": 0.81906945, "num_input_tokens_seen": 163728095, "step": 7573, "time_per_iteration": 3.5043232440948486 }, { "auxiliary_loss_clip": 0.01152702, "auxiliary_loss_mlp": 0.01024579, "balance_loss_clip": 1.0412842, "balance_loss_mlp": 1.01696098, "epoch": 0.9107196537004749, "flos": 26718292748160.0, "grad_norm": 1.9296479617541982, "language_loss": 0.71383977, "learning_rate": 8.294043714048338e-08, "loss": 0.73561251, "num_input_tokens_seen": 163747175, "step": 7574, "time_per_iteration": 2.672872543334961 }, { "auxiliary_loss_clip": 0.01043754, "auxiliary_loss_mlp": 0.01000811, "balance_loss_clip": 1.01006448, "balance_loss_mlp": 0.99973768, "epoch": 0.9108398965911141, "flos": 66532634703360.0, "grad_norm": 0.7483655809720475, "language_loss": 0.60474563, "learning_rate": 8.271858156859624e-08, "loss": 0.62519133, "num_input_tokens_seen": 163812545, "step": 7575, "time_per_iteration": 3.38234543800354 }, { "auxiliary_loss_clip": 0.01162679, "auxiliary_loss_mlp": 0.01025358, "balance_loss_clip": 1.04518533, "balance_loss_mlp": 1.01813376, "epoch": 0.9109601394817531, "flos": 25411073086080.0, "grad_norm": 1.731867551314336, "language_loss": 0.73926282, "learning_rate": 8.249701684677557e-08, "loss": 0.76114321, "num_input_tokens_seen": 163833870, "step": 7576, "time_per_iteration": 2.7244439125061035 }, { "auxiliary_loss_clip": 0.0115293, "auxiliary_loss_mlp": 0.01021797, "balance_loss_clip": 1.0481602, "balance_loss_mlp": 1.01484728, "epoch": 0.9110803823723922, "flos": 22747794243840.0, "grad_norm": 1.6820271712045682, "language_loss": 0.80802846, "learning_rate": 8.227574300863294e-08, "loss": 0.82977569, "num_input_tokens_seen": 163854040, "step": 7577, "time_per_iteration": 2.66678524017334 }, { "auxiliary_loss_clip": 0.01142096, "auxiliary_loss_mlp": 0.01024786, "balance_loss_clip": 1.04468119, "balance_loss_mlp": 1.0174247, "epoch": 0.9112006252630314, "flos": 48469924131840.0, "grad_norm": 2.1013751158822878, "language_loss": 0.694116, "learning_rate": 8.205476008773548e-08, "loss": 0.71578491, "num_input_tokens_seen": 163878040, "step": 7578, "time_per_iteration": 2.855280637741089 }, { "auxiliary_loss_clip": 0.01117627, "auxiliary_loss_mlp": 0.01022892, "balance_loss_clip": 1.04395509, "balance_loss_mlp": 1.01659751, "epoch": 0.9113208681536704, "flos": 30009649829760.0, "grad_norm": 2.176768261107171, "language_loss": 0.82399273, "learning_rate": 8.183406811760596e-08, "loss": 0.84539795, "num_input_tokens_seen": 163897770, "step": 7579, "time_per_iteration": 2.7840023040771484 }, { "auxiliary_loss_clip": 0.01108667, "auxiliary_loss_mlp": 0.01024826, "balance_loss_clip": 1.03823876, "balance_loss_mlp": 1.01735127, "epoch": 0.9114411110443095, "flos": 25594971742080.0, "grad_norm": 1.528770714725637, "language_loss": 0.74013674, "learning_rate": 8.161366713172313e-08, "loss": 0.76147175, "num_input_tokens_seen": 163920160, "step": 7580, "time_per_iteration": 2.7960729598999023 }, { "auxiliary_loss_clip": 0.01126363, "auxiliary_loss_mlp": 0.01027997, "balance_loss_clip": 1.04052103, "balance_loss_mlp": 1.02039444, "epoch": 0.9115613539349486, "flos": 18399729928320.0, "grad_norm": 4.1563638957372255, "language_loss": 0.8413617, "learning_rate": 8.139355716352137e-08, "loss": 0.86290526, "num_input_tokens_seen": 163935000, "step": 7581, "time_per_iteration": 2.6205849647521973 }, { "auxiliary_loss_clip": 0.01139983, "auxiliary_loss_mlp": 0.0102295, "balance_loss_clip": 1.04232132, "balance_loss_mlp": 1.01584554, "epoch": 0.9116815968255877, "flos": 21726171619200.0, "grad_norm": 1.4819125631825243, "language_loss": 0.7014116, "learning_rate": 8.117373824639196e-08, "loss": 0.72304094, "num_input_tokens_seen": 163955265, "step": 7582, "time_per_iteration": 2.6819677352905273 }, { "auxiliary_loss_clip": 0.01063293, "auxiliary_loss_mlp": 0.01001445, "balance_loss_clip": 1.00952125, "balance_loss_mlp": 1.00039005, "epoch": 0.9118018397162267, "flos": 65363526835200.0, "grad_norm": 0.7190697644534829, "language_loss": 0.59229374, "learning_rate": 8.095421041368067e-08, "loss": 0.61294115, "num_input_tokens_seen": 164014680, "step": 7583, "time_per_iteration": 3.082826614379883 }, { "auxiliary_loss_clip": 0.01133824, "auxiliary_loss_mlp": 0.0076235, "balance_loss_clip": 1.04459012, "balance_loss_mlp": 1.00034189, "epoch": 0.9119220826068659, "flos": 20922885815040.0, "grad_norm": 1.8059256973161877, "language_loss": 0.70735234, "learning_rate": 8.073497369868999e-08, "loss": 0.72631419, "num_input_tokens_seen": 164033140, "step": 7584, "time_per_iteration": 2.845285177230835 }, { "auxiliary_loss_clip": 0.01143929, "auxiliary_loss_mlp": 0.01024215, "balance_loss_clip": 1.0435096, "balance_loss_mlp": 1.01716042, "epoch": 0.912042325497505, "flos": 28366449327360.0, "grad_norm": 1.6685230526558648, "language_loss": 0.75698781, "learning_rate": 8.051602813467772e-08, "loss": 0.77866924, "num_input_tokens_seen": 164054995, "step": 7585, "time_per_iteration": 2.7530479431152344 }, { "auxiliary_loss_clip": 0.01152444, "auxiliary_loss_mlp": 0.01025798, "balance_loss_clip": 1.04331684, "balance_loss_mlp": 1.01835918, "epoch": 0.912162568388144, "flos": 17566782468480.0, "grad_norm": 2.8226419960573224, "language_loss": 0.70898205, "learning_rate": 8.029737375485756e-08, "loss": 0.73076451, "num_input_tokens_seen": 164074225, "step": 7586, "time_per_iteration": 2.6744871139526367 }, { "auxiliary_loss_clip": 0.01165059, "auxiliary_loss_mlp": 0.01025661, "balance_loss_clip": 1.04511333, "balance_loss_mlp": 1.01842523, "epoch": 0.9122828112787832, "flos": 19827897661440.0, "grad_norm": 1.9056399274799611, "language_loss": 0.72658217, "learning_rate": 8.007901059239986e-08, "loss": 0.74848938, "num_input_tokens_seen": 164093505, "step": 7587, "time_per_iteration": 2.7120113372802734 }, { "auxiliary_loss_clip": 0.01136567, "auxiliary_loss_mlp": 0.01022578, "balance_loss_clip": 1.04137635, "balance_loss_mlp": 1.0158788, "epoch": 0.9124030541694222, "flos": 20813789232000.0, "grad_norm": 1.7846286034567664, "language_loss": 0.80371708, "learning_rate": 7.986093868042964e-08, "loss": 0.82530856, "num_input_tokens_seen": 164113750, "step": 7588, "time_per_iteration": 2.748192071914673 }, { "auxiliary_loss_clip": 0.01146757, "auxiliary_loss_mlp": 0.01024572, "balance_loss_clip": 1.04205239, "balance_loss_mlp": 1.01846886, "epoch": 0.9125232970600613, "flos": 25192305302400.0, "grad_norm": 2.2550696172116003, "language_loss": 0.6749078, "learning_rate": 7.964315805202826e-08, "loss": 0.69662106, "num_input_tokens_seen": 164134330, "step": 7589, "time_per_iteration": 2.7242565155029297 }, { "auxiliary_loss_clip": 0.01137178, "auxiliary_loss_mlp": 0.01028781, "balance_loss_clip": 1.04271328, "balance_loss_mlp": 1.02175379, "epoch": 0.9126435399507005, "flos": 19719591177600.0, "grad_norm": 1.942789697417644, "language_loss": 0.7344113, "learning_rate": 7.942566874023304e-08, "loss": 0.75607085, "num_input_tokens_seen": 164153515, "step": 7590, "time_per_iteration": 2.645998477935791 }, { "auxiliary_loss_clip": 0.01134033, "auxiliary_loss_mlp": 0.01025708, "balance_loss_clip": 1.04057574, "balance_loss_mlp": 1.0185051, "epoch": 0.9127637828413395, "flos": 19573614305280.0, "grad_norm": 2.199382407726409, "language_loss": 0.69409108, "learning_rate": 7.920847077803649e-08, "loss": 0.71568853, "num_input_tokens_seen": 164171305, "step": 7591, "time_per_iteration": 2.652820110321045 }, { "auxiliary_loss_clip": 0.01094308, "auxiliary_loss_mlp": 0.01029537, "balance_loss_clip": 1.03361201, "balance_loss_mlp": 1.02217841, "epoch": 0.9128840257319786, "flos": 20230635928320.0, "grad_norm": 1.7192022156005455, "language_loss": 0.82237589, "learning_rate": 7.899156419838826e-08, "loss": 0.84361434, "num_input_tokens_seen": 164190275, "step": 7592, "time_per_iteration": 3.6023526191711426 }, { "auxiliary_loss_clip": 0.0112109, "auxiliary_loss_mlp": 0.01025723, "balance_loss_clip": 1.0417273, "balance_loss_mlp": 1.01811695, "epoch": 0.9130042686226177, "flos": 24858658846080.0, "grad_norm": 1.6966404373521042, "language_loss": 0.65586615, "learning_rate": 7.87749490341918e-08, "loss": 0.67733431, "num_input_tokens_seen": 164210550, "step": 7593, "time_per_iteration": 2.720277786254883 }, { "auxiliary_loss_clip": 0.01168778, "auxiliary_loss_mlp": 0.01029471, "balance_loss_clip": 1.0469439, "balance_loss_mlp": 1.02159977, "epoch": 0.9131245115132568, "flos": 23581747284480.0, "grad_norm": 2.152154197513166, "language_loss": 0.83359933, "learning_rate": 7.855862531830836e-08, "loss": 0.85558176, "num_input_tokens_seen": 164226660, "step": 7594, "time_per_iteration": 2.5774543285369873 }, { "auxiliary_loss_clip": 0.01147869, "auxiliary_loss_mlp": 0.01029524, "balance_loss_clip": 1.04231191, "balance_loss_mlp": 1.02208877, "epoch": 0.9132447544038959, "flos": 19931607204480.0, "grad_norm": 2.637175875139042, "language_loss": 0.72541451, "learning_rate": 7.834259308355373e-08, "loss": 0.74718845, "num_input_tokens_seen": 164245425, "step": 7595, "time_per_iteration": 3.6096365451812744 }, { "auxiliary_loss_clip": 0.01078729, "auxiliary_loss_mlp": 0.01022352, "balance_loss_clip": 1.03595901, "balance_loss_mlp": 1.01530683, "epoch": 0.9133649972945349, "flos": 21981747864960.0, "grad_norm": 2.6485707028771146, "language_loss": 0.75203794, "learning_rate": 7.812685236269989e-08, "loss": 0.77304876, "num_input_tokens_seen": 164264085, "step": 7596, "time_per_iteration": 2.821704149246216 }, { "auxiliary_loss_clip": 0.01031093, "auxiliary_loss_mlp": 0.01000178, "balance_loss_clip": 1.01224458, "balance_loss_mlp": 0.9991768, "epoch": 0.9134852401851741, "flos": 71240523511680.0, "grad_norm": 0.7901730152502962, "language_loss": 0.58604628, "learning_rate": 7.791140318847445e-08, "loss": 0.606359, "num_input_tokens_seen": 164322220, "step": 7597, "time_per_iteration": 3.279336452484131 }, { "auxiliary_loss_clip": 0.01132185, "auxiliary_loss_mlp": 0.01019953, "balance_loss_clip": 1.04364705, "balance_loss_mlp": 1.01363492, "epoch": 0.9136054830758131, "flos": 23626923615360.0, "grad_norm": 1.6581025552927458, "language_loss": 0.80373216, "learning_rate": 7.769624559356081e-08, "loss": 0.82525349, "num_input_tokens_seen": 164345615, "step": 7598, "time_per_iteration": 3.714294195175171 }, { "auxiliary_loss_clip": 0.01149703, "auxiliary_loss_mlp": 0.01024349, "balance_loss_clip": 1.04329622, "balance_loss_mlp": 1.01686275, "epoch": 0.9137257259664522, "flos": 23438858981760.0, "grad_norm": 4.517381550510644, "language_loss": 0.75598282, "learning_rate": 7.748137961059842e-08, "loss": 0.77772331, "num_input_tokens_seen": 164359595, "step": 7599, "time_per_iteration": 3.556138038635254 }, { "auxiliary_loss_clip": 0.01161965, "auxiliary_loss_mlp": 0.0102144, "balance_loss_clip": 1.04588544, "balance_loss_mlp": 1.01427817, "epoch": 0.9138459688570914, "flos": 19127854523520.0, "grad_norm": 2.3608828447659986, "language_loss": 0.65603423, "learning_rate": 7.726680527218211e-08, "loss": 0.67786825, "num_input_tokens_seen": 164376635, "step": 7600, "time_per_iteration": 2.5688884258270264 }, { "auxiliary_loss_clip": 0.01161873, "auxiliary_loss_mlp": 0.0102145, "balance_loss_clip": 1.04131842, "balance_loss_mlp": 1.01436055, "epoch": 0.9139662117477304, "flos": 46281240714240.0, "grad_norm": 1.6694982545675758, "language_loss": 0.75366908, "learning_rate": 7.70525226108627e-08, "loss": 0.77550232, "num_input_tokens_seen": 164400305, "step": 7601, "time_per_iteration": 2.781250476837158 }, { "auxiliary_loss_clip": 0.01154398, "auxiliary_loss_mlp": 0.01023954, "balance_loss_clip": 1.04784989, "balance_loss_mlp": 1.0166142, "epoch": 0.9140864546383695, "flos": 22273198819200.0, "grad_norm": 1.7618234679457931, "language_loss": 0.79779029, "learning_rate": 7.683853165914666e-08, "loss": 0.81957376, "num_input_tokens_seen": 164418075, "step": 7602, "time_per_iteration": 2.641444683074951 }, { "auxiliary_loss_clip": 0.01110234, "auxiliary_loss_mlp": 0.01032169, "balance_loss_clip": 1.03989422, "balance_loss_mlp": 1.02482307, "epoch": 0.9142066975290086, "flos": 17530009920000.0, "grad_norm": 1.784081790343506, "language_loss": 0.77260429, "learning_rate": 7.662483244949602e-08, "loss": 0.79402834, "num_input_tokens_seen": 164435335, "step": 7603, "time_per_iteration": 2.7189106941223145 }, { "auxiliary_loss_clip": 0.01114496, "auxiliary_loss_mlp": 0.01026806, "balance_loss_clip": 1.04104626, "balance_loss_mlp": 1.01966536, "epoch": 0.9143269404196477, "flos": 17712148809600.0, "grad_norm": 2.1530088616812355, "language_loss": 0.80265927, "learning_rate": 7.641142501432951e-08, "loss": 0.8240723, "num_input_tokens_seen": 164451530, "step": 7604, "time_per_iteration": 2.689401865005493 }, { "auxiliary_loss_clip": 0.01130147, "auxiliary_loss_mlp": 0.01021668, "balance_loss_clip": 1.03933859, "balance_loss_mlp": 1.01463437, "epoch": 0.9144471833102867, "flos": 33323414019840.0, "grad_norm": 1.6793178621137155, "language_loss": 0.73211104, "learning_rate": 7.619830938602013e-08, "loss": 0.75362921, "num_input_tokens_seen": 164472755, "step": 7605, "time_per_iteration": 2.7503225803375244 }, { "auxiliary_loss_clip": 0.01147906, "auxiliary_loss_mlp": 0.01026589, "balance_loss_clip": 1.04498947, "balance_loss_mlp": 1.01971674, "epoch": 0.9145674262009259, "flos": 21068970428160.0, "grad_norm": 1.900537556057236, "language_loss": 0.82570368, "learning_rate": 7.598548559689777e-08, "loss": 0.84744865, "num_input_tokens_seen": 164491155, "step": 7606, "time_per_iteration": 2.633129835128784 }, { "auxiliary_loss_clip": 0.01115656, "auxiliary_loss_mlp": 0.01030475, "balance_loss_clip": 1.03836143, "balance_loss_mlp": 1.02351022, "epoch": 0.914687669091565, "flos": 16800269212800.0, "grad_norm": 2.307077001409553, "language_loss": 0.81585443, "learning_rate": 7.577295367924751e-08, "loss": 0.83731574, "num_input_tokens_seen": 164507555, "step": 7607, "time_per_iteration": 2.7347731590270996 }, { "auxiliary_loss_clip": 0.01143713, "auxiliary_loss_mlp": 0.01027443, "balance_loss_clip": 1.04541183, "balance_loss_mlp": 1.01986074, "epoch": 0.914807911982204, "flos": 25773627012480.0, "grad_norm": 1.7406286391012669, "language_loss": 0.82133275, "learning_rate": 7.556071366531002e-08, "loss": 0.84304434, "num_input_tokens_seen": 164528525, "step": 7608, "time_per_iteration": 2.7245092391967773 }, { "auxiliary_loss_clip": 0.01152767, "auxiliary_loss_mlp": 0.0102783, "balance_loss_clip": 1.04593718, "balance_loss_mlp": 1.01995659, "epoch": 0.9149281548728432, "flos": 19208043636480.0, "grad_norm": 1.8498768516262158, "language_loss": 0.78999949, "learning_rate": 7.53487655872822e-08, "loss": 0.81180549, "num_input_tokens_seen": 164547695, "step": 7609, "time_per_iteration": 2.587688446044922 }, { "auxiliary_loss_clip": 0.01111049, "auxiliary_loss_mlp": 0.01022645, "balance_loss_clip": 1.03733718, "balance_loss_mlp": 1.01591003, "epoch": 0.9150483977634822, "flos": 26870554500480.0, "grad_norm": 1.7347481323925467, "language_loss": 0.73972881, "learning_rate": 7.513710947731656e-08, "loss": 0.76106572, "num_input_tokens_seen": 164568905, "step": 7610, "time_per_iteration": 2.7572197914123535 }, { "auxiliary_loss_clip": 0.01129691, "auxiliary_loss_mlp": 0.01024837, "balance_loss_clip": 1.04239154, "balance_loss_mlp": 1.01810789, "epoch": 0.9151686406541213, "flos": 21908956953600.0, "grad_norm": 1.7641459596383027, "language_loss": 0.85138571, "learning_rate": 7.492574536752095e-08, "loss": 0.872931, "num_input_tokens_seen": 164588895, "step": 7611, "time_per_iteration": 2.627189874649048 }, { "auxiliary_loss_clip": 0.01147264, "auxiliary_loss_mlp": 0.01028747, "balance_loss_clip": 1.044451, "balance_loss_mlp": 1.02182662, "epoch": 0.9152888835447605, "flos": 27308556944640.0, "grad_norm": 1.9513278755997778, "language_loss": 0.78031385, "learning_rate": 7.471467328995907e-08, "loss": 0.80207396, "num_input_tokens_seen": 164607705, "step": 7612, "time_per_iteration": 2.7257165908813477 }, { "auxiliary_loss_clip": 0.01075451, "auxiliary_loss_mlp": 0.01024737, "balance_loss_clip": 1.03526545, "balance_loss_mlp": 1.01787937, "epoch": 0.9154091264353995, "flos": 13370728510080.0, "grad_norm": 2.5091948169916996, "language_loss": 0.61080194, "learning_rate": 7.450389327665018e-08, "loss": 0.63180387, "num_input_tokens_seen": 164625540, "step": 7613, "time_per_iteration": 2.9022209644317627 }, { "auxiliary_loss_clip": 0.01126207, "auxiliary_loss_mlp": 0.0102541, "balance_loss_clip": 1.04572904, "balance_loss_mlp": 1.01812375, "epoch": 0.9155293693260386, "flos": 20193037367040.0, "grad_norm": 2.358564985631856, "language_loss": 0.67833745, "learning_rate": 7.429340535957029e-08, "loss": 0.6998536, "num_input_tokens_seen": 164640735, "step": 7614, "time_per_iteration": 3.121786594390869 }, { "auxiliary_loss_clip": 0.01140241, "auxiliary_loss_mlp": 0.01024068, "balance_loss_clip": 1.04324031, "balance_loss_mlp": 1.01650977, "epoch": 0.9156496122166777, "flos": 19354990176000.0, "grad_norm": 2.802766172719222, "language_loss": 0.70359105, "learning_rate": 7.40832095706494e-08, "loss": 0.72523415, "num_input_tokens_seen": 164657430, "step": 7615, "time_per_iteration": 2.663567066192627 }, { "auxiliary_loss_clip": 0.01127128, "auxiliary_loss_mlp": 0.01024691, "balance_loss_clip": 1.04125392, "balance_loss_mlp": 1.0177207, "epoch": 0.9157698551073168, "flos": 21107287261440.0, "grad_norm": 1.8852668289405823, "language_loss": 0.80191004, "learning_rate": 7.387330594177443e-08, "loss": 0.82342827, "num_input_tokens_seen": 164679505, "step": 7616, "time_per_iteration": 2.676135540008545 }, { "auxiliary_loss_clip": 0.01115238, "auxiliary_loss_mlp": 0.01023149, "balance_loss_clip": 1.03894305, "balance_loss_mlp": 1.01631284, "epoch": 0.9158900979979558, "flos": 25193167228800.0, "grad_norm": 1.728839788754277, "language_loss": 0.79380012, "learning_rate": 7.366369450478749e-08, "loss": 0.81518388, "num_input_tokens_seen": 164700615, "step": 7617, "time_per_iteration": 2.7504794597625732 }, { "auxiliary_loss_clip": 0.01116585, "auxiliary_loss_mlp": 0.01026199, "balance_loss_clip": 1.03920627, "balance_loss_mlp": 1.01946688, "epoch": 0.916010340888595, "flos": 30146648302080.0, "grad_norm": 1.5442085069660143, "language_loss": 0.66633689, "learning_rate": 7.345437529148646e-08, "loss": 0.68776476, "num_input_tokens_seen": 164719625, "step": 7618, "time_per_iteration": 3.721467971801758 }, { "auxiliary_loss_clip": 0.0112489, "auxiliary_loss_mlp": 0.01026657, "balance_loss_clip": 1.0419488, "balance_loss_mlp": 1.01977825, "epoch": 0.9161305837792341, "flos": 17091827907840.0, "grad_norm": 2.6593901856350812, "language_loss": 0.7274915, "learning_rate": 7.324534833362483e-08, "loss": 0.74900693, "num_input_tokens_seen": 164737200, "step": 7619, "time_per_iteration": 2.7362148761749268 }, { "auxiliary_loss_clip": 0.01137455, "auxiliary_loss_mlp": 0.01022947, "balance_loss_clip": 1.04205132, "balance_loss_mlp": 1.01657236, "epoch": 0.9162508266698731, "flos": 22893699288960.0, "grad_norm": 1.6415295583188405, "language_loss": 0.68614334, "learning_rate": 7.303661366291192e-08, "loss": 0.7077474, "num_input_tokens_seen": 164757870, "step": 7620, "time_per_iteration": 2.693411350250244 }, { "auxiliary_loss_clip": 0.01107475, "auxiliary_loss_mlp": 0.0102492, "balance_loss_clip": 1.03981805, "balance_loss_mlp": 1.01788664, "epoch": 0.9163710695605123, "flos": 19974808287360.0, "grad_norm": 1.7953852896391316, "language_loss": 0.81623489, "learning_rate": 7.28281713110126e-08, "loss": 0.83755887, "num_input_tokens_seen": 164775945, "step": 7621, "time_per_iteration": 3.728485107421875 }, { "auxiliary_loss_clip": 0.0113043, "auxiliary_loss_mlp": 0.01022789, "balance_loss_clip": 1.04280972, "balance_loss_mlp": 1.01614356, "epoch": 0.9164913124511513, "flos": 22783812606720.0, "grad_norm": 1.867148212658535, "language_loss": 0.77450442, "learning_rate": 7.262002130954759e-08, "loss": 0.7960366, "num_input_tokens_seen": 164794400, "step": 7622, "time_per_iteration": 2.6745216846466064 }, { "auxiliary_loss_clip": 0.01111425, "auxiliary_loss_mlp": 0.01026451, "balance_loss_clip": 1.04043388, "balance_loss_mlp": 1.01926887, "epoch": 0.9166115553417904, "flos": 24900854348160.0, "grad_norm": 2.1311247673049714, "language_loss": 0.78787553, "learning_rate": 7.241216369009296e-08, "loss": 0.80925429, "num_input_tokens_seen": 164814585, "step": 7623, "time_per_iteration": 2.73293399810791 }, { "auxiliary_loss_clip": 0.01163305, "auxiliary_loss_mlp": 0.01025931, "balance_loss_clip": 1.04226422, "balance_loss_mlp": 1.018924, "epoch": 0.9167317982324296, "flos": 25702919089920.0, "grad_norm": 1.8615897489775755, "language_loss": 0.66519636, "learning_rate": 7.220459848418037e-08, "loss": 0.68708873, "num_input_tokens_seen": 164834660, "step": 7624, "time_per_iteration": 3.549938440322876 }, { "auxiliary_loss_clip": 0.01164544, "auxiliary_loss_mlp": 0.01026804, "balance_loss_clip": 1.04658031, "balance_loss_mlp": 1.0202837, "epoch": 0.9168520411230686, "flos": 15632813370240.0, "grad_norm": 1.7799323220635896, "language_loss": 0.79827434, "learning_rate": 7.199732572329708e-08, "loss": 0.82018787, "num_input_tokens_seen": 164852560, "step": 7625, "time_per_iteration": 2.5520882606506348 }, { "auxiliary_loss_clip": 0.01123865, "auxiliary_loss_mlp": 0.01029197, "balance_loss_clip": 1.03993666, "balance_loss_mlp": 1.02202666, "epoch": 0.9169722840137077, "flos": 30258151096320.0, "grad_norm": 2.941219494880475, "language_loss": 0.75851977, "learning_rate": 7.179034543888684e-08, "loss": 0.7800504, "num_input_tokens_seen": 164872065, "step": 7626, "time_per_iteration": 3.6876707077026367 }, { "auxiliary_loss_clip": 0.01152278, "auxiliary_loss_mlp": 0.01025647, "balance_loss_clip": 1.04301643, "balance_loss_mlp": 1.01892972, "epoch": 0.9170925269043467, "flos": 22491643380480.0, "grad_norm": 2.096808553636907, "language_loss": 0.77309978, "learning_rate": 7.158365766234808e-08, "loss": 0.79487908, "num_input_tokens_seen": 164890915, "step": 7627, "time_per_iteration": 2.5866832733154297 }, { "auxiliary_loss_clip": 0.01115637, "auxiliary_loss_mlp": 0.01024898, "balance_loss_clip": 1.0377872, "balance_loss_mlp": 1.01720321, "epoch": 0.9172127697949859, "flos": 22893914770560.0, "grad_norm": 1.9842777395160318, "language_loss": 0.72528505, "learning_rate": 7.137726242503527e-08, "loss": 0.74669039, "num_input_tokens_seen": 164909835, "step": 7628, "time_per_iteration": 2.6439201831817627 }, { "auxiliary_loss_clip": 0.0115277, "auxiliary_loss_mlp": 0.00761873, "balance_loss_clip": 1.04654431, "balance_loss_mlp": 1.00031793, "epoch": 0.917333012685625, "flos": 17451867882240.0, "grad_norm": 2.447961429849324, "language_loss": 0.78464431, "learning_rate": 7.11711597582585e-08, "loss": 0.80379081, "num_input_tokens_seen": 164927195, "step": 7629, "time_per_iteration": 2.6704189777374268 }, { "auxiliary_loss_clip": 0.01122298, "auxiliary_loss_mlp": 0.01024904, "balance_loss_clip": 1.03776443, "balance_loss_mlp": 1.0183444, "epoch": 0.917453255576264, "flos": 14318949692160.0, "grad_norm": 1.6419150571552963, "language_loss": 0.80087495, "learning_rate": 7.096534969328271e-08, "loss": 0.82234699, "num_input_tokens_seen": 164944640, "step": 7630, "time_per_iteration": 2.6644692420959473 }, { "auxiliary_loss_clip": 0.01137082, "auxiliary_loss_mlp": 0.01030446, "balance_loss_clip": 1.03846693, "balance_loss_mlp": 1.02337658, "epoch": 0.9175734984669032, "flos": 20741177888640.0, "grad_norm": 2.0198863804563447, "language_loss": 0.83611512, "learning_rate": 7.075983226132987e-08, "loss": 0.85779035, "num_input_tokens_seen": 164963570, "step": 7631, "time_per_iteration": 2.680760383605957 }, { "auxiliary_loss_clip": 0.01142049, "auxiliary_loss_mlp": 0.00762939, "balance_loss_clip": 1.0424819, "balance_loss_mlp": 1.00035906, "epoch": 0.9176937413575422, "flos": 14830497233280.0, "grad_norm": 2.52335114141742, "language_loss": 0.79377848, "learning_rate": 7.055460749357656e-08, "loss": 0.81282842, "num_input_tokens_seen": 164979850, "step": 7632, "time_per_iteration": 2.6580758094787598 }, { "auxiliary_loss_clip": 0.01136496, "auxiliary_loss_mlp": 0.0102626, "balance_loss_clip": 1.04303777, "balance_loss_mlp": 1.01873446, "epoch": 0.9178139842481813, "flos": 18474603828480.0, "grad_norm": 1.869498651624272, "language_loss": 0.70246351, "learning_rate": 7.034967542115521e-08, "loss": 0.72409105, "num_input_tokens_seen": 164998115, "step": 7633, "time_per_iteration": 2.6734867095947266 }, { "auxiliary_loss_clip": 0.01140981, "auxiliary_loss_mlp": 0.00762484, "balance_loss_clip": 1.04158628, "balance_loss_mlp": 1.00033212, "epoch": 0.9179342271388204, "flos": 20047455544320.0, "grad_norm": 2.2114816819982983, "language_loss": 0.75524771, "learning_rate": 7.014503607515388e-08, "loss": 0.77428234, "num_input_tokens_seen": 165017420, "step": 7634, "time_per_iteration": 2.6444485187530518 }, { "auxiliary_loss_clip": 0.01138164, "auxiliary_loss_mlp": 0.01024408, "balance_loss_clip": 1.04567313, "balance_loss_mlp": 1.01735413, "epoch": 0.9180544700294595, "flos": 24676232647680.0, "grad_norm": 1.9738919811691342, "language_loss": 0.67790091, "learning_rate": 6.994068948661592e-08, "loss": 0.69952661, "num_input_tokens_seen": 165035575, "step": 7635, "time_per_iteration": 2.7300009727478027 }, { "auxiliary_loss_clip": 0.0115136, "auxiliary_loss_mlp": 0.01030427, "balance_loss_clip": 1.04479647, "balance_loss_mlp": 1.02267814, "epoch": 0.9181747129200986, "flos": 16727478301440.0, "grad_norm": 1.9951825633749778, "language_loss": 0.76715302, "learning_rate": 6.973663568654142e-08, "loss": 0.78897095, "num_input_tokens_seen": 165053280, "step": 7636, "time_per_iteration": 2.684580087661743 }, { "auxiliary_loss_clip": 0.01163982, "auxiliary_loss_mlp": 0.01024264, "balance_loss_clip": 1.04551339, "balance_loss_mlp": 1.01671219, "epoch": 0.9182949558107377, "flos": 24271626873600.0, "grad_norm": 2.448733952281188, "language_loss": 0.6515274, "learning_rate": 6.953287470588386e-08, "loss": 0.67340988, "num_input_tokens_seen": 165071235, "step": 7637, "time_per_iteration": 2.6215314865112305 }, { "auxiliary_loss_clip": 0.01152627, "auxiliary_loss_mlp": 0.0102303, "balance_loss_clip": 1.04227209, "balance_loss_mlp": 1.0157789, "epoch": 0.9184151987013768, "flos": 22082117443200.0, "grad_norm": 2.750667652079265, "language_loss": 0.85994065, "learning_rate": 6.932940657555452e-08, "loss": 0.88169718, "num_input_tokens_seen": 165087365, "step": 7638, "time_per_iteration": 2.656836748123169 }, { "auxiliary_loss_clip": 0.01159731, "auxiliary_loss_mlp": 0.01019139, "balance_loss_clip": 1.04389262, "balance_loss_mlp": 1.01264179, "epoch": 0.9185354415920158, "flos": 32166732257280.0, "grad_norm": 2.1639079552361027, "language_loss": 0.7611686, "learning_rate": 6.912623132641938e-08, "loss": 0.78295732, "num_input_tokens_seen": 165112455, "step": 7639, "time_per_iteration": 2.686985492706299 }, { "auxiliary_loss_clip": 0.01139448, "auxiliary_loss_mlp": 0.01023865, "balance_loss_clip": 1.04355955, "balance_loss_mlp": 1.01614034, "epoch": 0.918655684482655, "flos": 20997831542400.0, "grad_norm": 1.9807113032764863, "language_loss": 0.76794708, "learning_rate": 6.892334898929952e-08, "loss": 0.78958029, "num_input_tokens_seen": 165132700, "step": 7640, "time_per_iteration": 2.7348110675811768 }, { "auxiliary_loss_clip": 0.01145623, "auxiliary_loss_mlp": 0.01022254, "balance_loss_clip": 1.04226339, "balance_loss_mlp": 1.01523888, "epoch": 0.918775927373294, "flos": 15560704817280.0, "grad_norm": 2.0129661269443395, "language_loss": 0.84737813, "learning_rate": 6.872075959497236e-08, "loss": 0.86905694, "num_input_tokens_seen": 165151475, "step": 7641, "time_per_iteration": 2.7128875255584717 }, { "auxiliary_loss_clip": 0.01153216, "auxiliary_loss_mlp": 0.01025167, "balance_loss_clip": 1.04248774, "balance_loss_mlp": 1.01825881, "epoch": 0.9188961702639331, "flos": 29934057657600.0, "grad_norm": 2.258568265033229, "language_loss": 0.83064222, "learning_rate": 6.85184631741702e-08, "loss": 0.85242611, "num_input_tokens_seen": 165172040, "step": 7642, "time_per_iteration": 2.6453428268432617 }, { "auxiliary_loss_clip": 0.0114767, "auxiliary_loss_mlp": 0.0103171, "balance_loss_clip": 1.04225051, "balance_loss_mlp": 1.02467394, "epoch": 0.9190164131545723, "flos": 20701244943360.0, "grad_norm": 2.0796683284365662, "language_loss": 0.77517885, "learning_rate": 6.831645975758161e-08, "loss": 0.79697263, "num_input_tokens_seen": 165189980, "step": 7643, "time_per_iteration": 2.629753589630127 }, { "auxiliary_loss_clip": 0.01130956, "auxiliary_loss_mlp": 0.01023539, "balance_loss_clip": 1.04325032, "balance_loss_mlp": 1.01606476, "epoch": 0.9191366560452113, "flos": 25629912696960.0, "grad_norm": 1.9907374066992514, "language_loss": 0.67289507, "learning_rate": 6.811474937585026e-08, "loss": 0.69443995, "num_input_tokens_seen": 165209770, "step": 7644, "time_per_iteration": 2.652667760848999 }, { "auxiliary_loss_clip": 0.01119277, "auxiliary_loss_mlp": 0.01023546, "balance_loss_clip": 1.0411669, "balance_loss_mlp": 1.0168016, "epoch": 0.9192568989358504, "flos": 21434325615360.0, "grad_norm": 2.4927396848398664, "language_loss": 0.79362714, "learning_rate": 6.79133320595755e-08, "loss": 0.81505537, "num_input_tokens_seen": 165229690, "step": 7645, "time_per_iteration": 3.74513578414917 }, { "auxiliary_loss_clip": 0.01137623, "auxiliary_loss_mlp": 0.01021838, "balance_loss_clip": 1.04342985, "balance_loss_mlp": 1.01495349, "epoch": 0.9193771418264896, "flos": 23185078416000.0, "grad_norm": 1.7976703587555092, "language_loss": 0.75304443, "learning_rate": 6.771220783931198e-08, "loss": 0.77463901, "num_input_tokens_seen": 165249850, "step": 7646, "time_per_iteration": 2.66698956489563 }, { "auxiliary_loss_clip": 0.00996804, "auxiliary_loss_mlp": 0.0075364, "balance_loss_clip": 1.01898706, "balance_loss_mlp": 1.00017786, "epoch": 0.9194973847171286, "flos": 70582963184640.0, "grad_norm": 0.867275891797583, "language_loss": 0.64610088, "learning_rate": 6.751137674556994e-08, "loss": 0.66360533, "num_input_tokens_seen": 165310235, "step": 7647, "time_per_iteration": 4.630884647369385 }, { "auxiliary_loss_clip": 0.0115395, "auxiliary_loss_mlp": 0.01025895, "balance_loss_clip": 1.04309464, "balance_loss_mlp": 1.01827717, "epoch": 0.9196176276077677, "flos": 14720682378240.0, "grad_norm": 70.57632204719853, "language_loss": 0.77120328, "learning_rate": 6.731083880881572e-08, "loss": 0.79300171, "num_input_tokens_seen": 165326455, "step": 7648, "time_per_iteration": 4.192774295806885 }, { "auxiliary_loss_clip": 0.01136006, "auxiliary_loss_mlp": 0.01023382, "balance_loss_clip": 1.04259741, "balance_loss_mlp": 1.01659584, "epoch": 0.9197378704984068, "flos": 23294893271040.0, "grad_norm": 1.9800479003521174, "language_loss": 0.81027013, "learning_rate": 6.711059405947072e-08, "loss": 0.831864, "num_input_tokens_seen": 165344645, "step": 7649, "time_per_iteration": 2.6996259689331055 }, { "auxiliary_loss_clip": 0.01120303, "auxiliary_loss_mlp": 0.01020789, "balance_loss_clip": 1.04294538, "balance_loss_mlp": 1.01405656, "epoch": 0.9198581133890459, "flos": 20302564913280.0, "grad_norm": 3.3700757297459853, "language_loss": 0.77239776, "learning_rate": 6.691064252791156e-08, "loss": 0.79380864, "num_input_tokens_seen": 165364120, "step": 7650, "time_per_iteration": 3.7209854125976562 }, { "auxiliary_loss_clip": 0.01100536, "auxiliary_loss_mlp": 0.01022365, "balance_loss_clip": 1.03865767, "balance_loss_mlp": 1.01515007, "epoch": 0.9199783562796849, "flos": 17675663569920.0, "grad_norm": 1.5602623581482853, "language_loss": 0.7813217, "learning_rate": 6.67109842444713e-08, "loss": 0.80255079, "num_input_tokens_seen": 165383050, "step": 7651, "time_per_iteration": 2.925755262374878 }, { "auxiliary_loss_clip": 0.01146705, "auxiliary_loss_mlp": 0.00762315, "balance_loss_clip": 1.0441488, "balance_loss_mlp": 1.00032091, "epoch": 0.9200985991703241, "flos": 17676022705920.0, "grad_norm": 1.8653425193656061, "language_loss": 0.7673403, "learning_rate": 6.651161923943704e-08, "loss": 0.78643048, "num_input_tokens_seen": 165400955, "step": 7652, "time_per_iteration": 3.6874988079071045 }, { "auxiliary_loss_clip": 0.01146305, "auxiliary_loss_mlp": 0.01022659, "balance_loss_clip": 1.04178286, "balance_loss_mlp": 1.01551247, "epoch": 0.9202188420609632, "flos": 20996574566400.0, "grad_norm": 1.6967670848506242, "language_loss": 0.76773512, "learning_rate": 6.631254754305326e-08, "loss": 0.78942478, "num_input_tokens_seen": 165420415, "step": 7653, "time_per_iteration": 2.711350202560425 }, { "auxiliary_loss_clip": 0.01167242, "auxiliary_loss_mlp": 0.01023096, "balance_loss_clip": 1.04577065, "balance_loss_mlp": 1.01557946, "epoch": 0.9203390849516022, "flos": 13918222586880.0, "grad_norm": 2.1981564807417704, "language_loss": 0.78283787, "learning_rate": 6.611376918551848e-08, "loss": 0.8047412, "num_input_tokens_seen": 165439200, "step": 7654, "time_per_iteration": 2.568079948425293 }, { "auxiliary_loss_clip": 0.01120523, "auxiliary_loss_mlp": 0.00762521, "balance_loss_clip": 1.03998601, "balance_loss_mlp": 1.00033617, "epoch": 0.9204593278422414, "flos": 21175912195200.0, "grad_norm": 2.085154580491555, "language_loss": 0.79172349, "learning_rate": 6.591528419698744e-08, "loss": 0.81055391, "num_input_tokens_seen": 165458985, "step": 7655, "time_per_iteration": 2.7401700019836426 }, { "auxiliary_loss_clip": 0.01137792, "auxiliary_loss_mlp": 0.01025445, "balance_loss_clip": 1.04041171, "balance_loss_mlp": 1.01899588, "epoch": 0.9205795707328804, "flos": 14501375890560.0, "grad_norm": 2.190766671938305, "language_loss": 0.83071071, "learning_rate": 6.571709260756986e-08, "loss": 0.85234308, "num_input_tokens_seen": 165475630, "step": 7656, "time_per_iteration": 2.7331273555755615 }, { "auxiliary_loss_clip": 0.01156201, "auxiliary_loss_mlp": 0.01027043, "balance_loss_clip": 1.04998958, "balance_loss_mlp": 1.0197835, "epoch": 0.9206998136235195, "flos": 22417559579520.0, "grad_norm": 3.0191293233101946, "language_loss": 0.75988764, "learning_rate": 6.551919444733122e-08, "loss": 0.7817201, "num_input_tokens_seen": 165493445, "step": 7657, "time_per_iteration": 2.6642906665802 }, { "auxiliary_loss_clip": 0.01138683, "auxiliary_loss_mlp": 0.01025722, "balance_loss_clip": 1.04506397, "balance_loss_mlp": 1.01845634, "epoch": 0.9208200565141585, "flos": 53358407544960.0, "grad_norm": 1.8877926264656681, "language_loss": 0.65719759, "learning_rate": 6.53215897462931e-08, "loss": 0.67884165, "num_input_tokens_seen": 165517200, "step": 7658, "time_per_iteration": 3.002746105194092 }, { "auxiliary_loss_clip": 0.01147631, "auxiliary_loss_mlp": 0.01021545, "balance_loss_clip": 1.04217911, "balance_loss_mlp": 1.01453865, "epoch": 0.9209402994047977, "flos": 30589139946240.0, "grad_norm": 2.2884652133290686, "language_loss": 0.75016123, "learning_rate": 6.512427853443103e-08, "loss": 0.77185297, "num_input_tokens_seen": 165539280, "step": 7659, "time_per_iteration": 2.835571765899658 }, { "auxiliary_loss_clip": 0.01152354, "auxiliary_loss_mlp": 0.0102613, "balance_loss_clip": 1.04364312, "balance_loss_mlp": 1.01909065, "epoch": 0.9210605422954368, "flos": 29132711187840.0, "grad_norm": 1.5970410887814221, "language_loss": 0.75950086, "learning_rate": 6.492726084167799e-08, "loss": 0.78128564, "num_input_tokens_seen": 165561395, "step": 7660, "time_per_iteration": 2.7324769496917725 }, { "auxiliary_loss_clip": 0.01063044, "auxiliary_loss_mlp": 0.01000822, "balance_loss_clip": 1.00922656, "balance_loss_mlp": 0.99980301, "epoch": 0.9211807851860758, "flos": 54853838472960.0, "grad_norm": 0.7788976093199151, "language_loss": 0.57500839, "learning_rate": 6.473053669792072e-08, "loss": 0.59564704, "num_input_tokens_seen": 165616085, "step": 7661, "time_per_iteration": 3.1087706089019775 }, { "auxiliary_loss_clip": 0.01150209, "auxiliary_loss_mlp": 0.01024553, "balance_loss_clip": 1.04261994, "balance_loss_mlp": 1.016927, "epoch": 0.921301028076715, "flos": 19201974238080.0, "grad_norm": 2.9915358886369012, "language_loss": 0.73130906, "learning_rate": 6.453410613300248e-08, "loss": 0.75305665, "num_input_tokens_seen": 165634015, "step": 7662, "time_per_iteration": 2.6802449226379395 }, { "auxiliary_loss_clip": 0.01095627, "auxiliary_loss_mlp": 0.01026433, "balance_loss_clip": 1.03967595, "balance_loss_mlp": 1.01968312, "epoch": 0.921421270967354, "flos": 27526893765120.0, "grad_norm": 1.7618804112599578, "language_loss": 0.58599901, "learning_rate": 6.43379691767214e-08, "loss": 0.60721964, "num_input_tokens_seen": 165653220, "step": 7663, "time_per_iteration": 2.894577741622925 }, { "auxiliary_loss_clip": 0.0102345, "auxiliary_loss_mlp": 0.01000491, "balance_loss_clip": 1.00990915, "balance_loss_mlp": 0.99949569, "epoch": 0.9215415138579931, "flos": 70209311955840.0, "grad_norm": 0.7141141417373716, "language_loss": 0.55136305, "learning_rate": 6.414212585883105e-08, "loss": 0.57160246, "num_input_tokens_seen": 165715850, "step": 7664, "time_per_iteration": 3.3525962829589844 }, { "auxiliary_loss_clip": 0.01138997, "auxiliary_loss_mlp": 0.01023297, "balance_loss_clip": 1.04227829, "balance_loss_mlp": 1.01615047, "epoch": 0.9216617567486323, "flos": 35553107790720.0, "grad_norm": 1.8456535018688633, "language_loss": 0.69853044, "learning_rate": 6.394657620904143e-08, "loss": 0.72015333, "num_input_tokens_seen": 165738960, "step": 7665, "time_per_iteration": 2.802016258239746 }, { "auxiliary_loss_clip": 0.01168234, "auxiliary_loss_mlp": 0.01026099, "balance_loss_clip": 1.04581404, "balance_loss_mlp": 1.01884794, "epoch": 0.9217819996392713, "flos": 29533330552320.0, "grad_norm": 2.393104760792951, "language_loss": 0.7203747, "learning_rate": 6.375132025701657e-08, "loss": 0.74231803, "num_input_tokens_seen": 165761260, "step": 7666, "time_per_iteration": 2.674694299697876 }, { "auxiliary_loss_clip": 0.01169845, "auxiliary_loss_mlp": 0.01025043, "balance_loss_clip": 1.04806769, "balance_loss_mlp": 1.0174017, "epoch": 0.9219022425299104, "flos": 14574669592320.0, "grad_norm": 2.250622879798371, "language_loss": 0.69104517, "learning_rate": 6.355635803237724e-08, "loss": 0.7129941, "num_input_tokens_seen": 165776960, "step": 7667, "time_per_iteration": 2.6515181064605713 }, { "auxiliary_loss_clip": 0.0115057, "auxiliary_loss_mlp": 0.01031006, "balance_loss_clip": 1.04344392, "balance_loss_mlp": 1.02328718, "epoch": 0.9220224854205495, "flos": 18077503996800.0, "grad_norm": 3.2091062199766895, "language_loss": 0.79625744, "learning_rate": 6.336168956469867e-08, "loss": 0.81807315, "num_input_tokens_seen": 165795435, "step": 7668, "time_per_iteration": 2.6381819248199463 }, { "auxiliary_loss_clip": 0.01128403, "auxiliary_loss_mlp": 0.01025541, "balance_loss_clip": 1.04125762, "balance_loss_mlp": 1.01809072, "epoch": 0.9221427283111886, "flos": 24790464875520.0, "grad_norm": 1.7808514836310947, "language_loss": 0.71702391, "learning_rate": 6.316731488351168e-08, "loss": 0.73856336, "num_input_tokens_seen": 165816625, "step": 7669, "time_per_iteration": 2.751556873321533 }, { "auxiliary_loss_clip": 0.01149715, "auxiliary_loss_mlp": 0.01023992, "balance_loss_clip": 1.04318225, "balance_loss_mlp": 1.01687849, "epoch": 0.9222629712018277, "flos": 13845036625920.0, "grad_norm": 1.6897075047621537, "language_loss": 0.63340747, "learning_rate": 6.297323401830334e-08, "loss": 0.65514451, "num_input_tokens_seen": 165835410, "step": 7670, "time_per_iteration": 3.536201000213623 }, { "auxiliary_loss_clip": 0.01151627, "auxiliary_loss_mlp": 0.01027991, "balance_loss_clip": 1.04266334, "balance_loss_mlp": 1.02054918, "epoch": 0.9223832140924668, "flos": 21616177196160.0, "grad_norm": 2.086363842336269, "language_loss": 0.68748069, "learning_rate": 6.277944699851523e-08, "loss": 0.70927685, "num_input_tokens_seen": 165854930, "step": 7671, "time_per_iteration": 2.622403383255005 }, { "auxiliary_loss_clip": 0.01165635, "auxiliary_loss_mlp": 0.01025591, "balance_loss_clip": 1.04562235, "balance_loss_mlp": 1.01754999, "epoch": 0.9225034569831059, "flos": 21142084561920.0, "grad_norm": 1.824916280736535, "language_loss": 0.73132509, "learning_rate": 6.25859538535447e-08, "loss": 0.75323737, "num_input_tokens_seen": 165875725, "step": 7672, "time_per_iteration": 2.6278393268585205 }, { "auxiliary_loss_clip": 0.01135963, "auxiliary_loss_mlp": 0.01026589, "balance_loss_clip": 1.04331851, "balance_loss_mlp": 1.01968372, "epoch": 0.9226236998737449, "flos": 12495046844160.0, "grad_norm": 2.6617504956215927, "language_loss": 0.78394806, "learning_rate": 6.239275461274474e-08, "loss": 0.8055737, "num_input_tokens_seen": 165892100, "step": 7673, "time_per_iteration": 4.027694225311279 }, { "auxiliary_loss_clip": 0.01149761, "auxiliary_loss_mlp": 0.01024672, "balance_loss_clip": 1.04413927, "balance_loss_mlp": 1.01800776, "epoch": 0.9227439427643841, "flos": 26214071581440.0, "grad_norm": 1.644299290024939, "language_loss": 0.85973966, "learning_rate": 6.219984930542299e-08, "loss": 0.88148403, "num_input_tokens_seen": 165912840, "step": 7674, "time_per_iteration": 2.726835012435913 }, { "auxiliary_loss_clip": 0.01155129, "auxiliary_loss_mlp": 0.01022842, "balance_loss_clip": 1.04499102, "balance_loss_mlp": 1.01555538, "epoch": 0.9228641856550232, "flos": 17967581400960.0, "grad_norm": 2.0040158320986907, "language_loss": 0.76088351, "learning_rate": 6.200723796084383e-08, "loss": 0.78266323, "num_input_tokens_seen": 165930935, "step": 7675, "time_per_iteration": 2.6631922721862793 }, { "auxiliary_loss_clip": 0.01037869, "auxiliary_loss_mlp": 0.01000994, "balance_loss_clip": 1.01008344, "balance_loss_mlp": 1.00000465, "epoch": 0.9229844285456622, "flos": 70420609710720.0, "grad_norm": 0.7606740451221454, "language_loss": 0.62987721, "learning_rate": 6.181492060822546e-08, "loss": 0.65026581, "num_input_tokens_seen": 165991110, "step": 7676, "time_per_iteration": 4.32470965385437 }, { "auxiliary_loss_clip": 0.01106338, "auxiliary_loss_mlp": 0.01025192, "balance_loss_clip": 1.03828168, "balance_loss_mlp": 1.01772368, "epoch": 0.9231046714363014, "flos": 17967832796160.0, "grad_norm": 2.4555163423599895, "language_loss": 0.81684756, "learning_rate": 6.162289727674274e-08, "loss": 0.83816284, "num_input_tokens_seen": 166008790, "step": 7677, "time_per_iteration": 2.6765387058258057 }, { "auxiliary_loss_clip": 0.01122744, "auxiliary_loss_mlp": 0.01024339, "balance_loss_clip": 1.04137444, "balance_loss_mlp": 1.01794076, "epoch": 0.9232249143269404, "flos": 17858233422720.0, "grad_norm": 2.232418189839277, "language_loss": 0.87472391, "learning_rate": 6.143116799552527e-08, "loss": 0.8961947, "num_input_tokens_seen": 166025035, "step": 7678, "time_per_iteration": 3.789640426635742 }, { "auxiliary_loss_clip": 0.01155602, "auxiliary_loss_mlp": 0.01024751, "balance_loss_clip": 1.04488897, "balance_loss_mlp": 1.01753891, "epoch": 0.9233451572175795, "flos": 23404384903680.0, "grad_norm": 2.6675227806483353, "language_loss": 0.56304872, "learning_rate": 6.123973279365802e-08, "loss": 0.58485222, "num_input_tokens_seen": 166044010, "step": 7679, "time_per_iteration": 2.6596646308898926 }, { "auxiliary_loss_clip": 0.01153774, "auxiliary_loss_mlp": 0.01024702, "balance_loss_clip": 1.04497588, "balance_loss_mlp": 1.01772547, "epoch": 0.9234654001082186, "flos": 17999326045440.0, "grad_norm": 1.8551810414797425, "language_loss": 0.77686268, "learning_rate": 6.10485917001824e-08, "loss": 0.7986474, "num_input_tokens_seen": 166061865, "step": 7680, "time_per_iteration": 2.670602321624756 }, { "auxiliary_loss_clip": 0.01140925, "auxiliary_loss_mlp": 0.01024871, "balance_loss_clip": 1.04155183, "balance_loss_mlp": 1.01825237, "epoch": 0.9235856429988577, "flos": 24750747411840.0, "grad_norm": 1.6655642386366822, "language_loss": 0.80844843, "learning_rate": 6.085774474409322e-08, "loss": 0.83010644, "num_input_tokens_seen": 166082425, "step": 7681, "time_per_iteration": 2.7491488456726074 }, { "auxiliary_loss_clip": 0.011375, "auxiliary_loss_mlp": 0.01027086, "balance_loss_clip": 1.04409826, "balance_loss_mlp": 1.01980853, "epoch": 0.9237058858894968, "flos": 14099894599680.0, "grad_norm": 1.8356102126232892, "language_loss": 0.703978, "learning_rate": 6.066719195434267e-08, "loss": 0.72562391, "num_input_tokens_seen": 166100225, "step": 7682, "time_per_iteration": 2.7139244079589844 }, { "auxiliary_loss_clip": 0.01156666, "auxiliary_loss_mlp": 0.01033974, "balance_loss_clip": 1.04629803, "balance_loss_mlp": 1.02591884, "epoch": 0.9238261287801359, "flos": 28694529175680.0, "grad_norm": 2.0994964481574447, "language_loss": 0.66366994, "learning_rate": 6.047693335983717e-08, "loss": 0.68557632, "num_input_tokens_seen": 166122570, "step": 7683, "time_per_iteration": 2.8360519409179688 }, { "auxiliary_loss_clip": 0.01151223, "auxiliary_loss_mlp": 0.01023838, "balance_loss_clip": 1.04180574, "balance_loss_mlp": 1.01650023, "epoch": 0.923946371670775, "flos": 23111856541440.0, "grad_norm": 2.4617144184375834, "language_loss": 0.8238374, "learning_rate": 6.028696898943853e-08, "loss": 0.84558797, "num_input_tokens_seen": 166141630, "step": 7684, "time_per_iteration": 2.680131196975708 }, { "auxiliary_loss_clip": 0.01136026, "auxiliary_loss_mlp": 0.00762714, "balance_loss_clip": 1.0415529, "balance_loss_mlp": 1.00036728, "epoch": 0.924066614561414, "flos": 21867120587520.0, "grad_norm": 2.022335845583231, "language_loss": 0.70889747, "learning_rate": 6.00972988719648e-08, "loss": 0.72788489, "num_input_tokens_seen": 166159865, "step": 7685, "time_per_iteration": 2.7904062271118164 }, { "auxiliary_loss_clip": 0.01126611, "auxiliary_loss_mlp": 0.00762828, "balance_loss_clip": 1.04316783, "balance_loss_mlp": 1.00034618, "epoch": 0.9241868574520532, "flos": 28511887495680.0, "grad_norm": 3.230472128418823, "language_loss": 0.7063309, "learning_rate": 5.990792303618807e-08, "loss": 0.72522527, "num_input_tokens_seen": 166179445, "step": 7686, "time_per_iteration": 2.8453075885772705 }, { "auxiliary_loss_clip": 0.01120322, "auxiliary_loss_mlp": 0.01021851, "balance_loss_clip": 1.04075718, "balance_loss_mlp": 1.01530051, "epoch": 0.9243071003426923, "flos": 30518324282880.0, "grad_norm": 1.7530778305320385, "language_loss": 0.6972394, "learning_rate": 5.971884151083695e-08, "loss": 0.71866119, "num_input_tokens_seen": 166201855, "step": 7687, "time_per_iteration": 2.809601068496704 }, { "auxiliary_loss_clip": 0.0114012, "auxiliary_loss_mlp": 0.01023131, "balance_loss_clip": 1.04236031, "balance_loss_mlp": 1.0165211, "epoch": 0.9244273432333313, "flos": 28658331244800.0, "grad_norm": 1.9290235363172512, "language_loss": 0.7426399, "learning_rate": 5.9530054324595124e-08, "loss": 0.76427245, "num_input_tokens_seen": 166221970, "step": 7688, "time_per_iteration": 2.8052711486816406 }, { "auxiliary_loss_clip": 0.01048168, "auxiliary_loss_mlp": 0.00753568, "balance_loss_clip": 1.00862217, "balance_loss_mlp": 1.00020397, "epoch": 0.9245475861239704, "flos": 66230589237120.0, "grad_norm": 0.7175134977871213, "language_loss": 0.5755372, "learning_rate": 5.934156150610103e-08, "loss": 0.59355462, "num_input_tokens_seen": 166279335, "step": 7689, "time_per_iteration": 3.410245418548584 }, { "auxiliary_loss_clip": 0.01133828, "auxiliary_loss_mlp": 0.0103039, "balance_loss_clip": 1.04050279, "balance_loss_mlp": 1.0231303, "epoch": 0.9246678290146095, "flos": 24239918142720.0, "grad_norm": 2.374628247061118, "language_loss": 0.79123747, "learning_rate": 5.915336308394914e-08, "loss": 0.81287974, "num_input_tokens_seen": 166298170, "step": 7690, "time_per_iteration": 2.792848825454712 }, { "auxiliary_loss_clip": 0.0114488, "auxiliary_loss_mlp": 0.01021306, "balance_loss_clip": 1.0429188, "balance_loss_mlp": 1.01473486, "epoch": 0.9247880719052486, "flos": 18988808976000.0, "grad_norm": 1.595738349584563, "language_loss": 0.77083743, "learning_rate": 5.89654590866886e-08, "loss": 0.7924993, "num_input_tokens_seen": 166317670, "step": 7691, "time_per_iteration": 2.6710033416748047 }, { "auxiliary_loss_clip": 0.01099415, "auxiliary_loss_mlp": 0.01025507, "balance_loss_clip": 1.04200244, "balance_loss_mlp": 1.01795471, "epoch": 0.9249083147958876, "flos": 24024095274240.0, "grad_norm": 1.977395743562468, "language_loss": 0.88056278, "learning_rate": 5.877784954282483e-08, "loss": 0.90181202, "num_input_tokens_seen": 166337010, "step": 7692, "time_per_iteration": 2.828361749649048 }, { "auxiliary_loss_clip": 0.011534, "auxiliary_loss_mlp": 0.01027702, "balance_loss_clip": 1.04268837, "balance_loss_mlp": 1.02012002, "epoch": 0.9250285576865268, "flos": 30773972355840.0, "grad_norm": 2.3693818580247368, "language_loss": 0.72656834, "learning_rate": 5.8590534480817963e-08, "loss": 0.74837935, "num_input_tokens_seen": 166358735, "step": 7693, "time_per_iteration": 2.7988007068634033 }, { "auxiliary_loss_clip": 0.0116767, "auxiliary_loss_mlp": 0.0102939, "balance_loss_clip": 1.04789639, "balance_loss_mlp": 1.02187657, "epoch": 0.9251488005771659, "flos": 10633581348480.0, "grad_norm": 2.347855859426235, "language_loss": 0.72441661, "learning_rate": 5.840351392908349e-08, "loss": 0.74638718, "num_input_tokens_seen": 166374455, "step": 7694, "time_per_iteration": 2.6447720527648926 }, { "auxiliary_loss_clip": 0.01143829, "auxiliary_loss_mlp": 0.00762014, "balance_loss_clip": 1.04234385, "balance_loss_mlp": 1.00035703, "epoch": 0.9252690434678049, "flos": 23586416052480.0, "grad_norm": 6.237704858237511, "language_loss": 0.70988679, "learning_rate": 5.821678791599205e-08, "loss": 0.72894514, "num_input_tokens_seen": 166393900, "step": 7695, "time_per_iteration": 2.7154760360717773 }, { "auxiliary_loss_clip": 0.01136815, "auxiliary_loss_mlp": 0.01027565, "balance_loss_clip": 1.04388678, "balance_loss_mlp": 1.0205729, "epoch": 0.9253892863584441, "flos": 21469158829440.0, "grad_norm": 1.6283441928302789, "language_loss": 0.8097589, "learning_rate": 5.803035646986965e-08, "loss": 0.83140272, "num_input_tokens_seen": 166413235, "step": 7696, "time_per_iteration": 3.6199681758880615 }, { "auxiliary_loss_clip": 0.01165244, "auxiliary_loss_mlp": 0.01027533, "balance_loss_clip": 1.04470158, "balance_loss_mlp": 1.02030325, "epoch": 0.9255095292490831, "flos": 17456680304640.0, "grad_norm": 2.5567230482555066, "language_loss": 0.67423368, "learning_rate": 5.7844219618998766e-08, "loss": 0.69616139, "num_input_tokens_seen": 166427560, "step": 7697, "time_per_iteration": 2.623157262802124 }, { "auxiliary_loss_clip": 0.01109336, "auxiliary_loss_mlp": 0.01033842, "balance_loss_clip": 1.03674126, "balance_loss_mlp": 1.02708602, "epoch": 0.9256297721397222, "flos": 24750675584640.0, "grad_norm": 1.847020266103541, "language_loss": 0.71910119, "learning_rate": 5.765837739161505e-08, "loss": 0.74053299, "num_input_tokens_seen": 166446680, "step": 7698, "time_per_iteration": 3.7242591381073 }, { "auxiliary_loss_clip": 0.01123075, "auxiliary_loss_mlp": 0.01022052, "balance_loss_clip": 1.04039359, "balance_loss_mlp": 1.01491773, "epoch": 0.9257500150303614, "flos": 23112215677440.0, "grad_norm": 1.8150193322032848, "language_loss": 0.7442897, "learning_rate": 5.7472829815911504e-08, "loss": 0.76574099, "num_input_tokens_seen": 166465505, "step": 7699, "time_per_iteration": 2.7585813999176025 }, { "auxiliary_loss_clip": 0.01130298, "auxiliary_loss_mlp": 0.01028837, "balance_loss_clip": 1.04032016, "balance_loss_mlp": 1.02172577, "epoch": 0.9258702579210004, "flos": 22564685687040.0, "grad_norm": 1.6215126110760196, "language_loss": 0.81709594, "learning_rate": 5.7287576920035164e-08, "loss": 0.8386873, "num_input_tokens_seen": 166484520, "step": 7700, "time_per_iteration": 2.8206911087036133 }, { "auxiliary_loss_clip": 0.01118429, "auxiliary_loss_mlp": 0.01031752, "balance_loss_clip": 1.03959584, "balance_loss_mlp": 1.02483237, "epoch": 0.9259905008116395, "flos": 30004298703360.0, "grad_norm": 1.857430771510208, "language_loss": 0.76705909, "learning_rate": 5.7102618732088435e-08, "loss": 0.78856099, "num_input_tokens_seen": 166503850, "step": 7701, "time_per_iteration": 2.8854329586029053 }, { "auxiliary_loss_clip": 0.01142672, "auxiliary_loss_mlp": 0.01024779, "balance_loss_clip": 1.04337943, "balance_loss_mlp": 1.01835966, "epoch": 0.9261107437022786, "flos": 24572128055040.0, "grad_norm": 1.8672752245825892, "language_loss": 0.74538279, "learning_rate": 5.6917955280130216e-08, "loss": 0.76705724, "num_input_tokens_seen": 166525330, "step": 7702, "time_per_iteration": 3.691372871398926 }, { "auxiliary_loss_clip": 0.0114916, "auxiliary_loss_mlp": 0.01023412, "balance_loss_clip": 1.04585612, "balance_loss_mlp": 1.01657844, "epoch": 0.9262309865929177, "flos": 22018448586240.0, "grad_norm": 2.3341077036594244, "language_loss": 0.72752434, "learning_rate": 5.6733586592172755e-08, "loss": 0.74925005, "num_input_tokens_seen": 166544825, "step": 7703, "time_per_iteration": 2.6406376361846924 }, { "auxiliary_loss_clip": 0.0112942, "auxiliary_loss_mlp": 0.00761828, "balance_loss_clip": 1.03906143, "balance_loss_mlp": 1.00030303, "epoch": 0.9263512294835567, "flos": 20339481116160.0, "grad_norm": 2.4114946264566774, "language_loss": 0.80249923, "learning_rate": 5.6549512696185244e-08, "loss": 0.82141173, "num_input_tokens_seen": 166563325, "step": 7704, "time_per_iteration": 3.6386144161224365 }, { "auxiliary_loss_clip": 0.01162408, "auxiliary_loss_mlp": 0.01023545, "balance_loss_clip": 1.04532897, "balance_loss_mlp": 1.01630008, "epoch": 0.9264714723741959, "flos": 21215378263680.0, "grad_norm": 1.9149634046535378, "language_loss": 0.68489289, "learning_rate": 5.636573362009156e-08, "loss": 0.70675248, "num_input_tokens_seen": 166583385, "step": 7705, "time_per_iteration": 2.6227099895477295 }, { "auxiliary_loss_clip": 0.01167996, "auxiliary_loss_mlp": 0.01025802, "balance_loss_clip": 1.04506803, "balance_loss_mlp": 1.01862526, "epoch": 0.926591715264835, "flos": 18004964480640.0, "grad_norm": 1.9307717490346854, "language_loss": 0.7680614, "learning_rate": 5.618224939177074e-08, "loss": 0.78999943, "num_input_tokens_seen": 166601290, "step": 7706, "time_per_iteration": 2.6301047801971436 }, { "auxiliary_loss_clip": 0.01128483, "auxiliary_loss_mlp": 0.01020984, "balance_loss_clip": 1.04107785, "balance_loss_mlp": 1.01399827, "epoch": 0.926711958155474, "flos": 36167969825280.0, "grad_norm": 1.7482415298251806, "language_loss": 0.7038185, "learning_rate": 5.599906003905719e-08, "loss": 0.72531319, "num_input_tokens_seen": 166623835, "step": 7707, "time_per_iteration": 2.8109073638916016 }, { "auxiliary_loss_clip": 0.01146601, "auxiliary_loss_mlp": 0.01025647, "balance_loss_clip": 1.04575801, "balance_loss_mlp": 1.01844645, "epoch": 0.9268322010461132, "flos": 21032736583680.0, "grad_norm": 2.7586734975875222, "language_loss": 0.81805748, "learning_rate": 5.581616558974023e-08, "loss": 0.83977997, "num_input_tokens_seen": 166642400, "step": 7708, "time_per_iteration": 2.644639015197754 }, { "auxiliary_loss_clip": 0.01158762, "auxiliary_loss_mlp": 0.00762795, "balance_loss_clip": 1.04607558, "balance_loss_mlp": 1.00039482, "epoch": 0.9269524439367522, "flos": 22964838174720.0, "grad_norm": 1.8495868924937249, "language_loss": 0.79094899, "learning_rate": 5.5633566071565444e-08, "loss": 0.81016445, "num_input_tokens_seen": 166661640, "step": 7709, "time_per_iteration": 2.706026077270508 }, { "auxiliary_loss_clip": 0.01100208, "auxiliary_loss_mlp": 0.0102268, "balance_loss_clip": 1.03918409, "balance_loss_mlp": 1.01580787, "epoch": 0.9270726868273913, "flos": 41975551468800.0, "grad_norm": 2.160482751388584, "language_loss": 0.70487028, "learning_rate": 5.5451261512232896e-08, "loss": 0.72609913, "num_input_tokens_seen": 166684320, "step": 7710, "time_per_iteration": 3.10477614402771 }, { "auxiliary_loss_clip": 0.01154358, "auxiliary_loss_mlp": 0.01021083, "balance_loss_clip": 1.04213214, "balance_loss_mlp": 1.01404417, "epoch": 0.9271929297180305, "flos": 19791771557760.0, "grad_norm": 1.7845967766221538, "language_loss": 0.62753314, "learning_rate": 5.5269251939397576e-08, "loss": 0.64928758, "num_input_tokens_seen": 166703835, "step": 7711, "time_per_iteration": 2.689244508743286 }, { "auxiliary_loss_clip": 0.01122783, "auxiliary_loss_mlp": 0.01022362, "balance_loss_clip": 1.03814077, "balance_loss_mlp": 1.01464629, "epoch": 0.9273131726086695, "flos": 19968343839360.0, "grad_norm": 2.1437829452280672, "language_loss": 0.77060151, "learning_rate": 5.508753738067073e-08, "loss": 0.79205298, "num_input_tokens_seen": 166723375, "step": 7712, "time_per_iteration": 2.7311367988586426 }, { "auxiliary_loss_clip": 0.01153428, "auxiliary_loss_mlp": 0.01021447, "balance_loss_clip": 1.04258609, "balance_loss_mlp": 1.01460707, "epoch": 0.9274334154993086, "flos": 23258587599360.0, "grad_norm": 2.346389908135547, "language_loss": 0.79110593, "learning_rate": 5.4906117863617875e-08, "loss": 0.81285465, "num_input_tokens_seen": 166742760, "step": 7713, "time_per_iteration": 2.702477216720581 }, { "auxiliary_loss_clip": 0.01117625, "auxiliary_loss_mlp": 0.01023248, "balance_loss_clip": 1.03842402, "balance_loss_mlp": 1.01635206, "epoch": 0.9275536583899477, "flos": 31795343585280.0, "grad_norm": 1.7867036294819574, "language_loss": 0.78419089, "learning_rate": 5.4724993415760533e-08, "loss": 0.80559963, "num_input_tokens_seen": 166761115, "step": 7714, "time_per_iteration": 2.7863175868988037 }, { "auxiliary_loss_clip": 0.01132952, "auxiliary_loss_mlp": 0.007624, "balance_loss_clip": 1.04109061, "balance_loss_mlp": 1.00035238, "epoch": 0.9276739012805868, "flos": 18696998885760.0, "grad_norm": 2.7862575117878676, "language_loss": 0.7454533, "learning_rate": 5.454416406457496e-08, "loss": 0.7644068, "num_input_tokens_seen": 166780210, "step": 7715, "time_per_iteration": 2.8035240173339844 }, { "auxiliary_loss_clip": 0.01150929, "auxiliary_loss_mlp": 0.0102653, "balance_loss_clip": 1.04287648, "balance_loss_mlp": 1.02032471, "epoch": 0.9277941441712259, "flos": 13879079740800.0, "grad_norm": 2.4838415438350925, "language_loss": 0.74083537, "learning_rate": 5.436362983749299e-08, "loss": 0.76260996, "num_input_tokens_seen": 166795380, "step": 7716, "time_per_iteration": 2.705544948577881 }, { "auxiliary_loss_clip": 0.01115742, "auxiliary_loss_mlp": 0.01024231, "balance_loss_clip": 1.0432514, "balance_loss_mlp": 1.01745379, "epoch": 0.927914387061865, "flos": 23258659426560.0, "grad_norm": 2.020664356382373, "language_loss": 0.64387786, "learning_rate": 5.418339076190137e-08, "loss": 0.66527754, "num_input_tokens_seen": 166814890, "step": 7717, "time_per_iteration": 2.7757089138031006 }, { "auxiliary_loss_clip": 0.01132373, "auxiliary_loss_mlp": 0.01027146, "balance_loss_clip": 1.0440948, "balance_loss_mlp": 1.01937068, "epoch": 0.9280346299525041, "flos": 18073733068800.0, "grad_norm": 1.989673923929145, "language_loss": 0.88488573, "learning_rate": 5.400344686514202e-08, "loss": 0.90648091, "num_input_tokens_seen": 166832475, "step": 7718, "time_per_iteration": 2.7214879989624023 }, { "auxiliary_loss_clip": 0.01149, "auxiliary_loss_mlp": 0.01021924, "balance_loss_clip": 1.04427052, "balance_loss_mlp": 1.01536417, "epoch": 0.9281548728431431, "flos": 22342901160960.0, "grad_norm": 1.8614258488940845, "language_loss": 0.66426712, "learning_rate": 5.38237981745131e-08, "loss": 0.68597633, "num_input_tokens_seen": 166850590, "step": 7719, "time_per_iteration": 2.7828149795532227 }, { "auxiliary_loss_clip": 0.01153923, "auxiliary_loss_mlp": 0.00762196, "balance_loss_clip": 1.04462957, "balance_loss_mlp": 1.00030065, "epoch": 0.9282751157337822, "flos": 18843765857280.0, "grad_norm": 1.6077372123470461, "language_loss": 0.81444955, "learning_rate": 5.364444471726592e-08, "loss": 0.83361077, "num_input_tokens_seen": 166869795, "step": 7720, "time_per_iteration": 2.7692368030548096 }, { "auxiliary_loss_clip": 0.01149455, "auxiliary_loss_mlp": 0.01021736, "balance_loss_clip": 1.04323602, "balance_loss_mlp": 1.01416874, "epoch": 0.9283953586244214, "flos": 25556834476800.0, "grad_norm": 4.966131890377145, "language_loss": 0.80048931, "learning_rate": 5.346538652060939e-08, "loss": 0.82220125, "num_input_tokens_seen": 166891150, "step": 7721, "time_per_iteration": 2.727600336074829 }, { "auxiliary_loss_clip": 0.01134397, "auxiliary_loss_mlp": 0.01025266, "balance_loss_clip": 1.04376006, "balance_loss_mlp": 1.01860809, "epoch": 0.9285156015150604, "flos": 18223480869120.0, "grad_norm": 1.8025786567549196, "language_loss": 0.70239776, "learning_rate": 5.3286623611705994e-08, "loss": 0.72399437, "num_input_tokens_seen": 166909195, "step": 7722, "time_per_iteration": 3.6132497787475586 }, { "auxiliary_loss_clip": 0.01063176, "auxiliary_loss_mlp": 0.0100254, "balance_loss_clip": 1.00952041, "balance_loss_mlp": 1.00153315, "epoch": 0.9286358444056995, "flos": 66400017690240.0, "grad_norm": 0.8200105678183449, "language_loss": 0.60591877, "learning_rate": 5.3108156017673824e-08, "loss": 0.62657589, "num_input_tokens_seen": 166970955, "step": 7723, "time_per_iteration": 3.297715187072754 }, { "auxiliary_loss_clip": 0.01143477, "auxiliary_loss_mlp": 0.01027459, "balance_loss_clip": 1.04320085, "balance_loss_mlp": 1.02024138, "epoch": 0.9287560872963386, "flos": 22345630594560.0, "grad_norm": 1.691352115176769, "language_loss": 0.71634185, "learning_rate": 5.2929983765586775e-08, "loss": 0.73805124, "num_input_tokens_seen": 166989735, "step": 7724, "time_per_iteration": 3.694568157196045 }, { "auxiliary_loss_clip": 0.01166674, "auxiliary_loss_mlp": 0.01027243, "balance_loss_clip": 1.04615474, "balance_loss_mlp": 1.02043641, "epoch": 0.9288763301869777, "flos": 25700225569920.0, "grad_norm": 1.915673040649655, "language_loss": 0.62453049, "learning_rate": 5.275210688247278e-08, "loss": 0.64646965, "num_input_tokens_seen": 167010060, "step": 7725, "time_per_iteration": 2.6725692749023438 }, { "auxiliary_loss_clip": 0.01108398, "auxiliary_loss_mlp": 0.01023689, "balance_loss_clip": 1.04114199, "balance_loss_mlp": 1.01704264, "epoch": 0.9289965730776167, "flos": 12312046028160.0, "grad_norm": 1.9191507914748371, "language_loss": 0.85203362, "learning_rate": 5.257452539531604e-08, "loss": 0.87335443, "num_input_tokens_seen": 167027130, "step": 7726, "time_per_iteration": 2.78682017326355 }, { "auxiliary_loss_clip": 0.01151033, "auxiliary_loss_mlp": 0.01022542, "balance_loss_clip": 1.04237795, "balance_loss_mlp": 1.01578546, "epoch": 0.9291168159682559, "flos": 26685973486080.0, "grad_norm": 1.5091065190472088, "language_loss": 0.68430847, "learning_rate": 5.2397239331055445e-08, "loss": 0.7060442, "num_input_tokens_seen": 167049130, "step": 7727, "time_per_iteration": 2.804305076599121 }, { "auxiliary_loss_clip": 0.0113368, "auxiliary_loss_mlp": 0.01025415, "balance_loss_clip": 1.04384935, "balance_loss_mlp": 1.01819086, "epoch": 0.929237058858895, "flos": 14538256179840.0, "grad_norm": 2.1821373105038697, "language_loss": 0.80717742, "learning_rate": 5.2220248716585036e-08, "loss": 0.82876837, "num_input_tokens_seen": 167066810, "step": 7728, "time_per_iteration": 3.5550451278686523 }, { "auxiliary_loss_clip": 0.01143415, "auxiliary_loss_mlp": 0.01022517, "balance_loss_clip": 1.04186797, "balance_loss_mlp": 1.01476288, "epoch": 0.929357301749534, "flos": 23835456023040.0, "grad_norm": 2.52075062372884, "language_loss": 0.75634712, "learning_rate": 5.204355357875445e-08, "loss": 0.77800643, "num_input_tokens_seen": 167085155, "step": 7729, "time_per_iteration": 2.7067008018493652 }, { "auxiliary_loss_clip": 0.01136003, "auxiliary_loss_mlp": 0.01026526, "balance_loss_clip": 1.04264712, "balance_loss_mlp": 1.01962376, "epoch": 0.9294775446401732, "flos": 12969319046400.0, "grad_norm": 11.53088398761752, "language_loss": 0.70732695, "learning_rate": 5.1867153944367584e-08, "loss": 0.72895217, "num_input_tokens_seen": 167101545, "step": 7730, "time_per_iteration": 3.5283844470977783 }, { "auxiliary_loss_clip": 0.01128284, "auxiliary_loss_mlp": 0.01029308, "balance_loss_clip": 1.04191828, "balance_loss_mlp": 1.02215815, "epoch": 0.9295977875308122, "flos": 26211809024640.0, "grad_norm": 1.6003775611316275, "language_loss": 0.73592728, "learning_rate": 5.16910498401848e-08, "loss": 0.75750321, "num_input_tokens_seen": 167120995, "step": 7731, "time_per_iteration": 2.791487455368042 }, { "auxiliary_loss_clip": 0.01163227, "auxiliary_loss_mlp": 0.01024939, "balance_loss_clip": 1.04568398, "balance_loss_mlp": 1.01838517, "epoch": 0.9297180304214513, "flos": 16472297105280.0, "grad_norm": 2.149901837926539, "language_loss": 0.83360624, "learning_rate": 5.151524129292073e-08, "loss": 0.85548782, "num_input_tokens_seen": 167138890, "step": 7732, "time_per_iteration": 2.5565011501312256 }, { "auxiliary_loss_clip": 0.01151755, "auxiliary_loss_mlp": 0.01022984, "balance_loss_clip": 1.04456377, "balance_loss_mlp": 1.01572108, "epoch": 0.9298382733120905, "flos": 24060436859520.0, "grad_norm": 2.1451038748985938, "language_loss": 0.666682, "learning_rate": 5.1339728329245155e-08, "loss": 0.68842936, "num_input_tokens_seen": 167159455, "step": 7733, "time_per_iteration": 2.7086613178253174 }, { "auxiliary_loss_clip": 0.01170176, "auxiliary_loss_mlp": 0.01030394, "balance_loss_clip": 1.04722846, "balance_loss_mlp": 1.02274621, "epoch": 0.9299585162027295, "flos": 22127652910080.0, "grad_norm": 2.264131385128065, "language_loss": 0.79724526, "learning_rate": 5.116451097578367e-08, "loss": 0.81925094, "num_input_tokens_seen": 167178495, "step": 7734, "time_per_iteration": 2.5959465503692627 }, { "auxiliary_loss_clip": 0.01121119, "auxiliary_loss_mlp": 0.01025314, "balance_loss_clip": 1.04005265, "balance_loss_mlp": 1.01862907, "epoch": 0.9300787590933686, "flos": 21471780522240.0, "grad_norm": 1.587551270804688, "language_loss": 0.74489617, "learning_rate": 5.0989589259115895e-08, "loss": 0.76636052, "num_input_tokens_seen": 167199380, "step": 7735, "time_per_iteration": 2.7350351810455322 }, { "auxiliary_loss_clip": 0.01146027, "auxiliary_loss_mlp": 0.01028902, "balance_loss_clip": 1.04129112, "balance_loss_mlp": 1.02093267, "epoch": 0.9301990019840077, "flos": 17779588594560.0, "grad_norm": 1.8076409330002325, "language_loss": 0.71664739, "learning_rate": 5.081496320577816e-08, "loss": 0.7383967, "num_input_tokens_seen": 167216500, "step": 7736, "time_per_iteration": 2.5866611003875732 }, { "auxiliary_loss_clip": 0.01044613, "auxiliary_loss_mlp": 0.01002503, "balance_loss_clip": 1.01491237, "balance_loss_mlp": 1.00132835, "epoch": 0.9303192448746468, "flos": 58896122307840.0, "grad_norm": 0.930919676187039, "language_loss": 0.61173594, "learning_rate": 5.0640632842260835e-08, "loss": 0.6322071, "num_input_tokens_seen": 167276760, "step": 7737, "time_per_iteration": 3.3171186447143555 }, { "auxiliary_loss_clip": 0.01121042, "auxiliary_loss_mlp": 0.00761695, "balance_loss_clip": 1.04327965, "balance_loss_mlp": 1.00025773, "epoch": 0.9304394877652858, "flos": 57663522172800.0, "grad_norm": 1.4339064532722021, "language_loss": 0.72538948, "learning_rate": 5.0466598195009426e-08, "loss": 0.7442168, "num_input_tokens_seen": 167303630, "step": 7738, "time_per_iteration": 3.1268222332000732 }, { "auxiliary_loss_clip": 0.01123745, "auxiliary_loss_mlp": 0.01023508, "balance_loss_clip": 1.04050279, "balance_loss_mlp": 1.01671576, "epoch": 0.930559730655925, "flos": 20996143603200.0, "grad_norm": 2.1735328844863253, "language_loss": 0.70382792, "learning_rate": 5.0292859290425036e-08, "loss": 0.72530043, "num_input_tokens_seen": 167321500, "step": 7739, "time_per_iteration": 2.662423610687256 }, { "auxiliary_loss_clip": 0.01162052, "auxiliary_loss_mlp": 0.01019924, "balance_loss_clip": 1.0445565, "balance_loss_mlp": 1.01326358, "epoch": 0.9306799735465641, "flos": 23258264376960.0, "grad_norm": 1.8913820498741023, "language_loss": 0.77877569, "learning_rate": 5.011941615486348e-08, "loss": 0.80059552, "num_input_tokens_seen": 167340615, "step": 7740, "time_per_iteration": 2.6315221786499023 }, { "auxiliary_loss_clip": 0.0116484, "auxiliary_loss_mlp": 0.010257, "balance_loss_clip": 1.04542494, "balance_loss_mlp": 1.01879144, "epoch": 0.9308002164372031, "flos": 15231547560960.0, "grad_norm": 2.5088427097540165, "language_loss": 0.84406769, "learning_rate": 4.994626881463659e-08, "loss": 0.86597306, "num_input_tokens_seen": 167356870, "step": 7741, "time_per_iteration": 2.559896469116211 }, { "auxiliary_loss_clip": 0.01094358, "auxiliary_loss_mlp": 0.01026044, "balance_loss_clip": 1.03704906, "balance_loss_mlp": 1.01918066, "epoch": 0.9309204593278423, "flos": 30847481539200.0, "grad_norm": 1.6620141467546423, "language_loss": 0.71425784, "learning_rate": 4.9773417296009814e-08, "loss": 0.73546189, "num_input_tokens_seen": 167378390, "step": 7742, "time_per_iteration": 2.851039171218872 }, { "auxiliary_loss_clip": 0.01157139, "auxiliary_loss_mlp": 0.01027167, "balance_loss_clip": 1.04666686, "balance_loss_mlp": 1.01984191, "epoch": 0.9310407022184813, "flos": 23037269950080.0, "grad_norm": 2.668062255613747, "language_loss": 0.65280366, "learning_rate": 4.960086162520527e-08, "loss": 0.67464674, "num_input_tokens_seen": 167398480, "step": 7743, "time_per_iteration": 2.7241408824920654 }, { "auxiliary_loss_clip": 0.01114957, "auxiliary_loss_mlp": 0.01026864, "balance_loss_clip": 1.04033422, "balance_loss_mlp": 1.02000976, "epoch": 0.9311609451091204, "flos": 22127976132480.0, "grad_norm": 1.9338800004379093, "language_loss": 0.8255167, "learning_rate": 4.942860182839936e-08, "loss": 0.84693491, "num_input_tokens_seen": 167416825, "step": 7744, "time_per_iteration": 2.709965229034424 }, { "auxiliary_loss_clip": 0.01134409, "auxiliary_loss_mlp": 0.0102686, "balance_loss_clip": 1.04204452, "balance_loss_mlp": 1.01940322, "epoch": 0.9312811879997596, "flos": 21099206701440.0, "grad_norm": 1.805952358620401, "language_loss": 0.79908335, "learning_rate": 4.925663793172341e-08, "loss": 0.820696, "num_input_tokens_seen": 167434785, "step": 7745, "time_per_iteration": 2.632051467895508 }, { "auxiliary_loss_clip": 0.01036598, "auxiliary_loss_mlp": 0.00753556, "balance_loss_clip": 1.00777602, "balance_loss_mlp": 1.00021827, "epoch": 0.9314014308903986, "flos": 67148179096320.0, "grad_norm": 0.782367179020396, "language_loss": 0.56498611, "learning_rate": 4.908496996126477e-08, "loss": 0.58288765, "num_input_tokens_seen": 167498245, "step": 7746, "time_per_iteration": 3.25850510597229 }, { "auxiliary_loss_clip": 0.01151946, "auxiliary_loss_mlp": 0.01022636, "balance_loss_clip": 1.04906464, "balance_loss_mlp": 1.0152154, "epoch": 0.9315216737810377, "flos": 22565583527040.0, "grad_norm": 1.5039530029542891, "language_loss": 0.76702166, "learning_rate": 4.89135979430646e-08, "loss": 0.78876746, "num_input_tokens_seen": 167518290, "step": 7747, "time_per_iteration": 2.680576801300049 }, { "auxiliary_loss_clip": 0.01168277, "auxiliary_loss_mlp": 0.01027686, "balance_loss_clip": 1.04809153, "balance_loss_mlp": 1.02016973, "epoch": 0.9316419166716768, "flos": 23984054588160.0, "grad_norm": 1.832858628329843, "language_loss": 0.85226798, "learning_rate": 4.874252190312078e-08, "loss": 0.87422764, "num_input_tokens_seen": 167538675, "step": 7748, "time_per_iteration": 3.63077712059021 }, { "auxiliary_loss_clip": 0.01154086, "auxiliary_loss_mlp": 0.01022398, "balance_loss_clip": 1.04257059, "balance_loss_mlp": 1.01514447, "epoch": 0.9317621595623159, "flos": 30230464688640.0, "grad_norm": 1.5610527113041768, "language_loss": 0.64614213, "learning_rate": 4.857174186738477e-08, "loss": 0.667907, "num_input_tokens_seen": 167562025, "step": 7749, "time_per_iteration": 2.799370050430298 }, { "auxiliary_loss_clip": 0.01166357, "auxiliary_loss_mlp": 0.01024109, "balance_loss_clip": 1.04739797, "balance_loss_mlp": 1.01688528, "epoch": 0.931882402452955, "flos": 15742735966080.0, "grad_norm": 2.29370746712431, "language_loss": 0.73356223, "learning_rate": 4.840125786176408e-08, "loss": 0.75546694, "num_input_tokens_seen": 167578230, "step": 7750, "time_per_iteration": 3.6446855068206787 }, { "auxiliary_loss_clip": 0.01131892, "auxiliary_loss_mlp": 0.01026672, "balance_loss_clip": 1.04163146, "balance_loss_mlp": 1.01954007, "epoch": 0.932002645343594, "flos": 28366521154560.0, "grad_norm": 1.9268505268152, "language_loss": 0.77516472, "learning_rate": 4.823106991212067e-08, "loss": 0.79675037, "num_input_tokens_seen": 167597470, "step": 7751, "time_per_iteration": 2.7492470741271973 }, { "auxiliary_loss_clip": 0.01152329, "auxiliary_loss_mlp": 0.01023805, "balance_loss_clip": 1.04280186, "balance_loss_mlp": 1.01701903, "epoch": 0.9321228882342332, "flos": 15341146934400.0, "grad_norm": 1.9385269613145577, "language_loss": 0.83491218, "learning_rate": 4.806117804427212e-08, "loss": 0.85667348, "num_input_tokens_seen": 167615405, "step": 7752, "time_per_iteration": 2.6631391048431396 }, { "auxiliary_loss_clip": 0.01145487, "auxiliary_loss_mlp": 0.0102223, "balance_loss_clip": 1.04297662, "balance_loss_mlp": 1.01556325, "epoch": 0.9322431311248722, "flos": 17895365107200.0, "grad_norm": 1.9563940448572341, "language_loss": 0.64078355, "learning_rate": 4.7891582283990926e-08, "loss": 0.66246068, "num_input_tokens_seen": 167634130, "step": 7753, "time_per_iteration": 2.6204922199249268 }, { "auxiliary_loss_clip": 0.01121665, "auxiliary_loss_mlp": 0.01025445, "balance_loss_clip": 1.04030097, "balance_loss_mlp": 1.01858735, "epoch": 0.9323633740155113, "flos": 24169713010560.0, "grad_norm": 2.9241349804487857, "language_loss": 0.72781229, "learning_rate": 4.772228265700473e-08, "loss": 0.74928343, "num_input_tokens_seen": 167654990, "step": 7754, "time_per_iteration": 3.980368137359619 }, { "auxiliary_loss_clip": 0.01155326, "auxiliary_loss_mlp": 0.01026542, "balance_loss_clip": 1.0455538, "balance_loss_mlp": 1.01966703, "epoch": 0.9324836169061504, "flos": 15043482927360.0, "grad_norm": 2.177699058901137, "language_loss": 0.7613647, "learning_rate": 4.75532791889961e-08, "loss": 0.7831834, "num_input_tokens_seen": 167671690, "step": 7755, "time_per_iteration": 2.616466760635376 }, { "auxiliary_loss_clip": 0.01149224, "auxiliary_loss_mlp": 0.01023701, "balance_loss_clip": 1.04248905, "balance_loss_mlp": 1.01664972, "epoch": 0.9326038597967895, "flos": 18624890332800.0, "grad_norm": 1.7801060315291013, "language_loss": 0.65611911, "learning_rate": 4.738457190560252e-08, "loss": 0.67784846, "num_input_tokens_seen": 167690800, "step": 7756, "time_per_iteration": 3.533395767211914 }, { "auxiliary_loss_clip": 0.01106293, "auxiliary_loss_mlp": 0.01023202, "balance_loss_clip": 1.04133403, "balance_loss_mlp": 1.01617432, "epoch": 0.9327241026874286, "flos": 18952646958720.0, "grad_norm": 2.2219135936853975, "language_loss": 0.78761661, "learning_rate": 4.721616083241664e-08, "loss": 0.80891156, "num_input_tokens_seen": 167709055, "step": 7757, "time_per_iteration": 2.7389109134674072 }, { "auxiliary_loss_clip": 0.01147946, "auxiliary_loss_mlp": 0.01025436, "balance_loss_clip": 1.04443157, "balance_loss_mlp": 1.01819992, "epoch": 0.9328443455780677, "flos": 29570282668800.0, "grad_norm": 1.6897930209431782, "language_loss": 0.77603871, "learning_rate": 4.7048045994986684e-08, "loss": 0.79777253, "num_input_tokens_seen": 167729915, "step": 7758, "time_per_iteration": 2.6746304035186768 }, { "auxiliary_loss_clip": 0.01156848, "auxiliary_loss_mlp": 0.01027842, "balance_loss_clip": 1.04576528, "balance_loss_mlp": 1.01964641, "epoch": 0.9329645884687068, "flos": 30081722469120.0, "grad_norm": 2.090903063286058, "language_loss": 0.91058838, "learning_rate": 4.688022741881559e-08, "loss": 0.93243527, "num_input_tokens_seen": 167750440, "step": 7759, "time_per_iteration": 2.738821268081665 }, { "auxiliary_loss_clip": 0.01148507, "auxiliary_loss_mlp": 0.01022402, "balance_loss_clip": 1.04475665, "balance_loss_mlp": 1.01589608, "epoch": 0.9330848313593458, "flos": 21867982513920.0, "grad_norm": 1.809752912809867, "language_loss": 0.74939048, "learning_rate": 4.671270512936076e-08, "loss": 0.77109957, "num_input_tokens_seen": 167769600, "step": 7760, "time_per_iteration": 2.6239430904388428 }, { "auxiliary_loss_clip": 0.01116458, "auxiliary_loss_mlp": 0.01022059, "balance_loss_clip": 1.04004884, "balance_loss_mlp": 1.01517129, "epoch": 0.933205074249985, "flos": 22127221946880.0, "grad_norm": 1.6750784991334133, "language_loss": 0.82885724, "learning_rate": 4.6545479152035884e-08, "loss": 0.85024238, "num_input_tokens_seen": 167788770, "step": 7761, "time_per_iteration": 2.6968612670898438 }, { "auxiliary_loss_clip": 0.01152769, "auxiliary_loss_mlp": 0.01024741, "balance_loss_clip": 1.04517341, "balance_loss_mlp": 1.01770437, "epoch": 0.9333253171406241, "flos": 15341254675200.0, "grad_norm": 1.9334692979975494, "language_loss": 0.76373827, "learning_rate": 4.637854951220821e-08, "loss": 0.78551334, "num_input_tokens_seen": 167805555, "step": 7762, "time_per_iteration": 2.561016082763672 }, { "auxiliary_loss_clip": 0.01115818, "auxiliary_loss_mlp": 0.01028107, "balance_loss_clip": 1.03917027, "balance_loss_mlp": 1.02116895, "epoch": 0.9334455600312631, "flos": 15706142985600.0, "grad_norm": 5.452903051654911, "language_loss": 0.74843049, "learning_rate": 4.621191623520171e-08, "loss": 0.76986969, "num_input_tokens_seen": 167823985, "step": 7763, "time_per_iteration": 2.687645673751831 }, { "auxiliary_loss_clip": 0.01111234, "auxiliary_loss_mlp": 0.01021032, "balance_loss_clip": 1.04166794, "balance_loss_mlp": 1.01320601, "epoch": 0.9335658029219023, "flos": 22163563532160.0, "grad_norm": 2.8128340152730726, "language_loss": 0.84530079, "learning_rate": 4.604557934629372e-08, "loss": 0.8666234, "num_input_tokens_seen": 167843060, "step": 7764, "time_per_iteration": 2.7539713382720947 }, { "auxiliary_loss_clip": 0.01132466, "auxiliary_loss_mlp": 0.01022366, "balance_loss_clip": 1.04297495, "balance_loss_mlp": 1.01496315, "epoch": 0.9336860458125413, "flos": 20266833859200.0, "grad_norm": 1.7419667906815142, "language_loss": 0.80155754, "learning_rate": 4.587953887071805e-08, "loss": 0.82310593, "num_input_tokens_seen": 167862880, "step": 7765, "time_per_iteration": 2.6667873859405518 }, { "auxiliary_loss_clip": 0.0113126, "auxiliary_loss_mlp": 0.01026706, "balance_loss_clip": 1.03864336, "balance_loss_mlp": 1.01961899, "epoch": 0.9338062887031804, "flos": 20919689504640.0, "grad_norm": 1.8288559426579287, "language_loss": 0.85846633, "learning_rate": 4.5713794833662554e-08, "loss": 0.88004601, "num_input_tokens_seen": 167882095, "step": 7766, "time_per_iteration": 2.6537554264068604 }, { "auxiliary_loss_clip": 0.0116826, "auxiliary_loss_mlp": 0.01030062, "balance_loss_clip": 1.04691732, "balance_loss_mlp": 1.02254009, "epoch": 0.9339265315938196, "flos": 23221635482880.0, "grad_norm": 2.2121211469262976, "language_loss": 0.63188708, "learning_rate": 4.5548347260270236e-08, "loss": 0.65387028, "num_input_tokens_seen": 167901385, "step": 7767, "time_per_iteration": 2.6033313274383545 }, { "auxiliary_loss_clip": 0.01118723, "auxiliary_loss_mlp": 0.01022785, "balance_loss_clip": 1.04015851, "balance_loss_mlp": 1.01591289, "epoch": 0.9340467744844586, "flos": 22820261932800.0, "grad_norm": 7.4775368046058865, "language_loss": 0.69379044, "learning_rate": 4.538319617564012e-08, "loss": 0.71520555, "num_input_tokens_seen": 167920405, "step": 7768, "time_per_iteration": 2.7278037071228027 }, { "auxiliary_loss_clip": 0.01135328, "auxiliary_loss_mlp": 0.01028303, "balance_loss_clip": 1.04071164, "balance_loss_mlp": 1.02065587, "epoch": 0.9341670173750977, "flos": 23660428026240.0, "grad_norm": 1.9981844610082031, "language_loss": 0.7448408, "learning_rate": 4.521834160482485e-08, "loss": 0.76647711, "num_input_tokens_seen": 167939145, "step": 7769, "time_per_iteration": 2.6390607357025146 }, { "auxiliary_loss_clip": 0.01155716, "auxiliary_loss_mlp": 0.01028053, "balance_loss_clip": 1.04551351, "balance_loss_mlp": 1.02087629, "epoch": 0.9342872602657368, "flos": 24824256595200.0, "grad_norm": 1.533330388009997, "language_loss": 0.82041311, "learning_rate": 4.5053783572832846e-08, "loss": 0.84225082, "num_input_tokens_seen": 167959325, "step": 7770, "time_per_iteration": 2.7753350734710693 }, { "auxiliary_loss_clip": 0.01152004, "auxiliary_loss_mlp": 0.01022096, "balance_loss_clip": 1.04470313, "balance_loss_mlp": 1.01491332, "epoch": 0.9344075031563759, "flos": 25771831332480.0, "grad_norm": 2.500735363065873, "language_loss": 0.76397711, "learning_rate": 4.488952210462771e-08, "loss": 0.78571808, "num_input_tokens_seen": 167979530, "step": 7771, "time_per_iteration": 2.6625311374664307 }, { "auxiliary_loss_clip": 0.0116486, "auxiliary_loss_mlp": 0.01022368, "balance_loss_clip": 1.04604387, "balance_loss_mlp": 1.01539159, "epoch": 0.9345277460470149, "flos": 25551303782400.0, "grad_norm": 1.8983991586277111, "language_loss": 0.85561496, "learning_rate": 4.4725557225127495e-08, "loss": 0.87748724, "num_input_tokens_seen": 167997870, "step": 7772, "time_per_iteration": 2.695010185241699 }, { "auxiliary_loss_clip": 0.01151221, "auxiliary_loss_mlp": 0.01023607, "balance_loss_clip": 1.04290676, "balance_loss_mlp": 1.01694608, "epoch": 0.9346479889376541, "flos": 34313112432000.0, "grad_norm": 1.4965285580524506, "language_loss": 0.79083616, "learning_rate": 4.456188895920565e-08, "loss": 0.8125844, "num_input_tokens_seen": 168019625, "step": 7773, "time_per_iteration": 2.7694411277770996 }, { "auxiliary_loss_clip": 0.01166064, "auxiliary_loss_mlp": 0.01023267, "balance_loss_clip": 1.04618096, "balance_loss_mlp": 1.01612091, "epoch": 0.9347682318282932, "flos": 19093739581440.0, "grad_norm": 2.8396476457440483, "language_loss": 0.85374111, "learning_rate": 4.439851733169031e-08, "loss": 0.87563443, "num_input_tokens_seen": 168037415, "step": 7774, "time_per_iteration": 3.5083396434783936 }, { "auxiliary_loss_clip": 0.01121783, "auxiliary_loss_mlp": 0.01024472, "balance_loss_clip": 1.04093623, "balance_loss_mlp": 1.01775742, "epoch": 0.9348884747189322, "flos": 26249587153920.0, "grad_norm": 2.364509390695934, "language_loss": 0.69065124, "learning_rate": 4.4235442367365204e-08, "loss": 0.71211374, "num_input_tokens_seen": 168057725, "step": 7775, "time_per_iteration": 2.759178400039673 }, { "auxiliary_loss_clip": 0.01132988, "auxiliary_loss_mlp": 0.01024454, "balance_loss_clip": 1.03937697, "balance_loss_mlp": 1.01741219, "epoch": 0.9350087176095714, "flos": 18333080242560.0, "grad_norm": 2.2053460004327596, "language_loss": 0.79253012, "learning_rate": 4.4072664090968545e-08, "loss": 0.81410456, "num_input_tokens_seen": 168076110, "step": 7776, "time_per_iteration": 2.6150264739990234 }, { "auxiliary_loss_clip": 0.01135152, "auxiliary_loss_mlp": 0.01024349, "balance_loss_clip": 1.0391717, "balance_loss_mlp": 1.01712787, "epoch": 0.9351289605002104, "flos": 19318253541120.0, "grad_norm": 1.8309431712071516, "language_loss": 0.85097766, "learning_rate": 4.391018252719347e-08, "loss": 0.8725726, "num_input_tokens_seen": 168095905, "step": 7777, "time_per_iteration": 3.6331489086151123 }, { "auxiliary_loss_clip": 0.01138516, "auxiliary_loss_mlp": 0.01027324, "balance_loss_clip": 1.04096234, "balance_loss_mlp": 1.02017117, "epoch": 0.9352492033908495, "flos": 18799990156800.0, "grad_norm": 1.7702820366395635, "language_loss": 0.69219184, "learning_rate": 4.374799770068849e-08, "loss": 0.71385032, "num_input_tokens_seen": 168112580, "step": 7778, "time_per_iteration": 2.6266047954559326 }, { "auxiliary_loss_clip": 0.01149859, "auxiliary_loss_mlp": 0.0102741, "balance_loss_clip": 1.04532099, "balance_loss_mlp": 1.02028692, "epoch": 0.9353694462814887, "flos": 29530134241920.0, "grad_norm": 1.9318164640826072, "language_loss": 0.74492389, "learning_rate": 4.358610963605658e-08, "loss": 0.76669657, "num_input_tokens_seen": 168133030, "step": 7779, "time_per_iteration": 2.7109174728393555 }, { "auxiliary_loss_clip": 0.01169552, "auxiliary_loss_mlp": 0.0103064, "balance_loss_clip": 1.04768348, "balance_loss_mlp": 1.02355313, "epoch": 0.9354896891721277, "flos": 30665450390400.0, "grad_norm": 2.714088119774056, "language_loss": 0.68317294, "learning_rate": 4.342451835785677e-08, "loss": 0.7051748, "num_input_tokens_seen": 168153940, "step": 7780, "time_per_iteration": 3.647111177444458 }, { "auxiliary_loss_clip": 0.01133293, "auxiliary_loss_mlp": 0.01025748, "balance_loss_clip": 1.04078865, "balance_loss_mlp": 1.0191735, "epoch": 0.9356099320627668, "flos": 19463907191040.0, "grad_norm": 1.6910760774660425, "language_loss": 0.75248623, "learning_rate": 4.3263223890601665e-08, "loss": 0.77407658, "num_input_tokens_seen": 168172650, "step": 7781, "time_per_iteration": 2.7274491786956787 }, { "auxiliary_loss_clip": 0.01148989, "auxiliary_loss_mlp": 0.00761866, "balance_loss_clip": 1.04651976, "balance_loss_mlp": 1.00029325, "epoch": 0.9357301749534058, "flos": 19098156954240.0, "grad_norm": 1.741962188276256, "language_loss": 0.79114497, "learning_rate": 4.31022262587597e-08, "loss": 0.8102535, "num_input_tokens_seen": 168191325, "step": 7782, "time_per_iteration": 3.582162857055664 }, { "auxiliary_loss_clip": 0.01152345, "auxiliary_loss_mlp": 0.01030366, "balance_loss_clip": 1.04665971, "balance_loss_mlp": 1.02256417, "epoch": 0.935850417844045, "flos": 23550361776000.0, "grad_norm": 1.633882386229903, "language_loss": 0.65966439, "learning_rate": 4.2941525486754225e-08, "loss": 0.68149155, "num_input_tokens_seen": 168211645, "step": 7783, "time_per_iteration": 2.608125925064087 }, { "auxiliary_loss_clip": 0.01119233, "auxiliary_loss_mlp": 0.01025027, "balance_loss_clip": 1.04155445, "balance_loss_mlp": 1.01852417, "epoch": 0.935970660734684, "flos": 18588333265920.0, "grad_norm": 1.9019528100488723, "language_loss": 0.79561734, "learning_rate": 4.278112159896286e-08, "loss": 0.81705999, "num_input_tokens_seen": 168229485, "step": 7784, "time_per_iteration": 2.7679340839385986 }, { "auxiliary_loss_clip": 0.01129382, "auxiliary_loss_mlp": 0.01026507, "balance_loss_clip": 1.03858399, "balance_loss_mlp": 1.01934838, "epoch": 0.9360909036253231, "flos": 20631255292800.0, "grad_norm": 1.7431953111983045, "language_loss": 0.67480201, "learning_rate": 4.2621014619719896e-08, "loss": 0.69636083, "num_input_tokens_seen": 168247250, "step": 7785, "time_per_iteration": 2.64290714263916 }, { "auxiliary_loss_clip": 0.01040279, "auxiliary_loss_mlp": 0.01000824, "balance_loss_clip": 1.00775337, "balance_loss_mlp": 0.99976295, "epoch": 0.9362111465159623, "flos": 61791421052160.0, "grad_norm": 0.7195337756704083, "language_loss": 0.58561528, "learning_rate": 4.246120457331215e-08, "loss": 0.60602629, "num_input_tokens_seen": 168309425, "step": 7786, "time_per_iteration": 3.2914459705352783 }, { "auxiliary_loss_clip": 0.01130456, "auxiliary_loss_mlp": 0.01028864, "balance_loss_clip": 1.04350138, "balance_loss_mlp": 1.02170515, "epoch": 0.9363313894066013, "flos": 24170395368960.0, "grad_norm": 1.8530347124504223, "language_loss": 0.71831381, "learning_rate": 4.2301691483983325e-08, "loss": 0.73990703, "num_input_tokens_seen": 168329545, "step": 7787, "time_per_iteration": 2.690408706665039 }, { "auxiliary_loss_clip": 0.01154029, "auxiliary_loss_mlp": 0.01022588, "balance_loss_clip": 1.04498672, "balance_loss_mlp": 1.01565552, "epoch": 0.9364516322972404, "flos": 20120354196480.0, "grad_norm": 2.4423990863173, "language_loss": 0.7590729, "learning_rate": 4.214247537593163e-08, "loss": 0.78083909, "num_input_tokens_seen": 168348795, "step": 7788, "time_per_iteration": 2.5867996215820312 }, { "auxiliary_loss_clip": 0.01137444, "auxiliary_loss_mlp": 0.01027126, "balance_loss_clip": 1.04124641, "balance_loss_mlp": 1.0192765, "epoch": 0.9365718751878795, "flos": 20703758895360.0, "grad_norm": 2.0202133381820166, "language_loss": 0.80476916, "learning_rate": 4.1983556273309293e-08, "loss": 0.82641482, "num_input_tokens_seen": 168367545, "step": 7789, "time_per_iteration": 2.690735340118408 }, { "auxiliary_loss_clip": 0.01166917, "auxiliary_loss_mlp": 0.01024035, "balance_loss_clip": 1.04546022, "balance_loss_mlp": 1.01683474, "epoch": 0.9366921180785186, "flos": 18655270260480.0, "grad_norm": 4.870883529232793, "language_loss": 0.69127661, "learning_rate": 4.182493420022526e-08, "loss": 0.71318614, "num_input_tokens_seen": 168383215, "step": 7790, "time_per_iteration": 2.5158374309539795 }, { "auxiliary_loss_clip": 0.01123037, "auxiliary_loss_mlp": 0.01020668, "balance_loss_clip": 1.04070163, "balance_loss_mlp": 1.01406384, "epoch": 0.9368123609691577, "flos": 25774955815680.0, "grad_norm": 1.6624163416382007, "language_loss": 0.78532231, "learning_rate": 4.166660918074139e-08, "loss": 0.80675936, "num_input_tokens_seen": 168403120, "step": 7791, "time_per_iteration": 2.736804723739624 }, { "auxiliary_loss_clip": 0.01117817, "auxiliary_loss_mlp": 0.01023826, "balance_loss_clip": 1.03934169, "balance_loss_mlp": 1.01693583, "epoch": 0.9369326038597968, "flos": 25553386771200.0, "grad_norm": 1.5186264075625067, "language_loss": 0.73554409, "learning_rate": 4.15085812388758e-08, "loss": 0.75696045, "num_input_tokens_seen": 168425340, "step": 7792, "time_per_iteration": 2.78934645652771 }, { "auxiliary_loss_clip": 0.01136756, "auxiliary_loss_mlp": 0.0102366, "balance_loss_clip": 1.04402173, "balance_loss_mlp": 1.01626945, "epoch": 0.9370528467504359, "flos": 23220019370880.0, "grad_norm": 1.7701714076730637, "language_loss": 0.78361326, "learning_rate": 4.135085039860153e-08, "loss": 0.80521739, "num_input_tokens_seen": 168444740, "step": 7793, "time_per_iteration": 2.685053586959839 }, { "auxiliary_loss_clip": 0.01134728, "auxiliary_loss_mlp": 0.01023936, "balance_loss_clip": 1.04399765, "balance_loss_mlp": 1.01699209, "epoch": 0.9371730896410749, "flos": 24967468120320.0, "grad_norm": 20.48790429867453, "language_loss": 0.78621477, "learning_rate": 4.1193416683845906e-08, "loss": 0.80780143, "num_input_tokens_seen": 168463670, "step": 7794, "time_per_iteration": 2.615208864212036 }, { "auxiliary_loss_clip": 0.01125767, "auxiliary_loss_mlp": 0.01025731, "balance_loss_clip": 1.04191279, "balance_loss_mlp": 1.01907027, "epoch": 0.9372933325317141, "flos": 15553091134080.0, "grad_norm": 2.446661428820972, "language_loss": 0.83747149, "learning_rate": 4.103628011849136e-08, "loss": 0.85898644, "num_input_tokens_seen": 168479030, "step": 7795, "time_per_iteration": 2.6791586875915527 }, { "auxiliary_loss_clip": 0.01139967, "auxiliary_loss_mlp": 0.01025379, "balance_loss_clip": 1.04453039, "balance_loss_mlp": 1.01789284, "epoch": 0.9374135754223532, "flos": 21871861182720.0, "grad_norm": 1.771053699392945, "language_loss": 0.75902802, "learning_rate": 4.0879440726375506e-08, "loss": 0.78068143, "num_input_tokens_seen": 168496815, "step": 7796, "time_per_iteration": 2.6533925533294678 }, { "auxiliary_loss_clip": 0.01133583, "auxiliary_loss_mlp": 0.01026272, "balance_loss_clip": 1.03917289, "balance_loss_mlp": 1.01938152, "epoch": 0.9375338183129922, "flos": 22631048064000.0, "grad_norm": 2.6667170106470235, "language_loss": 0.5634867, "learning_rate": 4.0722898531291074e-08, "loss": 0.58508527, "num_input_tokens_seen": 168514055, "step": 7797, "time_per_iteration": 2.6774649620056152 }, { "auxiliary_loss_clip": 0.01143065, "auxiliary_loss_mlp": 0.01026556, "balance_loss_clip": 1.04404187, "balance_loss_mlp": 1.01937985, "epoch": 0.9376540612036314, "flos": 26104292640000.0, "grad_norm": 1.9259575952504808, "language_loss": 0.76942801, "learning_rate": 4.0566653556985295e-08, "loss": 0.79112422, "num_input_tokens_seen": 168534600, "step": 7798, "time_per_iteration": 2.7116200923919678 }, { "auxiliary_loss_clip": 0.01084322, "auxiliary_loss_mlp": 0.01024093, "balance_loss_clip": 1.03758717, "balance_loss_mlp": 1.0161891, "epoch": 0.9377743040942704, "flos": 19717580016000.0, "grad_norm": 2.561184629203548, "language_loss": 0.81600177, "learning_rate": 4.0410705827159886e-08, "loss": 0.83708596, "num_input_tokens_seen": 168551895, "step": 7799, "time_per_iteration": 2.8884031772613525 }, { "auxiliary_loss_clip": 0.01131485, "auxiliary_loss_mlp": 0.01023422, "balance_loss_clip": 1.03810787, "balance_loss_mlp": 1.01643658, "epoch": 0.9378945469849095, "flos": 15267530010240.0, "grad_norm": 2.2688016485441915, "language_loss": 0.70970428, "learning_rate": 4.0255055365472356e-08, "loss": 0.73125327, "num_input_tokens_seen": 168569990, "step": 7800, "time_per_iteration": 3.8120007514953613 }, { "auxiliary_loss_clip": 0.01090724, "auxiliary_loss_mlp": 0.0102946, "balance_loss_clip": 1.03390908, "balance_loss_mlp": 1.0221343, "epoch": 0.9380147898755486, "flos": 20591394174720.0, "grad_norm": 2.271287666719792, "language_loss": 0.74925536, "learning_rate": 4.009970219553471e-08, "loss": 0.77045721, "num_input_tokens_seen": 168586940, "step": 7801, "time_per_iteration": 2.7986373901367188 }, { "auxiliary_loss_clip": 0.01156314, "auxiliary_loss_mlp": 0.01025245, "balance_loss_clip": 1.0458318, "balance_loss_mlp": 1.01774073, "epoch": 0.9381350327661877, "flos": 26281116316800.0, "grad_norm": 2.5765481757130564, "language_loss": 0.76237684, "learning_rate": 3.99446463409141e-08, "loss": 0.78419244, "num_input_tokens_seen": 168604795, "step": 7802, "time_per_iteration": 2.742312431335449 }, { "auxiliary_loss_clip": 0.01154061, "auxiliary_loss_mlp": 0.01024369, "balance_loss_clip": 1.04154646, "balance_loss_mlp": 1.01740384, "epoch": 0.9382552756568268, "flos": 23586344225280.0, "grad_norm": 3.142901626774641, "language_loss": 0.68963683, "learning_rate": 3.978988782513215e-08, "loss": 0.71142113, "num_input_tokens_seen": 168622290, "step": 7803, "time_per_iteration": 3.6902055740356445 }, { "auxiliary_loss_clip": 0.01157093, "auxiliary_loss_mlp": 0.01028194, "balance_loss_clip": 1.04598856, "balance_loss_mlp": 1.02119088, "epoch": 0.9383755185474659, "flos": 28438809275520.0, "grad_norm": 1.7402425655515612, "language_loss": 0.76083016, "learning_rate": 3.963542667166586e-08, "loss": 0.78268301, "num_input_tokens_seen": 168642395, "step": 7804, "time_per_iteration": 2.6923837661743164 }, { "auxiliary_loss_clip": 0.01124488, "auxiliary_loss_mlp": 0.01028152, "balance_loss_clip": 1.0446806, "balance_loss_mlp": 1.02119923, "epoch": 0.938495761438105, "flos": 20449583280000.0, "grad_norm": 1.967341395502087, "language_loss": 0.68190867, "learning_rate": 3.9481262903946486e-08, "loss": 0.70343506, "num_input_tokens_seen": 168661840, "step": 7805, "time_per_iteration": 3.65708589553833 }, { "auxiliary_loss_clip": 0.01026833, "auxiliary_loss_mlp": 0.01003319, "balance_loss_clip": 1.00980484, "balance_loss_mlp": 1.00233006, "epoch": 0.938616004328744, "flos": 69302711658240.0, "grad_norm": 0.7714951263161394, "language_loss": 0.54477876, "learning_rate": 3.932739654536066e-08, "loss": 0.56508029, "num_input_tokens_seen": 168724540, "step": 7806, "time_per_iteration": 3.383449077606201 }, { "auxiliary_loss_clip": 0.01151863, "auxiliary_loss_mlp": 0.01023167, "balance_loss_clip": 1.04569507, "balance_loss_mlp": 1.0166018, "epoch": 0.9387362472193832, "flos": 18911636605440.0, "grad_norm": 2.400259369260182, "language_loss": 0.74690765, "learning_rate": 3.917382761925014e-08, "loss": 0.76865798, "num_input_tokens_seen": 168740375, "step": 7807, "time_per_iteration": 2.860156774520874 }, { "auxiliary_loss_clip": 0.01148057, "auxiliary_loss_mlp": 0.01027253, "balance_loss_clip": 1.0432663, "balance_loss_mlp": 1.02004981, "epoch": 0.9388564901100223, "flos": 26501967089280.0, "grad_norm": 1.8035089473895256, "language_loss": 0.79574811, "learning_rate": 3.9020556148910754e-08, "loss": 0.81750119, "num_input_tokens_seen": 168759730, "step": 7808, "time_per_iteration": 3.625833511352539 }, { "auxiliary_loss_clip": 0.01045755, "auxiliary_loss_mlp": 0.01000829, "balance_loss_clip": 1.00971889, "balance_loss_mlp": 0.99978602, "epoch": 0.9389767330006613, "flos": 58941083157120.0, "grad_norm": 0.7108238846986127, "language_loss": 0.56660968, "learning_rate": 3.8867582157593895e-08, "loss": 0.58707553, "num_input_tokens_seen": 168813935, "step": 7809, "time_per_iteration": 3.0906152725219727 }, { "auxiliary_loss_clip": 0.01149629, "auxiliary_loss_mlp": 0.01020577, "balance_loss_clip": 1.04541779, "balance_loss_mlp": 1.01384795, "epoch": 0.9390969758913005, "flos": 31102554994560.0, "grad_norm": 1.7404458595231282, "language_loss": 0.76556194, "learning_rate": 3.871490566850544e-08, "loss": 0.78726399, "num_input_tokens_seen": 168838145, "step": 7810, "time_per_iteration": 2.656121253967285 }, { "auxiliary_loss_clip": 0.01130824, "auxiliary_loss_mlp": 0.01023461, "balance_loss_clip": 1.04127574, "balance_loss_mlp": 1.01631474, "epoch": 0.9392172187819395, "flos": 22419391173120.0, "grad_norm": 2.442010802219366, "language_loss": 0.70644844, "learning_rate": 3.856252670480642e-08, "loss": 0.72799128, "num_input_tokens_seen": 168856805, "step": 7811, "time_per_iteration": 2.6810100078582764 }, { "auxiliary_loss_clip": 0.01133396, "auxiliary_loss_mlp": 0.0102619, "balance_loss_clip": 1.03973544, "balance_loss_mlp": 1.01943326, "epoch": 0.9393374616725786, "flos": 19719483436800.0, "grad_norm": 1.8189852017729227, "language_loss": 0.8147217, "learning_rate": 3.841044528961279e-08, "loss": 0.83631754, "num_input_tokens_seen": 168874600, "step": 7812, "time_per_iteration": 2.6546671390533447 }, { "auxiliary_loss_clip": 0.01165001, "auxiliary_loss_mlp": 0.01021159, "balance_loss_clip": 1.0434916, "balance_loss_mlp": 1.01389611, "epoch": 0.9394577045632178, "flos": 24170215800960.0, "grad_norm": 1.7828374881191116, "language_loss": 0.7874496, "learning_rate": 3.825866144599477e-08, "loss": 0.80931127, "num_input_tokens_seen": 168893655, "step": 7813, "time_per_iteration": 2.726494789123535 }, { "auxiliary_loss_clip": 0.0113462, "auxiliary_loss_mlp": 0.01024893, "balance_loss_clip": 1.04017222, "balance_loss_mlp": 1.01784742, "epoch": 0.9395779474538568, "flos": 19023929498880.0, "grad_norm": 2.0483207388230444, "language_loss": 0.75470519, "learning_rate": 3.8107175196978145e-08, "loss": 0.77630031, "num_input_tokens_seen": 168909960, "step": 7814, "time_per_iteration": 2.709256410598755 }, { "auxiliary_loss_clip": 0.01120402, "auxiliary_loss_mlp": 0.01022511, "balance_loss_clip": 1.04256237, "balance_loss_mlp": 1.01590109, "epoch": 0.9396981903444959, "flos": 14319129260160.0, "grad_norm": 1.869081885256342, "language_loss": 0.76781714, "learning_rate": 3.7955986565542996e-08, "loss": 0.78924626, "num_input_tokens_seen": 168928040, "step": 7815, "time_per_iteration": 2.6663522720336914 }, { "auxiliary_loss_clip": 0.01120164, "auxiliary_loss_mlp": 0.01023402, "balance_loss_clip": 1.03835082, "balance_loss_mlp": 1.01639283, "epoch": 0.9398184332351349, "flos": 34787564202240.0, "grad_norm": 2.077646123830158, "language_loss": 0.68294501, "learning_rate": 3.780509557462497e-08, "loss": 0.70438075, "num_input_tokens_seen": 168948240, "step": 7816, "time_per_iteration": 2.771965503692627 }, { "auxiliary_loss_clip": 0.01135611, "auxiliary_loss_mlp": 0.01022982, "balance_loss_clip": 1.04116869, "balance_loss_mlp": 1.01520324, "epoch": 0.9399386761257741, "flos": 25372253462400.0, "grad_norm": 1.612556315156751, "language_loss": 0.75450557, "learning_rate": 3.765450224711375e-08, "loss": 0.77609152, "num_input_tokens_seen": 168968745, "step": 7817, "time_per_iteration": 2.6917316913604736 }, { "auxiliary_loss_clip": 0.01131812, "auxiliary_loss_mlp": 0.01022213, "balance_loss_clip": 1.04269266, "balance_loss_mlp": 1.01534677, "epoch": 0.9400589190164131, "flos": 27304965584640.0, "grad_norm": 1.8364389864468247, "language_loss": 0.80174243, "learning_rate": 3.750420660585396e-08, "loss": 0.82328266, "num_input_tokens_seen": 168990685, "step": 7818, "time_per_iteration": 2.724085569381714 }, { "auxiliary_loss_clip": 0.01163054, "auxiliary_loss_mlp": 0.010272, "balance_loss_clip": 1.0448823, "balance_loss_mlp": 1.02010739, "epoch": 0.9401791619070522, "flos": 23399859790080.0, "grad_norm": 1.8569306649151234, "language_loss": 0.79631847, "learning_rate": 3.735420867364603e-08, "loss": 0.81822103, "num_input_tokens_seen": 169011665, "step": 7819, "time_per_iteration": 2.619718551635742 }, { "auxiliary_loss_clip": 0.01087713, "auxiliary_loss_mlp": 0.0102155, "balance_loss_clip": 1.03609824, "balance_loss_mlp": 1.01499057, "epoch": 0.9402994047976914, "flos": 35881403120640.0, "grad_norm": 1.7316062657255817, "language_loss": 0.61794841, "learning_rate": 3.7204508473244186e-08, "loss": 0.63904107, "num_input_tokens_seen": 169035290, "step": 7820, "time_per_iteration": 2.9421424865722656 }, { "auxiliary_loss_clip": 0.01078699, "auxiliary_loss_mlp": 0.01018704, "balance_loss_clip": 1.03927076, "balance_loss_mlp": 1.01212907, "epoch": 0.9404196476883304, "flos": 22236821320320.0, "grad_norm": 1.8232975286026603, "language_loss": 0.69584012, "learning_rate": 3.7055106027357395e-08, "loss": 0.7168141, "num_input_tokens_seen": 169055155, "step": 7821, "time_per_iteration": 2.7583436965942383 }, { "auxiliary_loss_clip": 0.0114537, "auxiliary_loss_mlp": 0.01024259, "balance_loss_clip": 1.0417819, "balance_loss_mlp": 1.01749992, "epoch": 0.9405398905789695, "flos": 18915802583040.0, "grad_norm": 2.3155985717272904, "language_loss": 0.71862769, "learning_rate": 3.690600135865063e-08, "loss": 0.74032402, "num_input_tokens_seen": 169072080, "step": 7822, "time_per_iteration": 2.631145715713501 }, { "auxiliary_loss_clip": 0.01022954, "auxiliary_loss_mlp": 0.01001952, "balance_loss_clip": 1.00952864, "balance_loss_mlp": 1.00095069, "epoch": 0.9406601334696086, "flos": 70274130048000.0, "grad_norm": 0.8215150943984585, "language_loss": 0.58096558, "learning_rate": 3.675719448974246e-08, "loss": 0.60121465, "num_input_tokens_seen": 169137170, "step": 7823, "time_per_iteration": 3.3526577949523926 }, { "auxiliary_loss_clip": 0.0110268, "auxiliary_loss_mlp": 0.00761865, "balance_loss_clip": 1.03864288, "balance_loss_mlp": 1.00026739, "epoch": 0.9407803763602477, "flos": 22165071903360.0, "grad_norm": 1.77364851509595, "language_loss": 0.59678477, "learning_rate": 3.6608685443207054e-08, "loss": 0.61543024, "num_input_tokens_seen": 169156320, "step": 7824, "time_per_iteration": 2.8513879776000977 }, { "auxiliary_loss_clip": 0.01122537, "auxiliary_loss_mlp": 0.01025614, "balance_loss_clip": 1.04032099, "balance_loss_mlp": 1.01895654, "epoch": 0.9409006192508867, "flos": 18879496911360.0, "grad_norm": 2.4005595114706284, "language_loss": 0.66831344, "learning_rate": 3.646047424157306e-08, "loss": 0.6897949, "num_input_tokens_seen": 169173295, "step": 7825, "time_per_iteration": 2.7306032180786133 }, { "auxiliary_loss_clip": 0.01137653, "auxiliary_loss_mlp": 0.01025671, "balance_loss_clip": 1.04222393, "balance_loss_mlp": 1.01790428, "epoch": 0.9410208621415259, "flos": 23368258800000.0, "grad_norm": 2.7889298689697233, "language_loss": 0.68924773, "learning_rate": 3.631256090732382e-08, "loss": 0.71088099, "num_input_tokens_seen": 169193755, "step": 7826, "time_per_iteration": 3.498997926712036 }, { "auxiliary_loss_clip": 0.01122911, "auxiliary_loss_mlp": 0.01023781, "balance_loss_clip": 1.04238105, "balance_loss_mlp": 1.01730776, "epoch": 0.941141105032165, "flos": 22742227635840.0, "grad_norm": 3.09677341447509, "language_loss": 0.82878339, "learning_rate": 3.6164945462897833e-08, "loss": 0.85025036, "num_input_tokens_seen": 169213045, "step": 7827, "time_per_iteration": 2.622340202331543 }, { "auxiliary_loss_clip": 0.01148543, "auxiliary_loss_mlp": 0.00761518, "balance_loss_clip": 1.04440129, "balance_loss_mlp": 1.00027084, "epoch": 0.941261347922804, "flos": 20704908130560.0, "grad_norm": 8.244878995842363, "language_loss": 0.7593354, "learning_rate": 3.6017627930687856e-08, "loss": 0.77843601, "num_input_tokens_seen": 169232870, "step": 7828, "time_per_iteration": 2.5137951374053955 }, { "auxiliary_loss_clip": 0.01101382, "auxiliary_loss_mlp": 0.01022187, "balance_loss_clip": 1.03564835, "balance_loss_mlp": 1.01470613, "epoch": 0.9413815908134432, "flos": 19421998997760.0, "grad_norm": 2.0917621894769067, "language_loss": 0.77102, "learning_rate": 3.587060833304267e-08, "loss": 0.7922557, "num_input_tokens_seen": 169251060, "step": 7829, "time_per_iteration": 3.5460567474365234 }, { "auxiliary_loss_clip": 0.01154413, "auxiliary_loss_mlp": 0.01024684, "balance_loss_clip": 1.04535949, "balance_loss_mlp": 1.01772511, "epoch": 0.9415018337040822, "flos": 17493452853120.0, "grad_norm": 1.9196464412862122, "language_loss": 0.64030641, "learning_rate": 3.5723886692264225e-08, "loss": 0.66209739, "num_input_tokens_seen": 169268600, "step": 7830, "time_per_iteration": 2.5743672847747803 }, { "auxiliary_loss_clip": 0.01133686, "auxiliary_loss_mlp": 0.01022951, "balance_loss_clip": 1.03984666, "balance_loss_mlp": 1.01594198, "epoch": 0.9416220765947213, "flos": 31831613343360.0, "grad_norm": 1.9359667889517151, "language_loss": 0.62073886, "learning_rate": 3.557746303061071e-08, "loss": 0.64230525, "num_input_tokens_seen": 169290355, "step": 7831, "time_per_iteration": 3.6985971927642822 }, { "auxiliary_loss_clip": 0.01135372, "auxiliary_loss_mlp": 0.01023854, "balance_loss_clip": 1.04197621, "balance_loss_mlp": 1.01646328, "epoch": 0.9417423194853605, "flos": 23511973115520.0, "grad_norm": 2.1234334634262058, "language_loss": 0.721129, "learning_rate": 3.543133737029391e-08, "loss": 0.74272126, "num_input_tokens_seen": 169310865, "step": 7832, "time_per_iteration": 2.702070951461792 }, { "auxiliary_loss_clip": 0.0115324, "auxiliary_loss_mlp": 0.01024108, "balance_loss_clip": 1.04359698, "balance_loss_mlp": 1.01715255, "epoch": 0.9418625623759995, "flos": 23915106432000.0, "grad_norm": 2.0110225017521888, "language_loss": 0.69028032, "learning_rate": 3.5285509733481214e-08, "loss": 0.71205384, "num_input_tokens_seen": 169330590, "step": 7833, "time_per_iteration": 2.592555522918701 }, { "auxiliary_loss_clip": 0.01145108, "auxiliary_loss_mlp": 0.01033032, "balance_loss_clip": 1.04166389, "balance_loss_mlp": 1.02597451, "epoch": 0.9419828052666386, "flos": 18076965292800.0, "grad_norm": 1.723215340504124, "language_loss": 0.76681441, "learning_rate": 3.513998014229469e-08, "loss": 0.78859586, "num_input_tokens_seen": 169349540, "step": 7834, "time_per_iteration": 3.540327787399292 }, { "auxiliary_loss_clip": 0.01138805, "auxiliary_loss_mlp": 0.01027925, "balance_loss_clip": 1.04408598, "balance_loss_mlp": 1.0207516, "epoch": 0.9421030481572777, "flos": 17712328377600.0, "grad_norm": 2.153471502770205, "language_loss": 0.86333424, "learning_rate": 3.499474861881069e-08, "loss": 0.88500148, "num_input_tokens_seen": 169366765, "step": 7835, "time_per_iteration": 2.670884132385254 }, { "auxiliary_loss_clip": 0.01096102, "auxiliary_loss_mlp": 0.01023628, "balance_loss_clip": 1.03959942, "balance_loss_mlp": 1.01667261, "epoch": 0.9422232910479168, "flos": 20194114775040.0, "grad_norm": 1.9154409575773295, "language_loss": 0.68223763, "learning_rate": 3.4849815185061136e-08, "loss": 0.70343494, "num_input_tokens_seen": 169386655, "step": 7836, "time_per_iteration": 2.7571349143981934 }, { "auxiliary_loss_clip": 0.01149502, "auxiliary_loss_mlp": 0.01022835, "balance_loss_clip": 1.04236627, "balance_loss_mlp": 1.01653171, "epoch": 0.9423435339385559, "flos": 18442571875200.0, "grad_norm": 1.8623034627055841, "language_loss": 0.7586689, "learning_rate": 3.470517986303223e-08, "loss": 0.78039229, "num_input_tokens_seen": 169405640, "step": 7837, "time_per_iteration": 2.5983200073242188 }, { "auxiliary_loss_clip": 0.0112291, "auxiliary_loss_mlp": 0.01027807, "balance_loss_clip": 1.04316759, "balance_loss_mlp": 1.0206815, "epoch": 0.942463776829195, "flos": 20080636732800.0, "grad_norm": 1.7681533829632001, "language_loss": 0.79160464, "learning_rate": 3.4560842674664856e-08, "loss": 0.81311178, "num_input_tokens_seen": 169424155, "step": 7838, "time_per_iteration": 2.6564249992370605 }, { "auxiliary_loss_clip": 0.01150405, "auxiliary_loss_mlp": 0.01026228, "balance_loss_clip": 1.04107678, "balance_loss_mlp": 1.0194273, "epoch": 0.9425840197198341, "flos": 22636255536000.0, "grad_norm": 2.1226708734079507, "language_loss": 0.75447607, "learning_rate": 3.441680364185506e-08, "loss": 0.77624243, "num_input_tokens_seen": 169444025, "step": 7839, "time_per_iteration": 2.620164155960083 }, { "auxiliary_loss_clip": 0.01140058, "auxiliary_loss_mlp": 0.01028359, "balance_loss_clip": 1.04436398, "balance_loss_mlp": 1.02050877, "epoch": 0.9427042626104731, "flos": 19937892084480.0, "grad_norm": 2.5904364599257823, "language_loss": 0.74555492, "learning_rate": 3.427306278645314e-08, "loss": 0.76723909, "num_input_tokens_seen": 169462480, "step": 7840, "time_per_iteration": 2.5902369022369385 }, { "auxiliary_loss_clip": 0.01107641, "auxiliary_loss_mlp": 0.01021272, "balance_loss_clip": 1.04004622, "balance_loss_mlp": 1.01489723, "epoch": 0.9428245055011123, "flos": 22856998567680.0, "grad_norm": 1.7829422666990453, "language_loss": 0.72964311, "learning_rate": 3.4129620130264767e-08, "loss": 0.75093222, "num_input_tokens_seen": 169480840, "step": 7841, "time_per_iteration": 2.7474796772003174 }, { "auxiliary_loss_clip": 0.0113879, "auxiliary_loss_mlp": 0.0076188, "balance_loss_clip": 1.04290938, "balance_loss_mlp": 1.00031424, "epoch": 0.9429447483917514, "flos": 20951757371520.0, "grad_norm": 2.481527226478723, "language_loss": 0.77835202, "learning_rate": 3.398647569505009e-08, "loss": 0.79735869, "num_input_tokens_seen": 169498265, "step": 7842, "time_per_iteration": 2.6336257457733154 }, { "auxiliary_loss_clip": 0.01127239, "auxiliary_loss_mlp": 0.01026547, "balance_loss_clip": 1.0415436, "balance_loss_mlp": 1.0196898, "epoch": 0.9430649912823904, "flos": 18843658116480.0, "grad_norm": 2.6690813613923834, "language_loss": 0.74677694, "learning_rate": 3.384362950252373e-08, "loss": 0.76831472, "num_input_tokens_seen": 169515235, "step": 7843, "time_per_iteration": 2.6857542991638184 }, { "auxiliary_loss_clip": 0.01136668, "auxiliary_loss_mlp": 0.01025235, "balance_loss_clip": 1.04246593, "balance_loss_mlp": 1.01821399, "epoch": 0.9431852341730296, "flos": 32556038837760.0, "grad_norm": 2.891247341447621, "language_loss": 0.5697732, "learning_rate": 3.3701081574355473e-08, "loss": 0.59139228, "num_input_tokens_seen": 169537195, "step": 7844, "time_per_iteration": 2.7464168071746826 }, { "auxiliary_loss_clip": 0.01045078, "auxiliary_loss_mlp": 0.01002146, "balance_loss_clip": 1.00953269, "balance_loss_mlp": 1.00111485, "epoch": 0.9433054770636686, "flos": 66904490252160.0, "grad_norm": 0.6402865208740424, "language_loss": 0.51644576, "learning_rate": 3.3558831932169796e-08, "loss": 0.53691792, "num_input_tokens_seen": 169605865, "step": 7845, "time_per_iteration": 3.294221878051758 }, { "auxiliary_loss_clip": 0.01149919, "auxiliary_loss_mlp": 0.01028407, "balance_loss_clip": 1.04484558, "balance_loss_mlp": 1.02077186, "epoch": 0.9434257199543077, "flos": 26140346916480.0, "grad_norm": 1.8589253929989527, "language_loss": 0.88326597, "learning_rate": 3.341688059754588e-08, "loss": 0.90504926, "num_input_tokens_seen": 169621520, "step": 7846, "time_per_iteration": 2.676137924194336 }, { "auxiliary_loss_clip": 0.01129967, "auxiliary_loss_mlp": 0.00762015, "balance_loss_clip": 1.04069257, "balance_loss_mlp": 1.00038695, "epoch": 0.9435459628449467, "flos": 25003486483200.0, "grad_norm": 2.0544964017410092, "language_loss": 0.7801609, "learning_rate": 3.327522759201762e-08, "loss": 0.79908073, "num_input_tokens_seen": 169641390, "step": 7847, "time_per_iteration": 2.7536017894744873 }, { "auxiliary_loss_clip": 0.01119163, "auxiliary_loss_mlp": 0.01023955, "balance_loss_clip": 1.03967416, "balance_loss_mlp": 1.01675463, "epoch": 0.9436662057355859, "flos": 22163240309760.0, "grad_norm": 2.150555544668834, "language_loss": 0.66450167, "learning_rate": 3.313387293707359e-08, "loss": 0.68593282, "num_input_tokens_seen": 169660095, "step": 7848, "time_per_iteration": 2.6711976528167725 }, { "auxiliary_loss_clip": 0.01118845, "auxiliary_loss_mlp": 0.01024193, "balance_loss_clip": 1.04276872, "balance_loss_mlp": 1.01646233, "epoch": 0.943786448626225, "flos": 20118522602880.0, "grad_norm": 1.9224505622188945, "language_loss": 0.68586612, "learning_rate": 3.29928166541571e-08, "loss": 0.70729649, "num_input_tokens_seen": 169679050, "step": 7849, "time_per_iteration": 2.6881844997406006 }, { "auxiliary_loss_clip": 0.01129392, "auxiliary_loss_mlp": 0.01023147, "balance_loss_clip": 1.04203224, "balance_loss_mlp": 1.01616168, "epoch": 0.943906691516864, "flos": 22090808534400.0, "grad_norm": 2.012574914533562, "language_loss": 0.80155396, "learning_rate": 3.2852058764666346e-08, "loss": 0.82307935, "num_input_tokens_seen": 169698150, "step": 7850, "time_per_iteration": 2.6562323570251465 }, { "auxiliary_loss_clip": 0.0111406, "auxiliary_loss_mlp": 0.0102548, "balance_loss_clip": 1.0438509, "balance_loss_mlp": 1.0189923, "epoch": 0.9440269344075032, "flos": 35298501212160.0, "grad_norm": 2.1482244338987764, "language_loss": 0.68547869, "learning_rate": 3.2711599289954264e-08, "loss": 0.70687401, "num_input_tokens_seen": 169722185, "step": 7851, "time_per_iteration": 2.8055436611175537 }, { "auxiliary_loss_clip": 0.01095691, "auxiliary_loss_mlp": 0.01024342, "balance_loss_clip": 1.03925538, "balance_loss_mlp": 1.01726139, "epoch": 0.9441471772981422, "flos": 19238136255360.0, "grad_norm": 1.7796143752308953, "language_loss": 0.77619755, "learning_rate": 3.257143825132847e-08, "loss": 0.79739785, "num_input_tokens_seen": 169740355, "step": 7852, "time_per_iteration": 3.7091825008392334 }, { "auxiliary_loss_clip": 0.0113509, "auxiliary_loss_mlp": 0.01023498, "balance_loss_clip": 1.04092598, "balance_loss_mlp": 1.0174396, "epoch": 0.9442674201887813, "flos": 25739799379200.0, "grad_norm": 1.808145066900263, "language_loss": 0.76273876, "learning_rate": 3.243157567005106e-08, "loss": 0.78432465, "num_input_tokens_seen": 169758535, "step": 7853, "time_per_iteration": 2.670938730239868 }, { "auxiliary_loss_clip": 0.01171353, "auxiliary_loss_mlp": 0.01025994, "balance_loss_clip": 1.05066454, "balance_loss_mlp": 1.01875854, "epoch": 0.9443876630794205, "flos": 15523321737600.0, "grad_norm": 3.676424373135769, "language_loss": 0.6425153, "learning_rate": 3.2292011567339296e-08, "loss": 0.66448879, "num_input_tokens_seen": 169776340, "step": 7854, "time_per_iteration": 2.539104700088501 }, { "auxiliary_loss_clip": 0.01151789, "auxiliary_loss_mlp": 0.00762125, "balance_loss_clip": 1.0429945, "balance_loss_mlp": 1.00032401, "epoch": 0.9445079059700595, "flos": 13400821128960.0, "grad_norm": 2.8967376786745187, "language_loss": 0.55890584, "learning_rate": 3.21527459643649e-08, "loss": 0.57804489, "num_input_tokens_seen": 169793225, "step": 7855, "time_per_iteration": 3.5756161212921143 }, { "auxiliary_loss_clip": 0.01152973, "auxiliary_loss_mlp": 0.01028058, "balance_loss_clip": 1.0445838, "balance_loss_mlp": 1.02121186, "epoch": 0.9446281488606986, "flos": 23659242877440.0, "grad_norm": 1.9345118044436524, "language_loss": 0.7420795, "learning_rate": 3.2013778882254536e-08, "loss": 0.76388985, "num_input_tokens_seen": 169812020, "step": 7856, "time_per_iteration": 3.5079874992370605 }, { "auxiliary_loss_clip": 0.01143321, "auxiliary_loss_mlp": 0.01026345, "balance_loss_clip": 1.04243731, "balance_loss_mlp": 1.01918662, "epoch": 0.9447483917513377, "flos": 25557337267200.0, "grad_norm": 1.8519482588984217, "language_loss": 0.75521028, "learning_rate": 3.1875110342088676e-08, "loss": 0.77690691, "num_input_tokens_seen": 169833470, "step": 7857, "time_per_iteration": 2.6735873222351074 }, { "auxiliary_loss_clip": 0.01130334, "auxiliary_loss_mlp": 0.01023101, "balance_loss_clip": 1.04193366, "balance_loss_mlp": 1.01570988, "epoch": 0.9448686346419768, "flos": 24535463247360.0, "grad_norm": 1.6165695169769632, "language_loss": 0.65628213, "learning_rate": 3.1736740364904035e-08, "loss": 0.67781651, "num_input_tokens_seen": 169854000, "step": 7858, "time_per_iteration": 2.7221643924713135 }, { "auxiliary_loss_clip": 0.01110037, "auxiliary_loss_mlp": 0.00762294, "balance_loss_clip": 1.04152942, "balance_loss_mlp": 1.00028765, "epoch": 0.9449888775326158, "flos": 14721256995840.0, "grad_norm": 2.1637079930197016, "language_loss": 0.77118349, "learning_rate": 3.159866897169094e-08, "loss": 0.7899068, "num_input_tokens_seen": 169872200, "step": 7859, "time_per_iteration": 2.68589186668396 }, { "auxiliary_loss_clip": 0.0113053, "auxiliary_loss_mlp": 0.01021013, "balance_loss_clip": 1.04301298, "balance_loss_mlp": 1.01411998, "epoch": 0.945109120423255, "flos": 15447873219840.0, "grad_norm": 1.9078750596760758, "language_loss": 0.75681674, "learning_rate": 3.146089618339487e-08, "loss": 0.77833217, "num_input_tokens_seen": 169889055, "step": 7860, "time_per_iteration": 3.549943447113037 }, { "auxiliary_loss_clip": 0.01122639, "auxiliary_loss_mlp": 0.01023042, "balance_loss_clip": 1.04011655, "balance_loss_mlp": 1.01607132, "epoch": 0.9452293633138941, "flos": 25448097029760.0, "grad_norm": 2.170510975199028, "language_loss": 0.6794796, "learning_rate": 3.132342202091554e-08, "loss": 0.70093644, "num_input_tokens_seen": 169909280, "step": 7861, "time_per_iteration": 2.712127923965454 }, { "auxiliary_loss_clip": 0.01165858, "auxiliary_loss_mlp": 0.01027352, "balance_loss_clip": 1.04472113, "balance_loss_mlp": 1.02011585, "epoch": 0.9453496062045331, "flos": 21215342350080.0, "grad_norm": 2.4412224811186056, "language_loss": 0.68627465, "learning_rate": 3.1186246505107595e-08, "loss": 0.70820677, "num_input_tokens_seen": 169928420, "step": 7862, "time_per_iteration": 2.6397109031677246 }, { "auxiliary_loss_clip": 0.01152053, "auxiliary_loss_mlp": 0.01027138, "balance_loss_clip": 1.04599452, "balance_loss_mlp": 1.02014327, "epoch": 0.9454698490951723, "flos": 20010898477440.0, "grad_norm": 1.7577200805806033, "language_loss": 0.83635545, "learning_rate": 3.104936965678084e-08, "loss": 0.85814738, "num_input_tokens_seen": 169946750, "step": 7863, "time_per_iteration": 2.603236198425293 }, { "auxiliary_loss_clip": 0.01152274, "auxiliary_loss_mlp": 0.01022159, "balance_loss_clip": 1.04385805, "balance_loss_mlp": 1.01509571, "epoch": 0.9455900919858113, "flos": 21069652786560.0, "grad_norm": 1.884431625539557, "language_loss": 0.82112896, "learning_rate": 3.091279149669956e-08, "loss": 0.84287333, "num_input_tokens_seen": 169965540, "step": 7864, "time_per_iteration": 2.655489444732666 }, { "auxiliary_loss_clip": 0.01151738, "auxiliary_loss_mlp": 0.00761958, "balance_loss_clip": 1.04512286, "balance_loss_mlp": 1.0002619, "epoch": 0.9457103348764504, "flos": 20740854666240.0, "grad_norm": 2.1082932355105193, "language_loss": 0.73691326, "learning_rate": 3.0776512045581624e-08, "loss": 0.75605023, "num_input_tokens_seen": 169984330, "step": 7865, "time_per_iteration": 2.6085777282714844 }, { "auxiliary_loss_clip": 0.01130716, "auxiliary_loss_mlp": 0.01027216, "balance_loss_clip": 1.04219747, "balance_loss_mlp": 1.01943731, "epoch": 0.9458305777670896, "flos": 21428363957760.0, "grad_norm": 1.7868353545588598, "language_loss": 0.77854562, "learning_rate": 3.0640531324101384e-08, "loss": 0.80012488, "num_input_tokens_seen": 170002095, "step": 7866, "time_per_iteration": 2.6421401500701904 }, { "auxiliary_loss_clip": 0.01155521, "auxiliary_loss_mlp": 0.01027199, "balance_loss_clip": 1.04809785, "balance_loss_mlp": 1.0193485, "epoch": 0.9459508206577286, "flos": 20011185786240.0, "grad_norm": 2.9441508092602344, "language_loss": 0.7557767, "learning_rate": 3.0504849352886554e-08, "loss": 0.77760386, "num_input_tokens_seen": 170020240, "step": 7867, "time_per_iteration": 2.565398693084717 }, { "auxiliary_loss_clip": 0.01150582, "auxiliary_loss_mlp": 0.01029576, "balance_loss_clip": 1.04554343, "balance_loss_mlp": 1.02182436, "epoch": 0.9460710635483677, "flos": 12166428291840.0, "grad_norm": 4.0364149872221935, "language_loss": 0.71309149, "learning_rate": 3.036946615252023e-08, "loss": 0.73489308, "num_input_tokens_seen": 170035770, "step": 7868, "time_per_iteration": 2.5777430534362793 }, { "auxiliary_loss_clip": 0.01141932, "auxiliary_loss_mlp": 0.01032129, "balance_loss_clip": 1.04285955, "balance_loss_mlp": 1.02522683, "epoch": 0.9461913064390068, "flos": 34276196229120.0, "grad_norm": 2.616990957507741, "language_loss": 0.67152512, "learning_rate": 3.0234381743539984e-08, "loss": 0.6932658, "num_input_tokens_seen": 170053385, "step": 7869, "time_per_iteration": 2.727832078933716 }, { "auxiliary_loss_clip": 0.01142097, "auxiliary_loss_mlp": 0.01023509, "balance_loss_clip": 1.04248309, "balance_loss_mlp": 1.01636815, "epoch": 0.9463115493296459, "flos": 19463763536640.0, "grad_norm": 2.0082512835847774, "language_loss": 0.79931319, "learning_rate": 3.0099596146437863e-08, "loss": 0.82096922, "num_input_tokens_seen": 170070490, "step": 7870, "time_per_iteration": 2.6754496097564697 }, { "auxiliary_loss_clip": 0.01062676, "auxiliary_loss_mlp": 0.01002641, "balance_loss_clip": 1.00913978, "balance_loss_mlp": 1.00160968, "epoch": 0.946431792220285, "flos": 70570824387840.0, "grad_norm": 0.7785239177397602, "language_loss": 0.60063326, "learning_rate": 2.996510938166086e-08, "loss": 0.62128645, "num_input_tokens_seen": 170133465, "step": 7871, "time_per_iteration": 3.27728271484375 }, { "auxiliary_loss_clip": 0.01148775, "auxiliary_loss_mlp": 0.01021976, "balance_loss_clip": 1.04512846, "balance_loss_mlp": 1.01571786, "epoch": 0.9465520351109241, "flos": 18947906363520.0, "grad_norm": 1.9345491631176246, "language_loss": 0.73896456, "learning_rate": 2.983092146960997e-08, "loss": 0.76067203, "num_input_tokens_seen": 170150810, "step": 7872, "time_per_iteration": 2.546621084213257 }, { "auxiliary_loss_clip": 0.01136145, "auxiliary_loss_mlp": 0.01024107, "balance_loss_clip": 1.03973544, "balance_loss_mlp": 1.01683271, "epoch": 0.9466722780015632, "flos": 19135647774720.0, "grad_norm": 2.199176005347075, "language_loss": 0.80182904, "learning_rate": 2.9697032430642256e-08, "loss": 0.82343155, "num_input_tokens_seen": 170169025, "step": 7873, "time_per_iteration": 2.6317009925842285 }, { "auxiliary_loss_clip": 0.01160958, "auxiliary_loss_mlp": 0.01021575, "balance_loss_clip": 1.04365051, "balance_loss_mlp": 1.01498246, "epoch": 0.9467925208922022, "flos": 17237912520960.0, "grad_norm": 2.1080195748630786, "language_loss": 0.73515105, "learning_rate": 2.9563442285067906e-08, "loss": 0.75697637, "num_input_tokens_seen": 170186070, "step": 7874, "time_per_iteration": 2.721874713897705 }, { "auxiliary_loss_clip": 0.01151361, "auxiliary_loss_mlp": 0.01023575, "balance_loss_clip": 1.04467607, "balance_loss_mlp": 1.01646996, "epoch": 0.9469127637828414, "flos": 29169016859520.0, "grad_norm": 1.9559064397152521, "language_loss": 0.79632425, "learning_rate": 2.943015105315294e-08, "loss": 0.81807363, "num_input_tokens_seen": 170206265, "step": 7875, "time_per_iteration": 2.700072765350342 }, { "auxiliary_loss_clip": 0.01111641, "auxiliary_loss_mlp": 0.01029234, "balance_loss_clip": 1.03938687, "balance_loss_mlp": 1.02158713, "epoch": 0.9470330066734804, "flos": 26030460234240.0, "grad_norm": 2.205715260630357, "language_loss": 0.6670866, "learning_rate": 2.929715875511718e-08, "loss": 0.68849534, "num_input_tokens_seen": 170225300, "step": 7876, "time_per_iteration": 2.7851738929748535 }, { "auxiliary_loss_clip": 0.01150744, "auxiliary_loss_mlp": 0.01027319, "balance_loss_clip": 1.041857, "balance_loss_mlp": 1.02025604, "epoch": 0.9471532495641195, "flos": 23440906056960.0, "grad_norm": 1.966223756285267, "language_loss": 0.70222205, "learning_rate": 2.9164465411135375e-08, "loss": 0.72400272, "num_input_tokens_seen": 170245070, "step": 7877, "time_per_iteration": 2.662336826324463 }, { "auxiliary_loss_clip": 0.01151483, "auxiliary_loss_mlp": 0.0102707, "balance_loss_clip": 1.04553914, "balance_loss_mlp": 1.01974511, "epoch": 0.9472734924547586, "flos": 15815850099840.0, "grad_norm": 1.8709866653497271, "language_loss": 0.80934906, "learning_rate": 2.9032071041337426e-08, "loss": 0.83113456, "num_input_tokens_seen": 170263305, "step": 7878, "time_per_iteration": 3.541469097137451 }, { "auxiliary_loss_clip": 0.01127551, "auxiliary_loss_mlp": 0.0102188, "balance_loss_clip": 1.04075789, "balance_loss_mlp": 1.01515019, "epoch": 0.9473937353453977, "flos": 11181793697280.0, "grad_norm": 1.6019417625161148, "language_loss": 0.72558171, "learning_rate": 2.889997566580704e-08, "loss": 0.74707603, "num_input_tokens_seen": 170281460, "step": 7879, "time_per_iteration": 2.6238603591918945 }, { "auxiliary_loss_clip": 0.0116741, "auxiliary_loss_mlp": 0.01024799, "balance_loss_clip": 1.04538226, "balance_loss_mlp": 1.01756024, "epoch": 0.9475139782360368, "flos": 25775530433280.0, "grad_norm": 1.690794623925786, "language_loss": 0.70086074, "learning_rate": 2.8768179304583086e-08, "loss": 0.72278285, "num_input_tokens_seen": 170303515, "step": 7880, "time_per_iteration": 3.6273372173309326 }, { "auxiliary_loss_clip": 0.01124576, "auxiliary_loss_mlp": 0.0102822, "balance_loss_clip": 1.04413033, "balance_loss_mlp": 1.0216043, "epoch": 0.9476342211266758, "flos": 22820046451200.0, "grad_norm": 1.5601543818308745, "language_loss": 0.73447388, "learning_rate": 2.8636681977659117e-08, "loss": 0.75600183, "num_input_tokens_seen": 170323165, "step": 7881, "time_per_iteration": 2.7378628253936768 }, { "auxiliary_loss_clip": 0.01105915, "auxiliary_loss_mlp": 0.01024855, "balance_loss_clip": 1.04171038, "balance_loss_mlp": 1.01701641, "epoch": 0.947754464017315, "flos": 20193611984640.0, "grad_norm": 2.317071342537622, "language_loss": 0.77832526, "learning_rate": 2.850548370498318e-08, "loss": 0.79963291, "num_input_tokens_seen": 170341005, "step": 7882, "time_per_iteration": 2.722477912902832 }, { "auxiliary_loss_clip": 0.01150128, "auxiliary_loss_mlp": 0.01023743, "balance_loss_clip": 1.04214072, "balance_loss_mlp": 1.01713896, "epoch": 0.9478747069079541, "flos": 24717925359360.0, "grad_norm": 1.5809888693799885, "language_loss": 0.71323019, "learning_rate": 2.8374584506457798e-08, "loss": 0.7349689, "num_input_tokens_seen": 170362280, "step": 7883, "time_per_iteration": 3.609607696533203 }, { "auxiliary_loss_clip": 0.01134516, "auxiliary_loss_mlp": 0.01024114, "balance_loss_clip": 1.04232812, "balance_loss_mlp": 1.01710498, "epoch": 0.9479949497985931, "flos": 21361355136000.0, "grad_norm": 2.6493193055394446, "language_loss": 0.67379683, "learning_rate": 2.824398440193998e-08, "loss": 0.69538319, "num_input_tokens_seen": 170381080, "step": 7884, "time_per_iteration": 2.718432903289795 }, { "auxiliary_loss_clip": 0.01100297, "auxiliary_loss_mlp": 0.01023556, "balance_loss_clip": 1.03878093, "balance_loss_mlp": 1.01630223, "epoch": 0.9481151926892323, "flos": 18148606968960.0, "grad_norm": 2.319459285527836, "language_loss": 0.71401632, "learning_rate": 2.811368341124232e-08, "loss": 0.73525488, "num_input_tokens_seen": 170400150, "step": 7885, "time_per_iteration": 2.749744415283203 }, { "auxiliary_loss_clip": 0.01137346, "auxiliary_loss_mlp": 0.01023478, "balance_loss_clip": 1.04294777, "balance_loss_mlp": 1.01685047, "epoch": 0.9482354355798713, "flos": 22128012046080.0, "grad_norm": 3.1312871845594543, "language_loss": 0.68416899, "learning_rate": 2.7983681554131222e-08, "loss": 0.70577729, "num_input_tokens_seen": 170420410, "step": 7886, "time_per_iteration": 3.5633254051208496 }, { "auxiliary_loss_clip": 0.01133988, "auxiliary_loss_mlp": 0.01025216, "balance_loss_clip": 1.04110086, "balance_loss_mlp": 1.017223, "epoch": 0.9483556784705104, "flos": 19063072344960.0, "grad_norm": 2.097583928177926, "language_loss": 0.70713681, "learning_rate": 2.7853978850327365e-08, "loss": 0.72872877, "num_input_tokens_seen": 170439580, "step": 7887, "time_per_iteration": 2.709789276123047 }, { "auxiliary_loss_clip": 0.01122646, "auxiliary_loss_mlp": 0.01026401, "balance_loss_clip": 1.04458261, "balance_loss_mlp": 1.01999664, "epoch": 0.9484759213611496, "flos": 25777110631680.0, "grad_norm": 1.8507677180515818, "language_loss": 0.8731817, "learning_rate": 2.7724575319507225e-08, "loss": 0.89467216, "num_input_tokens_seen": 170459290, "step": 7888, "time_per_iteration": 2.7674224376678467 }, { "auxiliary_loss_clip": 0.0114784, "auxiliary_loss_mlp": 0.01025059, "balance_loss_clip": 1.04115129, "balance_loss_mlp": 1.01882386, "epoch": 0.9485961642517886, "flos": 20667740532480.0, "grad_norm": 3.8850557952153957, "language_loss": 0.7714653, "learning_rate": 2.759547098130044e-08, "loss": 0.79319429, "num_input_tokens_seen": 170478020, "step": 7889, "time_per_iteration": 2.6014273166656494 }, { "auxiliary_loss_clip": 0.0116178, "auxiliary_loss_mlp": 0.01027663, "balance_loss_clip": 1.0443747, "balance_loss_mlp": 1.02063549, "epoch": 0.9487164071424277, "flos": 22674069578880.0, "grad_norm": 1.94859433417905, "language_loss": 0.77006108, "learning_rate": 2.746666585529267e-08, "loss": 0.79195547, "num_input_tokens_seen": 170498295, "step": 7890, "time_per_iteration": 2.614652395248413 }, { "auxiliary_loss_clip": 0.01144473, "auxiliary_loss_mlp": 0.01020323, "balance_loss_clip": 1.04322267, "balance_loss_mlp": 1.01322412, "epoch": 0.9488366500330668, "flos": 38726461716480.0, "grad_norm": 2.343402868644441, "language_loss": 0.73985994, "learning_rate": 2.73381599610234e-08, "loss": 0.76150787, "num_input_tokens_seen": 170518695, "step": 7891, "time_per_iteration": 2.799907684326172 }, { "auxiliary_loss_clip": 0.01144175, "auxiliary_loss_mlp": 0.01022154, "balance_loss_clip": 1.04072726, "balance_loss_mlp": 1.01500714, "epoch": 0.9489568929237059, "flos": 27890920149120.0, "grad_norm": 1.9990590770487782, "language_loss": 0.71405971, "learning_rate": 2.7209953317987033e-08, "loss": 0.73572296, "num_input_tokens_seen": 170539735, "step": 7892, "time_per_iteration": 2.709376811981201 }, { "auxiliary_loss_clip": 0.01150324, "auxiliary_loss_mlp": 0.01025327, "balance_loss_clip": 1.04421163, "balance_loss_mlp": 1.01834464, "epoch": 0.9490771358143449, "flos": 33580642291200.0, "grad_norm": 2.4083439569196923, "language_loss": 0.78563654, "learning_rate": 2.7082045945631793e-08, "loss": 0.80739307, "num_input_tokens_seen": 170561950, "step": 7893, "time_per_iteration": 2.7012529373168945 }, { "auxiliary_loss_clip": 0.01113068, "auxiliary_loss_mlp": 0.01021313, "balance_loss_clip": 1.03908563, "balance_loss_mlp": 1.01458621, "epoch": 0.9491973787049841, "flos": 14793796512000.0, "grad_norm": 2.123326056242042, "language_loss": 0.69360036, "learning_rate": 2.6954437863361712e-08, "loss": 0.71494412, "num_input_tokens_seen": 170579865, "step": 7894, "time_per_iteration": 2.7140274047851562 }, { "auxiliary_loss_clip": 0.01094293, "auxiliary_loss_mlp": 0.01020354, "balance_loss_clip": 1.03717887, "balance_loss_mlp": 1.01390457, "epoch": 0.9493176215956232, "flos": 25332535998720.0, "grad_norm": 2.172716658965047, "language_loss": 0.70698136, "learning_rate": 2.6827129090534862e-08, "loss": 0.72812784, "num_input_tokens_seen": 170600165, "step": 7895, "time_per_iteration": 2.7593164443969727 }, { "auxiliary_loss_clip": 0.01136634, "auxiliary_loss_mlp": 0.01028504, "balance_loss_clip": 1.04415321, "balance_loss_mlp": 1.02105975, "epoch": 0.9494378644862622, "flos": 21029971236480.0, "grad_norm": 1.7848795393214096, "language_loss": 0.77538872, "learning_rate": 2.670011964646335e-08, "loss": 0.7970401, "num_input_tokens_seen": 170618845, "step": 7896, "time_per_iteration": 2.6858904361724854 }, { "auxiliary_loss_clip": 0.01085643, "auxiliary_loss_mlp": 0.01026104, "balance_loss_clip": 1.03352451, "balance_loss_mlp": 1.01907039, "epoch": 0.9495581073769014, "flos": 15195134148480.0, "grad_norm": 2.3784811447099417, "language_loss": 0.6815356, "learning_rate": 2.657340955041487e-08, "loss": 0.70265305, "num_input_tokens_seen": 170637620, "step": 7897, "time_per_iteration": 2.780773401260376 }, { "auxiliary_loss_clip": 0.01136218, "auxiliary_loss_mlp": 0.0102643, "balance_loss_clip": 1.04599667, "balance_loss_mlp": 1.01881003, "epoch": 0.9496783502675404, "flos": 28616566705920.0, "grad_norm": 2.792765804563931, "language_loss": 0.71661323, "learning_rate": 2.6446998821611167e-08, "loss": 0.73823977, "num_input_tokens_seen": 170657815, "step": 7898, "time_per_iteration": 2.793844223022461 }, { "auxiliary_loss_clip": 0.01110983, "auxiliary_loss_mlp": 0.01025136, "balance_loss_clip": 1.03972173, "balance_loss_mlp": 1.01774228, "epoch": 0.9497985931581795, "flos": 14866874732160.0, "grad_norm": 2.9851947587891945, "language_loss": 0.71620119, "learning_rate": 2.6320887479228228e-08, "loss": 0.73756242, "num_input_tokens_seen": 170674415, "step": 7899, "time_per_iteration": 2.7338685989379883 }, { "auxiliary_loss_clip": 0.01140004, "auxiliary_loss_mlp": 0.01026625, "balance_loss_clip": 1.04300427, "balance_loss_mlp": 1.01972246, "epoch": 0.9499188360488187, "flos": 27193319136000.0, "grad_norm": 2.209504673183118, "language_loss": 0.72498256, "learning_rate": 2.619507554239786e-08, "loss": 0.74664885, "num_input_tokens_seen": 170692975, "step": 7900, "time_per_iteration": 2.699279546737671 }, { "auxiliary_loss_clip": 0.01133394, "auxiliary_loss_mlp": 0.01024472, "balance_loss_clip": 1.04210353, "balance_loss_mlp": 1.01715291, "epoch": 0.9500390789394577, "flos": 24316479982080.0, "grad_norm": 1.6838114410893363, "language_loss": 0.69775701, "learning_rate": 2.606956303020502e-08, "loss": 0.71933562, "num_input_tokens_seen": 170713780, "step": 7901, "time_per_iteration": 2.692856788635254 }, { "auxiliary_loss_clip": 0.01150499, "auxiliary_loss_mlp": 0.01027082, "balance_loss_clip": 1.04479337, "balance_loss_mlp": 1.01991487, "epoch": 0.9501593218300968, "flos": 14354752573440.0, "grad_norm": 1.7072490692838345, "language_loss": 0.84120607, "learning_rate": 2.5944349961690036e-08, "loss": 0.86298192, "num_input_tokens_seen": 170730800, "step": 7902, "time_per_iteration": 2.7365963459014893 }, { "auxiliary_loss_clip": 0.01122037, "auxiliary_loss_mlp": 0.0102219, "balance_loss_clip": 1.04157209, "balance_loss_mlp": 1.01559186, "epoch": 0.9502795647207359, "flos": 38728113742080.0, "grad_norm": 1.7914652830427467, "language_loss": 0.73148125, "learning_rate": 2.581943635584749e-08, "loss": 0.75292355, "num_input_tokens_seen": 170753630, "step": 7903, "time_per_iteration": 2.8693509101867676 }, { "auxiliary_loss_clip": 0.01128155, "auxiliary_loss_mlp": 0.01025875, "balance_loss_clip": 1.0406301, "balance_loss_mlp": 1.01892173, "epoch": 0.950399807611375, "flos": 40808023799040.0, "grad_norm": 1.5866928507319307, "language_loss": 0.65905428, "learning_rate": 2.569482223162689e-08, "loss": 0.68059456, "num_input_tokens_seen": 170777605, "step": 7904, "time_per_iteration": 3.752171754837036 }, { "auxiliary_loss_clip": 0.01150346, "auxiliary_loss_mlp": 0.01028041, "balance_loss_clip": 1.04280281, "balance_loss_mlp": 1.0207963, "epoch": 0.950520050502014, "flos": 23440403266560.0, "grad_norm": 1.7790623976542161, "language_loss": 0.72540545, "learning_rate": 2.5570507607932e-08, "loss": 0.7471894, "num_input_tokens_seen": 170797520, "step": 7905, "time_per_iteration": 2.7800979614257812 }, { "auxiliary_loss_clip": 0.01155307, "auxiliary_loss_mlp": 0.01022687, "balance_loss_clip": 1.04462159, "balance_loss_mlp": 1.01543283, "epoch": 0.9506402933926532, "flos": 17783718658560.0, "grad_norm": 3.518803437021101, "language_loss": 0.64251161, "learning_rate": 2.54464925036213e-08, "loss": 0.6642915, "num_input_tokens_seen": 170814810, "step": 7906, "time_per_iteration": 2.6155145168304443 }, { "auxiliary_loss_clip": 0.01151781, "auxiliary_loss_mlp": 0.01022901, "balance_loss_clip": 1.04515624, "balance_loss_mlp": 1.01632929, "epoch": 0.9507605362832923, "flos": 32561928668160.0, "grad_norm": 1.856450609561302, "language_loss": 0.60919785, "learning_rate": 2.532277693750773e-08, "loss": 0.63094467, "num_input_tokens_seen": 170835735, "step": 7907, "time_per_iteration": 3.6344966888427734 }, { "auxiliary_loss_clip": 0.01106928, "auxiliary_loss_mlp": 0.01024902, "balance_loss_clip": 1.04285598, "balance_loss_mlp": 1.01791644, "epoch": 0.9508807791739313, "flos": 19602054898560.0, "grad_norm": 6.082223907026243, "language_loss": 0.75835758, "learning_rate": 2.5199360928358948e-08, "loss": 0.77967584, "num_input_tokens_seen": 170852970, "step": 7908, "time_per_iteration": 3.580583333969116 }, { "auxiliary_loss_clip": 0.01139307, "auxiliary_loss_mlp": 0.00761198, "balance_loss_clip": 1.04000032, "balance_loss_mlp": 1.00028348, "epoch": 0.9510010220645704, "flos": 21471852349440.0, "grad_norm": 1.6789100981354363, "language_loss": 0.8711741, "learning_rate": 2.507624449489665e-08, "loss": 0.89017916, "num_input_tokens_seen": 170871600, "step": 7909, "time_per_iteration": 2.667489767074585 }, { "auxiliary_loss_clip": 0.01138484, "auxiliary_loss_mlp": 0.01026925, "balance_loss_clip": 1.04481065, "balance_loss_mlp": 1.01923347, "epoch": 0.9511212649552095, "flos": 18879999701760.0, "grad_norm": 2.301005170683377, "language_loss": 0.64959335, "learning_rate": 2.495342765579811e-08, "loss": 0.67124748, "num_input_tokens_seen": 170890260, "step": 7910, "time_per_iteration": 2.6745145320892334 }, { "auxiliary_loss_clip": 0.01107438, "auxiliary_loss_mlp": 0.01023191, "balance_loss_clip": 1.04191732, "balance_loss_mlp": 1.01676261, "epoch": 0.9512415078458486, "flos": 20810521094400.0, "grad_norm": 1.969515219392795, "language_loss": 0.71234626, "learning_rate": 2.4830910429693984e-08, "loss": 0.73365253, "num_input_tokens_seen": 170910220, "step": 7911, "time_per_iteration": 2.770380735397339 }, { "auxiliary_loss_clip": 0.01165229, "auxiliary_loss_mlp": 0.01026976, "balance_loss_clip": 1.04555464, "balance_loss_mlp": 1.02050257, "epoch": 0.9513617507364877, "flos": 18369565482240.0, "grad_norm": 1.8911599146420979, "language_loss": 0.79833037, "learning_rate": 2.470869283517052e-08, "loss": 0.82025242, "num_input_tokens_seen": 170928255, "step": 7912, "time_per_iteration": 3.5448007583618164 }, { "auxiliary_loss_clip": 0.01143412, "auxiliary_loss_mlp": 0.01026992, "balance_loss_clip": 1.04186416, "balance_loss_mlp": 1.02042639, "epoch": 0.9514819936271268, "flos": 25010166412800.0, "grad_norm": 1.6034321280480226, "language_loss": 0.7715615, "learning_rate": 2.458677489076777e-08, "loss": 0.79326546, "num_input_tokens_seen": 170949265, "step": 7913, "time_per_iteration": 2.727651357650757 }, { "auxiliary_loss_clip": 0.01142093, "auxiliary_loss_mlp": 0.0101969, "balance_loss_clip": 1.04161525, "balance_loss_mlp": 1.01307058, "epoch": 0.9516022365177659, "flos": 18662129758080.0, "grad_norm": 1.58896196670677, "language_loss": 0.82859874, "learning_rate": 2.446515661498072e-08, "loss": 0.85021651, "num_input_tokens_seen": 170968595, "step": 7914, "time_per_iteration": 2.6539628505706787 }, { "auxiliary_loss_clip": 0.01094074, "auxiliary_loss_mlp": 0.01020702, "balance_loss_clip": 1.0390749, "balance_loss_mlp": 1.01394629, "epoch": 0.9517224794084049, "flos": 25372109808000.0, "grad_norm": 2.006193083445214, "language_loss": 0.7441982, "learning_rate": 2.434383802625861e-08, "loss": 0.76534605, "num_input_tokens_seen": 170987550, "step": 7915, "time_per_iteration": 2.801605224609375 }, { "auxiliary_loss_clip": 0.0112335, "auxiliary_loss_mlp": 0.01020669, "balance_loss_clip": 1.03876233, "balance_loss_mlp": 1.01370442, "epoch": 0.9518427222990441, "flos": 21470918595840.0, "grad_norm": 1.757371130323707, "language_loss": 0.7396239, "learning_rate": 2.4222819143005168e-08, "loss": 0.76106411, "num_input_tokens_seen": 171007145, "step": 7916, "time_per_iteration": 2.8161075115203857 }, { "auxiliary_loss_clip": 0.01165002, "auxiliary_loss_mlp": 0.01022501, "balance_loss_clip": 1.04724371, "balance_loss_mlp": 1.01546168, "epoch": 0.9519629651896832, "flos": 21033634423680.0, "grad_norm": 1.820261508054258, "language_loss": 0.80657184, "learning_rate": 2.4102099983579706e-08, "loss": 0.82844687, "num_input_tokens_seen": 171026295, "step": 7917, "time_per_iteration": 2.6008799076080322 }, { "auxiliary_loss_clip": 0.01151619, "auxiliary_loss_mlp": 0.01024942, "balance_loss_clip": 1.04360652, "balance_loss_mlp": 1.01729441, "epoch": 0.9520832080803222, "flos": 21689219502720.0, "grad_norm": 1.6703885841444936, "language_loss": 0.77064753, "learning_rate": 2.3981680566294236e-08, "loss": 0.79241312, "num_input_tokens_seen": 171045895, "step": 7918, "time_per_iteration": 2.674933910369873 }, { "auxiliary_loss_clip": 0.01164153, "auxiliary_loss_mlp": 0.01022867, "balance_loss_clip": 1.04727054, "balance_loss_mlp": 1.0160985, "epoch": 0.9522034509709614, "flos": 23145289125120.0, "grad_norm": 1.8000072815995982, "language_loss": 0.73369431, "learning_rate": 2.3861560909416822e-08, "loss": 0.75556451, "num_input_tokens_seen": 171065445, "step": 7919, "time_per_iteration": 2.6103403568267822 }, { "auxiliary_loss_clip": 0.01108243, "auxiliary_loss_mlp": 0.01023626, "balance_loss_clip": 1.04037511, "balance_loss_mlp": 1.0165267, "epoch": 0.9523236938616004, "flos": 24679428958080.0, "grad_norm": 1.6763994099814123, "language_loss": 0.82955658, "learning_rate": 2.3741741031169325e-08, "loss": 0.85087526, "num_input_tokens_seen": 171085015, "step": 7920, "time_per_iteration": 2.760673761367798 }, { "auxiliary_loss_clip": 0.01101841, "auxiliary_loss_mlp": 0.01026838, "balance_loss_clip": 1.03765452, "balance_loss_mlp": 1.01948225, "epoch": 0.9524439367522395, "flos": 22672309812480.0, "grad_norm": 1.887409950985664, "language_loss": 0.71557426, "learning_rate": 2.3622220949728544e-08, "loss": 0.73686105, "num_input_tokens_seen": 171103900, "step": 7921, "time_per_iteration": 2.7461600303649902 }, { "auxiliary_loss_clip": 0.01143892, "auxiliary_loss_mlp": 0.0102299, "balance_loss_clip": 1.04239392, "balance_loss_mlp": 1.01568234, "epoch": 0.9525641796428787, "flos": 34055525024640.0, "grad_norm": 2.5585080989216555, "language_loss": 0.61436707, "learning_rate": 2.3503000683225526e-08, "loss": 0.63603592, "num_input_tokens_seen": 171121615, "step": 7922, "time_per_iteration": 2.7127275466918945 }, { "auxiliary_loss_clip": 0.01165478, "auxiliary_loss_mlp": 0.0102164, "balance_loss_clip": 1.0453366, "balance_loss_mlp": 1.01454103, "epoch": 0.9526844225335177, "flos": 16727083251840.0, "grad_norm": 2.0937550500766013, "language_loss": 0.8447994, "learning_rate": 2.3384080249745585e-08, "loss": 0.86667055, "num_input_tokens_seen": 171139505, "step": 7923, "time_per_iteration": 2.533846855163574 }, { "auxiliary_loss_clip": 0.01113594, "auxiliary_loss_mlp": 0.01024785, "balance_loss_clip": 1.04166079, "balance_loss_mlp": 1.01789188, "epoch": 0.9528046654241568, "flos": 36939367330560.0, "grad_norm": 2.492034667803386, "language_loss": 0.83010638, "learning_rate": 2.3265459667329178e-08, "loss": 0.85149014, "num_input_tokens_seen": 171158995, "step": 7924, "time_per_iteration": 2.8950395584106445 }, { "auxiliary_loss_clip": 0.01137456, "auxiliary_loss_mlp": 0.01022921, "balance_loss_clip": 1.04255676, "balance_loss_mlp": 1.01592898, "epoch": 0.9529249083147959, "flos": 18255010032000.0, "grad_norm": 2.3043803981460287, "language_loss": 0.86678237, "learning_rate": 2.31471389539708e-08, "loss": 0.88838619, "num_input_tokens_seen": 171176120, "step": 7925, "time_per_iteration": 2.6174850463867188 }, { "auxiliary_loss_clip": 0.01152606, "auxiliary_loss_mlp": 0.00761996, "balance_loss_clip": 1.04526424, "balance_loss_mlp": 1.00028992, "epoch": 0.953045151205435, "flos": 28658438985600.0, "grad_norm": 2.3200702662722876, "language_loss": 0.72603798, "learning_rate": 2.3029118127619872e-08, "loss": 0.74518406, "num_input_tokens_seen": 171195835, "step": 7926, "time_per_iteration": 2.714331865310669 }, { "auxiliary_loss_clip": 0.01128609, "auxiliary_loss_mlp": 0.01028823, "balance_loss_clip": 1.04150188, "balance_loss_mlp": 1.02176309, "epoch": 0.953165394096074, "flos": 21835232288640.0, "grad_norm": 2.0538696713358653, "language_loss": 0.87076724, "learning_rate": 2.2911397206179628e-08, "loss": 0.89234149, "num_input_tokens_seen": 171212585, "step": 7927, "time_per_iteration": 2.653094530105591 }, { "auxiliary_loss_clip": 0.01162579, "auxiliary_loss_mlp": 0.01023597, "balance_loss_clip": 1.04502308, "balance_loss_mlp": 1.01662934, "epoch": 0.9532856369867132, "flos": 19975059682560.0, "grad_norm": 1.8762704649059068, "language_loss": 0.62663239, "learning_rate": 2.279397620750845e-08, "loss": 0.64849412, "num_input_tokens_seen": 171231630, "step": 7928, "time_per_iteration": 2.59883975982666 }, { "auxiliary_loss_clip": 0.01131378, "auxiliary_loss_mlp": 0.01022971, "balance_loss_clip": 1.03955197, "balance_loss_mlp": 1.01611936, "epoch": 0.9534058798773523, "flos": 15049588239360.0, "grad_norm": 1.8945873038632957, "language_loss": 0.78603816, "learning_rate": 2.2676855149419195e-08, "loss": 0.80758166, "num_input_tokens_seen": 171248800, "step": 7929, "time_per_iteration": 2.6418237686157227 }, { "auxiliary_loss_clip": 0.01133361, "auxiliary_loss_mlp": 0.01022976, "balance_loss_clip": 1.04684055, "balance_loss_mlp": 1.01637745, "epoch": 0.9535261227679913, "flos": 17602800831360.0, "grad_norm": 2.4995334419944126, "language_loss": 0.75563341, "learning_rate": 2.2560034049678988e-08, "loss": 0.77719676, "num_input_tokens_seen": 171263150, "step": 7930, "time_per_iteration": 3.5574288368225098 }, { "auxiliary_loss_clip": 0.01170618, "auxiliary_loss_mlp": 0.01023407, "balance_loss_clip": 1.04869223, "balance_loss_mlp": 1.01597452, "epoch": 0.9536463656586305, "flos": 23142954741120.0, "grad_norm": 1.6583857749583228, "language_loss": 0.75529075, "learning_rate": 2.2443512926008988e-08, "loss": 0.77723098, "num_input_tokens_seen": 171282480, "step": 7931, "time_per_iteration": 2.5710456371307373 }, { "auxiliary_loss_clip": 0.01124106, "auxiliary_loss_mlp": 0.01025517, "balance_loss_clip": 1.03970385, "balance_loss_mlp": 1.01855779, "epoch": 0.9537666085492695, "flos": 18625033987200.0, "grad_norm": 2.258827753687845, "language_loss": 0.69759595, "learning_rate": 2.2327291796085946e-08, "loss": 0.71909219, "num_input_tokens_seen": 171300840, "step": 7932, "time_per_iteration": 2.645685911178589 }, { "auxiliary_loss_clip": 0.01164962, "auxiliary_loss_mlp": 0.01025305, "balance_loss_clip": 1.04439521, "balance_loss_mlp": 1.01808715, "epoch": 0.9538868514399086, "flos": 18989347680000.0, "grad_norm": 2.8837468984512205, "language_loss": 0.77756137, "learning_rate": 2.2211370677540197e-08, "loss": 0.79946405, "num_input_tokens_seen": 171317365, "step": 7933, "time_per_iteration": 3.5404863357543945 }, { "auxiliary_loss_clip": 0.01166636, "auxiliary_loss_mlp": 0.01027581, "balance_loss_clip": 1.04581881, "balance_loss_mlp": 1.0206902, "epoch": 0.9540070943305478, "flos": 16800556521600.0, "grad_norm": 3.156107640579223, "language_loss": 0.77946281, "learning_rate": 2.2095749587957012e-08, "loss": 0.80140501, "num_input_tokens_seen": 171335270, "step": 7934, "time_per_iteration": 3.50911545753479 }, { "auxiliary_loss_clip": 0.01133819, "auxiliary_loss_mlp": 0.01024697, "balance_loss_clip": 1.03966522, "balance_loss_mlp": 1.0174427, "epoch": 0.9541273372211868, "flos": 20156911263360.0, "grad_norm": 2.0470806587604344, "language_loss": 0.69336867, "learning_rate": 2.1980428544876138e-08, "loss": 0.7149539, "num_input_tokens_seen": 171353910, "step": 7935, "time_per_iteration": 2.682501792907715 }, { "auxiliary_loss_clip": 0.01100803, "auxiliary_loss_mlp": 0.01028209, "balance_loss_clip": 1.03462195, "balance_loss_mlp": 1.02115726, "epoch": 0.9542475801118259, "flos": 26725511381760.0, "grad_norm": 1.7201135275053703, "language_loss": 0.74331832, "learning_rate": 2.1865407565791584e-08, "loss": 0.76460844, "num_input_tokens_seen": 171375480, "step": 7936, "time_per_iteration": 2.755011558532715 }, { "auxiliary_loss_clip": 0.01135769, "auxiliary_loss_mlp": 0.01024413, "balance_loss_clip": 1.04053283, "balance_loss_mlp": 1.01682258, "epoch": 0.954367823002465, "flos": 23330911633920.0, "grad_norm": 2.777596555837622, "language_loss": 0.77361274, "learning_rate": 2.175068666815183e-08, "loss": 0.79521459, "num_input_tokens_seen": 171396320, "step": 7937, "time_per_iteration": 2.640911340713501 }, { "auxiliary_loss_clip": 0.01123746, "auxiliary_loss_mlp": 0.010245, "balance_loss_clip": 1.04210246, "balance_loss_mlp": 1.01693654, "epoch": 0.9544880658931041, "flos": 14902713527040.0, "grad_norm": 2.1849710114076806, "language_loss": 0.78855419, "learning_rate": 2.163626586935985e-08, "loss": 0.81003666, "num_input_tokens_seen": 171412860, "step": 7938, "time_per_iteration": 3.619279146194458 }, { "auxiliary_loss_clip": 0.01146062, "auxiliary_loss_mlp": 0.01026425, "balance_loss_clip": 1.04113889, "balance_loss_mlp": 1.01906657, "epoch": 0.9546083087837431, "flos": 29095902725760.0, "grad_norm": 1.8192039694937527, "language_loss": 0.63126552, "learning_rate": 2.1522145186773755e-08, "loss": 0.6529904, "num_input_tokens_seen": 171431780, "step": 7939, "time_per_iteration": 2.688520908355713 }, { "auxiliary_loss_clip": 0.01134759, "auxiliary_loss_mlp": 0.01022675, "balance_loss_clip": 1.04232836, "balance_loss_mlp": 1.01653922, "epoch": 0.9547285516743822, "flos": 21142335957120.0, "grad_norm": 2.554043643452691, "language_loss": 0.85471243, "learning_rate": 2.140832463770481e-08, "loss": 0.87628675, "num_input_tokens_seen": 171450975, "step": 7940, "time_per_iteration": 2.6702940464019775 }, { "auxiliary_loss_clip": 0.01139684, "auxiliary_loss_mlp": 0.01027219, "balance_loss_clip": 1.04179168, "balance_loss_mlp": 1.01990891, "epoch": 0.9548487945650214, "flos": 27490157130240.0, "grad_norm": 2.3593776291491713, "language_loss": 0.76135528, "learning_rate": 2.129480423941987e-08, "loss": 0.78302431, "num_input_tokens_seen": 171467645, "step": 7941, "time_per_iteration": 2.7260656356811523 }, { "auxiliary_loss_clip": 0.01142977, "auxiliary_loss_mlp": 0.0102137, "balance_loss_clip": 1.0451771, "balance_loss_mlp": 1.01454544, "epoch": 0.9549690374556604, "flos": 22273198819200.0, "grad_norm": 1.6271166189774162, "language_loss": 0.80155623, "learning_rate": 2.1181584009140052e-08, "loss": 0.82319975, "num_input_tokens_seen": 171487185, "step": 7942, "time_per_iteration": 2.6485323905944824 }, { "auxiliary_loss_clip": 0.01130478, "auxiliary_loss_mlp": 0.01023288, "balance_loss_clip": 1.04138196, "balance_loss_mlp": 1.01640391, "epoch": 0.9550892803462995, "flos": 17595294888960.0, "grad_norm": 1.893950385903907, "language_loss": 0.83715129, "learning_rate": 2.10686639640405e-08, "loss": 0.85868895, "num_input_tokens_seen": 171501275, "step": 7943, "time_per_iteration": 2.6878015995025635 }, { "auxiliary_loss_clip": 0.01155556, "auxiliary_loss_mlp": 0.01023434, "balance_loss_clip": 1.04551613, "balance_loss_mlp": 1.01671696, "epoch": 0.9552095232369386, "flos": 24353144789760.0, "grad_norm": 1.7115391463137746, "language_loss": 0.81409615, "learning_rate": 2.0956044121251294e-08, "loss": 0.83588606, "num_input_tokens_seen": 171520060, "step": 7944, "time_per_iteration": 2.6297781467437744 }, { "auxiliary_loss_clip": 0.01121167, "auxiliary_loss_mlp": 0.01029629, "balance_loss_clip": 1.04309309, "balance_loss_mlp": 1.02198458, "epoch": 0.9553297661275777, "flos": 22746860490240.0, "grad_norm": 2.116911083868702, "language_loss": 0.81081831, "learning_rate": 2.084372449785654e-08, "loss": 0.83232629, "num_input_tokens_seen": 171539895, "step": 7945, "time_per_iteration": 2.7179758548736572 }, { "auxiliary_loss_clip": 0.01134215, "auxiliary_loss_mlp": 0.01024025, "balance_loss_clip": 1.04244399, "balance_loss_mlp": 1.01715302, "epoch": 0.9554500090182168, "flos": 15413866018560.0, "grad_norm": 1.6105062475312109, "language_loss": 0.68888021, "learning_rate": 2.0731705110895282e-08, "loss": 0.71046257, "num_input_tokens_seen": 171557385, "step": 7946, "time_per_iteration": 2.6266591548919678 }, { "auxiliary_loss_clip": 0.01154655, "auxiliary_loss_mlp": 0.01026036, "balance_loss_clip": 1.0462513, "balance_loss_mlp": 1.01883292, "epoch": 0.9555702519088559, "flos": 23513517400320.0, "grad_norm": 2.1137727344909845, "language_loss": 0.86586177, "learning_rate": 2.0619985977360587e-08, "loss": 0.88766867, "num_input_tokens_seen": 171575705, "step": 7947, "time_per_iteration": 2.646944761276245 }, { "auxiliary_loss_clip": 0.01108077, "auxiliary_loss_mlp": 0.010238, "balance_loss_clip": 1.03644514, "balance_loss_mlp": 1.01701081, "epoch": 0.955690494799495, "flos": 22962072827520.0, "grad_norm": 1.7858859910691471, "language_loss": 0.76840699, "learning_rate": 2.0508567114200237e-08, "loss": 0.78972578, "num_input_tokens_seen": 171595620, "step": 7948, "time_per_iteration": 2.7115275859832764 }, { "auxiliary_loss_clip": 0.01139489, "auxiliary_loss_mlp": 0.01023561, "balance_loss_clip": 1.04204535, "balance_loss_mlp": 1.017079, "epoch": 0.955810737690134, "flos": 26031250333440.0, "grad_norm": 2.5869060511988726, "language_loss": 0.78903323, "learning_rate": 2.0397448538316485e-08, "loss": 0.8106637, "num_input_tokens_seen": 171616660, "step": 7949, "time_per_iteration": 2.682366371154785 }, { "auxiliary_loss_clip": 0.01117272, "auxiliary_loss_mlp": 0.0102672, "balance_loss_clip": 1.04017138, "balance_loss_mlp": 1.01943016, "epoch": 0.9559309805807732, "flos": 20849951249280.0, "grad_norm": 2.570137211363715, "language_loss": 0.67008114, "learning_rate": 2.028663026656563e-08, "loss": 0.69152105, "num_input_tokens_seen": 171635515, "step": 7950, "time_per_iteration": 2.77748441696167 }, { "auxiliary_loss_clip": 0.01161469, "auxiliary_loss_mlp": 0.00762199, "balance_loss_clip": 1.04451871, "balance_loss_mlp": 1.00032401, "epoch": 0.9560512234714122, "flos": 21578219498880.0, "grad_norm": 1.967345902086176, "language_loss": 0.72083783, "learning_rate": 2.0176112315758885e-08, "loss": 0.74007446, "num_input_tokens_seen": 171653305, "step": 7951, "time_per_iteration": 2.5791947841644287 }, { "auxiliary_loss_clip": 0.01116054, "auxiliary_loss_mlp": 0.01023091, "balance_loss_clip": 1.04031146, "balance_loss_mlp": 1.01629853, "epoch": 0.9561714663620513, "flos": 17450144029440.0, "grad_norm": 2.576858704771275, "language_loss": 0.69054985, "learning_rate": 2.0065894702661957e-08, "loss": 0.7119413, "num_input_tokens_seen": 171669980, "step": 7952, "time_per_iteration": 2.804614305496216 }, { "auxiliary_loss_clip": 0.01113722, "auxiliary_loss_mlp": 0.00761983, "balance_loss_clip": 1.03823471, "balance_loss_mlp": 1.00030136, "epoch": 0.9562917092526905, "flos": 26098510550400.0, "grad_norm": 1.865450349924282, "language_loss": 0.77898192, "learning_rate": 1.9955977443994577e-08, "loss": 0.79773897, "num_input_tokens_seen": 171689970, "step": 7953, "time_per_iteration": 2.6989500522613525 }, { "auxiliary_loss_clip": 0.01134742, "auxiliary_loss_mlp": 0.01029195, "balance_loss_clip": 1.04228652, "balance_loss_mlp": 1.02152991, "epoch": 0.9564119521433295, "flos": 24096742531200.0, "grad_norm": 2.265582349287895, "language_loss": 0.62432325, "learning_rate": 1.9846360556430965e-08, "loss": 0.64596266, "num_input_tokens_seen": 171708270, "step": 7954, "time_per_iteration": 2.6592764854431152 }, { "auxiliary_loss_clip": 0.01164635, "auxiliary_loss_mlp": 0.01026773, "balance_loss_clip": 1.04522729, "balance_loss_mlp": 1.01981139, "epoch": 0.9565321950339686, "flos": 32008903896960.0, "grad_norm": 2.2689334426236023, "language_loss": 0.61949772, "learning_rate": 1.973704405660004e-08, "loss": 0.64141178, "num_input_tokens_seen": 171729385, "step": 7955, "time_per_iteration": 2.643402338027954 }, { "auxiliary_loss_clip": 0.01093017, "auxiliary_loss_mlp": 0.01025781, "balance_loss_clip": 1.03783703, "balance_loss_mlp": 1.01883125, "epoch": 0.9566524379246077, "flos": 23588642695680.0, "grad_norm": 1.837561897287415, "language_loss": 0.77990878, "learning_rate": 1.9628027961085203e-08, "loss": 0.8010968, "num_input_tokens_seen": 171752615, "step": 7956, "time_per_iteration": 3.714174747467041 }, { "auxiliary_loss_clip": 0.01109381, "auxiliary_loss_mlp": 0.01029593, "balance_loss_clip": 1.03639674, "balance_loss_mlp": 1.02271497, "epoch": 0.9567726808152468, "flos": 38067716240640.0, "grad_norm": 1.8612571163574678, "language_loss": 0.83936429, "learning_rate": 1.9519312286423894e-08, "loss": 0.86075401, "num_input_tokens_seen": 171775810, "step": 7957, "time_per_iteration": 2.8357603549957275 }, { "auxiliary_loss_clip": 0.011489, "auxiliary_loss_mlp": 0.01018759, "balance_loss_clip": 1.04546845, "balance_loss_mlp": 1.0119133, "epoch": 0.9568929237058859, "flos": 22744059229440.0, "grad_norm": 3.077918035807891, "language_loss": 0.77598387, "learning_rate": 1.9410897049108255e-08, "loss": 0.79766047, "num_input_tokens_seen": 171795090, "step": 7958, "time_per_iteration": 2.684954881668091 }, { "auxiliary_loss_clip": 0.01173368, "auxiliary_loss_mlp": 0.01030786, "balance_loss_clip": 1.0500133, "balance_loss_mlp": 1.02299857, "epoch": 0.957013166596525, "flos": 23841633162240.0, "grad_norm": 9.349232641071948, "language_loss": 0.91332757, "learning_rate": 1.9302782265584905e-08, "loss": 0.93536913, "num_input_tokens_seen": 171815755, "step": 7959, "time_per_iteration": 3.577113389968872 }, { "auxiliary_loss_clip": 0.01098741, "auxiliary_loss_mlp": 0.01026395, "balance_loss_clip": 1.04040051, "balance_loss_mlp": 1.0194689, "epoch": 0.9571334094871641, "flos": 17639286071040.0, "grad_norm": 2.526377313424507, "language_loss": 0.87095004, "learning_rate": 1.9194967952254282e-08, "loss": 0.89220142, "num_input_tokens_seen": 171834330, "step": 7960, "time_per_iteration": 3.593712329864502 }, { "auxiliary_loss_clip": 0.01151683, "auxiliary_loss_mlp": 0.01020676, "balance_loss_clip": 1.04584432, "balance_loss_mlp": 1.01390767, "epoch": 0.9572536523778031, "flos": 15369623441280.0, "grad_norm": 3.4499541938474354, "language_loss": 0.804088, "learning_rate": 1.9087454125472635e-08, "loss": 0.82581162, "num_input_tokens_seen": 171848805, "step": 7961, "time_per_iteration": 2.601853370666504 }, { "auxiliary_loss_clip": 0.01166601, "auxiliary_loss_mlp": 0.01026958, "balance_loss_clip": 1.0459491, "balance_loss_mlp": 1.01954961, "epoch": 0.9573738952684423, "flos": 24969838417920.0, "grad_norm": 1.9441228474075551, "language_loss": 0.7898258, "learning_rate": 1.8980240801548696e-08, "loss": 0.81176144, "num_input_tokens_seen": 171867995, "step": 7962, "time_per_iteration": 2.5859153270721436 }, { "auxiliary_loss_clip": 0.01135575, "auxiliary_loss_mlp": 0.01023922, "balance_loss_clip": 1.04587114, "balance_loss_mlp": 1.01661491, "epoch": 0.9574941381590814, "flos": 25769461034880.0, "grad_norm": 1.6831816995819224, "language_loss": 0.74123371, "learning_rate": 1.8873327996747458e-08, "loss": 0.76282871, "num_input_tokens_seen": 171886495, "step": 7963, "time_per_iteration": 2.749119997024536 }, { "auxiliary_loss_clip": 0.01152934, "auxiliary_loss_mlp": 0.01030868, "balance_loss_clip": 1.04279149, "balance_loss_mlp": 1.02359676, "epoch": 0.9576143810497204, "flos": 32307178435200.0, "grad_norm": 1.7037481979176172, "language_loss": 0.65877098, "learning_rate": 1.8766715727287053e-08, "loss": 0.68060899, "num_input_tokens_seen": 171908200, "step": 7964, "time_per_iteration": 3.6807610988616943 }, { "auxiliary_loss_clip": 0.01155225, "auxiliary_loss_mlp": 0.00762037, "balance_loss_clip": 1.04367316, "balance_loss_mlp": 1.00032282, "epoch": 0.9577346239403596, "flos": 27745733376000.0, "grad_norm": 1.6066808825284544, "language_loss": 0.79603362, "learning_rate": 1.8660404009340546e-08, "loss": 0.81520617, "num_input_tokens_seen": 171928650, "step": 7965, "time_per_iteration": 2.642902135848999 }, { "auxiliary_loss_clip": 0.01054341, "auxiliary_loss_mlp": 0.01002992, "balance_loss_clip": 1.00972164, "balance_loss_mlp": 1.00194252, "epoch": 0.9578548668309986, "flos": 57468313710720.0, "grad_norm": 0.8891598496903537, "language_loss": 0.59491438, "learning_rate": 1.8554392859035485e-08, "loss": 0.61548769, "num_input_tokens_seen": 171986400, "step": 7966, "time_per_iteration": 3.209813117980957 }, { "auxiliary_loss_clip": 0.01083465, "auxiliary_loss_mlp": 0.01021541, "balance_loss_clip": 1.03600955, "balance_loss_mlp": 1.01448989, "epoch": 0.9579751097216377, "flos": 19756040503680.0, "grad_norm": 2.0209124305783197, "language_loss": 0.7890569, "learning_rate": 1.8448682292453444e-08, "loss": 0.81010687, "num_input_tokens_seen": 172005475, "step": 7967, "time_per_iteration": 2.7390992641448975 }, { "auxiliary_loss_clip": 0.01163366, "auxiliary_loss_mlp": 0.01023869, "balance_loss_clip": 1.04489541, "balance_loss_mlp": 1.0172534, "epoch": 0.9580953526122769, "flos": 18041270152320.0, "grad_norm": 1.6639292591412034, "language_loss": 0.66089725, "learning_rate": 1.8343272325631154e-08, "loss": 0.68276966, "num_input_tokens_seen": 172024420, "step": 7968, "time_per_iteration": 2.6867029666900635 }, { "auxiliary_loss_clip": 0.01085371, "auxiliary_loss_mlp": 0.00762467, "balance_loss_clip": 1.03813207, "balance_loss_mlp": 1.00033879, "epoch": 0.9582155955029159, "flos": 24270154416000.0, "grad_norm": 2.5464879130510307, "language_loss": 0.78319293, "learning_rate": 1.8238162974558492e-08, "loss": 0.80167127, "num_input_tokens_seen": 172038350, "step": 7969, "time_per_iteration": 2.801637887954712 }, { "auxiliary_loss_clip": 0.01133874, "auxiliary_loss_mlp": 0.01020667, "balance_loss_clip": 1.04277575, "balance_loss_mlp": 1.01396155, "epoch": 0.958335838393555, "flos": 22783309816320.0, "grad_norm": 1.9692470266974884, "language_loss": 0.74794221, "learning_rate": 1.8133354255181144e-08, "loss": 0.76948762, "num_input_tokens_seen": 172058665, "step": 7970, "time_per_iteration": 2.6603810787200928 }, { "auxiliary_loss_clip": 0.01145792, "auxiliary_loss_mlp": 0.01024424, "balance_loss_clip": 1.04238987, "balance_loss_mlp": 1.01788807, "epoch": 0.958456081284194, "flos": 16911484698240.0, "grad_norm": 1.839437656371464, "language_loss": 0.74608666, "learning_rate": 1.802884618339795e-08, "loss": 0.76778883, "num_input_tokens_seen": 172077470, "step": 7971, "time_per_iteration": 2.652435779571533 }, { "auxiliary_loss_clip": 0.01154154, "auxiliary_loss_mlp": 0.01021657, "balance_loss_clip": 1.0461545, "balance_loss_mlp": 1.01439762, "epoch": 0.9585763241748332, "flos": 19974951941760.0, "grad_norm": 3.5693207914651413, "language_loss": 0.8092317, "learning_rate": 1.7924638775062894e-08, "loss": 0.83098984, "num_input_tokens_seen": 172096590, "step": 7972, "time_per_iteration": 2.6020991802215576 }, { "auxiliary_loss_clip": 0.01115683, "auxiliary_loss_mlp": 0.01026111, "balance_loss_clip": 1.04120541, "balance_loss_mlp": 1.01955748, "epoch": 0.9586965670654722, "flos": 21395649646080.0, "grad_norm": 1.8516435253301873, "language_loss": 0.81781358, "learning_rate": 1.7820732045984444e-08, "loss": 0.83923149, "num_input_tokens_seen": 172116735, "step": 7973, "time_per_iteration": 2.6952946186065674 }, { "auxiliary_loss_clip": 0.01148949, "auxiliary_loss_mlp": 0.01029607, "balance_loss_clip": 1.04339683, "balance_loss_mlp": 1.02139318, "epoch": 0.9588168099561113, "flos": 21435115714560.0, "grad_norm": 2.2627666653327085, "language_loss": 0.74097651, "learning_rate": 1.7717126011924655e-08, "loss": 0.76276207, "num_input_tokens_seen": 172138320, "step": 7974, "time_per_iteration": 2.7159955501556396 }, { "auxiliary_loss_clip": 0.01100178, "auxiliary_loss_mlp": 0.01025243, "balance_loss_clip": 1.03521824, "balance_loss_mlp": 1.01772666, "epoch": 0.9589370528467505, "flos": 11763761852160.0, "grad_norm": 3.632036312309814, "language_loss": 0.76487684, "learning_rate": 1.7613820688600957e-08, "loss": 0.78613108, "num_input_tokens_seen": 172154225, "step": 7975, "time_per_iteration": 2.664675235748291 }, { "auxiliary_loss_clip": 0.01141906, "auxiliary_loss_mlp": 0.0102551, "balance_loss_clip": 1.04179764, "balance_loss_mlp": 1.01886725, "epoch": 0.9590572957373895, "flos": 23441516588160.0, "grad_norm": 1.9131523148593965, "language_loss": 0.78517592, "learning_rate": 1.7510816091684588e-08, "loss": 0.80685008, "num_input_tokens_seen": 172174150, "step": 7976, "time_per_iteration": 2.64736008644104 }, { "auxiliary_loss_clip": 0.01137704, "auxiliary_loss_mlp": 0.01023919, "balance_loss_clip": 1.04275465, "balance_loss_mlp": 1.01585412, "epoch": 0.9591775386280286, "flos": 22528272274560.0, "grad_norm": 2.627351993178401, "language_loss": 0.78693974, "learning_rate": 1.740811223680083e-08, "loss": 0.80855596, "num_input_tokens_seen": 172191005, "step": 7977, "time_per_iteration": 2.6496148109436035 }, { "auxiliary_loss_clip": 0.01164076, "auxiliary_loss_mlp": 0.01025097, "balance_loss_clip": 1.04376721, "balance_loss_mlp": 1.01816797, "epoch": 0.9592977815186677, "flos": 18186959715840.0, "grad_norm": 2.1560047048482254, "language_loss": 0.74194109, "learning_rate": 1.7305709139530334e-08, "loss": 0.76383281, "num_input_tokens_seen": 172209785, "step": 7978, "time_per_iteration": 2.5967350006103516 }, { "auxiliary_loss_clip": 0.01142963, "auxiliary_loss_mlp": 0.01027499, "balance_loss_clip": 1.03999114, "balance_loss_mlp": 1.02025151, "epoch": 0.9594180244093068, "flos": 16537797555840.0, "grad_norm": 2.2428925279422782, "language_loss": 0.74721503, "learning_rate": 1.7203606815407334e-08, "loss": 0.76891959, "num_input_tokens_seen": 172224380, "step": 7979, "time_per_iteration": 2.691249370574951 }, { "auxiliary_loss_clip": 0.01141145, "auxiliary_loss_mlp": 0.01028128, "balance_loss_clip": 1.04366446, "balance_loss_mlp": 1.0208447, "epoch": 0.9595382672999458, "flos": 20554334317440.0, "grad_norm": 1.781692717624645, "language_loss": 0.79113424, "learning_rate": 1.7101805279920557e-08, "loss": 0.81282699, "num_input_tokens_seen": 172242540, "step": 7980, "time_per_iteration": 2.6922121047973633 }, { "auxiliary_loss_clip": 0.01167407, "auxiliary_loss_mlp": 0.01023756, "balance_loss_clip": 1.04842281, "balance_loss_mlp": 1.01664853, "epoch": 0.959658510190585, "flos": 22638266697600.0, "grad_norm": 2.8286937508406833, "language_loss": 0.80885559, "learning_rate": 1.7000304548513643e-08, "loss": 0.83076727, "num_input_tokens_seen": 172262645, "step": 7981, "time_per_iteration": 2.6083686351776123 }, { "auxiliary_loss_clip": 0.01119429, "auxiliary_loss_mlp": 0.01026052, "balance_loss_clip": 1.03989744, "balance_loss_mlp": 1.01904213, "epoch": 0.9597787530812241, "flos": 19135252725120.0, "grad_norm": 2.1111562870176277, "language_loss": 0.82907569, "learning_rate": 1.6899104636583394e-08, "loss": 0.85053051, "num_input_tokens_seen": 172280695, "step": 7982, "time_per_iteration": 3.6461403369903564 }, { "auxiliary_loss_clip": 0.01054613, "auxiliary_loss_mlp": 0.01001313, "balance_loss_clip": 1.00931883, "balance_loss_mlp": 1.00031722, "epoch": 0.9598989959718631, "flos": 60098124055680.0, "grad_norm": 0.7469289332973486, "language_loss": 0.61889464, "learning_rate": 1.6798205559482638e-08, "loss": 0.63945395, "num_input_tokens_seen": 172343075, "step": 7983, "time_per_iteration": 3.3206589221954346 }, { "auxiliary_loss_clip": 0.01121098, "auxiliary_loss_mlp": 0.01024163, "balance_loss_clip": 1.04040742, "balance_loss_mlp": 1.01703167, "epoch": 0.9600192388625023, "flos": 20886795624960.0, "grad_norm": 2.218078167680888, "language_loss": 0.76757663, "learning_rate": 1.669760733251713e-08, "loss": 0.78902924, "num_input_tokens_seen": 172361950, "step": 7984, "time_per_iteration": 2.692739486694336 }, { "auxiliary_loss_clip": 0.01101996, "auxiliary_loss_mlp": 0.0102027, "balance_loss_clip": 1.03789282, "balance_loss_mlp": 1.01336741, "epoch": 0.9601394817531413, "flos": 20445740524800.0, "grad_norm": 1.710721910445185, "language_loss": 0.82278615, "learning_rate": 1.659730997094755e-08, "loss": 0.8440088, "num_input_tokens_seen": 172380440, "step": 7985, "time_per_iteration": 3.722456216812134 }, { "auxiliary_loss_clip": 0.0114326, "auxiliary_loss_mlp": 0.01025615, "balance_loss_clip": 1.04217744, "balance_loss_mlp": 1.01867414, "epoch": 0.9602597246437804, "flos": 21507152440320.0, "grad_norm": 1.7766881293499743, "language_loss": 0.62159342, "learning_rate": 1.6497313489989283e-08, "loss": 0.64328218, "num_input_tokens_seen": 172400265, "step": 7986, "time_per_iteration": 3.4650235176086426 }, { "auxiliary_loss_clip": 0.01105785, "auxiliary_loss_mlp": 0.01025749, "balance_loss_clip": 1.03422689, "balance_loss_mlp": 1.01836455, "epoch": 0.9603799675344196, "flos": 29935099152000.0, "grad_norm": 2.368701321346479, "language_loss": 0.70235276, "learning_rate": 1.639761790481131e-08, "loss": 0.7236681, "num_input_tokens_seen": 172421145, "step": 7987, "time_per_iteration": 2.789889097213745 }, { "auxiliary_loss_clip": 0.01154169, "auxiliary_loss_mlp": 0.01020458, "balance_loss_clip": 1.04555631, "balance_loss_mlp": 1.01361823, "epoch": 0.9605002104250586, "flos": 28001525103360.0, "grad_norm": 1.926154866188746, "language_loss": 0.79087281, "learning_rate": 1.6298223230537754e-08, "loss": 0.81261909, "num_input_tokens_seen": 172438945, "step": 7988, "time_per_iteration": 2.653923988342285 }, { "auxiliary_loss_clip": 0.01137804, "auxiliary_loss_mlp": 0.00762261, "balance_loss_clip": 1.04367614, "balance_loss_mlp": 1.00030327, "epoch": 0.9606204533156977, "flos": 35590490870400.0, "grad_norm": 2.0013344973134273, "language_loss": 0.69663686, "learning_rate": 1.619912948224611e-08, "loss": 0.71563751, "num_input_tokens_seen": 172460150, "step": 7989, "time_per_iteration": 2.765037775039673 }, { "auxiliary_loss_clip": 0.01118318, "auxiliary_loss_mlp": 0.01028363, "balance_loss_clip": 1.04088926, "balance_loss_mlp": 1.02146065, "epoch": 0.9607406962063368, "flos": 26574614346240.0, "grad_norm": 2.8015594465537212, "language_loss": 0.61202389, "learning_rate": 1.6100336674969682e-08, "loss": 0.63349068, "num_input_tokens_seen": 172478990, "step": 7990, "time_per_iteration": 3.6251304149627686 }, { "auxiliary_loss_clip": 0.01110073, "auxiliary_loss_mlp": 0.01025445, "balance_loss_clip": 1.03625524, "balance_loss_mlp": 1.01818514, "epoch": 0.9608609390969759, "flos": 25331781813120.0, "grad_norm": 2.010670085338739, "language_loss": 0.77040446, "learning_rate": 1.600184482369449e-08, "loss": 0.79175961, "num_input_tokens_seen": 172498905, "step": 7991, "time_per_iteration": 2.822733163833618 }, { "auxiliary_loss_clip": 0.01123903, "auxiliary_loss_mlp": 0.01022965, "balance_loss_clip": 1.04003131, "balance_loss_mlp": 1.01544857, "epoch": 0.960981181987615, "flos": 21069114082560.0, "grad_norm": 3.0632010564472405, "language_loss": 0.88919371, "learning_rate": 1.5903653943362126e-08, "loss": 0.91066241, "num_input_tokens_seen": 172517900, "step": 7992, "time_per_iteration": 2.7203872203826904 }, { "auxiliary_loss_clip": 0.01137873, "auxiliary_loss_mlp": 0.01022159, "balance_loss_clip": 1.04282689, "balance_loss_mlp": 1.01568937, "epoch": 0.9611014248782541, "flos": 17823256554240.0, "grad_norm": 1.8198943403964072, "language_loss": 0.77125287, "learning_rate": 1.580576404886802e-08, "loss": 0.79285312, "num_input_tokens_seen": 172536430, "step": 7993, "time_per_iteration": 2.624990701675415 }, { "auxiliary_loss_clip": 0.01151069, "auxiliary_loss_mlp": 0.01025915, "balance_loss_clip": 1.04448247, "balance_loss_mlp": 1.01963902, "epoch": 0.9612216677688932, "flos": 19354631040000.0, "grad_norm": 2.0825244820862183, "language_loss": 0.79749072, "learning_rate": 1.570817515506162e-08, "loss": 0.8192606, "num_input_tokens_seen": 172555120, "step": 7994, "time_per_iteration": 2.6581501960754395 }, { "auxiliary_loss_clip": 0.01162467, "auxiliary_loss_mlp": 0.01023089, "balance_loss_clip": 1.04585433, "balance_loss_mlp": 1.01629972, "epoch": 0.9613419106595322, "flos": 15808739207040.0, "grad_norm": 1.8995620997196834, "language_loss": 0.8106066, "learning_rate": 1.561088727674753e-08, "loss": 0.83246219, "num_input_tokens_seen": 172569330, "step": 7995, "time_per_iteration": 2.5471107959747314 }, { "auxiliary_loss_clip": 0.01122576, "auxiliary_loss_mlp": 0.01027639, "balance_loss_clip": 1.04143262, "balance_loss_mlp": 1.01967645, "epoch": 0.9614621535501714, "flos": 25702488126720.0, "grad_norm": 2.320742748792102, "language_loss": 0.71302617, "learning_rate": 1.551390042868417e-08, "loss": 0.7345283, "num_input_tokens_seen": 172591100, "step": 7996, "time_per_iteration": 2.8054592609405518 }, { "auxiliary_loss_clip": 0.01151149, "auxiliary_loss_mlp": 0.01024995, "balance_loss_clip": 1.04468036, "balance_loss_mlp": 1.01840234, "epoch": 0.9615823964408104, "flos": 17819054663040.0, "grad_norm": 1.8328745006846003, "language_loss": 0.70718777, "learning_rate": 1.5417214625584207e-08, "loss": 0.72894925, "num_input_tokens_seen": 172608755, "step": 7997, "time_per_iteration": 2.6152281761169434 }, { "auxiliary_loss_clip": 0.01144638, "auxiliary_loss_mlp": 0.01022239, "balance_loss_clip": 1.04149127, "balance_loss_mlp": 1.01549506, "epoch": 0.9617026393314495, "flos": 20190020624640.0, "grad_norm": 2.0073348714557024, "language_loss": 0.85418004, "learning_rate": 1.5320829882114806e-08, "loss": 0.87584883, "num_input_tokens_seen": 172626830, "step": 7998, "time_per_iteration": 2.6629140377044678 }, { "auxiliary_loss_clip": 0.01163439, "auxiliary_loss_mlp": 0.01025701, "balance_loss_clip": 1.04311359, "balance_loss_mlp": 1.01879561, "epoch": 0.9618228822220887, "flos": 20267013427200.0, "grad_norm": 1.8163262547604877, "language_loss": 0.7921797, "learning_rate": 1.5224746212897378e-08, "loss": 0.81407112, "num_input_tokens_seen": 172646125, "step": 7999, "time_per_iteration": 2.5940558910369873 }, { "auxiliary_loss_clip": 0.01162942, "auxiliary_loss_mlp": 0.01023302, "balance_loss_clip": 1.04655933, "balance_loss_mlp": 1.01624799, "epoch": 0.9619431251127277, "flos": 21031300039680.0, "grad_norm": 1.8112563599140874, "language_loss": 0.77298546, "learning_rate": 1.512896363250804e-08, "loss": 0.79484791, "num_input_tokens_seen": 172666235, "step": 8000, "time_per_iteration": 2.582535743713379 }, { "auxiliary_loss_clip": 0.01150253, "auxiliary_loss_mlp": 0.01023099, "balance_loss_clip": 1.04194808, "balance_loss_mlp": 1.01643503, "epoch": 0.9620633680033668, "flos": 22382654538240.0, "grad_norm": 1.7768613702076068, "language_loss": 0.75595975, "learning_rate": 1.503348215547673e-08, "loss": 0.77769333, "num_input_tokens_seen": 172687325, "step": 8001, "time_per_iteration": 2.6494333744049072 }, { "auxiliary_loss_clip": 0.01134913, "auxiliary_loss_mlp": 0.01019864, "balance_loss_clip": 1.04212213, "balance_loss_mlp": 1.01342082, "epoch": 0.962183610894006, "flos": 18471730740480.0, "grad_norm": 2.3705892872713057, "language_loss": 0.80750132, "learning_rate": 1.4938301796288078e-08, "loss": 0.82904911, "num_input_tokens_seen": 172703895, "step": 8002, "time_per_iteration": 2.6072072982788086 }, { "auxiliary_loss_clip": 0.0116593, "auxiliary_loss_mlp": 0.01023276, "balance_loss_clip": 1.0471977, "balance_loss_mlp": 1.01585484, "epoch": 0.962303853784645, "flos": 18435245500800.0, "grad_norm": 3.541091960720743, "language_loss": 0.81591272, "learning_rate": 1.4843422569380537e-08, "loss": 0.83780479, "num_input_tokens_seen": 172720650, "step": 8003, "time_per_iteration": 2.623051166534424 }, { "auxiliary_loss_clip": 0.01110566, "auxiliary_loss_mlp": 0.01023065, "balance_loss_clip": 1.03925347, "balance_loss_mlp": 1.01616585, "epoch": 0.9624240966752841, "flos": 26391074826240.0, "grad_norm": 1.8349393763503818, "language_loss": 0.82727313, "learning_rate": 1.4748844489147483e-08, "loss": 0.84860939, "num_input_tokens_seen": 172737640, "step": 8004, "time_per_iteration": 2.7408039569854736 }, { "auxiliary_loss_clip": 0.0113419, "auxiliary_loss_mlp": 0.01023357, "balance_loss_clip": 1.03928566, "balance_loss_mlp": 1.01658916, "epoch": 0.9625443395659231, "flos": 14647675985280.0, "grad_norm": 2.537741466259607, "language_loss": 0.71183968, "learning_rate": 1.4654567569936326e-08, "loss": 0.73341513, "num_input_tokens_seen": 172755215, "step": 8005, "time_per_iteration": 2.657639265060425 }, { "auxiliary_loss_clip": 0.01104182, "auxiliary_loss_mlp": 0.01028322, "balance_loss_clip": 1.03813744, "balance_loss_mlp": 1.02147651, "epoch": 0.9626645824565623, "flos": 18367626147840.0, "grad_norm": 2.2334991137656113, "language_loss": 0.83170491, "learning_rate": 1.456059182604874e-08, "loss": 0.85302997, "num_input_tokens_seen": 172774020, "step": 8006, "time_per_iteration": 2.7265191078186035 }, { "auxiliary_loss_clip": 0.01166513, "auxiliary_loss_mlp": 0.01025856, "balance_loss_clip": 1.04636157, "balance_loss_mlp": 1.01877475, "epoch": 0.9627848253472013, "flos": 16580424021120.0, "grad_norm": 1.8722201936962148, "language_loss": 0.76470441, "learning_rate": 1.4466917271740653e-08, "loss": 0.78662813, "num_input_tokens_seen": 172792220, "step": 8007, "time_per_iteration": 2.5845415592193604 }, { "auxiliary_loss_clip": 0.01134037, "auxiliary_loss_mlp": 0.01031014, "balance_loss_clip": 1.04188704, "balance_loss_mlp": 1.02339053, "epoch": 0.9629050682378404, "flos": 20886867452160.0, "grad_norm": 3.8124522565359285, "language_loss": 0.67631537, "learning_rate": 1.4373543921222697e-08, "loss": 0.69796586, "num_input_tokens_seen": 172811805, "step": 8008, "time_per_iteration": 3.5595290660858154 }, { "auxiliary_loss_clip": 0.01133305, "auxiliary_loss_mlp": 0.01025608, "balance_loss_clip": 1.0423975, "balance_loss_mlp": 1.01801729, "epoch": 0.9630253111284796, "flos": 17019252478080.0, "grad_norm": 2.029463318105329, "language_loss": 0.78006124, "learning_rate": 1.428047178865932e-08, "loss": 0.80165029, "num_input_tokens_seen": 172828595, "step": 8009, "time_per_iteration": 2.6777825355529785 }, { "auxiliary_loss_clip": 0.01132382, "auxiliary_loss_mlp": 0.01027452, "balance_loss_clip": 1.03803062, "balance_loss_mlp": 1.02010322, "epoch": 0.9631455540191186, "flos": 20338942412160.0, "grad_norm": 1.6202304805167445, "language_loss": 0.74572963, "learning_rate": 1.4187700888169451e-08, "loss": 0.76732796, "num_input_tokens_seen": 172847770, "step": 8010, "time_per_iteration": 3.7080330848693848 }, { "auxiliary_loss_clip": 0.01051711, "auxiliary_loss_mlp": 0.01002403, "balance_loss_clip": 1.00887561, "balance_loss_mlp": 1.00140727, "epoch": 0.9632657969097577, "flos": 65956700033280.0, "grad_norm": 0.7525406947577326, "language_loss": 0.56996626, "learning_rate": 1.40952312338265e-08, "loss": 0.59050739, "num_input_tokens_seen": 172912415, "step": 8011, "time_per_iteration": 3.285555601119995 }, { "auxiliary_loss_clip": 0.01124039, "auxiliary_loss_mlp": 0.01025275, "balance_loss_clip": 1.03929019, "balance_loss_mlp": 1.01828313, "epoch": 0.9633860398003968, "flos": 44419523823360.0, "grad_norm": 1.7748460554219798, "language_loss": 0.68407941, "learning_rate": 1.4003062839657909e-08, "loss": 0.70557249, "num_input_tokens_seen": 172934895, "step": 8012, "time_per_iteration": 3.77223801612854 }, { "auxiliary_loss_clip": 0.01127843, "auxiliary_loss_mlp": 0.01023175, "balance_loss_clip": 1.04232192, "balance_loss_mlp": 1.01612651, "epoch": 0.9635062826910359, "flos": 24827704300800.0, "grad_norm": 1.6230378002555563, "language_loss": 0.80023992, "learning_rate": 1.391119571964583e-08, "loss": 0.8217501, "num_input_tokens_seen": 172955835, "step": 8013, "time_per_iteration": 2.7802484035491943 }, { "auxiliary_loss_clip": 0.01149562, "auxiliary_loss_mlp": 0.01022892, "balance_loss_clip": 1.04519701, "balance_loss_mlp": 1.01613033, "epoch": 0.9636265255816749, "flos": 15961360095360.0, "grad_norm": 1.8674047931979112, "language_loss": 0.73369789, "learning_rate": 1.3819629887726225e-08, "loss": 0.75542241, "num_input_tokens_seen": 172973925, "step": 8014, "time_per_iteration": 2.6151063442230225 }, { "auxiliary_loss_clip": 0.011444, "auxiliary_loss_mlp": 0.01025796, "balance_loss_clip": 1.04609561, "balance_loss_mlp": 1.01857746, "epoch": 0.9637467684723141, "flos": 22601781457920.0, "grad_norm": 1.744242200874373, "language_loss": 0.76575577, "learning_rate": 1.3728365357789317e-08, "loss": 0.78745776, "num_input_tokens_seen": 172993290, "step": 8015, "time_per_iteration": 2.6990599632263184 }, { "auxiliary_loss_clip": 0.01085635, "auxiliary_loss_mlp": 0.01024676, "balance_loss_clip": 1.03660893, "balance_loss_mlp": 1.01762509, "epoch": 0.9638670113629532, "flos": 17565812801280.0, "grad_norm": 2.3513432089124815, "language_loss": 0.76292872, "learning_rate": 1.3637402143680254e-08, "loss": 0.78403187, "num_input_tokens_seen": 173008190, "step": 8016, "time_per_iteration": 3.602175235748291 }, { "auxiliary_loss_clip": 0.0103186, "auxiliary_loss_mlp": 0.0100243, "balance_loss_clip": 1.0119369, "balance_loss_mlp": 1.00133896, "epoch": 0.9639872542535922, "flos": 55072139379840.0, "grad_norm": 0.7179412822169149, "language_loss": 0.55074131, "learning_rate": 1.3546740259197998e-08, "loss": 0.5710842, "num_input_tokens_seen": 173061000, "step": 8017, "time_per_iteration": 3.2145140171051025 }, { "auxiliary_loss_clip": 0.01139926, "auxiliary_loss_mlp": 0.01028891, "balance_loss_clip": 1.04389083, "balance_loss_mlp": 1.02112174, "epoch": 0.9641074971442314, "flos": 24134484746880.0, "grad_norm": 2.144932955734405, "language_loss": 0.70055526, "learning_rate": 1.3456379718095989e-08, "loss": 0.72224343, "num_input_tokens_seen": 173081415, "step": 8018, "time_per_iteration": 2.719129800796509 }, { "auxiliary_loss_clip": 0.01039668, "auxiliary_loss_mlp": 0.01001488, "balance_loss_clip": 1.00819325, "balance_loss_mlp": 1.00045133, "epoch": 0.9642277400348704, "flos": 66747416077440.0, "grad_norm": 0.8778966797230715, "language_loss": 0.62056285, "learning_rate": 1.3366320534081487e-08, "loss": 0.6409744, "num_input_tokens_seen": 173144095, "step": 8019, "time_per_iteration": 3.230581283569336 }, { "auxiliary_loss_clip": 0.01152143, "auxiliary_loss_mlp": 0.01026319, "balance_loss_clip": 1.04512084, "balance_loss_mlp": 1.01860332, "epoch": 0.9643479829255095, "flos": 30920272450560.0, "grad_norm": 2.178203666010136, "language_loss": 0.76044309, "learning_rate": 1.3276562720816675e-08, "loss": 0.78222764, "num_input_tokens_seen": 173165605, "step": 8020, "time_per_iteration": 2.712013006210327 }, { "auxiliary_loss_clip": 0.01166438, "auxiliary_loss_mlp": 0.01022743, "balance_loss_clip": 1.04507923, "balance_loss_mlp": 1.01584387, "epoch": 0.9644682258161487, "flos": 20048245643520.0, "grad_norm": 2.4042986759231804, "language_loss": 0.8251158, "learning_rate": 1.3187106291917549e-08, "loss": 0.84700763, "num_input_tokens_seen": 173182595, "step": 8021, "time_per_iteration": 2.534327745437622 }, { "auxiliary_loss_clip": 0.01146227, "auxiliary_loss_mlp": 0.01024613, "balance_loss_clip": 1.04333234, "balance_loss_mlp": 1.01730514, "epoch": 0.9645884687067877, "flos": 21178713456000.0, "grad_norm": 1.7293010013559558, "language_loss": 0.70460147, "learning_rate": 1.309795126095503e-08, "loss": 0.7263099, "num_input_tokens_seen": 173200895, "step": 8022, "time_per_iteration": 2.629523992538452 }, { "auxiliary_loss_clip": 0.01074853, "auxiliary_loss_mlp": 0.01023066, "balance_loss_clip": 1.0337249, "balance_loss_mlp": 1.01625907, "epoch": 0.9647087115974268, "flos": 18945967029120.0, "grad_norm": 2.301512910725068, "language_loss": 0.80391186, "learning_rate": 1.3009097641453192e-08, "loss": 0.82489103, "num_input_tokens_seen": 173218745, "step": 8023, "time_per_iteration": 2.7175095081329346 }, { "auxiliary_loss_clip": 0.01134424, "auxiliary_loss_mlp": 0.01023735, "balance_loss_clip": 1.04222989, "balance_loss_mlp": 1.01633859, "epoch": 0.9648289544880659, "flos": 16545088016640.0, "grad_norm": 1.712894824164886, "language_loss": 0.76109838, "learning_rate": 1.2920545446891474e-08, "loss": 0.78267992, "num_input_tokens_seen": 173235465, "step": 8024, "time_per_iteration": 2.615529775619507 }, { "auxiliary_loss_clip": 0.01141498, "auxiliary_loss_mlp": 0.01024604, "balance_loss_clip": 1.04517996, "balance_loss_mlp": 1.01750791, "epoch": 0.964949197378705, "flos": 24057527857920.0, "grad_norm": 3.387999104897225, "language_loss": 0.70714265, "learning_rate": 1.2832294690703127e-08, "loss": 0.72880363, "num_input_tokens_seen": 173254440, "step": 8025, "time_per_iteration": 2.7022433280944824 }, { "auxiliary_loss_clip": 0.01150343, "auxiliary_loss_mlp": 0.01023983, "balance_loss_clip": 1.04480743, "balance_loss_mlp": 1.01710176, "epoch": 0.965069440269344, "flos": 23365565280000.0, "grad_norm": 1.8458012214942505, "language_loss": 0.77593189, "learning_rate": 1.2744345386275668e-08, "loss": 0.79767513, "num_input_tokens_seen": 173273980, "step": 8026, "time_per_iteration": 2.641173839569092 }, { "auxiliary_loss_clip": 0.01142862, "auxiliary_loss_mlp": 0.01028265, "balance_loss_clip": 1.0450635, "balance_loss_mlp": 1.02149343, "epoch": 0.9651896831599832, "flos": 25374875155200.0, "grad_norm": 1.6573422814019758, "language_loss": 0.78793257, "learning_rate": 1.265669754695109e-08, "loss": 0.80964386, "num_input_tokens_seen": 173293550, "step": 8027, "time_per_iteration": 2.6933484077453613 }, { "auxiliary_loss_clip": 0.01096924, "auxiliary_loss_mlp": 0.01025169, "balance_loss_clip": 1.03707695, "balance_loss_mlp": 1.01757264, "epoch": 0.9653099260506223, "flos": 22272875596800.0, "grad_norm": 1.8839111984153647, "language_loss": 0.82247138, "learning_rate": 1.2569351186025201e-08, "loss": 0.8436923, "num_input_tokens_seen": 173312005, "step": 8028, "time_per_iteration": 2.750873565673828 }, { "auxiliary_loss_clip": 0.01111866, "auxiliary_loss_mlp": 0.01021946, "balance_loss_clip": 1.03860319, "balance_loss_mlp": 1.01517487, "epoch": 0.9654301689412613, "flos": 26760847386240.0, "grad_norm": 1.5453276461507903, "language_loss": 0.75561267, "learning_rate": 1.2482306316748737e-08, "loss": 0.77695078, "num_input_tokens_seen": 173332450, "step": 8029, "time_per_iteration": 2.7223312854766846 }, { "auxiliary_loss_clip": 0.01155653, "auxiliary_loss_mlp": 0.01027291, "balance_loss_clip": 1.04321551, "balance_loss_mlp": 1.0198437, "epoch": 0.9655504118319005, "flos": 17412689122560.0, "grad_norm": 2.1070087389657797, "language_loss": 0.78429651, "learning_rate": 1.2395562952326021e-08, "loss": 0.80612594, "num_input_tokens_seen": 173349610, "step": 8030, "time_per_iteration": 2.6083874702453613 }, { "auxiliary_loss_clip": 0.01146624, "auxiliary_loss_mlp": 0.01033325, "balance_loss_clip": 1.04480958, "balance_loss_mlp": 1.02585924, "epoch": 0.9656706547225395, "flos": 22126970551680.0, "grad_norm": 2.2981038559048264, "language_loss": 0.80952996, "learning_rate": 1.2309121105916309e-08, "loss": 0.83132946, "num_input_tokens_seen": 173367900, "step": 8031, "time_per_iteration": 2.643375873565674 }, { "auxiliary_loss_clip": 0.01155251, "auxiliary_loss_mlp": 0.0102299, "balance_loss_clip": 1.04437721, "balance_loss_mlp": 1.01566756, "epoch": 0.9657908976131786, "flos": 37049289926400.0, "grad_norm": 2.024523766303704, "language_loss": 0.69332588, "learning_rate": 1.222298079063222e-08, "loss": 0.71510828, "num_input_tokens_seen": 173389040, "step": 8032, "time_per_iteration": 2.739582061767578 }, { "auxiliary_loss_clip": 0.01149568, "auxiliary_loss_mlp": 0.01022274, "balance_loss_clip": 1.04343045, "balance_loss_mlp": 1.01569998, "epoch": 0.9659111405038178, "flos": 24389809597440.0, "grad_norm": 1.9654888413784923, "language_loss": 0.72321904, "learning_rate": 1.2137142019541524e-08, "loss": 0.74493742, "num_input_tokens_seen": 173407595, "step": 8033, "time_per_iteration": 2.655346393585205 }, { "auxiliary_loss_clip": 0.01144101, "auxiliary_loss_mlp": 0.01028207, "balance_loss_clip": 1.04329085, "balance_loss_mlp": 1.02064884, "epoch": 0.9660313833944568, "flos": 25009412227200.0, "grad_norm": 3.7120683540943014, "language_loss": 0.736426, "learning_rate": 1.2051604805666027e-08, "loss": 0.75814909, "num_input_tokens_seen": 173424720, "step": 8034, "time_per_iteration": 3.6424593925476074 }, { "auxiliary_loss_clip": 0.01164754, "auxiliary_loss_mlp": 0.00762144, "balance_loss_clip": 1.04626024, "balance_loss_mlp": 1.00036824, "epoch": 0.9661516262850959, "flos": 11801575895040.0, "grad_norm": 2.7568415993569806, "language_loss": 0.78808224, "learning_rate": 1.196636916198135e-08, "loss": 0.80735117, "num_input_tokens_seen": 173442260, "step": 8035, "time_per_iteration": 2.558863878250122 }, { "auxiliary_loss_clip": 0.01165708, "auxiliary_loss_mlp": 0.01029968, "balance_loss_clip": 1.04463935, "balance_loss_mlp": 1.02240419, "epoch": 0.9662718691757349, "flos": 20047778766720.0, "grad_norm": 2.824265288219802, "language_loss": 0.7698856, "learning_rate": 1.1881435101418036e-08, "loss": 0.79184234, "num_input_tokens_seen": 173461675, "step": 8036, "time_per_iteration": 2.5809054374694824 }, { "auxiliary_loss_clip": 0.01042572, "auxiliary_loss_mlp": 0.0100124, "balance_loss_clip": 1.01007366, "balance_loss_mlp": 1.00023305, "epoch": 0.9663921120663741, "flos": 68027703517440.0, "grad_norm": 0.747614383187982, "language_loss": 0.65544033, "learning_rate": 1.1796802636860003e-08, "loss": 0.67587852, "num_input_tokens_seen": 173530205, "step": 8037, "time_per_iteration": 4.173290729522705 }, { "auxiliary_loss_clip": 0.01165896, "auxiliary_loss_mlp": 0.01018705, "balance_loss_clip": 1.04504919, "balance_loss_mlp": 1.01128745, "epoch": 0.9665123549570132, "flos": 26322916769280.0, "grad_norm": 1.9495079038882606, "language_loss": 0.74035221, "learning_rate": 1.1712471781146316e-08, "loss": 0.76219821, "num_input_tokens_seen": 173549540, "step": 8038, "time_per_iteration": 2.650090456008911 }, { "auxiliary_loss_clip": 0.01164449, "auxiliary_loss_mlp": 0.01021737, "balance_loss_clip": 1.04409933, "balance_loss_mlp": 1.01444447, "epoch": 0.9666325978476522, "flos": 43941121557120.0, "grad_norm": 1.9014546939053292, "language_loss": 0.66509074, "learning_rate": 1.1628442547069628e-08, "loss": 0.68695265, "num_input_tokens_seen": 173571740, "step": 8039, "time_per_iteration": 3.642157793045044 }, { "auxiliary_loss_clip": 0.01150941, "auxiliary_loss_mlp": 0.00762365, "balance_loss_clip": 1.04120684, "balance_loss_mlp": 1.00036812, "epoch": 0.9667528407382914, "flos": 21543422198400.0, "grad_norm": 1.9417482916881617, "language_loss": 0.77014983, "learning_rate": 1.1544714947377521e-08, "loss": 0.78928292, "num_input_tokens_seen": 173589425, "step": 8040, "time_per_iteration": 2.611628293991089 }, { "auxiliary_loss_clip": 0.01168671, "auxiliary_loss_mlp": 0.01023314, "balance_loss_clip": 1.04797626, "balance_loss_mlp": 1.01590514, "epoch": 0.9668730836289304, "flos": 23878585278720.0, "grad_norm": 2.342536777581491, "language_loss": 0.69975334, "learning_rate": 1.1461288994770945e-08, "loss": 0.72167319, "num_input_tokens_seen": 173608500, "step": 8041, "time_per_iteration": 2.6151280403137207 }, { "auxiliary_loss_clip": 0.01168572, "auxiliary_loss_mlp": 0.01029533, "balance_loss_clip": 1.04482222, "balance_loss_mlp": 1.02191567, "epoch": 0.9669933265195695, "flos": 28293011971200.0, "grad_norm": 1.980224275239243, "language_loss": 0.76884383, "learning_rate": 1.1378164701906002e-08, "loss": 0.79082489, "num_input_tokens_seen": 173630265, "step": 8042, "time_per_iteration": 3.563913583755493 }, { "auxiliary_loss_clip": 0.01168026, "auxiliary_loss_mlp": 0.01024696, "balance_loss_clip": 1.04676938, "balance_loss_mlp": 1.01722193, "epoch": 0.9671135694102087, "flos": 22454763091200.0, "grad_norm": 1.853249275628975, "language_loss": 0.66881925, "learning_rate": 1.1295342081392156e-08, "loss": 0.69074649, "num_input_tokens_seen": 173649625, "step": 8043, "time_per_iteration": 2.7985219955444336 }, { "auxiliary_loss_clip": 0.01137222, "auxiliary_loss_mlp": 0.01021183, "balance_loss_clip": 1.04185438, "balance_loss_mlp": 1.01472461, "epoch": 0.9672338123008477, "flos": 20155941596160.0, "grad_norm": 1.876236947266394, "language_loss": 0.69293773, "learning_rate": 1.1212821145793804e-08, "loss": 0.71452188, "num_input_tokens_seen": 173669240, "step": 8044, "time_per_iteration": 2.681973695755005 }, { "auxiliary_loss_clip": 0.0113714, "auxiliary_loss_mlp": 0.01022271, "balance_loss_clip": 1.04132724, "balance_loss_mlp": 1.01526737, "epoch": 0.9673540551914868, "flos": 16977487939200.0, "grad_norm": 1.917398420981964, "language_loss": 0.78900409, "learning_rate": 1.1130601907629156e-08, "loss": 0.81059813, "num_input_tokens_seen": 173686970, "step": 8045, "time_per_iteration": 2.5916202068328857 }, { "auxiliary_loss_clip": 0.01055026, "auxiliary_loss_mlp": 0.01000816, "balance_loss_clip": 1.00936127, "balance_loss_mlp": 0.99985677, "epoch": 0.9674742980821259, "flos": 61892903952000.0, "grad_norm": 0.8166778718475749, "language_loss": 0.64710641, "learning_rate": 1.1048684379370899e-08, "loss": 0.66766489, "num_input_tokens_seen": 173747655, "step": 8046, "time_per_iteration": 3.1994142532348633 }, { "auxiliary_loss_clip": 0.01126875, "auxiliary_loss_mlp": 0.01021223, "balance_loss_clip": 1.04233408, "balance_loss_mlp": 1.01466084, "epoch": 0.967594540972765, "flos": 18697824898560.0, "grad_norm": 2.0301954946402736, "language_loss": 0.74330336, "learning_rate": 1.0967068573445759e-08, "loss": 0.76478434, "num_input_tokens_seen": 173765140, "step": 8047, "time_per_iteration": 2.628218650817871 }, { "auxiliary_loss_clip": 0.01132321, "auxiliary_loss_mlp": 0.01027288, "balance_loss_clip": 1.04005826, "balance_loss_mlp": 1.01992655, "epoch": 0.967714783863404, "flos": 20777411733120.0, "grad_norm": 2.0974312225040124, "language_loss": 0.6514858, "learning_rate": 1.0885754502234945e-08, "loss": 0.67308187, "num_input_tokens_seen": 173784800, "step": 8048, "time_per_iteration": 2.660459041595459 }, { "auxiliary_loss_clip": 0.01122027, "auxiliary_loss_mlp": 0.01023819, "balance_loss_clip": 1.04201961, "balance_loss_mlp": 1.01660681, "epoch": 0.9678350267540432, "flos": 23185473465600.0, "grad_norm": 1.901181415538082, "language_loss": 0.78248084, "learning_rate": 1.08047421780737e-08, "loss": 0.80393922, "num_input_tokens_seen": 173803990, "step": 8049, "time_per_iteration": 2.6757004261016846 }, { "auxiliary_loss_clip": 0.0114076, "auxiliary_loss_mlp": 0.00761945, "balance_loss_clip": 1.04114389, "balance_loss_mlp": 1.00028479, "epoch": 0.9679552696446823, "flos": 21726063878400.0, "grad_norm": 2.2518972148924004, "language_loss": 0.74306607, "learning_rate": 1.0724031613251305e-08, "loss": 0.76209307, "num_input_tokens_seen": 173821890, "step": 8050, "time_per_iteration": 2.6551778316497803 }, { "auxiliary_loss_clip": 0.01156137, "auxiliary_loss_mlp": 0.01019181, "balance_loss_clip": 1.04417276, "balance_loss_mlp": 1.01191258, "epoch": 0.9680755125353213, "flos": 26869046129280.0, "grad_norm": 3.9160331405110878, "language_loss": 0.66761631, "learning_rate": 1.0643622820011744e-08, "loss": 0.68936944, "num_input_tokens_seen": 173842945, "step": 8051, "time_per_iteration": 2.643146276473999 }, { "auxiliary_loss_clip": 0.01167249, "auxiliary_loss_mlp": 0.01029486, "balance_loss_clip": 1.04555094, "balance_loss_mlp": 1.02150214, "epoch": 0.9681957554259605, "flos": 28325008010880.0, "grad_norm": 2.4415295607167544, "language_loss": 0.68050194, "learning_rate": 1.0563515810552814e-08, "loss": 0.70246923, "num_input_tokens_seen": 173859915, "step": 8052, "time_per_iteration": 2.6023592948913574 }, { "auxiliary_loss_clip": 0.01167081, "auxiliary_loss_mlp": 0.01031576, "balance_loss_clip": 1.04814434, "balance_loss_mlp": 1.02442932, "epoch": 0.9683159983165995, "flos": 20557674282240.0, "grad_norm": 1.5539629772815964, "language_loss": 0.73402488, "learning_rate": 1.0483710597026795e-08, "loss": 0.75601143, "num_input_tokens_seen": 173879775, "step": 8053, "time_per_iteration": 2.6072306632995605 }, { "auxiliary_loss_clip": 0.01125291, "auxiliary_loss_mlp": 0.0102333, "balance_loss_clip": 1.0417186, "balance_loss_mlp": 1.01717281, "epoch": 0.9684362412072386, "flos": 24207958016640.0, "grad_norm": 2.2027821143003576, "language_loss": 0.74048567, "learning_rate": 1.0404207191540227e-08, "loss": 0.76197189, "num_input_tokens_seen": 173900230, "step": 8054, "time_per_iteration": 2.7009434700012207 }, { "auxiliary_loss_clip": 0.01164125, "auxiliary_loss_mlp": 0.01024452, "balance_loss_clip": 1.04504132, "balance_loss_mlp": 1.01729107, "epoch": 0.9685564840978778, "flos": 22346241125760.0, "grad_norm": 2.1293004183647515, "language_loss": 0.74416661, "learning_rate": 1.0325005606153236e-08, "loss": 0.76605231, "num_input_tokens_seen": 173919690, "step": 8055, "time_per_iteration": 2.5539891719818115 }, { "auxiliary_loss_clip": 0.01112065, "auxiliary_loss_mlp": 0.01023856, "balance_loss_clip": 1.03888249, "balance_loss_mlp": 1.01623821, "epoch": 0.9686767269885168, "flos": 14386389477120.0, "grad_norm": 2.681923205374886, "language_loss": 0.79038817, "learning_rate": 1.0246105852881104e-08, "loss": 0.81174743, "num_input_tokens_seen": 173934790, "step": 8056, "time_per_iteration": 2.6863715648651123 }, { "auxiliary_loss_clip": 0.0116606, "auxiliary_loss_mlp": 0.01028171, "balance_loss_clip": 1.04550123, "balance_loss_mlp": 1.02061307, "epoch": 0.9687969698791559, "flos": 21287630471040.0, "grad_norm": 2.511242163612255, "language_loss": 0.78676265, "learning_rate": 1.0167507943692476e-08, "loss": 0.80870497, "num_input_tokens_seen": 173953875, "step": 8057, "time_per_iteration": 2.6977202892303467 }, { "auxiliary_loss_clip": 0.01151043, "auxiliary_loss_mlp": 0.01025636, "balance_loss_clip": 1.04656041, "balance_loss_mlp": 1.01837635, "epoch": 0.968917212769795, "flos": 19828328624640.0, "grad_norm": 2.068299777041304, "language_loss": 0.72007143, "learning_rate": 1.008921189051093e-08, "loss": 0.74183816, "num_input_tokens_seen": 173971220, "step": 8058, "time_per_iteration": 2.5898773670196533 }, { "auxiliary_loss_clip": 0.01167842, "auxiliary_loss_mlp": 0.01027703, "balance_loss_clip": 1.0472815, "balance_loss_mlp": 1.02033901, "epoch": 0.9690374556604341, "flos": 21681749473920.0, "grad_norm": 1.946884588445897, "language_loss": 0.77144003, "learning_rate": 1.0011217705213848e-08, "loss": 0.79339552, "num_input_tokens_seen": 173989095, "step": 8059, "time_per_iteration": 2.594555616378784 }, { "auxiliary_loss_clip": 0.01147468, "auxiliary_loss_mlp": 0.01019213, "balance_loss_clip": 1.04383397, "balance_loss_mlp": 1.01283264, "epoch": 0.9691576985510731, "flos": 32635437851520.0, "grad_norm": 1.8589786629043585, "language_loss": 0.74193037, "learning_rate": 9.933525399632658e-09, "loss": 0.76359719, "num_input_tokens_seen": 174007330, "step": 8060, "time_per_iteration": 0.9517033100128174 }, { "auxiliary_loss_clip": 0.01134807, "auxiliary_loss_mlp": 0.01026838, "balance_loss_clip": 1.04179978, "balance_loss_mlp": 1.01958132, "epoch": 0.9692779414417123, "flos": 35663174040960.0, "grad_norm": 1.8035651014514542, "language_loss": 0.64826035, "learning_rate": 9.856134985553488e-09, "loss": 0.66987681, "num_input_tokens_seen": 174027055, "step": 8061, "time_per_iteration": 2.7452144622802734 }, { "auxiliary_loss_clip": 0.01161965, "auxiliary_loss_mlp": 0.01027783, "balance_loss_clip": 1.04362106, "balance_loss_mlp": 1.02033782, "epoch": 0.9693981843323514, "flos": 28366952117760.0, "grad_norm": 1.7379530493341169, "language_loss": 0.73817348, "learning_rate": 9.77904647471628e-09, "loss": 0.76007092, "num_input_tokens_seen": 174050235, "step": 8062, "time_per_iteration": 2.6009695529937744 }, { "auxiliary_loss_clip": 0.01100117, "auxiliary_loss_mlp": 0.01025196, "balance_loss_clip": 1.03950405, "balance_loss_mlp": 1.0180757, "epoch": 0.9695184272229904, "flos": 23622865378560.0, "grad_norm": 1.518047285018217, "language_loss": 0.73866403, "learning_rate": 9.702259878815454e-09, "loss": 0.7599172, "num_input_tokens_seen": 174070560, "step": 8063, "time_per_iteration": 3.6935014724731445 }, { "auxiliary_loss_clip": 0.01156386, "auxiliary_loss_mlp": 0.01024544, "balance_loss_clip": 1.04684949, "balance_loss_mlp": 1.01728106, "epoch": 0.9696386701136296, "flos": 23294677789440.0, "grad_norm": 2.072325390451814, "language_loss": 0.74065, "learning_rate": 9.625775209499254e-09, "loss": 0.76245928, "num_input_tokens_seen": 174090565, "step": 8064, "time_per_iteration": 2.6438655853271484 }, { "auxiliary_loss_clip": 0.01117046, "auxiliary_loss_mlp": 0.01025518, "balance_loss_clip": 1.03953385, "balance_loss_mlp": 1.01893473, "epoch": 0.9697589130042686, "flos": 15121876360320.0, "grad_norm": 4.035470427254621, "language_loss": 0.7421124, "learning_rate": 9.549592478370172e-09, "loss": 0.763538, "num_input_tokens_seen": 174108745, "step": 8065, "time_per_iteration": 3.4915177822113037 }, { "auxiliary_loss_clip": 0.01154328, "auxiliary_loss_mlp": 0.01021917, "balance_loss_clip": 1.04498792, "balance_loss_mlp": 1.01486278, "epoch": 0.9698791558949077, "flos": 18879532824960.0, "grad_norm": 1.6908795780244605, "language_loss": 0.79451942, "learning_rate": 9.473711696985632e-09, "loss": 0.81628191, "num_input_tokens_seen": 174128075, "step": 8066, "time_per_iteration": 2.626004695892334 }, { "auxiliary_loss_clip": 0.01135748, "auxiliary_loss_mlp": 0.0102144, "balance_loss_clip": 1.0406878, "balance_loss_mlp": 1.01461554, "epoch": 0.9699993987855468, "flos": 17931455297280.0, "grad_norm": 2.3330391384843367, "language_loss": 0.75779998, "learning_rate": 9.398132876856201e-09, "loss": 0.77937192, "num_input_tokens_seen": 174147040, "step": 8067, "time_per_iteration": 2.624974012374878 }, { "auxiliary_loss_clip": 0.01023769, "auxiliary_loss_mlp": 0.0100252, "balance_loss_clip": 1.01081932, "balance_loss_mlp": 1.00144708, "epoch": 0.9701196416761859, "flos": 67182186297600.0, "grad_norm": 0.7753849550312911, "language_loss": 0.60755581, "learning_rate": 9.322856029447379e-09, "loss": 0.6278187, "num_input_tokens_seen": 174208225, "step": 8068, "time_per_iteration": 3.977970600128174 }, { "auxiliary_loss_clip": 0.01162434, "auxiliary_loss_mlp": 0.01024968, "balance_loss_clip": 1.04497433, "balance_loss_mlp": 1.01832843, "epoch": 0.970239884566825, "flos": 24277804012800.0, "grad_norm": 1.8743361939379348, "language_loss": 0.80321062, "learning_rate": 9.247881166178695e-09, "loss": 0.82508463, "num_input_tokens_seen": 174226935, "step": 8069, "time_per_iteration": 2.611100196838379 }, { "auxiliary_loss_clip": 0.01129415, "auxiliary_loss_mlp": 0.01022943, "balance_loss_clip": 1.04121614, "balance_loss_mlp": 1.01584375, "epoch": 0.970360127457464, "flos": 25301689194240.0, "grad_norm": 2.5902418377638488, "language_loss": 0.76421428, "learning_rate": 9.173208298423274e-09, "loss": 0.78573781, "num_input_tokens_seen": 174248140, "step": 8070, "time_per_iteration": 2.7611727714538574 }, { "auxiliary_loss_clip": 0.01106936, "auxiliary_loss_mlp": 0.00762092, "balance_loss_clip": 1.03950083, "balance_loss_mlp": 1.00031495, "epoch": 0.9704803703481032, "flos": 29572473398400.0, "grad_norm": 1.5806576840008906, "language_loss": 0.76121247, "learning_rate": 9.09883743750961e-09, "loss": 0.77990276, "num_input_tokens_seen": 174271030, "step": 8071, "time_per_iteration": 2.784320116043091 }, { "auxiliary_loss_clip": 0.01133536, "auxiliary_loss_mlp": 0.01020575, "balance_loss_clip": 1.04212511, "balance_loss_mlp": 1.01413465, "epoch": 0.9706006132387422, "flos": 17380046638080.0, "grad_norm": 1.7322376282775436, "language_loss": 0.83851153, "learning_rate": 9.024768594719124e-09, "loss": 0.86005265, "num_input_tokens_seen": 174289410, "step": 8072, "time_per_iteration": 2.628959894180298 }, { "auxiliary_loss_clip": 0.01123742, "auxiliary_loss_mlp": 0.01024496, "balance_loss_clip": 1.04281449, "balance_loss_mlp": 1.01736414, "epoch": 0.9707208561293813, "flos": 18186421011840.0, "grad_norm": 2.769440138187765, "language_loss": 0.72955739, "learning_rate": 8.95100178128816e-09, "loss": 0.75103974, "num_input_tokens_seen": 174308550, "step": 8073, "time_per_iteration": 2.676483392715454 }, { "auxiliary_loss_clip": 0.01140128, "auxiliary_loss_mlp": 0.0102507, "balance_loss_clip": 1.04468429, "balance_loss_mlp": 1.01740444, "epoch": 0.9708410990200205, "flos": 31248388212480.0, "grad_norm": 1.7436440375908924, "language_loss": 0.70424473, "learning_rate": 8.877537008407321e-09, "loss": 0.72589678, "num_input_tokens_seen": 174328600, "step": 8074, "time_per_iteration": 2.7069835662841797 }, { "auxiliary_loss_clip": 0.01139619, "auxiliary_loss_mlp": 0.01023038, "balance_loss_clip": 1.04338396, "balance_loss_mlp": 1.01607668, "epoch": 0.9709613419106595, "flos": 30554450386560.0, "grad_norm": 1.5595820856186635, "language_loss": 0.6845082, "learning_rate": 8.804374287221028e-09, "loss": 0.7061348, "num_input_tokens_seen": 174349835, "step": 8075, "time_per_iteration": 2.7215447425842285 }, { "auxiliary_loss_clip": 0.01113158, "auxiliary_loss_mlp": 0.01025464, "balance_loss_clip": 1.03433931, "balance_loss_mlp": 1.01825476, "epoch": 0.9710815848012986, "flos": 23730166281600.0, "grad_norm": 1.644648868879505, "language_loss": 0.84721917, "learning_rate": 8.731513628827958e-09, "loss": 0.86860543, "num_input_tokens_seen": 174369200, "step": 8076, "time_per_iteration": 2.6982333660125732 }, { "auxiliary_loss_clip": 0.01153336, "auxiliary_loss_mlp": 0.01022851, "balance_loss_clip": 1.04464388, "balance_loss_mlp": 1.01573992, "epoch": 0.9712018276919377, "flos": 23761875012480.0, "grad_norm": 2.578539503867079, "language_loss": 0.82285023, "learning_rate": 8.658955044280825e-09, "loss": 0.84461212, "num_input_tokens_seen": 174388125, "step": 8077, "time_per_iteration": 2.667250394821167 }, { "auxiliary_loss_clip": 0.01147591, "auxiliary_loss_mlp": 0.01022366, "balance_loss_clip": 1.04411829, "balance_loss_mlp": 1.01545751, "epoch": 0.9713220705825768, "flos": 23330983461120.0, "grad_norm": 1.5167362902029875, "language_loss": 0.77726847, "learning_rate": 8.586698544587268e-09, "loss": 0.79896802, "num_input_tokens_seen": 174409735, "step": 8078, "time_per_iteration": 2.6477038860321045 }, { "auxiliary_loss_clip": 0.01127711, "auxiliary_loss_mlp": 0.01029616, "balance_loss_clip": 1.04001045, "balance_loss_mlp": 1.02289581, "epoch": 0.9714423134732159, "flos": 22200946611840.0, "grad_norm": 2.201773842496642, "language_loss": 0.74260747, "learning_rate": 8.514744140707853e-09, "loss": 0.76418078, "num_input_tokens_seen": 174428875, "step": 8079, "time_per_iteration": 2.6375560760498047 }, { "auxiliary_loss_clip": 0.01161452, "auxiliary_loss_mlp": 0.01025989, "balance_loss_clip": 1.04412818, "balance_loss_mlp": 1.01938152, "epoch": 0.971562556363855, "flos": 20229917656320.0, "grad_norm": 1.5899492610616683, "language_loss": 0.76684022, "learning_rate": 8.443091843558515e-09, "loss": 0.78871465, "num_input_tokens_seen": 174447960, "step": 8080, "time_per_iteration": 2.5895440578460693 }, { "auxiliary_loss_clip": 0.0113192, "auxiliary_loss_mlp": 0.01028408, "balance_loss_clip": 1.04189384, "balance_loss_mlp": 1.02061749, "epoch": 0.9716827992544941, "flos": 24970197553920.0, "grad_norm": 2.0514231485775123, "language_loss": 0.64811784, "learning_rate": 8.37174166400878e-09, "loss": 0.66972113, "num_input_tokens_seen": 174463535, "step": 8081, "time_per_iteration": 2.727454423904419 }, { "auxiliary_loss_clip": 0.01167582, "auxiliary_loss_mlp": 0.01025824, "balance_loss_clip": 1.04913151, "balance_loss_mlp": 1.01872802, "epoch": 0.9718030421451331, "flos": 24681476033280.0, "grad_norm": 4.381018237474725, "language_loss": 0.85319853, "learning_rate": 8.300693612881992e-09, "loss": 0.87513262, "num_input_tokens_seen": 174483600, "step": 8082, "time_per_iteration": 2.633908748626709 }, { "auxiliary_loss_clip": 0.01147845, "auxiliary_loss_mlp": 0.0076189, "balance_loss_clip": 1.04408598, "balance_loss_mlp": 1.0003407, "epoch": 0.9719232850357723, "flos": 22090700793600.0, "grad_norm": 2.090306427210176, "language_loss": 0.81399083, "learning_rate": 8.22994770095664e-09, "loss": 0.83308816, "num_input_tokens_seen": 174502175, "step": 8083, "time_per_iteration": 2.6488468647003174 }, { "auxiliary_loss_clip": 0.01137183, "auxiliary_loss_mlp": 0.01025511, "balance_loss_clip": 1.04599082, "balance_loss_mlp": 1.01792049, "epoch": 0.9720435279264114, "flos": 23656908493440.0, "grad_norm": 2.1152249256931364, "language_loss": 0.75083023, "learning_rate": 8.159503938964585e-09, "loss": 0.77245718, "num_input_tokens_seen": 174519495, "step": 8084, "time_per_iteration": 2.721585273742676 }, { "auxiliary_loss_clip": 0.01114561, "auxiliary_loss_mlp": 0.01028038, "balance_loss_clip": 1.0394069, "balance_loss_mlp": 1.02158833, "epoch": 0.9721637708170504, "flos": 28365910623360.0, "grad_norm": 1.8368540400125888, "language_loss": 0.70764446, "learning_rate": 8.089362337592164e-09, "loss": 0.72907043, "num_input_tokens_seen": 174543120, "step": 8085, "time_per_iteration": 2.718135118484497 }, { "auxiliary_loss_clip": 0.01132508, "auxiliary_loss_mlp": 0.01023029, "balance_loss_clip": 1.04196, "balance_loss_mlp": 1.01658845, "epoch": 0.9722840137076896, "flos": 29130807767040.0, "grad_norm": 3.46885969041535, "language_loss": 0.72249019, "learning_rate": 8.019522907479536e-09, "loss": 0.74404556, "num_input_tokens_seen": 174563480, "step": 8086, "time_per_iteration": 3.63275408744812 }, { "auxiliary_loss_clip": 0.01153868, "auxiliary_loss_mlp": 0.01029175, "balance_loss_clip": 1.04507995, "balance_loss_mlp": 1.02229667, "epoch": 0.9724042565983286, "flos": 19243954258560.0, "grad_norm": 2.2276532267569378, "language_loss": 0.77472007, "learning_rate": 7.949985659221558e-09, "loss": 0.79655051, "num_input_tokens_seen": 174580745, "step": 8087, "time_per_iteration": 2.604414224624634 }, { "auxiliary_loss_clip": 0.01141328, "auxiliary_loss_mlp": 0.01018154, "balance_loss_clip": 1.04419255, "balance_loss_mlp": 1.01144791, "epoch": 0.9725244994889677, "flos": 23039676161280.0, "grad_norm": 2.138563278236607, "language_loss": 0.79001051, "learning_rate": 7.880750603366904e-09, "loss": 0.81160533, "num_input_tokens_seen": 174599615, "step": 8088, "time_per_iteration": 2.6154944896698 }, { "auxiliary_loss_clip": 0.01132434, "auxiliary_loss_mlp": 0.01024739, "balance_loss_clip": 1.0410893, "balance_loss_mlp": 1.01737511, "epoch": 0.9726447423796069, "flos": 23367468700800.0, "grad_norm": 1.8956207337363327, "language_loss": 0.79512954, "learning_rate": 7.811817750418282e-09, "loss": 0.81670129, "num_input_tokens_seen": 174618375, "step": 8089, "time_per_iteration": 3.671121835708618 }, { "auxiliary_loss_clip": 0.01122122, "auxiliary_loss_mlp": 0.01026904, "balance_loss_clip": 1.04384184, "balance_loss_mlp": 1.01919174, "epoch": 0.9727649852702459, "flos": 26541648639360.0, "grad_norm": 1.647168837692988, "language_loss": 0.80164206, "learning_rate": 7.743187110833105e-09, "loss": 0.82313234, "num_input_tokens_seen": 174641135, "step": 8090, "time_per_iteration": 2.7457826137542725 }, { "auxiliary_loss_clip": 0.01139181, "auxiliary_loss_mlp": 0.01022722, "balance_loss_clip": 1.04079485, "balance_loss_mlp": 1.0157218, "epoch": 0.972885228160885, "flos": 20522338277760.0, "grad_norm": 1.5981395618738707, "language_loss": 0.80344498, "learning_rate": 7.674858695022602e-09, "loss": 0.82506406, "num_input_tokens_seen": 174659490, "step": 8091, "time_per_iteration": 3.752051830291748 }, { "auxiliary_loss_clip": 0.01170337, "auxiliary_loss_mlp": 0.01025332, "balance_loss_clip": 1.0484587, "balance_loss_mlp": 1.01738024, "epoch": 0.9730054710515241, "flos": 17566064196480.0, "grad_norm": 2.840334889928141, "language_loss": 0.76064342, "learning_rate": 7.606832513351591e-09, "loss": 0.7826001, "num_input_tokens_seen": 174677440, "step": 8092, "time_per_iteration": 2.59678316116333 }, { "auxiliary_loss_clip": 0.01062624, "auxiliary_loss_mlp": 0.00753441, "balance_loss_clip": 1.00899005, "balance_loss_mlp": 1.00016236, "epoch": 0.9731257139421632, "flos": 68972010117120.0, "grad_norm": 0.8261800955079551, "language_loss": 0.63907009, "learning_rate": 7.539108576140264e-09, "loss": 0.65723073, "num_input_tokens_seen": 174741550, "step": 8093, "time_per_iteration": 3.216064929962158 }, { "auxiliary_loss_clip": 0.01108867, "auxiliary_loss_mlp": 0.01024843, "balance_loss_clip": 1.04023433, "balance_loss_mlp": 1.01836085, "epoch": 0.9732459568328022, "flos": 18478841633280.0, "grad_norm": 2.0504127563384755, "language_loss": 0.7038753, "learning_rate": 7.471686893661732e-09, "loss": 0.7252124, "num_input_tokens_seen": 174759845, "step": 8094, "time_per_iteration": 3.6348602771759033 }, { "auxiliary_loss_clip": 0.01134704, "auxiliary_loss_mlp": 0.01026226, "balance_loss_clip": 1.04401946, "balance_loss_mlp": 1.01898372, "epoch": 0.9733661997234414, "flos": 20883886623360.0, "grad_norm": 1.7846509784246194, "language_loss": 0.64235985, "learning_rate": 7.4045674761442636e-09, "loss": 0.66396916, "num_input_tokens_seen": 174777175, "step": 8095, "time_per_iteration": 2.6099915504455566 }, { "auxiliary_loss_clip": 0.0116434, "auxiliary_loss_mlp": 0.00762111, "balance_loss_clip": 1.04541135, "balance_loss_mlp": 1.00033295, "epoch": 0.9734864426140805, "flos": 23766795175680.0, "grad_norm": 1.6945926792280717, "language_loss": 0.74286604, "learning_rate": 7.337750333769488e-09, "loss": 0.76213056, "num_input_tokens_seen": 174796980, "step": 8096, "time_per_iteration": 2.6134121417999268 }, { "auxiliary_loss_clip": 0.0114104, "auxiliary_loss_mlp": 0.01023215, "balance_loss_clip": 1.03988278, "balance_loss_mlp": 1.0160625, "epoch": 0.9736066855047195, "flos": 35042422176000.0, "grad_norm": 1.7554185162905058, "language_loss": 0.72705424, "learning_rate": 7.2712354766737425e-09, "loss": 0.7486968, "num_input_tokens_seen": 174817310, "step": 8097, "time_per_iteration": 2.76782488822937 }, { "auxiliary_loss_clip": 0.01113991, "auxiliary_loss_mlp": 0.01023467, "balance_loss_clip": 1.0431515, "balance_loss_mlp": 1.01661837, "epoch": 0.9737269283953586, "flos": 20410620001920.0, "grad_norm": 1.575664873686886, "language_loss": 0.80920982, "learning_rate": 7.2050229149469565e-09, "loss": 0.83058441, "num_input_tokens_seen": 174837320, "step": 8098, "time_per_iteration": 2.6716487407684326 }, { "auxiliary_loss_clip": 0.0112504, "auxiliary_loss_mlp": 0.01030298, "balance_loss_clip": 1.03831649, "balance_loss_mlp": 1.02287161, "epoch": 0.9738471712859977, "flos": 28911680847360.0, "grad_norm": 1.7679828878158725, "language_loss": 0.63426232, "learning_rate": 7.139112658633984e-09, "loss": 0.65581572, "num_input_tokens_seen": 174857470, "step": 8099, "time_per_iteration": 2.749349594116211 }, { "auxiliary_loss_clip": 0.01118199, "auxiliary_loss_mlp": 0.0102359, "balance_loss_clip": 1.04044533, "balance_loss_mlp": 1.01632762, "epoch": 0.9739674141766368, "flos": 27782326356480.0, "grad_norm": 2.0629026505610195, "language_loss": 0.70097077, "learning_rate": 7.073504717733048e-09, "loss": 0.72238868, "num_input_tokens_seen": 174877035, "step": 8100, "time_per_iteration": 2.7755634784698486 }, { "auxiliary_loss_clip": 0.01024955, "auxiliary_loss_mlp": 0.0100166, "balance_loss_clip": 1.01240873, "balance_loss_mlp": 1.00064123, "epoch": 0.9740876570672758, "flos": 68863057188480.0, "grad_norm": 0.7339323358517948, "language_loss": 0.57175195, "learning_rate": 7.008199102196855e-09, "loss": 0.59201813, "num_input_tokens_seen": 174938460, "step": 8101, "time_per_iteration": 3.2815632820129395 }, { "auxiliary_loss_clip": 0.01032645, "auxiliary_loss_mlp": 0.01000333, "balance_loss_clip": 1.00675654, "balance_loss_mlp": 0.99934959, "epoch": 0.974207899957915, "flos": 58236622646400.0, "grad_norm": 0.8249126286341723, "language_loss": 0.58906424, "learning_rate": 6.9431958219321464e-09, "loss": 0.60939395, "num_input_tokens_seen": 174994625, "step": 8102, "time_per_iteration": 3.2441694736480713 }, { "auxiliary_loss_clip": 0.01135501, "auxiliary_loss_mlp": 0.0102135, "balance_loss_clip": 1.04233003, "balance_loss_mlp": 1.01429582, "epoch": 0.9743281428485541, "flos": 22600057605120.0, "grad_norm": 1.4945001756368512, "language_loss": 0.77813798, "learning_rate": 6.878494886800146e-09, "loss": 0.79970652, "num_input_tokens_seen": 175015400, "step": 8103, "time_per_iteration": 2.7675180435180664 }, { "auxiliary_loss_clip": 0.01138428, "auxiliary_loss_mlp": 0.01029489, "balance_loss_clip": 1.0442431, "balance_loss_mlp": 1.02197289, "epoch": 0.9744483857391931, "flos": 20008815488640.0, "grad_norm": 2.002344926655349, "language_loss": 0.76143825, "learning_rate": 6.814096306615669e-09, "loss": 0.78311741, "num_input_tokens_seen": 175033540, "step": 8104, "time_per_iteration": 2.7644031047821045 }, { "auxiliary_loss_clip": 0.01140892, "auxiliary_loss_mlp": 0.01024959, "balance_loss_clip": 1.0399853, "balance_loss_mlp": 1.01796186, "epoch": 0.9745686286298323, "flos": 17675268520320.0, "grad_norm": 2.2891246721604803, "language_loss": 0.65606594, "learning_rate": 6.750000091148011e-09, "loss": 0.67772448, "num_input_tokens_seen": 175050835, "step": 8105, "time_per_iteration": 2.714402437210083 }, { "auxiliary_loss_clip": 0.01166987, "auxiliary_loss_mlp": 0.01025511, "balance_loss_clip": 1.04796791, "balance_loss_mlp": 1.01796508, "epoch": 0.9746888715204713, "flos": 29460252332160.0, "grad_norm": 2.0108515934955977, "language_loss": 0.72857213, "learning_rate": 6.686206250120729e-09, "loss": 0.7504971, "num_input_tokens_seen": 175072330, "step": 8106, "time_per_iteration": 2.7278361320495605 }, { "auxiliary_loss_clip": 0.01125647, "auxiliary_loss_mlp": 0.01022082, "balance_loss_clip": 1.03930378, "balance_loss_mlp": 1.01516449, "epoch": 0.9748091144111104, "flos": 18479308510080.0, "grad_norm": 1.6854119134987964, "language_loss": 0.74587727, "learning_rate": 6.622714793210749e-09, "loss": 0.76735461, "num_input_tokens_seen": 175091250, "step": 8107, "time_per_iteration": 2.689077138900757 }, { "auxiliary_loss_clip": 0.01165123, "auxiliary_loss_mlp": 0.01026138, "balance_loss_clip": 1.04551291, "balance_loss_mlp": 1.01892316, "epoch": 0.9749293573017496, "flos": 20665154753280.0, "grad_norm": 1.7145719279573164, "language_loss": 0.7860502, "learning_rate": 6.559525730050364e-09, "loss": 0.80796278, "num_input_tokens_seen": 175111350, "step": 8108, "time_per_iteration": 2.6869115829467773 }, { "auxiliary_loss_clip": 0.01125431, "auxiliary_loss_mlp": 0.01024877, "balance_loss_clip": 1.04283524, "balance_loss_mlp": 1.01794791, "epoch": 0.9750496001923886, "flos": 18478590238080.0, "grad_norm": 1.9009229129543723, "language_loss": 0.75472569, "learning_rate": 6.496639070224574e-09, "loss": 0.77622873, "num_input_tokens_seen": 175129835, "step": 8109, "time_per_iteration": 2.731154441833496 }, { "auxiliary_loss_clip": 0.01154906, "auxiliary_loss_mlp": 0.01025906, "balance_loss_clip": 1.04640293, "balance_loss_mlp": 1.01886415, "epoch": 0.9751698430830277, "flos": 19572967860480.0, "grad_norm": 2.09903623272339, "language_loss": 0.8371563, "learning_rate": 6.4340548232739714e-09, "loss": 0.85896438, "num_input_tokens_seen": 175146035, "step": 8110, "time_per_iteration": 2.696333169937134 }, { "auxiliary_loss_clip": 0.01128742, "auxiliary_loss_mlp": 0.01021259, "balance_loss_clip": 1.04198575, "balance_loss_mlp": 1.01379299, "epoch": 0.9752900859736668, "flos": 23550325862400.0, "grad_norm": 2.8387876805632923, "language_loss": 0.79138166, "learning_rate": 6.371772998692071e-09, "loss": 0.81288165, "num_input_tokens_seen": 175165290, "step": 8111, "time_per_iteration": 3.7230780124664307 }, { "auxiliary_loss_clip": 0.01125927, "auxiliary_loss_mlp": 0.01024113, "balance_loss_clip": 1.04051101, "balance_loss_mlp": 1.01685643, "epoch": 0.9754103288643059, "flos": 20303211358080.0, "grad_norm": 3.019132009698342, "language_loss": 0.64907354, "learning_rate": 6.309793605927094e-09, "loss": 0.67057395, "num_input_tokens_seen": 175183610, "step": 8112, "time_per_iteration": 2.7475764751434326 }, { "auxiliary_loss_clip": 0.01143398, "auxiliary_loss_mlp": 0.01024055, "balance_loss_clip": 1.04349804, "balance_loss_mlp": 1.01744795, "epoch": 0.975530571754945, "flos": 19350680544000.0, "grad_norm": 1.8785991568822407, "language_loss": 0.80222952, "learning_rate": 6.248116654381297e-09, "loss": 0.82390404, "num_input_tokens_seen": 175202080, "step": 8113, "time_per_iteration": 2.715064525604248 }, { "auxiliary_loss_clip": 0.01140204, "auxiliary_loss_mlp": 0.01023254, "balance_loss_clip": 1.04084516, "balance_loss_mlp": 1.01638806, "epoch": 0.9756508146455841, "flos": 23583399310080.0, "grad_norm": 1.8300342499621864, "language_loss": 0.72910714, "learning_rate": 6.186742153410751e-09, "loss": 0.75074172, "num_input_tokens_seen": 175221575, "step": 8114, "time_per_iteration": 2.7126028537750244 }, { "auxiliary_loss_clip": 0.0113594, "auxiliary_loss_mlp": 0.01026205, "balance_loss_clip": 1.04264486, "balance_loss_mlp": 1.01879561, "epoch": 0.9757710575362232, "flos": 22966921163520.0, "grad_norm": 2.41917967084347, "language_loss": 0.87562561, "learning_rate": 6.125670112326453e-09, "loss": 0.89724708, "num_input_tokens_seen": 175240835, "step": 8115, "time_per_iteration": 3.6568708419799805 }, { "auxiliary_loss_clip": 0.01151161, "auxiliary_loss_mlp": 0.01026614, "balance_loss_clip": 1.04149795, "balance_loss_mlp": 1.01925898, "epoch": 0.9758913004268622, "flos": 27966009530880.0, "grad_norm": 1.7284911357517998, "language_loss": 0.70486683, "learning_rate": 6.064900540392548e-09, "loss": 0.72664458, "num_input_tokens_seen": 175262930, "step": 8116, "time_per_iteration": 2.716320753097534 }, { "auxiliary_loss_clip": 0.01130613, "auxiliary_loss_mlp": 0.01023022, "balance_loss_clip": 1.04308248, "balance_loss_mlp": 1.016469, "epoch": 0.9760115433175014, "flos": 22200156512640.0, "grad_norm": 4.728989930940953, "language_loss": 0.79073912, "learning_rate": 6.0044334468278835e-09, "loss": 0.81227547, "num_input_tokens_seen": 175282275, "step": 8117, "time_per_iteration": 2.669156312942505 }, { "auxiliary_loss_clip": 0.01109827, "auxiliary_loss_mlp": 0.01021667, "balance_loss_clip": 1.03847146, "balance_loss_mlp": 1.01469326, "epoch": 0.9761317862081405, "flos": 26250736389120.0, "grad_norm": 2.040465469355562, "language_loss": 0.7147063, "learning_rate": 5.944268840805345e-09, "loss": 0.73602128, "num_input_tokens_seen": 175303020, "step": 8118, "time_per_iteration": 3.776001453399658 }, { "auxiliary_loss_clip": 0.01116743, "auxiliary_loss_mlp": 0.01022024, "balance_loss_clip": 1.03886795, "balance_loss_mlp": 1.01520491, "epoch": 0.9762520290987795, "flos": 26575440359040.0, "grad_norm": 1.989429660408004, "language_loss": 0.63801301, "learning_rate": 5.88440673145163e-09, "loss": 0.6594007, "num_input_tokens_seen": 175324070, "step": 8119, "time_per_iteration": 2.734724998474121 }, { "auxiliary_loss_clip": 0.01149557, "auxiliary_loss_mlp": 0.01026851, "balance_loss_clip": 1.04575324, "balance_loss_mlp": 1.02028859, "epoch": 0.9763722719894187, "flos": 18005036307840.0, "grad_norm": 2.0426077305484363, "language_loss": 0.8259353, "learning_rate": 5.824847127848142e-09, "loss": 0.8476994, "num_input_tokens_seen": 175342595, "step": 8120, "time_per_iteration": 3.487734079360962 }, { "auxiliary_loss_clip": 0.01112863, "auxiliary_loss_mlp": 0.01025172, "balance_loss_clip": 1.04198945, "balance_loss_mlp": 1.01790285, "epoch": 0.9764925148800577, "flos": 22455660931200.0, "grad_norm": 1.8685419702788058, "language_loss": 0.78859097, "learning_rate": 5.765590039029433e-09, "loss": 0.80997127, "num_input_tokens_seen": 175361915, "step": 8121, "time_per_iteration": 2.7872447967529297 }, { "auxiliary_loss_clip": 0.01166608, "auxiliary_loss_mlp": 0.01026873, "balance_loss_clip": 1.04851007, "balance_loss_mlp": 1.01954818, "epoch": 0.9766127577706968, "flos": 36757084786560.0, "grad_norm": 1.7874118830098242, "language_loss": 0.71315247, "learning_rate": 5.706635473985422e-09, "loss": 0.73508728, "num_input_tokens_seen": 175385785, "step": 8122, "time_per_iteration": 2.7344343662261963 }, { "auxiliary_loss_clip": 0.01151924, "auxiliary_loss_mlp": 0.01022649, "balance_loss_clip": 1.04526532, "balance_loss_mlp": 1.01608014, "epoch": 0.976733000661336, "flos": 22309971367680.0, "grad_norm": 2.3064775039918843, "language_loss": 0.84983075, "learning_rate": 5.6479834416591764e-09, "loss": 0.87157643, "num_input_tokens_seen": 175405145, "step": 8123, "time_per_iteration": 2.646451234817505 }, { "auxiliary_loss_clip": 0.01150085, "auxiliary_loss_mlp": 0.00762808, "balance_loss_clip": 1.04493523, "balance_loss_mlp": 1.00034297, "epoch": 0.976853243551975, "flos": 25810938264960.0, "grad_norm": 1.829347339933807, "language_loss": 0.68508476, "learning_rate": 5.589633950947803e-09, "loss": 0.70421368, "num_input_tokens_seen": 175422645, "step": 8124, "time_per_iteration": 2.705111503601074 }, { "auxiliary_loss_clip": 0.01133998, "auxiliary_loss_mlp": 0.0102894, "balance_loss_clip": 1.04187322, "balance_loss_mlp": 1.02106035, "epoch": 0.9769734864426141, "flos": 21397445326080.0, "grad_norm": 2.189332987874214, "language_loss": 0.69934523, "learning_rate": 5.5315870107035535e-09, "loss": 0.72097468, "num_input_tokens_seen": 175440695, "step": 8125, "time_per_iteration": 2.6548855304718018 }, { "auxiliary_loss_clip": 0.01134196, "auxiliary_loss_mlp": 0.01025833, "balance_loss_clip": 1.0448879, "balance_loss_mlp": 1.01911521, "epoch": 0.9770937293332532, "flos": 13990977584640.0, "grad_norm": 2.115374199716958, "language_loss": 0.78898406, "learning_rate": 5.473842629731607e-09, "loss": 0.81058431, "num_input_tokens_seen": 175459195, "step": 8126, "time_per_iteration": 2.8073999881744385 }, { "auxiliary_loss_clip": 0.01143917, "auxiliary_loss_mlp": 0.0076242, "balance_loss_clip": 1.04227924, "balance_loss_mlp": 1.00035322, "epoch": 0.9772139722238923, "flos": 17931994001280.0, "grad_norm": 2.1692997309203936, "language_loss": 0.77924281, "learning_rate": 5.416400816792066e-09, "loss": 0.79830623, "num_input_tokens_seen": 175476710, "step": 8127, "time_per_iteration": 2.6686689853668213 }, { "auxiliary_loss_clip": 0.01162865, "auxiliary_loss_mlp": 0.01027232, "balance_loss_clip": 1.04358149, "balance_loss_mlp": 1.02013326, "epoch": 0.9773342151145313, "flos": 20446171488000.0, "grad_norm": 2.8843343434377693, "language_loss": 0.7865994, "learning_rate": 5.359261580598407e-09, "loss": 0.80850035, "num_input_tokens_seen": 175492550, "step": 8128, "time_per_iteration": 2.570988178253174 }, { "auxiliary_loss_clip": 0.01153326, "auxiliary_loss_mlp": 0.01025125, "balance_loss_clip": 1.04546785, "balance_loss_mlp": 1.01720929, "epoch": 0.9774544580051704, "flos": 11837306949120.0, "grad_norm": 3.3427491954543767, "language_loss": 0.78530645, "learning_rate": 5.302424929819027e-09, "loss": 0.807091, "num_input_tokens_seen": 175506560, "step": 8129, "time_per_iteration": 2.6053876876831055 }, { "auxiliary_loss_clip": 0.01153821, "auxiliary_loss_mlp": 0.01025449, "balance_loss_clip": 1.04183173, "balance_loss_mlp": 1.01781917, "epoch": 0.9775747008958096, "flos": 13479932833920.0, "grad_norm": 2.20199729506144, "language_loss": 0.73013997, "learning_rate": 5.24589087307592e-09, "loss": 0.75193268, "num_input_tokens_seen": 175524180, "step": 8130, "time_per_iteration": 2.5992488861083984 }, { "auxiliary_loss_clip": 0.01166126, "auxiliary_loss_mlp": 0.01027368, "balance_loss_clip": 1.04419351, "balance_loss_mlp": 1.01988792, "epoch": 0.9776949437864486, "flos": 59532314042880.0, "grad_norm": 1.578165977394243, "language_loss": 0.65154976, "learning_rate": 5.189659418944891e-09, "loss": 0.67348468, "num_input_tokens_seen": 175554355, "step": 8131, "time_per_iteration": 2.973022937774658 }, { "auxiliary_loss_clip": 0.01163874, "auxiliary_loss_mlp": 0.01021509, "balance_loss_clip": 1.04551375, "balance_loss_mlp": 1.01459217, "epoch": 0.9778151866770877, "flos": 21178605715200.0, "grad_norm": 2.3269979611978053, "language_loss": 0.78295535, "learning_rate": 5.133730575956674e-09, "loss": 0.80480915, "num_input_tokens_seen": 175574025, "step": 8132, "time_per_iteration": 2.673265218734741 }, { "auxiliary_loss_clip": 0.01139395, "auxiliary_loss_mlp": 0.01026787, "balance_loss_clip": 1.04282904, "balance_loss_mlp": 1.01973605, "epoch": 0.9779354295677268, "flos": 20886795624960.0, "grad_norm": 2.0552605638702452, "language_loss": 0.72262281, "learning_rate": 5.0781043525953696e-09, "loss": 0.74428463, "num_input_tokens_seen": 175592090, "step": 8133, "time_per_iteration": 2.657054901123047 }, { "auxiliary_loss_clip": 0.01133866, "auxiliary_loss_mlp": 0.01025816, "balance_loss_clip": 1.04449224, "balance_loss_mlp": 1.01935816, "epoch": 0.9780556724583659, "flos": 23440618748160.0, "grad_norm": 1.6606710159370175, "language_loss": 0.74264109, "learning_rate": 5.0227807572995605e-09, "loss": 0.76423794, "num_input_tokens_seen": 175614065, "step": 8134, "time_per_iteration": 2.6526482105255127 }, { "auxiliary_loss_clip": 0.01139703, "auxiliary_loss_mlp": 0.01022487, "balance_loss_clip": 1.04274726, "balance_loss_mlp": 1.01537919, "epoch": 0.9781759153490049, "flos": 20923244951040.0, "grad_norm": 2.3448828436760505, "language_loss": 0.67721325, "learning_rate": 4.967759798461646e-09, "loss": 0.69883519, "num_input_tokens_seen": 175632410, "step": 8135, "time_per_iteration": 2.6077494621276855 }, { "auxiliary_loss_clip": 0.01162097, "auxiliary_loss_mlp": 0.01023642, "balance_loss_clip": 1.04491401, "balance_loss_mlp": 1.01690614, "epoch": 0.9782961582396441, "flos": 28293191539200.0, "grad_norm": 3.6575794373133674, "language_loss": 0.75376016, "learning_rate": 4.913041484428282e-09, "loss": 0.7756176, "num_input_tokens_seen": 175652885, "step": 8136, "time_per_iteration": 2.672907829284668 }, { "auxiliary_loss_clip": 0.01152819, "auxiliary_loss_mlp": 0.01025443, "balance_loss_clip": 1.04418004, "balance_loss_mlp": 1.01909506, "epoch": 0.9784164011302832, "flos": 25552955808000.0, "grad_norm": 2.1075836981924225, "language_loss": 0.74337792, "learning_rate": 4.858625823500384e-09, "loss": 0.76516056, "num_input_tokens_seen": 175670585, "step": 8137, "time_per_iteration": 2.5978081226348877 }, { "auxiliary_loss_clip": 0.01155742, "auxiliary_loss_mlp": 0.01023795, "balance_loss_clip": 1.04347515, "balance_loss_mlp": 1.01701546, "epoch": 0.9785366440209222, "flos": 29965945956480.0, "grad_norm": 3.1853053279706005, "language_loss": 0.73423994, "learning_rate": 4.80451282393246e-09, "loss": 0.75603533, "num_input_tokens_seen": 175690570, "step": 8138, "time_per_iteration": 3.8139994144439697 }, { "auxiliary_loss_clip": 0.0114037, "auxiliary_loss_mlp": 0.01027798, "balance_loss_clip": 1.04282141, "balance_loss_mlp": 1.02091014, "epoch": 0.9786568869115614, "flos": 32343591847680.0, "grad_norm": 1.9869304619092698, "language_loss": 0.67708361, "learning_rate": 4.750702493933722e-09, "loss": 0.69876528, "num_input_tokens_seen": 175710455, "step": 8139, "time_per_iteration": 2.7291347980499268 }, { "auxiliary_loss_clip": 0.0113469, "auxiliary_loss_mlp": 0.00761959, "balance_loss_clip": 1.04277849, "balance_loss_mlp": 1.00031424, "epoch": 0.9787771298022004, "flos": 23331414424320.0, "grad_norm": 1.8392969592954262, "language_loss": 0.85545707, "learning_rate": 4.697194841666974e-09, "loss": 0.87442362, "num_input_tokens_seen": 175729380, "step": 8140, "time_per_iteration": 2.7855002880096436 }, { "auxiliary_loss_clip": 0.01153598, "auxiliary_loss_mlp": 0.01030994, "balance_loss_clip": 1.04464889, "balance_loss_mlp": 1.02316809, "epoch": 0.9788973726928395, "flos": 21468548298240.0, "grad_norm": 1.7732525107583232, "language_loss": 0.81643063, "learning_rate": 4.6439898752492764e-09, "loss": 0.83827651, "num_input_tokens_seen": 175749520, "step": 8141, "time_per_iteration": 3.8103091716766357 }, { "auxiliary_loss_clip": 0.01054469, "auxiliary_loss_mlp": 0.0075339, "balance_loss_clip": 1.01008213, "balance_loss_mlp": 1.00018883, "epoch": 0.9790176155834787, "flos": 68897459439360.0, "grad_norm": 0.7504176875704255, "language_loss": 0.63695931, "learning_rate": 4.591087602751731e-09, "loss": 0.65503788, "num_input_tokens_seen": 175811380, "step": 8142, "time_per_iteration": 3.3274405002593994 }, { "auxiliary_loss_clip": 0.01149014, "auxiliary_loss_mlp": 0.01027644, "balance_loss_clip": 1.04427743, "balance_loss_mlp": 1.02076519, "epoch": 0.9791378584741177, "flos": 21430877909760.0, "grad_norm": 1.7510427093260563, "language_loss": 0.72092056, "learning_rate": 4.538488032199916e-09, "loss": 0.74268711, "num_input_tokens_seen": 175829480, "step": 8143, "time_per_iteration": 2.612973928451538 }, { "auxiliary_loss_clip": 0.01154885, "auxiliary_loss_mlp": 0.01022447, "balance_loss_clip": 1.04258204, "balance_loss_mlp": 1.0153358, "epoch": 0.9792581013647568, "flos": 20153032594560.0, "grad_norm": 1.9712248597606088, "language_loss": 0.68668032, "learning_rate": 4.486191171572784e-09, "loss": 0.70845366, "num_input_tokens_seen": 175846750, "step": 8144, "time_per_iteration": 3.537792921066284 }, { "auxiliary_loss_clip": 0.01153754, "auxiliary_loss_mlp": 0.01028266, "balance_loss_clip": 1.04490113, "balance_loss_mlp": 1.02094316, "epoch": 0.9793783442553959, "flos": 23728191033600.0, "grad_norm": 1.5234476759349203, "language_loss": 0.77439106, "learning_rate": 4.434197028803766e-09, "loss": 0.79621124, "num_input_tokens_seen": 175865975, "step": 8145, "time_per_iteration": 2.611265182495117 }, { "auxiliary_loss_clip": 0.01129741, "auxiliary_loss_mlp": 0.01026582, "balance_loss_clip": 1.04106593, "balance_loss_mlp": 1.0198704, "epoch": 0.979498587146035, "flos": 23038742407680.0, "grad_norm": 2.041650679665563, "language_loss": 0.82066077, "learning_rate": 4.3825056117805514e-09, "loss": 0.842224, "num_input_tokens_seen": 175881860, "step": 8146, "time_per_iteration": 3.6784520149230957 }, { "auxiliary_loss_clip": 0.01165425, "auxiliary_loss_mlp": 0.01030603, "balance_loss_clip": 1.04501474, "balance_loss_mlp": 1.02219307, "epoch": 0.979618830036674, "flos": 14318841951360.0, "grad_norm": 2.716447414150671, "language_loss": 0.79114163, "learning_rate": 4.331116928344425e-09, "loss": 0.81310189, "num_input_tokens_seen": 175898175, "step": 8147, "time_per_iteration": 2.5783259868621826 }, { "auxiliary_loss_clip": 0.01141224, "auxiliary_loss_mlp": 0.0076312, "balance_loss_clip": 1.04218304, "balance_loss_mlp": 1.0003314, "epoch": 0.9797390729273132, "flos": 16727514215040.0, "grad_norm": 2.2437734598130556, "language_loss": 0.62738454, "learning_rate": 4.28003098629115e-09, "loss": 0.64642793, "num_input_tokens_seen": 175914310, "step": 8148, "time_per_iteration": 2.615752935409546 }, { "auxiliary_loss_clip": 0.01119415, "auxiliary_loss_mlp": 0.01023214, "balance_loss_clip": 1.03613353, "balance_loss_mlp": 1.01664305, "epoch": 0.9798593158179523, "flos": 24532661986560.0, "grad_norm": 2.0433918952550782, "language_loss": 0.78633165, "learning_rate": 4.229247793370305e-09, "loss": 0.80775797, "num_input_tokens_seen": 175933435, "step": 8149, "time_per_iteration": 2.7146196365356445 }, { "auxiliary_loss_clip": 0.01168512, "auxiliary_loss_mlp": 0.01027161, "balance_loss_clip": 1.04755712, "balance_loss_mlp": 1.02014565, "epoch": 0.9799795587085913, "flos": 27308808339840.0, "grad_norm": 1.724979543815382, "language_loss": 0.70522732, "learning_rate": 4.178767357285951e-09, "loss": 0.72718406, "num_input_tokens_seen": 175955065, "step": 8150, "time_per_iteration": 2.681532382965088 }, { "auxiliary_loss_clip": 0.01150771, "auxiliary_loss_mlp": 0.00761865, "balance_loss_clip": 1.04304504, "balance_loss_mlp": 1.00035512, "epoch": 0.9800998015992305, "flos": 26286575184000.0, "grad_norm": 2.170184530775519, "language_loss": 0.71411103, "learning_rate": 4.128589685695516e-09, "loss": 0.73323739, "num_input_tokens_seen": 175975490, "step": 8151, "time_per_iteration": 2.6793222427368164 }, { "auxiliary_loss_clip": 0.01167683, "auxiliary_loss_mlp": 0.01024958, "balance_loss_clip": 1.04742479, "balance_loss_mlp": 1.01735806, "epoch": 0.9802200444898695, "flos": 16723635546240.0, "grad_norm": 1.8054180062239564, "language_loss": 0.84315479, "learning_rate": 4.078714786211135e-09, "loss": 0.86508119, "num_input_tokens_seen": 175991340, "step": 8152, "time_per_iteration": 2.573296070098877 }, { "auxiliary_loss_clip": 0.0114703, "auxiliary_loss_mlp": 0.01020789, "balance_loss_clip": 1.04323006, "balance_loss_mlp": 1.01462889, "epoch": 0.9803402873805086, "flos": 24900459298560.0, "grad_norm": 1.7545253710231965, "language_loss": 0.76518261, "learning_rate": 4.029142666398977e-09, "loss": 0.78686088, "num_input_tokens_seen": 176011505, "step": 8153, "time_per_iteration": 2.631821393966675 }, { "auxiliary_loss_clip": 0.01162652, "auxiliary_loss_mlp": 0.01026527, "balance_loss_clip": 1.0458796, "balance_loss_mlp": 1.01963651, "epoch": 0.9804605302711478, "flos": 22564937082240.0, "grad_norm": 2.0869413702250035, "language_loss": 0.8021487, "learning_rate": 3.979873333778805e-09, "loss": 0.82404053, "num_input_tokens_seen": 176029680, "step": 8154, "time_per_iteration": 2.588289260864258 }, { "auxiliary_loss_clip": 0.01143204, "auxiliary_loss_mlp": 0.01024481, "balance_loss_clip": 1.04434109, "balance_loss_mlp": 1.01730776, "epoch": 0.9805807731617868, "flos": 38905368382080.0, "grad_norm": 2.278337699844078, "language_loss": 0.73996007, "learning_rate": 3.930906795824862e-09, "loss": 0.76163691, "num_input_tokens_seen": 176050355, "step": 8155, "time_per_iteration": 2.784166097640991 }, { "auxiliary_loss_clip": 0.0114653, "auxiliary_loss_mlp": 0.01022604, "balance_loss_clip": 1.04263759, "balance_loss_mlp": 1.01569867, "epoch": 0.9807010160524259, "flos": 17821999578240.0, "grad_norm": 3.131827168891957, "language_loss": 0.77028275, "learning_rate": 3.882243059965207e-09, "loss": 0.79197407, "num_input_tokens_seen": 176068070, "step": 8156, "time_per_iteration": 2.5459935665130615 }, { "auxiliary_loss_clip": 0.01144888, "auxiliary_loss_mlp": 0.01025085, "balance_loss_clip": 1.04129124, "balance_loss_mlp": 1.01779234, "epoch": 0.980821258943065, "flos": 13552975140480.0, "grad_norm": 2.240921791403321, "language_loss": 0.65468901, "learning_rate": 3.833882133582156e-09, "loss": 0.67638874, "num_input_tokens_seen": 176083730, "step": 8157, "time_per_iteration": 2.6086947917938232 }, { "auxiliary_loss_clip": 0.0115451, "auxiliary_loss_mlp": 0.01024857, "balance_loss_clip": 1.04410458, "balance_loss_mlp": 1.01769519, "epoch": 0.9809415018337041, "flos": 21689794120320.0, "grad_norm": 2.2103862025596097, "language_loss": 0.78523493, "learning_rate": 3.785824024012285e-09, "loss": 0.80702865, "num_input_tokens_seen": 176102730, "step": 8158, "time_per_iteration": 2.606032133102417 }, { "auxiliary_loss_clip": 0.01131412, "auxiliary_loss_mlp": 0.01024078, "balance_loss_clip": 1.04399967, "balance_loss_mlp": 1.01765585, "epoch": 0.9810617447243432, "flos": 23294857357440.0, "grad_norm": 1.716505167231159, "language_loss": 0.78595376, "learning_rate": 3.738068738545541e-09, "loss": 0.80750871, "num_input_tokens_seen": 176121815, "step": 8159, "time_per_iteration": 2.676825523376465 }, { "auxiliary_loss_clip": 0.01154198, "auxiliary_loss_mlp": 0.01027006, "balance_loss_clip": 1.04370856, "balance_loss_mlp": 1.01957309, "epoch": 0.9811819876149822, "flos": 18332038748160.0, "grad_norm": 2.1914962391641635, "language_loss": 0.78598744, "learning_rate": 3.6906162844265733e-09, "loss": 0.80779952, "num_input_tokens_seen": 176138900, "step": 8160, "time_per_iteration": 2.5867042541503906 }, { "auxiliary_loss_clip": 0.0113178, "auxiliary_loss_mlp": 0.01023684, "balance_loss_clip": 1.04132652, "balance_loss_mlp": 1.0158366, "epoch": 0.9813022305056214, "flos": 22601961025920.0, "grad_norm": 1.8135674108100337, "language_loss": 0.70616913, "learning_rate": 3.643466668853845e-09, "loss": 0.72772378, "num_input_tokens_seen": 176156925, "step": 8161, "time_per_iteration": 2.6601109504699707 }, { "auxiliary_loss_clip": 0.0113848, "auxiliary_loss_mlp": 0.0102534, "balance_loss_clip": 1.04208255, "balance_loss_mlp": 1.01816964, "epoch": 0.9814224733962604, "flos": 25413335642880.0, "grad_norm": 1.9445784665678776, "language_loss": 0.75234807, "learning_rate": 3.59661989898008e-09, "loss": 0.77398622, "num_input_tokens_seen": 176177980, "step": 8162, "time_per_iteration": 2.7560431957244873 }, { "auxiliary_loss_clip": 0.01118281, "auxiliary_loss_mlp": 0.01024843, "balance_loss_clip": 1.04307997, "balance_loss_mlp": 1.0184145, "epoch": 0.9815427162868995, "flos": 25007185584000.0, "grad_norm": 1.6659894771764059, "language_loss": 0.7681272, "learning_rate": 3.5500759819115934e-09, "loss": 0.78955847, "num_input_tokens_seen": 176198345, "step": 8163, "time_per_iteration": 3.583566904067993 }, { "auxiliary_loss_clip": 0.0116746, "auxiliary_loss_mlp": 0.0102327, "balance_loss_clip": 1.04692888, "balance_loss_mlp": 1.01607835, "epoch": 0.9816629591775387, "flos": 20662604887680.0, "grad_norm": 2.151600704198693, "language_loss": 0.81294388, "learning_rate": 3.5038349247094034e-09, "loss": 0.83485121, "num_input_tokens_seen": 176215605, "step": 8164, "time_per_iteration": 2.5667433738708496 }, { "auxiliary_loss_clip": 0.01137423, "auxiliary_loss_mlp": 0.01027245, "balance_loss_clip": 1.04263759, "balance_loss_mlp": 1.02066433, "epoch": 0.9817832020681777, "flos": 17712220636800.0, "grad_norm": 2.023422904360436, "language_loss": 0.7771191, "learning_rate": 3.4578967343878994e-09, "loss": 0.79876578, "num_input_tokens_seen": 176231810, "step": 8165, "time_per_iteration": 2.6130032539367676 }, { "auxiliary_loss_clip": 0.01135123, "auxiliary_loss_mlp": 0.01028815, "balance_loss_clip": 1.04269052, "balance_loss_mlp": 1.02179396, "epoch": 0.9819034449588168, "flos": 22530032040960.0, "grad_norm": 1.7365857324865317, "language_loss": 0.80904609, "learning_rate": 3.4122614179161733e-09, "loss": 0.8306855, "num_input_tokens_seen": 176251770, "step": 8166, "time_per_iteration": 2.6961042881011963 }, { "auxiliary_loss_clip": 0.01108399, "auxiliary_loss_mlp": 0.01022677, "balance_loss_clip": 1.03665566, "balance_loss_mlp": 1.01570892, "epoch": 0.9820236878494559, "flos": 20011221699840.0, "grad_norm": 1.6523674337983187, "language_loss": 0.78132474, "learning_rate": 3.36692898221691e-09, "loss": 0.80263549, "num_input_tokens_seen": 176270135, "step": 8167, "time_per_iteration": 3.6654086112976074 }, { "auxiliary_loss_clip": 0.01150697, "auxiliary_loss_mlp": 0.01026574, "balance_loss_clip": 1.04295719, "balance_loss_mlp": 1.01986802, "epoch": 0.982143930740095, "flos": 18807316531200.0, "grad_norm": 3.6790937630067595, "language_loss": 0.73610651, "learning_rate": 3.3218994341668305e-09, "loss": 0.75787914, "num_input_tokens_seen": 176289065, "step": 8168, "time_per_iteration": 2.687021017074585 }, { "auxiliary_loss_clip": 0.01163809, "auxiliary_loss_mlp": 0.01028423, "balance_loss_clip": 1.04731798, "balance_loss_mlp": 1.02141356, "epoch": 0.982264173630734, "flos": 26578026138240.0, "grad_norm": 1.769443498432387, "language_loss": 0.75702226, "learning_rate": 3.2771727805971373e-09, "loss": 0.77894461, "num_input_tokens_seen": 176310450, "step": 8169, "time_per_iteration": 2.646388530731201 }, { "auxiliary_loss_clip": 0.01101166, "auxiliary_loss_mlp": 0.01024212, "balance_loss_clip": 1.03610098, "balance_loss_mlp": 1.01682115, "epoch": 0.9823844165213732, "flos": 22014462176640.0, "grad_norm": 1.6671323148999326, "language_loss": 0.76979601, "learning_rate": 3.232749028292847e-09, "loss": 0.79104978, "num_input_tokens_seen": 176327415, "step": 8170, "time_per_iteration": 3.6121017932891846 }, { "auxiliary_loss_clip": 0.01166394, "auxiliary_loss_mlp": 0.01023486, "balance_loss_clip": 1.04530311, "balance_loss_mlp": 1.0158025, "epoch": 0.9825046594120123, "flos": 21908166854400.0, "grad_norm": 2.146470327826388, "language_loss": 0.88312715, "learning_rate": 3.188628183992792e-09, "loss": 0.90502596, "num_input_tokens_seen": 176347680, "step": 8171, "time_per_iteration": 2.6168394088745117 }, { "auxiliary_loss_clip": 0.01054955, "auxiliary_loss_mlp": 0.01001964, "balance_loss_clip": 1.00927854, "balance_loss_mlp": 1.00093925, "epoch": 0.9826249023026513, "flos": 59494610718720.0, "grad_norm": 0.7336843186397584, "language_loss": 0.62459928, "learning_rate": 3.1448102543902844e-09, "loss": 0.64516848, "num_input_tokens_seen": 176411595, "step": 8172, "time_per_iteration": 4.111817121505737 }, { "auxiliary_loss_clip": 0.01132792, "auxiliary_loss_mlp": 0.01022126, "balance_loss_clip": 1.04299104, "balance_loss_mlp": 1.01517284, "epoch": 0.9827451451932905, "flos": 16071031296000.0, "grad_norm": 2.101722486819739, "language_loss": 0.67670482, "learning_rate": 3.1012952461324515e-09, "loss": 0.69825399, "num_input_tokens_seen": 176430570, "step": 8173, "time_per_iteration": 2.6133477687835693 }, { "auxiliary_loss_clip": 0.01147431, "auxiliary_loss_mlp": 0.01027757, "balance_loss_clip": 1.04527426, "balance_loss_mlp": 1.02082229, "epoch": 0.9828653880839295, "flos": 20262775622400.0, "grad_norm": 2.245755828588393, "language_loss": 0.73695624, "learning_rate": 3.0580831658204575e-09, "loss": 0.75870812, "num_input_tokens_seen": 176448150, "step": 8174, "time_per_iteration": 2.630075454711914 }, { "auxiliary_loss_clip": 0.01147295, "auxiliary_loss_mlp": 0.01019051, "balance_loss_clip": 1.04444718, "balance_loss_mlp": 1.01251793, "epoch": 0.9829856309745686, "flos": 21616141282560.0, "grad_norm": 1.6508148042219315, "language_loss": 0.77834862, "learning_rate": 3.015174020009281e-09, "loss": 0.80001211, "num_input_tokens_seen": 176467475, "step": 8175, "time_per_iteration": 2.6214518547058105 }, { "auxiliary_loss_clip": 0.01124273, "auxiliary_loss_mlp": 0.01031842, "balance_loss_clip": 1.03906798, "balance_loss_mlp": 1.02500272, "epoch": 0.9831058738652078, "flos": 23764209396480.0, "grad_norm": 2.0312714935722607, "language_loss": 0.74871171, "learning_rate": 2.9725678152086043e-09, "loss": 0.77027291, "num_input_tokens_seen": 176486045, "step": 8176, "time_per_iteration": 2.7512712478637695 }, { "auxiliary_loss_clip": 0.01126134, "auxiliary_loss_mlp": 0.0102973, "balance_loss_clip": 1.04046559, "balance_loss_mlp": 1.02238345, "epoch": 0.9832261167558468, "flos": 11320911072000.0, "grad_norm": 2.6214574966197413, "language_loss": 0.82106793, "learning_rate": 2.930264557881257e-09, "loss": 0.84262657, "num_input_tokens_seen": 176501230, "step": 8177, "time_per_iteration": 2.5971169471740723 }, { "auxiliary_loss_clip": 0.01063101, "auxiliary_loss_mlp": 0.01002485, "balance_loss_clip": 1.00931525, "balance_loss_mlp": 1.0014534, "epoch": 0.9833463596464859, "flos": 60000304343040.0, "grad_norm": 0.8355664303080811, "language_loss": 0.5815894, "learning_rate": 2.8882642544452163e-09, "loss": 0.60224527, "num_input_tokens_seen": 176565955, "step": 8178, "time_per_iteration": 3.1807310581207275 }, { "auxiliary_loss_clip": 0.01128343, "auxiliary_loss_mlp": 0.01021883, "balance_loss_clip": 1.04048455, "balance_loss_mlp": 1.01508164, "epoch": 0.983466602537125, "flos": 13626699805440.0, "grad_norm": 2.4619763312879477, "language_loss": 0.74798977, "learning_rate": 2.8465669112716083e-09, "loss": 0.76949197, "num_input_tokens_seen": 176583480, "step": 8179, "time_per_iteration": 2.6626791954040527 }, { "auxiliary_loss_clip": 0.01153368, "auxiliary_loss_mlp": 0.00762445, "balance_loss_clip": 1.04414117, "balance_loss_mlp": 1.00032139, "epoch": 0.9835868454277641, "flos": 22926844563840.0, "grad_norm": 2.1377485226672217, "language_loss": 0.76632738, "learning_rate": 2.8051725346858177e-09, "loss": 0.78548551, "num_input_tokens_seen": 176603740, "step": 8180, "time_per_iteration": 2.6797678470611572 }, { "auxiliary_loss_clip": 0.01164841, "auxiliary_loss_mlp": 0.0102717, "balance_loss_clip": 1.04305196, "balance_loss_mlp": 1.02005041, "epoch": 0.9837070883184031, "flos": 27673409341440.0, "grad_norm": 2.4013370722064864, "language_loss": 0.70984572, "learning_rate": 2.7640811309674883e-09, "loss": 0.73176587, "num_input_tokens_seen": 176623240, "step": 8181, "time_per_iteration": 2.6368725299835205 }, { "auxiliary_loss_clip": 0.01113293, "auxiliary_loss_mlp": 0.01021423, "balance_loss_clip": 1.04053319, "balance_loss_mlp": 1.0144875, "epoch": 0.9838273312090423, "flos": 29241951425280.0, "grad_norm": 1.5891921844419812, "language_loss": 0.81033969, "learning_rate": 2.7232927063498557e-09, "loss": 0.83168685, "num_input_tokens_seen": 176643615, "step": 8182, "time_per_iteration": 2.7740886211395264 }, { "auxiliary_loss_clip": 0.01150438, "auxiliary_loss_mlp": 0.01024916, "balance_loss_clip": 1.04337502, "balance_loss_mlp": 1.0178231, "epoch": 0.9839475740996814, "flos": 40110207304320.0, "grad_norm": 1.80264757106684, "language_loss": 0.68442631, "learning_rate": 2.682807267020859e-09, "loss": 0.70617986, "num_input_tokens_seen": 176666375, "step": 8183, "time_per_iteration": 2.7796945571899414 }, { "auxiliary_loss_clip": 0.01151021, "auxiliary_loss_mlp": 0.01020791, "balance_loss_clip": 1.04413521, "balance_loss_mlp": 1.01413631, "epoch": 0.9840678169903204, "flos": 24169389788160.0, "grad_norm": 1.6380351457562843, "language_loss": 0.62336528, "learning_rate": 2.642624819121808e-09, "loss": 0.64508337, "num_input_tokens_seen": 176686525, "step": 8184, "time_per_iteration": 2.6401853561401367 }, { "auxiliary_loss_clip": 0.01133665, "auxiliary_loss_mlp": 0.01024827, "balance_loss_clip": 1.04231811, "balance_loss_mlp": 1.01866651, "epoch": 0.9841880598809596, "flos": 14684484447360.0, "grad_norm": 2.1810914647554642, "language_loss": 0.61945438, "learning_rate": 2.6027453687487154e-09, "loss": 0.64103925, "num_input_tokens_seen": 176703615, "step": 8185, "time_per_iteration": 2.6136081218719482 }, { "auxiliary_loss_clip": 0.01137268, "auxiliary_loss_mlp": 0.01028476, "balance_loss_clip": 1.04200494, "balance_loss_mlp": 1.02113318, "epoch": 0.9843083027715986, "flos": 22344768668160.0, "grad_norm": 2.8251043799353033, "language_loss": 0.53834027, "learning_rate": 2.5631689219509643e-09, "loss": 0.55999768, "num_input_tokens_seen": 176722295, "step": 8186, "time_per_iteration": 2.6506192684173584 }, { "auxiliary_loss_clip": 0.01137055, "auxiliary_loss_mlp": 0.01022846, "balance_loss_clip": 1.04408693, "balance_loss_mlp": 1.01623893, "epoch": 0.9844285456622377, "flos": 21800111765760.0, "grad_norm": 1.7416033447823667, "language_loss": 0.83046269, "learning_rate": 2.523895484732197e-09, "loss": 0.85206163, "num_input_tokens_seen": 176741750, "step": 8187, "time_per_iteration": 2.6487839221954346 }, { "auxiliary_loss_clip": 0.01156342, "auxiliary_loss_mlp": 0.01024429, "balance_loss_clip": 1.0439806, "balance_loss_mlp": 1.01732397, "epoch": 0.9845487885528769, "flos": 18035380321920.0, "grad_norm": 1.8671000699746907, "language_loss": 0.7460767, "learning_rate": 2.4849250630505357e-09, "loss": 0.76788449, "num_input_tokens_seen": 176759995, "step": 8188, "time_per_iteration": 2.6001384258270264 }, { "auxiliary_loss_clip": 0.01070606, "auxiliary_loss_mlp": 0.01027332, "balance_loss_clip": 1.03725362, "balance_loss_mlp": 1.01961064, "epoch": 0.9846690314435159, "flos": 25228610974080.0, "grad_norm": 1.753641547483148, "language_loss": 0.73336017, "learning_rate": 2.4462576628172528e-09, "loss": 0.75433958, "num_input_tokens_seen": 176778625, "step": 8189, "time_per_iteration": 3.8289053440093994 }, { "auxiliary_loss_clip": 0.01147847, "auxiliary_loss_mlp": 0.01029227, "balance_loss_clip": 1.0447042, "balance_loss_mlp": 1.02221739, "epoch": 0.984789274334155, "flos": 18552171248640.0, "grad_norm": 1.825743406046752, "language_loss": 0.7415266, "learning_rate": 2.407893289898766e-09, "loss": 0.76329732, "num_input_tokens_seen": 176797655, "step": 8190, "time_per_iteration": 2.8211781978607178 }, { "auxiliary_loss_clip": 0.01115539, "auxiliary_loss_mlp": 0.01023922, "balance_loss_clip": 1.03878415, "balance_loss_mlp": 1.01629257, "epoch": 0.984909517224794, "flos": 27345437233920.0, "grad_norm": 1.8505153955887452, "language_loss": 0.84099972, "learning_rate": 2.3698319501144202e-09, "loss": 0.86239433, "num_input_tokens_seen": 176818640, "step": 8191, "time_per_iteration": 2.7914187908172607 }, { "auxiliary_loss_clip": 0.01156943, "auxiliary_loss_mlp": 0.01027377, "balance_loss_clip": 1.04443455, "balance_loss_mlp": 1.01961613, "epoch": 0.9850297601154332, "flos": 18734058743040.0, "grad_norm": 3.1157922905679327, "language_loss": 0.73227274, "learning_rate": 2.3320736492382644e-09, "loss": 0.75411594, "num_input_tokens_seen": 176837475, "step": 8192, "time_per_iteration": 2.6233291625976562 }, { "auxiliary_loss_clip": 0.01162346, "auxiliary_loss_mlp": 0.01022919, "balance_loss_clip": 1.04567409, "balance_loss_mlp": 1.01617765, "epoch": 0.9851500030060723, "flos": 22308247514880.0, "grad_norm": 1.6939747690078113, "language_loss": 0.67836297, "learning_rate": 2.29461839299816e-09, "loss": 0.7002157, "num_input_tokens_seen": 176857190, "step": 8193, "time_per_iteration": 3.517348527908325 }, { "auxiliary_loss_clip": 0.01125642, "auxiliary_loss_mlp": 0.01025409, "balance_loss_clip": 1.04257941, "balance_loss_mlp": 1.01834249, "epoch": 0.9852702458967113, "flos": 26353691746560.0, "grad_norm": 1.6351836323521844, "language_loss": 0.79852664, "learning_rate": 2.257466187076229e-09, "loss": 0.82003713, "num_input_tokens_seen": 176876395, "step": 8194, "time_per_iteration": 2.7107551097869873 }, { "auxiliary_loss_clip": 0.01154156, "auxiliary_loss_mlp": 0.00761897, "balance_loss_clip": 1.0427506, "balance_loss_mlp": 1.00034714, "epoch": 0.9853904887873505, "flos": 20883599314560.0, "grad_norm": 1.837531527125937, "language_loss": 0.7121501, "learning_rate": 2.2206170371081854e-09, "loss": 0.73131061, "num_input_tokens_seen": 176894980, "step": 8195, "time_per_iteration": 2.630932331085205 }, { "auxiliary_loss_clip": 0.0113714, "auxiliary_loss_mlp": 0.01030804, "balance_loss_clip": 1.04150319, "balance_loss_mlp": 1.0231384, "epoch": 0.9855107316779895, "flos": 25263444188160.0, "grad_norm": 1.6575777512702246, "language_loss": 0.84696949, "learning_rate": 2.1840709486842247e-09, "loss": 0.86864889, "num_input_tokens_seen": 176914600, "step": 8196, "time_per_iteration": 3.5751090049743652 }, { "auxiliary_loss_clip": 0.01131266, "auxiliary_loss_mlp": 0.01024056, "balance_loss_clip": 1.04220366, "balance_loss_mlp": 1.01661158, "epoch": 0.9856309745686286, "flos": 19062102677760.0, "grad_norm": 2.1287475084017884, "language_loss": 0.79313177, "learning_rate": 2.1478279273481335e-09, "loss": 0.81468499, "num_input_tokens_seen": 176933085, "step": 8197, "time_per_iteration": 2.663273572921753 }, { "auxiliary_loss_clip": 0.01151783, "auxiliary_loss_mlp": 0.01024342, "balance_loss_clip": 1.04651535, "balance_loss_mlp": 1.01712406, "epoch": 0.9857512174592677, "flos": 34130758060800.0, "grad_norm": 2.076350395259823, "language_loss": 0.80227679, "learning_rate": 2.1118879785981815e-09, "loss": 0.82403803, "num_input_tokens_seen": 176953225, "step": 8198, "time_per_iteration": 3.607318878173828 }, { "auxiliary_loss_clip": 0.01136031, "auxiliary_loss_mlp": 0.01023112, "balance_loss_clip": 1.04191542, "balance_loss_mlp": 1.01663589, "epoch": 0.9858714603499068, "flos": 25994693266560.0, "grad_norm": 1.7752136164443189, "language_loss": 0.7936942, "learning_rate": 2.0762511078862288e-09, "loss": 0.81528562, "num_input_tokens_seen": 176973570, "step": 8199, "time_per_iteration": 2.6537671089172363 }, { "auxiliary_loss_clip": 0.01144722, "auxiliary_loss_mlp": 0.01023872, "balance_loss_clip": 1.04306602, "balance_loss_mlp": 1.01662409, "epoch": 0.9859917032405459, "flos": 23696230907520.0, "grad_norm": 2.0730517718179033, "language_loss": 0.65106535, "learning_rate": 2.0409173206186183e-09, "loss": 0.67275125, "num_input_tokens_seen": 176992810, "step": 8200, "time_per_iteration": 2.6518001556396484 }, { "auxiliary_loss_clip": 0.01122173, "auxiliary_loss_mlp": 0.01026394, "balance_loss_clip": 1.0446595, "balance_loss_mlp": 1.0199213, "epoch": 0.986111946131185, "flos": 19938287134080.0, "grad_norm": 2.166288425391624, "language_loss": 0.87267649, "learning_rate": 2.0058866221550617e-09, "loss": 0.89416218, "num_input_tokens_seen": 177011050, "step": 8201, "time_per_iteration": 2.6962101459503174 }, { "auxiliary_loss_clip": 0.01163334, "auxiliary_loss_mlp": 0.01024525, "balance_loss_clip": 1.04418325, "balance_loss_mlp": 1.01772666, "epoch": 0.9862321890218241, "flos": 19828831415040.0, "grad_norm": 2.199917338205368, "language_loss": 0.75504541, "learning_rate": 1.971159017809976e-09, "loss": 0.77692401, "num_input_tokens_seen": 177029340, "step": 8202, "time_per_iteration": 2.63451886177063 }, { "auxiliary_loss_clip": 0.0115209, "auxiliary_loss_mlp": 0.01027011, "balance_loss_clip": 1.04489708, "balance_loss_mlp": 1.01950085, "epoch": 0.9863524319124631, "flos": 21652051904640.0, "grad_norm": 2.3903495524143863, "language_loss": 0.7777651, "learning_rate": 1.93673451285159e-09, "loss": 0.79955608, "num_input_tokens_seen": 177048390, "step": 8203, "time_per_iteration": 2.851266860961914 }, { "auxiliary_loss_clip": 0.01045756, "auxiliary_loss_mlp": 0.01001301, "balance_loss_clip": 1.00909233, "balance_loss_mlp": 1.00027561, "epoch": 0.9864726748031023, "flos": 52769977920000.0, "grad_norm": 0.7512770990135484, "language_loss": 0.56543112, "learning_rate": 1.9026131125019495e-09, "loss": 0.58590168, "num_input_tokens_seen": 177105760, "step": 8204, "time_per_iteration": 3.1936943531036377 }, { "auxiliary_loss_clip": 0.01148512, "auxiliary_loss_mlp": 0.01023263, "balance_loss_clip": 1.04599726, "balance_loss_mlp": 1.01626503, "epoch": 0.9865929176937414, "flos": 23364631526400.0, "grad_norm": 1.7230017627197731, "language_loss": 0.86866522, "learning_rate": 1.8687948219371363e-09, "loss": 0.89038295, "num_input_tokens_seen": 177124985, "step": 8205, "time_per_iteration": 2.6303586959838867 }, { "auxiliary_loss_clip": 0.01167442, "auxiliary_loss_mlp": 0.01026188, "balance_loss_clip": 1.0442344, "balance_loss_mlp": 1.01837969, "epoch": 0.9867131605843804, "flos": 21616679986560.0, "grad_norm": 2.2330630714545534, "language_loss": 0.88534141, "learning_rate": 1.835279646287491e-09, "loss": 0.9072777, "num_input_tokens_seen": 177142995, "step": 8206, "time_per_iteration": 2.62031626701355 }, { "auxiliary_loss_clip": 0.01157344, "auxiliary_loss_mlp": 0.01031288, "balance_loss_clip": 1.04581451, "balance_loss_mlp": 1.02272892, "epoch": 0.9868334034750196, "flos": 22271403139200.0, "grad_norm": 1.8556187601100032, "language_loss": 0.76604408, "learning_rate": 1.8020675906371685e-09, "loss": 0.78793037, "num_input_tokens_seen": 177162390, "step": 8207, "time_per_iteration": 2.6151347160339355 }, { "auxiliary_loss_clip": 0.01105063, "auxiliary_loss_mlp": 0.01024254, "balance_loss_clip": 1.03823864, "balance_loss_mlp": 1.01751316, "epoch": 0.9869536463656586, "flos": 25809573548160.0, "grad_norm": 2.0278918065458673, "language_loss": 0.75402498, "learning_rate": 1.7691586600243612e-09, "loss": 0.77531815, "num_input_tokens_seen": 177181290, "step": 8208, "time_per_iteration": 2.7702324390411377 }, { "auxiliary_loss_clip": 0.01134817, "auxiliary_loss_mlp": 0.01025637, "balance_loss_clip": 1.0446856, "balance_loss_mlp": 1.01866603, "epoch": 0.9870738892562977, "flos": 16398500613120.0, "grad_norm": 4.627438356642214, "language_loss": 0.87153566, "learning_rate": 1.7365528594415202e-09, "loss": 0.8931402, "num_input_tokens_seen": 177195360, "step": 8209, "time_per_iteration": 2.669318914413452 }, { "auxiliary_loss_clip": 0.01152573, "auxiliary_loss_mlp": 0.00762421, "balance_loss_clip": 1.04256511, "balance_loss_mlp": 1.00030255, "epoch": 0.9871941321469369, "flos": 35481358373760.0, "grad_norm": 1.7926468698917222, "language_loss": 0.67693591, "learning_rate": 1.7042501938346888e-09, "loss": 0.69608587, "num_input_tokens_seen": 177218090, "step": 8210, "time_per_iteration": 2.7942373752593994 }, { "auxiliary_loss_clip": 0.01125472, "auxiliary_loss_mlp": 0.01024814, "balance_loss_clip": 1.03876638, "balance_loss_mlp": 1.01835871, "epoch": 0.9873143750375759, "flos": 21434217874560.0, "grad_norm": 1.8447189468019352, "language_loss": 0.76804209, "learning_rate": 1.6722506681043913e-09, "loss": 0.78954494, "num_input_tokens_seen": 177237050, "step": 8211, "time_per_iteration": 2.6475439071655273 }, { "auxiliary_loss_clip": 0.01140667, "auxiliary_loss_mlp": 0.01023984, "balance_loss_clip": 1.04276061, "balance_loss_mlp": 1.01658678, "epoch": 0.987434617928215, "flos": 16326499800960.0, "grad_norm": 2.3209216894156564, "language_loss": 0.69450235, "learning_rate": 1.640554287104745e-09, "loss": 0.71614885, "num_input_tokens_seen": 177255325, "step": 8212, "time_per_iteration": 2.6391592025756836 }, { "auxiliary_loss_clip": 0.01125057, "auxiliary_loss_mlp": 0.01027118, "balance_loss_clip": 1.03778267, "balance_loss_mlp": 1.01933384, "epoch": 0.9875548608188541, "flos": 17851984456320.0, "grad_norm": 2.1623568561927775, "language_loss": 0.80417943, "learning_rate": 1.609161055644348e-09, "loss": 0.82570124, "num_input_tokens_seen": 177271250, "step": 8213, "time_per_iteration": 2.668924570083618 }, { "auxiliary_loss_clip": 0.01157055, "auxiliary_loss_mlp": 0.01023837, "balance_loss_clip": 1.0431571, "balance_loss_mlp": 1.01636565, "epoch": 0.9876751037094932, "flos": 26132876887680.0, "grad_norm": 2.0562683961233845, "language_loss": 0.68574506, "learning_rate": 1.5780709784849467e-09, "loss": 0.70755398, "num_input_tokens_seen": 177288270, "step": 8214, "time_per_iteration": 2.7031219005584717 }, { "auxiliary_loss_clip": 0.01101786, "auxiliary_loss_mlp": 0.01027768, "balance_loss_clip": 1.04431117, "balance_loss_mlp": 1.02084744, "epoch": 0.9877953466001322, "flos": 15991344973440.0, "grad_norm": 2.171285822249106, "language_loss": 0.82560349, "learning_rate": 1.5472840603436565e-09, "loss": 0.84689903, "num_input_tokens_seen": 177305500, "step": 8215, "time_per_iteration": 2.7734599113464355 }, { "auxiliary_loss_clip": 0.01139279, "auxiliary_loss_mlp": 0.01023407, "balance_loss_clip": 1.04151797, "balance_loss_mlp": 1.01646316, "epoch": 0.9879155894907714, "flos": 18806777827200.0, "grad_norm": 6.816878680382445, "language_loss": 0.78024602, "learning_rate": 1.5168003058900757e-09, "loss": 0.80187291, "num_input_tokens_seen": 177323500, "step": 8216, "time_per_iteration": 3.5101113319396973 }, { "auxiliary_loss_clip": 0.01123671, "auxiliary_loss_mlp": 0.01020638, "balance_loss_clip": 1.0419178, "balance_loss_mlp": 1.01369667, "epoch": 0.9880358323814105, "flos": 22382044007040.0, "grad_norm": 2.276018993755789, "language_loss": 0.91952837, "learning_rate": 1.4866197197491715e-09, "loss": 0.94097137, "num_input_tokens_seen": 177342860, "step": 8217, "time_per_iteration": 2.7531497478485107 }, { "auxiliary_loss_clip": 0.01156114, "auxiliary_loss_mlp": 0.00763098, "balance_loss_clip": 1.04544497, "balance_loss_mlp": 1.00037646, "epoch": 0.9881560752720495, "flos": 15668831733120.0, "grad_norm": 4.896192217562307, "language_loss": 0.78440309, "learning_rate": 1.4567423064988371e-09, "loss": 0.80359519, "num_input_tokens_seen": 177360210, "step": 8218, "time_per_iteration": 2.7097654342651367 }, { "auxiliary_loss_clip": 0.01167128, "auxiliary_loss_mlp": 0.01024742, "balance_loss_clip": 1.04564285, "balance_loss_mlp": 1.01701772, "epoch": 0.9882763181626887, "flos": 21500113374720.0, "grad_norm": 1.9461186630868361, "language_loss": 0.77925992, "learning_rate": 1.4271680706718913e-09, "loss": 0.80117863, "num_input_tokens_seen": 177377885, "step": 8219, "time_per_iteration": 3.6073474884033203 }, { "auxiliary_loss_clip": 0.01153535, "auxiliary_loss_mlp": 0.01027212, "balance_loss_clip": 1.04688251, "balance_loss_mlp": 1.01958859, "epoch": 0.9883965610533277, "flos": 28034598551040.0, "grad_norm": 1.6734866495682483, "language_loss": 0.82445961, "learning_rate": 1.3978970167543013e-09, "loss": 0.8462671, "num_input_tokens_seen": 177398065, "step": 8220, "time_per_iteration": 2.720426321029663 }, { "auxiliary_loss_clip": 0.01128921, "auxiliary_loss_mlp": 0.01025796, "balance_loss_clip": 1.04115772, "balance_loss_mlp": 1.01853919, "epoch": 0.9885168039439668, "flos": 14098601710080.0, "grad_norm": 2.1171537946094157, "language_loss": 0.77595997, "learning_rate": 1.3689291491867372e-09, "loss": 0.79750711, "num_input_tokens_seen": 177416380, "step": 8221, "time_per_iteration": 2.622058629989624 }, { "auxiliary_loss_clip": 0.01166721, "auxiliary_loss_mlp": 0.01022509, "balance_loss_clip": 1.04528093, "balance_loss_mlp": 1.01520097, "epoch": 0.988637046834606, "flos": 26432013352320.0, "grad_norm": 3.6757263638257474, "language_loss": 0.73436809, "learning_rate": 1.3402644723636836e-09, "loss": 0.7562604, "num_input_tokens_seen": 177438410, "step": 8222, "time_per_iteration": 3.504518985748291 }, { "auxiliary_loss_clip": 0.0113526, "auxiliary_loss_mlp": 0.0102954, "balance_loss_clip": 1.04471874, "balance_loss_mlp": 1.02202964, "epoch": 0.988757289725245, "flos": 25229113764480.0, "grad_norm": 1.9090439636361296, "language_loss": 0.83721244, "learning_rate": 1.311902990633218e-09, "loss": 0.85886043, "num_input_tokens_seen": 177457375, "step": 8223, "time_per_iteration": 2.761946678161621 }, { "auxiliary_loss_clip": 0.01128152, "auxiliary_loss_mlp": 0.01025538, "balance_loss_clip": 1.03753233, "balance_loss_mlp": 1.01861811, "epoch": 0.9888775326158841, "flos": 26359042872960.0, "grad_norm": 1.5250581634388247, "language_loss": 0.71540105, "learning_rate": 1.2838447082978987e-09, "loss": 0.73693794, "num_input_tokens_seen": 177478530, "step": 8224, "time_per_iteration": 3.571662187576294 }, { "auxiliary_loss_clip": 0.01146909, "auxiliary_loss_mlp": 0.01023435, "balance_loss_clip": 1.04236698, "balance_loss_mlp": 1.01668739, "epoch": 0.9889977755065231, "flos": 24316120846080.0, "grad_norm": 2.325418192375166, "language_loss": 0.83036387, "learning_rate": 1.2560896296143208e-09, "loss": 0.85206735, "num_input_tokens_seen": 177496995, "step": 8225, "time_per_iteration": 2.6467814445495605 }, { "auxiliary_loss_clip": 0.01164854, "auxiliary_loss_mlp": 0.01028327, "balance_loss_clip": 1.0443002, "balance_loss_mlp": 1.02158582, "epoch": 0.9891180183971623, "flos": 18951066760320.0, "grad_norm": 3.046580887460009, "language_loss": 0.82645762, "learning_rate": 1.2286377587926722e-09, "loss": 0.84838939, "num_input_tokens_seen": 177513785, "step": 8226, "time_per_iteration": 2.5863096714019775 }, { "auxiliary_loss_clip": 0.01163988, "auxiliary_loss_mlp": 0.01021356, "balance_loss_clip": 1.04430628, "balance_loss_mlp": 1.01431668, "epoch": 0.9892382612878013, "flos": 26176580760960.0, "grad_norm": 2.346190601250753, "language_loss": 0.75136042, "learning_rate": 1.2014890999973992e-09, "loss": 0.7732138, "num_input_tokens_seen": 177530705, "step": 8227, "time_per_iteration": 2.634845733642578 }, { "auxiliary_loss_clip": 0.01162054, "auxiliary_loss_mlp": 0.01026713, "balance_loss_clip": 1.0429554, "balance_loss_mlp": 1.02023685, "epoch": 0.9893585041784404, "flos": 25449605400960.0, "grad_norm": 1.6799168444075059, "language_loss": 0.78251827, "learning_rate": 1.1746436573472073e-09, "loss": 0.80440593, "num_input_tokens_seen": 177552440, "step": 8228, "time_per_iteration": 2.6243631839752197 }, { "auxiliary_loss_clip": 0.01146772, "auxiliary_loss_mlp": 0.01027599, "balance_loss_clip": 1.0433526, "balance_loss_mlp": 1.0202949, "epoch": 0.9894787470690796, "flos": 20189302352640.0, "grad_norm": 1.9837860376900223, "language_loss": 0.68930101, "learning_rate": 1.1481014349141726e-09, "loss": 0.71104473, "num_input_tokens_seen": 177569660, "step": 8229, "time_per_iteration": 2.641768217086792 }, { "auxiliary_loss_clip": 0.01141387, "auxiliary_loss_mlp": 0.01033633, "balance_loss_clip": 1.04423285, "balance_loss_mlp": 1.02593184, "epoch": 0.9895989899597186, "flos": 24644308435200.0, "grad_norm": 1.9965793650963444, "language_loss": 0.84310603, "learning_rate": 1.121862436724852e-09, "loss": 0.86485624, "num_input_tokens_seen": 177588500, "step": 8230, "time_per_iteration": 2.8094396591186523 }, { "auxiliary_loss_clip": 0.01153283, "auxiliary_loss_mlp": 0.01025287, "balance_loss_clip": 1.04657638, "balance_loss_mlp": 1.01826608, "epoch": 0.9897192328503577, "flos": 21799034357760.0, "grad_norm": 1.9215596031962985, "language_loss": 0.70307648, "learning_rate": 1.0959266667598388e-09, "loss": 0.72486222, "num_input_tokens_seen": 177607315, "step": 8231, "time_per_iteration": 2.6665995121002197 }, { "auxiliary_loss_clip": 0.01126102, "auxiliary_loss_mlp": 0.01027505, "balance_loss_clip": 1.04264259, "balance_loss_mlp": 1.01988769, "epoch": 0.9898394757409968, "flos": 21325229032320.0, "grad_norm": 1.9293956194215527, "language_loss": 0.74666214, "learning_rate": 1.0702941289533196e-09, "loss": 0.76819819, "num_input_tokens_seen": 177625990, "step": 8232, "time_per_iteration": 2.727536916732788 }, { "auxiliary_loss_clip": 0.01121775, "auxiliary_loss_mlp": 0.01021358, "balance_loss_clip": 1.04258728, "balance_loss_mlp": 1.01436639, "epoch": 0.9899597186316359, "flos": 18545024442240.0, "grad_norm": 1.931322662818884, "language_loss": 0.88878369, "learning_rate": 1.0449648271939615e-09, "loss": 0.91021502, "num_input_tokens_seen": 177642335, "step": 8233, "time_per_iteration": 2.660128116607666 }, { "auxiliary_loss_clip": 0.01109889, "auxiliary_loss_mlp": 0.00761709, "balance_loss_clip": 1.04158711, "balance_loss_mlp": 1.00037718, "epoch": 0.990079961522275, "flos": 23766723348480.0, "grad_norm": 1.4934013580916503, "language_loss": 0.7304132, "learning_rate": 1.0199387653240243e-09, "loss": 0.74912918, "num_input_tokens_seen": 177662025, "step": 8234, "time_per_iteration": 2.7905023097991943 }, { "auxiliary_loss_clip": 0.01132325, "auxiliary_loss_mlp": 0.01023837, "balance_loss_clip": 1.04399872, "balance_loss_mlp": 1.01764727, "epoch": 0.9902002044129141, "flos": 16399182971520.0, "grad_norm": 1.6670187027437622, "language_loss": 0.70620829, "learning_rate": 9.952159471400267e-10, "loss": 0.72776991, "num_input_tokens_seen": 177679065, "step": 8235, "time_per_iteration": 2.815988540649414 }, { "auxiliary_loss_clip": 0.0115364, "auxiliary_loss_mlp": 0.00761379, "balance_loss_clip": 1.04367268, "balance_loss_mlp": 1.0003705, "epoch": 0.9903204473035532, "flos": 22559657783040.0, "grad_norm": 1.9161639801957944, "language_loss": 0.84601307, "learning_rate": 9.707963763923022e-10, "loss": 0.86516333, "num_input_tokens_seen": 177698115, "step": 8236, "time_per_iteration": 2.7349956035614014 }, { "auxiliary_loss_clip": 0.01134214, "auxiliary_loss_mlp": 0.01024552, "balance_loss_clip": 1.03999233, "balance_loss_mlp": 1.01772976, "epoch": 0.9904406901941922, "flos": 16144001775360.0, "grad_norm": 1.9246566794650046, "language_loss": 0.79188627, "learning_rate": 9.466800567854427e-10, "loss": 0.81347394, "num_input_tokens_seen": 177716715, "step": 8237, "time_per_iteration": 2.6396920680999756 }, { "auxiliary_loss_clip": 0.0112262, "auxiliary_loss_mlp": 0.01027196, "balance_loss_clip": 1.03826654, "balance_loss_mlp": 1.0193553, "epoch": 0.9905609330848314, "flos": 26651499408000.0, "grad_norm": 1.8180485382088207, "language_loss": 0.68299592, "learning_rate": 9.228669919778553e-10, "loss": 0.70449406, "num_input_tokens_seen": 177735640, "step": 8238, "time_per_iteration": 2.6910207271575928 }, { "auxiliary_loss_clip": 0.01134263, "auxiliary_loss_mlp": 0.01032566, "balance_loss_clip": 1.04250503, "balance_loss_mlp": 1.02506471, "epoch": 0.9906811759754705, "flos": 23111820627840.0, "grad_norm": 2.053715609679284, "language_loss": 0.79871535, "learning_rate": 8.993571855817617e-10, "loss": 0.82038367, "num_input_tokens_seen": 177754470, "step": 8239, "time_per_iteration": 2.6386706829071045 }, { "auxiliary_loss_clip": 0.01148978, "auxiliary_loss_mlp": 0.010257, "balance_loss_clip": 1.04169559, "balance_loss_mlp": 1.01863694, "epoch": 0.9908014188661095, "flos": 22090593052800.0, "grad_norm": 1.7821081644255254, "language_loss": 0.74804413, "learning_rate": 8.761506411638642e-10, "loss": 0.76979095, "num_input_tokens_seen": 177773935, "step": 8240, "time_per_iteration": 2.5869617462158203 }, { "auxiliary_loss_clip": 0.01134169, "auxiliary_loss_mlp": 0.01024073, "balance_loss_clip": 1.04379582, "balance_loss_mlp": 1.0168519, "epoch": 0.9909216617567487, "flos": 19242948677760.0, "grad_norm": 1.7230859344297593, "language_loss": 0.73569053, "learning_rate": 8.53247362244236e-10, "loss": 0.7572729, "num_input_tokens_seen": 177792745, "step": 8241, "time_per_iteration": 2.723297119140625 }, { "auxiliary_loss_clip": 0.0113698, "auxiliary_loss_mlp": 0.01029711, "balance_loss_clip": 1.04325807, "balance_loss_mlp": 1.02288365, "epoch": 0.9910419046473877, "flos": 23621213352960.0, "grad_norm": 1.7275401500072267, "language_loss": 0.68030429, "learning_rate": 8.306473522976532e-10, "loss": 0.70197117, "num_input_tokens_seen": 177812150, "step": 8242, "time_per_iteration": 3.5957937240600586 }, { "auxiliary_loss_clip": 0.01164243, "auxiliary_loss_mlp": 0.01024723, "balance_loss_clip": 1.04545462, "balance_loss_mlp": 1.01810122, "epoch": 0.9911621475380268, "flos": 22711380831360.0, "grad_norm": 1.785503892722512, "language_loss": 0.71853459, "learning_rate": 8.083506147522623e-10, "loss": 0.74042422, "num_input_tokens_seen": 177831545, "step": 8243, "time_per_iteration": 2.665158271789551 }, { "auxiliary_loss_clip": 0.01145129, "auxiliary_loss_mlp": 0.01025562, "balance_loss_clip": 1.04214406, "balance_loss_mlp": 1.01822448, "epoch": 0.991282390428666, "flos": 13516956777600.0, "grad_norm": 2.2884752447212957, "language_loss": 0.84966099, "learning_rate": 7.863571529906909e-10, "loss": 0.87136793, "num_input_tokens_seen": 177847130, "step": 8244, "time_per_iteration": 2.730184316635132 }, { "auxiliary_loss_clip": 0.01054069, "auxiliary_loss_mlp": 0.0100104, "balance_loss_clip": 1.00949264, "balance_loss_mlp": 0.99997312, "epoch": 0.991402633319305, "flos": 61830492071040.0, "grad_norm": 0.7263403823788498, "language_loss": 0.59721565, "learning_rate": 7.646669703489372e-10, "loss": 0.61776674, "num_input_tokens_seen": 177911440, "step": 8245, "time_per_iteration": 4.293559789657593 }, { "auxiliary_loss_clip": 0.01059886, "auxiliary_loss_mlp": 0.01025469, "balance_loss_clip": 1.03371191, "balance_loss_mlp": 1.01849222, "epoch": 0.9915228762099441, "flos": 18770148933120.0, "grad_norm": 2.0980225612717045, "language_loss": 0.57417083, "learning_rate": 7.432800701177023e-10, "loss": 0.59502435, "num_input_tokens_seen": 177929440, "step": 8246, "time_per_iteration": 3.319733142852783 }, { "auxiliary_loss_clip": 0.01045886, "auxiliary_loss_mlp": 0.01002611, "balance_loss_clip": 1.01066363, "balance_loss_mlp": 1.00157976, "epoch": 0.9916431191005832, "flos": 65936660244480.0, "grad_norm": 0.7926023061662805, "language_loss": 0.57773662, "learning_rate": 7.221964555415017e-10, "loss": 0.59822154, "num_input_tokens_seen": 177989100, "step": 8247, "time_per_iteration": 3.3543989658355713 }, { "auxiliary_loss_clip": 0.0113477, "auxiliary_loss_mlp": 0.01020832, "balance_loss_clip": 1.04307318, "balance_loss_mlp": 1.0146066, "epoch": 0.9917633619912223, "flos": 16581573256320.0, "grad_norm": 2.041771259741335, "language_loss": 0.74865007, "learning_rate": 7.01416129818222e-10, "loss": 0.77020609, "num_input_tokens_seen": 178006720, "step": 8248, "time_per_iteration": 3.619908571243286 }, { "auxiliary_loss_clip": 0.01131546, "auxiliary_loss_mlp": 0.01026558, "balance_loss_clip": 1.04182124, "balance_loss_mlp": 1.01999235, "epoch": 0.9918836048818613, "flos": 25411108999680.0, "grad_norm": 2.120926805841478, "language_loss": 0.57963955, "learning_rate": 6.809390961006745e-10, "loss": 0.60122061, "num_input_tokens_seen": 178026850, "step": 8249, "time_per_iteration": 2.8643739223480225 }, { "auxiliary_loss_clip": 0.01137593, "auxiliary_loss_mlp": 0.01025376, "balance_loss_clip": 1.04354298, "balance_loss_mlp": 1.01818728, "epoch": 0.9920038477725005, "flos": 25046867134080.0, "grad_norm": 2.153785593934072, "language_loss": 0.68529844, "learning_rate": 6.607653574948191e-10, "loss": 0.70692813, "num_input_tokens_seen": 178047630, "step": 8250, "time_per_iteration": 3.6667323112487793 }, { "auxiliary_loss_clip": 0.011419, "auxiliary_loss_mlp": 0.01023746, "balance_loss_clip": 1.04051471, "balance_loss_mlp": 1.01735651, "epoch": 0.9921240906631396, "flos": 21829773421440.0, "grad_norm": 2.449558430084373, "language_loss": 0.81838977, "learning_rate": 6.408949170613187e-10, "loss": 0.84004617, "num_input_tokens_seen": 178066895, "step": 8251, "time_per_iteration": 2.6726372241973877 }, { "auxiliary_loss_clip": 0.01135575, "auxiliary_loss_mlp": 0.0102886, "balance_loss_clip": 1.04236174, "balance_loss_mlp": 1.0212965, "epoch": 0.9922443335537786, "flos": 24864225454080.0, "grad_norm": 29.299835320607137, "language_loss": 0.81621772, "learning_rate": 6.213277778144288e-10, "loss": 0.83786201, "num_input_tokens_seen": 178088540, "step": 8252, "time_per_iteration": 2.7124154567718506 }, { "auxiliary_loss_clip": 0.01096148, "auxiliary_loss_mlp": 0.01026666, "balance_loss_clip": 1.0359776, "balance_loss_mlp": 1.01901305, "epoch": 0.9923645764444178, "flos": 21613088626560.0, "grad_norm": 1.93972722094967, "language_loss": 0.66734904, "learning_rate": 6.020639427224416e-10, "loss": 0.68857718, "num_input_tokens_seen": 178106185, "step": 8253, "time_per_iteration": 2.762369155883789 }, { "auxiliary_loss_clip": 0.01137798, "auxiliary_loss_mlp": 0.01026346, "balance_loss_clip": 1.04321635, "balance_loss_mlp": 1.01902342, "epoch": 0.9924848193350568, "flos": 25001798544000.0, "grad_norm": 2.6326052591299764, "language_loss": 0.72553515, "learning_rate": 5.831034147076864e-10, "loss": 0.74717653, "num_input_tokens_seen": 178123435, "step": 8254, "time_per_iteration": 2.696176528930664 }, { "auxiliary_loss_clip": 0.01049001, "auxiliary_loss_mlp": 0.01000233, "balance_loss_clip": 1.00892162, "balance_loss_mlp": 0.99922603, "epoch": 0.9926050622256959, "flos": 68912543151360.0, "grad_norm": 0.6859192240318006, "language_loss": 0.55744421, "learning_rate": 5.644461966463065e-10, "loss": 0.57793653, "num_input_tokens_seen": 178191045, "step": 8255, "time_per_iteration": 3.2910313606262207 }, { "auxiliary_loss_clip": 0.01134806, "auxiliary_loss_mlp": 0.01021221, "balance_loss_clip": 1.04268408, "balance_loss_mlp": 1.01451612, "epoch": 0.9927253051163349, "flos": 20923675914240.0, "grad_norm": 1.8596646511894117, "language_loss": 0.75850368, "learning_rate": 5.460922913687049e-10, "loss": 0.78006393, "num_input_tokens_seen": 178210135, "step": 8256, "time_per_iteration": 2.673096179962158 }, { "auxiliary_loss_clip": 0.01105439, "auxiliary_loss_mlp": 0.0076254, "balance_loss_clip": 1.03698802, "balance_loss_mlp": 1.00034523, "epoch": 0.9928455480069741, "flos": 22308211601280.0, "grad_norm": 2.527157677597883, "language_loss": 0.75494218, "learning_rate": 5.280417016593208e-10, "loss": 0.77362198, "num_input_tokens_seen": 178229925, "step": 8257, "time_per_iteration": 2.910029172897339 }, { "auxiliary_loss_clip": 0.01149994, "auxiliary_loss_mlp": 0.00762291, "balance_loss_clip": 1.04601192, "balance_loss_mlp": 1.00041211, "epoch": 0.9929657908976132, "flos": 17383889393280.0, "grad_norm": 1.8309486992685582, "language_loss": 0.74487162, "learning_rate": 5.102944302559642e-10, "loss": 0.76399446, "num_input_tokens_seen": 178247420, "step": 8258, "time_per_iteration": 2.6656014919281006 }, { "auxiliary_loss_clip": 0.0110143, "auxiliary_loss_mlp": 0.01026072, "balance_loss_clip": 1.03971851, "balance_loss_mlp": 1.01803088, "epoch": 0.9930860337882522, "flos": 22674680110080.0, "grad_norm": 1.9593407122295374, "language_loss": 0.8016333, "learning_rate": 4.9285047985137e-10, "loss": 0.82290834, "num_input_tokens_seen": 178266840, "step": 8259, "time_per_iteration": 2.807893991470337 }, { "auxiliary_loss_clip": 0.01156592, "auxiliary_loss_mlp": 0.01025392, "balance_loss_clip": 1.04626274, "balance_loss_mlp": 1.01813185, "epoch": 0.9932062766788914, "flos": 28147789284480.0, "grad_norm": 1.7229196818811334, "language_loss": 0.74588919, "learning_rate": 4.757098530916436e-10, "loss": 0.76770902, "num_input_tokens_seen": 178287285, "step": 8260, "time_per_iteration": 2.6834499835968018 }, { "auxiliary_loss_clip": 0.01153565, "auxiliary_loss_mlp": 0.01028891, "balance_loss_clip": 1.04600394, "balance_loss_mlp": 1.02126169, "epoch": 0.9933265195695304, "flos": 20156659868160.0, "grad_norm": 3.156397996066343, "language_loss": 0.77526313, "learning_rate": 4.5887255257670563e-10, "loss": 0.79708767, "num_input_tokens_seen": 178304325, "step": 8261, "time_per_iteration": 2.616689920425415 }, { "auxiliary_loss_clip": 0.01165061, "auxiliary_loss_mlp": 0.01022567, "balance_loss_clip": 1.04495239, "balance_loss_mlp": 1.0156436, "epoch": 0.9934467624601695, "flos": 21362037494400.0, "grad_norm": 1.8677009051034443, "language_loss": 0.77351236, "learning_rate": 4.4233858086117906e-10, "loss": 0.79538864, "num_input_tokens_seen": 178322850, "step": 8262, "time_per_iteration": 2.6168174743652344 }, { "auxiliary_loss_clip": 0.01107739, "auxiliary_loss_mlp": 0.0102385, "balance_loss_clip": 1.04491675, "balance_loss_mlp": 1.0168643, "epoch": 0.9935670053508087, "flos": 19756040503680.0, "grad_norm": 2.5213051479223516, "language_loss": 0.67453682, "learning_rate": 4.261079404528356e-10, "loss": 0.69585264, "num_input_tokens_seen": 178342330, "step": 8263, "time_per_iteration": 2.7505593299865723 }, { "auxiliary_loss_clip": 0.01147809, "auxiliary_loss_mlp": 0.01031693, "balance_loss_clip": 1.0431385, "balance_loss_mlp": 1.02382505, "epoch": 0.9936872482414477, "flos": 21978838863360.0, "grad_norm": 1.9496254205845174, "language_loss": 0.68788922, "learning_rate": 4.1018063381437205e-10, "loss": 0.70968425, "num_input_tokens_seen": 178362715, "step": 8264, "time_per_iteration": 2.6474568843841553 }, { "auxiliary_loss_clip": 0.01051548, "auxiliary_loss_mlp": 0.01001446, "balance_loss_clip": 1.01301003, "balance_loss_mlp": 1.00039661, "epoch": 0.9938074911320868, "flos": 69810667839360.0, "grad_norm": 0.8608330842132922, "language_loss": 0.61067891, "learning_rate": 3.9455666336141167e-10, "loss": 0.6312089, "num_input_tokens_seen": 178426495, "step": 8265, "time_per_iteration": 3.2728450298309326 }, { "auxiliary_loss_clip": 0.01164398, "auxiliary_loss_mlp": 0.01027079, "balance_loss_clip": 1.04628491, "balance_loss_mlp": 1.01956856, "epoch": 0.9939277340227259, "flos": 15084170058240.0, "grad_norm": 3.1600897682490494, "language_loss": 0.83566642, "learning_rate": 3.7923603146450267e-10, "loss": 0.85758114, "num_input_tokens_seen": 178442555, "step": 8266, "time_per_iteration": 2.5377769470214844 }, { "auxiliary_loss_clip": 0.01124173, "auxiliary_loss_mlp": 0.01022877, "balance_loss_clip": 1.0385989, "balance_loss_mlp": 1.01601672, "epoch": 0.994047976913365, "flos": 17712364291200.0, "grad_norm": 1.8197416043191126, "language_loss": 0.80693406, "learning_rate": 3.642187404473418e-10, "loss": 0.82840455, "num_input_tokens_seen": 178460715, "step": 8267, "time_per_iteration": 2.687286853790283 }, { "auxiliary_loss_clip": 0.01152293, "auxiliary_loss_mlp": 0.01024052, "balance_loss_clip": 1.04391623, "balance_loss_mlp": 1.01697695, "epoch": 0.994168219804004, "flos": 19171558396800.0, "grad_norm": 2.13717424866532, "language_loss": 0.86103696, "learning_rate": 3.495047925885508e-10, "loss": 0.8828004, "num_input_tokens_seen": 178479050, "step": 8268, "time_per_iteration": 3.6111183166503906 }, { "auxiliary_loss_clip": 0.01134464, "auxiliary_loss_mlp": 0.0102537, "balance_loss_clip": 1.04131174, "balance_loss_mlp": 1.01779127, "epoch": 0.9942884626946432, "flos": 17851589406720.0, "grad_norm": 2.1821276831131646, "language_loss": 0.82758796, "learning_rate": 3.350941901199e-10, "loss": 0.8491863, "num_input_tokens_seen": 178495970, "step": 8269, "time_per_iteration": 2.6740124225616455 }, { "auxiliary_loss_clip": 0.0114051, "auxiliary_loss_mlp": 0.01027111, "balance_loss_clip": 1.04167426, "balance_loss_mlp": 1.01942229, "epoch": 0.9944087055852823, "flos": 18796578364800.0, "grad_norm": 2.3058357555644986, "language_loss": 0.83707488, "learning_rate": 3.2098693522764066e-10, "loss": 0.85875106, "num_input_tokens_seen": 178509170, "step": 8270, "time_per_iteration": 2.679692268371582 }, { "auxiliary_loss_clip": 0.01143565, "auxiliary_loss_mlp": 0.00762174, "balance_loss_clip": 1.04250193, "balance_loss_mlp": 1.0003643, "epoch": 0.9945289484759213, "flos": 20996969616000.0, "grad_norm": 1.8105039846711797, "language_loss": 0.81174302, "learning_rate": 3.071830300516165e-10, "loss": 0.83080041, "num_input_tokens_seen": 178527000, "step": 8271, "time_per_iteration": 3.6483724117279053 }, { "auxiliary_loss_clip": 0.01156319, "auxiliary_loss_mlp": 0.01028815, "balance_loss_clip": 1.04430127, "balance_loss_mlp": 1.0209533, "epoch": 0.9946491913665605, "flos": 14756952136320.0, "grad_norm": 2.039092436970155, "language_loss": 0.70821285, "learning_rate": 2.9368247668615234e-10, "loss": 0.73006421, "num_input_tokens_seen": 178545590, "step": 8272, "time_per_iteration": 2.64585280418396 }, { "auxiliary_loss_clip": 0.01171628, "auxiliary_loss_mlp": 0.01026373, "balance_loss_clip": 1.04854131, "balance_loss_mlp": 1.01874709, "epoch": 0.9947694342571995, "flos": 12669931186560.0, "grad_norm": 2.0213017151741814, "language_loss": 0.61117983, "learning_rate": 2.804852771789434e-10, "loss": 0.63315988, "num_input_tokens_seen": 178558890, "step": 8273, "time_per_iteration": 2.5425117015838623 }, { "auxiliary_loss_clip": 0.01161782, "auxiliary_loss_mlp": 0.01020662, "balance_loss_clip": 1.04429448, "balance_loss_mlp": 1.01441288, "epoch": 0.9948896771478386, "flos": 18843442634880.0, "grad_norm": 1.67259591540126, "language_loss": 0.55985832, "learning_rate": 2.675914335321661e-10, "loss": 0.58168274, "num_input_tokens_seen": 178577645, "step": 8274, "time_per_iteration": 3.6576621532440186 }, { "auxiliary_loss_clip": 0.01154575, "auxiliary_loss_mlp": 0.01030338, "balance_loss_clip": 1.04412675, "balance_loss_mlp": 1.02201152, "epoch": 0.9950099200384778, "flos": 24900207903360.0, "grad_norm": 2.272377143364523, "language_loss": 0.7940824, "learning_rate": 2.550009477018111e-10, "loss": 0.81593156, "num_input_tokens_seen": 178596415, "step": 8275, "time_per_iteration": 2.654515266418457 }, { "auxiliary_loss_clip": 0.01135923, "auxiliary_loss_mlp": 0.00762318, "balance_loss_clip": 1.04284811, "balance_loss_mlp": 1.00036311, "epoch": 0.9951301629291168, "flos": 23733613987200.0, "grad_norm": 1.9078164531823198, "language_loss": 0.63270736, "learning_rate": 2.4271382159790634e-10, "loss": 0.65168977, "num_input_tokens_seen": 178613845, "step": 8276, "time_per_iteration": 3.6025702953338623 }, { "auxiliary_loss_clip": 0.01106215, "auxiliary_loss_mlp": 0.0102173, "balance_loss_clip": 1.03911185, "balance_loss_mlp": 1.01425266, "epoch": 0.9952504058197559, "flos": 22236893147520.0, "grad_norm": 1.7931669125561356, "language_loss": 0.85946321, "learning_rate": 2.3073005708429406e-10, "loss": 0.88074267, "num_input_tokens_seen": 178633490, "step": 8277, "time_per_iteration": 2.788496494293213 }, { "auxiliary_loss_clip": 0.01119962, "auxiliary_loss_mlp": 0.01025521, "balance_loss_clip": 1.04342961, "balance_loss_mlp": 1.01919663, "epoch": 0.995370648710395, "flos": 21211032718080.0, "grad_norm": 1.931039701810572, "language_loss": 0.72466695, "learning_rate": 2.190496559788535e-10, "loss": 0.74612182, "num_input_tokens_seen": 178651775, "step": 8278, "time_per_iteration": 2.711463451385498 }, { "auxiliary_loss_clip": 0.01134942, "auxiliary_loss_mlp": 0.01024952, "balance_loss_clip": 1.04224014, "balance_loss_mlp": 1.01779366, "epoch": 0.9954908916010341, "flos": 14866731077760.0, "grad_norm": 5.34408705512711, "language_loss": 0.76626676, "learning_rate": 2.0767262005372265e-10, "loss": 0.7878657, "num_input_tokens_seen": 178669290, "step": 8279, "time_per_iteration": 2.7115743160247803 }, { "auxiliary_loss_clip": 0.01128405, "auxiliary_loss_mlp": 0.01025758, "balance_loss_clip": 1.03960705, "balance_loss_mlp": 1.01854587, "epoch": 0.9956111344916732, "flos": 19208259118080.0, "grad_norm": 2.4513972587133654, "language_loss": 0.7531454, "learning_rate": 1.965989510346322e-10, "loss": 0.77468705, "num_input_tokens_seen": 178688410, "step": 8280, "time_per_iteration": 2.879134178161621 }, { "auxiliary_loss_clip": 0.01103891, "auxiliary_loss_mlp": 0.01025001, "balance_loss_clip": 1.03892517, "balance_loss_mlp": 1.01763093, "epoch": 0.9957313773823123, "flos": 20047060494720.0, "grad_norm": 2.0250146487167013, "language_loss": 0.71266687, "learning_rate": 1.8582865060134955e-10, "loss": 0.7339558, "num_input_tokens_seen": 178706600, "step": 8281, "time_per_iteration": 2.704165458679199 }, { "auxiliary_loss_clip": 0.01062851, "auxiliary_loss_mlp": 0.01000859, "balance_loss_clip": 1.00910902, "balance_loss_mlp": 0.99982828, "epoch": 0.9958516202729514, "flos": 57483253768320.0, "grad_norm": 0.776159037872063, "language_loss": 0.55714875, "learning_rate": 1.7536172038790098e-10, "loss": 0.57778585, "num_input_tokens_seen": 178766910, "step": 8282, "time_per_iteration": 3.252211570739746 }, { "auxiliary_loss_clip": 0.01136756, "auxiliary_loss_mlp": 0.01021768, "balance_loss_clip": 1.04288018, "balance_loss_mlp": 1.01506817, "epoch": 0.9959718631635904, "flos": 27782900974080.0, "grad_norm": 2.515770258635666, "language_loss": 0.69169611, "learning_rate": 1.651981619819054e-10, "loss": 0.71328133, "num_input_tokens_seen": 178784060, "step": 8283, "time_per_iteration": 2.7074215412139893 }, { "auxiliary_loss_clip": 0.01111898, "auxiliary_loss_mlp": 0.01023927, "balance_loss_clip": 1.04071546, "balance_loss_mlp": 1.01692355, "epoch": 0.9960921060542296, "flos": 24024095274240.0, "grad_norm": 2.306204724408792, "language_loss": 0.70557344, "learning_rate": 1.5533797692546257e-10, "loss": 0.72693169, "num_input_tokens_seen": 178802795, "step": 8284, "time_per_iteration": 2.7203524112701416 }, { "auxiliary_loss_clip": 0.01146414, "auxiliary_loss_mlp": 0.0102722, "balance_loss_clip": 1.04125977, "balance_loss_mlp": 1.01942933, "epoch": 0.9962123489448687, "flos": 18697393935360.0, "grad_norm": 2.368049033661081, "language_loss": 0.84500968, "learning_rate": 1.4578116671404296e-10, "loss": 0.86674601, "num_input_tokens_seen": 178821075, "step": 8285, "time_per_iteration": 2.6732044219970703 }, { "auxiliary_loss_clip": 0.01149426, "auxiliary_loss_mlp": 0.01022052, "balance_loss_clip": 1.04688859, "balance_loss_mlp": 1.01529324, "epoch": 0.9963325918355077, "flos": 20010754823040.0, "grad_norm": 2.3684620748172804, "language_loss": 0.71088624, "learning_rate": 1.3652773279759777e-10, "loss": 0.73260111, "num_input_tokens_seen": 178837725, "step": 8286, "time_per_iteration": 2.6329290866851807 }, { "auxiliary_loss_clip": 0.01151097, "auxiliary_loss_mlp": 0.01022907, "balance_loss_clip": 1.04354715, "balance_loss_mlp": 1.01559019, "epoch": 0.9964528347261468, "flos": 33108488991360.0, "grad_norm": 1.9394108053929289, "language_loss": 0.6324122, "learning_rate": 1.2757767657989305e-10, "loss": 0.65415227, "num_input_tokens_seen": 178861515, "step": 8287, "time_per_iteration": 2.6845993995666504 }, { "auxiliary_loss_clip": 0.01149889, "auxiliary_loss_mlp": 0.01022652, "balance_loss_clip": 1.04450202, "balance_loss_mlp": 1.01559472, "epoch": 0.9965730776167859, "flos": 23109342589440.0, "grad_norm": 2.0979292953952147, "language_loss": 0.87196302, "learning_rate": 1.1893099941850948e-10, "loss": 0.89368844, "num_input_tokens_seen": 178880410, "step": 8288, "time_per_iteration": 2.599811315536499 }, { "auxiliary_loss_clip": 0.01137267, "auxiliary_loss_mlp": 0.01028236, "balance_loss_clip": 1.03967619, "balance_loss_mlp": 1.02072561, "epoch": 0.996693320507425, "flos": 22965843755520.0, "grad_norm": 2.151922603428805, "language_loss": 0.77658784, "learning_rate": 1.105877026252866e-10, "loss": 0.79824287, "num_input_tokens_seen": 178898740, "step": 8289, "time_per_iteration": 2.658766508102417 }, { "auxiliary_loss_clip": 0.01164469, "auxiliary_loss_mlp": 0.01025277, "balance_loss_clip": 1.04340112, "balance_loss_mlp": 1.01776123, "epoch": 0.996813563398064, "flos": 13222740476160.0, "grad_norm": 1.9815175837354986, "language_loss": 0.72699326, "learning_rate": 1.0254778746565663e-10, "loss": 0.74889076, "num_input_tokens_seen": 178914015, "step": 8290, "time_per_iteration": 2.5652120113372803 }, { "auxiliary_loss_clip": 0.0111918, "auxiliary_loss_mlp": 0.01029394, "balance_loss_clip": 1.0405736, "balance_loss_mlp": 1.02196121, "epoch": 0.9969338062887032, "flos": 14647855553280.0, "grad_norm": 2.105536251175274, "language_loss": 0.73626292, "learning_rate": 9.481125515953259e-11, "loss": 0.7577486, "num_input_tokens_seen": 178932075, "step": 8291, "time_per_iteration": 2.8078479766845703 }, { "auxiliary_loss_clip": 0.01108911, "auxiliary_loss_mlp": 0.01025409, "balance_loss_clip": 1.03675628, "balance_loss_mlp": 1.01853108, "epoch": 0.9970540491793423, "flos": 25735741142400.0, "grad_norm": 1.9299218039496595, "language_loss": 0.79806781, "learning_rate": 8.737810688064228e-11, "loss": 0.81941104, "num_input_tokens_seen": 178951910, "step": 8292, "time_per_iteration": 2.782075881958008 }, { "auxiliary_loss_clip": 0.01116829, "auxiliary_loss_mlp": 0.01028204, "balance_loss_clip": 1.04110861, "balance_loss_mlp": 1.02002621, "epoch": 0.9971742920699813, "flos": 21470236237440.0, "grad_norm": 1.7882314026611343, "language_loss": 0.78994077, "learning_rate": 8.024834375608414e-11, "loss": 0.81139112, "num_input_tokens_seen": 178970500, "step": 8293, "time_per_iteration": 2.7414486408233643 }, { "auxiliary_loss_clip": 0.01062894, "auxiliary_loss_mlp": 0.01000819, "balance_loss_clip": 1.00918603, "balance_loss_mlp": 0.99976367, "epoch": 0.9972945349606205, "flos": 72211223629440.0, "grad_norm": 0.8182221056166156, "language_loss": 0.62845063, "learning_rate": 7.342196686788149e-11, "loss": 0.64908773, "num_input_tokens_seen": 179023665, "step": 8294, "time_per_iteration": 3.9600319862365723 }, { "auxiliary_loss_clip": 0.01136493, "auxiliary_loss_mlp": 0.01023347, "balance_loss_clip": 1.04620838, "balance_loss_mlp": 1.01611125, "epoch": 0.9974147778512595, "flos": 19678293515520.0, "grad_norm": 2.60315249095921, "language_loss": 0.68777084, "learning_rate": 6.689897725142834e-11, "loss": 0.7093693, "num_input_tokens_seen": 179043140, "step": 8295, "time_per_iteration": 2.603347063064575 }, { "auxiliary_loss_clip": 0.01138198, "auxiliary_loss_mlp": 0.01026665, "balance_loss_clip": 1.04162478, "balance_loss_mlp": 1.01961422, "epoch": 0.9975350207418986, "flos": 15960821391360.0, "grad_norm": 2.693479302201775, "language_loss": 0.88517284, "learning_rate": 6.067937589615545e-11, "loss": 0.90682155, "num_input_tokens_seen": 179061215, "step": 8296, "time_per_iteration": 3.5922701358795166 }, { "auxiliary_loss_clip": 0.01045999, "auxiliary_loss_mlp": 0.01000593, "balance_loss_clip": 1.00960279, "balance_loss_mlp": 0.99960977, "epoch": 0.9976552636325378, "flos": 59961879768960.0, "grad_norm": 0.7589687764242375, "language_loss": 0.57651407, "learning_rate": 5.476316374575241e-11, "loss": 0.59697998, "num_input_tokens_seen": 179124700, "step": 8297, "time_per_iteration": 3.214306116104126 }, { "auxiliary_loss_clip": 0.01167495, "auxiliary_loss_mlp": 0.01029731, "balance_loss_clip": 1.04759169, "balance_loss_mlp": 1.02249527, "epoch": 0.9977755065231768, "flos": 22487872452480.0, "grad_norm": 5.18924431652778, "language_loss": 0.72806275, "learning_rate": 4.9150341697723476e-11, "loss": 0.75003505, "num_input_tokens_seen": 179144590, "step": 8298, "time_per_iteration": 2.5853397846221924 }, { "auxiliary_loss_clip": 0.01132305, "auxiliary_loss_mlp": 0.01027638, "balance_loss_clip": 1.04225993, "balance_loss_mlp": 1.02072072, "epoch": 0.9978957494138159, "flos": 26030280666240.0, "grad_norm": 1.5466396544872563, "language_loss": 0.66642213, "learning_rate": 4.384091060338768e-11, "loss": 0.68802148, "num_input_tokens_seen": 179165060, "step": 8299, "time_per_iteration": 2.6678006649017334 }, { "auxiliary_loss_clip": 0.01150946, "auxiliary_loss_mlp": 0.01022862, "balance_loss_clip": 1.04481161, "balance_loss_mlp": 1.01542985, "epoch": 0.998015992304455, "flos": 22637835734400.0, "grad_norm": 2.206273950085483, "language_loss": 0.73731577, "learning_rate": 3.883487126810081e-11, "loss": 0.75905383, "num_input_tokens_seen": 179184320, "step": 8300, "time_per_iteration": 3.553802728652954 }, { "auxiliary_loss_clip": 0.01139989, "auxiliary_loss_mlp": 0.01024446, "balance_loss_clip": 1.03999925, "balance_loss_mlp": 1.01755309, "epoch": 0.9981362351950941, "flos": 18223444955520.0, "grad_norm": 2.294407814691813, "language_loss": 0.79331797, "learning_rate": 3.41322244516995e-11, "loss": 0.81496233, "num_input_tokens_seen": 179202265, "step": 8301, "time_per_iteration": 2.6677961349487305 }, { "auxiliary_loss_clip": 0.01097219, "auxiliary_loss_mlp": 0.01027008, "balance_loss_clip": 1.03966343, "balance_loss_mlp": 1.01999211, "epoch": 0.9982564780857331, "flos": 33474095573760.0, "grad_norm": 1.743485349939381, "language_loss": 0.62937844, "learning_rate": 2.9732970866946925e-11, "loss": 0.6506207, "num_input_tokens_seen": 179222145, "step": 8302, "time_per_iteration": 3.750641345977783 }, { "auxiliary_loss_clip": 0.01111337, "auxiliary_loss_mlp": 0.01026263, "balance_loss_clip": 1.03765273, "balance_loss_mlp": 1.01869941, "epoch": 0.9983767209763723, "flos": 15523465392000.0, "grad_norm": 5.227445713543878, "language_loss": 0.78405058, "learning_rate": 2.563711118175327e-11, "loss": 0.80542654, "num_input_tokens_seen": 179239030, "step": 8303, "time_per_iteration": 2.7135982513427734 }, { "auxiliary_loss_clip": 0.0111903, "auxiliary_loss_mlp": 0.01024815, "balance_loss_clip": 1.04182625, "balance_loss_mlp": 1.01765084, "epoch": 0.9984969638670114, "flos": 19974377324160.0, "grad_norm": 2.084235499593538, "language_loss": 0.83496726, "learning_rate": 2.184464601717728e-11, "loss": 0.85640568, "num_input_tokens_seen": 179257345, "step": 8304, "time_per_iteration": 2.7523605823516846 }, { "auxiliary_loss_clip": 0.01155408, "auxiliary_loss_mlp": 0.01023053, "balance_loss_clip": 1.04727757, "balance_loss_mlp": 1.01587677, "epoch": 0.9986172067576504, "flos": 20375750874240.0, "grad_norm": 2.087332515579905, "language_loss": 0.77609795, "learning_rate": 1.8355575948758585e-11, "loss": 0.79788256, "num_input_tokens_seen": 179275330, "step": 8305, "time_per_iteration": 2.6404216289520264 }, { "auxiliary_loss_clip": 0.01134873, "auxiliary_loss_mlp": 0.01029474, "balance_loss_clip": 1.03917444, "balance_loss_mlp": 1.02208281, "epoch": 0.9987374496482896, "flos": 23727903724800.0, "grad_norm": 2.057179884580709, "language_loss": 0.73321819, "learning_rate": 1.5169901505407424e-11, "loss": 0.75486159, "num_input_tokens_seen": 179292395, "step": 8306, "time_per_iteration": 2.709988594055176 }, { "auxiliary_loss_clip": 0.01134793, "auxiliary_loss_mlp": 0.01025063, "balance_loss_clip": 1.04351819, "balance_loss_mlp": 1.0186615, "epoch": 0.9988576925389286, "flos": 25044029959680.0, "grad_norm": 1.7783341340420615, "language_loss": 0.7435323, "learning_rate": 1.228762317073695e-11, "loss": 0.76513088, "num_input_tokens_seen": 179311225, "step": 8307, "time_per_iteration": 2.68908429145813 }, { "auxiliary_loss_clip": 0.01134799, "auxiliary_loss_mlp": 0.01024972, "balance_loss_clip": 1.04223454, "balance_loss_mlp": 1.01792383, "epoch": 0.9989779354295677, "flos": 31285627637760.0, "grad_norm": 2.4063371243067944, "language_loss": 0.78925514, "learning_rate": 9.70874138195299e-12, "loss": 0.81085289, "num_input_tokens_seen": 179333135, "step": 8308, "time_per_iteration": 2.731342077255249 }, { "auxiliary_loss_clip": 0.01165348, "auxiliary_loss_mlp": 0.01024565, "balance_loss_clip": 1.04413891, "balance_loss_mlp": 1.01745081, "epoch": 0.9990981783202069, "flos": 19573398823680.0, "grad_norm": 2.6571083618068063, "language_loss": 0.74686813, "learning_rate": 7.433256530076093e-12, "loss": 0.7687673, "num_input_tokens_seen": 179353090, "step": 8309, "time_per_iteration": 2.6491377353668213 }, { "auxiliary_loss_clip": 0.01112605, "auxiliary_loss_mlp": 0.01021324, "balance_loss_clip": 1.03746581, "balance_loss_mlp": 1.01456499, "epoch": 0.9992184212108459, "flos": 17199667514880.0, "grad_norm": 2.270668537134058, "language_loss": 0.75949287, "learning_rate": 5.46116896038562e-12, "loss": 0.78083217, "num_input_tokens_seen": 179367500, "step": 8310, "time_per_iteration": 2.757056474685669 }, { "auxiliary_loss_clip": 0.01132805, "auxiliary_loss_mlp": 0.01024665, "balance_loss_clip": 1.04198837, "balance_loss_mlp": 1.01758945, "epoch": 0.999338664101485, "flos": 46497853681920.0, "grad_norm": 2.1675303034542273, "language_loss": 0.61902583, "learning_rate": 3.792478972197699e-12, "loss": 0.64060056, "num_input_tokens_seen": 179388085, "step": 8311, "time_per_iteration": 2.8514411449432373 }, { "auxiliary_loss_clip": 0.01163891, "auxiliary_loss_mlp": 0.0102596, "balance_loss_clip": 1.04436779, "balance_loss_mlp": 1.0191381, "epoch": 0.9994589069921241, "flos": 15158253859200.0, "grad_norm": 3.038185337071505, "language_loss": 0.700418, "learning_rate": 2.4271868181990895e-12, "loss": 0.72231644, "num_input_tokens_seen": 179405250, "step": 8312, "time_per_iteration": 2.6360507011413574 }, { "auxiliary_loss_clip": 0.01150898, "auxiliary_loss_mlp": 0.01026268, "balance_loss_clip": 1.04265261, "balance_loss_mlp": 1.01912773, "epoch": 0.9995791498827632, "flos": 12531460256640.0, "grad_norm": 2.1595509165746822, "language_loss": 0.80993664, "learning_rate": 1.3652927060014973e-12, "loss": 0.83170831, "num_input_tokens_seen": 179420845, "step": 8313, "time_per_iteration": 2.5778400897979736 }, { "auxiliary_loss_clip": 0.01124644, "auxiliary_loss_mlp": 0.0102465, "balance_loss_clip": 1.04139256, "balance_loss_mlp": 1.01688373, "epoch": 0.9996993927734023, "flos": 19245175320960.0, "grad_norm": 2.460120528895731, "language_loss": 0.63693625, "learning_rate": 6.067967965872612e-13, "loss": 0.65842915, "num_input_tokens_seen": 179440455, "step": 8314, "time_per_iteration": 2.6499428749084473 }, { "auxiliary_loss_clip": 0.01124662, "auxiliary_loss_mlp": 0.0102848, "balance_loss_clip": 1.04374111, "balance_loss_mlp": 1.02146149, "epoch": 0.9998196356640414, "flos": 62952804518400.0, "grad_norm": 1.5221719591873728, "language_loss": 0.77016234, "learning_rate": 1.5169920497548615e-13, "loss": 0.79169381, "num_input_tokens_seen": 179465075, "step": 8315, "time_per_iteration": 3.0508453845977783 }, { "auxiliary_loss_clip": 0.01105398, "auxiliary_loss_mlp": 0.01012772, "balance_loss_clip": 1.02576685, "balance_loss_mlp": 1.00846672, "epoch": 0.9999398785546805, "flos": 50922375073920.0, "grad_norm": 1.1015050813165752, "language_loss": 0.55057263, "learning_rate": 0.0, "loss": 0.57175434, "num_input_tokens_seen": 179513955, "step": 8316, "time_per_iteration": 3.2250914573669434 }, { "epoch": 0.9999398785546805, "num_input_tokens_seen": 179513955, "step": 8316, "total_flos": 6.996749092776837e+17, "train_loss": 0.7897025029810648, "train_runtime": 25272.1878, "train_samples_per_second": 13.163, "train_steps_per_second": 0.329 } ], "logging_steps": 1.0, "max_steps": 8316, "num_input_tokens_seen": 179513955, "num_train_epochs": 1, "save_steps": 1664, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.996749092776837e+17, "train_batch_size": 5, "trial_name": null, "trial_params": null }